xref: /vim-8.2.3635/runtime/tools/unicode.vim (revision 56994d21)
1" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
2" The format of the UnicodeData.txt file is explained here:
3" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
4" For the other files see the header.
5"
6" Might need to update the URL to the emoji-data.txt
7" Usage: Vim -S <this-file>
8"
9" Author: Bram Moolenaar
10" Last Update: 2020 Aug 24
11
12" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
func! ParseDataToProps()
  " Split every line of the current buffer on ';' (white space around the
  " separator is dropped, empty fields are kept) and collect the resulting
  " field lists in s:dataprops.
  " Stops with an error at the first line that does not have exactly 15
  " fields; the entries collected up to that point remain in s:dataprops.
  let s:dataprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
26
27" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
func! ParseFoldProps()
  " Collect the ';'-separated fields of every data line of the current
  " buffer in s:foldprops.  Lines starting with '#' and blank lines are
  " skipped.
  " Stops with an error at the first data line that does not have exactly 4
  " fields; the entries collected up to that point remain in s:foldprops.
  let s:foldprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    if text =~ '^#' || text =~ '^\s*$'
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
44
45" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
func! ParseWidthProps()
  " Collect the ';'-separated fields of every data line of the current
  " buffer in s:widthprops.  Lines starting with '#' and blank lines are
  " skipped.
  " Stops with an error at the first data line that does not have exactly 2
  " fields; the entries collected up to that point remain in s:widthprops.
  let s:widthprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
62
63" Build the toLower or toUpper table in a new buffer.
64" Uses s:dataprops.
func! BuildCaseTable(name, index)
  " a:name   suffix of the table name: the buffer and the C array are called
  "          "to" . a:name
  " a:index  index of the field in an s:dataprops entry that holds the
  "          mapped code point (the script passes 13 for "Lower" and 12 for
  "          "Upper")
  "
  " Entries with an empty mapping field are ignored.  Consecutive entries
  " that share the same offset (mapped - code) and are the same distance
  " apart are merged into one row, written by Range().
  let rows = []
  let first = -1
  let last = -1
  let stride = 0
  let offset = -1
  for props in s:dataprops
    if props[a:index] == ''
      continue
    endif
    let code = ('0x' . props[0]) + 0
    let mapped = ('0x' . props[a:index]) + 0
    " The current run can be extended when the offset is unchanged and the
    " distance to the previous code point matches the stride; a stride of
    " zero means the run has only one element so far.
    let extends_run = first >= 0 && offset == mapped - code && (stride == 0 || code - last == stride)
    if extends_run
      let stride = code - last
    else
      if first >= 0
        " Flush the finished run before starting a new one.
        call Range(rows, first, last, stride, offset)
      endif
      let first = code
      let stride = 0
      let offset = mapped - code
    endif
    let last = code
  endfor
  if first >= 0
    call Range(rows, first, last, stride, offset)
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  exe "file to" . a:name
  call setline(1, ["static convertStruct to" . a:name . "[] =", "{"])
  call append('$', rows)
  " Drop the last character of the last line: the trailing comma.
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
105
106" Build the foldCase table in a new buffer.
107" Uses s:foldprops.
func! BuildFoldTable()
  " Only s:foldprops entries whose second field is 'C' or 'S' are used.  The
  " first field is the code point, the third field the code point it folds
  " to.  Consecutive entries that share the same offset (folded - code) and
  " are the same distance apart are merged into one row, written by Range().
  let rows = []
  let first = -1
  let last = -1
  let stride = 0
  let offset = -1
  for props in s:foldprops
    if !(props[1] == 'C' || props[1] == 'S')
      continue
    endif
    let code = ('0x' . props[0]) + 0
    let folded = ('0x' . props[2]) + 0
    " The current run can be extended when the offset is unchanged and the
    " distance to the previous code point matches the stride; a stride of
    " zero means the run has only one element so far.
    let extends_run = first >= 0 && offset == folded - code && (stride == 0 || code - last == stride)
    if extends_run
      let stride = code - last
    else
      if first >= 0
        " Flush the finished run before starting a new one.
        call Range(rows, first, last, stride, offset)
      endif
      let first = code
      let stride = 0
      let offset = folded - code
    endif
    let last = code
  endfor
  if first >= 0
    call Range(rows, first, last, stride, offset)
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  file foldCase
  call setline(1, ["static convertStruct foldCase[] =", "{"])
  call append('$', rows)
  " Drop the last character of the last line: the trailing comma.
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
148
func! Range(ranges, start, end, step, add)
  " Append one formatted table row "{start,end,step,add}," to the list
  " a:ranges.  a:start and a:end are written in hex, the other two in
  " decimal.  A step of zero is written as -1.
  if a:step == 0
    let step = -1
  else
    let step = a:step
  endif
  call add(a:ranges, printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, step, a:add))
endfunc
153
154" Build the combining table.
155" Uses s:dataprops.
func! BuildCombiningTable()
  " Collects the code points of all s:dataprops entries whose third field
  " is 'Mn', 'Mc' or 'Me', merges directly adjacent code points into
  " intervals and writes them as a C table in a new buffer named
  " "combining".
  let rows = []
  let first = -1
  let last = -1
  for props in s:dataprops
    let category = props[2]
    if category != 'Mn' && category != 'Mc' && category != 'Me'
      continue
    endif
    let code = ('0x' . props[0]) + 0
    if first < 0 || last + 1 != code
      " Not adjacent to the current interval: flush it and start a new one.
      if first >= 0
        call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
      endif
      let first = code
    endif
    let last = code
  endfor
  if first >= 0
    call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  file combining
  call setline(1, ["    static struct interval combining[] =", "    {"])
  call append('$', rows)
  " Drop the last character of the last line: the trailing comma.
  call setline('$', getline('$')[:-2])
  call append('$', "    };")
  wincmd p
endfunc
190
191" Build the double width or ambiguous width table in a new buffer.
192" Uses s:widthprops and s:dataprops.
func! BuildWidthTable(pattern, tableName)
  " a:pattern    regexp matched against the first character of the width
  "              field of each s:widthprops entry; only matching entries
  "              are used
  " a:tableName  name of the result buffer and of the generated C array
  "
  " Every generated interval is also appended as [start, end] to
  " s:ambitable when a:pattern is 'A' and to s:doubletable otherwise, so
  " the selected list must exist before this function is called.
  "
  " The position in s:dataprops only moves forward, thus this relies on
  " both s:widthprops and s:dataprops being in ascending code point order.
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  let table = a:pattern == 'A' ? s:ambitable : s:doubletable
  for p in s:widthprops
    if p[1][0] !~ a:pattern
      continue
    endif
    if p[0] =~ '\.\.'
      " It is a range.  We don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Skip the entry, there is no usable range end to index.
        continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif

    " Find this char in the data table.  The scan stops at the end of the
    " table instead of indexing past it.
    let dn = -1
    while dataidx < datalen
      let dn = str2nr(s:dataprops[dataidx][0], 16)
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile
    if dn != n && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    " When the scan ran off the end of the data table there is no entry to
    " inspect and the char is used.
    if n_last <= n && dataidx < datalen
      let category = s:dataprops[dataidx][2]
      if category == 'Mn' || category == 'Mc' || category == 'Me'
        continue
      endif
    endif

    if start < 0 || end + 1 != n
      " Not adjacent to the current interval: flush it and start a new one.
      if start >= 0
        call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        call add(table, [start, end])
      endif
      let start = n
    endif
    let end = n_last
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
    call add(table, [start, end])
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, "    static struct interval " . a:tableName . "[] =")
  call setline(2, "    {")
  if !empty(ranges)
    call append('$', ranges)
    " Remove the last comma.  Skipped for an empty table, otherwise the
    " opening brace would be removed instead.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
264
265
266" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
267" and put them in dictionary "chardict"
func! AddLinesToCharDict(lines, chardict)
  " a:lines     list of strings; the first white-space separated token of
  "             each is a hex code point ("12ab") or a range ("12ab..56cd")
  " a:chardict  dictionary that gets an entry with value 1 for every code
  "             point; it is modified in place
  "
  " Only the first token is parsed, so ".." in the text after it is not
  " mistaken for a range.  Lines without any token are skipped.
  for line in a:lines
    let parts = split(matchstr(line, '\S\+'), '\.\.')
    if empty(parts)
      continue
    endif
    let first = str2nr(parts[0], 16)
    let last = len(parts) == 1 ? first : str2nr(parts[1], 16)
    for nr in range(first, last)
      let a:chardict[nr] = 1
    endfor
  endfor
endfunc
282
func! Test_AddLinesToCharDict()
  " Self-test for AddLinesToCharDict(): single code points, a range, and a
  " code point that is already covered by the range.
  " Returns 1 and reports v:errors when it is not empty afterwards, 0
  " otherwise.  The caller is expected to have cleared v:errors.
  " Defined with "!" like the other functions, so that the script can be
  " sourced again in the same Vim session.
  let dict = {}
  call AddLinesToCharDict([
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ], dict)
  call assert_equal({0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }, dict)
  if v:errors != []
    echoerr 'AddLinesToCharDict' v:errors
    return 1
  endif
  return 0
endfunc
300
301
func! CharDictToPairList(chardict)
  " a:chardict  dictionary whose keys are code points (decimal numbers)
  "
  " Returns a list of [low, high] pairs, in ascending order, where each
  " pair covers a run of consecutive keys.  An empty dictionary gives an
  " empty list.
  let result = []
  if empty(a:chardict)
    return result
  endif
  " Dictionary keys are strings: convert them back to numbers and sort
  " them numerically.
  let codes = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  let low = codes[0]
  let high = low
  for code in codes[1:]
    if code > high + 1
      " There is a gap: finish the current run and start a new one.
      call add(result, [low, high])
      let low = code
    endif
    let high = code
  endfor
  call add(result, [low, high])
  return result
endfunc
319
func! Test_CharDictToPairList()
  " Self-test for CharDictToPairList(): a run of three code points, two
  " isolated code points and a run of two.
  " Returns 1 and reports v:errors when it is not empty afterwards, 0
  " otherwise.  The caller is expected to have cleared v:errors.
  " Defined with "!" like the other functions, so that the script can be
  " sourced again in the same Vim session.
  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  call assert_equal([
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ], CharDictToPairList(dict))
  if v:errors != []
    echoerr 'CharDictToPairList' v:errors
    return 1
  endif
  return 0
endfunc
337
338
" Build the emoji tables (emoji_all and emoji_wide), each in a new buffer.
func! BuildEmojiTable()
  " Builds two C tables from the lines of the current buffer:
  "   emoji_all   all code points with the "Emoji" property
  "   emoji_wide  code points with the "Emoji_Presentation" or
  "               "Emoji_Modifier_Base" property that are not in
  "               s:ambitable or s:doubletable
  " Only lines starting with a digit 1-9 are used.
  " s:ambitable and s:doubletable must have been filled before.
  " A table without any entry is written with an empty body.

  " Both tables are built from the same buffer lines, get them only once.
  let datalines = filter(getline(1, '$'), 'v:val=~"^[1-9]"')

  " First make the table for all emojis.
  let pattern = '; Emoji\s\+#\s'
  let lines = map(filter(copy(datalines), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)
  " Without any character there are no pairs to compute.
  let pairlist = empty(chardict) ? [] : CharDictToPairList(chardict)
  let allranges = map(pairlist, 'printf("    {0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_all'
  call setline(1, "static struct interval emoji_all[] =")
  call setline(2, "{")
  if !empty(allranges)
    call append('$', allranges)
    " Remove the last comma.  Skipped for an empty table, otherwise the
    " opening brace would be removed instead.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let lines = map(filter(copy(datalines), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for interval in s:ambitable + s:doubletable
    for nr in range(interval[0], interval[1])
      if has_key(chardict, nr)
        call remove(chardict, nr)
      endif
    endfor
  endfor

  let pairlist = empty(chardict) ? [] : CharDictToPairList(chardict)
  let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_wide'
  call setline(1, "    static struct interval emoji_wide[] =")
  call setline(2, "    {")
  if !empty(wide_ranges)
    call append('$', wide_ranges)
    " Remove the last comma, see above.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
399
400" First test a few things
let v:errors = []
if Test_AddLinesToCharDict() != 0 || Test_CharDictToPairList() != 0
  " A self-test failed: do not generate any table.
  finish
endif


" Try to avoid hitting E36
set equalalways

" Interval lists filled by BuildWidthTable() and used by BuildEmojiTable().
let s:doubletable = []
let s:ambitable = []

" UnicodeData.txt: the toLower, toUpper and combining tables.
" Editing a URL requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt
call ParseDataToProps()
call BuildCaseTable("Lower", 13)
call BuildCaseTable("Upper", 12)
call BuildCombiningTable()

" CaseFolding.txt: the foldCase table.
" Editing a URL requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
call ParseFoldProps()
call BuildFoldTable()

" EastAsianWidth.txt: the double width table, then the ambiguous width
" table.  Editing a URL requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
call ParseWidthProps()
call BuildWidthTable('[WF]', 'doublewidth')
call BuildWidthTable('A', 'ambiguous')

" emoji-data.txt: the emoji tables.  Ver. 1.0 - 6.0
" Must come after the "ambiguous" and "doublewidth" tables.
" Editing a URL requires the netrw plugin.
edit https://unicode.org/Public/emoji/12.1/emoji-data.txt
call BuildEmojiTable()
454