xref: /vim-8.2.3635/runtime/tools/unicode.vim (revision e30d1025)
1" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
2" The format of the UnicodeData.txt file is explained here:
3" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
4" For the other files see the header.
5"
6" Might need to update the URL to the emoji-data.txt
7" Usage: Vim -S <this-file>
8"
9" Author: Bram Moolenaar
10" Last Update: 2020 Aug 24
11
12" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
func! ParseDataToProps()
  " Reads every line of the current buffer, splits it on ';' (white space
  " around the separator is dropped, empty fields are kept) and appends the
  " resulting list to s:dataprops.
  " A line that does not have exactly 15 fields is reported with :echoerr and
  " parsing stops there; s:dataprops then holds the lines seen so far.
  let s:dataprops = []
  for lnum in range(1, line('$'))
    let fields = split(getline(lnum), '\s*;\s*', 1)
    if len(fields) == 15
      call add(s:dataprops, fields)
    else
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
  endfor
endfunc
26
27" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
func! ParseFoldProps()
  " Reads the current buffer line by line.  Lines starting with '#' and lines
  " with only white space are ignored.  Every other line is split on ';' and
  " must give exactly 4 fields; the field list is appended to s:foldprops.
  " On a line with another field count an error is given and parsing stops.
  let s:foldprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or blank line.
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
44
45" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
func! ParseWidthProps()
  " Reads the current buffer line by line.  Lines starting with '#' and lines
  " with only white space are ignored.  Every other line is split on ';' and
  " must give exactly 2 fields; the field list is appended to s:widthprops.
  " On a line with another field count an error is given and parsing stops.
  let s:widthprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or blank line.
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
62
63" Build the toLower or toUpper table in a new buffer.
64" Uses s:dataprops.
func! BuildCaseTable(name, index)
  " Arguments:
  "   name   suffix of the table; the buffer and the C array are called
  "          "to" . name  (this script passes "Lower" and "Upper")
  "   index  index of the field in an s:dataprops entry that holds the
  "          mapped character in hex; entries where it is empty are skipped
  "
  " Successive characters are merged into one range as long as the offset to
  " the mapped character is the same and the distance between the characters
  " stays constant.  Each range is formatted by Range().
  let start = -1
  let end = -1
  let step = 0
  let offset = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] == ''
      continue
    endif
    let n = str2nr(p[0], 16)
    let mapped = str2nr(p[a:index], 16)
    if start >= 0 && offset == mapped - n && (step == 0 || n - end == step)
      " continue with same range.
      let step = n - end
      let end = n
    else
      if start >= 0
        " produce previous range
        call Range(ranges, start, end, step, offset)
      endif
      let start = n
      let end = n
      let step = 0
      let offset = mapped - n
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, offset)
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, "static convertStruct to" . a:name . "[] =")
  call setline(2, "{")
  if !empty(ranges)
    call append('$', ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  " Without the check above an empty table would strip the "{" line instead
  " of a trailing comma.
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
105
106" Build the foldCase table in a new buffer.
107" Uses s:foldprops.
func! BuildFoldTable()
  " Only entries whose second field is 'C' or 'S' are used.  The first field
  " is the character and the third field the folded character, both in hex.
  "
  " Successive characters are merged into one range as long as the offset to
  " the folded character is the same and the distance between the characters
  " stays constant.  Each range is formatted by Range().
  let start = -1
  let end = -1
  let step = 0
  let offset = -1
  let ranges = []
  for p in s:foldprops
    if p[1] == 'C' || p[1] == 'S'
      let n = str2nr(p[0], 16)
      let folded = str2nr(p[2], 16)
      if start >= 0 && offset == folded - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, offset)
        endif
        let start = n
        let end = n
        let step = 0
        let offset = folded - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, offset)
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, "static convertStruct foldCase[] =")
  call setline(2, "{")
  if !empty(ranges)
    call append('$', ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  " Without the check above an empty table would strip the "{" line instead
  " of a trailing comma.
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
148
func! Range(ranges, start, end, step, add)
  " Appends one table line of the form "{0xSTART,0xEND,STEP,ADD}," (preceded
  " by a tab) to the list "ranges".  START and END are written in hex, STEP
  " and ADD in decimal.  A step of zero is written as -1.
  let step = a:step
  if step == 0
    let step = -1
  endif
  call add(a:ranges, printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, step, a:add))
endfunc
153
154" Build the combining table.
155" Uses s:dataprops.
func! BuildCombiningTable()
  " Collects all characters from s:dataprops whose third field is 'Mn', 'Mc'
  " or 'Me', merges directly adjacent characters into {first, last} ranges
  " and writes them as the C table "combining" in a new buffer.
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] == 'Mn' || p[2] == 'Mc' || p[2] == 'Me'
      let n = str2nr(p[0], 16)
      if start < 0 || end + 1 != n
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
      endif
      " Either extends the current range or sets the end of the new one.
      let end = n
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, "    static struct interval combining[] =")
  call setline(2, "    {")
  if !empty(ranges)
    call append('$', ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  " Without the check above an empty table would strip the "{" line instead
  " of a trailing comma.
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
190
191" Build the double width or ambiguous width table in a new buffer.
192" Uses s:widthprops and s:dataprops.
" Store one finished width range: the C initializer goes into "ranges", the
" [start, end] pair into s:ambitable when "pattern" is 'A' and into
" s:doubletable otherwise.
func! s:AddWidthRange(ranges, spc, pattern, start, end)
  call add(a:ranges, printf("%s{0x%04x, 0x%04x},", a:spc, a:start, a:end))
  if a:pattern == 'A'
    call add(s:ambitable, [a:start, a:end])
  else
    call add(s:doubletable, [a:start, a:end])
  endif
endfunc

func! BuildWidthTable(pattern, tableName)
  " Arguments:
  "   pattern    regexp matched against the first character of the width
  "              field of each s:widthprops entry ('A' selects the
  "              ambiguous table layout)
  "   tableName  name of the new buffer and of the C array
  "
  " Adjacent characters and ranges are merged.  A single character is only
  " used when its entry in s:dataprops is not of type Mn, Mc or Me; ranges
  " are always used.  Every range produced is also stored in s:ambitable or
  " s:doubletable, which must exist before calling this function.
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  if a:pattern == 'A'
    let spc = '    '
  else
    let spc = "\t"
  endif
  for p in s:widthprops
    if p[1][0] !~ a:pattern
      continue
    endif
    if p[0] =~ '\.\.'
      " It is a range.  We don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Both ends are needed, this entry cannot be used.
        continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif

    " Find this char in the data table.  dataidx only moves forward, which
    " relies on both lists being sorted by character number.  Stop at the end
    " of the list instead of indexing past it.
    while dataidx < datalen
      let dn = str2nr(s:dataprops[dataidx][0], 16)
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile

    if dataidx >= datalen
      " Nothing left in the data table.  A range is used anyway, a single
      " character cannot be checked and is skipped.
      if n_last <= n
        echoerr "Cannot find character " . n . " in data table"
        continue
      endif
    else
      if dn != n && n_last == n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.
      let dp = s:dataprops[dataidx]
      if n_last <= n && (dp[2] == 'Mn' || dp[2] == 'Mc' || dp[2] == 'Me')
        continue
      endif
    endif

    if start < 0 || end + 1 != n
      if start >= 0
        " produce previous range
        call s:AddWidthRange(ranges, spc, a:pattern, start, end)
      endif
      let start = n
    endif
    " Either extends the current range or sets the end of the new one.
    let end = n_last
  endfor
  if start >= 0
    call s:AddWidthRange(ranges, spc, a:pattern, start, end)
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  if a:pattern == 'A'
    call setline(1, "static struct interval " . a:tableName . "[] =")
    call setline(2, "{")
  else
    call setline(1, "    static struct interval " . a:tableName . "[] =")
    call setline(2, "    {")
  endif
  if !empty(ranges)
    call append('$', ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  " Without the check above an empty table would strip the "{" line instead
  " of a trailing comma.
  if a:pattern == 'A'
    call setline(line('$') + 1, "};")
  else
    call setline(line('$') + 1, "    };")
  endif
  wincmd p
endfunc
280
281
282" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
283" and put them in dictionary "chardict"
func AddLinesToCharDict(lines, chardict)
  " Each item of "lines" starts with a hex character number, optionally
  " followed by ".." and a second hex number to form a range.  Text after the
  " number(s) is ignored.  Every character covered gets an entry with value 1
  " in "chardict", which is changed in place.
  for text in a:lines
    let parts = split(text, '\.\.')
    let first = str2nr(parts[0], 16)
    " Without ".." the line holds a single character.
    let last = len(parts) == 1 ? first : str2nr(parts[1], 16)
    for charnr in range(first, last)
      let a:chardict[charnr] = 1
    endfor
  endfor
endfunc
298
func Test_AddLinesToCharDict()
  " Self test for AddLinesToCharDict(): two single characters, a range and a
  " character that is also part of that range.
  " Returns 0 when v:errors is still empty, otherwise reports v:errors and
  " returns 1.
  let lines = [
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ]
  let expected = {0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }
  let actual = {}
  call AddLinesToCharDict(lines, actual)
  call assert_equal(expected, actual)
  if empty(v:errors)
    return 0
  endif
  echoerr 'AddLinesToCharDict' v:errors
  return 1
endfunc
316
317
func CharDictToPairList(chardict)
  " Turns the keys of "chardict" (character numbers) into a list of
  " [low, high] pairs sorted by number, where each pair covers a run of
  " consecutive numbers.  Returns an empty list when "chardict" is empty.
  let result = []
  if empty(a:chardict)
    " Nothing to index with nrs[0] below.
    return result
  endif
  " Dictionary keys are strings, convert them so that the sort is numeric.
  let nrs = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  let low = nrs[0]
  let high = nrs[0]
  for nr in nrs[1:]
    if nr > high + 1
      " There is a gap: finish the current run and start a new one.
      call add(result, [low, high])
      let low = nr
    endif
    let high = nr
  endfor
  call add(result, [low, high])
  return result
endfunc
335
func Test_CharDictToPairList()
  " Self test for CharDictToPairList(): a run of three characters, two
  " isolated characters and a run of two.
  " Returns 0 when v:errors is still empty, otherwise reports v:errors and
  " returns 1.
  let input = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  let expected = [
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ]
  call assert_equal(expected, CharDictToPairList(input))
  if empty(v:errors)
    return 0
  endif
  echoerr 'CharDictToPairList' v:errors
  return 1
endfunc
353
354
" Build the emoji tables (emoji_all and emoji_wide), each in a new buffer.
func BuildEmojiTable()
  " Reads the emoji data from the current buffer and produces two tables:
  "   emoji_all   every character with the "Emoji" property
  "   emoji_wide  characters with the "Emoji_Presentation" or
  "               "Emoji_Modifier_Base" property that are not already in
  "               s:ambitable or s:doubletable
  " Only lines that start with a digit 1-9 are used, thus comment lines and
  " characters written with a leading zero are skipped.

  " First make the table for all emojis.
  let pattern = '; Emoji\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)
  let pairlist = CharDictToPairList(chardict)
  let allranges = map(pairlist, 'printf("    {0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_all'
  call setline(1, "static struct interval emoji_all[] =")
  call setline(2, "{")
  if !empty(allranges)
    call append('$', allranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  " Without the check above an empty table would strip the "{" line instead
  " of a trailing comma.
  call setline(line('$') + 1, "};")
  " Go back to the window with the emoji data, it is read again below.
  wincmd p

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for known in s:ambitable + s:doubletable
    for nr in range(known[0], known[1])
      if has_key(chardict, nr)
        call remove(chardict, nr)
      endif
    endfor
  endfor

  let pairlist = CharDictToPairList(chardict)
  let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_wide'
  call setline(1, "    static struct interval emoji_wide[] =")
  call setline(2, "    {")
  if !empty(wide_ranges)
    call append('$', wide_ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
415
416" First test a few things
let v:errors = []
" The self tests return non-zero after reporting a failure; in that case stop
" sourcing this script before anything is downloaded.
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  finish
endif


" Try to avoid hitting E36: every table is put in its own new window.
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists in s:dataprops.
call ParseDataToProps()

" Build the toLower table from field 13 of each entry.
call BuildCaseTable("Lower", 13)

" Build the toUpper table from field 12 of each entry.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists in s:foldprops.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists in s:widthprops.
call ParseWidthProps()

" Build the double width table.  The ranges are also collected in
" s:doubletable, which BuildEmojiTable() uses.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.  The ranges are also collected in
" s:ambitable, which BuildEmojiTable() uses.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
" The version number is part of the URL and may need to be updated.
edit https://unicode.org/Public/emoji/12.1/emoji-data.txt

" Build the emoji tables. Ver. 1.0 - 6.0
" Must come after the "ambiguous" and "doublewidth" tables, because their
" ranges are excluded from the emoji_wide table.
call BuildEmojiTable()
470