xref: /vim-8.2.3635/runtime/tools/unicode.vim (revision 5f1920ad)
1" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
2" The format of the UnicodeData.txt file is explained here:
3" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
4" For the other files see the header.
5"
6" Might need to update the URL to the emoji-data.txt
7" Usage: Vim -S <this-file>
8"
9" Author: Bram Moolenaar
10" Last Update: 2010 Jan 12
11
" Parse the lines of UnicodeData.txt in the current buffer.
" Every buffer line is split on ';' (surrounding white space dropped, empty
" fields kept) and the resulting list is appended to s:dataprops.
" Stops with an error message at the first line that does not have exactly
" 15 fields; s:dataprops then only holds the lines before it.
func! ParseDataToProps()
  let s:dataprops = []
  for lnum in range(1, line('$'))
    let fields = split(getline(lnum), '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
26
" Parse the lines of CaseFolding.txt in the current buffer.
" Lines starting with '#' and lines with only white space are ignored.  Every
" other line is split on ';' (surrounding white space dropped, empty fields
" kept) and the resulting list is appended to s:foldprops.
" Stops with an error message at the first line that does not have exactly
" 4 fields.
func! ParseFoldProps()
  let s:foldprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or blank line.
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
44
" Parse the lines of EastAsianWidth.txt in the current buffer.
" Comment lines ('#' in the first column) and blank lines are skipped.  The
" remaining lines are split on ';' (surrounding white space dropped, empty
" fields kept) and each resulting list is appended to s:widthprops.
" Stops with an error message at the first line that does not have exactly
" 2 fields.
func! ParseWidthProps()
  let s:widthprops = []
  let last = line('$')
  for lnum in range(1, last)
    let text = getline(lnum)
    " One pattern for both kinds of line that carry no data.
    if text =~ '^#\|^\s*$'
      continue
    endif
    let parts = split(text, '\s*;\s*', 1)
    let count = len(parts)
    if count != 2
      echoerr 'Found ' . count . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, parts)
  endfor
endfunc
62
" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   a:name   suffix of the table name: the buffer and the C array are both
"            called "to" . a:name
"   a:index  index in each s:dataprops entry of the hex code point the
"            character maps to; entries where this field is empty are skipped
" Successive characters are merged into one range as long as the offset to
" the mapped character and the distance between the characters stay the same.
func! BuildCaseTable(name, index)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] != ''
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[a:index]) + 0
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, "static convertStruct to" . a:name . "[] =")
  call setline(2, "{")
  call append('$', ranges)
  if !empty(ranges)
    " Remove the comma after the last entry.  Without any entry the last
    " line is the "{" line, which must be left alone.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
105
" Build the foldCase table in a new buffer named "foldCase".
" Uses s:foldprops.  Only entries whose second field is 'C' or 'S' are used;
" the first field is the hex code point and the third field the hex code
" point it folds to.
" Successive characters are merged into one range as long as the offset to
" the folded character and the distance between the characters stay the same.
func! BuildFoldTable()
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:foldprops
    if p[1] == 'C' || p[1] == 'S'
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[2]) + 0
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, "static convertStruct foldCase[] =")
  call setline(2, "{")
  call append('$', ranges)
  if !empty(ranges)
    " Remove the comma after the last entry.  Without any entry the last
    " line is the "{" line, which must be left alone.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
148
" Append one table line "\t{0x<start>,0x<end>,<step>,<add>}," to the list
" a:ranges.  a:start and a:end are written in hex, a:step and a:add in
" decimal.  A step of zero is written as -1.
func! Range(ranges, start, end, step, add)
  let stepval = a:step
  if stepval == 0
    let stepval = -1
  endif
  call add(a:ranges, printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, stepval, a:add))
endfunc
153
" Build the combining table in a new buffer named "combining".
" Uses s:dataprops: the first field of each entry is the hex code point, the
" third field the property that is compared with 'Mn', 'Mc' and 'Me'.
" Code points that directly follow each other are merged into one range.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] == 'Mn' || p[2] == 'Mc' || p[2] == 'Me'
      let n = ('0x' . p[0]) + 0
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, "    static struct interval combining[] =")
  call setline(2, "    {")
  call append('$', ranges)
  if !empty(ranges)
    " Remove the comma after the last entry.  Without any entry the last
    " line is the "{" line, which must be left alone.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
190
" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.  Every range that is produced is also
" added as [start, end] to s:ambitable when a:pattern is 'A' and to
" s:doubletable otherwise; that list must exist before calling.
"   a:pattern    pattern matched (case sensitive) against the first character
"                of the second field of each s:widthprops entry
"   a:tableName  name of the new buffer and of the C array
" Gives an error and returns without creating the buffer when a character is
" beyond the last entry of s:dataprops.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] =~# a:pattern
      if p[0] =~ '\.\.'
        " It is a range.  we don't check for composing char then.
        let rng = split(p[0], '\.\.')
        if len(rng) != 2
          echoerr "Cannot parse range: '" . p[0] . "' in width table"
          " rng[1] may not exist, skip this entry.
          continue
        endif
        let n = ('0x' . rng[0]) + 0
        let n_last = ('0x' . rng[1]) + 0
      else
        let n = ('0x' . p[0]) + 0
        let n_last = n
      endif
      " Find this char in the data table.  dataidx only moves forward, the
      " search continues where the previous one stopped.
      while dataidx < datalen
        let dn = ('0x' . s:dataprops[dataidx][0]) + 0
        if dn >= n
          break
        endif
        let dataidx += 1
      endwhile
      if dataidx >= datalen
        " Never index past the end of the data table.
        echoerr "Cannot find character " . n . " in data table"
        return
      endif
      if dn != n && n_last == n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.
      let dp = s:dataprops[dataidx]
      if n_last > n || (dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me')
        if start >= 0 && end + 1 == n
          " continue with same range.
        else
          if start >= 0
            " produce previous range
            call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
            if a:pattern ==# 'A'
              call add(s:ambitable, [start, end])
            else
              call add(s:doubletable, [start, end])
            endif
          endif
          let start = n
        endif
        let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
    if a:pattern ==# 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, "    static struct interval " . a:tableName . "[] =")
  call setline(2, "    {")
  call append('$', ranges)
  if !empty(ranges)
    " Remove the comma after the last entry.  Without any entry the last
    " line is the "{" line, which must be left alone.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
264
" Build the emoji tables in two new buffers.
" Reads the current buffer: only lines that start with a digit 1-9 and match
" a:pattern are used, and of those only the first word, which is either one
" hex code point or a "first..last" range.
" Uses s:ambitable and s:doubletable.
"   a:pattern    pattern a line must match to be used
"   a:tableName  prefix for the names of the buffers and C arrays:
"                <tableName>_all    all the ranges that were found
"                <tableName>_width  only ranges starting at 0x1f000 or above,
"                                   trimmed where an end falls inside a range
"                                   of s:ambitable or s:doubletable
" In both tables ranges that directly follow each other are merged.
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~a:pattern'), 'matchstr(v:val,"^\\S\\+")')
  for line in lines
    let token = split(line, '\.\.')
    let first = ('0x' . token[0]) + 0
    if len(token) == 1
      let last = first
    else
      let last = ('0x' . token[1]) + 0
    endif

    let token = [first, last]
    if len(alltokens) > 0 && (token[0] - 1 == alltokens[-1][1])
      let alltokens[-1][1] = token[1]
    else
      call add(alltokens, token)
    endif

    " Characters below 1F000 may be considered single width traditionally,
    " making them double width causes problems.
    if first < 0x1f000
      continue
    endif

    " exclude characters that are in the "ambiguous" or "doublewidth" table
    for ambi in s:ambitable
      if first >= ambi[0] && first <= ambi[1]
        let first = ambi[1] + 1
      endif
      if last >= ambi[0] && last <= ambi[1]
        let last = ambi[0] - 1
      endif
    endfor
    for double in s:doubletable
      if first >= double[0] && first <= double[1]
        let first = double[1] + 1
      endif
      if last >= double[0] && last <= double[1]
        let last = double[0] - 1
      endif
    endfor

    if first <= last
      let token = [first, last]
      if len(widthtokens) > 0 && (token[0] - 1 == widthtokens[-1][1])
        let widthtokens[-1][1] = token[1]
      else
        call add(widthtokens, token)
      endif
    endif
  endfor
  let allranges = map(alltokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  let widthranges = map(widthtokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe "file " . a:tableName . '_all'
  call setline(1, "    static struct interval " . a:tableName . "_all[] =")
  call setline(2, "    {")
  call append('$', allranges)
  if !empty(allranges)
    " Remove the comma after the last entry.  When no line matched
    " a:pattern the last line is the "{" line, which must be left alone.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "    };")
  wincmd p

  " New buffer to put the result in.
  new
  exe "file " . a:tableName . '_width'
  call setline(1, "    static struct interval " . a:tableName . "_width[] =")
  call setline(2, "    {")
  call append('$', widthranges)
  if !empty(widthranges)
    " Remove the comma after the last entry, see above.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
343
" Try to avoid hitting E36
set equalalways

" ---- UnicodeData.txt ----
" Edit the Unicode text file (requires the netrw plugin) and parse each line
" into a list of lists.  From that build the toLower table (index 13), the
" toUpper table (index 12) and the ranges of composing chars.
execute 'edit http://unicode.org/Public/UNIDATA/UnicodeData.txt'
call ParseDataToProps()
call BuildCaseTable("Lower", 13)
call BuildCaseTable("Upper", 12)
call BuildCombiningTable()

" ---- CaseFolding.txt ----
" Edit the case folding text file (requires the netrw plugin), parse each
" line into a list of lists and build the foldCase table.
execute 'edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt'
call ParseFoldProps()
call BuildFoldTable()

" ---- EastAsianWidth.txt ----
" Edit the width text file (requires the netrw plugin) and parse each line
" into a list of lists.
execute 'edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt'
call ParseWidthProps()

" BuildWidthTable() adds the ranges it finds to one of these lists.
let s:doubletable = []
let s:ambitable = []

" Build the double width table, then the ambiguous width table.
call BuildWidthTable('[WF]', 'doublewidth')
call BuildWidthTable('A', 'ambiguous')

" ---- emoji-data.txt ----
" Edit the emoji text file.  Requires the netrw plugin.
execute 'edit https://www.unicode.org/Public/emoji/11.0/emoji-data.txt'
"edit http://www.unicode.org/Public/emoji/latest/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" table
call BuildEmojiTable('; Emoji\s\+#\s\+\d\+\.\d', 'emoji')
392