xref: /vim-8.2.3635/runtime/tools/unicode.vim (revision cb03397a)
" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header of each file.
"
" Usage: vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2010 Jan 12
10
" Read UnicodeData.txt from the current buffer into s:dataprops.
" Every buffer line is split on ';' (surrounding white space dropped, empty
" fields kept) and must yield exactly 15 fields.  s:dataprops becomes a list
" with one such field list per line.  On the first malformed line an error is
" reported and parsing stops, leaving the lines read so far in s:dataprops.
func! ParseDataToProps()
  let s:dataprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
25
" Read CaseFolding.txt from the current buffer into s:foldprops.
" Lines starting with '#' and lines holding only white space are ignored.
" Every other line is split on ';' and must yield exactly 4 fields; the
" field lists are collected in s:foldprops.  On the first malformed line an
" error is reported and parsing stops.
func! ParseFoldProps()
  let s:foldprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    if text =~ '^#' || text =~ '^\s*$'
      " comment or blank line
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
43
" Read EastAsianWidth.txt from the current buffer into s:widthprops.
" Lines starting with '#' and lines holding only white space are ignored.
" Every other line is split on ';' and must yield exactly 2 fields; the
" field lists are collected in s:widthprops.  On the first malformed line an
" error is reported and parsing stops.
func! ParseWidthProps()
  let s:widthprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " comment or blank line
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
61
" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   name:  suffix of the table name; the buffer and the C array are called
"          "to" . name
"   index: index of the field in each s:dataprops entry that holds the
"          mapped code point (this script passes 13 for Lower, 12 for Upper)
" Consecutive entries that share the same offset (mapped - code) and lie a
" constant step apart are merged into one {start, end, step, offset} line
" produced by Range().
func! BuildCaseTable(name, index)
  let ranges = []
  let [first, last, stride, delta] = [-1, -1, 0, -1]
  for props in s:dataprops
    if props[a:index] == ''
      " no mapping for this character
      continue
    endif
    let code = ('0x' . props[0]) + 0
    let mapped = ('0x' . props[a:index]) + 0
    if first >= 0 && delta == mapped - code && (stride == 0 || code - last == stride)
      " Same offset and same spacing: extend the current range.
      let stride = code - last
    else
      if first >= 0
        " Flush the range collected so far.
        call Range(ranges, first, last, stride, delta)
      endif
      let first = code
      let stride = 0
      let delta = mapped - code
    endif
    let last = code
  endfor
  if first >= 0
    call Range(ranges, first, last, stride, delta)
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  exe "file to" . a:name
  call setline(1, ["static convertStruct to" . a:name . "[] =", "{"])
  call append('$', ranges)
  " Drop the last character of the last line: the trailing comma.
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
104
" Build the foldCase table in a new buffer.
" Uses s:foldprops.
" Only entries whose second field is 'C' or 'S' are used.  Consecutive
" entries that share the same offset (folded - code) and lie a constant step
" apart are merged into one {start, end, step, offset} line produced by
" Range().
func! BuildFoldTable()
  let ranges = []
  let [first, last, stride, delta] = [-1, -1, 0, -1]
  for props in s:foldprops
    if props[1] != 'C' && props[1] != 'S'
      continue
    endif
    let code = ('0x' . props[0]) + 0
    let folded = ('0x' . props[2]) + 0
    if first >= 0 && delta == folded - code && (stride == 0 || code - last == stride)
      " Same offset and same spacing: extend the current range.
      let stride = code - last
    else
      if first >= 0
        " Flush the range collected so far.
        call Range(ranges, first, last, stride, delta)
      endif
      let first = code
      let stride = 0
      let delta = folded - code
    endif
    let last = code
  endfor
  if first >= 0
    call Range(ranges, first, last, stride, delta)
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  file foldCase
  call setline(1, ["static convertStruct foldCase[] =", "{"])
  call append('$', ranges)
  " Drop the last character of the last line: the trailing comma.
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
147
" Append one "{start,end,step,add}," initializer line to the list a:ranges.
" start and end are written in hex, step and add in decimal.  A step of 0
" (a range that was never extended) is written as -1.
func! Range(ranges, start, end, step, add)
  if a:step == 0
    let stepfield = -1
  else
    let stepfield = a:step
  endif
  call add(a:ranges, printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, stepfield, a:add))
endfunc
152
" Build the combining table in a new buffer.
" Uses s:dataprops.
" Collects the characters whose third field (index 2) is 'Mn', 'Mc' or 'Me'
" and merges runs of adjacent code points into {first, last} intervals.
func! BuildCombiningTable()
  let ranges = []
  let lo = -1
  let hi = -1
  for props in s:dataprops
    if props[2] != 'Mn' && props[2] != 'Mc' && props[2] != 'Me'
      continue
    endif
    let code = ('0x' . props[0]) + 0
    if lo < 0 || hi + 1 != code
      " Not adjacent to the current interval: flush it and start a new one.
      if lo >= 0
        call add(ranges, printf("\t{0x%04x, 0x%04x},", lo, hi))
      endif
      let lo = code
    endif
    let hi = code
  endfor
  if lo >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", lo, hi))
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  file combining
  call setline(1, ["    static struct interval combining[] =", "    {"])
  call append('$', ranges)
  " Drop the last character of the last line: the trailing comma.
  call setline('$', getline('$')[:-2])
  call append('$', "    };")
  wincmd p
endfunc
189
" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   pattern:   regexp matched against the first character of the width
"              property of each s:widthprops entry
"   tableName: name of the generated C array and of the result buffer
" Every interval that is produced is also recorded as [start, end] in
" s:ambitable when pattern is 'A', otherwise in s:doubletable.  The caller
" must have created that list before calling this function.
"
" A single character is only used when its entry in s:dataprops is not a
" composing character ('Mn', 'Mc' or 'Me'); ranges are always used.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] =~ a:pattern
      if p[0] =~ '\.\.'
        " It is a range.  We don't check for composing char then.
        let rng = split(p[0], '\.\.')
        if len(rng) != 2
          echoerr "Cannot parse range: '" . p[0] . "' in width table"
          " rng[1] may not exist; skip this entry instead of carrying on with
          " values left over from the previous entry.
          continue
        endif
        let n = ('0x' . rng[0]) + 0
        let n_last = ('0x' . rng[1]) + 0
      else
        let n = ('0x' . p[0]) + 0
        let n_last = n
      endif
      " Find this char in the data table.  dataidx is kept between entries.
      " The scan stops at the end of the table, so that a code point above
      " the last data entry cannot cause an endless loop.
      let dn = -1
      while dataidx < datalen
        let dn = ('0x' . s:dataprops[dataidx][0]) + 0
        if dn >= n
          break
        endif
        let dataidx += 1
      endwhile
      if dn != n && n_last == n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.  When the scan ran off the end of
      " the data table there is no entry to consult and the char is used.
      if n_last > n || dataidx >= datalen
        let usechar = 1
      else
        let dp = s:dataprops[dataidx]
        let usechar = dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me'
      endif
      if usechar
        if start >= 0 && end + 1 == n
          " continue with same range.
        else
          if start >= 0
            " produce previous range
            call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
            if a:pattern == 'A'
              call add(s:ambitable, [start, end])
            else
              call add(s:doubletable, [start, end])
            endif
          endif
          let start = n
        endif
        let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
    if a:pattern == 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, "    static struct interval " . a:tableName . "[] =")
  call setline(2, "    {")
  call append('$', ranges)
  if !empty(ranges)
    " Only strip when a range line was added; otherwise the last line is the
    " opening brace.
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
263
" Build the emoji tables in two new buffers, from the emoji data file in the
" current buffer.  Uses s:ambitable and s:doubletable.
"   pattern:   regexp a line must match to be used
"   tableName: base name; the tables are called tableName . '_all' and
"              tableName . '_width'
" The "_all" table holds every selected code point, with adjacent entries
" merged.  The "_width" table holds only code points from 0x1f000 up, trimmed
" where an entry starts or ends inside an interval of the ambiguous or
" double width tables.
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  " Keep the lines that start with a digit 1-9 and match the pattern, then
  " reduce each one to its first white-space delimited word: a code point or
  " a "first..last" range.
  let entries = filter(getline(1, '$'), 'v:val=~"^[1-9]"')
  let entries = filter(entries, 'v:val=~a:pattern')
  let entries = map(entries, 'matchstr(v:val,"^\\S\\+")')
  for entry in entries
    let bounds = split(entry, '\.\.')
    let first = ('0x' . bounds[0]) + 0
    let last = len(bounds) == 1 ? first : ('0x' . bounds[1]) + 0

    " Extend the previous interval when this one directly follows it.
    if !empty(alltokens) && alltokens[-1][1] == first - 1
      let alltokens[-1][1] = last
    else
      call add(alltokens, [first, last])
    endif

    " Characters below 1F000 may be considered single width traditionally,
    " making them double width causes problems.
    if first >= 0x1f000
      " Exclude characters that are in the "ambiguous" or "doublewidth"
      " table.  The ambiguous intervals are checked first.
      for taken in s:ambitable + s:doubletable
        if first >= taken[0] && first <= taken[1]
          let first = taken[1] + 1
        endif
        if last >= taken[0] && last <= taken[1]
          let last = taken[0] - 1
        endif
      endfor

      if first <= last
        if !empty(widthtokens) && widthtokens[-1][1] == first - 1
          let widthtokens[-1][1] = last
        else
          call add(widthtokens, [first, last])
        endif
      endif
    endif
  endfor
  let allranges = map(alltokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  let widthranges = map(widthtokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " One new buffer per table; go back to the previous window after each.
  for [suffix, rows] in [['_all', allranges], ['_width', widthranges]]
    new
    exe "file " . a:tableName . suffix
    call setline(1, "    static struct interval " . a:tableName . suffix . "[] =")
    call setline(2, "    {")
    call append('$', rows)
    call setline('$', getline('$')[:-2])  " remove last comma
    call setline(line('$') + 1, "    };")
    wincmd p
  endfor
endfunc
342
" Driver: fetch each Unicode data file, parse it and build the tables.
" Every Build*() call opens a new window holding one generated table.
" The order of the steps matters: the parse functions read the current
" buffer, and the later tables use results of the earlier steps.

" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists (s:dataprops).
call ParseDataToProps()

" Build the toLower table, using field 13 of each entry as the mapping.
call BuildCaseTable("Lower", 13)

" Build the toUpper table, using field 12 of each entry as the mapping.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists (s:foldprops).
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists (s:widthprops).
call ParseWidthProps()

" Build the double width table.
" BuildWidthTable() also records the intervals in s:doubletable, which must
" exist before the call.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
" BuildWidthTable() also records the intervals in s:ambitable, which must
" exist before the call.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/emoji/3.0/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" and "doublewidth" tables: BuildEmojiTable()
" reads both s:ambitable and s:doubletable.
call BuildEmojiTable('; Emoji\s\+# [1-6]\.[0-9]', 'emoji')
390