" xref: /vim-8.2.3635/runtime/tools/unicode.vim (revision d887297a)
" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2020 Aug 24
113e8cb587SBram Moolenaar
" Split every line of the current buffer (expected to hold UnicodeData.txt)
" on ';' and collect the field lists in s:dataprops.
" Each line must yield exactly 15 fields; at the first line that does not, an
" error is reported and parsing stops, leaving the lines parsed so far in
" s:dataprops.
func! ParseDataToProps()
  let s:dataprops = []
  for lnum in range(1, line('$'))
    " Keep empty fields: the position of a field is what matters.
    let fields = split(getline(lnum), '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
263e8cb587SBram Moolenaar
" Collect the data lines of the current buffer (expected to hold
" CaseFolding.txt) in s:foldprops, one list of fields per line.
" Lines starting with '#' and blank lines are ignored.  Every other line is
" split on ';' and must yield exactly 4 items; otherwise an error is reported
" and parsing stops.
func! ParseFoldProps()
  let s:foldprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or empty line.
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
443e8cb587SBram Moolenaar
" Collect the data lines of the current buffer (expected to hold
" EastAsianWidth.txt) in s:widthprops, one [codepoints, width] pair per line.
" Lines starting with '#' and blank lines are ignored.  Every other line is
" split on ';' and must yield exactly 2 items; otherwise an error is reported
" and parsing stops.
func! ParseWidthProps()
  let s:widthprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or empty line.
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
623e8cb587SBram Moolenaar
" Build the "to{name}" table (toLower or toUpper) in a new buffer.
" Uses s:dataprops; "index" selects the field that holds the mapped code
" point.  Consecutive characters that have the same offset to their mapping
" and a constant distance between them are merged into one entry, which is
" formatted by Range().
func! BuildCaseTable(name, index)
  let first = -1
  let last = -1
  let stride = 0
  let offset = -1
  let rows = []
  for prop in s:dataprops
    if prop[a:index] == ''
      " This character has no mapping.
      continue
    endif
    let code = ('0x' . prop[0]) + 0
    let mapped = ('0x' . prop[a:index]) + 0
    if first >= 0 && offset == mapped - code && (stride == 0 || code - last == stride)
      " Same offset and spacing: the current range grows.
      let stride = code - last
    else
      if first >= 0
        " Flush the range collected so far.
        call Range(rows, first, last, stride, offset)
      endif
      let first = code
      let stride = 0
      let offset = mapped - code
    endif
    let last = code
  endfor
  if first >= 0
    call Range(rows, first, last, stride, offset)
  endif

  " Assemble the table text; the last line loses its trailing comma.
  let text = ["static convertStruct to" . a:name . "[] =", "{"] + rows
  let text[-1] = text[-1][:-2]
  call add(text, "};")

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, text)
  wincmd p
endfunc
1053e8cb587SBram Moolenaar
" Build the foldCase table in a new buffer.
" Uses s:foldprops; only entries with status 'C' or 'S' are used.
" Consecutive characters that have the same offset to their folded form and a
" constant distance between them are merged into one entry, which is
" formatted by Range().
func! BuildFoldTable()
  let first = -1
  let last = -1
  let stride = 0
  let offset = -1
  let rows = []
  for prop in s:foldprops
    if prop[1] != 'C' && prop[1] != 'S'
      " Not a status this table uses.
      continue
    endif
    let code = ('0x' . prop[0]) + 0
    let folded = ('0x' . prop[2]) + 0
    if first >= 0 && offset == folded - code && (stride == 0 || code - last == stride)
      " Same offset and spacing: the current range grows.
      let stride = code - last
    else
      if first >= 0
        " Flush the range collected so far.
        call Range(rows, first, last, stride, offset)
      endif
      let first = code
      let stride = 0
      let offset = folded - code
    endif
    let last = code
  endfor
  if first >= 0
    call Range(rows, first, last, stride, offset)
  endif

  " Assemble the table text; the last line loses its trailing comma.
  let text = ["static convertStruct foldCase[] =", "{"] + rows
  let text[-1] = text[-1][:-2]
  call add(text, "};")

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, text)
  wincmd p
endfunc
1483e8cb587SBram Moolenaar
" Append one table entry "{start,end,step,add}," to the list "ranges".
" "start" and "end" are written in hex, "step" and "add" in decimal.
" A step of zero is written as -1.
func! Range(ranges, start, end, step, add)
  if a:step == 0
    let step = -1
  else
    let step = a:step
  endif
  call add(a:ranges, printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, step, a:add))
endfunc
1533e8cb587SBram Moolenaar
" Build the "combining" table in a new buffer.
" Uses s:dataprops: every character whose category (field 2) is Mn, Mc or Me
" is included, and adjacent code points are merged into one interval.
func! BuildCombiningTable()
  let first = -1
  let last = -1
  let rows = []
  for prop in s:dataprops
    if prop[2] != 'Mn' && prop[2] != 'Mc' && prop[2] != 'Me'
      continue
    endif
    let code = ('0x' . prop[0]) + 0
    if first < 0 || last + 1 != code
      " Not adjacent to the current interval: flush it and start a new one.
      if first >= 0
        call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
      endif
      let first = code
    endif
    let last = code
  endfor
  if first >= 0
    call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
  endif

  " Assemble the table text; the last line loses its trailing comma.
  let text = ["    static struct interval combining[] =", "    {"] + rows
  let text[-1] = text[-1][:-2]
  call add(text, "    };")

  " New buffer to put the result in.
  new
  file combining
  call setline(1, text)
  wincmd p
endfunc
1903e8cb587SBram Moolenaar
" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.  Every interval that is written is also
" appended to s:ambitable (when "pattern" is 'A') or to s:doubletable (any
" other pattern); the caller must have created that list.
"   pattern    matched against the first character of the width property
"   tableName  name of the generated table and of the new buffer
" Entries that cannot be used are reported with an error and skipped: a range
" that does not have the form "first..last", and a single character that lies
" beyond the last entry of s:dataprops.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  if a:pattern == 'A'
    let spc = '    '
  else
    let spc = "\t"
  endif
  for p in s:widthprops
    if p[1][0] !~ a:pattern
      continue
    endif
    if p[0] =~ '\.\.'
      " It is a range.  We don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Without both ends there is nothing usable in this entry.
        continue
      endif
      let n = ('0x' . rng[0]) + 0
      let n_last = ('0x' . rng[1]) + 0
    else
      let n = ('0x' . p[0]) + 0
      let n_last = n
    endif

    " Find this char in the data table.  The scan is bounded so that it can
    " never index past the end of s:dataprops.
    while dataidx < datalen
      let dn = ('0x' . s:dataprops[dataidx][0]) + 0
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile

    if n_last > n
      " Use all chars from a range.
      let usechar = 1
    elseif dataidx >= datalen
      " Beyond the last entry of the data table: nothing to check against.
      echoerr "Cannot find character " . n . " in data table"
      let usechar = 0
    else
      if dn != n && n_last == n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      let dp = s:dataprops[dataidx]
      let usechar = dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me'
    endif

    if usechar
      if start < 0 || end + 1 != n
        " Not adjacent to the current interval: flush it and start a new one.
        if start >= 0
          call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
          if a:pattern == 'A'
            call add(s:ambitable, [start, end])
          else
            call add(s:doubletable, [start, end])
          endif
        endif
        let start = n
      endif
      let end = n_last
    endif
  endfor
  if start >= 0
    call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
    if a:pattern == 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  if a:pattern == 'A'
    call setline(1, "static struct interval " . a:tableName . "[] =")
    call setline(2, "{")
  else
    call setline(1, "    static struct interval " . a:tableName . "[] =")
    call setline(2, "    {")
  endif
  call append('$', ranges)
  " remove last comma
  call setline('$', getline('$')[:-2])
  if a:pattern == 'A'
    call setline(line('$') + 1, "};")
  else
    call setline(line('$') + 1, "    };")
  endif
  wincmd p
endfunc
2803e8cb587SBram Moolenaar
281207f0093SBram Moolenaar
" Add the characters named by the items of "lines" to the dictionary
" "chardict".  An item starts with a hex code point ("12ab ..") or with a
" range of them ("12ab..56cd ..."); every code point covered becomes a key
" of "chardict" with value 1.
func AddLinesToCharDict(lines, chardict)
  for entry in a:lines
    let parts = split(entry, '\.\.')
    let lo = str2nr(parts[0], 16)
    " A single code point is a range of one.
    let hi = len(parts) == 1 ? lo : str2nr(parts[1], 16)
    for code in range(lo, hi)
      let a:chardict[code] = 1
    endfor
  endfor
endfunc
298b86f10eeSBram Moolenaar
" Self-test for AddLinesToCharDict(): single code points, a range, and a
" code point that is already covered by the range.
" Returns 0 when v:errors is empty after the check, otherwise reports
" v:errors and returns 1.
func Test_AddLinesToCharDict()
  let input = [
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ]
  let expected = {0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }
  let actual = {}
  call AddLinesToCharDict(input, actual)
  call assert_equal(expected, actual)
  if v:errors == []
    return 0
  endif
  echoerr 'AddLinesToCharDict' v:errors
  return 1
endfunc
316207f0093SBram Moolenaar
317207f0093SBram Moolenaar
" Turn the keys of "chardict" (character numbers) into a sorted list of
" [low, high] pairs, where each pair covers a run of consecutive numbers.
" Returns an empty list when "chardict" has no entries.
func CharDictToPairList(chardict)
  let result = []
  " Dictionary keys are strings; convert them before the numeric sort.
  let codes = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  if empty(codes)
    " Nothing to pair up; indexing codes[0] below would fail.
    return result
  endif
  let low = codes[0]
  let high = low
  for code in codes[1:]
    if code > high + 1
      " Gap found: close the current run and start a new one.
      call add(result, [low, high])
      let low = code
    endif
    let high = code
  endfor
  call add(result, [low, high])
  return result
endfunc
335b86f10eeSBram Moolenaar
" Self-test for CharDictToPairList(): a run of three, two isolated
" characters and a run of two.
" Returns 0 when v:errors is empty after the check, otherwise reports
" v:errors and returns 1.
func Test_CharDictToPairList()
  let input = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  let expected = [
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ]
  call assert_equal(expected, CharDictToPairList(input))
  if v:errors == []
    return 0
  endif
  echoerr 'CharDictToPairList' v:errors
  return 1
endfunc
353207f0093SBram Moolenaar
354207f0093SBram Moolenaar
" Build the emoji tables, each in a new buffer, from the lines of the current
" buffer (expected to hold emoji-data.txt).  Only lines starting with a digit
" 1-9 are considered.
"   emoji_all   characters with the "Emoji" property
"   emoji_wide  characters with the "Emoji_Presentation" or
"               "Emoji_Modifier_Base" property that are not covered by
"               s:ambitable or s:doubletable
func BuildEmojiTable()
  " First make the table for all emojis.
  let pattern = '; Emoji\s\+#\s'
  let lines = getline(1, '$')->filter('v:val=~"^[1-9]"')->filter('v:val=~pattern')->map('matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character, then turn it into
  " formatted intervals.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)
  let allranges = CharDictToPairList(chardict)->map('printf("    {0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " Assemble the table text; the last line loses its trailing comma.
  let text = ["static struct interval emoji_all[] =", "{"] + allranges
  let text[-1] = text[-1][:-2]
  call add(text, "};")

  " New buffer to put the result in.
  new
  exe 'file emoji_all'
  call setline(1, text)
  wincmd p

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let lines = getline(1, '$')->filter('v:val=~"^[1-9]"')->filter('v:val=~pattern')->map('matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)

  " Exclude characters that are in the "ambiguous" or "doublewidth" table.
  for [lo, hi] in s:ambitable
    for nr in range(lo, hi)
      if has_key(chardict, nr)
        call remove(chardict, nr)
      endif
    endfor
  endfor
  for [lo, hi] in s:doubletable
    for nr in range(lo, hi)
      if has_key(chardict, nr)
        call remove(chardict, nr)
      endif
    endfor
  endfor

  let wide_ranges = CharDictToPairList(chardict)->map('printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " Assemble the table text; the last line loses its trailing comma.
  let text = ["    static struct interval emoji_wide[] =", "    {"] + wide_ranges
  let text[-1] = text[-1][:-2]
  call add(text, "    };")

  " New buffer to put the result in.
  new
  exe 'file emoji_wide'
  call setline(1, text)
  wincmd p
endfunc
4153e8cb587SBram Moolenaar
" Run the self-tests first; stop sourcing this script when one of them fails.
let v:errors = []
if Test_AddLinesToCharDict()
  finish
endif
if Test_CharDictToPairList()
  finish
endif

" Try to avoid hitting E36
set equalalways

" The width tables record their intervals in these lists, which
" BuildEmojiTable() reads afterwards.
let s:doubletable = []
let s:ambitable = []

" UnicodeData.txt: toLower, toUpper and composing character tables.
" Editing a URL requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt
call ParseDataToProps()
call BuildCaseTable('Lower', 13)
call BuildCaseTable('Upper', 12)
call BuildCombiningTable()

" CaseFolding.txt: foldCase table.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
call ParseFoldProps()
call BuildFoldTable()

" EastAsianWidth.txt: double width table first, then ambiguous width table.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
call ParseWidthProps()
call BuildWidthTable('[WF]', 'doublewidth')
call BuildWidthTable('A', 'ambiguous')

" emoji-data.txt: build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" and "doublewidth" tables
edit https://unicode.org/Public/emoji/12.1/emoji-data.txt
call BuildEmojiTable()
470