" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2020 Aug 24

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line of the current buffer must split into exactly 15 ';'-separated
" fields; parsing stops with an error message at the first line that does not.
func! ParseDataToProps()
  let s:dataprops = []
  for lnum in range(1, line('$'))
    let fields = split(getline(lnum), '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Lines starting with '#' and blank lines are skipped; every other line must
" split into exactly 4 fields.
func! ParseFoldProps()
  let s:foldprops = []
  for lnum in range(1, line('$'))
    let line = getline(lnum)
    if line =~ '^#' || line =~ '^\s*$'
      continue
    endif
    let fields = split(line, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Lines starting with '#' and blank lines are skipped; every other line must
" split into exactly 2 fields.
func! ParseWidthProps()
  let s:widthprops = []
  for lnum in range(1, line('$'))
    let line = getline(lnum)
    if line =~ '^#' || line =~ '^\s*$'
      continue
    endif
    let fields = split(line, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc

" Write a C table into a new buffer named "bufname": the "header" line, the
" "open" line, the lines in "ranges" with the comma of the last one removed,
" and the "close" line.  Goes back to the previous window afterwards.
func! s:WriteTable(bufname, header, open, ranges, close)
  new
  exe 'file ' . a:bufname
  call setline(1, a:header)
  call setline(2, a:open)
  if !empty(a:ranges)
    call append('$', a:ranges)
    " Remove the last comma.  Skipped for an empty table, otherwise the
    " "open" line would lose its last character instead.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, a:close)
  wincmd p
endfunc

" Turn a list of [char, converted-char] number pairs into convertStruct
" entries.  Consecutive pairs are merged into one entry as long as the offset
" (converted - char) stays the same and the chars are a constant step apart.
" Returns the list of formatted lines produced by Range().
func! s:MakeConvertRanges(pairs)
  let ranges = []
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  for [n, nl] in a:pairs
    if start >= 0 && add == nl - n && (step == 0 || n - end == step)
      " continue with same range.
      let step = n - end
      let end = n
    else
      if start >= 0
        " produce previous range
        call Range(ranges, start, end, step, add)
      endif
      let start = n
      let end = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif
  return ranges
endfunc

" Build the toLower or toUpper table in a new buffer.
" "name" is appended to "to" for the buffer and table name, "index" is the
" UnicodeData.txt field that holds the converted character.
" Uses s:dataprops.
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    if p[a:index] != ''
      call add(pairs, [str2nr(p[0], 16), str2nr(p[a:index], 16)])
    endif
  endfor
  call s:WriteTable('to' . a:name,
	\ 'static convertStruct to' . a:name . '[] =', '{',
	\ s:MakeConvertRanges(pairs), '};')
endfunc

" Build the foldCase table in a new buffer.
" Only entries with status 'C' or 'S' are used.
" Uses s:foldprops.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    if p[1] == 'C' || p[1] == 'S'
      call add(pairs, [str2nr(p[0], 16), str2nr(p[2], 16)])
    endif
  endfor
  call s:WriteTable('foldCase',
	\ 'static convertStruct foldCase[] =', '{',
	\ s:MakeConvertRanges(pairs), '};')
endfunc

" Append one convertStruct entry "{start,end,step,add}," to list "ranges".
" A step of zero (single character) is written as -1.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table: intervals of adjacent characters with general
" category Mn, Mc or Me.
" Uses s:dataprops.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] == 'Mn' || p[2] == 'Mc' || p[2] == 'Me'
      let n = str2nr(p[0], 16)
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  " NOTE(review): the leading white space of these literals may have been
  " collapsed when this file was extracted - confirm against mbyte.c.
  call s:WriteTable('combining',
	\ " static struct interval combining[] =", " {",
	\ ranges, " };")
endfunc

" Build the double width or ambiguous width table in a new buffer.
" "pattern" is matched against the first character of the width property,
" "tableName" is used for the buffer and the C table.  The intervals are also
" appended to s:ambitable (pattern 'A') or s:doubletable (anything else).
" Uses s:widthprops and s:dataprops.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let intervals = []
  let dataidx = 0
  let ndata = len(s:dataprops)
  let dn = -1
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  " NOTE(review): the number of spaces may have been collapsed when this file
  " was extracted - confirm against mbyte.c.
  if a:pattern == 'A'
    let spc = ' '
  else
    let spc = "\t"
  endif
  for p in s:widthprops
    if p[1][0] =~ a:pattern
      if p[0] =~ '\.\.'
        " It is a range. we don't check for composing char then.
        let rng = split(p[0], '\.\.')
        if len(rng) != 2
          echoerr "Cannot parse range: '" . p[0] . "' in width table"
          " Skip this entry instead of indexing a list of the wrong length.
          continue
        endif
        let n = str2nr(rng[0], 16)
        let n_last = str2nr(rng[1], 16)
      else
        let n = str2nr(p[0], 16)
        let n_last = n
      endif
      " Find this char in the data table.  Both tables are walked in
      " increasing order, so the search resumes where the previous one
      " stopped.  Never run past the end of the data table.
      while dataidx < ndata
        let dn = str2nr(s:dataprops[dataidx][0], 16)
        if dn >= n
          break
        endif
        let dataidx += 1
      endwhile
      let found = dataidx < ndata
      if n_last == n && (!found || dn != n)
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.
      let category = found ? s:dataprops[dataidx][2] : ''
      if n_last > n || (category != 'Mn' && category != 'Mc' && category != 'Me')
        if start < 0 || end + 1 != n
          if start >= 0
            " produce previous range
            call add(intervals, [start, end])
          endif
          let start = n
        endif
        let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(intervals, [start, end])
  endif

  " Remember the intervals for building the emoji table later.
  if a:pattern == 'A'
    call extend(s:ambitable, intervals)
  else
    call extend(s:doubletable, intervals)
  endif

  let ranges = []
  for [first, last] in intervals
    call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, first, last))
  endfor

  " New buffer to put the result in.
  if a:pattern == 'A'
    call s:WriteTable(a:tableName,
	  \ "static struct interval " . a:tableName . "[] =", "{",
	  \ ranges, "};")
  else
    call s:WriteTable(a:tableName,
	  \ " static struct interval " . a:tableName . "[] =", " {",
	  \ ranges, " };")
  endif
endfunc


" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
" and put them in dictionary "chardict".  Every character number in the range
" becomes a key with value 1.  Empty lines are ignored.
func! AddLinesToCharDict(lines, chardict)
  for line in a:lines
    let tokens = split(line, '\.\.')
    if empty(tokens)
      continue
    endif
    let first = str2nr(tokens[0], 16)
    let last = len(tokens) == 1 ? first : str2nr(tokens[1], 16)
    for nr in range(first, last)
      let a:chardict[nr] = 1
    endfor
  endfor
endfunc

" Self test for AddLinesToCharDict().  Returns 1 on failure, 0 on success.
func! Test_AddLinesToCharDict()
  let dict = {}
  call AddLinesToCharDict([
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ], dict)
  call assert_equal({0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }, dict)
  if v:errors != []
    echoerr 'AddLinesToCharDict' v:errors
    return 1
  endif
  return 0
endfunc


" Convert the keys of "chardict" (character numbers) into a sorted list of
" [low, high] pairs, merging adjacent numbers into one pair.
" Returns an empty list for an empty dictionary.
func! CharDictToPairList(chardict)
  let result = []
  let codes = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  if empty(codes)
    return result
  endif
  let low = codes[0]
  let high = codes[0]
  for code in codes
    if code > high + 1
      " There is a gap: finish the previous pair and start a new one.
      call add(result, [low, high])
      let low = code
    endif
    let high = code
  endfor
  call add(result, [low, high])
  return result
endfunc

" Self test for CharDictToPairList().  Returns 1 on failure, 0 on success.
func! Test_CharDictToPairList()
  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  call assert_equal([
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ], CharDictToPairList(dict))
  if v:errors != []
    echoerr 'CharDictToPairList' v:errors
    return 1
  endif
  return 0
endfunc


" Return the first word (character or "first..last" range) of every line in
" the current buffer that starts with a digit 1-9 and matches "pattern".
func! s:EmojiCodes(pattern)
  let lines = filter(getline(1, '$'), 'v:val=~"^[1-9]" && v:val=~a:pattern')
  return map(lines, 'matchstr(v:val,"^\\S\\+")')
endfunc

" Write an emoji table into a new buffer named "bufname": the "header" line,
" the "open" line, the lines in "ranges" with the comma of the last one
" removed, and the "close" line.  Goes back to the previous window afterwards.
func! s:PutEmojiTable(bufname, header, open, ranges, close)
  new
  exe 'file ' . a:bufname
  call setline(1, a:header)
  call setline(2, a:open)
  if !empty(a:ranges)
    call append('$', a:ranges)
    " Remove the last comma.  Skipped for an empty table, otherwise the
    " "open" line would lose its last character instead.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, a:close)
  wincmd p
endfunc

" Build the emoji width table in a new buffer.
" The current buffer must contain emoji-data.txt.  Two tables are produced:
" "emoji_all" with every Emoji character and "emoji_wide" with the
" Emoji_Presentation and Emoji_Modifier_Base characters that are not already
" in s:ambitable or s:doubletable.
func! BuildEmojiTable()
  " First make the table for all emojis.
  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(s:EmojiCodes('; Emoji\s\+#\s'), chardict)
  " CharDictToPairList() is only called when there is something to convert.
  let pairlist = empty(chardict) ? [] : CharDictToPairList(chardict)
  " NOTE(review): the leading white space of this literal may have been
  " collapsed when this file was extracted - confirm against mbyte.c.
  let allranges = map(pairlist, 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  call s:PutEmojiTable('emoji_all',
	\ "static struct interval emoji_all[] =", "{",
	\ allranges, "};")

  " Make the table for wide emojis.
  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(
	\ s:EmojiCodes('; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'),
	\ chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for known in s:ambitable + s:doubletable
    for nr in range(known[0], known[1])
      if has_key(chardict, nr)
        call remove(chardict, nr)
      endif
    endfor
  endfor

  let pairlist = empty(chardict) ? [] : CharDictToPairList(chardict)
  let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  call s:PutEmojiTable('emoji_wide',
	\ " static struct interval emoji_wide[] =", " {",
	\ wide_ranges, " };")
endfunc

" First test a few things
let v:errors = []
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  finish
endif


" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
" The list must exist before the call, BuildWidthTable() appends to it.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
" The list must exist before the call, BuildWidthTable() appends to it.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit https://unicode.org/Public/emoji/12.1/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" and "doublewidth" tables
call BuildEmojiTable()