" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2020 Aug 24

" Split every line of the current buffer on ';' (surrounding white space is
" dropped, empty fields are kept) and append the resulting list to "props".
"   props          List that receives one list of fields per parsed line.
"   expected       Required number of fields per line.
"   skip_comments  When non-zero, lines starting with '#' and blank lines are
"                  ignored instead of being parsed.
" Parsing stops with an error message at the first line that does not have
" "expected" fields; lines parsed before that remain in "props".
func! s:ParseFields(props, expected, skip_comments)
  for lnum in range(1, line('$'))
    let line = getline(lnum)
    if a:skip_comments && (line =~ '^#' || line =~ '^\s*$')
      continue
    endif
    let fields = split(line, '\s*;\s*', 1)
    if len(fields) != a:expected
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected ' . a:expected
      return
    endif
    call add(a:props, fields)
  endfor
endfunc

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line of the current buffer must have 15 fields.
func! ParseDataToProps()
  let s:dataprops = []
  call s:ParseFields(s:dataprops, 15, 0)
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Comment lines and blank lines are skipped, other lines must have 4 fields.
func! ParseFoldProps()
  let s:foldprops = []
  call s:ParseFields(s:foldprops, 4, 1)
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Comment lines and blank lines are skipped, other lines must have 2 fields.
func! ParseWidthProps()
  let s:widthprops = []
  call s:ParseFields(s:widthprops, 2, 1)
endfunc

" Turn a list of [char, mapped-char] number pairs into convertStruct lines.
" Successive pairs are merged into one range as long as the difference
" between mapped-char and char stays the same and the characters are a fixed
" step apart.  Returns the list of lines produced by Range().
func! s:ConvertRanges(pairs)
  let first = -1
  let last = -1
  let step = 0
  let add = -1
  let ranges = []
  for [n, nl] in a:pairs
    if first >= 0 && add == nl - n && (step == 0 || n - last == step)
      " continue with same range.
      let step = n - last
      let last = n
    else
      if first >= 0
        " produce previous range
        call Range(ranges, first, last, step, add)
      endif
      let first = n
      let last = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if first >= 0
    call Range(ranges, first, last, step, add)
  endif
  return ranges
endfunc

" Put a "static convertStruct {tablename}[]" table holding "ranges" in a new
" buffer named "tablename", then go back to the previous window.
func! s:PutConvertTable(tablename, ranges)
  new
  exe "file " . a:tablename
  call setline(1, "static convertStruct " . a:tablename . "[] =")
  call setline(2, "{")
  call append('$', a:ranges)
  if !empty(a:ranges)
    " Remove last comma.  Without any range the last line is the "{" line,
    " which must be left alone.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   name   "Lower" or "Upper"; the table and the buffer are called "to{name}".
"   index  Index of the s:dataprops field that holds the mapped character.
"          Entries where this field is empty are skipped.
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    if p[a:index] != ''
      call add(pairs, [str2nr(p[0], 16), str2nr(p[a:index], 16)])
    endif
  endfor
  call s:PutConvertTable('to' . a:name, s:ConvertRanges(pairs))
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.  Only entries with status 'C' or 'S' in field 1 are used;
" field 2 holds the folded character.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    if p[1] == 'C' || p[1] == 'S'
      call add(pairs, [str2nr(p[0], 16), str2nr(p[2], 16)])
    endif
  endfor
  call s:PutConvertTable('foldCase', s:ConvertRanges(pairs))
endfunc

" Append one convertStruct initializer line to the list "ranges".
"   start, end  First and last character of the range.
"   step        Distance between characters in the range; 0 (a range with a
"               single character) is written as -1.
"   add         Offset from a character to its mapped character.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.
" A character is included when field 2 of its s:dataprops entry is 'Mn', 'Mc'
" or 'Me'.  Consecutive characters are merged into one interval.  The result
" is put in a new buffer named "combining".
" NOTE(review): the leading white space inside the string literals of this
" function may have been collapsed; confirm against the indent used in mbyte.c.
func! BuildCombiningTable()
  let first = -1
  let last = -1
  let ranges = []
  for p in s:dataprops
    " index() compares case sensitively, independent of 'ignorecase'.
    if index(['Mn', 'Mc', 'Me'], p[2]) < 0
      continue
    endif
    let n = str2nr(p[0], 16)
    if first < 0 || last + 1 != n
      if first >= 0
        " produce previous range
        call add(ranges, printf("\t{0x%04x, 0x%04x},", first, last))
      endif
      let first = n
    endif
    let last = n
  endfor
  if first >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", first, last))
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, " static struct interval combining[] =")
  call setline(2, " {")
  call append('$', ranges)
  if !empty(ranges)
    " Remove last comma.  Without any range the last line is the "{" line,
    " which must be left alone.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" Add the interval "start" - "end" to the table lines in "ranges", prefixed
" with "spc", and remember it in s:ambitable (when "pattern" is 'A') or
" s:doubletable (otherwise).  The list used must already exist.
func! s:AddWidthRange(ranges, spc, pattern, start, end)
  call add(a:ranges, printf("%s{0x%04x, 0x%04x},", a:spc, a:start, a:end))
  if a:pattern ==# 'A'
    call add(s:ambitable, [a:start, a:end])
  else
    call add(s:doubletable, [a:start, a:end])
  endif
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   pattern    Regexp matched against the first character of field 1 of each
"              s:widthprops entry.  'A' selects the layout used for the
"              ambiguous table and s:ambitable, anything else the layout of
"              the doublewidth table and s:doubletable.
"   tableName  Name of the C table and of the new buffer.
" NOTE(review): the leading white space inside the string literals of this
" function may have been collapsed; confirm against the indent used in mbyte.c.
func! BuildWidthTable(pattern, tableName)
  let first = -1
  let last = -1
  let ranges = []
  let dataidx = 0
  let ndata = len(s:dataprops)
  let is_ambi = a:pattern ==# 'A'
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  let spc = is_ambi ? ' ' : "\t"
  let indent = is_ambi ? '' : ' '

  for p in s:widthprops
    if p[1][0] !~# a:pattern
      continue
    endif
    if p[0] =~ '\.\.'
      " It is a range.  we don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Skip the entry instead of indexing a missing list item.
        continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif

    " Find this char in the data table.  "dataidx" only moves forward, the
    " loop stops at the first entry at or after "n", or at the end of the
    " table.
    let dn = -1
    while dataidx < ndata
      let dn = str2nr(s:dataprops[dataidx][0], 16)
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile
    if dn != n && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    " Past the end of the data table there is no category to check.
    let category = dataidx < ndata ? s:dataprops[dataidx][2] : ''
    if n_last > n || index(['Mn', 'Mc', 'Me'], category) < 0
      if first < 0 || last + 1 != n
        if first >= 0
          " produce previous range
          call s:AddWidthRange(ranges, spc, a:pattern, first, last)
        endif
        let first = n
      endif
      " Otherwise continue with the same range.
      let last = n_last
    endif
  endfor
  if first >= 0
    call s:AddWidthRange(ranges, spc, a:pattern, first, last)
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, indent . "static struct interval " . a:tableName . "[] =")
  call setline(2, indent . "{")
  call append('$', ranges)
  if !empty(ranges)
    " Remove last comma.  Without any range the last line is the "{" line,
    " which must be left alone.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, indent . "};")
  wincmd p
endfunc


" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
" and put them in dictionary "chardict"
" Every character of a line, or of the range on a line, becomes a key of
" "chardict" with value 1.  The numbers are read as hexadecimal.
func! AddLinesToCharDict(lines, chardict)
  for line in a:lines
    let tokens = split(line, '\.\.')
    let first = str2nr(tokens[0], 16)
    let last = len(tokens) == 1 ? first : str2nr(tokens[1], 16)
    for nr in range(first, last)
      let a:chardict[nr] = 1
    endfor
  endfor
endfunc

" Self test for AddLinesToCharDict().  Returns 1 when it failed, 0 otherwise.
func! Test_AddLinesToCharDict()
  let dict = {}
  call AddLinesToCharDict([
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ], dict)
  call assert_equal({0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }, dict)
  if v:errors != []
    echoerr 'AddLinesToCharDict' v:errors
    return 1
  endif
  return 0
endfunc


" Turn the keys of "chardict" into a list of [low, high] pairs, sorted
" numerically, where each pair covers a run of consecutive numbers.
" Returns an empty list for an empty dictionary.
func! CharDictToPairList(chardict)
  let result = []
  if empty(a:chardict)
    return result
  endif
  let keys = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  let low = keys[0]
  let high = keys[0]
  for key in keys[1:]
    if key > high + 1
      " Gap found: finish the current run and start a new one.
      call add(result, [low, high])
      let low = key
    endif
    let high = key
  endfor
  call add(result, [low, high])
  return result
endfunc

" Self test for CharDictToPairList().  Returns 1 when it failed, 0 otherwise.
func! Test_CharDictToPairList()
  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  call assert_equal([
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ], CharDictToPairList(dict))
  call assert_equal([], CharDictToPairList({}))
  if v:errors != []
    echoerr 'CharDictToPairList' v:errors
    return 1
  endif
  return 0
endfunc


" Return the first white-space separated word of every line in the current
" buffer that starts with a digit 1-9 and matches "pattern".
func! s:EmojiLines(pattern)
  let lines = filter(getline(1, '$'), {_, v -> v =~ "^[1-9]" && v =~ a:pattern})
  return map(lines, {_, v -> matchstr(v, "^\\S\\+")})
endfunc

" Put a "struct interval" table called "name" holding "ranges" in a new
" buffer with the same name, then go back to the previous window.
" "indent" is put before the declaration and the braces.
func! s:PutIntervalTable(name, indent, ranges)
  new
  exe 'file ' . a:name
  call setline(1, a:indent . "static struct interval " . a:name . "[] =")
  call setline(2, a:indent . "{")
  call append('$', a:ranges)
  if !empty(a:ranges)
    " Remove last comma.  Without any range the last line is the "{" line,
    " which must be left alone.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, a:indent . "};")
  wincmd p
endfunc

" Build the emoji width table in a new buffer.
" Reads the emoji data from the current buffer and creates two buffers:
" "emoji_all" and "emoji_wide".  Uses s:ambitable and s:doubletable, thus the
" width tables must have been built first.
" NOTE(review): the leading white space inside the string literals of this
" function may have been collapsed; confirm against the indent used in mbyte.c.
func! BuildEmojiTable()
  " First make the table for all emojis.
  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(s:EmojiLines('; Emoji\s\+#\s'), chardict)
  let pairlist = CharDictToPairList(chardict)
  let allranges = map(pairlist, 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')
  call s:PutIntervalTable('emoji_all', '', allranges)

  " Make the table for wide emojis.
  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(
	\ s:EmojiLines('; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'), chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for [first, last] in s:ambitable + s:doubletable
    for nr in range(first, last)
      if has_key(chardict, nr)
	call remove(chardict, nr)
      endif
    endfor
  endfor

  let pairlist = CharDictToPairList(chardict)
  let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  call s:PutIntervalTable('emoji_wide', ' ', wide_ranges)
endfunc

" First test a few things
let v:errors = []
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  finish
endif


" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit https://unicode.org/Public/emoji/12.1/emoji-data.txt

" Build the emoji table.  Ver. 1.0 - 6.0
" Must come after the "ambiguous" and "doublewidth" tables
call BuildEmojiTable()