" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2020 Aug 24

" Split the lines of the current buffer on ';' and append one list of fields
" per line to "props".  White space around a ';' is dropped, empty fields are
" kept.
"   props          list that receives the parsed lines (modified in place)
"   nfields        number of fields every parsed line must have
"   skip_comments  when non-zero, lines starting with '#' and blank lines are
"                  ignored
" When a line has a different number of fields an error is given and parsing
" stops; "props" then holds the lines parsed so far.
func! s:ParseBuffer(props, nfields, skip_comments)
  for lnum in range(1, line('$'))
    let line = getline(lnum)
    if a:skip_comments && (line =~ '^#' || line =~ '^\s*$')
      continue
    endif
    let l = split(line, '\s*;\s*', 1)
    if len(l) != a:nfields
      echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected ' . a:nfields
      return
    endif
    call add(a:props, l)
  endfor
endfunc

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line must have 15 fields.
func! ParseDataToProps()
  let s:dataprops = []
  call s:ParseBuffer(s:dataprops, 15, 0)
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Comment lines and blank lines are skipped, other lines must have 4 fields.
func! ParseFoldProps()
  let s:foldprops = []
  call s:ParseBuffer(s:foldprops, 4, 1)
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Comment lines and blank lines are skipped, other lines must have 2 fields.
func! ParseWidthProps()
  let s:widthprops = []
  call s:ParseBuffer(s:widthprops, 2, 1)
endfunc

" Turn "pairs", a list of [code point, converted code point] numbers, into a
" list of formatted convertStruct entries.  Successive pairs are merged into
" one entry as long as the offset to the converted character stays the same
" and the distance between the code points stays the same.
func! s:CompressPairs(pairs)
  let ranges = []
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  for [n, nl] in a:pairs
    if start >= 0 && add == nl - n && (step == 0 || n - end == step)
      " continue with same range.
      let step = n - end
      let end = n
    else
      if start >= 0
        " produce previous range
        call Range(ranges, start, end, step, add)
      endif
      let start = n
      let end = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif
  return ranges
endfunc

" Put a convertStruct table called "name" with the entries "ranges" in a new
" buffer that is also called "name", then go back to the previous window.
func! s:PutConvertTable(name, ranges)
  new
  exe "file " . a:name
  call setline(1, "static convertStruct " . a:name . "[] =")
  call setline(2, "{")
  if !empty(a:ranges)
    call append('$', a:ranges)
    " remove last comma; must be skipped for an empty table, otherwise the
    " "{" line would be truncated instead
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   name   "Lower" or "Upper"; the table and the buffer are named "to{name}"
"   index  index of the field in a UnicodeData.txt line that holds the
"          converted character; lines where it is empty are skipped
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    if p[a:index] != ''
      call add(pairs, [str2nr(p[0], 16), str2nr(p[a:index], 16)])
    endif
  endfor
  call s:PutConvertTable('to' . a:name, s:CompressPairs(pairs))
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.  Only entries with status "C" or "S" are used.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    if p[1] ==# 'C' || p[1] ==# 'S'
      call add(pairs, [str2nr(p[0], 16), str2nr(p[2], 16)])
    endif
  endfor
  call s:PutConvertTable('foldCase', s:CompressPairs(pairs))
endfunc

" Append one formatted convertStruct entry to the list "ranges".
" A "step" of zero (a range with a single character) is written as -1.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.
" Every character with general category Mn, Mc or Me is used, adjacent code
" points are merged into one {first, last} entry.  The result is put in a new
" buffer called "combining".
func! BuildCombiningTable()
  let ranges = []
  let start = -1
  let end = -1
  for p in s:dataprops
    if p[2] !~# '^M[nce]$'
      continue
    endif
    let n = str2nr(p[0], 16)
    if start >= 0 && end + 1 == n
      " continue with same range.
      let end = n
    else
      if start >= 0
        " produce previous range
        call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
      endif
      let start = n
      let end = n
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif
  call s:PutIntervalTable('combining', ranges)
endfunc

" Put a "struct interval" table called "name" with the entries "ranges" in a
" new buffer that is also called "name", then go back to the previous window.
" NOTE(review): the single leading space of the emitted lines may be
" indentation that got collapsed in this copy of the script - confirm against
" the layout of the tables in src/mbyte.c.
func! s:PutIntervalTable(name, ranges)
  new
  exe "file " . a:name
  call setline(1, " static struct interval " . a:name . "[] =")
  call setline(2, " {")
  if !empty(a:ranges)
    call append('$', a:ranges)
    " remove last comma; must be skipped for an empty table, otherwise the
    " " {" line would be truncated instead
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" Add the finished range "start" - "end" to the formatted list "ranges" and
" remember it in s:ambitable (when "pattern" is 'A') or s:doubletable (any
" other pattern).
func! s:AddWidthRange(ranges, pattern, start, end)
  call add(a:ranges, printf("\t{0x%04x, 0x%04x},", a:start, a:end))
  if a:pattern ==# 'A'
    call add(s:ambitable, [a:start, a:end])
  else
    call add(s:doubletable, [a:start, a:end])
  endif
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   pattern    pattern matched against the first character of the width
"              property of each entry
"   tableName  name of the generated table and of the buffer it is put in
" The ranges are also added to s:ambitable or s:doubletable, which must exist
" before calling this function.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] !~# a:pattern
      continue
    endif
    if p[0] =~ '\.\.'
      " It is a range.  we don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Skip this entry, rng[1] may not even exist.
        continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif
    " Find this char in the data table.  The search continues where the
    " previous one stopped and never runs past the end of the table.
    while dataidx < datalen && str2nr(s:dataprops[dataidx][0], 16) < n
      let dataidx += 1
    endwhile
    let found = dataidx < datalen && str2nr(s:dataprops[dataidx][0], 16) == n
    if !found && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif
    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    " When the data table has no entry at or after this character the
    " category is empty, which counts as not composing.
    let category = dataidx < datalen ? s:dataprops[dataidx][2] : ''
    if n_last > n || category !~# '^M[nce]$'
      if start < 0 || end + 1 != n
        if start >= 0
          " produce previous range
          call s:AddWidthRange(ranges, a:pattern, start, end)
        endif
        let start = n
      endif
      " Otherwise continue with same range.
      let end = n_last
    endif
  endfor
  if start >= 0
    call s:AddWidthRange(ranges, a:pattern, start, end)
  endif

  " New buffer to put the result in.
  call s:PutIntervalTable(a:tableName, ranges)
endfunc


" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
" and put them in dictionary "chardict"
" Each line starts with a hex code point or with a range of two hex code
" points separated by "..", anything after that is ignored.  Every code point
" becomes a key of "chardict" with value 1.  Empty lines are skipped.
func! AddLinesToCharDict(lines, chardict)
  for line in a:lines
    let tokens = split(line, '\.\.')
    if empty(tokens)
      " Nothing to parse, tokens[0] does not exist.
      continue
    endif
    let first = str2nr(tokens[0], 16)
    let last = len(tokens) == 1 ? first : str2nr(tokens[1], 16)
    for nr in range(first, last)
      let a:chardict[nr] = 1
    endfor
  endfor
endfunc

" Test for AddLinesToCharDict().  Returns 1 when it failed, 0 otherwise.
func! Test_AddLinesToCharDict()
  let dict = {}
  call AddLinesToCharDict([
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ], dict)
  call assert_equal({0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }, dict)
  if v:errors != []
    echoerr 'AddLinesToCharDict' v:errors
    return 1
  endif
  return 0
endfunc


" Convert the keys of "chardict", which are code points, to a sorted list of
" [low, high] pairs where each pair covers a run of consecutive code points.
" Returns an empty list for an empty dictionary.
func! CharDictToPairList(chardict)
  let result = []
  if empty(a:chardict)
    return result
  endif
  let numbers = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  let low = numbers[0]
  let high = low
  for nr in numbers
    if nr > high + 1
      " There is a gap: finish the previous run and start a new one.
      call add(result, [low, high])
      let low = nr
    endif
    let high = nr
  endfor
  call add(result, [low, high])
  return result
endfunc

" Test for CharDictToPairList().  Returns 1 when it failed, 0 otherwise.
func! Test_CharDictToPairList()
  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  call assert_equal([
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ], CharDictToPairList(dict))
  call assert_equal([], CharDictToPairList({}))
  if v:errors != []
    echoerr 'CharDictToPairList' v:errors
    return 1
  endif
  return 0
endfunc


" Return the first word (a code point or a code point range) of every line
" in the current buffer that starts with a digit 1-9 and matches "pattern".
func! s:EmojiCodes(pattern)
  let lines = filter(getline(1, '$'), 'v:val =~# "^[1-9]" && v:val =~# a:pattern')
  return map(lines, 'matchstr(v:val,"^\\S\\+")')
endfunc

" Put a "struct interval" table called "name" with the entries "ranges" in a
" new buffer that is also called "name", then go back to the previous window.
" "indent" is put before the declaration and the braces.
func! s:PutEmojiTable(name, indent, ranges)
  new
  exe 'file ' . a:name
  call setline(1, a:indent . "static struct interval " . a:name . "[] =")
  call setline(2, a:indent . "{")
  if !empty(a:ranges)
    call append('$', a:ranges)
    " remove last comma; must be skipped for an empty table, otherwise the
    " opening brace line would be truncated instead
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, a:indent . "};")
  wincmd p
endfunc

" Build the emoji width tables, each in a new buffer: "emoji_all" with all
" emoji characters and "emoji_wide" with the emoji characters that are wide.
" Reads the lines of emoji-data.txt from the current buffer.
" Uses s:ambitable and s:doubletable.
func! BuildEmojiTable()
  " First make the table for all emojis.
  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(s:EmojiCodes('; Emoji\s\+#\s'), chardict)
  let allranges = map(CharDictToPairList(chardict), 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')
  call s:PutEmojiTable('emoji_all', '', allranges)

  " Make the table for wide emojis.
  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(s:EmojiCodes('; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'), chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for [first, last] in s:ambitable + s:doubletable
    for nr in range(first, last)
      if has_key(chardict, nr)
        call remove(chardict, nr)
      endif
    endfor
  endfor

  let wide_ranges = map(CharDictToPairList(chardict), 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  call s:PutEmojiTable('emoji_wide', ' ', wide_ranges)
endfunc

" First test a few things
let v:errors = []
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  finish
endif


" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
execute 'edit' 'http://unicode.org/Public/UNIDATA/UnicodeData.txt'

" Turn the lines of UnicodeData.txt into a list of lists.
call ParseDataToProps()

" Generate the toLower table (field 13) and the toUpper table (field 12).
for [s:casename, s:caseindex] in [["Lower", 13], ["Upper", 12]]
  call BuildCaseTable(s:casename, s:caseindex)
endfor

" Generate the ranges of composing chars.
call BuildCombiningTable()

" Load the case folding text file.  Requires the netrw plugin.
execute 'edit' 'http://www.unicode.org/Public/UNIDATA/CaseFolding.txt'

" Turn its lines into a list of lists and generate the foldCase table.
call ParseFoldProps()
call BuildFoldTable()

" Load the width text file.  Requires the netrw plugin.
execute 'edit' 'http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt'

" Turn its lines into a list of lists.
call ParseWidthProps()

" The width tables are also collected in these lists, for the emoji table.
let s:doubletable = []
let s:ambitable = []

" Generate the double width table, then the ambiguous width table.
call BuildWidthTable('[WF]', 'doublewidth')
call BuildWidthTable('A', 'ambiguous')

" Load the emoji text file.  Requires the netrw plugin.
execute 'edit' 'https://unicode.org/Public/emoji/12.1/emoji-data.txt'

" Generate the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" and "doublewidth" tables
call BuildEmojiTable()