" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author:	Bram Moolenaar
" Last Update:	2010 Jan 12

" Put a C table in a new buffer and jump back to the previous window.
"   a:bufname  name given to the new buffer
"   a:decl     first line of the buffer (the C declaration)
"   a:indent   text put before the opening "{" and the closing "};"
"   a:ranges   list of lines, each one table entry ending in a comma
" The comma of the last entry is removed.  When a:ranges is empty nothing is
" removed, so that the "{" line stays intact.
func! s:PutTable(bufname, decl, indent, ranges)
  new
  exe "file " . a:bufname
  call setline(1, a:decl)
  call setline(2, a:indent . "{")
  call append('$', a:ranges)
  if !empty(a:ranges)
    " remove last comma
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, a:indent . "};")
  wincmd p
endfunc

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line of the current buffer is split at ';' (white space around it is
" dropped, empty fields are kept) and must result in exactly 15 fields.
" Parsing stops with an error message at the first line where it does not.
func! ParseDataToProps()
  let s:dataprops = []
  let lnum = 1
  while lnum <= line('$')
    let l = split(getline(lnum), '\s*;\s*', 1)
    if len(l) != 15
      echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, l)
    let lnum += 1
  endwhile
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Lines starting with '#' and blank lines are skipped.  Every other line must
" split into exactly 4 fields, otherwise parsing stops with an error message.
func! ParseFoldProps()
  let s:foldprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 4
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
        return
      endif
      call add(s:foldprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Lines starting with '#' and blank lines are skipped.  Every other line must
" split into exactly 2 fields, otherwise parsing stops with an error message.
func! ParseWidthProps()
  let s:widthprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 2
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
        return
      endif
      call add(s:widthprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   a:name   "Lower" or "Upper", used for the buffer and the table name
"   a:index  index of the field in a data line that holds the mapped
"            character (this script passes 13 for Lower and 12 for Upper)
" Consecutive entries with the same offset to the mapped character and a
" constant distance between them are combined into one range.
func! BuildCaseTable(name, index)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] != ''
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[a:index]) + 0
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  call s:PutTable("to" . a:name, "static convertStruct to" . a:name . "[] =", "", ranges)
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.
" Only entries with status 'C' or 'S' are used.  Ranges are combined in the
" same way as in BuildCaseTable().
func! BuildFoldTable()
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:foldprops
    if p[1] == 'C' || p[1] == 'S'
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[2]) + 0
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  call s:PutTable("foldCase", "static convertStruct foldCase[] =", "", ranges)
endfunc

" Add one convertStruct entry "{start,end,step,add}," to the list a:ranges.
" A step of zero (range with a single character) is written as -1.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.
" Characters with general category Mn, Mc or Me are included; adjacent code
" points are combined into one interval.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] == 'Mn' || p[2] == 'Mc' || p[2] == 'Me'
      let n = ('0x' . p[0]) + 0
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  call s:PutTable("combining", " static struct interval combining[] =", " ", ranges)
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   a:pattern    pattern matched against the first character of the width
"                property; when it is 'A' the intervals are also added to
"                s:ambitable, otherwise to s:doubletable
"   a:tableName  name of the buffer and of the C table
" The caller must have set s:ambitable or s:doubletable to a list.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let ndata = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] =~ a:pattern
      if p[0] =~ '\.\.'
        " It is a range.  we don't check for composing char then.
        let rng = split(p[0], '\.\.')
        if len(rng) != 2
          echoerr "Cannot parse range: '" . p[0] . "' in width table"
          " Without "abort" the function would carry on and use rng[1].
          return
        endif
        let n = ('0x' . rng[0]) + 0
        let n_last = ('0x' . rng[1]) + 0
      else
        let n = ('0x' . p[0]) + 0
        let n_last = n
      endif
      " Find this char in the data table.  Stop at the end of the table, so
      " that an entry beyond the last data line cannot loop forever.
      let dn = -1
      while dataidx < ndata
        let dn = ('0x' . s:dataprops[dataidx][0]) + 0
        if dn >= n
          break
        endif
        let dataidx += 1
      endwhile
      if dn != n && n_last == n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.
      " When the data table was exhausted there is no category to check.
      let category = dataidx < ndata ? s:dataprops[dataidx][2] : ''
      if n_last > n || (category != 'Mn' && category != 'Mc' && category != 'Me')
        if start >= 0 && end + 1 == n
          " continue with same range.
        else
          if start >= 0
            " produce previous range
            call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
            if a:pattern == 'A'
              call add(s:ambitable, [start, end])
            else
              call add(s:doubletable, [start, end])
            endif
          endif
          let start = n
        endif
        let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
    if a:pattern == 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  " New buffer to put the result in.
  call s:PutTable(a:tableName, " static struct interval " . a:tableName . "[] =", " ", ranges)
endfunc

" Build the emoji width table in a new buffer.
" Reads the lines of the current buffer that start with a digit 1-9 and match
" a:pattern.  Produces two tables, each in its own buffer:
"   {a:tableName}_all    all matching code points, adjacent ones combined
"   {a:tableName}_width  only code points from 0x1f000 on, with the parts
"                        that are in s:ambitable or s:doubletable trimmed off
" Uses s:ambitable and s:doubletable, thus must be called after
" BuildWidthTable() was used for both.
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~a:pattern'), 'matchstr(v:val,"^\\S\\+")')
  for line in lines
    " "line" is either a single code point or "first..last".
    let token = split(line, '\.\.')
    let first = ('0x' . token[0]) + 0
    if len(token) == 1
      let last = first
    else
      let last = ('0x' . token[1]) + 0
    endif

    let token = [first, last]
    if len(alltokens) > 0 && (token[0] - 1 == alltokens[-1][1])
      let alltokens[-1][1] = token[1]
    else
      call add(alltokens, token)
    endif

    " Characters below 1F000 may be considered single width traditionally,
    " making them double width causes problems.
    if first < 0x1f000
      continue
    endif

    " exclude characters that are in the "ambiguous" or "doublewidth" table
    for ambi in s:ambitable
      if first >= ambi[0] && first <= ambi[1]
        let first = ambi[1] + 1
      endif
      if last >= ambi[0] && last <= ambi[1]
        let last = ambi[0] - 1
      endif
    endfor
    for double in s:doubletable
      if first >= double[0] && first <= double[1]
        let first = double[1] + 1
      endif
      if last >= double[0] && last <= double[1]
        let last = double[0] - 1
      endif
    endfor

    " Nothing is left when the whole range was excluded.
    if first <= last
      let token = [first, last]
      if len(widthtokens) > 0 && (token[0] - 1 == widthtokens[-1][1])
        let widthtokens[-1][1] = token[1]
      else
        call add(widthtokens, token)
      endif
    endif
  endfor
  let allranges = map(alltokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  let widthranges = map(widthtokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  call s:PutTable(a:tableName . '_all', " static struct interval " . a:tableName . "_all[] =", " ", allranges)

  " New buffer to put the result in.
  call s:PutTable(a:tableName . '_width', " static struct interval " . a:tableName . "_width[] =", " ", widthranges)
endfunc

" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit https://www.unicode.org/Public/emoji/11.0/emoji-data.txt
"edit http://www.unicode.org/Public/emoji/latest/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" table
call BuildEmojiTable('; Emoji\s\+#\s\+\d\+\.\d', 'emoji')