" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Usage: Vim -S <this-file>
"
" Author:	Bram Moolenaar
" Last Update:	2010 Jan 12

" Split every line of the current buffer on ';' and append the resulting
" field lists to {props} (modified in place).
"   props       list that receives one list of fields per accepted line
"   nfields     number of fields every accepted line must have
"   skipnoise   when non-zero, lines starting with '#' and blank lines are
"               ignored
" On a line with the wrong field count an error is reported and parsing
" stops; entries added so far remain in {props}.
func! s:ParseInto(props, nfields, skipnoise)
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if a:skipnoise && (text =~# '^#' || text =~# '^\s*$')
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != a:nfields
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected ' . a:nfields
      return
    endif
    call add(a:props, fields)
  endfor
endfunc

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
func! ParseDataToProps()
  let s:dataprops = []
  call s:ParseInto(s:dataprops, 15, 0)
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
func! ParseFoldProps()
  let s:foldprops = []
  call s:ParseInto(s:foldprops, 4, 1)
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
func! ParseWidthProps()
  let s:widthprops = []
  call s:ParseInto(s:widthprops, 2, 1)
endfunc

" Turn a list of [from, to] code point pairs into formatted convertStruct
" entries.  Consecutive pairs are merged into one entry as long as the offset
" (to - from) stays the same and the distance between the "from" values stays
" constant.  Returns the list of formatted lines.
func! s:ConvertRanges(pairs)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for [n, nl] in a:pairs
    if start >= 0 && add == nl - n && (step == 0 || n - end == step)
      " continue with same range.
      let step = n - end
      let end = n
    else
      if start >= 0
	" produce previous range
	call Range(ranges, start, end, step, add)
      endif
      let start = n
      let end = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif
  return ranges
endfunc

" Open a new window with a buffer named {bufname} and fill it with a C table:
" {indent}{decl}, {indent}"{", the {ranges} lines and {indent}"};".
" The trailing comma of the last range is removed.  Afterwards the previous
" window is made current again.
func! s:WriteTable(bufname, decl, indent, ranges)
  new
  exe 'file ' . a:bufname
  call setline(1, a:indent . a:decl)
  call setline(2, a:indent . '{')
  if !empty(a:ranges)
    call append('$', a:ranges)
    " remove last comma; skipped for an empty table so that the "{" line is
    " not truncated
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, a:indent . '};')
  wincmd p
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   name    suffix of the table name: the table is called "to" . name
"   index   index of the UnicodeData.txt field that holds the mapping;
"           entries where this field is empty are skipped
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    if p[a:index] !=# ''
      call add(pairs, [str2nr(p[0], 16), str2nr(p[a:index], 16)])
    endif
  endfor
  call s:WriteTable('to' . a:name,
	\ 'static convertStruct to' . a:name . '[] =', '',
	\ s:ConvertRanges(pairs))
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.  Only entries with status 'C' or 'S' are used.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    if p[1] ==# 'C' || p[1] ==# 'S'
      call add(pairs, [str2nr(p[0], 16), str2nr(p[2], 16)])
    endif
  endfor
  call s:WriteTable('foldCase', 'static convertStruct foldCase[] =', '',
	\ s:ConvertRanges(pairs))
endfunc

" Append one formatted convertStruct entry to the list {ranges}.
" A {step} of zero (single-item range) is written as -1.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.  Characters with category Mn, Mc or Me are collected and
" adjacent code points are merged into intervals.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] ==# 'Mn' || p[2] ==# 'Mc' || p[2] ==# 'Me'
      let n = str2nr(p[0], 16)
      if start >= 0 && end + 1 == n
	" continue with same range.
	let end = n
      else
	if start >= 0
	  " produce previous range
	  call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
	endif
	let start = n
	let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  call s:WriteTable('combining', 'static struct interval combining[] =', ' ',
	\ ranges)
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   pattern     matched against the first character of the width property
"   tableName   name of the generated C table and of the new buffer
" Every produced interval is also stored as [start, end] in s:ambitable when
" {pattern} is 'A', otherwise in s:doubletable; the caller must have created
" that list.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let ndata = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] =~# a:pattern
      if p[0] =~# '\.\.'
	" It is a range.  We don't check for composing char then.
	let rng = split(p[0], '\.\.')
	if len(rng) != 2
	  echoerr "Cannot parse range: '" . p[0] . "' in width table"
	  " Stop here, like the Parse functions do on malformed input,
	  " instead of indexing a list of unexpected length.
	  return
	endif
	let n = str2nr(rng[0], 16)
	let n_last = str2nr(rng[1], 16)
      else
	let n = str2nr(p[0], 16)
	let n_last = n
      endif
      " Find this char in the data table.  The index only moves forward;
      " stop at the end of the table instead of indexing past it.
      while dataidx < ndata && str2nr(s:dataprops[dataidx][0], 16) < n
	let dataidx += 1
      endwhile
      let found = dataidx < ndata && str2nr(s:dataprops[dataidx][0], 16) == n
      if !found && n_last == n
	echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.
      let composing = 0
      if dataidx < ndata
	let cat = s:dataprops[dataidx][2]
	let composing = cat ==# 'Mn' || cat ==# 'Mc' || cat ==# 'Me'
      endif
      if n_last > n || !composing
	if start >= 0 && end + 1 == n
	  " continue with same range.
	else
	  if start >= 0
	    " produce previous range
	    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
	    if a:pattern ==# 'A'
	      call add(s:ambitable, [start, end])
	    else
	      call add(s:doubletable, [start, end])
	    endif
	  endif
	  let start = n
	endif
	let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
    if a:pattern ==# 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  call s:WriteTable(a:tableName,
	\ 'static struct interval ' . a:tableName . '[] =', ' ', ranges)
endfunc

" Build the emoji width table in a new buffer.
" Reads the current buffer (emoji-data.txt): only lines that start with a
" digit 1-9 and match {pattern} are used.  Two tables are produced:
"   {tableName}_all     all selected code points, adjacent ones merged
"   {tableName}_width   the code points from 0x1f000 on, with the start and
"                       end of each range clipped against the intervals in
"                       s:ambitable and s:doubletable
" Uses s:ambitable and s:doubletable, thus must be called after
" BuildWidthTable() was used for both tables.
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~#"^[1-9]"'), 'v:val=~#a:pattern'), 'matchstr(v:val,"^\\S\\+")')
  for line in lines
    let token = split(line, '\.\.')
    let first = str2nr(token[0], 16)
    if len(token) == 1
      let last = first
    else
      let last = str2nr(token[1], 16)
    endif

    let token = [first, last]
    if len(alltokens) > 0 && (token[0] - 1 == alltokens[-1][1])
      let alltokens[-1][1] = token[1]
    else
      call add(alltokens, token)
    endif

    " Characters below 1F000 may be considered single width traditionally,
    " making them double width causes problems.
    if first < 0x1f000
      continue
    endif

    " exclude characters that are in the "ambiguous" or "doublewidth" table
    for ambi in s:ambitable
      if first >= ambi[0] && first <= ambi[1]
	let first = ambi[1] + 1
      endif
      if last >= ambi[0] && last <= ambi[1]
	let last = ambi[0] - 1
      endif
    endfor
    for double in s:doubletable
      if first >= double[0] && first <= double[1]
	let first = double[1] + 1
      endif
      if last >= double[0] && last <= double[1]
	let last = double[0] - 1
      endif
    endfor

    if first <= last
      let token = [first, last]
      if len(widthtokens) > 0 && (token[0] - 1 == widthtokens[-1][1])
	let widthtokens[-1][1] = token[1]
      else
	call add(widthtokens, token)
      endif
    endif
  endfor
  let allranges = map(alltokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  let widthranges = map(widthtokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  call s:WriteTable(a:tableName . '_all',
	\ 'static struct interval ' . a:tableName . '_all[] =', ' ',
	\ allranges)
  call s:WriteTable(a:tableName . '_width',
	\ 'static struct interval ' . a:tableName . '_width[] =', ' ',
	\ widthranges)
endfunc

" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/emoji/3.0/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" table
call BuildEmojiTable('; Emoji\s\+# [1-6]\.[0-9]', 'emoji')