" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2010 Jan 12
"
" All string comparisons below use the case-sensitive operators (==#, =~#,
" ...) so that the result does not depend on the user's 'ignorecase' setting.

" Convert a string of hexadecimal digits (no "0x" prefix needed) to a Number.
func! s:Hex(str)
  return str2nr(a:str, 16)
endfunc

" Create a new buffer named {bufname} and fill it with a C table:
"   {header}
"   {open}
"   ...{ranges}, with the trailing comma of the last entry removed...
"   {close}
" Afterwards go back to the previous window.
" When {ranges} is empty only the header, {open} and {close} are written, so
" that the comma removal cannot eat the {open} line.
func! s:PutTable(bufname, header, open, close, ranges)
  new
  exe "file " . a:bufname
  call setline(1, a:header)
  call setline(2, a:open)
  if !empty(a:ranges)
    call append('$', a:ranges)
    " remove last comma
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, a:close)
  wincmd p
endfunc

" Compress a list of [code, mapped-code] pairs into convertStruct entries.
" Consecutive pairs are merged into one entry as long as the offset
" (mapped-code - code) and the distance between codes stay the same.
" Returns a list of formatted lines, see Range().
func! s:ConvertRanges(pairs)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for [n, nl] in a:pairs
    if start >= 0 && add == nl - n && (step == 0 || n - end == step)
      " continue with same range.
      let step = n - end
      let end = n
    else
      if start >= 0
        " produce previous range
        call Range(ranges, start, end, step, add)
      endif
      let start = n
      let end = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif
  return ranges
endfunc

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line of the current buffer must have 15 ';' separated fields;
" otherwise an error is given and parsing stops at that line.
func! ParseDataToProps()
  let s:dataprops = []
  let lnum = 1
  while lnum <= line('$')
    let l = split(getline(lnum), '\s*;\s*', 1)
    if len(l) != 15
      echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, l)
    let lnum += 1
  endwhile
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Lines starting with '#' and blank lines are skipped.  Every other line must
" have 4 ';' separated fields; otherwise an error is given and parsing stops.
func! ParseFoldProps()
  let s:foldprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~# '^#' && line !~# '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 4
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
        return
      endif
      call add(s:foldprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Lines starting with '#' and blank lines are skipped.  Every other line must
" have 2 ';' separated fields; otherwise an error is given and parsing stops.
func! ParseWidthProps()
  let s:widthprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~# '^#' && line !~# '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 2
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
        return
      endif
      call add(s:widthprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Build the toLower or toUpper table in a new buffer.
" {name} is appended to "to" for the buffer and table name, {index} is the
" field of s:dataprops holding the mapped code point.
" Uses s:dataprops.
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    if p[a:index] !=# ''
      call add(pairs, [s:Hex(p[0]), s:Hex(p[a:index])])
    endif
  endfor

  " New buffer to put the result in.
  call s:PutTable("to" . a:name,
	\ "static convertStruct to" . a:name . "[] =",
	\ "{", "};", s:ConvertRanges(pairs))
endfunc

" Build the foldCase table in a new buffer.
" Only entries with status 'C' or 'S' are used.
" Uses s:foldprops.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    if p[1] ==# 'C' || p[1] ==# 'S'
      call add(pairs, [s:Hex(p[0]), s:Hex(p[2])])
    endif
  endfor

  " New buffer to put the result in.
  call s:PutTable("foldCase",
	\ "static convertStruct foldCase[] =",
	\ "{", "};", s:ConvertRanges(pairs))
endfunc

" Append one convertStruct entry to the list {ranges}.
" A {step} of zero (range with a single character) is written as -1.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table in a new buffer: intervals of adjacent code
" points with general category Mn, Mc or Me.
" Uses s:dataprops.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] ==# 'Mn' || p[2] ==# 'Mc' || p[2] ==# 'Me'
      let n = s:Hex(p[0])
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  call s:PutTable("combining",
	\ " static struct interval combining[] =",
	\ " {", " };", ranges)
endfunc

" Build the double width or ambiguous width table in a new buffer.
" {pattern} is matched against the first character of the width property,
" {tableName} is used for the buffer and the C table.
" Each produced interval is also added to s:ambitable (when {pattern} is 'A')
" or s:doubletable (otherwise); the caller must have created that list.
" Uses s:widthprops and s:dataprops.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  for p in s:widthprops
    if p[1][0] =~# a:pattern
      if p[0] =~# '\.\.'
        " It is a range.  we don't check for composing char then.
        let rng = split(p[0], '\.\.')
        if len(rng) != 2
          echoerr "Cannot parse range: '" . p[0] . "' in width table"
          " Skip the malformed entry instead of indexing past the list end.
          continue
        endif
        let n = s:Hex(rng[0])
        let n_last = s:Hex(rng[1])
      else
        let n = s:Hex(p[0])
        let n_last = n
      endif
      " Find this char in the data table.  Both tables are walked in
      " ascending order, thus dataidx never needs to go back.
      while 1
        let dn = s:Hex(s:dataprops[dataidx][0])
        if dn >= n
          break
        endif
        let dataidx += 1
      endwhile
      if dn != n && n_last == n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.
      let dp = s:dataprops[dataidx]
      if n_last > n || (dp[2] !=# 'Mn' && dp[2] !=# 'Mc' && dp[2] !=# 'Me')
        if start >= 0 && end + 1 == n
          " continue with same range.
        else
          if start >= 0
            " produce previous range
            call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
            if a:pattern ==# 'A'
              call add(s:ambitable, [start, end])
            else
              call add(s:doubletable, [start, end])
            endif
          endif
          let start = n
        endif
        let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
    if a:pattern ==# 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  " New buffer to put the result in.
  call s:PutTable(a:tableName,
	\ " static struct interval " . a:tableName . "[] =",
	\ " {", " };", ranges)
endfunc

" Build the emoji width table in a new buffer.
" Lines of the current buffer that start with a digit 1-9 and match {pattern}
" are used.  Two tables are produced:
"   {tableName}_all    every selected range, adjacent ranges merged
"   {tableName}_width  the same, but a range is trimmed when its first or
"                      last code point lies inside an interval of
"                      s:ambitable or s:doubletable
" Uses s:ambitable and s:doubletable.
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~#"^[1-9]"'), 'v:val=~#a:pattern'), 'matchstr(v:val,"^\\S\\+")')
  for entry in lines
    let parts = split(entry, '\.\.')
    let first = s:Hex(parts[0])
    if len(parts) == 1
      let last = first
    else
      let last = s:Hex(parts[1])
    endif

    " Merge with the previous range when directly adjacent.
    if len(alltokens) > 0 && (first - 1 == alltokens[-1][1])
      let alltokens[-1][1] = last
    else
      call add(alltokens, [first, last])
    endif

    " exclude characters that are in the "ambiguous" or "doublewidth" table
    for ambi in s:ambitable
      if first >= ambi[0] && first <= ambi[1]
        let first = ambi[1] + 1
      endif
      if last >= ambi[0] && last <= ambi[1]
        let last = ambi[0] - 1
      endif
    endfor
    for double in s:doubletable
      if first >= double[0] && first <= double[1]
        let first = double[1] + 1
      endif
      if last >= double[0] && last <= double[1]
        let last = double[0] - 1
      endif
    endfor

    " Nothing is left when the trimming made the range empty.
    if first <= last
      if len(widthtokens) > 0 && (first - 1 == widthtokens[-1][1])
        let widthtokens[-1][1] = last
      else
        call add(widthtokens, [first, last])
      endif
    endif
  endfor
  let allranges = map(alltokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  let widthranges = map(widthtokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  call s:PutTable(a:tableName . '_all',
	\ " static struct interval " . a:tableName . "_all[] =",
	\ " {", " };", allranges)

  " New buffer to put the result in.
  call s:PutTable(a:tableName . '_width',
	\ " static struct interval " . a:tableName . "_width[] =",
	\ " {", " };", widthranges)
endfunc

" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/emoji/3.0/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" table
call BuildEmojiTable('; Emoji\s\+# [1-6]\.[0-9]', 'emoji')