" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2010 Jan 12

" Split every line of the current buffer on ';' (surrounding white space is
" dropped, empty fields are kept) and append the resulting field lists to
" a:props.
"   a:props    list that receives one list of fields per accepted line
"   a:nfields  number of fields every accepted line must have
"   a:skip     when non-zero, lines starting with '#' and blank lines are
"              ignored
" On the first line with an unexpected field count an error is reported and
" parsing stops; a:props keeps the lines collected so far.
func! s:ParseFields(props, nfields, skip)
  for lnum in range(1, line('$'))
    let line = getline(lnum)
    if a:skip && (line =~ '^#' || line =~ '^\s*$')
      continue
    endif
    let l = split(line, '\s*;\s*', 1)
    if len(l) != a:nfields
      echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected ' . a:nfields
      return
    endif
    call add(a:props, l)
  endfor
endfunc

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line must have 15 fields; comment lines are not skipped.
func! ParseDataToProps()
  let s:dataprops = []
  call s:ParseFields(s:dataprops, 15, 0)
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Comment and blank lines are skipped; other lines must have 4 fields.
func! ParseFoldProps()
  let s:foldprops = []
  call s:ParseFields(s:foldprops, 4, 1)
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Comment and blank lines are skipped; other lines must have 2 fields.
func! ParseWidthProps()
  let s:widthprops = []
  call s:ParseFields(s:widthprops, 2, 1)
endfunc

" Turn a list of [from, to] code point pairs into a list of formatted
" convertStruct rows.  Consecutive pairs are merged into one row as long as
" the offset (to - from) stays the same and the distance between the "from"
" values stays constant.
func! s:PairsToRanges(pairs)
  let ranges = []
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  for [n, nl] in a:pairs
    if start >= 0 && add == nl - n && (step == 0 || n - end == step)
      " Continue with the same range.  The step is only known once the
      " second item of a range is seen.
      let step = n - end
      let end = n
    else
      if start >= 0
        " Produce the previous range.
        call Range(ranges, start, end, step, add)
      endif
      let start = n
      let end = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif
  return ranges
endfunc

" Put a C table in a new buffer named a:bufname and go back to the previous
" window.  a:decl is the declaration line, a:ranges the list of rows, each
" ending in a comma.  The comma of the last row is removed.
func! s:WriteConvertTable(bufname, decl, ranges)
  new
  exe "file " . a:bufname
  call setline(1, a:decl)
  call setline(2, "{")
  if !empty(a:ranges)
    call append('$', a:ranges)
    " Remove the last comma.  Must be skipped for an empty table, otherwise
    " the "{" line would be wiped.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   a:name   suffix for the table and buffer name: "to" . a:name
"   a:index  index of the field in a s:dataprops item that holds the mapped
"            code point in hex; items where this field is empty are skipped
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    if p[a:index] != ''
      call add(pairs, [('0x' . p[0]) + 0, ('0x' . p[a:index]) + 0])
    endif
  endfor
  call s:WriteConvertTable("to" . a:name,
	\ "static convertStruct to" . a:name . "[] =",
	\ s:PairsToRanges(pairs))
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.  Only items with status 'C' or 'S' are used.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    if p[1] == 'C' || p[1] == 'S'
      call add(pairs, [('0x' . p[0]) + 0, ('0x' . p[2]) + 0])
    endif
  endfor
  call s:WriteConvertTable("foldCase",
	\ "static convertStruct foldCase[] =",
	\ s:PairsToRanges(pairs))
endfunc

" Append one formatted convertStruct row to the list a:ranges.
"   a:start, a:end  first and last code point of the range (printed in hex)
"   a:step          distance between code points; zero is written as -1
"   a:add           offset to add to a code point to get the converted one
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.

" Append one formatted "struct interval" row for a:start .. a:end to the list
" a:ranges.
func! s:AddInterval(ranges, start, end)
  call add(a:ranges, printf("\t{0x%04x, 0x%04x},", a:start, a:end))
endfunc

" Put a "struct interval" table called a:name in a new buffer with the same
" name and go back to the previous window.  a:ranges is the list of rows,
" each ending in a comma.  The comma of the last row is removed.
func! s:WriteIntervalTable(name, ranges)
  new
  exe "file " . a:name
  call setline(1, " static struct interval " . a:name . "[] =")
  call setline(2, " {")
  if !empty(a:ranges)
    call append('$', a:ranges)
    " Remove the last comma.  Must be skipped for an empty table, otherwise
    " the " {" line would be damaged.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" Build the combining table in a new buffer.
" Uses s:dataprops.  Characters with category Mn, Mc or Me are included;
" adjacent code points are merged into one interval.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] == 'Mn' || p[2] == 'Mc' || p[2] == 'Me'
      let n = ('0x' . p[0]) + 0
      if start >= 0 && end + 1 == n
        " Continue with the same range.
        let end = n
      else
        if start >= 0
          " Produce the previous range.
          call s:AddInterval(ranges, start, end)
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call s:AddInterval(ranges, start, end)
  endif

  call s:WriteIntervalTable("combining", ranges)
endfunc

" Build the ambiguous table in a new buffer.
" Uses s:widthprops and s:dataprops.  Items of s:widthprops whose width
" starts with 'A' are included, unless the character is a composing char
" (category Mn, Mc or Me) according to s:dataprops.
" The first field of an item is either one code point or a range written as
" "XXXX..YYYY".  For a range the category of the first code point is used.
" Both lists are walked in one pass with a forward-only index into
" s:dataprops.
func! BuildAmbiguousTable()
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] != 'A'
      continue
    endif

    " First and last code point of this item.  Converting "0xXXXX..YYYY" to
    " a number stops at the first dot, which gives the first code point.
    let n = ('0x' . p[0]) + 0
    if p[0] =~ '\.\.'
      let n_last = ('0x' . substitute(p[0], '.*\.\.', '', '')) + 0
    else
      let n_last = n
    endif

    " Find this char in the data table.  Stop at the end of the table
    " instead of indexing beyond it.
    let dn = -1
    while dataidx < datalen
      let dn = ('0x' . s:dataprops[dataidx][0]) + 0
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile
    if dn != n
      echoerr "Cannot find character " . n . " in data table"
    endif
    if dataidx >= datalen
      " No data record left to get the category from, skip this item.
      continue
    endif

    " Only use the char when it's not a composing char.
    let dp = s:dataprops[dataidx]
    if dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me'
      if start < 0 || end + 1 != n
        if start >= 0
          " Produce the previous range.
          call s:AddInterval(ranges, start, end)
        endif
        let start = n
      endif
      " Always extend to the last code point of the item, also when it
      " continues the previous range.  Using "n" here would cut off a
      " "XXXX..YYYY" item that directly follows the previous one.
      let end = n_last
    endif
  endfor
  if start >= 0
    call s:AddInterval(ranges, start, end)
  endif

  call s:WriteIntervalTable("ambiguous", ranges)
endfunc



" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the ambiguous table.
call BuildAmbiguousTable()