1 2source [file join [file dirname [info script]] parseunicode.tcl] 3 4proc print_rd {map} { 5 global tl_lookup_table 6 set aChar [list] 7 set lRange [list] 8 9 set nRange 1 10 set iFirst [lindex $map 0 0] 11 set cPrev [lindex $map 0 1] 12 set fPrev [lindex $map 0 2] 13 14 foreach m [lrange $map 1 end] { 15 foreach {i c f} $m {} 16 17 if {$cPrev == $c && $fPrev==$f} { 18 for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} { 19 if {[info exists tl_lookup_table($j)]==0} break 20 } 21 22 if {$j==$i} { 23 set nNew [expr {(1 + $i - $iFirst)}] 24 if {$nNew<=8} { 25 set nRange $nNew 26 continue 27 } 28 } 29 } 30 31 lappend lRange [list $iFirst $nRange] 32 lappend aChar $cPrev 33 lappend aFlag $fPrev 34 35 set iFirst $i 36 set cPrev $c 37 set fPrev $f 38 set nRange 1 39 } 40 lappend lRange [list $iFirst $nRange] 41 lappend aChar $cPrev 42 lappend aFlag $fPrev 43 44 puts "/*" 45 puts "** If the argument is a codepoint corresponding to a lowercase letter" 46 puts "** in the ASCII range with a diacritic added, return the codepoint" 47 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" 48 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" 49 puts "** E\"). The resuls of passing a codepoint that corresponds to an" 50 puts "** uppercase letter are undefined." 51 puts "*/" 52 puts "static int ${::remove_diacritic}(int c, int bComplex)\{" 53 puts " unsigned short aDia\[\] = \{" 54 puts -nonewline " 0, " 55 set i 1 56 foreach r $lRange { 57 foreach {iCode nRange} $r {} 58 if {($i % 8)==0} {puts "" ; puts -nonewline " " } 59 incr i 60 61 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] 62 puts -nonewline ", " 63 } 64 puts "" 65 puts " \};" 66 puts "#define HIBIT ((unsigned char)0x80)" 67 puts " unsigned char aChar\[\] = \{" 68 puts -nonewline " '\\0', " 69 set i 1 70 foreach c $aChar f $aFlag { 71 if { $f } { 72 set str "'$c'|HIBIT, " 73 } else { 74 set str "'$c', " 75 } 76 if {$c == ""} { set str "'\\0', " } 77 78 if {($i % 6)==0} {puts "" ; puts -nonewline " " } 79 incr i 80 puts -nonewline "$str" 81 } 82 puts "" 83 puts " \};" 84 puts { 85 unsigned int key = (((unsigned int)c)<<3) | 0x00000007; 86 int iRes = 0; 87 int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; 88 int iLo = 0; 89 while( iHi>=iLo ){ 90 int iTest = (iHi + iLo) / 2; 91 if( key >= aDia[iTest] ){ 92 iRes = iTest; 93 iLo = iTest+1; 94 }else{ 95 iHi = iTest-1; 96 } 97 } 98 assert( key>=aDia[iRes] ); 99 if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; 100 return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);} 101 puts "\}" 102} 103 104proc print_isdiacritic {zFunc map} { 105 106 set lCode [list] 107 foreach m $map { 108 foreach {code char flag} $m {} 109 if {$flag} continue 110 if {$code && $char == ""} { lappend lCode $code } 111 } 112 set lCode [lsort -integer $lCode] 113 set iFirst [lindex $lCode 0] 114 set iLast [lindex $lCode end] 115 116 set i1 0 117 set i2 0 118 119 foreach c $lCode { 120 set i [expr $c - $iFirst] 121 if {$i < 32} { 122 set i1 [expr {$i1 | (1<<$i)}] 123 } else { 124 set i2 [expr {$i2 | (1<<($i-32))}] 125 } 126 } 127 128 puts "/*" 129 puts "** Return true if the argument interpreted as a unicode codepoint" 130 puts "** is a diacritical modifier character." 131 puts "*/" 132 puts "int ${zFunc}\(int c)\{" 133 puts " unsigned int mask0 = [format "0x%08X" $i1];" 134 puts " unsigned int mask1 = [format "0x%08X" $i2];" 135 136 puts " if( c<$iFirst || c>$iLast ) return 0;" 137 puts " return (c < $iFirst+32) ?" 138 puts " (mask0 & ((unsigned int)1 << (c-$iFirst))) :" 139 puts " (mask1 & ((unsigned int)1 << (c-$iFirst-32)));" 140 puts "\}" 141} 142 143 144#------------------------------------------------------------------------- 145 146proc an_load_separator_ranges {} { 147 global unicodedata.txt 148 set lSep [an_load_unicodedata_text ${unicodedata.txt}] 149 unset -nocomplain iFirst 150 unset -nocomplain nRange 151 set lRange [list] 152 foreach sep $lSep { 153 if {0==[info exists iFirst]} { 154 set iFirst $sep 155 set nRange 1 156 } elseif { $sep == ($iFirst+$nRange) } { 157 incr nRange 158 } else { 159 lappend lRange [list $iFirst $nRange] 160 set iFirst $sep 161 set nRange 1 162 } 163 } 164 lappend lRange [list $iFirst $nRange] 165 set lRange 166} 167 168proc an_print_range_array {lRange} { 169 set iFirstMax 0 170 set nRangeMax 0 171 foreach range $lRange { 172 foreach {iFirst nRange} $range {} 173 if {$iFirst > $iFirstMax} {set iFirstMax $iFirst} 174 if {$nRange > $nRangeMax} {set nRangeMax $nRange} 175 } 176 if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"} 177 if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"} 178 179 puts -nonewline " " 180 puts [string trim { 181 /* Each unsigned integer in the following array corresponds to a contiguous 182 ** range of unicode codepoints that are not either letters or numbers (i.e. 183 ** codepoints for which this function should return 0). 184 ** 185 ** The most significant 22 bits in each 32-bit value contain the first 186 ** codepoint in the range. The least significant 10 bits are used to store 187 ** the size of the range (always at least 1). In other words, the value 188 ** ((C<<22) + N) represents a range of N codepoints starting with codepoint 189 ** C. It is not possible to represent a range larger than 1023 codepoints 190 ** using this format. 191 */ 192 }] 193 puts -nonewline " static const unsigned int aEntry\[\] = \{" 194 set i 0 195 foreach range $lRange { 196 foreach {iFirst nRange} $range {} 197 set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]] 198 199 if {($i % 5)==0} {puts "" ; puts -nonewline " "} 200 puts -nonewline " $u32," 201 incr i 202 } 203 puts "" 204 puts " \};" 205} 206 207proc an_print_ascii_bitmap {lRange} { 208 foreach range $lRange { 209 foreach {iFirst nRange} $range {} 210 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { 211 if {$i<=127} { set a($i) 1 } 212 } 213 } 214 215 set aAscii [list 0 0 0 0] 216 foreach key [array names a] { 217 set idx [expr $key >> 5] 218 lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))] 219 } 220 221 puts " static const unsigned int aAscii\[4\] = \{" 222 puts -nonewline " " 223 foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] } 224 puts "" 225 puts " \};" 226} 227 228proc print_isalnum {zFunc lRange} { 229 puts "/*" 230 puts "** Return true if the argument corresponds to a unicode codepoint" 231 puts "** classified as either a letter or a number. Otherwise false." 232 puts "**" 233 puts "** The results are undefined if the value passed to this function" 234 puts "** is less than zero." 235 puts "*/" 236 puts "int ${zFunc}\(int c)\{" 237 an_print_range_array $lRange 238 an_print_ascii_bitmap $lRange 239 puts { 240 if( (unsigned int)c<128 ){ 241 return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 ); 242 }else if( (unsigned int)c<(1<<22) ){ 243 unsigned int key = (((unsigned int)c)<<10) | 0x000003FF; 244 int iRes = 0; 245 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; 246 int iLo = 0; 247 while( iHi>=iLo ){ 248 int iTest = (iHi + iLo) / 2; 249 if( key >= aEntry[iTest] ){ 250 iRes = iTest; 251 iLo = iTest+1; 252 }else{ 253 iHi = iTest-1; 254 } 255 } 256 assert( aEntry[0]<key ); 257 assert( key>=aEntry[iRes] ); 258 return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); 259 } 260 return 1;} 261 puts "\}" 262} 263 264proc print_test_isalnum {zFunc lRange} { 265 foreach range $lRange { 266 foreach {iFirst nRange} $range {} 267 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 } 268 } 269 270 puts "static int isalnum_test(int *piCode)\{" 271 puts -nonewline " unsigned char aAlnum\[\] = \{" 272 for {set i 0} {$i < 70000} {incr i} { 273 if {($i % 32)==0} { puts "" ; puts -nonewline " " } 274 set bFlag [expr ![info exists a($i)]] 275 puts -nonewline "${bFlag}," 276 } 277 puts "" 278 puts " \};" 279 280 puts -nonewline " int aLargeSep\[\] = \{" 281 set i 0 282 foreach iSep [lsort -integer [array names a]] { 283 if {$iSep<70000} continue 284 if {($i % 8)==0} { puts "" ; puts -nonewline " " } 285 puts -nonewline " $iSep," 286 incr i 287 } 288 puts "" 289 puts " \};" 290 puts -nonewline " int aLargeOther\[\] = \{" 291 set i 0 292 foreach iSep [lsort -integer [array names a]] { 293 if {$iSep<70000} continue 294 if {[info exists a([expr $iSep-1])]==0} { 295 if {($i % 8)==0} { puts "" ; puts -nonewline " " } 296 puts -nonewline " [expr $iSep-1]," 297 incr i 298 } 299 if {[info exists a([expr $iSep+1])]==0} { 300 if {($i % 8)==0} { puts "" ; puts -nonewline " " } 301 puts -nonewline " [expr $iSep+1]," 302 incr i 303 } 304 } 305 puts "" 306 puts " \};" 307 308 puts [subst -nocommands { 309 int i; 310 for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){ 311 if( ${zFunc}(i)!=aAlnum[i] ){ 312 *piCode = i; 313 return 1; 314 } 315 } 316 for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){ 317 if( ${zFunc}(aLargeSep[i])!=0 ){ 318 *piCode = aLargeSep[i]; 319 return 1; 320 } 321 } 322 for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){ 323 if( ${zFunc}(aLargeOther[i])!=1 ){ 324 *piCode = aLargeOther[i]; 325 return 1; 326 } 327 } 328 }] 329 puts " return 0;" 330 puts "\}" 331} 332 333#------------------------------------------------------------------------- 334 335proc tl_create_records {} { 336 global tl_lookup_table 337 338 set iFirst "" 339 set nOff 0 340 set nRange 0 341 set nIncr 0 342 343 set lRecord [list] 344 foreach code [lsort -integer [array names tl_lookup_table]] { 345 set mapping $tl_lookup_table($code) 346 if {$iFirst == ""} { 347 set iFirst $code 348 set nOff [expr $mapping - $code] 349 set nRange 1 350 set nIncr 1 351 } else { 352 set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))] 353 if { $nRange==1 && ($diff==1 || $diff==2) } { 354 set nIncr $diff 355 } 356 357 if {$diff != $nIncr || ($mapping - $code)!=$nOff} { 358 if { $nRange==1 } {set nIncr 1} 359 lappend lRecord [list $iFirst $nIncr $nRange $nOff] 360 set iFirst $code 361 set nOff [expr $mapping - $code] 362 set nRange 1 363 set nIncr 1 364 } else { 365 incr nRange 366 } 367 } 368 } 369 370 lappend lRecord [list $iFirst $nIncr $nRange $nOff] 371 372 set lRecord 373} 374 375proc tl_print_table_header {} { 376 puts -nonewline " " 377 puts [string trim { 378 /* Each entry in the following array defines a rule for folding a range 379 ** of codepoints to lower case. The rule applies to a range of nRange 380 ** codepoints starting at codepoint iCode. 381 ** 382 ** If the least significant bit in flags is clear, then the rule applies 383 ** to all nRange codepoints (i.e. all nRange codepoints are upper case and 384 ** need to be folded). Or, if it is set, then the rule only applies to 385 ** every second codepoint in the range, starting with codepoint C. 386 ** 387 ** The 7 most significant bits in flags are an index into the aiOff[] 388 ** array. If a specific codepoint C does require folding, then its lower 389 ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF). 390 ** 391 ** The contents of this array are generated by parsing the CaseFolding.txt 392 ** file distributed as part of the "Unicode Character Database". See 393 ** http://www.unicode.org for details. 394 */ 395 }] 396 puts " static const struct TableEntry \{" 397 puts " unsigned short iCode;" 398 puts " unsigned char flags;" 399 puts " unsigned char nRange;" 400 puts " \} aEntry\[\] = \{" 401} 402 403proc tl_print_table_entry {togglevar entry liOff} { 404 upvar $togglevar t 405 foreach {iFirst nIncr nRange nOff} $entry {} 406 407 if {$iFirst > (1<<16)} { return 1 } 408 409 if {[info exists t]==0} {set t 0} 410 if {$t==0} { puts -nonewline " " } 411 412 set flags 0 413 if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]} 414 if {$nOff<0} { incr nOff [expr (1<<16)] } 415 416 set idx [lsearch $liOff $nOff] 417 if {$idx<0} {error "malfunction generating aiOff"} 418 set flags [expr $flags + $idx*2] 419 420 set txt "{$iFirst, $flags, $nRange}," 421 if {$t==2} { 422 puts $txt 423 } else { 424 puts -nonewline [format "% -23s" $txt] 425 } 426 set t [expr ($t+1)%3] 427 428 return 0 429} 430 431proc tl_print_table_footer {togglevar} { 432 upvar $togglevar t 433 if {$t!=0} {puts ""} 434 puts " \};" 435} 436 437proc tl_print_if_entry {entry} { 438 foreach {iFirst nIncr nRange nOff} $entry {} 439 if {$nIncr==2} {error "tl_print_if_entry needs improvement!"} 440 441 puts " else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{" 442 puts " ret = c + $nOff;" 443 puts " \}" 444} 445 446proc tl_generate_ioff_table {lRecord} { 447 foreach entry $lRecord { 448 foreach {iFirst nIncr nRange iOff} $entry {} 449 if {$iOff<0} { incr iOff [expr (1<<16)] } 450 if {[info exists a($iOff)]} continue 451 set a($iOff) 1 452 } 453 454 set liOff [lsort -integer [array names a]] 455 if {[llength $liOff]>128} { error "Too many distinct ioffs" } 456 return $liOff 457} 458 459proc tl_print_ioff_table {liOff} { 460 puts -nonewline " static const unsigned short aiOff\[\] = \{" 461 set i 0 462 foreach off $liOff { 463 if {($i % 8)==0} {puts "" ; puts -nonewline " "} 464 puts -nonewline [format "% -7s" "$off,"] 465 incr i 466 } 467 puts "" 468 puts " \};" 469 470} 471 472proc print_fold {zFunc} { 473 474 set lRecord [tl_create_records] 475 476 set lHigh [list] 477 puts "/*" 478 puts "** Interpret the argument as a unicode codepoint. If the codepoint" 479 puts "** is an upper case character that has a lower case equivalent," 480 puts "** return the codepoint corresponding to the lower case version." 481 puts "** Otherwise, return a copy of the argument." 482 puts "**" 483 puts "** The results are undefined if the value passed to this function" 484 puts "** is less than zero." 485 puts "*/" 486 puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{" 487 488 set liOff [tl_generate_ioff_table $lRecord] 489 tl_print_table_header 490 foreach entry $lRecord { 491 if {[tl_print_table_entry toggle $entry $liOff]} { 492 lappend lHigh $entry 493 } 494 } 495 tl_print_table_footer toggle 496 tl_print_ioff_table $liOff 497 498 puts [subst -nocommands { 499 int ret = c; 500 501 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); 502 503 if( c<128 ){ 504 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); 505 }else if( c<65536 ){ 506 const struct TableEntry *p; 507 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; 508 int iLo = 0; 509 int iRes = -1; 510 511 assert( c>aEntry[0].iCode ); 512 while( iHi>=iLo ){ 513 int iTest = (iHi + iLo) / 2; 514 int cmp = (c - aEntry[iTest].iCode); 515 if( cmp>=0 ){ 516 iRes = iTest; 517 iLo = iTest+1; 518 }else{ 519 iHi = iTest-1; 520 } 521 } 522 523 assert( iRes>=0 && c>=aEntry[iRes].iCode ); 524 p = &aEntry[iRes]; 525 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ 526 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; 527 assert( ret>0 ); 528 } 529 530 if( eRemoveDiacritic ){ 531 ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2); 532 } 533 } 534 }] 535 536 foreach entry $lHigh { 537 tl_print_if_entry $entry 538 } 539 540 puts "" 541 puts " return ret;" 542 puts "\}" 543} 544 545proc code {txt} { 546 set txt [string trimright $txt] 547 set txt [string trimleft $txt "\n"] 548 set n [expr {[string length $txt] - [string length [string trim $txt]]}] 549 set ret "" 550 foreach L [split $txt "\n"] { 551 append ret "[string range $L $n end]\n" 552 } 553 return [uplevel "subst -nocommands {$ret}"] 554} 555 556proc intarray {lInt} { 557 set ret "" 558 set n [llength $lInt] 559 for {set i 0} {$i < $n} {incr i 10} { 560 append ret "\n " 561 foreach int [lrange $lInt $i [expr $i+9]] { 562 append ret [format "%-7s" "$int, "] 563 } 564 } 565 append ret "\n " 566 set ret 567} 568 569proc categories_switch {Cvar first lSecond} { 570 upvar $Cvar C 571 set ret "" 572 append ret "case '$first':\n" 573 append ret " switch( zCat\[1\] ){\n" 574 foreach s $lSecond { 575 append ret " case '$s': aArray\[$C($first$s)\] = 1; break;\n" 576 } 577 append ret " case '*': \n" 578 foreach s $lSecond { 579 append ret " aArray\[$C($first$s)\] = 1;\n" 580 } 581 append ret " break;\n" 582 append ret " default: return 1;" 583 append ret " }\n" 584 append ret " break;\n" 585} 586 587# Argument is a list. Each element of which is itself a list of two elements: 588# 589# * the codepoint 590# * the category 591# 592# List elements are sorted in order of codepoint. 593# 594proc print_categories {lMap} { 595 set categories { 596 Cc Cf Cn Cs 597 Ll Lm Lo Lt Lu 598 Mc Me Mn 599 Nd Nl No 600 Pc Pd Pe Pf Pi Po Ps 601 Sc Sk Sm So 602 Zl Zp Zs 603 604 LC Co 605 } 606 607 for {set i 0} {$i < [llength $categories]} {incr i} { 608 set C([lindex $categories $i]) [expr 1+$i] 609 } 610 611 set caseC [categories_switch C C {c f n s o}] 612 set caseL [categories_switch C L {l m o t u C}] 613 set caseM [categories_switch C M {c e n}] 614 set caseN [categories_switch C N {d l o}] 615 set caseP [categories_switch C P {c d e f i o s}] 616 set caseS [categories_switch C S {c k m o}] 617 set caseZ [categories_switch C Z {l p s}] 618 619 set nCat [expr [llength [array names C]] + 1] 620 puts [code { 621 int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){ 622 aArray[0] = 1; 623 switch( zCat[0] ){ 624 $caseC 625 $caseL 626 $caseM 627 $caseN 628 $caseP 629 $caseS 630 $caseZ 631 } 632 return 0; 633 } 634 }] 635 636 set nRepeat 0 637 set first [lindex $lMap 0 0] 638 set class [lindex $lMap 0 1] 639 set prev -1 640 641 set CASE(0) "Lu" 642 set CASE(1) "Ll" 643 644 foreach m $lMap { 645 foreach {codepoint cl} $m {} 646 set codepoint [expr "0x$codepoint"] 647 if {$codepoint>=(1<<20)} continue 648 649 set bNew 0 650 if {$codepoint!=($prev+1)} { 651 set bNew 1 652 } elseif { 653 $cl==$class || ($class=="LC" && $cl==$CASE([expr $nRepeat & 0x01])) 654 } { 655 incr nRepeat 656 } elseif {$class=="Lu" && $nRepeat==1 && $cl=="Ll"} { 657 set class LC 658 incr nRepeat 659 } else { 660 set bNew 1 661 } 662 if {$bNew} { 663 lappend lEntries [list $first $class $nRepeat] 664 set nRepeat 1 665 set first $codepoint 666 set class $cl 667 } 668 set prev $codepoint 669 } 670 if {$nRepeat>0} { 671 lappend lEntries [list $first $class $nRepeat] 672 } 673 674 set aBlock [list 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] 675 set aMap [list] 676 foreach e $lEntries { 677 foreach {cp class nRepeat} $e {} 678 set block [expr ($cp>>16)] 679 if {$block>0 && [lindex $aBlock $block]==0} { 680 for {set i 1} {$i<=$block} {incr i} { 681 if {[lindex $aBlock $i]==0} { 682 lset aBlock $i [llength $aMap] 683 } 684 } 685 } 686 lappend aMap [expr {$cp & 0xFFFF}] 687 lappend aData [expr {($nRepeat << 5) + $C($class)}] 688 } 689 for {set i 1} {$i<[llength $aBlock]} {incr i} { 690 if {[lindex $aBlock $i]==0} { 691 lset aBlock $i [llength $aMap] 692 } 693 } 694 695 set aBlockArray [intarray $aBlock] 696 set aMapArray [intarray $aMap] 697 set aDataArray [intarray $aData] 698 puts [code { 699 static u16 aFts5UnicodeBlock[] = {$aBlockArray}; 700 static u16 aFts5UnicodeMap[] = {$aMapArray}; 701 static u16 aFts5UnicodeData[] = {$aDataArray}; 702 703 int sqlite3Fts5UnicodeCategory(u32 iCode) { 704 int iRes = -1; 705 int iHi; 706 int iLo; 707 int ret; 708 u16 iKey; 709 710 if( iCode>=(1<<20) ){ 711 return 0; 712 } 713 iLo = aFts5UnicodeBlock[(iCode>>16)]; 714 iHi = aFts5UnicodeBlock[1+(iCode>>16)]; 715 iKey = (iCode & 0xFFFF); 716 while( iHi>iLo ){ 717 int iTest = (iHi + iLo) / 2; 718 assert( iTest>=iLo && iTest<iHi ); 719 if( iKey>=aFts5UnicodeMap[iTest] ){ 720 iRes = iTest; 721 iLo = iTest+1; 722 }else{ 723 iHi = iTest; 724 } 725 } 726 727 if( iRes<0 ) return 0; 728 if( iKey>=(aFts5UnicodeMap[iRes]+(aFts5UnicodeData[iRes]>>5)) ) return 0; 729 ret = aFts5UnicodeData[iRes] & 0x1F; 730 if( ret!=$C(LC) ) return ret; 731 return ((iKey - aFts5UnicodeMap[iRes]) & 0x01) ? $C(Ll) : $C(Lu); 732 } 733 734 void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){ 735 int i = 0; 736 int iTbl = 0; 737 while( i<128 ){ 738 int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ]; 739 int n = (aFts5UnicodeData[iTbl] >> 5) + i; 740 for(; i<128 && i<n; i++){ 741 aAscii[i] = (u8)bToken; 742 } 743 iTbl++; 744 } 745 aAscii[0] = 0; /* 0x00 is never a token character */ 746 } 747 }] 748} 749 750proc print_test_categories {lMap} { 751 752 set lCP [list] 753 foreach e $lMap { 754 foreach {cp cat} $e {} 755 if {[expr 0x$cp] < (1<<20)} { 756 lappend lCP "{0x$cp, \"$cat\"}, " 757 } 758 } 759 760 set aCP "\n" 761 for {set i 0} {$i < [llength $lCP]} {incr i 4} { 762 append aCP " [join [lrange $lCP $i $i+3]]\n" 763 } 764 765 766 puts [code { 767 static int categories_test (int *piCode){ 768 struct Codepoint { 769 int iCode; 770 const char *zCat; 771 } aCP[] = {$aCP}; 772 int i; 773 int iCP = 0; 774 775 for(i=0; i<1000000; i++){ 776 u8 aArray[40]; 777 int cat = 0; 778 int c = 0; 779 memset(aArray, 0, sizeof(aArray)); 780 if( aCP[iCP].iCode==i ){ 781 sqlite3Fts5UnicodeCatParse(aCP[iCP].zCat, aArray); 782 iCP++; 783 }else{ 784 aArray[0] = 1; 785 } 786 787 c = sqlite3Fts5UnicodeCategory((u32)i); 788 if( aArray[c]==0 ){ 789 *piCode = i; 790 return 1; 791 } 792 } 793 794 return 0; 795 } 796 }] 797} 798 799proc print_fold_test {zFunc mappings} { 800 global tl_lookup_table 801 802 foreach m $mappings { 803 set c [lindex $m 1] 804 if {$c == ""} { 805 set extra([lindex $m 0]) 0 806 } else { 807 scan $c %c i 808 set extra([lindex $m 0]) $i 809 } 810 } 811 812 puts "static int fold_test(int *piCode)\{" 813 puts -nonewline " static int aLookup\[\] = \{" 814 for {set i 0} {$i < 70000} {incr i} { 815 816 set expected $i 817 catch { set expected $tl_lookup_table($i) } 818 set expected2 $expected 819 catch { set expected2 $extra($expected2) } 820 821 if {($i % 4)==0} { puts "" ; puts -nonewline " " } 822 puts -nonewline "$expected, $expected2, " 823 } 824 puts " \};" 825 puts " int i;" 826 puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{" 827 puts " int iCode = (i/2);" 828 puts " int bFlag = i & 0x0001;" 829 puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{" 830 puts " *piCode = iCode;" 831 puts " return 1;" 832 puts " \}" 833 puts " \}" 834 puts " return 0;" 835 puts "\}" 836} 837 838 839proc print_fileheader {} { 840 puts [string trim { 841/* 842** 2012-05-25 843** 844** The author disclaims copyright to this source code. In place of 845** a legal notice, here is a blessing: 846** 847** May you do good and not evil. 848** May you find forgiveness for yourself and forgive others. 849** May you share freely, never taking more than you give. 850** 851****************************************************************************** 852*/ 853 854/* 855** DO NOT EDIT THIS MACHINE GENERATED FILE. 856*/ 857 }] 858 puts "" 859 if {$::generate_fts5_code} { 860 # no-op 861 } else { 862 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE" 863 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" 864 } 865 puts "" 866 puts "#include <assert.h>" 867 puts "" 868} 869 870proc print_test_main {} { 871 puts "" 872 puts "#include <stdio.h>" 873 puts "" 874 puts "int main(int argc, char **argv)\{" 875 puts " int r1, r2, r3;" 876 puts " int code;" 877 puts " r3 = 0;" 878 puts " r1 = isalnum_test(&code);" 879 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" 880 puts " else printf(\"isalnum(): test passed\\n\");" 881 puts " r2 = fold_test(&code);" 882 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" 883 puts " else printf(\"fold(): test passed\\n\");" 884 if {$::generate_fts5_code} { 885 puts " r3 = categories_test(&code);" 886 puts " if( r3 ) printf(\"categories(): Problem with code %d\\n\",code);" 887 puts " else printf(\"categories(): test passed\\n\");" 888 } 889 puts " return (r1 || r2 || r3);" 890 puts "\}" 891} 892 893# Proces the command line arguments. Exit early if they are not to 894# our liking. 895# 896proc usage {} { 897 puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? " 898 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>" 899 exit 1 900} 901if {[llength $argv]<2} usage 902set unicodedata.txt [lindex $argv end] 903set casefolding.txt [lindex $argv end-1] 904 905set remove_diacritic remove_diacritic 906set generate_test_code 0 907set generate_fts5_code 0 908set function_prefix "sqlite3Fts" 909for {set i 0} {$i < [llength $argv]-2} {incr i} { 910 switch -- [lindex $argv $i] { 911 -test { 912 set generate_test_code 1 913 } 914 -fts5 { 915 set function_prefix sqlite3Fts5 916 set generate_fts5_code 1 917 set remove_diacritic fts5_remove_diacritic 918 } 919 default { 920 usage 921 } 922 } 923} 924 925print_fileheader 926 927if {$::generate_test_code} { 928 puts "typedef unsigned short int u16;" 929 puts "typedef unsigned char u8;" 930 puts "#include <string.h>" 931} 932 933# Print the isalnum() function to stdout. 934# 935set lRange [an_load_separator_ranges] 936if {$generate_fts5_code==0} { 937 print_isalnum ${function_prefix}UnicodeIsalnum $lRange 938} 939 940# Leave a gap between the two generated C functions. 941# 942puts "" 943puts "" 944 945# Load the fold data. This is used by the [rd_XXX] commands 946# as well as [print_fold]. 947tl_load_casefolding_txt ${casefolding.txt} 948 949set mappings [rd_load_unicodedata_text ${unicodedata.txt}] 950print_rd $mappings 951puts "" 952puts "" 953print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings 954puts "" 955puts "" 956 957# Print the fold() function to stdout. 958# 959print_fold ${function_prefix}UnicodeFold 960 961if {$generate_fts5_code} { 962 puts "" 963 puts "" 964 print_categories [cc_load_unicodedata_text ${unicodedata.txt}] 965} 966 967# Print the test routines and main() function to stdout, if -test 968# was specified. 969# 970if {$::generate_test_code} { 971 if {$generate_fts5_code==0} { 972 print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange 973 } 974 print_fold_test ${function_prefix}UnicodeFold $mappings 975 print_test_categories [cc_load_unicodedata_text ${unicodedata.txt}] 976 print_test_main 977} 978 979if {$generate_fts5_code} { 980 # no-op 981} else { 982 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" 983 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */" 984} 985