1
2source [file join [file dirname [info script]] parseunicode.tcl]
3
4proc print_rd {map} {
5  global tl_lookup_table
6  set aChar [list]
7  set lRange [list]
8
9  set nRange 1
10  set iFirst  [lindex $map 0 0]
11  set cPrev   [lindex $map 0 1]
12  set fPrev   [lindex $map 0 2]
13
14  foreach m [lrange $map 1 end] {
15    foreach {i c f} $m {}
16
17    if {$cPrev == $c && $fPrev==$f} {
18      for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
19        if {[info exists tl_lookup_table($j)]==0} break
20      }
21
22      if {$j==$i} {
23        set nNew [expr {(1 + $i - $iFirst)}]
24        if {$nNew<=8} {
25          set nRange $nNew
26          continue
27        }
28      }
29    }
30
31    lappend lRange [list $iFirst $nRange]
32    lappend aChar  $cPrev
33    lappend aFlag  $fPrev
34
35    set iFirst $i
36    set cPrev  $c
37    set fPrev  $f
38    set nRange 1
39  }
40  lappend lRange [list $iFirst $nRange]
41  lappend aChar $cPrev
42  lappend aFlag $fPrev
43
44  puts "/*"
45  puts "** If the argument is a codepoint corresponding to a lowercase letter"
46  puts "** in the ASCII range with a diacritic added, return the codepoint"
47  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
48  puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
49  puts "** E\"). The resuls of passing a codepoint that corresponds to an"
50  puts "** uppercase letter are undefined."
51  puts "*/"
52  puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
53  puts "  unsigned short aDia\[\] = \{"
54  puts -nonewline "        0, "
55  set i 1
56  foreach r $lRange {
57    foreach {iCode nRange} $r {}
58    if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
59    incr i
60
61    puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
62    puts -nonewline ", "
63  }
64  puts ""
65  puts "  \};"
66  puts "#define HIBIT ((unsigned char)0x80)"
67  puts "  unsigned char aChar\[\] = \{"
68  puts -nonewline "    '\\0',      "
69  set i 1
70  foreach c $aChar f $aFlag {
71    if { $f } {
72      set str "'$c'|HIBIT, "
73    } else {
74      set str "'$c',       "
75    }
76    if {$c == ""} { set str "'\\0',      " }
77
78    if {($i % 6)==0} {puts "" ; puts -nonewline "    " }
79    incr i
80    puts -nonewline "$str"
81  }
82  puts ""
83  puts "  \};"
84  puts {
85  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
86  int iRes = 0;
87  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
88  int iLo = 0;
89  while( iHi>=iLo ){
90    int iTest = (iHi + iLo) / 2;
91    if( key >= aDia[iTest] ){
92      iRes = iTest;
93      iLo = iTest+1;
94    }else{
95      iHi = iTest-1;
96    }
97  }
98  assert( key>=aDia[iRes] );
99  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
100  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
101  puts "\}"
102}
103
104proc print_isdiacritic {zFunc map} {
105
106  set lCode [list]
107  foreach m $map {
108    foreach {code char flag} $m {}
109    if {$flag} continue
110    if {$code && $char == ""} { lappend lCode $code }
111  }
112  set lCode [lsort -integer $lCode]
113  set iFirst [lindex $lCode 0]
114  set iLast [lindex $lCode end]
115
116  set i1 0
117  set i2 0
118
119  foreach c $lCode {
120    set i [expr $c - $iFirst]
121    if {$i < 32} {
122      set i1 [expr {$i1 | (1<<$i)}]
123    } else {
124      set i2 [expr {$i2 | (1<<($i-32))}]
125    }
126  }
127
128  puts "/*"
129  puts "** Return true if the argument interpreted as a unicode codepoint"
130  puts "** is a diacritical modifier character."
131  puts "*/"
132  puts "int ${zFunc}\(int c)\{"
133  puts "  unsigned int mask0 = [format "0x%08X" $i1];"
134  puts "  unsigned int mask1 = [format "0x%08X" $i2];"
135
136  puts "  if( c<$iFirst || c>$iLast ) return 0;"
137  puts "  return (c < $iFirst+32) ?"
138  puts "      (mask0 & ((unsigned int)1 << (c-$iFirst))) :"
139  puts "      (mask1 & ((unsigned int)1 << (c-$iFirst-32)));"
140  puts "\}"
141}
142
143
144#-------------------------------------------------------------------------
145
146proc an_load_separator_ranges {} {
147  global unicodedata.txt
148  set lSep [an_load_unicodedata_text ${unicodedata.txt}]
149  unset -nocomplain iFirst
150  unset -nocomplain nRange
151  set lRange [list]
152  foreach sep $lSep {
153    if {0==[info exists iFirst]} {
154      set iFirst $sep
155      set nRange 1
156    } elseif { $sep == ($iFirst+$nRange) } {
157      incr nRange
158    } else {
159      lappend lRange [list $iFirst $nRange]
160      set iFirst $sep
161      set nRange 1
162    }
163  }
164  lappend lRange [list $iFirst $nRange]
165  set lRange
166}
167
168proc an_print_range_array {lRange} {
169  set iFirstMax 0
170  set nRangeMax 0
171  foreach range $lRange {
172    foreach {iFirst nRange} $range {}
173    if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
174    if {$nRange > $nRangeMax} {set nRangeMax $nRange}
175  }
176  if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
177  if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}
178
179  puts -nonewline "  "
180  puts [string trim {
181  /* Each unsigned integer in the following array corresponds to a contiguous
182  ** range of unicode codepoints that are not either letters or numbers (i.e.
183  ** codepoints for which this function should return 0).
184  **
185  ** The most significant 22 bits in each 32-bit value contain the first
186  ** codepoint in the range. The least significant 10 bits are used to store
187  ** the size of the range (always at least 1). In other words, the value
188  ** ((C<<22) + N) represents a range of N codepoints starting with codepoint
189  ** C. It is not possible to represent a range larger than 1023 codepoints
190  ** using this format.
191  */
192  }]
193  puts -nonewline "  static const unsigned int aEntry\[\] = \{"
194  set i 0
195  foreach range $lRange {
196    foreach {iFirst nRange} $range {}
197    set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]]
198
199    if {($i % 5)==0} {puts "" ; puts -nonewline "   "}
200    puts -nonewline " $u32,"
201    incr i
202  }
203  puts ""
204  puts "  \};"
205}
206
207proc an_print_ascii_bitmap {lRange} {
208  foreach range $lRange {
209    foreach {iFirst nRange} $range {}
210    for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} {
211      if {$i<=127} { set a($i) 1 }
212    }
213  }
214
215  set aAscii [list 0 0 0 0]
216  foreach key [array names a] {
217    set idx [expr $key >> 5]
218    lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))]
219  }
220
221  puts "  static const unsigned int aAscii\[4\] = \{"
222  puts -nonewline "   "
223  foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] }
224  puts ""
225  puts "  \};"
226}
227
228proc print_isalnum {zFunc lRange} {
229  puts "/*"
230  puts "** Return true if the argument corresponds to a unicode codepoint"
231  puts "** classified as either a letter or a number. Otherwise false."
232  puts "**"
233  puts "** The results are undefined if the value passed to this function"
234  puts "** is less than zero."
235  puts "*/"
236  puts "int ${zFunc}\(int c)\{"
237  an_print_range_array $lRange
238  an_print_ascii_bitmap $lRange
239  puts {
240  if( (unsigned int)c<128 ){
241    return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
242  }else if( (unsigned int)c<(1<<22) ){
243    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
244    int iRes = 0;
245    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
246    int iLo = 0;
247    while( iHi>=iLo ){
248      int iTest = (iHi + iLo) / 2;
249      if( key >= aEntry[iTest] ){
250        iRes = iTest;
251        iLo = iTest+1;
252      }else{
253        iHi = iTest-1;
254      }
255    }
256    assert( aEntry[0]<key );
257    assert( key>=aEntry[iRes] );
258    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
259  }
260  return 1;}
261  puts "\}"
262}
263
264proc print_test_isalnum {zFunc lRange} {
265  foreach range $lRange {
266    foreach {iFirst nRange} $range {}
267    for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 }
268  }
269
270  puts "static int isalnum_test(int *piCode)\{"
271  puts -nonewline "  unsigned char aAlnum\[\] = \{"
272  for {set i 0} {$i < 70000} {incr i} {
273    if {($i % 32)==0} { puts "" ; puts -nonewline "    " }
274    set bFlag [expr ![info exists a($i)]]
275    puts -nonewline "${bFlag},"
276  }
277  puts ""
278  puts "  \};"
279
280  puts -nonewline "  int aLargeSep\[\] = \{"
281  set i 0
282  foreach iSep [lsort -integer [array names a]] {
283    if {$iSep<70000} continue
284    if {($i % 8)==0} { puts "" ; puts -nonewline "   " }
285    puts -nonewline " $iSep,"
286    incr i
287  }
288  puts ""
289  puts "  \};"
290  puts -nonewline "  int aLargeOther\[\] = \{"
291  set i 0
292  foreach iSep [lsort -integer [array names a]] {
293    if {$iSep<70000} continue
294    if {[info exists a([expr $iSep-1])]==0} {
295      if {($i % 8)==0} { puts "" ; puts -nonewline "   " }
296      puts -nonewline " [expr $iSep-1],"
297      incr i
298    }
299    if {[info exists a([expr $iSep+1])]==0} {
300      if {($i % 8)==0} { puts "" ; puts -nonewline "   " }
301      puts -nonewline " [expr $iSep+1],"
302      incr i
303    }
304  }
305  puts ""
306  puts "  \};"
307
308  puts [subst -nocommands {
309  int i;
310  for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
311    if( ${zFunc}(i)!=aAlnum[i] ){
312      *piCode = i;
313      return 1;
314    }
315  }
316  for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
317    if( ${zFunc}(aLargeSep[i])!=0 ){
318      *piCode = aLargeSep[i];
319      return 1;
320    }
321  }
322  for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
323    if( ${zFunc}(aLargeOther[i])!=1 ){
324      *piCode = aLargeOther[i];
325      return 1;
326    }
327  }
328  }]
329  puts "  return 0;"
330  puts "\}"
331}
332
333#-------------------------------------------------------------------------
334
335proc tl_create_records {} {
336  global tl_lookup_table
337
338  set iFirst ""
339  set nOff 0
340  set nRange 0
341  set nIncr 0
342
343  set lRecord [list]
344  foreach code [lsort -integer [array names tl_lookup_table]] {
345    set mapping $tl_lookup_table($code)
346    if {$iFirst == ""} {
347      set iFirst $code
348      set nOff   [expr $mapping - $code]
349      set nRange 1
350      set nIncr 1
351    } else {
352      set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))]
353      if { $nRange==1 && ($diff==1 || $diff==2) } {
354        set nIncr $diff
355      }
356
357      if {$diff != $nIncr || ($mapping - $code)!=$nOff} {
358        if { $nRange==1 } {set nIncr 1}
359        lappend lRecord [list $iFirst $nIncr $nRange $nOff]
360        set iFirst $code
361        set nOff   [expr $mapping - $code]
362        set nRange 1
363        set nIncr 1
364      } else {
365        incr nRange
366      }
367    }
368  }
369
370  lappend lRecord [list $iFirst $nIncr $nRange $nOff]
371
372  set lRecord
373}
374
375proc tl_print_table_header {} {
376  puts -nonewline "  "
377  puts [string trim {
378  /* Each entry in the following array defines a rule for folding a range
379  ** of codepoints to lower case. The rule applies to a range of nRange
380  ** codepoints starting at codepoint iCode.
381  **
382  ** If the least significant bit in flags is clear, then the rule applies
383  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
384  ** need to be folded). Or, if it is set, then the rule only applies to
385  ** every second codepoint in the range, starting with codepoint C.
386  **
387  ** The 7 most significant bits in flags are an index into the aiOff[]
388  ** array. If a specific codepoint C does require folding, then its lower
389  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
390  **
391  ** The contents of this array are generated by parsing the CaseFolding.txt
392  ** file distributed as part of the "Unicode Character Database". See
393  ** http://www.unicode.org for details.
394  */
395  }]
396  puts "  static const struct TableEntry \{"
397  puts "    unsigned short iCode;"
398  puts "    unsigned char flags;"
399  puts "    unsigned char nRange;"
400  puts "  \} aEntry\[\] = \{"
401}
402
403proc tl_print_table_entry {togglevar entry liOff} {
404  upvar $togglevar t
405  foreach {iFirst nIncr nRange nOff} $entry {}
406
407  if {$iFirst > (1<<16)} { return 1 }
408
409  if {[info exists t]==0} {set t 0}
410  if {$t==0} { puts -nonewline "    " }
411
412  set flags 0
413  if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]}
414  if {$nOff<0}   { incr nOff [expr (1<<16)] }
415
416  set idx [lsearch $liOff $nOff]
417  if {$idx<0} {error "malfunction generating aiOff"}
418  set flags [expr $flags + $idx*2]
419
420  set txt "{$iFirst, $flags, $nRange},"
421  if {$t==2} {
422    puts $txt
423  } else {
424    puts -nonewline [format "% -23s" $txt]
425  }
426  set t [expr ($t+1)%3]
427
428  return 0
429}
430
431proc tl_print_table_footer {togglevar} {
432  upvar $togglevar t
433  if {$t!=0} {puts ""}
434  puts "  \};"
435}
436
437proc tl_print_if_entry {entry} {
438  foreach {iFirst nIncr nRange nOff} $entry {}
439  if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}
440
441  puts "  else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{"
442  puts "    ret = c + $nOff;"
443  puts "  \}"
444}
445
446proc tl_generate_ioff_table {lRecord} {
447  foreach entry $lRecord {
448    foreach {iFirst nIncr nRange iOff} $entry {}
449    if {$iOff<0}   { incr iOff [expr (1<<16)] }
450    if {[info exists a($iOff)]} continue
451    set a($iOff) 1
452  }
453
454  set liOff [lsort -integer [array names a]]
455  if {[llength $liOff]>128} { error "Too many distinct ioffs" }
456  return $liOff
457}
458
459proc tl_print_ioff_table {liOff} {
460  puts -nonewline "  static const unsigned short aiOff\[\] = \{"
461  set i 0
462  foreach off $liOff {
463    if {($i % 8)==0} {puts "" ; puts -nonewline "   "}
464    puts -nonewline [format "% -7s" "$off,"]
465    incr i
466  }
467  puts ""
468  puts "  \};"
469
470}
471
472proc print_fold {zFunc} {
473
474  set lRecord [tl_create_records]
475
476  set lHigh [list]
477  puts "/*"
478  puts "** Interpret the argument as a unicode codepoint. If the codepoint"
479  puts "** is an upper case character that has a lower case equivalent,"
480  puts "** return the codepoint corresponding to the lower case version."
481  puts "** Otherwise, return a copy of the argument."
482  puts "**"
483  puts "** The results are undefined if the value passed to this function"
484  puts "** is less than zero."
485  puts "*/"
486  puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"
487
488  set liOff [tl_generate_ioff_table $lRecord]
489  tl_print_table_header
490  foreach entry $lRecord {
491    if {[tl_print_table_entry toggle $entry $liOff]} {
492      lappend lHigh $entry
493    }
494  }
495  tl_print_table_footer toggle
496  tl_print_ioff_table $liOff
497
498  puts [subst -nocommands {
499  int ret = c;
500
501  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
502
503  if( c<128 ){
504    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
505  }else if( c<65536 ){
506    const struct TableEntry *p;
507    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
508    int iLo = 0;
509    int iRes = -1;
510
511    assert( c>aEntry[0].iCode );
512    while( iHi>=iLo ){
513      int iTest = (iHi + iLo) / 2;
514      int cmp = (c - aEntry[iTest].iCode);
515      if( cmp>=0 ){
516        iRes = iTest;
517        iLo = iTest+1;
518      }else{
519        iHi = iTest-1;
520      }
521    }
522
523    assert( iRes>=0 && c>=aEntry[iRes].iCode );
524    p = &aEntry[iRes];
525    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
526      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
527      assert( ret>0 );
528    }
529
530    if( eRemoveDiacritic ){
531      ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
532    }
533  }
534  }]
535
536  foreach entry $lHigh {
537    tl_print_if_entry $entry
538  }
539
540  puts ""
541  puts "  return ret;"
542  puts "\}"
543}
544
545proc code {txt} {
546  set txt [string trimright $txt]
547  set txt [string trimleft $txt "\n"]
548  set n [expr {[string length $txt] - [string length [string trim $txt]]}]
549  set ret ""
550  foreach L [split $txt "\n"] {
551    append ret "[string range $L $n end]\n"
552  }
553  return [uplevel "subst -nocommands {$ret}"]
554}
555
556proc intarray {lInt} {
557  set ret ""
558  set n [llength $lInt]
559  for {set i 0} {$i < $n} {incr i 10} {
560    append ret "\n    "
561    foreach int [lrange $lInt $i [expr $i+9]] {
562      append ret [format "%-7s" "$int, "]
563    }
564  }
565  append ret "\n  "
566  set ret
567}
568
569proc categories_switch {Cvar first lSecond} {
570  upvar $Cvar C
571  set ret ""
572  append ret "case '$first':\n"
573  append ret "          switch( zCat\[1\] ){\n"
574  foreach s $lSecond {
575    append ret "            case '$s': aArray\[$C($first$s)\] = 1; break;\n"
576  }
577  append ret "            case '*': \n"
578  foreach s $lSecond {
579    append ret "              aArray\[$C($first$s)\] = 1;\n"
580  }
581  append ret "              break;\n"
582  append ret "            default: return 1;"
583  append ret "          }\n"
584  append ret "          break;\n"
585}
586
587# Argument is a list. Each element of which is itself a list of two elements:
588#
589#   * the codepoint
590#   * the category
591#
592# List elements are sorted in order of codepoint.
593#
594proc print_categories {lMap} {
595  set categories {
596    Cc Cf Cn Cs
597    Ll Lm Lo Lt Lu
598    Mc Me Mn
599    Nd Nl No
600    Pc Pd Pe Pf Pi Po Ps
601    Sc Sk Sm So
602    Zl Zp Zs
603
604    LC Co
605  }
606
607  for {set i 0} {$i < [llength $categories]} {incr i} {
608    set C([lindex $categories $i]) [expr 1+$i]
609  }
610
611  set caseC [categories_switch C C {c f n s o}]
612  set caseL [categories_switch C L {l m o t u C}]
613  set caseM [categories_switch C M {c e n}]
614  set caseN [categories_switch C N {d l o}]
615  set caseP [categories_switch C P {c d e f i o s}]
616  set caseS [categories_switch C S {c k m o}]
617  set caseZ [categories_switch C Z {l p s}]
618
619  set nCat [expr [llength [array names C]] + 1]
620  puts [code {
621    int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){
622      aArray[0] = 1;
623      switch( zCat[0] ){
624        $caseC
625        $caseL
626        $caseM
627        $caseN
628        $caseP
629        $caseS
630        $caseZ
631      }
632      return 0;
633    }
634  }]
635
636  set nRepeat 0
637  set first   [lindex $lMap 0 0]
638  set class   [lindex $lMap 0 1]
639  set prev -1
640
641  set CASE(0) "Lu"
642  set CASE(1) "Ll"
643
644  foreach m $lMap {
645    foreach {codepoint cl} $m {}
646    set codepoint [expr "0x$codepoint"]
647    if {$codepoint>=(1<<20)} continue
648
649    set bNew 0
650    if {$codepoint!=($prev+1)} {
651      set bNew 1
652    } elseif {
653      $cl==$class || ($class=="LC" && $cl==$CASE([expr $nRepeat & 0x01]))
654    } {
655      incr nRepeat
656    } elseif {$class=="Lu" && $nRepeat==1 && $cl=="Ll"} {
657      set class LC
658      incr nRepeat
659    } else {
660      set bNew 1
661    }
662    if {$bNew} {
663      lappend lEntries [list $first $class $nRepeat]
664      set nRepeat 1
665      set first $codepoint
666      set class $cl
667    }
668    set prev $codepoint
669  }
670  if {$nRepeat>0} {
671    lappend lEntries [list $first $class $nRepeat]
672  }
673
674  set aBlock [list 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
675  set aMap [list]
676  foreach e $lEntries {
677    foreach {cp class nRepeat} $e {}
678    set block [expr ($cp>>16)]
679    if {$block>0 && [lindex $aBlock $block]==0} {
680      for {set i 1} {$i<=$block} {incr i} {
681        if {[lindex $aBlock $i]==0} {
682          lset aBlock $i [llength $aMap]
683        }
684      }
685    }
686    lappend aMap [expr {$cp & 0xFFFF}]
687    lappend aData [expr {($nRepeat << 5) + $C($class)}]
688  }
689  for {set i 1} {$i<[llength $aBlock]} {incr i} {
690    if {[lindex $aBlock $i]==0} {
691      lset aBlock $i [llength $aMap]
692    }
693  }
694
695  set aBlockArray [intarray $aBlock]
696  set aMapArray [intarray $aMap]
697  set aDataArray [intarray $aData]
698  puts [code {
699    static u16 aFts5UnicodeBlock[] = {$aBlockArray};
700    static u16 aFts5UnicodeMap[] = {$aMapArray};
701    static u16 aFts5UnicodeData[] = {$aDataArray};
702
703    int sqlite3Fts5UnicodeCategory(u32 iCode) {
704      int iRes = -1;
705      int iHi;
706      int iLo;
707      int ret;
708      u16 iKey;
709
710      if( iCode>=(1<<20) ){
711        return 0;
712      }
713      iLo = aFts5UnicodeBlock[(iCode>>16)];
714      iHi = aFts5UnicodeBlock[1+(iCode>>16)];
715      iKey = (iCode & 0xFFFF);
716      while( iHi>iLo ){
717        int iTest = (iHi + iLo) / 2;
718        assert( iTest>=iLo && iTest<iHi );
719        if( iKey>=aFts5UnicodeMap[iTest] ){
720          iRes = iTest;
721          iLo = iTest+1;
722        }else{
723          iHi = iTest;
724        }
725      }
726
727      if( iRes<0 ) return 0;
728      if( iKey>=(aFts5UnicodeMap[iRes]+(aFts5UnicodeData[iRes]>>5)) ) return 0;
729      ret = aFts5UnicodeData[iRes] & 0x1F;
730      if( ret!=$C(LC) ) return ret;
731      return ((iKey - aFts5UnicodeMap[iRes]) & 0x01) ? $C(Ll) : $C(Lu);
732    }
733
734    void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
735      int i = 0;
736      int iTbl = 0;
737      while( i<128 ){
738        int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
739        int n = (aFts5UnicodeData[iTbl] >> 5) + i;
740        for(; i<128 && i<n; i++){
741          aAscii[i] = (u8)bToken;
742        }
743        iTbl++;
744      }
745      aAscii[0] = 0;                  /* 0x00 is never a token character */
746    }
747  }]
748}
749
750proc print_test_categories {lMap} {
751
752  set lCP [list]
753  foreach e $lMap {
754    foreach {cp cat} $e {}
755    if {[expr 0x$cp] < (1<<20)} {
756      lappend lCP "{0x$cp, \"$cat\"}, "
757    }
758  }
759
760  set aCP "\n"
761  for {set i 0} {$i < [llength $lCP]} {incr i 4} {
762    append aCP "    [join [lrange $lCP $i $i+3]]\n"
763  }
764
765
766  puts [code {
767    static int categories_test (int *piCode){
768      struct Codepoint {
769        int iCode;
770        const char *zCat;
771      } aCP[] = {$aCP};
772      int i;
773      int iCP = 0;
774
775      for(i=0; i<1000000; i++){
776        u8 aArray[40];
777        int cat = 0;
778        int c = 0;
779        memset(aArray, 0, sizeof(aArray));
780        if( aCP[iCP].iCode==i ){
781          sqlite3Fts5UnicodeCatParse(aCP[iCP].zCat, aArray);
782          iCP++;
783        }else{
784          aArray[0] = 1;
785        }
786
787        c = sqlite3Fts5UnicodeCategory((u32)i);
788        if( aArray[c]==0 ){
789          *piCode = i;
790          return 1;
791        }
792      }
793
794      return 0;
795    }
796  }]
797}
798
799proc print_fold_test {zFunc mappings} {
800  global tl_lookup_table
801
802  foreach m $mappings {
803    set c [lindex $m 1]
804    if {$c == ""} {
805      set extra([lindex $m 0]) 0
806    } else {
807      scan $c %c i
808      set extra([lindex $m 0]) $i
809    }
810  }
811
812  puts "static int fold_test(int *piCode)\{"
813  puts -nonewline "  static int aLookup\[\] = \{"
814  for {set i 0} {$i < 70000} {incr i} {
815
816    set expected $i
817    catch { set expected $tl_lookup_table($i) }
818    set expected2 $expected
819    catch { set expected2 $extra($expected2) }
820
821    if {($i % 4)==0}  { puts "" ; puts -nonewline "    " }
822    puts -nonewline "$expected, $expected2, "
823  }
824  puts "  \};"
825  puts "  int i;"
826  puts "  for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
827  puts "    int iCode = (i/2);"
828  puts "    int bFlag = i & 0x0001;"
829  puts "    if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
830  puts "      *piCode = iCode;"
831  puts "      return 1;"
832  puts "    \}"
833  puts "  \}"
834  puts "  return 0;"
835  puts "\}"
836}
837
838
839proc print_fileheader {} {
840  puts [string trim {
841/*
842** 2012-05-25
843**
844** The author disclaims copyright to this source code.  In place of
845** a legal notice, here is a blessing:
846**
847**    May you do good and not evil.
848**    May you find forgiveness for yourself and forgive others.
849**    May you share freely, never taking more than you give.
850**
851******************************************************************************
852*/
853
854/*
855** DO NOT EDIT THIS MACHINE GENERATED FILE.
856*/
857  }]
858  puts ""
859  if {$::generate_fts5_code} {
860    # no-op
861  } else {
862    puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"
863    puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
864  }
865  puts ""
866  puts "#include <assert.h>"
867  puts ""
868}
869
870proc print_test_main {} {
871  puts ""
872  puts "#include <stdio.h>"
873  puts ""
874  puts "int main(int argc, char **argv)\{"
875  puts "  int r1, r2, r3;"
876  puts "  int code;"
877  puts "  r3 = 0;"
878  puts "  r1 = isalnum_test(&code);"
879  puts "  if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
880  puts "  else printf(\"isalnum(): test passed\\n\");"
881  puts "  r2 = fold_test(&code);"
882  puts "  if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
883  puts "  else printf(\"fold(): test passed\\n\");"
884  if {$::generate_fts5_code} {
885    puts "  r3 = categories_test(&code);"
886    puts "  if( r3 ) printf(\"categories(): Problem with code %d\\n\",code);"
887    puts "  else printf(\"categories(): test passed\\n\");"
888  }
889  puts "  return (r1 || r2 || r3);"
890  puts "\}"
891}
892
893# Proces the command line arguments. Exit early if they are not to
894# our liking.
895#
896proc usage {} {
897  puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? "
898  puts            stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
899  exit 1
900}
901if {[llength $argv]<2} usage
902set unicodedata.txt [lindex $argv end]
903set casefolding.txt [lindex $argv end-1]
904
905set remove_diacritic remove_diacritic
906set generate_test_code 0
907set generate_fts5_code 0
908set function_prefix "sqlite3Fts"
909for {set i 0} {$i < [llength $argv]-2} {incr i} {
910  switch -- [lindex $argv $i] {
911    -test {
912      set generate_test_code 1
913    }
914    -fts5 {
915      set function_prefix sqlite3Fts5
916      set generate_fts5_code 1
917      set remove_diacritic fts5_remove_diacritic
918    }
919    default {
920      usage
921    }
922  }
923}
924
925print_fileheader
926
927if {$::generate_test_code} {
928  puts "typedef unsigned short int u16;"
929  puts "typedef unsigned char u8;"
930  puts "#include <string.h>"
931}
932
933# Print the isalnum() function to stdout.
934#
935set lRange [an_load_separator_ranges]
936if {$generate_fts5_code==0} {
937  print_isalnum ${function_prefix}UnicodeIsalnum $lRange
938}
939
940# Leave a gap between the two generated C functions.
941#
942puts ""
943puts ""
944
945# Load the fold data. This is used by the [rd_XXX] commands
946# as well as [print_fold].
947tl_load_casefolding_txt ${casefolding.txt}
948
949set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
950print_rd $mappings
951puts ""
952puts ""
953print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings
954puts ""
955puts ""
956
957# Print the fold() function to stdout.
958#
959print_fold ${function_prefix}UnicodeFold
960
961if {$generate_fts5_code} {
962  puts ""
963  puts ""
964  print_categories [cc_load_unicodedata_text ${unicodedata.txt}]
965}
966
967# Print the test routines and main() function to stdout, if -test
968# was specified.
969#
970if {$::generate_test_code} {
971  if {$generate_fts5_code==0} {
972    print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange
973  }
974  print_fold_test ${function_prefix}UnicodeFold $mappings
975  print_test_categories [cc_load_unicodedata_text ${unicodedata.txt}]
976  print_test_main
977}
978
979if {$generate_fts5_code} {
980  # no-op
981} else {
982  puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
983  puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
984}
985