xref: /sqlite-3.40.0/ext/misc/spellfix.c (revision b20a42e3)
1 /*
2 ** 2012 April 10
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used
14 ** to search a large vocabulary for close matches.  See separate
15 ** documentation (http://www.sqlite.org/spellfix1.html) for details.
16 */
17 #include "sqlite3ext.h"
18 SQLITE_EXTENSION_INIT1
19 
20 #ifndef SQLITE_AMALGAMATION
21 # include <string.h>
22 # include <stdio.h>
23 # include <stdlib.h>
24 # include <assert.h>
25 # define ALWAYS(X)  1
26 # define NEVER(X)   0
27   typedef unsigned char u8;
28   typedef unsigned short u16;
29 #endif
30 #include <ctype.h>
31 
32 #ifndef SQLITE_OMIT_VIRTUALTABLE
33 
34 /*
35 ** Character classes for ASCII characters:
36 **
37 **   0   ''        Silent letters:   H W
38 **   1   'A'       Any vowel:   A E I O U (Y)
39 **   2   'B'       A bilabeal stop or fricative:  B F P V W
40 **   3   'C'       Other fricatives or back stops:  C G J K Q S X Z
41 **   4   'D'       Alveolar stops:  D T
42 **   5   'H'       Letter H at the beginning of a word
43 **   6   'L'       Glide:  L
44 **   7   'R'       Semivowel:  R
45 **   8   'M'       Nasals:  M N
46 **   9   'Y'       Letter Y at the beginning of a word.
47 **   10  '9'       Digits: 0 1 2 3 4 5 6 7 8 9
48 **   11  ' '       White space
49 **   12  '?'       Other.
50 */
51 #define CCLASS_SILENT         0
52 #define CCLASS_VOWEL          1
53 #define CCLASS_B              2
54 #define CCLASS_C              3
55 #define CCLASS_D              4
56 #define CCLASS_H              5
57 #define CCLASS_L              6
58 #define CCLASS_R              7
59 #define CCLASS_M              8
60 #define CCLASS_Y              9
61 #define CCLASS_DIGIT         10
62 #define CCLASS_SPACE         11
63 #define CCLASS_OTHER         12
64 
65 /*
66 ** The following table gives the character class for non-initial ASCII
67 ** characters.
68 */
69 static const unsigned char midClass[] = {
70  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
71  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
72  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
73  /*   */ CCLASS_SPACE,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
74  /*   */ CCLASS_SPACE,    /*   */ CCLASS_SPACE,   /*   */ CCLASS_OTHER,
75  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
76  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
77  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
78  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
79  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
80  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_SPACE,
81  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,   /* # */ CCLASS_OTHER,
82  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,   /* & */ CCLASS_OTHER,
83  /* ' */ CCLASS_SILENT,   /* ( */ CCLASS_OTHER,   /* ) */ CCLASS_OTHER,
84  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,   /* , */ CCLASS_OTHER,
85  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,   /* / */ CCLASS_OTHER,
86  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,   /* 2 */ CCLASS_DIGIT,
87  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,   /* 5 */ CCLASS_DIGIT,
88  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,   /* 8 */ CCLASS_DIGIT,
89  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,   /* ; */ CCLASS_OTHER,
90  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,   /* > */ CCLASS_OTHER,
91  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,   /* A */ CCLASS_VOWEL,
92  /* B */ CCLASS_B,        /* C */ CCLASS_C,       /* D */ CCLASS_D,
93  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,       /* G */ CCLASS_C,
94  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,   /* J */ CCLASS_C,
95  /* K */ CCLASS_C,        /* L */ CCLASS_L,       /* M */ CCLASS_M,
96  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,   /* P */ CCLASS_B,
97  /* Q */ CCLASS_C,        /* R */ CCLASS_R,       /* S */ CCLASS_C,
98  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,   /* V */ CCLASS_B,
99  /* W */ CCLASS_B,        /* X */ CCLASS_C,       /* Y */ CCLASS_VOWEL,
100  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,   /* \ */ CCLASS_OTHER,
101  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,   /* _ */ CCLASS_OTHER,
102  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,   /* b */ CCLASS_B,
103  /* c */ CCLASS_C,        /* d */ CCLASS_D,       /* e */ CCLASS_VOWEL,
104  /* f */ CCLASS_B,        /* g */ CCLASS_C,       /* h */ CCLASS_SILENT,
105  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,       /* k */ CCLASS_C,
106  /* l */ CCLASS_L,        /* m */ CCLASS_M,       /* n */ CCLASS_M,
107  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,       /* q */ CCLASS_C,
108  /* r */ CCLASS_R,        /* s */ CCLASS_C,       /* t */ CCLASS_D,
109  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,       /* w */ CCLASS_B,
110  /* x */ CCLASS_C,        /* y */ CCLASS_VOWEL,   /* z */ CCLASS_C,
111  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,   /* } */ CCLASS_OTHER,
112  /* ~ */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,
113 };
114 /*
115 ** This tables gives the character class for ASCII characters that form the
116 ** initial character of a word.  The only difference from midClass is with
117 ** the letters H, W, and Y.
118 */
119 static const unsigned char initClass[] = {
120  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
121  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
122  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
123  /*   */ CCLASS_SPACE,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
124  /*   */ CCLASS_SPACE,    /*   */ CCLASS_SPACE,   /*   */ CCLASS_OTHER,
125  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
126  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
127  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
128  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
129  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
130  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_SPACE,
131  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,   /* # */ CCLASS_OTHER,
132  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,   /* & */ CCLASS_OTHER,
133  /* ' */ CCLASS_OTHER,    /* ( */ CCLASS_OTHER,   /* ) */ CCLASS_OTHER,
134  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,   /* , */ CCLASS_OTHER,
135  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,   /* / */ CCLASS_OTHER,
136  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,   /* 2 */ CCLASS_DIGIT,
137  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,   /* 5 */ CCLASS_DIGIT,
138  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,   /* 8 */ CCLASS_DIGIT,
139  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,   /* ; */ CCLASS_OTHER,
140  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,   /* > */ CCLASS_OTHER,
141  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,   /* A */ CCLASS_VOWEL,
142  /* B */ CCLASS_B,        /* C */ CCLASS_C,       /* D */ CCLASS_D,
143  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,       /* G */ CCLASS_C,
144  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,   /* J */ CCLASS_C,
145  /* K */ CCLASS_C,        /* L */ CCLASS_L,       /* M */ CCLASS_M,
146  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,   /* P */ CCLASS_B,
147  /* Q */ CCLASS_C,        /* R */ CCLASS_R,       /* S */ CCLASS_C,
148  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,   /* V */ CCLASS_B,
149  /* W */ CCLASS_B,        /* X */ CCLASS_C,       /* Y */ CCLASS_Y,
150  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,   /* \ */ CCLASS_OTHER,
151  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,   /* _ */ CCLASS_OTHER,
152  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,   /* b */ CCLASS_B,
153  /* c */ CCLASS_C,        /* d */ CCLASS_D,       /* e */ CCLASS_VOWEL,
154  /* f */ CCLASS_B,        /* g */ CCLASS_C,       /* h */ CCLASS_SILENT,
155  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,       /* k */ CCLASS_C,
156  /* l */ CCLASS_L,        /* m */ CCLASS_M,       /* n */ CCLASS_M,
157  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,       /* q */ CCLASS_C,
158  /* r */ CCLASS_R,        /* s */ CCLASS_C,       /* t */ CCLASS_D,
159  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,       /* w */ CCLASS_B,
160  /* x */ CCLASS_C,        /* y */ CCLASS_Y,       /* z */ CCLASS_C,
161  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,   /* } */ CCLASS_OTHER,
162  /* ~ */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,
163 };
164 
165 /*
166 ** Mapping from the character class number (0-13) to a symbol for each
167 ** character class.  Note that initClass[] can be used to map the class
168 ** symbol back into the class number.
169 */
170 static const unsigned char className[] = ".ABCDHLRMY9 ?";
171 
172 /*
173 ** Generate a "phonetic hash" from a string of ASCII characters
174 ** in zIn[0..nIn-1].
175 **
176 **   * Map characters by character class as defined above.
177 **   * Omit double-letters
178 **   * Omit vowels beside R and L
179 **   * Omit T when followed by CH
180 **   * Omit W when followed by R
181 **   * Omit D when followed by J or G
182 **   * Omit K in KN or G in GN at the beginning of a word
183 **
184 ** Space to hold the result is obtained from sqlite3_malloc()
185 **
186 ** Return NULL if memory allocation fails.
187 */
188 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){
189   unsigned char *zOut = sqlite3_malloc( nIn + 1 );
190   int i;
191   int nOut = 0;
192   char cPrev = 0x77;
193   char cPrevX = 0x77;
194   const unsigned char *aClass = initClass;
195 
196   if( zOut==0 ) return 0;
197   if( nIn>2 ){
198     switch( zIn[0] ){
199       case 'g':
200       case 'k': {
201         if( zIn[1]=='n' ){ zIn++; nIn--; }
202         break;
203       }
204     }
205   }
206   for(i=0; i<nIn; i++){
207     unsigned char c = zIn[i];
208     if( i+1<nIn ){
209       if( c=='w' && zIn[i+1]=='r' ) continue;
210       if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
211       if( i+2<nIn ){
212         if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
213       }
214     }
215     c = aClass[c&0x7f];
216     if( c==CCLASS_SPACE ) continue;
217     if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
218     aClass = midClass;
219     if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){
220        continue; /* No vowels beside L or R */
221     }
222     if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){
223        nOut--;   /* No vowels beside L or R */
224     }
225     cPrev = c;
226     if( c==CCLASS_SILENT ) continue;
227     cPrevX = c;
228     c = className[c];
229     assert( nOut>=0 );
230     if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c;
231   }
232   zOut[nOut] = 0;
233   return zOut;
234 }
235 
236 /*
237 ** This is an SQL function wrapper around phoneticHash().  See
238 ** the description of phoneticHash() for additional information.
239 */
240 static void phoneticHashSqlFunc(
241   sqlite3_context *context,
242   int argc,
243   sqlite3_value **argv
244 ){
245   const unsigned char *zIn;
246   unsigned char *zOut;
247 
248   zIn = sqlite3_value_text(argv[0]);
249   if( zIn==0 ) return;
250   zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0]));
251   if( zOut==0 ){
252     sqlite3_result_error_nomem(context);
253   }else{
254     sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
255   }
256 }
257 
258 /*
259 ** Return the character class number for a character given its
260 ** context.
261 */
262 static char characterClass(char cPrev, char c){
263   return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
264 }
265 
266 /*
267 ** Return the cost of inserting or deleting character c immediately
268 ** following character cPrev.  If cPrev==0, that means c is the first
269 ** character of the word.
270 */
271 static int insertOrDeleteCost(char cPrev, char c, char cNext){
272   char classC = characterClass(cPrev, c);
273   char classCprev;
274 
275   if( classC==CCLASS_SILENT ){
276     /* Insert or delete "silent" characters such as H or W */
277     return 1;
278   }
279   if( cPrev==c ){
280     /* Repeated characters, or miss a repeat */
281     return 10;
282   }
283   if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){
284     return 20;  /* Insert a vowel before or after 'r' */
285   }
286   classCprev = characterClass(cPrev, cPrev);
287   if( classC==classCprev ){
288     if( classC==CCLASS_VOWEL ){
289       /* Remove or add a new vowel to a vowel cluster */
290       return 15;
291     }else{
292       /* Remove or add a consonant not in the same class */
293       return 50;
294     }
295   }
296 
297   /* any other character insertion or deletion */
298   return 100;
299 }
300 
301 /*
302 ** Divide the insertion cost by this factor when appending to the
303 ** end of the word.
304 */
305 #define FINAL_INS_COST_DIV  4
306 
307 /*
308 ** Return the cost of substituting cTo in place of cFrom assuming
309 ** the previous character is cPrev.  If cPrev==0 then cTo is the first
310 ** character of the word.
311 */
312 static int substituteCost(char cPrev, char cFrom, char cTo){
313   char classFrom, classTo;
314   if( cFrom==cTo ){
315     /* Exact match */
316     return 0;
317   }
318   if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){
319     /* differ only in case */
320     return 0;
321   }
322   classFrom = characterClass(cPrev, cFrom);
323   classTo = characterClass(cPrev, cTo);
324   if( classFrom==classTo ){
325     /* Same character class */
326     return 40;
327   }
328   if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
329       && classTo>=CCLASS_B && classTo<=CCLASS_Y ){
330     /* Convert from one consonant to another, but in a different class */
331     return 75;
332   }
333   /* Any other subsitution */
334   return 100;
335 }
336 
337 /*
338 ** Given two strings zA and zB which are pure ASCII, return the cost
339 ** of transforming zA into zB.  If zA ends with '*' assume that it is
340 ** a prefix of zB and give only minimal penalty for extra characters
341 ** on the end of zB.
342 **
343 ** Smaller numbers mean a closer match.
344 **
345 ** Negative values indicate an error:
346 **    -1  One of the inputs is NULL
347 **    -2  Non-ASCII characters on input
348 **    -3  Unable to allocate memory
349 **
350 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
351 ** of zB that matched the pattern in zA. If zA does not end with a '*',
352 ** then this value is always the number of bytes in zB (i.e. strlen(zB)).
353 ** If zA does end in a '*', then it is the number of bytes in the prefix
354 ** of zB that was deemed to match zA.
355 */
356 static int editdist1(const char *zA, const char *zB, int *pnMatch){
357   int nA, nB;            /* Number of characters in zA[] and zB[] */
358   int xA, xB;            /* Loop counters for zA[] and zB[] */
359   char cA = 0, cB;       /* Current character of zA and zB */
360   char cAprev, cBprev;   /* Previous character of zA and zB */
361   char cAnext, cBnext;   /* Next character in zA and zB */
362   int d;                 /* North-west cost value */
363   int dc = 0;            /* North-west character value */
364   int res;               /* Final result */
365   int *m;                /* The cost matrix */
366   char *cx;              /* Corresponding character values */
367   int *toFree = 0;       /* Malloced space */
368   int mStack[60+15];     /* Stack space to use if not too much is needed */
369   int nMatch = 0;
370 
371   /* Early out if either input is NULL */
372   if( zA==0 || zB==0 ) return -1;
373 
374   /* Skip any common prefix */
375   while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
376   if( pnMatch ) *pnMatch = nMatch;
377   if( zA[0]==0 && zB[0]==0 ) return 0;
378 
379 #if 0
380   printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
381 #endif
382 
383   /* Verify input strings and measure their lengths */
384   for(nA=0; zA[nA]; nA++){
385     if( zA[nA]&0x80 ) return -2;
386   }
387   for(nB=0; zB[nB]; nB++){
388     if( zB[nB]&0x80 ) return -2;
389   }
390 
391   /* Special processing if either string is empty */
392   if( nA==0 ){
393     cBprev = dc;
394     for(xB=res=0; (cB = zB[xB])!=0; xB++){
395       res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
396       cBprev = cB;
397     }
398     return res;
399   }
400   if( nB==0 ){
401     cAprev = dc;
402     for(xA=res=0; (cA = zA[xA])!=0; xA++){
403       res += insertOrDeleteCost(cAprev, cA, zA[xA+1]);
404       cAprev = cA;
405     }
406     return res;
407   }
408 
409   /* A is a prefix of B */
410   if( zA[0]=='*' && zA[1]==0 ) return 0;
411 
412   /* Allocate and initialize the Wagner matrix */
413   if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
414     m = mStack;
415   }else{
416     m = toFree = sqlite3_malloc( (nB+1)*5*sizeof(m[0])/4 );
417     if( m==0 ) return -3;
418   }
419   cx = (char*)&m[nB+1];
420 
421   /* Compute the Wagner edit distance */
422   m[0] = 0;
423   cx[0] = dc;
424   cBprev = dc;
425   for(xB=1; xB<=nB; xB++){
426     cBnext = zB[xB];
427     cB = zB[xB-1];
428     cx[xB] = cB;
429     m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext);
430     cBprev = cB;
431   }
432   cAprev = dc;
433   for(xA=1; xA<=nA; xA++){
434     int lastA = (xA==nA);
435     cA = zA[xA-1];
436     cAnext = zA[xA];
437     if( cA=='*' && lastA ) break;
438     d = m[0];
439     dc = cx[0];
440     m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
441     cBprev = 0;
442     for(xB=1; xB<=nB; xB++){
443       int totalCost, insCost, delCost, subCost, ncx;
444       cB = zB[xB-1];
445       cBnext = zB[xB];
446 
447       /* Cost to insert cB */
448       insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext);
449       if( lastA ) insCost /= FINAL_INS_COST_DIV;
450 
451       /* Cost to delete cA */
452       delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
453 
454       /* Cost to substitute cA->cB */
455       subCost = substituteCost(cx[xB-1], cA, cB);
456 
457       /* Best cost */
458       totalCost = insCost + m[xB-1];
459       ncx = cB;
460       if( (delCost + m[xB])<totalCost ){
461         totalCost = delCost + m[xB];
462         ncx = cA;
463       }
464       if( (subCost + d)<totalCost ){
465         totalCost = subCost + d;
466       }
467 
468 #if 0
469       printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
470              " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
471              xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
472              insCost, delCost, subCost, totalCost, ncx?ncx:' ');
473 #endif
474 
475       /* Update the matrix */
476       d = m[xB];
477       dc = cx[xB];
478       m[xB] = totalCost;
479       cx[xB] = ncx;
480       cBprev = cB;
481     }
482     cAprev = cA;
483   }
484 
485   /* Free the wagner matrix and return the result */
486   if( cA=='*' ){
487     res = m[1];
488     for(xB=1; xB<=nB; xB++){
489       if( m[xB]<res ){
490         res = m[xB];
491         if( pnMatch ) *pnMatch = xB+nMatch;
492       }
493     }
494   }else{
495     res = m[nB];
496     /* In the current implementation, pnMatch is always NULL if zA does
497     ** not end in "*" */
498     assert( pnMatch==0 );
499   }
500   sqlite3_free(toFree);
501   return res;
502 }
503 
504 /*
505 ** Function:    editdist(A,B)
506 **
507 ** Return the cost of transforming string A into string B.  Both strings
508 ** must be pure ASCII text.  If A ends with '*' then it is assumed to be
509 ** a prefix of B and extra characters on the end of B have minimal additional
510 ** cost.
511 */
512 static void editdistSqlFunc(
513   sqlite3_context *context,
514   int argc,
515   sqlite3_value **argv
516 ){
517   int res = editdist1(
518                     (const char*)sqlite3_value_text(argv[0]),
519                     (const char*)sqlite3_value_text(argv[1]),
520                     0);
521   if( res<0 ){
522     if( res==(-3) ){
523       sqlite3_result_error_nomem(context);
524     }else if( res==(-2) ){
525       sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
526     }else{
527       sqlite3_result_error(context, "NULL input to editdist()", -1);
528     }
529   }else{
530     sqlite3_result_int(context, res);
531   }
532 }
533 
534 /* End of the fixed-cost edit distance implementation
535 ******************************************************************************
536 *****************************************************************************
537 ** Begin: Configurable cost unicode edit distance routines
538 */
539 /* Forward declaration of structures */
540 typedef struct EditDist3Cost EditDist3Cost;
541 typedef struct EditDist3Config EditDist3Config;
542 typedef struct EditDist3Point EditDist3Point;
543 typedef struct EditDist3From EditDist3From;
544 typedef struct EditDist3FromString EditDist3FromString;
545 typedef struct EditDist3To EditDist3To;
546 typedef struct EditDist3ToString EditDist3ToString;
547 typedef struct EditDist3Lang EditDist3Lang;
548 
549 
550 /*
551 ** An entry in the edit cost table
552 */
553 struct EditDist3Cost {
554   EditDist3Cost *pNext;     /* Next cost element */
555   u8 nFrom;                 /* Number of bytes in aFrom */
556   u8 nTo;                   /* Number of bytes in aTo */
557   u16 iCost;                /* Cost of this transformation */
558   char a[4]    ;            /* FROM string followed by TO string */
559   /* Additional TO and FROM string bytes appended as necessary */
560 };
561 
562 /*
563 ** Edit costs for a particular language ID
564 */
565 struct EditDist3Lang {
566   int iLang;             /* Language ID */
567   int iInsCost;          /* Default insertion cost */
568   int iDelCost;          /* Default deletion cost */
569   int iSubCost;          /* Default substitution cost */
570   EditDist3Cost *pCost;  /* Costs */
571 };
572 
573 
574 /*
575 ** The default EditDist3Lang object, with default costs.
576 */
577 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 };
578 
579 /*
580 ** Complete configuration
581 */
582 struct EditDist3Config {
583   int nLang;             /* Number of language IDs.  Size of a[] */
584   EditDist3Lang *a;      /* One for each distinct language ID */
585 };
586 
587 /*
588 ** Extra information about each character in the FROM string.
589 */
590 struct EditDist3From {
591   int nSubst;              /* Number of substitution cost entries */
592   int nDel;                /* Number of deletion cost entries */
593   int nByte;               /* Number of bytes in this character */
594   EditDist3Cost **apSubst; /* Array of substitution costs for this element */
595   EditDist3Cost **apDel;   /* Array of deletion cost entries */
596 };
597 
598 /*
599 ** A precompiled FROM string.
600 *
601 ** In the common case we expect the FROM string to be reused multiple times.
602 ** In other words, the common case will be to measure the edit distance
603 ** from a single origin string to multiple target strings.
604 */
605 struct EditDist3FromString {
606   char *z;                 /* The complete text of the FROM string */
607   int n;                   /* Number of characters in the FROM string */
608   int isPrefix;            /* True if ends with '*' character */
609   EditDist3From *a;        /* Extra info about each char of the FROM string */
610 };
611 
612 /*
613 ** Extra information about each character in the TO string.
614 */
615 struct EditDist3To {
616   int nIns;                /* Number of insertion cost entries */
617   int nByte;               /* Number of bytes in this character */
618   EditDist3Cost **apIns;   /* Array of deletion cost entries */
619 };
620 
621 /*
622 ** A precompiled FROM string
623 */
624 struct EditDist3ToString {
625   char *z;                 /* The complete text of the TO string */
626   int n;                   /* Number of characters in the TO string */
627   EditDist3To *a;          /* Extra info about each char of the TO string */
628 };
629 
630 /*
631 ** Clear or delete an instance of the object that records all edit-distance
632 ** weights.
633 */
634 static void editDist3ConfigClear(EditDist3Config *p){
635   int i;
636   if( p==0 ) return;
637   for(i=0; i<p->nLang; i++){
638     EditDist3Cost *pCost, *pNext;
639     pCost = p->a[i].pCost;
640     while( pCost ){
641       pNext = pCost->pNext;
642       sqlite3_free(pCost);
643       pCost = pNext;
644     }
645   }
646   sqlite3_free(p->a);
647   memset(p, 0, sizeof(*p));
648 }
649 static void editDist3ConfigDelete(void *pIn){
650   EditDist3Config *p = (EditDist3Config*)pIn;
651   editDist3ConfigClear(p);
652   sqlite3_free(p);
653 }
654 
655 /*
656 ** Load all edit-distance weights from a table.
657 */
658 static int editDist3ConfigLoad(
659   EditDist3Config *p,      /* The edit distance configuration to load */
660   sqlite3 *db,            /* Load from this database */
661   const char *zTable      /* Name of the table from which to load */
662 ){
663   sqlite3_stmt *pStmt;
664   int rc, rc2;
665   char *zSql;
666   int iLangPrev = -9999;
667   EditDist3Lang *pLang = 0;
668 
669   zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
670                          " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
671   if( zSql==0 ) return SQLITE_NOMEM;
672   rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
673   sqlite3_free(zSql);
674   if( rc ) return rc;
675   editDist3ConfigClear(p);
676   while( sqlite3_step(pStmt)==SQLITE_ROW ){
677     int iLang = sqlite3_column_int(pStmt, 0);
678     const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
679     int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
680     const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
681     int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
682     int iCost = sqlite3_column_int(pStmt, 3);
683 
684     assert( zFrom!=0 || nFrom==0 );
685     assert( zTo!=0 || nTo==0 );
686     if( nFrom>100 || nTo>100 ) continue;
687     if( iCost<0 ) continue;
688     if( pLang==0 || iLang!=iLangPrev ){
689       EditDist3Lang *pNew;
690       pNew = sqlite3_realloc(p->a, (p->nLang+1)*sizeof(p->a[0]));
691       if( pNew==0 ){ rc = SQLITE_NOMEM; break; }
692       p->a = pNew;
693       pLang = &p->a[p->nLang];
694       p->nLang++;
695       pLang->iLang = iLang;
696       pLang->iInsCost = 100;
697       pLang->iDelCost = 100;
698       pLang->iSubCost = 150;
699       pLang->pCost = 0;
700       iLangPrev = iLang;
701     }
702     if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){
703       pLang->iDelCost = iCost;
704     }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){
705       pLang->iInsCost = iCost;
706     }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){
707       pLang->iSubCost = iCost;
708     }else{
709       EditDist3Cost *pCost;
710       int nExtra = nFrom + nTo - 4;
711       if( nExtra<0 ) nExtra = 0;
712       pCost = sqlite3_malloc( sizeof(*pCost) + nExtra );
713       if( pCost==0 ){ rc = SQLITE_NOMEM; break; }
714       pCost->nFrom = nFrom;
715       pCost->nTo = nTo;
716       pCost->iCost = iCost;
717       memcpy(pCost->a, zFrom, nFrom);
718       memcpy(pCost->a + nFrom, zTo, nTo);
719       pCost->pNext = pLang->pCost;
720       pLang->pCost = pCost;
721     }
722   }
723   rc2 = sqlite3_finalize(pStmt);
724   if( rc==SQLITE_OK ) rc = rc2;
725   return rc;
726 }
727 
728 /*
729 ** Return the length (in bytes) of a utf-8 character.  Or return a maximum
730 ** of N.
731 */
732 static int utf8Len(unsigned char c, int N){
733   int len = 1;
734   if( c>0x7f ){
735     if( (c&0xe0)==0xc0 ){
736       len = 2;
737     }else if( (c&0xf0)==0xe0 ){
738       len = 3;
739     }else{
740       len = 4;
741     }
742   }
743   if( len>N ) len = N;
744   return len;
745 }
746 
747 /*
748 ** Return TRUE (non-zero) if the To side of the given cost matches
749 ** the given string.
750 */
751 static int matchTo(EditDist3Cost *p, const char *z, int n){
752   if( p->nTo>n ) return 0;
753   if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0;
754   return 1;
755 }
756 
757 /*
758 ** Return TRUE (non-zero) if the From side of the given cost matches
759 ** the given string.
760 */
761 static int matchFrom(EditDist3Cost *p, const char *z, int n){
762   assert( p->nFrom<=n );
763   if( strncmp(p->a, z, p->nFrom)!=0 ) return 0;
764   return 1;
765 }
766 
767 /*
768 ** Return TRUE (non-zero) of the next FROM character and the next TO
769 ** character are the same.
770 */
771 static int matchFromTo(
772   EditDist3FromString *pStr,  /* Left hand string */
773   int n1,                     /* Index of comparison character on the left */
774   const char *z2,             /* Right-handl comparison character */
775   int n2                      /* Bytes remaining in z2[] */
776 ){
777   int b1 = pStr->a[n1].nByte;
778   if( b1>n2 ) return 0;
779   if( memcmp(pStr->z+n1, z2, b1)!=0 ) return 0;
780   return 1;
781 }
782 
783 /*
784 ** Delete an EditDist3FromString objecct
785 */
786 static void editDist3FromStringDelete(EditDist3FromString *p){
787   int i;
788   if( p ){
789     for(i=0; i<p->n; i++){
790       sqlite3_free(p->a[i].apDel);
791       sqlite3_free(p->a[i].apSubst);
792     }
793     sqlite3_free(p);
794   }
795 }
796 
797 /*
798 ** Create a EditDist3FromString object.
799 */
800 static EditDist3FromString *editDist3FromStringNew(
801   const EditDist3Lang *pLang,
802   const char *z,
803   int n
804 ){
805   EditDist3FromString *pStr;
806   EditDist3Cost *p;
807   int i;
808 
809   if( z==0 ) return 0;
810   if( n<0 ) n = (int)strlen(z);
811   pStr = sqlite3_malloc( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
812   if( pStr==0 ) return 0;
813   pStr->a = (EditDist3From*)&pStr[1];
814   memset(pStr->a, 0, sizeof(pStr->a[0])*n);
815   pStr->n = n;
816   pStr->z = (char*)&pStr->a[n];
817   memcpy(pStr->z, z, n+1);
818   if( n && z[n-1]=='*' ){
819     pStr->isPrefix = 1;
820     n--;
821     pStr->n--;
822     pStr->z[n] = 0;
823   }else{
824     pStr->isPrefix = 0;
825   }
826 
827   for(i=0; i<n; i++){
828     EditDist3From *pFrom = &pStr->a[i];
829     memset(pFrom, 0, sizeof(*pFrom));
830     pFrom->nByte = utf8Len((unsigned char)z[i], n-i);
831     for(p=pLang->pCost; p; p=p->pNext){
832       EditDist3Cost **apNew;
833       if( i+p->nFrom>n ) continue;
834       if( matchFrom(p, z+i, n-i)==0 ) continue;
835       if( p->nTo==0 ){
836         apNew = sqlite3_realloc(pFrom->apDel,
837                                 sizeof(*apNew)*(pFrom->nDel+1));
838         if( apNew==0 ) break;
839         pFrom->apDel = apNew;
840         apNew[pFrom->nDel++] = p;
841       }else{
842         apNew = sqlite3_realloc(pFrom->apSubst,
843                                 sizeof(*apNew)*(pFrom->nSubst+1));
844         if( apNew==0 ) break;
845         pFrom->apSubst = apNew;
846         apNew[pFrom->nSubst++] = p;
847       }
848     }
849     if( p ){
850       editDist3FromStringDelete(pStr);
851       pStr = 0;
852       break;
853     }
854   }
855   return pStr;
856 }
857 
858 /*
859 ** Update entry m[i] such that it is the minimum of its current value
860 ** and m[j]+iCost.
861 **
862 ** If the iCost is 1,000,000 or greater, then consider the cost to be
863 ** infinite and skip the update.
864 */
865 static void updateCost(
866   unsigned int *m,
867   int i,
868   int j,
869   int iCost
870 ){
871   assert( iCost>=0 );
872   if( iCost<10000 ){
873     unsigned int b = m[j] + iCost;
874     if( b<m[i] ) m[i] = b;
875   }
876 }
877 
878 /* Compute the edit distance between two strings.
879 **
880 ** If an error occurs, return a negative number which is the error code.
881 **
882 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters
883 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
884 ** not contain the pattern for a prefix-search, then this is always the number
885 ** of characters in z2. If pFrom does contain a prefix search pattern, then
886 ** it is the number of characters in the prefix of z2 that was deemed to
887 ** match pFrom.
888 */
889 static int editDist3Core(
890   EditDist3FromString *pFrom,  /* The FROM string */
891   const char *z2,              /* The TO string */
892   int n2,                      /* Length of the TO string */
893   const EditDist3Lang *pLang,  /* Edit weights for a particular language ID */
894   int *pnMatch                 /* OUT: Characters in matched prefix */
895 ){
896   int k, n;
897   int i1, b1;
898   int i2, b2;
899   EditDist3FromString f = *pFrom;
900   EditDist3To *a2;
901   unsigned int *m;
902   int szRow;
903   EditDist3Cost *p;
904   int res;
905 
906   /* allocate the Wagner matrix and the aTo[] array for the TO string */
907   n = (f.n+1)*(n2+1);
908   n = (n+1)&~1;
909   m = sqlite3_malloc( n*sizeof(m[0]) + sizeof(a2[0])*n2 );
910   if( m==0 ) return -1;            /* Out of memory */
911   a2 = (EditDist3To*)&m[n];
912   memset(a2, 0, sizeof(a2[0])*n2);
913 
914   /* Fill in the a1[] matrix for all characters of the TO string */
915   for(i2=0; i2<n2; i2++){
916     a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2);
917     for(p=pLang->pCost; p; p=p->pNext){
918       EditDist3Cost **apNew;
919       if( p->nFrom>0 ) continue;
920       if( i2+p->nTo>n2 ) continue;
921       if( matchTo(p, z2+i2, n2-i2)==0 ) continue;
922       a2[i2].nIns++;
923       apNew = sqlite3_realloc(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns);
924       if( apNew==0 ){
925         res = -1;  /* Out of memory */
926         goto editDist3Abort;
927       }
928       a2[i2].apIns = apNew;
929       a2[i2].apIns[a2[i2].nIns-1] = p;
930     }
931   }
932 
933   /* Prepare to compute the minimum edit distance */
934   szRow = f.n+1;
935   memset(m, 0x01, (n2+1)*szRow*sizeof(m[0]));
936   m[0] = 0;
937 
938   /* First fill in the top-row of the matrix with FROM deletion costs */
939   for(i1=0; i1<f.n; i1 += b1){
940     b1 = f.a[i1].nByte;
941     updateCost(m, i1+b1, i1, pLang->iDelCost);
942     for(k=0; k<f.a[i1].nDel; k++){
943       p = f.a[i1].apDel[k];
944       updateCost(m, i1+p->nFrom, i1, p->iCost);
945     }
946   }
947 
948   /* Fill in all subsequent rows, top-to-bottom, left-to-right */
949   for(i2=0; i2<n2; i2 += b2){
950     int rx;      /* Starting index for current row */
951     int rxp;     /* Starting index for previous row */
952     b2 = a2[i2].nByte;
953     rx = szRow*(i2+b2);
954     rxp = szRow*i2;
955     updateCost(m, rx, rxp, pLang->iInsCost);
956     for(k=0; k<a2[i2].nIns; k++){
957       p = a2[i2].apIns[k];
958       updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost);
959     }
960     for(i1=0; i1<f.n; i1+=b1){
961       int cx;    /* Index of current cell */
962       int cxp;   /* Index of cell immediately to the left */
963       int cxd;   /* Index of cell to the left and one row above */
964       int cxu;   /* Index of cell immediately above */
965       b1 = f.a[i1].nByte;
966       cxp = rx + i1;
967       cx = cxp + b1;
968       cxd = rxp + i1;
969       cxu = cxd + b1;
970       updateCost(m, cx, cxp, pLang->iDelCost);
971       for(k=0; k<f.a[i1].nDel; k++){
972         p = f.a[i1].apDel[k];
973         updateCost(m, cxp+p->nFrom, cxp, p->iCost);
974       }
975       updateCost(m, cx, cxu, pLang->iInsCost);
976       if( matchFromTo(&f, i1, z2+i2, n2-i2) ){
977         updateCost(m, cx, cxd, 0);
978       }
979       updateCost(m, cx, cxd, pLang->iSubCost);
980       for(k=0; k<f.a[i1].nSubst; k++){
981         p = f.a[i1].apSubst[k];
982         if( matchTo(p, z2+i2, n2-i2) ){
983           updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
984         }
985       }
986     }
987   }
988 
989 #if 0  /* Enable for debugging */
990   printf("         ^");
991   for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
992   printf("\n   ^:");
993   for(i1=0; i1<szRow; i1++){
994     int v = m[i1];
995     if( v>9999 ) printf(" ****");
996     else         printf(" %4d", v);
997   }
998   printf("\n");
999   for(i2=0; i2<n2; i2++){
1000     printf("%c-%02x:", z2[i2], z2[i2]&0xff);
1001     for(i1=0; i1<szRow; i1++){
1002       int v = m[(i2+1)*szRow+i1];
1003       if( v>9999 ) printf(" ****");
1004       else         printf(" %4d", v);
1005     }
1006     printf("\n");
1007   }
1008 #endif
1009 
1010   /* Free memory allocations and return the result */
1011   res = (int)m[szRow*(n2+1)-1];
1012   n = n2;
1013   if( f.isPrefix ){
1014     for(i2=1; i2<=n2; i2++){
1015       int b = m[szRow*i2-1];
1016       if( b<=res ){
1017         res = b;
1018         n = i2 - 1;
1019       }
1020     }
1021   }
1022   if( pnMatch ){
1023     int nExtra = 0;
1024     for(k=0; k<n; k++){
1025       if( (z2[k] & 0xc0)==0x80 ) nExtra++;
1026     }
1027     *pnMatch = n - nExtra;
1028   }
1029 
1030 editDist3Abort:
1031   for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
1032   sqlite3_free(m);
1033   return res;
1034 }
1035 
1036 /*
1037 ** Get an appropriate EditDist3Lang object.
1038 */
1039 static const EditDist3Lang *editDist3FindLang(
1040   EditDist3Config *pConfig,
1041   int iLang
1042 ){
1043   int i;
1044   for(i=0; i<pConfig->nLang; i++){
1045     if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i];
1046   }
1047   return &editDist3Lang;
1048 }
1049 
1050 /*
1051 ** Function:    editdist3(A,B,iLang)
1052 **              editdist3(tablename)
1053 **
1054 ** Return the cost of transforming string A into string B using edit
1055 ** weights for iLang.
1056 **
1057 ** The second form loads edit weights into memory from a table.
1058 */
1059 static void editDist3SqlFunc(
1060   sqlite3_context *context,
1061   int argc,
1062   sqlite3_value **argv
1063 ){
1064   EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context);
1065   sqlite3 *db = sqlite3_context_db_handle(context);
1066   int rc;
1067   if( argc==1 ){
1068     const char *zTable = (const char*)sqlite3_value_text(argv[0]);
1069     rc = editDist3ConfigLoad(pConfig, db, zTable);
1070     if( rc ) sqlite3_result_error_code(context, rc);
1071   }else{
1072     const char *zA = (const char*)sqlite3_value_text(argv[0]);
1073     const char *zB = (const char*)sqlite3_value_text(argv[1]);
1074     int nA = sqlite3_value_bytes(argv[0]);
1075     int nB = sqlite3_value_bytes(argv[1]);
1076     int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0;
1077     const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang);
1078     EditDist3FromString *pFrom;
1079     int dist;
1080 
1081     pFrom = editDist3FromStringNew(pLang, zA, nA);
1082     if( pFrom==0 ){
1083       sqlite3_result_error_nomem(context);
1084       return;
1085     }
1086     dist = editDist3Core(pFrom, zB, nB, pLang, 0);
1087     editDist3FromStringDelete(pFrom);
1088     if( dist==(-1) ){
1089       sqlite3_result_error_nomem(context);
1090     }else{
1091       sqlite3_result_int(context, dist);
1092     }
1093   }
1094 }
1095 
1096 /*
1097 ** Register the editDist3 function with SQLite
1098 */
1099 static int editDist3Install(sqlite3 *db){
1100   int rc;
1101   EditDist3Config *pConfig = sqlite3_malloc( sizeof(*pConfig) );
1102   if( pConfig==0 ) return SQLITE_NOMEM;
1103   memset(pConfig, 0, sizeof(*pConfig));
1104   rc = sqlite3_create_function_v2(db, "editdist3",
1105               2, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1106   if( rc==SQLITE_OK ){
1107     rc = sqlite3_create_function_v2(db, "editdist3",
1108                 3, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1109   }
1110   if( rc==SQLITE_OK ){
1111     rc = sqlite3_create_function_v2(db, "editdist3",
1112                 1, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0,
1113                 editDist3ConfigDelete);
1114   }else{
1115     sqlite3_free(pConfig);
1116   }
1117   return rc;
1118 }
1119 /* End configurable cost unicode edit distance routines
1120 ******************************************************************************
1121 ******************************************************************************
1122 ** Begin transliterate unicode-to-ascii implementation
1123 */
1124 
1125 #if !SQLITE_AMALGAMATION
1126 /*
1127 ** This lookup table is used to help decode the first byte of
1128 ** a multi-byte UTF8 character.
1129 */
1130 static const unsigned char sqlite3Utf8Trans1[] = {
1131   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1132   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1133   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1134   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1135   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1136   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1137   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1138   0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
1139 };
1140 #endif
1141 
1142 /*
1143 ** Return the value of the first UTF-8 character in the string.
1144 */
1145 static int utf8Read(const unsigned char *z, int n, int *pSize){
1146   int c, i;
1147 
1148   /* All callers to this routine (in the current implementation)
1149   ** always have n>0. */
1150   if( NEVER(n==0) ){
1151     c = i = 0;
1152   }else{
1153     c = z[0];
1154     i = 1;
1155     if( c>=0xc0 ){
1156       c = sqlite3Utf8Trans1[c-0xc0];
1157       while( i<n && (z[i] & 0xc0)==0x80 ){
1158         c = (c<<6) + (0x3f & z[i++]);
1159       }
1160     }
1161   }
1162   *pSize = i;
1163   return c;
1164 }
1165 
1166 /*
1167 ** Return the number of characters in the utf-8 string in the nIn byte
1168 ** buffer pointed to by zIn.
1169 */
1170 static int utf8Charlen(const char *zIn, int nIn){
1171   int i;
1172   int nChar = 0;
1173   for(i=0; i<nIn; nChar++){
1174     int sz;
1175     utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1176     i += sz;
1177   }
1178   return nChar;
1179 }
1180 
1181 /*
1182 ** Table of translations from unicode characters into ASCII.
1183 */
1184 static const struct {
1185  unsigned short int cFrom;
1186  unsigned char cTo0, cTo1;
1187 } translit[] = {
1188   { 0x00A0,  0x20, 0x00 },  /*   to   */
1189   { 0x00B5,  0x75, 0x00 },  /* µ to u */
1190   { 0x00C0,  0x41, 0x00 },  /* À to A */
1191   { 0x00C1,  0x41, 0x00 },  /* Á to A */
1192   { 0x00C2,  0x41, 0x00 },  /* Â to A */
1193   { 0x00C3,  0x41, 0x00 },  /* Ã to A */
1194   { 0x00C4,  0x41, 0x65 },  /* Ä to Ae */
1195   { 0x00C5,  0x41, 0x61 },  /* Å to Aa */
1196   { 0x00C6,  0x41, 0x45 },  /* Æ to AE */
1197   { 0x00C7,  0x43, 0x00 },  /* Ç to C */
1198   { 0x00C8,  0x45, 0x00 },  /* È to E */
1199   { 0x00C9,  0x45, 0x00 },  /* É to E */
1200   { 0x00CA,  0x45, 0x00 },  /* Ê to E */
1201   { 0x00CB,  0x45, 0x00 },  /* Ë to E */
1202   { 0x00CC,  0x49, 0x00 },  /* Ì to I */
1203   { 0x00CD,  0x49, 0x00 },  /* Í to I */
1204   { 0x00CE,  0x49, 0x00 },  /* Î to I */
1205   { 0x00CF,  0x49, 0x00 },  /* Ï to I */
1206   { 0x00D0,  0x44, 0x00 },  /* Ð to D */
1207   { 0x00D1,  0x4E, 0x00 },  /* Ñ to N */
1208   { 0x00D2,  0x4F, 0x00 },  /* Ò to O */
1209   { 0x00D3,  0x4F, 0x00 },  /* Ó to O */
1210   { 0x00D4,  0x4F, 0x00 },  /* Ô to O */
1211   { 0x00D5,  0x4F, 0x00 },  /* Õ to O */
1212   { 0x00D6,  0x4F, 0x65 },  /* Ö to Oe */
1213   { 0x00D7,  0x78, 0x00 },  /* × to x */
1214   { 0x00D8,  0x4F, 0x00 },  /* Ø to O */
1215   { 0x00D9,  0x55, 0x00 },  /* Ù to U */
1216   { 0x00DA,  0x55, 0x00 },  /* Ú to U */
1217   { 0x00DB,  0x55, 0x00 },  /* Û to U */
1218   { 0x00DC,  0x55, 0x65 },  /* Ü to Ue */
1219   { 0x00DD,  0x59, 0x00 },  /* Ý to Y */
1220   { 0x00DE,  0x54, 0x68 },  /* Þ to Th */
1221   { 0x00DF,  0x73, 0x73 },  /* ß to ss */
1222   { 0x00E0,  0x61, 0x00 },  /* à to a */
1223   { 0x00E1,  0x61, 0x00 },  /* á to a */
1224   { 0x00E2,  0x61, 0x00 },  /* â to a */
1225   { 0x00E3,  0x61, 0x00 },  /* ã to a */
1226   { 0x00E4,  0x61, 0x65 },  /* ä to ae */
1227   { 0x00E5,  0x61, 0x61 },  /* å to aa */
1228   { 0x00E6,  0x61, 0x65 },  /* æ to ae */
1229   { 0x00E7,  0x63, 0x00 },  /* ç to c */
1230   { 0x00E8,  0x65, 0x00 },  /* è to e */
1231   { 0x00E9,  0x65, 0x00 },  /* é to e */
1232   { 0x00EA,  0x65, 0x00 },  /* ê to e */
1233   { 0x00EB,  0x65, 0x00 },  /* ë to e */
1234   { 0x00EC,  0x69, 0x00 },  /* ì to i */
1235   { 0x00ED,  0x69, 0x00 },  /* í to i */
1236   { 0x00EE,  0x69, 0x00 },  /* î to i */
1237   { 0x00EF,  0x69, 0x00 },  /* ï to i */
1238   { 0x00F0,  0x64, 0x00 },  /* ð to d */
1239   { 0x00F1,  0x6E, 0x00 },  /* ñ to n */
1240   { 0x00F2,  0x6F, 0x00 },  /* ò to o */
1241   { 0x00F3,  0x6F, 0x00 },  /* ó to o */
1242   { 0x00F4,  0x6F, 0x00 },  /* ô to o */
1243   { 0x00F5,  0x6F, 0x00 },  /* õ to o */
1244   { 0x00F6,  0x6F, 0x65 },  /* ö to oe */
1245   { 0x00F7,  0x3A, 0x00 },  /* ÷ to : */
1246   { 0x00F8,  0x6F, 0x00 },  /* ø to o */
1247   { 0x00F9,  0x75, 0x00 },  /* ù to u */
1248   { 0x00FA,  0x75, 0x00 },  /* ú to u */
1249   { 0x00FB,  0x75, 0x00 },  /* û to u */
1250   { 0x00FC,  0x75, 0x65 },  /* ü to ue */
1251   { 0x00FD,  0x79, 0x00 },  /* ý to y */
1252   { 0x00FE,  0x74, 0x68 },  /* þ to th */
1253   { 0x00FF,  0x79, 0x00 },  /* ÿ to y */
1254   { 0x0100,  0x41, 0x00 },  /* Ā to A */
1255   { 0x0101,  0x61, 0x00 },  /* ā to a */
1256   { 0x0102,  0x41, 0x00 },  /* Ă to A */
1257   { 0x0103,  0x61, 0x00 },  /* ă to a */
1258   { 0x0104,  0x41, 0x00 },  /* Ą to A */
1259   { 0x0105,  0x61, 0x00 },  /* ą to a */
1260   { 0x0106,  0x43, 0x00 },  /* Ć to C */
1261   { 0x0107,  0x63, 0x00 },  /* ć to c */
1262   { 0x0108,  0x43, 0x68 },  /* Ĉ to Ch */
1263   { 0x0109,  0x63, 0x68 },  /* ĉ to ch */
1264   { 0x010A,  0x43, 0x00 },  /* Ċ to C */
1265   { 0x010B,  0x63, 0x00 },  /* ċ to c */
1266   { 0x010C,  0x43, 0x00 },  /* Č to C */
1267   { 0x010D,  0x63, 0x00 },  /* č to c */
1268   { 0x010E,  0x44, 0x00 },  /* Ď to D */
1269   { 0x010F,  0x64, 0x00 },  /* ď to d */
1270   { 0x0110,  0x44, 0x00 },  /* Đ to D */
1271   { 0x0111,  0x64, 0x00 },  /* đ to d */
1272   { 0x0112,  0x45, 0x00 },  /* Ē to E */
1273   { 0x0113,  0x65, 0x00 },  /* ē to e */
1274   { 0x0114,  0x45, 0x00 },  /* Ĕ to E */
1275   { 0x0115,  0x65, 0x00 },  /* ĕ to e */
1276   { 0x0116,  0x45, 0x00 },  /* Ė to E */
1277   { 0x0117,  0x65, 0x00 },  /* ė to e */
1278   { 0x0118,  0x45, 0x00 },  /* Ę to E */
1279   { 0x0119,  0x65, 0x00 },  /* ę to e */
1280   { 0x011A,  0x45, 0x00 },  /* Ě to E */
1281   { 0x011B,  0x65, 0x00 },  /* ě to e */
1282   { 0x011C,  0x47, 0x68 },  /* Ĝ to Gh */
1283   { 0x011D,  0x67, 0x68 },  /* ĝ to gh */
1284   { 0x011E,  0x47, 0x00 },  /* Ğ to G */
1285   { 0x011F,  0x67, 0x00 },  /* ğ to g */
1286   { 0x0120,  0x47, 0x00 },  /* Ġ to G */
1287   { 0x0121,  0x67, 0x00 },  /* ġ to g */
1288   { 0x0122,  0x47, 0x00 },  /* Ģ to G */
1289   { 0x0123,  0x67, 0x00 },  /* ģ to g */
1290   { 0x0124,  0x48, 0x68 },  /* Ĥ to Hh */
1291   { 0x0125,  0x68, 0x68 },  /* ĥ to hh */
1292   { 0x0126,  0x48, 0x00 },  /* Ħ to H */
1293   { 0x0127,  0x68, 0x00 },  /* ħ to h */
1294   { 0x0128,  0x49, 0x00 },  /* Ĩ to I */
1295   { 0x0129,  0x69, 0x00 },  /* ĩ to i */
1296   { 0x012A,  0x49, 0x00 },  /* Ī to I */
1297   { 0x012B,  0x69, 0x00 },  /* ī to i */
1298   { 0x012C,  0x49, 0x00 },  /* Ĭ to I */
1299   { 0x012D,  0x69, 0x00 },  /* ĭ to i */
1300   { 0x012E,  0x49, 0x00 },  /* Į to I */
1301   { 0x012F,  0x69, 0x00 },  /* į to i */
1302   { 0x0130,  0x49, 0x00 },  /* İ to I */
1303   { 0x0131,  0x69, 0x00 },  /* ı to i */
1304   { 0x0132,  0x49, 0x4A },  /* IJ to IJ */
1305   { 0x0133,  0x69, 0x6A },  /* ij to ij */
1306   { 0x0134,  0x4A, 0x68 },  /* Ĵ to Jh */
1307   { 0x0135,  0x6A, 0x68 },  /* ĵ to jh */
1308   { 0x0136,  0x4B, 0x00 },  /* Ķ to K */
1309   { 0x0137,  0x6B, 0x00 },  /* ķ to k */
1310   { 0x0138,  0x6B, 0x00 },  /* ĸ to k */
1311   { 0x0139,  0x4C, 0x00 },  /* Ĺ to L */
1312   { 0x013A,  0x6C, 0x00 },  /* ĺ to l */
1313   { 0x013B,  0x4C, 0x00 },  /* Ļ to L */
1314   { 0x013C,  0x6C, 0x00 },  /* ļ to l */
1315   { 0x013D,  0x4C, 0x00 },  /* Ľ to L */
1316   { 0x013E,  0x6C, 0x00 },  /* ľ to l */
1317   { 0x013F,  0x4C, 0x2E },  /* Ŀ to L. */
1318   { 0x0140,  0x6C, 0x2E },  /* ŀ to l. */
1319   { 0x0141,  0x4C, 0x00 },  /* Ł to L */
1320   { 0x0142,  0x6C, 0x00 },  /* ł to l */
1321   { 0x0143,  0x4E, 0x00 },  /* Ń to N */
1322   { 0x0144,  0x6E, 0x00 },  /* ń to n */
1323   { 0x0145,  0x4E, 0x00 },  /* Ņ to N */
1324   { 0x0146,  0x6E, 0x00 },  /* ņ to n */
1325   { 0x0147,  0x4E, 0x00 },  /* Ň to N */
1326   { 0x0148,  0x6E, 0x00 },  /* ň to n */
1327   { 0x0149,  0x27, 0x6E },  /* ʼn to 'n */
1328   { 0x014A,  0x4E, 0x47 },  /* Ŋ to NG */
1329   { 0x014B,  0x6E, 0x67 },  /* ŋ to ng */
1330   { 0x014C,  0x4F, 0x00 },  /* Ō to O */
1331   { 0x014D,  0x6F, 0x00 },  /* ō to o */
1332   { 0x014E,  0x4F, 0x00 },  /* Ŏ to O */
1333   { 0x014F,  0x6F, 0x00 },  /* ŏ to o */
1334   { 0x0150,  0x4F, 0x00 },  /* Ő to O */
1335   { 0x0151,  0x6F, 0x00 },  /* ő to o */
1336   { 0x0152,  0x4F, 0x45 },  /* Œ to OE */
1337   { 0x0153,  0x6F, 0x65 },  /* œ to oe */
1338   { 0x0154,  0x52, 0x00 },  /* Ŕ to R */
1339   { 0x0155,  0x72, 0x00 },  /* ŕ to r */
1340   { 0x0156,  0x52, 0x00 },  /* Ŗ to R */
1341   { 0x0157,  0x72, 0x00 },  /* ŗ to r */
1342   { 0x0158,  0x52, 0x00 },  /* Ř to R */
1343   { 0x0159,  0x72, 0x00 },  /* ř to r */
1344   { 0x015A,  0x53, 0x00 },  /* Ś to S */
1345   { 0x015B,  0x73, 0x00 },  /* ś to s */
1346   { 0x015C,  0x53, 0x68 },  /* Ŝ to Sh */
1347   { 0x015D,  0x73, 0x68 },  /* ŝ to sh */
1348   { 0x015E,  0x53, 0x00 },  /* Ş to S */
1349   { 0x015F,  0x73, 0x00 },  /* ş to s */
1350   { 0x0160,  0x53, 0x00 },  /* Š to S */
1351   { 0x0161,  0x73, 0x00 },  /* š to s */
1352   { 0x0162,  0x54, 0x00 },  /* Ţ to T */
1353   { 0x0163,  0x74, 0x00 },  /* ţ to t */
1354   { 0x0164,  0x54, 0x00 },  /* Ť to T */
1355   { 0x0165,  0x74, 0x00 },  /* ť to t */
1356   { 0x0166,  0x54, 0x00 },  /* Ŧ to T */
1357   { 0x0167,  0x74, 0x00 },  /* ŧ to t */
1358   { 0x0168,  0x55, 0x00 },  /* Ũ to U */
1359   { 0x0169,  0x75, 0x00 },  /* ũ to u */
1360   { 0x016A,  0x55, 0x00 },  /* Ū to U */
1361   { 0x016B,  0x75, 0x00 },  /* ū to u */
1362   { 0x016C,  0x55, 0x00 },  /* Ŭ to U */
1363   { 0x016D,  0x75, 0x00 },  /* ŭ to u */
1364   { 0x016E,  0x55, 0x00 },  /* Ů to U */
1365   { 0x016F,  0x75, 0x00 },  /* ů to u */
1366   { 0x0170,  0x55, 0x00 },  /* Ű to U */
1367   { 0x0171,  0x75, 0x00 },  /* ű to u */
1368   { 0x0172,  0x55, 0x00 },  /* Ų to U */
1369   { 0x0173,  0x75, 0x00 },  /* ų to u */
1370   { 0x0174,  0x57, 0x00 },  /* Ŵ to W */
1371   { 0x0175,  0x77, 0x00 },  /* ŵ to w */
1372   { 0x0176,  0x59, 0x00 },  /* Ŷ to Y */
1373   { 0x0177,  0x79, 0x00 },  /* ŷ to y */
1374   { 0x0178,  0x59, 0x00 },  /* Ÿ to Y */
1375   { 0x0179,  0x5A, 0x00 },  /* Ź to Z */
1376   { 0x017A,  0x7A, 0x00 },  /* ź to z */
1377   { 0x017B,  0x5A, 0x00 },  /* Ż to Z */
1378   { 0x017C,  0x7A, 0x00 },  /* ż to z */
1379   { 0x017D,  0x5A, 0x00 },  /* Ž to Z */
1380   { 0x017E,  0x7A, 0x00 },  /* ž to z */
1381   { 0x017F,  0x73, 0x00 },  /* ſ to s */
1382   { 0x0192,  0x66, 0x00 },  /* ƒ to f */
1383   { 0x0218,  0x53, 0x00 },  /* Ș to S */
1384   { 0x0219,  0x73, 0x00 },  /* ș to s */
1385   { 0x021A,  0x54, 0x00 },  /* Ț to T */
1386   { 0x021B,  0x74, 0x00 },  /* ț to t */
1387   { 0x0386,  0x41, 0x00 },  /* Ά to A */
1388   { 0x0388,  0x45, 0x00 },  /* Έ to E */
1389   { 0x0389,  0x49, 0x00 },  /* Ή to I */
1390   { 0x038A,  0x49, 0x00 },  /* Ί to I */
1391   { 0x038C,  0x4f, 0x00 },  /* Ό to O */
1392   { 0x038E,  0x59, 0x00 },  /* Ύ to Y */
1393   { 0x038F,  0x4f, 0x00 },  /* Ώ to O */
1394   { 0x0390,  0x69, 0x00 },  /* ΐ to i */
1395   { 0x0391,  0x41, 0x00 },  /* Α to A */
1396   { 0x0392,  0x42, 0x00 },  /* Β to B */
1397   { 0x0393,  0x47, 0x00 },  /* Γ to G */
1398   { 0x0394,  0x44, 0x00 },  /* Δ to D */
1399   { 0x0395,  0x45, 0x00 },  /* Ε to E */
1400   { 0x0396,  0x5a, 0x00 },  /* Ζ to Z */
1401   { 0x0397,  0x49, 0x00 },  /* Η to I */
1402   { 0x0398,  0x54, 0x68 },  /* Θ to Th */
1403   { 0x0399,  0x49, 0x00 },  /* Ι to I */
1404   { 0x039A,  0x4b, 0x00 },  /* Κ to K */
1405   { 0x039B,  0x4c, 0x00 },  /* Λ to L */
1406   { 0x039C,  0x4d, 0x00 },  /* Μ to M */
1407   { 0x039D,  0x4e, 0x00 },  /* Ν to N */
1408   { 0x039E,  0x58, 0x00 },  /* Ξ to X */
1409   { 0x039F,  0x4f, 0x00 },  /* Ο to O */
1410   { 0x03A0,  0x50, 0x00 },  /* Π to P */
1411   { 0x03A1,  0x52, 0x00 },  /* Ρ to R */
1412   { 0x03A3,  0x53, 0x00 },  /* Σ to S */
1413   { 0x03A4,  0x54, 0x00 },  /* Τ to T */
1414   { 0x03A5,  0x59, 0x00 },  /* Υ to Y */
1415   { 0x03A6,  0x46, 0x00 },  /* Φ to F */
1416   { 0x03A7,  0x43, 0x68 },  /* Χ to Ch */
1417   { 0x03A8,  0x50, 0x73 },  /* Ψ to Ps */
1418   { 0x03A9,  0x4f, 0x00 },  /* Ω to O */
1419   { 0x03AA,  0x49, 0x00 },  /* Ϊ to I */
1420   { 0x03AB,  0x59, 0x00 },  /* Ϋ to Y */
1421   { 0x03AC,  0x61, 0x00 },  /* ά to a */
1422   { 0x03AD,  0x65, 0x00 },  /* έ to e */
1423   { 0x03AE,  0x69, 0x00 },  /* ή to i */
1424   { 0x03AF,  0x69, 0x00 },  /* ί to i */
1425   { 0x03B1,  0x61, 0x00 },  /* α to a */
1426   { 0x03B2,  0x62, 0x00 },  /* β to b */
1427   { 0x03B3,  0x67, 0x00 },  /* γ to g */
1428   { 0x03B4,  0x64, 0x00 },  /* δ to d */
1429   { 0x03B5,  0x65, 0x00 },  /* ε to e */
1430   { 0x03B6,  0x7a, 0x00 },  /* ζ to z */
1431   { 0x03B7,  0x69, 0x00 },  /* η to i */
1432   { 0x03B8,  0x74, 0x68 },  /* θ to th */
1433   { 0x03B9,  0x69, 0x00 },  /* ι to i */
1434   { 0x03BA,  0x6b, 0x00 },  /* κ to k */
1435   { 0x03BB,  0x6c, 0x00 },  /* λ to l */
1436   { 0x03BC,  0x6d, 0x00 },  /* μ to m */
1437   { 0x03BD,  0x6e, 0x00 },  /* ν to n */
1438   { 0x03BE,  0x78, 0x00 },  /* ξ to x */
1439   { 0x03BF,  0x6f, 0x00 },  /* ο to o */
1440   { 0x03C0,  0x70, 0x00 },  /* π to p */
1441   { 0x03C1,  0x72, 0x00 },  /* ρ to r */
1442   { 0x03C3,  0x73, 0x00 },  /* σ to s */
1443   { 0x03C4,  0x74, 0x00 },  /* τ to t */
1444   { 0x03C5,  0x79, 0x00 },  /* υ to y */
1445   { 0x03C6,  0x66, 0x00 },  /* φ to f */
1446   { 0x03C7,  0x63, 0x68 },  /* χ to ch */
1447   { 0x03C8,  0x70, 0x73 },  /* ψ to ps */
1448   { 0x03C9,  0x6f, 0x00 },  /* ω to o */
1449   { 0x03CA,  0x69, 0x00 },  /* ϊ to i */
1450   { 0x03CB,  0x79, 0x00 },  /* ϋ to y */
1451   { 0x03CC,  0x6f, 0x00 },  /* ό to o */
1452   { 0x03CD,  0x79, 0x00 },  /* ύ to y */
1453   { 0x03CE,  0x69, 0x00 },  /* ώ to i */
1454   { 0x0400,  0x45, 0x00 },  /* Ѐ to E */
1455   { 0x0401,  0x45, 0x00 },  /* Ё to E */
1456   { 0x0402,  0x44, 0x00 },  /* Ђ to D */
1457   { 0x0403,  0x47, 0x00 },  /* Ѓ to G */
1458   { 0x0404,  0x45, 0x00 },  /* Є to E */
1459   { 0x0405,  0x5a, 0x00 },  /* Ѕ to Z */
1460   { 0x0406,  0x49, 0x00 },  /* І to I */
1461   { 0x0407,  0x49, 0x00 },  /* Ї to I */
1462   { 0x0408,  0x4a, 0x00 },  /* Ј to J */
1463   { 0x0409,  0x49, 0x00 },  /* Љ to I */
1464   { 0x040A,  0x4e, 0x00 },  /* Њ to N */
1465   { 0x040B,  0x44, 0x00 },  /* Ћ to D */
1466   { 0x040C,  0x4b, 0x00 },  /* Ќ to K */
1467   { 0x040D,  0x49, 0x00 },  /* Ѝ to I */
1468   { 0x040E,  0x55, 0x00 },  /* Ў to U */
1469   { 0x040F,  0x44, 0x00 },  /* Џ to D */
1470   { 0x0410,  0x41, 0x00 },  /* А to A */
1471   { 0x0411,  0x42, 0x00 },  /* Б to B */
1472   { 0x0412,  0x56, 0x00 },  /* В to V */
1473   { 0x0413,  0x47, 0x00 },  /* Г to G */
1474   { 0x0414,  0x44, 0x00 },  /* Д to D */
1475   { 0x0415,  0x45, 0x00 },  /* Е to E */
1476   { 0x0416,  0x5a, 0x68 },  /* Ж to Zh */
1477   { 0x0417,  0x5a, 0x00 },  /* З to Z */
1478   { 0x0418,  0x49, 0x00 },  /* И to I */
1479   { 0x0419,  0x49, 0x00 },  /* Й to I */
1480   { 0x041A,  0x4b, 0x00 },  /* К to K */
1481   { 0x041B,  0x4c, 0x00 },  /* Л to L */
1482   { 0x041C,  0x4d, 0x00 },  /* М to M */
1483   { 0x041D,  0x4e, 0x00 },  /* Н to N */
1484   { 0x041E,  0x4f, 0x00 },  /* О to O */
1485   { 0x041F,  0x50, 0x00 },  /* П to P */
1486   { 0x0420,  0x52, 0x00 },  /* Р to R */
1487   { 0x0421,  0x53, 0x00 },  /* С to S */
1488   { 0x0422,  0x54, 0x00 },  /* Т to T */
1489   { 0x0423,  0x55, 0x00 },  /* У to U */
1490   { 0x0424,  0x46, 0x00 },  /* Ф to F */
1491   { 0x0425,  0x4b, 0x68 },  /* Х to Kh */
1492   { 0x0426,  0x54, 0x63 },  /* Ц to Tc */
1493   { 0x0427,  0x43, 0x68 },  /* Ч to Ch */
1494   { 0x0428,  0x53, 0x68 },  /* Ш to Sh */
1495   { 0x0429,  0x53, 0x68 },  /* Щ to Shch */
1496   { 0x042A,  0x61, 0x00 },  /*  to A */
1497   { 0x042B,  0x59, 0x00 },  /* Ы to Y */
1498   { 0x042C,  0x59, 0x00 },  /*  to Y */
1499   { 0x042D,  0x45, 0x00 },  /* Э to E */
1500   { 0x042E,  0x49, 0x75 },  /* Ю to Iu */
1501   { 0x042F,  0x49, 0x61 },  /* Я to Ia */
1502   { 0x0430,  0x61, 0x00 },  /* а to a */
1503   { 0x0431,  0x62, 0x00 },  /* б to b */
1504   { 0x0432,  0x76, 0x00 },  /* в to v */
1505   { 0x0433,  0x67, 0x00 },  /* г to g */
1506   { 0x0434,  0x64, 0x00 },  /* д to d */
1507   { 0x0435,  0x65, 0x00 },  /* е to e */
1508   { 0x0436,  0x7a, 0x68 },  /* ж to zh */
1509   { 0x0437,  0x7a, 0x00 },  /* з to z */
1510   { 0x0438,  0x69, 0x00 },  /* и to i */
1511   { 0x0439,  0x69, 0x00 },  /* й to i */
1512   { 0x043A,  0x6b, 0x00 },  /* к to k */
1513   { 0x043B,  0x6c, 0x00 },  /* л to l */
1514   { 0x043C,  0x6d, 0x00 },  /* м to m */
1515   { 0x043D,  0x6e, 0x00 },  /* н to n */
1516   { 0x043E,  0x6f, 0x00 },  /* о to o */
1517   { 0x043F,  0x70, 0x00 },  /* п to p */
1518   { 0x0440,  0x72, 0x00 },  /* р to r */
1519   { 0x0441,  0x73, 0x00 },  /* с to s */
1520   { 0x0442,  0x74, 0x00 },  /* т to t */
1521   { 0x0443,  0x75, 0x00 },  /* у to u */
1522   { 0x0444,  0x66, 0x00 },  /* ф to f */
1523   { 0x0445,  0x6b, 0x68 },  /* х to kh */
1524   { 0x0446,  0x74, 0x63 },  /* ц to tc */
1525   { 0x0447,  0x63, 0x68 },  /* ч to ch */
1526   { 0x0448,  0x73, 0x68 },  /* ш to sh */
1527   { 0x0449,  0x73, 0x68 },  /* щ to shch */
1528   { 0x044A,  0x61, 0x00 },  /*  to a */
1529   { 0x044B,  0x79, 0x00 },  /* ы to y */
1530   { 0x044C,  0x79, 0x00 },  /*  to y */
1531   { 0x044D,  0x65, 0x00 },  /* э to e */
1532   { 0x044E,  0x69, 0x75 },  /* ю to iu */
1533   { 0x044F,  0x69, 0x61 },  /* я to ia */
1534   { 0x0450,  0x65, 0x00 },  /* ѐ to e */
1535   { 0x0451,  0x65, 0x00 },  /* ё to e */
1536   { 0x0452,  0x64, 0x00 },  /* ђ to d */
1537   { 0x0453,  0x67, 0x00 },  /* ѓ to g */
1538   { 0x0454,  0x65, 0x00 },  /* є to e */
1539   { 0x0455,  0x7a, 0x00 },  /* ѕ to z */
1540   { 0x0456,  0x69, 0x00 },  /* і to i */
1541   { 0x0457,  0x69, 0x00 },  /* ї to i */
1542   { 0x0458,  0x6a, 0x00 },  /* ј to j */
1543   { 0x0459,  0x69, 0x00 },  /* љ to i */
1544   { 0x045A,  0x6e, 0x00 },  /* њ to n */
1545   { 0x045B,  0x64, 0x00 },  /* ћ to d */
1546   { 0x045C,  0x6b, 0x00 },  /* ќ to k */
1547   { 0x045D,  0x69, 0x00 },  /* ѝ to i */
1548   { 0x045E,  0x75, 0x00 },  /* ў to u */
1549   { 0x045F,  0x64, 0x00 },  /* џ to d */
1550   { 0x1E02,  0x42, 0x00 },  /* Ḃ to B */
1551   { 0x1E03,  0x62, 0x00 },  /* ḃ to b */
1552   { 0x1E0A,  0x44, 0x00 },  /* Ḋ to D */
1553   { 0x1E0B,  0x64, 0x00 },  /* ḋ to d */
1554   { 0x1E1E,  0x46, 0x00 },  /* Ḟ to F */
1555   { 0x1E1F,  0x66, 0x00 },  /* ḟ to f */
1556   { 0x1E40,  0x4D, 0x00 },  /* Ṁ to M */
1557   { 0x1E41,  0x6D, 0x00 },  /* ṁ to m */
1558   { 0x1E56,  0x50, 0x00 },  /* Ṗ to P */
1559   { 0x1E57,  0x70, 0x00 },  /* ṗ to p */
1560   { 0x1E60,  0x53, 0x00 },  /* Ṡ to S */
1561   { 0x1E61,  0x73, 0x00 },  /* ṡ to s */
1562   { 0x1E6A,  0x54, 0x00 },  /* Ṫ to T */
1563   { 0x1E6B,  0x74, 0x00 },  /* ṫ to t */
1564   { 0x1E80,  0x57, 0x00 },  /* Ẁ to W */
1565   { 0x1E81,  0x77, 0x00 },  /* ẁ to w */
1566   { 0x1E82,  0x57, 0x00 },  /* Ẃ to W */
1567   { 0x1E83,  0x77, 0x00 },  /* ẃ to w */
1568   { 0x1E84,  0x57, 0x00 },  /* Ẅ to W */
1569   { 0x1E85,  0x77, 0x00 },  /* ẅ to w */
1570   { 0x1EF2,  0x59, 0x00 },  /* Ỳ to Y */
1571   { 0x1EF3,  0x79, 0x00 },  /* ỳ to y */
1572   { 0xFB00,  0x66, 0x66 },  /* ff to ff */
1573   { 0xFB01,  0x66, 0x69 },  /* fi to fi */
1574   { 0xFB02,  0x66, 0x6C },  /* fl to fl */
1575   { 0xFB05,  0x73, 0x74 },  /* ſt to st */
1576   { 0xFB06,  0x73, 0x74 },  /* st to st */
1577 };
1578 
1579 /*
1580 ** Convert the input string from UTF-8 into pure ASCII by converting
1581 ** all non-ASCII characters to some combination of characters in the
1582 ** ASCII subset.
1583 **
1584 ** The returned string might contain more characters than the input.
1585 **
1586 ** Space to hold the returned string comes from sqlite3_malloc() and
1587 ** should be freed by the caller.
1588 */
1589 static unsigned char *transliterate(const unsigned char *zIn, int nIn){
1590   unsigned char *zOut = sqlite3_malloc( nIn*4 + 1 );
1591   int c, sz, nOut;
1592   if( zOut==0 ) return 0;
1593   nOut = 0;
1594   while( nIn>0 ){
1595     c = utf8Read(zIn, nIn, &sz);
1596     zIn += sz;
1597     nIn -= sz;
1598     if( c<=127 ){
1599       zOut[nOut++] = c;
1600     }else{
1601       int xTop, xBtm, x;
1602       xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1603       xBtm = 0;
1604       while( xTop>=xBtm ){
1605         x = (xTop + xBtm)/2;
1606         if( translit[x].cFrom==c ){
1607           zOut[nOut++] = translit[x].cTo0;
1608           if( translit[x].cTo1 ){
1609             zOut[nOut++] = translit[x].cTo1;
1610             /* Add an extra "ch" after the "sh" for Щ and щ */
1611             if( c==0x0429 || c== 0x0449 ){
1612               zOut[nOut++] = 'c';
1613               zOut[nOut++] = 'h';
1614             }
1615           }
1616           c = 0;
1617           break;
1618         }else if( translit[x].cFrom>c ){
1619           xTop = x-1;
1620         }else{
1621           xBtm = x+1;
1622         }
1623       }
1624       if( c ) zOut[nOut++] = '?';
1625     }
1626   }
1627   zOut[nOut] = 0;
1628   return zOut;
1629 }
1630 
1631 /*
1632 ** Return the number of characters in the shortest prefix of the input
1633 ** string that transliterates to an ASCII string nTrans bytes or longer.
1634 ** Or, if the transliteration of the input string is less than nTrans
1635 ** bytes in size, return the number of characters in the input string.
1636 */
1637 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
1638   int i, c, sz, nOut;
1639   int nChar;
1640 
1641   i = nOut = 0;
1642   for(nChar=0; i<nIn && nOut<nTrans; nChar++){
1643     c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1644     i += sz;
1645 
1646     nOut++;
1647     if( c>=128 ){
1648       int xTop, xBtm, x;
1649       xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1650       xBtm = 0;
1651       while( xTop>=xBtm ){
1652         x = (xTop + xBtm)/2;
1653         if( translit[x].cFrom==c ){
1654           if( translit[x].cTo1 ) nOut++;
1655           if( c==0x0429 || c== 0x0449 ) nOut += 2;
1656           break;
1657         }else if( translit[x].cFrom>c ){
1658           xTop = x-1;
1659         }else{
1660           xBtm = x+1;
1661         }
1662       }
1663     }
1664   }
1665 
1666   return nChar;
1667 }
1668 
1669 
1670 /*
1671 **    spellfix1_translit(X)
1672 **
1673 ** Convert a string that contains non-ASCII Roman characters into
1674 ** pure ASCII.
1675 */
1676 static void transliterateSqlFunc(
1677   sqlite3_context *context,
1678   int argc,
1679   sqlite3_value **argv
1680 ){
1681   const unsigned char *zIn = sqlite3_value_text(argv[0]);
1682   int nIn = sqlite3_value_bytes(argv[0]);
1683   unsigned char *zOut = transliterate(zIn, nIn);
1684   if( zOut==0 ){
1685     sqlite3_result_error_nomem(context);
1686   }else{
1687     sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1688   }
1689 }
1690 
1691 /*
1692 **    spellfix1_scriptcode(X)
1693 **
1694 ** Try to determine the dominant script used by the word X and return
1695 ** its ISO 15924 numeric code.
1696 **
1697 ** The current implementation only understands the following scripts:
1698 **
1699 **    215  (Latin)
1700 **    220  (Cyrillic)
1701 **    200  (Greek)
1702 **
1703 ** This routine will return 998 if the input X contains characters from
1704 ** two or more of the above scripts or 999 if X contains no characters
1705 ** from any of the above scripts.
1706 */
1707 static void scriptCodeSqlFunc(
1708   sqlite3_context *context,
1709   int argc,
1710   sqlite3_value **argv
1711 ){
1712   const unsigned char *zIn = sqlite3_value_text(argv[0]);
1713   int nIn = sqlite3_value_bytes(argv[0]);
1714   int c, sz;
1715   int scriptMask = 0;
1716   int res;
1717 # define SCRIPT_LATIN       0x0001
1718 # define SCRIPT_CYRILLIC    0x0002
1719 # define SCRIPT_GREEK       0x0004
1720 
1721   while( nIn>0 ){
1722     c = utf8Read(zIn, nIn, &sz);
1723     zIn += sz;
1724     nIn -= sz;
1725     if( c<0x02af ){
1726       scriptMask |= SCRIPT_LATIN;
1727     }else if( c>=0x0400 && c<=0x04ff ){
1728       scriptMask |= SCRIPT_CYRILLIC;
1729     }else if( c>=0x0386 && c<=0x03ce ){
1730       scriptMask |= SCRIPT_GREEK;
1731     }
1732   }
1733   switch( scriptMask ){
1734     case 0:                res = 999; break;
1735     case SCRIPT_LATIN:     res = 215; break;
1736     case SCRIPT_CYRILLIC:  res = 220; break;
1737     case SCRIPT_GREEK:     res = 200; break;
1738     default:               res = 998; break;
1739   }
1740   sqlite3_result_int(context, res);
1741 }
1742 
1743 /* End transliterate
1744 ******************************************************************************
1745 ******************************************************************************
1746 ** Begin spellfix1 virtual table.
1747 */
1748 
1749 /* Maximum length of a phonehash used for querying the shadow table */
1750 #define SPELLFIX_MX_HASH  8
1751 
1752 /* Maximum number of hash strings to examine per query */
1753 #define SPELLFIX_MX_RUN   1
1754 
1755 typedef struct spellfix1_vtab spellfix1_vtab;
1756 typedef struct spellfix1_cursor spellfix1_cursor;
1757 
1758 /* Fuzzy-search virtual table object */
1759 struct spellfix1_vtab {
1760   sqlite3_vtab base;         /* Base class - must be first */
1761   sqlite3 *db;               /* Database connection */
1762   char *zDbName;             /* Name of database holding this table */
1763   char *zTableName;          /* Name of the virtual table */
1764   char *zCostTable;          /* Table holding edit-distance cost numbers */
1765   EditDist3Config *pConfig3; /* Parsed edit distance costs */
1766 };
1767 
1768 /* Fuzzy-search cursor object */
1769 struct spellfix1_cursor {
1770   sqlite3_vtab_cursor base;    /* Base class - must be first */
1771   spellfix1_vtab *pVTab;       /* The table to which this cursor belongs */
1772   char *zPattern;              /* rhs of MATCH clause */
1773   int idxNum;                  /* idxNum value passed to xFilter() */
1774   int nRow;                    /* Number of rows of content */
1775   int nAlloc;                  /* Number of allocated rows */
1776   int iRow;                    /* Current row of content */
1777   int iLang;                   /* Value of the langid= constraint */
1778   int iTop;                    /* Value of the top= constraint */
1779   int iScope;                  /* Value of the scope= constraint */
1780   int nSearch;                 /* Number of vocabulary items checked */
1781   sqlite3_stmt *pFullScan;     /* Shadow query for a full table scan */
1782   struct spellfix1_row {       /* For each row of content */
1783     sqlite3_int64 iRowid;         /* Rowid for this row */
1784     char *zWord;                  /* Text for this row */
1785     int iRank;                    /* Rank for this row */
1786     int iDistance;                /* Distance from pattern for this row */
1787     int iScore;                   /* Score for sorting */
1788     int iMatchlen;                /* Value of matchlen column (or -1) */
1789     char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
1790   } *a;
1791 };
1792 
1793 /*
1794 ** Construct one or more SQL statements from the format string given
1795 ** and then evaluate those statements. The success code is written
1796 ** into *pRc.
1797 **
1798 ** If *pRc is initially non-zero then this routine is a no-op.
1799 */
1800 static void spellfix1DbExec(
1801   int *pRc,              /* Success code */
1802   sqlite3 *db,           /* Database in which to run SQL */
1803   const char *zFormat,   /* Format string for SQL */
1804   ...                    /* Arguments to the format string */
1805 ){
1806   va_list ap;
1807   char *zSql;
1808   if( *pRc ) return;
1809   va_start(ap, zFormat);
1810   zSql = sqlite3_vmprintf(zFormat, ap);
1811   va_end(ap);
1812   if( zSql==0 ){
1813     *pRc = SQLITE_NOMEM;
1814   }else{
1815     *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
1816     sqlite3_free(zSql);
1817   }
1818 }
1819 
1820 /*
1821 ** xDisconnect/xDestroy method for the fuzzy-search module.
1822 */
1823 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){
1824   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1825   int rc = SQLITE_OK;
1826   if( isDestroy ){
1827     sqlite3 *db = p->db;
1828     spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1829                   p->zDbName, p->zTableName);
1830   }
1831   if( rc==SQLITE_OK ){
1832     sqlite3_free(p->zTableName);
1833     editDist3ConfigDelete(p->pConfig3);
1834     sqlite3_free(p->zCostTable);
1835     sqlite3_free(p);
1836   }
1837   return rc;
1838 }
1839 static int spellfix1Disconnect(sqlite3_vtab *pVTab){
1840   return spellfix1Uninit(0, pVTab);
1841 }
1842 static int spellfix1Destroy(sqlite3_vtab *pVTab){
1843   return spellfix1Uninit(1, pVTab);
1844 }
1845 
1846 /*
1847 ** Make a copy of a string.  Remove leading and trailing whitespace
1848 ** and dequote it.
1849 */
1850 static char *spellfix1Dequote(const char *zIn){
1851   char *zOut;
1852   int i, j;
1853   char c;
1854   while( isspace(zIn[0]) ) zIn++;
1855   zOut = sqlite3_mprintf("%s", zIn);
1856   if( zOut==0 ) return 0;
1857   i = (int)strlen(zOut);
1858 #if 0  /* The parser will never leave spaces at the end */
1859   while( i>0 && isspace(zOut[i-1]) ){ i--; }
1860 #endif
1861   zOut[i] = 0;
1862   c = zOut[0];
1863   if( c=='\'' || c=='"' ){
1864     for(i=1, j=0; ALWAYS(zOut[i]); i++){
1865       zOut[j++] = zOut[i];
1866       if( zOut[i]==c ){
1867         if( zOut[i+1]==c ){
1868           i++;
1869         }else{
1870           zOut[j-1] = 0;
1871           break;
1872         }
1873       }
1874     }
1875   }
1876   return zOut;
1877 }
1878 
1879 
1880 /*
1881 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
1882 **
1883 **   argv[0]   -> module name  ("spellfix1")
1884 **   argv[1]   -> database name
1885 **   argv[2]   -> table name
1886 **   argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
1887 */
1888 static int spellfix1Init(
1889   int isCreate,
1890   sqlite3 *db,
1891   void *pAux,
1892   int argc, const char *const*argv,
1893   sqlite3_vtab **ppVTab,
1894   char **pzErr
1895 ){
1896   spellfix1_vtab *pNew = 0;
1897   /* const char *zModule = argv[0]; // not used */
1898   const char *zDbName = argv[1];
1899   const char *zTableName = argv[2];
1900   int nDbName;
1901   int rc = SQLITE_OK;
1902   int i;
1903 
1904   nDbName = (int)strlen(zDbName);
1905   pNew = sqlite3_malloc( sizeof(*pNew) + nDbName + 1);
1906   if( pNew==0 ){
1907     rc = SQLITE_NOMEM;
1908   }else{
1909     memset(pNew, 0, sizeof(*pNew));
1910     pNew->zDbName = (char*)&pNew[1];
1911     memcpy(pNew->zDbName, zDbName, nDbName+1);
1912     pNew->zTableName = sqlite3_mprintf("%s", zTableName);
1913     pNew->db = db;
1914     if( pNew->zTableName==0 ){
1915       rc = SQLITE_NOMEM;
1916     }else{
1917       rc = sqlite3_declare_vtab(db,
1918            "CREATE TABLE x(word,rank,distance,langid, "
1919            "score, matchlen, phonehash HIDDEN, "
1920            "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
1921            "soundslike HIDDEN, command HIDDEN)"
1922       );
1923 #define SPELLFIX_COL_WORD            0
1924 #define SPELLFIX_COL_RANK            1
1925 #define SPELLFIX_COL_DISTANCE        2
1926 #define SPELLFIX_COL_LANGID          3
1927 #define SPELLFIX_COL_SCORE           4
1928 #define SPELLFIX_COL_MATCHLEN        5
1929 #define SPELLFIX_COL_PHONEHASH       6
1930 #define SPELLFIX_COL_TOP             7
1931 #define SPELLFIX_COL_SCOPE           8
1932 #define SPELLFIX_COL_SRCHCNT         9
1933 #define SPELLFIX_COL_SOUNDSLIKE     10
1934 #define SPELLFIX_COL_COMMAND        11
1935     }
1936     if( rc==SQLITE_OK && isCreate ){
1937       spellfix1DbExec(&rc, db,
1938          "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
1939          "  id INTEGER PRIMARY KEY,\n"
1940          "  rank INT,\n"
1941          "  langid INT,\n"
1942          "  word TEXT,\n"
1943          "  k1 TEXT,\n"
1944          "  k2 TEXT\n"
1945          ");\n",
1946          zDbName, zTableName
1947       );
1948       spellfix1DbExec(&rc, db,
1949          "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" "
1950             "ON \"%w_vocab\"(langid,k2);",
1951          zDbName, zTableName, zTableName
1952       );
1953     }
1954     for(i=3; rc==SQLITE_OK && i<argc; i++){
1955       if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
1956         pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
1957         if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
1958         continue;
1959       }
1960       *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
1961       rc = SQLITE_ERROR;
1962     }
1963   }
1964 
1965   if( rc && pNew ){
1966     *ppVTab = 0;
1967     spellfix1Uninit(0, &pNew->base);
1968   }else{
1969     *ppVTab = (sqlite3_vtab *)pNew;
1970   }
1971   return rc;
1972 }
1973 
1974 /*
1975 ** The xConnect and xCreate methods
1976 */
1977 static int spellfix1Connect(
1978   sqlite3 *db,
1979   void *pAux,
1980   int argc, const char *const*argv,
1981   sqlite3_vtab **ppVTab,
1982   char **pzErr
1983 ){
1984   return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
1985 }
1986 static int spellfix1Create(
1987   sqlite3 *db,
1988   void *pAux,
1989   int argc, const char *const*argv,
1990   sqlite3_vtab **ppVTab,
1991   char **pzErr
1992 ){
1993   return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
1994 }
1995 
1996 /*
1997 ** Clear all of the content from a cursor.
1998 */
1999 static void spellfix1ResetCursor(spellfix1_cursor *pCur){
2000   int i;
2001   for(i=0; i<pCur->nRow; i++){
2002     sqlite3_free(pCur->a[i].zWord);
2003   }
2004   pCur->nRow = 0;
2005   pCur->iRow = 0;
2006   pCur->nSearch = 0;
2007   if( pCur->pFullScan ){
2008     sqlite3_finalize(pCur->pFullScan);
2009     pCur->pFullScan = 0;
2010   }
2011 }
2012 
2013 /*
2014 ** Resize the cursor to hold up to N rows of content
2015 */
2016 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){
2017   struct spellfix1_row *aNew;
2018   assert( N>=pCur->nRow );
2019   aNew = sqlite3_realloc(pCur->a, sizeof(pCur->a[0])*N);
2020   if( aNew==0 && N>0 ){
2021     spellfix1ResetCursor(pCur);
2022     sqlite3_free(pCur->a);
2023     pCur->nAlloc = 0;
2024     pCur->a = 0;
2025   }else{
2026     pCur->nAlloc = N;
2027     pCur->a = aNew;
2028   }
2029 }
2030 
2031 
2032 /*
2033 ** Close a fuzzy-search cursor.
2034 */
2035 static int spellfix1Close(sqlite3_vtab_cursor *cur){
2036   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2037   spellfix1ResetCursor(pCur);
2038   spellfix1ResizeCursor(pCur, 0);
2039   sqlite3_free(pCur->zPattern);
2040   sqlite3_free(pCur);
2041   return SQLITE_OK;
2042 }
2043 
2044 #define SPELLFIX_IDXNUM_MATCH  0x01         /* word MATCH $str */
2045 #define SPELLFIX_IDXNUM_LANGID 0x02         /* langid == $langid */
2046 #define SPELLFIX_IDXNUM_TOP    0x04         /* top = $top */
2047 #define SPELLFIX_IDXNUM_SCOPE  0x08         /* scope = $scope */
2048 #define SPELLFIX_IDXNUM_DISTLT 0x10         /* distance < $distance */
2049 #define SPELLFIX_IDXNUM_DISTLE 0x20         /* distance <= $distance */
2050 #define SPELLFIX_IDXNUM_ROWID  0x40         /* rowid = $rowid */
2051 #define SPELLFIX_IDXNUM_DIST   (0x10|0x20)  /* DISTLT and DISTLE */
2052 
2053 /*
2054 **
2055 ** The plan number is a bitmask of the SPELLFIX_IDXNUM_* values defined
2056 ** above.
2057 **
2058 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2059 ** if specified and in that order.
2060 */
2061 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
2062   int iPlan = 0;
2063   int iLangTerm = -1;
2064   int iTopTerm = -1;
2065   int iScopeTerm = -1;
2066   int iDistTerm = -1;
2067   int iRowidTerm = -1;
2068   int i;
2069   const struct sqlite3_index_constraint *pConstraint;
2070   pConstraint = pIdxInfo->aConstraint;
2071   for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){
2072     if( pConstraint->usable==0 ) continue;
2073 
2074     /* Terms of the form:  word MATCH $str */
2075     if( (iPlan & SPELLFIX_IDXNUM_MATCH)==0
2076      && pConstraint->iColumn==SPELLFIX_COL_WORD
2077      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
2078     ){
2079       iPlan |= SPELLFIX_IDXNUM_MATCH;
2080       pIdxInfo->aConstraintUsage[i].argvIndex = 1;
2081       pIdxInfo->aConstraintUsage[i].omit = 1;
2082     }
2083 
2084     /* Terms of the form:  langid = $langid  */
2085     if( (iPlan & SPELLFIX_IDXNUM_LANGID)==0
2086      && pConstraint->iColumn==SPELLFIX_COL_LANGID
2087      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2088     ){
2089       iPlan |= SPELLFIX_IDXNUM_LANGID;
2090       iLangTerm = i;
2091     }
2092 
2093     /* Terms of the form:  top = $top */
2094     if( (iPlan & SPELLFIX_IDXNUM_TOP)==0
2095      && pConstraint->iColumn==SPELLFIX_COL_TOP
2096      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2097     ){
2098       iPlan |= SPELLFIX_IDXNUM_TOP;
2099       iTopTerm = i;
2100     }
2101 
2102     /* Terms of the form:  scope = $scope */
2103     if( (iPlan & SPELLFIX_IDXNUM_SCOPE)==0
2104      && pConstraint->iColumn==SPELLFIX_COL_SCOPE
2105      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2106     ){
2107       iPlan |= SPELLFIX_IDXNUM_SCOPE;
2108       iScopeTerm = i;
2109     }
2110 
2111     /* Terms of the form:  distance < $dist or distance <= $dist */
2112     if( (iPlan & SPELLFIX_IDXNUM_DIST)==0
2113      && pConstraint->iColumn==SPELLFIX_COL_DISTANCE
2114      && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT
2115           || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE)
2116     ){
2117       if( pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ){
2118         iPlan |= SPELLFIX_IDXNUM_DISTLT;
2119       }else{
2120         iPlan |= SPELLFIX_IDXNUM_DISTLE;
2121       }
2122       iDistTerm = i;
2123     }
2124 
2125     /* Terms of the form:  distance < $dist or distance <= $dist */
2126     if( (iPlan & SPELLFIX_IDXNUM_ROWID)==0
2127      && pConstraint->iColumn<0
2128      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2129     ){
2130       iPlan |= SPELLFIX_IDXNUM_ROWID;
2131       iRowidTerm = i;
2132     }
2133   }
2134   if( iPlan&SPELLFIX_IDXNUM_MATCH ){
2135     int idx = 2;
2136     pIdxInfo->idxNum = iPlan;
2137     if( pIdxInfo->nOrderBy==1
2138      && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
2139      && pIdxInfo->aOrderBy[0].desc==0
2140     ){
2141       pIdxInfo->orderByConsumed = 1;  /* Default order by iScore */
2142     }
2143     if( iPlan&SPELLFIX_IDXNUM_LANGID ){
2144       pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
2145       pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
2146     }
2147     if( iPlan&SPELLFIX_IDXNUM_TOP ){
2148       pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
2149       pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
2150     }
2151     if( iPlan&SPELLFIX_IDXNUM_SCOPE ){
2152       pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
2153       pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
2154     }
2155     if( iPlan&SPELLFIX_IDXNUM_DIST ){
2156       pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++;
2157       pIdxInfo->aConstraintUsage[iDistTerm].omit = 1;
2158     }
2159     pIdxInfo->estimatedCost = 1e5;
2160   }else if( (iPlan & SPELLFIX_IDXNUM_ROWID) ){
2161     pIdxInfo->idxNum = SPELLFIX_IDXNUM_ROWID;
2162     pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
2163     pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
2164     pIdxInfo->estimatedCost = 5;
2165   }else{
2166     pIdxInfo->idxNum = 0;
2167     pIdxInfo->estimatedCost = 1e50;
2168   }
2169   return SQLITE_OK;
2170 }
2171 
2172 /*
2173 ** Open a new fuzzy-search cursor.
2174 */
2175 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2176   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2177   spellfix1_cursor *pCur;
2178   pCur = sqlite3_malloc( sizeof(*pCur) );
2179   if( pCur==0 ) return SQLITE_NOMEM;
2180   memset(pCur, 0, sizeof(*pCur));
2181   pCur->pVTab = p;
2182   *ppCursor = &pCur->base;
2183   return SQLITE_OK;
2184 }
2185 
2186 /*
2187 ** Adjust a distance measurement by the words rank in order to show
2188 ** preference to common words.
2189 */
2190 static int spellfix1Score(int iDistance, int iRank){
2191   int iLog2;
2192   for(iLog2=0; iRank>0; iLog2++, iRank>>=1){}
2193   return iDistance + 32 - iLog2;
2194 }
2195 
2196 /*
2197 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
2198 ** that they sort in order of increasing distance.
2199 */
2200 static int spellfix1RowCompare(const void *A, const void *B){
2201   const struct spellfix1_row *a = (const struct spellfix1_row*)A;
2202   const struct spellfix1_row *b = (const struct spellfix1_row*)B;
2203   return a->iScore - b->iScore;
2204 }
2205 
2206 /*
2207 ** A structure used to pass information from spellfix1FilterForMatch()
2208 ** into spellfix1RunQuery().
2209 */
2210 typedef struct MatchQuery {
2211   spellfix1_cursor *pCur;          /* The cursor being queried */
2212   sqlite3_stmt *pStmt;             /* shadow table query statment */
2213   char zHash[SPELLFIX_MX_HASH];    /* The current phonehash for zPattern */
2214   const char *zPattern;            /* Transliterated input string */
2215   int nPattern;                    /* Length of zPattern */
2216   EditDist3FromString *pMatchStr3; /* Original unicode string */
2217   EditDist3Config *pConfig3;       /* Edit-distance cost coefficients */
2218   const EditDist3Lang *pLang;      /* The selected language coefficients */
2219   int iLang;                       /* The language id */
2220   int iScope;                      /* Default scope */
2221   int iMaxDist;                    /* Maximum allowed edit distance, or -1 */
2222   int rc;                          /* Error code */
2223   int nRun;                  /* Number of prior runs for the same zPattern */
2224   char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH];  /* Prior hashes */
2225 } MatchQuery;
2226 
2227 /*
2228 ** Run a query looking for the best matches against zPattern using
2229 ** zHash as the character class seed hash.
2230 */
2231 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
2232   const char *zK1;
2233   const char *zWord;
2234   int iDist;
2235   int iRank;
2236   int iScore;
2237   int iWorst = 0;
2238   int idx;
2239   int idxWorst = -1;
2240   int i;
2241   int iScope = p->iScope;
2242   spellfix1_cursor *pCur = p->pCur;
2243   sqlite3_stmt *pStmt = p->pStmt;
2244   char zHash1[SPELLFIX_MX_HASH];
2245   char zHash2[SPELLFIX_MX_HASH];
2246   char *zClass;
2247   int nClass;
2248   int rc;
2249 
2250   if( pCur->a==0 || p->rc ) return;   /* Prior memory allocation failure */
2251   zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
2252   if( zClass==0 ){
2253     p->rc = SQLITE_NOMEM;
2254     return;
2255   }
2256   nClass = (int)strlen(zClass);
2257   if( nClass>SPELLFIX_MX_HASH-2 ){
2258     nClass = SPELLFIX_MX_HASH-2;
2259     zClass[nClass] = 0;
2260   }
2261   if( nClass<=iScope ){
2262     if( nClass>2 ){
2263       iScope = nClass-1;
2264     }else{
2265       iScope = nClass;
2266     }
2267   }
2268   memcpy(zHash1, zClass, iScope);
2269   sqlite3_free(zClass);
2270   zHash1[iScope] = 0;
2271   memcpy(zHash2, zHash1, iScope);
2272   zHash2[iScope] = 'Z';
2273   zHash2[iScope+1] = 0;
2274 #if SPELLFIX_MX_RUN>1
2275   for(i=0; i<p->nRun; i++){
2276     if( strcmp(p->azPrior[i], zHash1)==0 ) return;
2277   }
2278 #endif
2279   assert( p->nRun<SPELLFIX_MX_RUN );
2280   memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
2281   if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
2282    || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
2283   ){
2284     p->rc = SQLITE_NOMEM;
2285     return;
2286   }
2287 #if SPELLFIX_MX_RUN>1
2288   for(i=0; i<pCur->nRow; i++){
2289     if( pCur->a[i].iScore>iWorst ){
2290       iWorst = pCur->a[i].iScore;
2291       idxWorst = i;
2292     }
2293   }
2294 #endif
2295   while( sqlite3_step(pStmt)==SQLITE_ROW ){
2296     int iMatchlen = -1;
2297     iRank = sqlite3_column_int(pStmt, 2);
2298     if( p->pMatchStr3 ){
2299       int nWord = sqlite3_column_bytes(pStmt, 1);
2300       zWord = (const char*)sqlite3_column_text(pStmt, 1);
2301       iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
2302     }else{
2303       zK1 = (const char*)sqlite3_column_text(pStmt, 3);
2304       if( zK1==0 ) continue;
2305       iDist = editdist1(p->zPattern, zK1, 0);
2306     }
2307     if( iDist<0 ){
2308       p->rc = SQLITE_NOMEM;
2309       break;
2310     }
2311     pCur->nSearch++;
2312 
2313     /* If there is a "distance < $dist" or "distance <= $dist" constraint,
2314     ** check if this row meets it. If not, jump back up to the top of the
2315     ** loop to process the next row. Otherwise, if the row does match the
2316     ** distance constraint, check if the pCur->a[] array is already full.
2317     ** If it is and no explicit "top = ?" constraint was present in the
2318     ** query, grow the array to ensure there is room for the new entry. */
2319     assert( (p->iMaxDist>=0)==((pCur->idxNum & SPELLFIX_IDXNUM_DIST) ? 1 : 0) );
2320     if( p->iMaxDist>=0 ){
2321       if( iDist>p->iMaxDist ) continue;
2322       if( pCur->nRow>=pCur->nAlloc && (pCur->idxNum & SPELLFIX_IDXNUM_TOP)==0 ){
2323         spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
2324         if( pCur->a==0 ) break;
2325       }
2326     }
2327 
2328     iScore = spellfix1Score(iDist,iRank);
2329     if( pCur->nRow<pCur->nAlloc ){
2330       idx = pCur->nRow;
2331     }else if( iScore<iWorst ){
2332       idx = idxWorst;
2333       sqlite3_free(pCur->a[idx].zWord);
2334     }else{
2335       continue;
2336     }
2337 
2338     pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
2339     if( pCur->a[idx].zWord==0 ){
2340       p->rc = SQLITE_NOMEM;
2341       break;
2342     }
2343     pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
2344     pCur->a[idx].iRank = iRank;
2345     pCur->a[idx].iDistance = iDist;
2346     pCur->a[idx].iScore = iScore;
2347     pCur->a[idx].iMatchlen = iMatchlen;
2348     memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
2349     if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
2350     if( pCur->nRow==pCur->nAlloc ){
2351       iWorst = pCur->a[0].iScore;
2352       idxWorst = 0;
2353       for(i=1; i<pCur->nRow; i++){
2354         iScore = pCur->a[i].iScore;
2355         if( iWorst<iScore ){
2356           iWorst = iScore;
2357           idxWorst = i;
2358         }
2359       }
2360     }
2361   }
2362   rc = sqlite3_reset(pStmt);
2363   if( rc ) p->rc = rc;
2364 }
2365 
2366 /*
2367 ** This version of the xFilter method work if the MATCH term is present
2368 ** and we are doing a scan.
2369 */
2370 static int spellfix1FilterForMatch(
2371   spellfix1_cursor *pCur,
2372   int argc,
2373   sqlite3_value **argv
2374 ){
2375   int idxNum = pCur->idxNum;
2376   const unsigned char *zMatchThis;   /* RHS of the MATCH operator */
2377   EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
2378   char *zPattern;                    /* Transliteration of zMatchThis */
2379   int nPattern;                      /* Length of zPattern */
2380   int iLimit = 20;                   /* Max number of rows of output */
2381   int iScope = 3;                    /* Use this many characters of zClass */
2382   int iLang = 0;                     /* Language code */
2383   char *zSql;                        /* SQL of shadow table query */
2384   sqlite3_stmt *pStmt = 0;           /* Shadow table query */
2385   int rc;                            /* Result code */
2386   int idx = 1;                       /* Next available filter parameter */
2387   spellfix1_vtab *p = pCur->pVTab;   /* The virtual table that owns pCur */
2388   MatchQuery x;                      /* For passing info to RunQuery() */
2389 
2390   /* Load the cost table if we have not already done so */
2391   if( p->zCostTable!=0 && p->pConfig3==0 ){
2392     p->pConfig3 = sqlite3_malloc( sizeof(p->pConfig3[0]) );
2393     if( p->pConfig3==0 ) return SQLITE_NOMEM;
2394     memset(p->pConfig3, 0, sizeof(p->pConfig3[0]));
2395     rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable);
2396     if( rc ) return rc;
2397   }
2398   memset(&x, 0, sizeof(x));
2399   x.iScope = 3;  /* Default scope if none specified by "WHERE scope=N" */
2400   x.iMaxDist = -1;   /* Maximum allowed edit distance */
2401 
2402   if( idxNum&2 ){
2403     iLang = sqlite3_value_int(argv[idx++]);
2404   }
2405   if( idxNum&4 ){
2406     iLimit = sqlite3_value_int(argv[idx++]);
2407     if( iLimit<1 ) iLimit = 1;
2408   }
2409   if( idxNum&8 ){
2410     x.iScope = sqlite3_value_int(argv[idx++]);
2411     if( x.iScope<1 ) x.iScope = 1;
2412     if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2;
2413   }
2414   if( idxNum&(16|32) ){
2415     x.iMaxDist = sqlite3_value_int(argv[idx++]);
2416     if( idxNum&16 ) x.iMaxDist--;
2417     if( x.iMaxDist<0 ) x.iMaxDist = 0;
2418   }
2419   spellfix1ResetCursor(pCur);
2420   spellfix1ResizeCursor(pCur, iLimit);
2421   zMatchThis = sqlite3_value_text(argv[0]);
2422   if( zMatchThis==0 ) return SQLITE_OK;
2423   if( p->pConfig3 ){
2424     x.pLang = editDist3FindLang(p->pConfig3, iLang);
2425     pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
2426     if( pMatchStr3==0 ){
2427       x.rc = SQLITE_NOMEM;
2428       goto filter_exit;
2429     }
2430   }else{
2431     x.pLang = 0;
2432   }
2433   zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
2434   sqlite3_free(pCur->zPattern);
2435   pCur->zPattern = zPattern;
2436   if( zPattern==0 ){
2437     x.rc = SQLITE_NOMEM;
2438     goto filter_exit;
2439   }
2440   nPattern = (int)strlen(zPattern);
2441   if( zPattern[nPattern-1]=='*' ) nPattern--;
2442   zSql = sqlite3_mprintf(
2443      "SELECT id, word, rank, k1"
2444      "  FROM \"%w\".\"%w_vocab\""
2445      " WHERE langid=%d AND k2>=?1 AND k2<?2",
2446      p->zDbName, p->zTableName, iLang
2447   );
2448   if( zSql==0 ){
2449     x.rc = SQLITE_NOMEM;
2450     pStmt = 0;
2451     goto filter_exit;
2452   }
2453   rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
2454   sqlite3_free(zSql);
2455   pCur->iLang = iLang;
2456   x.pCur = pCur;
2457   x.pStmt = pStmt;
2458   x.zPattern = zPattern;
2459   x.nPattern = nPattern;
2460   x.pMatchStr3 = pMatchStr3;
2461   x.iLang = iLang;
2462   x.rc = rc;
2463   x.pConfig3 = p->pConfig3;
2464   if( x.rc==SQLITE_OK ){
2465     spellfix1RunQuery(&x, zPattern, nPattern);
2466   }
2467 
2468   if( pCur->a ){
2469     qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
2470     pCur->iTop = iLimit;
2471     pCur->iScope = iScope;
2472   }else{
2473     x.rc = SQLITE_NOMEM;
2474   }
2475 
2476 filter_exit:
2477   sqlite3_finalize(pStmt);
2478   editDist3FromStringDelete(pMatchStr3);
2479   return x.rc;
2480 }
2481 
2482 /*
2483 ** This version of xFilter handles a full-table scan case
2484 */
2485 static int spellfix1FilterForFullScan(
2486   spellfix1_cursor *pCur,
2487   int argc,
2488   sqlite3_value **argv
2489 ){
2490   int rc = SQLITE_OK;
2491   int idxNum = pCur->idxNum;
2492   char *zSql;
2493   spellfix1_vtab *pVTab = pCur->pVTab;
2494   spellfix1ResetCursor(pCur);
2495   assert( idxNum==0 || idxNum==64 );
2496   zSql = sqlite3_mprintf(
2497      "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2498      pVTab->zDbName, pVTab->zTableName,
2499      ((idxNum & 64) ? " WHERE rowid=?" : "")
2500   );
2501   if( zSql==0 ) return SQLITE_NOMEM;
2502   rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
2503   sqlite3_free(zSql);
2504   if( rc==SQLITE_OK && (idxNum & 64) ){
2505     assert( argc==1 );
2506     rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]);
2507   }
2508   pCur->nRow = pCur->iRow = 0;
2509   if( rc==SQLITE_OK ){
2510     rc = sqlite3_step(pCur->pFullScan);
2511     if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; }
2512     if( rc==SQLITE_DONE ){ rc = SQLITE_OK; }
2513   }else{
2514     pCur->iRow = 0;
2515   }
2516   return rc;
2517 }
2518 
2519 
2520 /*
2521 ** Called to "rewind" a cursor back to the beginning so that
2522 ** it starts its output over again.  Always called at least once
2523 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2524 */
2525 static int spellfix1Filter(
2526   sqlite3_vtab_cursor *cur,
2527   int idxNum, const char *idxStr,
2528   int argc, sqlite3_value **argv
2529 ){
2530   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2531   int rc;
2532   pCur->idxNum = idxNum;
2533   if( idxNum & 1 ){
2534     rc = spellfix1FilterForMatch(pCur, argc, argv);
2535   }else{
2536     rc = spellfix1FilterForFullScan(pCur, argc, argv);
2537   }
2538   return rc;
2539 }
2540 
2541 
2542 /*
2543 ** Advance a cursor to its next row of output
2544 */
2545 static int spellfix1Next(sqlite3_vtab_cursor *cur){
2546   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2547   int rc = SQLITE_OK;
2548   if( pCur->iRow < pCur->nRow ){
2549     if( pCur->pFullScan ){
2550       rc = sqlite3_step(pCur->pFullScan);
2551       if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
2552       if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
2553     }else{
2554       pCur->iRow++;
2555     }
2556   }
2557   return rc;
2558 }
2559 
2560 /*
2561 ** Return TRUE if we are at the end-of-file
2562 */
2563 static int spellfix1Eof(sqlite3_vtab_cursor *cur){
2564   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2565   return pCur->iRow>=pCur->nRow;
2566 }
2567 
2568 /*
2569 ** Return columns from the current row.
2570 */
2571 static int spellfix1Column(
2572   sqlite3_vtab_cursor *cur,
2573   sqlite3_context *ctx,
2574   int i
2575 ){
2576   spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2577   if( pCur->pFullScan ){
2578     if( i<=SPELLFIX_COL_LANGID ){
2579       sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
2580     }else{
2581       sqlite3_result_null(ctx);
2582     }
2583     return SQLITE_OK;
2584   }
2585   switch( i ){
2586     case SPELLFIX_COL_WORD: {
2587       sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
2588       break;
2589     }
2590     case SPELLFIX_COL_RANK: {
2591       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
2592       break;
2593     }
2594     case SPELLFIX_COL_DISTANCE: {
2595       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
2596       break;
2597     }
2598     case SPELLFIX_COL_LANGID: {
2599       sqlite3_result_int(ctx, pCur->iLang);
2600       break;
2601     }
2602     case SPELLFIX_COL_SCORE: {
2603       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
2604       break;
2605     }
2606     case SPELLFIX_COL_MATCHLEN: {
2607       int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
2608       if( iMatchlen<0 ){
2609         int nPattern = (int)strlen(pCur->zPattern);
2610         char *zWord = pCur->a[pCur->iRow].zWord;
2611         int nWord = (int)strlen(zWord);
2612 
2613         if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
2614           char *zTranslit;
2615           int res;
2616           zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
2617           if( !zTranslit ) return SQLITE_NOMEM;
2618           res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
2619           sqlite3_free(zTranslit);
2620           if( res<0 ) return SQLITE_NOMEM;
2621           iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
2622         }else{
2623           iMatchlen = utf8Charlen(zWord, nWord);
2624         }
2625       }
2626 
2627       sqlite3_result_int(ctx, iMatchlen);
2628       break;
2629     }
2630     case SPELLFIX_COL_PHONEHASH: {
2631       sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
2632       break;
2633     }
2634     case SPELLFIX_COL_TOP: {
2635       sqlite3_result_int(ctx, pCur->iTop);
2636       break;
2637     }
2638     case SPELLFIX_COL_SCOPE: {
2639       sqlite3_result_int(ctx, pCur->iScope);
2640       break;
2641     }
2642     case SPELLFIX_COL_SRCHCNT: {
2643       sqlite3_result_int(ctx, pCur->nSearch);
2644       break;
2645     }
2646     default: {
2647       sqlite3_result_null(ctx);
2648       break;
2649     }
2650   }
2651   return SQLITE_OK;
2652 }
2653 
2654 /*
2655 ** The rowid.
2656 */
2657 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
2658   spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2659   if( pCur->pFullScan ){
2660     *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
2661   }else{
2662     *pRowid = pCur->a[pCur->iRow].iRowid;
2663   }
2664   return SQLITE_OK;
2665 }
2666 
2667 /*
2668 ** This function is called by the xUpdate() method. It returns a string
2669 ** containing the conflict mode that xUpdate() should use for the current
2670 ** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE".
2671 */
2672 static const char *spellfix1GetConflict(sqlite3 *db){
2673   static const char *azConflict[] = {
2674     /* Note: Instead of "FAIL" - "ABORT". */
2675     "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE"
2676   };
2677   int eConflict = sqlite3_vtab_on_conflict(db);
2678 
2679   assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE
2680        || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT
2681        || eConflict==SQLITE_REPLACE
2682   );
2683   assert( SQLITE_ROLLBACK==1 );
2684   assert( SQLITE_IGNORE==2 );
2685   assert( SQLITE_FAIL==3 );
2686   assert( SQLITE_ABORT==4 );
2687   assert( SQLITE_REPLACE==5 );
2688 
2689   return azConflict[eConflict-1];
2690 }
2691 
2692 /*
2693 ** The xUpdate() method.
2694 */
2695 static int spellfix1Update(
2696   sqlite3_vtab *pVTab,
2697   int argc,
2698   sqlite3_value **argv,
2699   sqlite_int64 *pRowid
2700 ){
2701   int rc = SQLITE_OK;
2702   sqlite3_int64 rowid, newRowid;
2703   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2704   sqlite3 *db = p->db;
2705 
2706   if( argc==1 ){
2707     /* A delete operation on the rowid given by argv[0] */
2708     rowid = *pRowid = sqlite3_value_int64(argv[0]);
2709     spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
2710                            " WHERE id=%lld",
2711                   p->zDbName, p->zTableName, rowid);
2712   }else{
2713     const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]);
2714     int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]);
2715     int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]);
2716     int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]);
2717     const unsigned char *zSoundslike =
2718            sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2719     int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2720     char *zK1, *zK2;
2721     int i;
2722     char c;
2723     const char *zConflict = spellfix1GetConflict(db);
2724 
2725     if( zWord==0 ){
2726       /* Inserts of the form:  INSERT INTO table(command) VALUES('xyzzy');
2727       ** cause zWord to be NULL, so we look at the "command" column to see
2728       ** what special actions to take */
2729       const char *zCmd =
2730          (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]);
2731       if( zCmd==0 ){
2732         pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2733                                          p->zTableName);
2734         return SQLITE_CONSTRAINT_NOTNULL;
2735       }
2736       if( strcmp(zCmd,"reset")==0 ){
2737         /* Reset the  edit cost table (if there is one). */
2738         editDist3ConfigDelete(p->pConfig3);
2739         p->pConfig3 = 0;
2740         return SQLITE_OK;
2741       }
2742       if( strncmp(zCmd,"edit_cost_table=",16)==0 ){
2743         editDist3ConfigDelete(p->pConfig3);
2744         p->pConfig3 = 0;
2745         sqlite3_free(p->zCostTable);
2746         p->zCostTable = spellfix1Dequote(zCmd+16);
2747         if( p->zCostTable==0 ) return SQLITE_NOMEM;
2748         if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){
2749           sqlite3_free(p->zCostTable);
2750           p->zCostTable = 0;
2751         }
2752         return SQLITE_OK;
2753       }
2754       pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2755                                        p->zTableName, zCmd);
2756       return SQLITE_ERROR;
2757     }
2758     if( iRank<1 ) iRank = 1;
2759     if( zSoundslike ){
2760       zK1 = (char*)transliterate(zSoundslike, nSoundslike);
2761     }else{
2762       zK1 = (char*)transliterate(zWord, nWord);
2763     }
2764     if( zK1==0 ) return SQLITE_NOMEM;
2765     for(i=0; (c = zK1[i])!=0; i++){
2766        if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
2767     }
2768     zK2 = (char*)phoneticHash((const unsigned char*)zK1, i);
2769     if( zK2==0 ){
2770       sqlite3_free(zK1);
2771       return SQLITE_NOMEM;
2772     }
2773     if( sqlite3_value_type(argv[0])==SQLITE_NULL ){
2774       if( sqlite3_value_type(argv[1])==SQLITE_NULL ){
2775         spellfix1DbExec(&rc, db,
2776                "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
2777                "VALUES(%d,%d,%Q,%Q,%Q)",
2778                p->zDbName, p->zTableName,
2779                iRank, iLang, zWord, zK1, zK2
2780         );
2781       }else{
2782         newRowid = sqlite3_value_int64(argv[1]);
2783         spellfix1DbExec(&rc, db,
2784             "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) "
2785             "VALUES(%lld,%d,%d,%Q,%Q,%Q)",
2786             zConflict, p->zDbName, p->zTableName,
2787             newRowid, iRank, iLang, zWord, zK1, zK2
2788         );
2789       }
2790       *pRowid = sqlite3_last_insert_rowid(db);
2791     }else{
2792       rowid = sqlite3_value_int64(argv[0]);
2793       newRowid = *pRowid = sqlite3_value_int64(argv[1]);
2794       spellfix1DbExec(&rc, db,
2795              "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
2796              " word=%Q, k1=%Q, k2=%Q WHERE id=%lld",
2797              zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang,
2798              zWord, zK1, zK2, rowid
2799       );
2800     }
2801     sqlite3_free(zK1);
2802     sqlite3_free(zK2);
2803   }
2804   return rc;
2805 }
2806 
2807 /*
2808 ** Rename the spellfix1 table.
2809 */
2810 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){
2811   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2812   sqlite3 *db = p->db;
2813   int rc = SQLITE_OK;
2814   char *zNewName = sqlite3_mprintf("%s", zNew);
2815   if( zNewName==0 ){
2816     return SQLITE_NOMEM;
2817   }
2818   spellfix1DbExec(&rc, db,
2819      "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
2820      p->zDbName, p->zTableName, zNewName
2821   );
2822   if( rc==SQLITE_OK ){
2823     sqlite3_free(p->zTableName);
2824     p->zTableName = zNewName;
2825   }else{
2826     sqlite3_free(zNewName);
2827   }
2828   return rc;
2829 }
2830 
2831 
2832 /*
2833 ** A virtual table module that provides fuzzy search.
2834 */
2835 static sqlite3_module spellfix1Module = {
2836   0,                       /* iVersion */
2837   spellfix1Create,         /* xCreate - handle CREATE VIRTUAL TABLE */
2838   spellfix1Connect,        /* xConnect - reconnected to an existing table */
2839   spellfix1BestIndex,      /* xBestIndex - figure out how to do a query */
2840   spellfix1Disconnect,     /* xDisconnect - close a connection */
2841   spellfix1Destroy,        /* xDestroy - handle DROP TABLE */
2842   spellfix1Open,           /* xOpen - open a cursor */
2843   spellfix1Close,          /* xClose - close a cursor */
2844   spellfix1Filter,         /* xFilter - configure scan constraints */
2845   spellfix1Next,           /* xNext - advance a cursor */
2846   spellfix1Eof,            /* xEof - check for end of scan */
2847   spellfix1Column,         /* xColumn - read data */
2848   spellfix1Rowid,          /* xRowid - read data */
2849   spellfix1Update,         /* xUpdate */
2850   0,                       /* xBegin */
2851   0,                       /* xSync */
2852   0,                       /* xCommit */
2853   0,                       /* xRollback */
2854   0,                       /* xFindMethod */
2855   spellfix1Rename,         /* xRename */
2856 };
2857 
2858 /*
2859 ** Register the various functions and the virtual table.
2860 */
2861 static int spellfix1Register(sqlite3 *db){
2862   int rc = SQLITE_OK;
2863   int i;
2864   rc = sqlite3_create_function(db, "spellfix1_translit", 1, SQLITE_UTF8, 0,
2865                                   transliterateSqlFunc, 0, 0);
2866   if( rc==SQLITE_OK ){
2867     rc = sqlite3_create_function(db, "spellfix1_editdist", 2, SQLITE_UTF8, 0,
2868                                   editdistSqlFunc, 0, 0);
2869   }
2870   if( rc==SQLITE_OK ){
2871     rc = sqlite3_create_function(db, "spellfix1_phonehash", 1, SQLITE_UTF8, 0,
2872                                   phoneticHashSqlFunc, 0, 0);
2873   }
2874   if( rc==SQLITE_OK ){
2875     rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1, SQLITE_UTF8, 0,
2876                                   scriptCodeSqlFunc, 0, 0);
2877   }
2878   if( rc==SQLITE_OK ){
2879     rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
2880   }
2881   if( rc==SQLITE_OK ){
2882     rc = editDist3Install(db);
2883   }
2884 
2885   /* Verify sanity of the translit[] table */
2886   for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
2887     assert( translit[i].cFrom<translit[i+1].cFrom );
2888   }
2889 
2890   return rc;
2891 }
2892 
2893 #endif /* SQLITE_OMIT_VIRTUALTABLE */
2894 
2895 /*
2896 ** Extension load function.
2897 */
2898 #ifdef _WIN32
2899 __declspec(dllexport)
2900 #endif
2901 int sqlite3_spellfix_init(
2902   sqlite3 *db,
2903   char **pzErrMsg,
2904   const sqlite3_api_routines *pApi
2905 ){
2906   SQLITE_EXTENSION_INIT2(pApi);
2907 #ifndef SQLITE_OMIT_VIRTUALTABLE
2908   return spellfix1Register(db);
2909 #endif
2910   return SQLITE_OK;
2911 }
2912