xref: /sqlite-3.40.0/ext/misc/spellfix.c (revision 3f09beda)
1 /*
2 ** 2012 April 10
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used
14 ** to search a large vocabulary for close matches.  See separate
15 ** documentation (http://www.sqlite.org/spellfix1.html) for details.
16 */
17 #include "sqlite3ext.h"
18 SQLITE_EXTENSION_INIT1
19 
20 #ifndef SQLITE_AMALGAMATION
21 # include <string.h>
22 # include <stdio.h>
23 # include <stdlib.h>
24 # include <assert.h>
25 # define ALWAYS(X)  1
26 # define NEVER(X)   0
27   typedef unsigned char u8;
28   typedef unsigned short u16;
29 #endif
30 #include <ctype.h>
31 
32 #ifndef SQLITE_OMIT_VIRTUALTABLE
33 
34 /*
35 ** Character classes for ASCII characters:
36 **
37 **   0   ''        Silent letters:   H W
38 **   1   'A'       Any vowel:   A E I O U (Y)
39 **   2   'B'       A bilabeal stop or fricative:  B F P V W
40 **   3   'C'       Other fricatives or back stops:  C G J K Q S X Z
41 **   4   'D'       Alveolar stops:  D T
42 **   5   'H'       Letter H at the beginning of a word
43 **   6   'L'       Glide:  L
44 **   7   'R'       Semivowel:  R
45 **   8   'M'       Nasals:  M N
46 **   9   'Y'       Letter Y at the beginning of a word.
47 **   10  '9'       Digits: 0 1 2 3 4 5 6 7 8 9
48 **   11  ' '       White space
49 **   12  '?'       Other.
50 */
51 #define CCLASS_SILENT         0
52 #define CCLASS_VOWEL          1
53 #define CCLASS_B              2
54 #define CCLASS_C              3
55 #define CCLASS_D              4
56 #define CCLASS_H              5
57 #define CCLASS_L              6
58 #define CCLASS_R              7
59 #define CCLASS_M              8
60 #define CCLASS_Y              9
61 #define CCLASS_DIGIT         10
62 #define CCLASS_SPACE         11
63 #define CCLASS_OTHER         12
64 
65 /*
66 ** The following table gives the character class for non-initial ASCII
67 ** characters.
68 */
69 static const unsigned char midClass[] = {
70  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
71  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
72  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
73  /*   */ CCLASS_SPACE,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
74  /*   */ CCLASS_SPACE,    /*   */ CCLASS_SPACE,   /*   */ CCLASS_OTHER,
75  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
76  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
77  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
78  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
79  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
80  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_SPACE,
81  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,   /* # */ CCLASS_OTHER,
82  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,   /* & */ CCLASS_OTHER,
83  /* ' */ CCLASS_SILENT,   /* ( */ CCLASS_OTHER,   /* ) */ CCLASS_OTHER,
84  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,   /* , */ CCLASS_OTHER,
85  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,   /* / */ CCLASS_OTHER,
86  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,   /* 2 */ CCLASS_DIGIT,
87  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,   /* 5 */ CCLASS_DIGIT,
88  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,   /* 8 */ CCLASS_DIGIT,
89  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,   /* ; */ CCLASS_OTHER,
90  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,   /* > */ CCLASS_OTHER,
91  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,   /* A */ CCLASS_VOWEL,
92  /* B */ CCLASS_B,        /* C */ CCLASS_C,       /* D */ CCLASS_D,
93  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,       /* G */ CCLASS_C,
94  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,   /* J */ CCLASS_C,
95  /* K */ CCLASS_C,        /* L */ CCLASS_L,       /* M */ CCLASS_M,
96  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,   /* P */ CCLASS_B,
97  /* Q */ CCLASS_C,        /* R */ CCLASS_R,       /* S */ CCLASS_C,
98  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,   /* V */ CCLASS_B,
99  /* W */ CCLASS_B,        /* X */ CCLASS_C,       /* Y */ CCLASS_VOWEL,
100  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,   /* \ */ CCLASS_OTHER,
101  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,   /* _ */ CCLASS_OTHER,
102  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,   /* b */ CCLASS_B,
103  /* c */ CCLASS_C,        /* d */ CCLASS_D,       /* e */ CCLASS_VOWEL,
104  /* f */ CCLASS_B,        /* g */ CCLASS_C,       /* h */ CCLASS_SILENT,
105  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,       /* k */ CCLASS_C,
106  /* l */ CCLASS_L,        /* m */ CCLASS_M,       /* n */ CCLASS_M,
107  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,       /* q */ CCLASS_C,
108  /* r */ CCLASS_R,        /* s */ CCLASS_C,       /* t */ CCLASS_D,
109  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,       /* w */ CCLASS_B,
110  /* x */ CCLASS_C,        /* y */ CCLASS_VOWEL,   /* z */ CCLASS_C,
111  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,   /* } */ CCLASS_OTHER,
112  /* ~ */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,
113 };
114 /*
115 ** This tables gives the character class for ASCII characters that form the
116 ** initial character of a word.  The only difference from midClass is with
117 ** the letters H, W, and Y.
118 */
119 static const unsigned char initClass[] = {
120  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
121  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
122  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
123  /*   */ CCLASS_SPACE,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
124  /*   */ CCLASS_SPACE,    /*   */ CCLASS_SPACE,   /*   */ CCLASS_OTHER,
125  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
126  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
127  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
128  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
129  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
130  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_SPACE,
131  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,   /* # */ CCLASS_OTHER,
132  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,   /* & */ CCLASS_OTHER,
133  /* ' */ CCLASS_OTHER,    /* ( */ CCLASS_OTHER,   /* ) */ CCLASS_OTHER,
134  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,   /* , */ CCLASS_OTHER,
135  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,   /* / */ CCLASS_OTHER,
136  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,   /* 2 */ CCLASS_DIGIT,
137  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,   /* 5 */ CCLASS_DIGIT,
138  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,   /* 8 */ CCLASS_DIGIT,
139  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,   /* ; */ CCLASS_OTHER,
140  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,   /* > */ CCLASS_OTHER,
141  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,   /* A */ CCLASS_VOWEL,
142  /* B */ CCLASS_B,        /* C */ CCLASS_C,       /* D */ CCLASS_D,
143  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,       /* G */ CCLASS_C,
144  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,   /* J */ CCLASS_C,
145  /* K */ CCLASS_C,        /* L */ CCLASS_L,       /* M */ CCLASS_M,
146  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,   /* P */ CCLASS_B,
147  /* Q */ CCLASS_C,        /* R */ CCLASS_R,       /* S */ CCLASS_C,
148  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,   /* V */ CCLASS_B,
149  /* W */ CCLASS_B,        /* X */ CCLASS_C,       /* Y */ CCLASS_Y,
150  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,   /* \ */ CCLASS_OTHER,
151  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,   /* _ */ CCLASS_OTHER,
152  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,   /* b */ CCLASS_B,
153  /* c */ CCLASS_C,        /* d */ CCLASS_D,       /* e */ CCLASS_VOWEL,
154  /* f */ CCLASS_B,        /* g */ CCLASS_C,       /* h */ CCLASS_SILENT,
155  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,       /* k */ CCLASS_C,
156  /* l */ CCLASS_L,        /* m */ CCLASS_M,       /* n */ CCLASS_M,
157  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,       /* q */ CCLASS_C,
158  /* r */ CCLASS_R,        /* s */ CCLASS_C,       /* t */ CCLASS_D,
159  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,       /* w */ CCLASS_B,
160  /* x */ CCLASS_C,        /* y */ CCLASS_Y,       /* z */ CCLASS_C,
161  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,   /* } */ CCLASS_OTHER,
162  /* ~ */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,
163 };
164 
165 /*
166 ** Mapping from the character class number (0-13) to a symbol for each
167 ** character class.  Note that initClass[] can be used to map the class
168 ** symbol back into the class number.
169 */
170 static const unsigned char className[] = ".ABCDHLRMY9 ?";
171 
172 /*
173 ** Generate a "phonetic hash" from a string of ASCII characters
174 ** in zIn[0..nIn-1].
175 **
176 **   * Map characters by character class as defined above.
177 **   * Omit double-letters
178 **   * Omit vowels beside R and L
179 **   * Omit T when followed by CH
180 **   * Omit W when followed by R
181 **   * Omit D when followed by J or G
182 **   * Omit K in KN or G in GN at the beginning of a word
183 **
184 ** Space to hold the result is obtained from sqlite3_malloc()
185 **
186 ** Return NULL if memory allocation fails.
187 */
188 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){
189   unsigned char *zOut = sqlite3_malloc( nIn + 1 );
190   int i;
191   int nOut = 0;
192   char cPrev = 0x77;
193   char cPrevX = 0x77;
194   const unsigned char *aClass = initClass;
195 
196   if( zOut==0 ) return 0;
197   if( nIn>2 ){
198     switch( zIn[0] ){
199       case 'g':
200       case 'k': {
201         if( zIn[1]=='n' ){ zIn++; nIn--; }
202         break;
203       }
204     }
205   }
206   for(i=0; i<nIn; i++){
207     unsigned char c = zIn[i];
208     if( i+1<nIn ){
209       if( c=='w' && zIn[i+1]=='r' ) continue;
210       if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
211       if( i+2<nIn ){
212         if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
213       }
214     }
215     c = aClass[c&0x7f];
216     if( c==CCLASS_SPACE ) continue;
217     if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
218     aClass = midClass;
219     if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){
220        continue; /* No vowels beside L or R */
221     }
222     if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){
223        nOut--;   /* No vowels beside L or R */
224     }
225     cPrev = c;
226     if( c==CCLASS_SILENT ) continue;
227     cPrevX = c;
228     c = className[c];
229     assert( nOut>=0 );
230     if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c;
231   }
232   zOut[nOut] = 0;
233   return zOut;
234 }
235 
236 /*
237 ** This is an SQL function wrapper around phoneticHash().  See
238 ** the description of phoneticHash() for additional information.
239 */
240 static void phoneticHashSqlFunc(
241   sqlite3_context *context,
242   int argc,
243   sqlite3_value **argv
244 ){
245   const unsigned char *zIn;
246   unsigned char *zOut;
247 
248   zIn = sqlite3_value_text(argv[0]);
249   if( zIn==0 ) return;
250   zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0]));
251   if( zOut==0 ){
252     sqlite3_result_error_nomem(context);
253   }else{
254     sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
255   }
256 }
257 
258 /*
259 ** Return the character class number for a character given its
260 ** context.
261 */
262 static char characterClass(char cPrev, char c){
263   return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
264 }
265 
266 /*
267 ** Return the cost of inserting or deleting character c immediately
268 ** following character cPrev.  If cPrev==0, that means c is the first
269 ** character of the word.
270 */
271 static int insertOrDeleteCost(char cPrev, char c, char cNext){
272   char classC = characterClass(cPrev, c);
273   char classCprev;
274 
275   if( classC==CCLASS_SILENT ){
276     /* Insert or delete "silent" characters such as H or W */
277     return 1;
278   }
279   if( cPrev==c ){
280     /* Repeated characters, or miss a repeat */
281     return 10;
282   }
283   if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){
284     return 20;  /* Insert a vowel before or after 'r' */
285   }
286   classCprev = characterClass(cPrev, cPrev);
287   if( classC==classCprev ){
288     if( classC==CCLASS_VOWEL ){
289       /* Remove or add a new vowel to a vowel cluster */
290       return 15;
291     }else{
292       /* Remove or add a consonant not in the same class */
293       return 50;
294     }
295   }
296 
297   /* any other character insertion or deletion */
298   return 100;
299 }
300 
301 /*
302 ** Divide the insertion cost by this factor when appending to the
303 ** end of the word.
304 */
305 #define FINAL_INS_COST_DIV  4
306 
307 /*
308 ** Return the cost of substituting cTo in place of cFrom assuming
309 ** the previous character is cPrev.  If cPrev==0 then cTo is the first
310 ** character of the word.
311 */
312 static int substituteCost(char cPrev, char cFrom, char cTo){
313   char classFrom, classTo;
314   if( cFrom==cTo ){
315     /* Exact match */
316     return 0;
317   }
318   if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){
319     /* differ only in case */
320     return 0;
321   }
322   classFrom = characterClass(cPrev, cFrom);
323   classTo = characterClass(cPrev, cTo);
324   if( classFrom==classTo ){
325     /* Same character class */
326     return 40;
327   }
328   if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
329       && classTo>=CCLASS_B && classTo<=CCLASS_Y ){
330     /* Convert from one consonant to another, but in a different class */
331     return 75;
332   }
333   /* Any other subsitution */
334   return 100;
335 }
336 
337 /*
338 ** Given two strings zA and zB which are pure ASCII, return the cost
339 ** of transforming zA into zB.  If zA ends with '*' assume that it is
340 ** a prefix of zB and give only minimal penalty for extra characters
341 ** on the end of zB.
342 **
343 ** Smaller numbers mean a closer match.
344 **
345 ** Negative values indicate an error:
346 **    -1  One of the inputs is NULL
347 **    -2  Non-ASCII characters on input
348 **    -3  Unable to allocate memory
349 **
350 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
351 ** of zB that matched the pattern in zA. If zA does not end with a '*',
352 ** then this value is always the number of bytes in zB (i.e. strlen(zB)).
353 ** If zA does end in a '*', then it is the number of bytes in the prefix
354 ** of zB that was deemed to match zA.
355 */
356 static int editdist1(const char *zA, const char *zB, int *pnMatch){
357   int nA, nB;            /* Number of characters in zA[] and zB[] */
358   int xA, xB;            /* Loop counters for zA[] and zB[] */
359   char cA = 0, cB;       /* Current character of zA and zB */
360   char cAprev, cBprev;   /* Previous character of zA and zB */
361   char cAnext, cBnext;   /* Next character in zA and zB */
362   int d;                 /* North-west cost value */
363   int dc = 0;            /* North-west character value */
364   int res;               /* Final result */
365   int *m;                /* The cost matrix */
366   char *cx;              /* Corresponding character values */
367   int *toFree = 0;       /* Malloced space */
368   int mStack[60+15];     /* Stack space to use if not too much is needed */
369   int nMatch = 0;
370 
371   /* Early out if either input is NULL */
372   if( zA==0 || zB==0 ) return -1;
373 
374   /* Skip any common prefix */
375   while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
376   if( pnMatch ) *pnMatch = nMatch;
377   if( zA[0]==0 && zB[0]==0 ) return 0;
378 
379 #if 0
380   printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
381 #endif
382 
383   /* Verify input strings and measure their lengths */
384   for(nA=0; zA[nA]; nA++){
385     if( zA[nA]&0x80 ) return -2;
386   }
387   for(nB=0; zB[nB]; nB++){
388     if( zB[nB]&0x80 ) return -2;
389   }
390 
391   /* Special processing if either string is empty */
392   if( nA==0 ){
393     cBprev = dc;
394     for(xB=res=0; (cB = zB[xB])!=0; xB++){
395       res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
396       cBprev = cB;
397     }
398     return res;
399   }
400   if( nB==0 ){
401     cAprev = dc;
402     for(xA=res=0; (cA = zA[xA])!=0; xA++){
403       res += insertOrDeleteCost(cAprev, cA, zA[xA+1]);
404       cAprev = cA;
405     }
406     return res;
407   }
408 
409   /* A is a prefix of B */
410   if( zA[0]=='*' && zA[1]==0 ) return 0;
411 
412   /* Allocate and initialize the Wagner matrix */
413   if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
414     m = mStack;
415   }else{
416     m = toFree = sqlite3_malloc( (nB+1)*5*sizeof(m[0])/4 );
417     if( m==0 ) return -3;
418   }
419   cx = (char*)&m[nB+1];
420 
421   /* Compute the Wagner edit distance */
422   m[0] = 0;
423   cx[0] = dc;
424   cBprev = dc;
425   for(xB=1; xB<=nB; xB++){
426     cBnext = zB[xB];
427     cB = zB[xB-1];
428     cx[xB] = cB;
429     m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext);
430     cBprev = cB;
431   }
432   cAprev = dc;
433   for(xA=1; xA<=nA; xA++){
434     int lastA = (xA==nA);
435     cA = zA[xA-1];
436     cAnext = zA[xA];
437     if( cA=='*' && lastA ) break;
438     d = m[0];
439     dc = cx[0];
440     m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
441     cBprev = 0;
442     for(xB=1; xB<=nB; xB++){
443       int totalCost, insCost, delCost, subCost, ncx;
444       cB = zB[xB-1];
445       cBnext = zB[xB];
446 
447       /* Cost to insert cB */
448       insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext);
449       if( lastA ) insCost /= FINAL_INS_COST_DIV;
450 
451       /* Cost to delete cA */
452       delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
453 
454       /* Cost to substitute cA->cB */
455       subCost = substituteCost(cx[xB-1], cA, cB);
456 
457       /* Best cost */
458       totalCost = insCost + m[xB-1];
459       ncx = cB;
460       if( (delCost + m[xB])<totalCost ){
461         totalCost = delCost + m[xB];
462         ncx = cA;
463       }
464       if( (subCost + d)<totalCost ){
465         totalCost = subCost + d;
466       }
467 
468 #if 0
469       printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
470              " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
471              xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
472              insCost, delCost, subCost, totalCost, ncx?ncx:' ');
473 #endif
474 
475       /* Update the matrix */
476       d = m[xB];
477       dc = cx[xB];
478       m[xB] = totalCost;
479       cx[xB] = ncx;
480       cBprev = cB;
481     }
482     cAprev = cA;
483   }
484 
485   /* Free the wagner matrix and return the result */
486   if( cA=='*' ){
487     res = m[1];
488     for(xB=1; xB<=nB; xB++){
489       if( m[xB]<res ){
490         res = m[xB];
491         if( pnMatch ) *pnMatch = xB+nMatch;
492       }
493     }
494   }else{
495     res = m[nB];
496     /* In the current implementation, pnMatch is always NULL if zA does
497     ** not end in "*" */
498     assert( pnMatch==0 );
499   }
500   sqlite3_free(toFree);
501   return res;
502 }
503 
504 /*
505 ** Function:    editdist(A,B)
506 **
507 ** Return the cost of transforming string A into string B.  Both strings
508 ** must be pure ASCII text.  If A ends with '*' then it is assumed to be
509 ** a prefix of B and extra characters on the end of B have minimal additional
510 ** cost.
511 */
512 static void editdistSqlFunc(
513   sqlite3_context *context,
514   int argc,
515   sqlite3_value **argv
516 ){
517   int res = editdist1(
518                     (const char*)sqlite3_value_text(argv[0]),
519                     (const char*)sqlite3_value_text(argv[1]),
520                     0);
521   if( res<0 ){
522     if( res==(-3) ){
523       sqlite3_result_error_nomem(context);
524     }else if( res==(-2) ){
525       sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
526     }else{
527       sqlite3_result_error(context, "NULL input to editdist()", -1);
528     }
529   }else{
530     sqlite3_result_int(context, res);
531   }
532 }
533 
534 /* End of the fixed-cost edit distance implementation
535 ******************************************************************************
536 *****************************************************************************
537 ** Begin: Configurable cost unicode edit distance routines
538 */
539 /* Forward declaration of structures */
540 typedef struct EditDist3Cost EditDist3Cost;
541 typedef struct EditDist3Config EditDist3Config;
542 typedef struct EditDist3Point EditDist3Point;
543 typedef struct EditDist3From EditDist3From;
544 typedef struct EditDist3FromString EditDist3FromString;
545 typedef struct EditDist3To EditDist3To;
546 typedef struct EditDist3ToString EditDist3ToString;
547 typedef struct EditDist3Lang EditDist3Lang;
548 
549 
550 /*
551 ** An entry in the edit cost table
552 */
553 struct EditDist3Cost {
554   EditDist3Cost *pNext;     /* Next cost element */
555   u8 nFrom;                 /* Number of bytes in aFrom */
556   u8 nTo;                   /* Number of bytes in aTo */
557   u16 iCost;                /* Cost of this transformation */
558   char a[4]    ;            /* FROM string followed by TO string */
559   /* Additional TO and FROM string bytes appended as necessary */
560 };
561 
562 /*
563 ** Edit costs for a particular language ID
564 */
565 struct EditDist3Lang {
566   int iLang;             /* Language ID */
567   int iInsCost;          /* Default insertion cost */
568   int iDelCost;          /* Default deletion cost */
569   int iSubCost;          /* Default substitution cost */
570   EditDist3Cost *pCost;  /* Costs */
571 };
572 
573 
574 /*
575 ** The default EditDist3Lang object, with default costs.
576 */
577 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 };
578 
579 /*
580 ** Complete configuration
581 */
582 struct EditDist3Config {
583   int nLang;             /* Number of language IDs.  Size of a[] */
584   EditDist3Lang *a;      /* One for each distinct language ID */
585 };
586 
587 /*
588 ** Extra information about each character in the FROM string.
589 */
590 struct EditDist3From {
591   int nSubst;              /* Number of substitution cost entries */
592   int nDel;                /* Number of deletion cost entries */
593   int nByte;               /* Number of bytes in this character */
594   EditDist3Cost **apSubst; /* Array of substitution costs for this element */
595   EditDist3Cost **apDel;   /* Array of deletion cost entries */
596 };
597 
598 /*
599 ** A precompiled FROM string.
600 *
601 ** In the common case we expect the FROM string to be reused multiple times.
602 ** In other words, the common case will be to measure the edit distance
603 ** from a single origin string to multiple target strings.
604 */
605 struct EditDist3FromString {
606   char *z;                 /* The complete text of the FROM string */
607   int n;                   /* Number of characters in the FROM string */
608   int isPrefix;            /* True if ends with '*' character */
609   EditDist3From *a;        /* Extra info about each char of the FROM string */
610 };
611 
612 /*
613 ** Extra information about each character in the TO string.
614 */
615 struct EditDist3To {
616   int nIns;                /* Number of insertion cost entries */
617   int nByte;               /* Number of bytes in this character */
618   EditDist3Cost **apIns;   /* Array of deletion cost entries */
619 };
620 
621 /*
622 ** A precompiled FROM string
623 */
624 struct EditDist3ToString {
625   char *z;                 /* The complete text of the TO string */
626   int n;                   /* Number of characters in the TO string */
627   EditDist3To *a;          /* Extra info about each char of the TO string */
628 };
629 
630 /*
631 ** Clear or delete an instance of the object that records all edit-distance
632 ** weights.
633 */
634 static void editDist3ConfigClear(EditDist3Config *p){
635   int i;
636   if( p==0 ) return;
637   for(i=0; i<p->nLang; i++){
638     EditDist3Cost *pCost, *pNext;
639     pCost = p->a[i].pCost;
640     while( pCost ){
641       pNext = pCost->pNext;
642       sqlite3_free(pCost);
643       pCost = pNext;
644     }
645   }
646   sqlite3_free(p->a);
647   memset(p, 0, sizeof(*p));
648 }
649 static void editDist3ConfigDelete(void *pIn){
650   EditDist3Config *p = (EditDist3Config*)pIn;
651   editDist3ConfigClear(p);
652   sqlite3_free(p);
653 }
654 
655 /*
656 ** Load all edit-distance weights from a table.
657 */
658 static int editDist3ConfigLoad(
659   EditDist3Config *p,      /* The edit distance configuration to load */
660   sqlite3 *db,            /* Load from this database */
661   const char *zTable      /* Name of the table from which to load */
662 ){
663   sqlite3_stmt *pStmt;
664   int rc, rc2;
665   char *zSql;
666   int iLangPrev = -9999;
667   EditDist3Lang *pLang = 0;
668 
669   zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
670                          " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
671   if( zSql==0 ) return SQLITE_NOMEM;
672   rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
673   sqlite3_free(zSql);
674   if( rc ) return rc;
675   editDist3ConfigClear(p);
676   while( sqlite3_step(pStmt)==SQLITE_ROW ){
677     int iLang = sqlite3_column_int(pStmt, 0);
678     const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
679     int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
680     const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
681     int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
682     int iCost = sqlite3_column_int(pStmt, 3);
683 
684     assert( zFrom!=0 || nFrom==0 );
685     assert( zTo!=0 || nTo==0 );
686     if( nFrom>100 || nTo>100 ) continue;
687     if( iCost<0 ) continue;
688     if( pLang==0 || iLang!=iLangPrev ){
689       EditDist3Lang *pNew;
690       pNew = sqlite3_realloc(p->a, (p->nLang+1)*sizeof(p->a[0]));
691       if( pNew==0 ){ rc = SQLITE_NOMEM; break; }
692       p->a = pNew;
693       pLang = &p->a[p->nLang];
694       p->nLang++;
695       pLang->iLang = iLang;
696       pLang->iInsCost = 100;
697       pLang->iDelCost = 100;
698       pLang->iSubCost = 150;
699       pLang->pCost = 0;
700       iLangPrev = iLang;
701     }
702     if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){
703       pLang->iDelCost = iCost;
704     }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){
705       pLang->iInsCost = iCost;
706     }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){
707       pLang->iSubCost = iCost;
708     }else{
709       EditDist3Cost *pCost;
710       int nExtra = nFrom + nTo - 4;
711       if( nExtra<0 ) nExtra = 0;
712       pCost = sqlite3_malloc( sizeof(*pCost) + nExtra );
713       if( pCost==0 ){ rc = SQLITE_NOMEM; break; }
714       pCost->nFrom = nFrom;
715       pCost->nTo = nTo;
716       pCost->iCost = iCost;
717       memcpy(pCost->a, zFrom, nFrom);
718       memcpy(pCost->a + nFrom, zTo, nTo);
719       pCost->pNext = pLang->pCost;
720       pLang->pCost = pCost;
721     }
722   }
723   rc2 = sqlite3_finalize(pStmt);
724   if( rc==SQLITE_OK ) rc = rc2;
725   return rc;
726 }
727 
728 /*
729 ** Return the length (in bytes) of a utf-8 character.  Or return a maximum
730 ** of N.
731 */
732 static int utf8Len(unsigned char c, int N){
733   int len = 1;
734   if( c>0x7f ){
735     if( (c&0xe0)==0xc0 ){
736       len = 2;
737     }else if( (c&0xf0)==0xe0 ){
738       len = 3;
739     }else{
740       len = 4;
741     }
742   }
743   if( len>N ) len = N;
744   return len;
745 }
746 
747 /*
748 ** Return TRUE (non-zero) if the To side of the given cost matches
749 ** the given string.
750 */
751 static int matchTo(EditDist3Cost *p, const char *z, int n){
752   if( p->nTo>n ) return 0;
753   if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0;
754   return 1;
755 }
756 
757 /*
758 ** Return TRUE (non-zero) if the From side of the given cost matches
759 ** the given string.
760 */
761 static int matchFrom(EditDist3Cost *p, const char *z, int n){
762   assert( p->nFrom<=n );
763   if( strncmp(p->a, z, p->nFrom)!=0 ) return 0;
764   return 1;
765 }
766 
767 /*
768 ** Return TRUE (non-zero) of the next FROM character and the next TO
769 ** character are the same.
770 */
771 static int matchFromTo(
772   EditDist3FromString *pStr,  /* Left hand string */
773   int n1,                     /* Index of comparison character on the left */
774   const char *z2,             /* Right-handl comparison character */
775   int n2                      /* Bytes remaining in z2[] */
776 ){
777   int b1 = pStr->a[n1].nByte;
778   if( b1>n2 ) return 0;
779   if( memcmp(pStr->z+n1, z2, b1)!=0 ) return 0;
780   return 1;
781 }
782 
783 /*
784 ** Delete an EditDist3FromString objecct
785 */
786 static void editDist3FromStringDelete(EditDist3FromString *p){
787   int i;
788   if( p ){
789     for(i=0; i<p->n; i++){
790       sqlite3_free(p->a[i].apDel);
791       sqlite3_free(p->a[i].apSubst);
792     }
793     sqlite3_free(p);
794   }
795 }
796 
797 /*
798 ** Create a EditDist3FromString object.
799 */
800 static EditDist3FromString *editDist3FromStringNew(
801   const EditDist3Lang *pLang,
802   const char *z,
803   int n
804 ){
805   EditDist3FromString *pStr;
806   EditDist3Cost *p;
807   int i;
808 
809   if( z==0 ) return 0;
810   if( n<0 ) n = (int)strlen(z);
811   pStr = sqlite3_malloc( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
812   if( pStr==0 ) return 0;
813   pStr->a = (EditDist3From*)&pStr[1];
814   memset(pStr->a, 0, sizeof(pStr->a[0])*n);
815   pStr->n = n;
816   pStr->z = (char*)&pStr->a[n];
817   memcpy(pStr->z, z, n+1);
818   if( n && z[n-1]=='*' ){
819     pStr->isPrefix = 1;
820     n--;
821     pStr->n--;
822     pStr->z[n] = 0;
823   }else{
824     pStr->isPrefix = 0;
825   }
826 
827   for(i=0; i<n; i++){
828     EditDist3From *pFrom = &pStr->a[i];
829     memset(pFrom, 0, sizeof(*pFrom));
830     pFrom->nByte = utf8Len((unsigned char)z[i], n-i);
831     for(p=pLang->pCost; p; p=p->pNext){
832       EditDist3Cost **apNew;
833       if( i+p->nFrom>n ) continue;
834       if( matchFrom(p, z+i, n-i)==0 ) continue;
835       if( p->nTo==0 ){
836         apNew = sqlite3_realloc(pFrom->apDel,
837                                 sizeof(*apNew)*(pFrom->nDel+1));
838         if( apNew==0 ) break;
839         pFrom->apDel = apNew;
840         apNew[pFrom->nDel++] = p;
841       }else{
842         apNew = sqlite3_realloc(pFrom->apSubst,
843                                 sizeof(*apNew)*(pFrom->nSubst+1));
844         if( apNew==0 ) break;
845         pFrom->apSubst = apNew;
846         apNew[pFrom->nSubst++] = p;
847       }
848     }
849     if( p ){
850       editDist3FromStringDelete(pStr);
851       pStr = 0;
852       break;
853     }
854   }
855   return pStr;
856 }
857 
858 /*
859 ** Update entry m[i] such that it is the minimum of its current value
860 ** and m[j]+iCost.
861 **
862 ** If the iCost is 1,000,000 or greater, then consider the cost to be
863 ** infinite and skip the update.
864 */
865 static void updateCost(
866   unsigned int *m,
867   int i,
868   int j,
869   int iCost
870 ){
871   assert( iCost>=0 );
872   if( iCost<10000 ){
873     unsigned int b = m[j] + iCost;
874     if( b<m[i] ) m[i] = b;
875   }
876 }
877 
878 /* Compute the edit distance between two strings.
879 **
880 ** If an error occurs, return a negative number which is the error code.
881 **
882 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters
883 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
884 ** not contain the pattern for a prefix-search, then this is always the number
885 ** of characters in z2. If pFrom does contain a prefix search pattern, then
886 ** it is the number of characters in the prefix of z2 that was deemed to
887 ** match pFrom.
888 */
889 static int editDist3Core(
890   EditDist3FromString *pFrom,  /* The FROM string */
891   const char *z2,              /* The TO string */
892   int n2,                      /* Length of the TO string */
893   const EditDist3Lang *pLang,  /* Edit weights for a particular language ID */
894   int *pnMatch                 /* OUT: Characters in matched prefix */
895 ){
896   int k, n;
897   int i1, b1;
898   int i2, b2;
899   EditDist3FromString f = *pFrom;
900   EditDist3To *a2;
901   unsigned int *m;
902   int szRow;
903   EditDist3Cost *p;
904   int res;
905 
906   /* allocate the Wagner matrix and the aTo[] array for the TO string */
907   n = (f.n+1)*(n2+1);
908   n = (n+1)&~1;
909   m = sqlite3_malloc( n*sizeof(m[0]) + sizeof(a2[0])*n2 );
910   if( m==0 ) return -1;            /* Out of memory */
911   a2 = (EditDist3To*)&m[n];
912   memset(a2, 0, sizeof(a2[0])*n2);
913 
914   /* Fill in the a1[] matrix for all characters of the TO string */
915   for(i2=0; i2<n2; i2++){
916     a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2);
917     for(p=pLang->pCost; p; p=p->pNext){
918       EditDist3Cost **apNew;
919       if( p->nFrom>0 ) continue;
920       if( i2+p->nTo>n2 ) continue;
921       if( matchTo(p, z2+i2, n2-i2)==0 ) continue;
922       a2[i2].nIns++;
923       apNew = sqlite3_realloc(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns);
924       if( apNew==0 ){
925         res = -1;  /* Out of memory */
926         goto editDist3Abort;
927       }
928       a2[i2].apIns = apNew;
929       a2[i2].apIns[a2[i2].nIns-1] = p;
930     }
931   }
932 
933   /* Prepare to compute the minimum edit distance */
934   szRow = f.n+1;
935   memset(m, 0x01, (n2+1)*szRow*sizeof(m[0]));
936   m[0] = 0;
937 
938   /* First fill in the top-row of the matrix with FROM deletion costs */
939   for(i1=0; i1<f.n; i1 += b1){
940     b1 = f.a[i1].nByte;
941     updateCost(m, i1+b1, i1, pLang->iDelCost);
942     for(k=0; k<f.a[i1].nDel; k++){
943       p = f.a[i1].apDel[k];
944       updateCost(m, i1+p->nFrom, i1, p->iCost);
945     }
946   }
947 
948   /* Fill in all subsequent rows, top-to-bottom, left-to-right */
949   for(i2=0; i2<n2; i2 += b2){
950     int rx;      /* Starting index for current row */
951     int rxp;     /* Starting index for previous row */
952     b2 = a2[i2].nByte;
953     rx = szRow*(i2+b2);
954     rxp = szRow*i2;
955     updateCost(m, rx, rxp, pLang->iInsCost);
956     for(k=0; k<a2[i2].nIns; k++){
957       p = a2[i2].apIns[k];
958       updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost);
959     }
960     for(i1=0; i1<f.n; i1+=b1){
961       int cx;    /* Index of current cell */
962       int cxp;   /* Index of cell immediately to the left */
963       int cxd;   /* Index of cell to the left and one row above */
964       int cxu;   /* Index of cell immediately above */
965       b1 = f.a[i1].nByte;
966       cxp = rx + i1;
967       cx = cxp + b1;
968       cxd = rxp + i1;
969       cxu = cxd + b1;
970       updateCost(m, cx, cxp, pLang->iDelCost);
971       for(k=0; k<f.a[i1].nDel; k++){
972         p = f.a[i1].apDel[k];
973         updateCost(m, cxp+p->nFrom, cxp, p->iCost);
974       }
975       updateCost(m, cx, cxu, pLang->iInsCost);
976       if( matchFromTo(&f, i1, z2+i2, n2-i2) ){
977         updateCost(m, cx, cxd, 0);
978       }
979       updateCost(m, cx, cxd, pLang->iSubCost);
980       for(k=0; k<f.a[i1].nSubst; k++){
981         p = f.a[i1].apSubst[k];
982         if( matchTo(p, z2+i2, n2-i2) ){
983           updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
984         }
985       }
986     }
987   }
988 
989 #if 0  /* Enable for debugging */
990   printf("         ^");
991   for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
992   printf("\n   ^:");
993   for(i1=0; i1<szRow; i1++){
994     int v = m[i1];
995     if( v>9999 ) printf(" ****");
996     else         printf(" %4d", v);
997   }
998   printf("\n");
999   for(i2=0; i2<n2; i2++){
1000     printf("%c-%02x:", z2[i2], z2[i2]&0xff);
1001     for(i1=0; i1<szRow; i1++){
1002       int v = m[(i2+1)*szRow+i1];
1003       if( v>9999 ) printf(" ****");
1004       else         printf(" %4d", v);
1005     }
1006     printf("\n");
1007   }
1008 #endif
1009 
1010   /* Free memory allocations and return the result */
1011   res = (int)m[szRow*(n2+1)-1];
1012   n = n2;
1013   if( f.isPrefix ){
1014     for(i2=1; i2<=n2; i2++){
1015       int b = m[szRow*i2-1];
1016       if( b<=res ){
1017         res = b;
1018         n = i2 - 1;
1019       }
1020     }
1021   }
1022   if( pnMatch ){
1023     int nExtra = 0;
1024     for(k=0; k<n; k++){
1025       if( (z2[k] & 0xc0)==0x80 ) nExtra++;
1026     }
1027     *pnMatch = n - nExtra;
1028   }
1029 
1030 editDist3Abort:
1031   for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
1032   sqlite3_free(m);
1033   return res;
1034 }
1035 
1036 /*
1037 ** Get an appropriate EditDist3Lang object.
1038 */
1039 static const EditDist3Lang *editDist3FindLang(
1040   EditDist3Config *pConfig,
1041   int iLang
1042 ){
1043   int i;
1044   for(i=0; i<pConfig->nLang; i++){
1045     if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i];
1046   }
1047   return &editDist3Lang;
1048 }
1049 
1050 /*
1051 ** Function:    editdist3(A,B,iLang)
1052 **              editdist3(tablename)
1053 **
1054 ** Return the cost of transforming string A into string B using edit
1055 ** weights for iLang.
1056 **
1057 ** The second form loads edit weights into memory from a table.
1058 */
1059 static void editDist3SqlFunc(
1060   sqlite3_context *context,
1061   int argc,
1062   sqlite3_value **argv
1063 ){
1064   EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context);
1065   sqlite3 *db = sqlite3_context_db_handle(context);
1066   int rc;
1067   if( argc==1 ){
1068     const char *zTable = (const char*)sqlite3_value_text(argv[0]);
1069     rc = editDist3ConfigLoad(pConfig, db, zTable);
1070     if( rc ) sqlite3_result_error_code(context, rc);
1071   }else{
1072     const char *zA = (const char*)sqlite3_value_text(argv[0]);
1073     const char *zB = (const char*)sqlite3_value_text(argv[1]);
1074     int nA = sqlite3_value_bytes(argv[0]);
1075     int nB = sqlite3_value_bytes(argv[1]);
1076     int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0;
1077     const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang);
1078     EditDist3FromString *pFrom;
1079     int dist;
1080 
1081     pFrom = editDist3FromStringNew(pLang, zA, nA);
1082     if( pFrom==0 ){
1083       sqlite3_result_error_nomem(context);
1084       return;
1085     }
1086     dist = editDist3Core(pFrom, zB, nB, pLang, 0);
1087     editDist3FromStringDelete(pFrom);
1088     if( dist==(-1) ){
1089       sqlite3_result_error_nomem(context);
1090     }else{
1091       sqlite3_result_int(context, dist);
1092     }
1093   }
1094 }
1095 
1096 /*
1097 ** Register the editDist3 function with SQLite
1098 */
1099 static int editDist3Install(sqlite3 *db){
1100   int rc;
1101   EditDist3Config *pConfig = sqlite3_malloc( sizeof(*pConfig) );
1102   if( pConfig==0 ) return SQLITE_NOMEM;
1103   memset(pConfig, 0, sizeof(*pConfig));
1104   rc = sqlite3_create_function_v2(db, "editdist3",
1105               2, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1106   if( rc==SQLITE_OK ){
1107     rc = sqlite3_create_function_v2(db, "editdist3",
1108                 3, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1109   }
1110   if( rc==SQLITE_OK ){
1111     rc = sqlite3_create_function_v2(db, "editdist3",
1112                 1, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0,
1113                 editDist3ConfigDelete);
1114   }else{
1115     sqlite3_free(pConfig);
1116   }
1117   return rc;
1118 }
1119 /* End configurable cost unicode edit distance routines
1120 ******************************************************************************
1121 ******************************************************************************
1122 ** Begin transliterate unicode-to-ascii implementation
1123 */
1124 
1125 #if !SQLITE_AMALGAMATION
1126 /*
1127 ** This lookup table is used to help decode the first byte of
1128 ** a multi-byte UTF8 character.
1129 */
1130 static const unsigned char sqlite3Utf8Trans1[] = {
1131   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1132   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1133   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1134   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1135   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1136   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1137   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1138   0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
1139 };
1140 #endif
1141 
1142 /*
1143 ** Return the value of the first UTF-8 character in the string.
1144 */
1145 static int utf8Read(const unsigned char *z, int n, int *pSize){
1146   int c, i;
1147 
1148   /* All callers to this routine (in the current implementation)
1149   ** always have n>0. */
1150   if( NEVER(n==0) ){
1151     c = i = 0;
1152   }else{
1153     c = z[0];
1154     i = 1;
1155     if( c>=0xc0 ){
1156       c = sqlite3Utf8Trans1[c-0xc0];
1157       while( i<n && (z[i] & 0xc0)==0x80 ){
1158         c = (c<<6) + (0x3f & z[i++]);
1159       }
1160     }
1161   }
1162   *pSize = i;
1163   return c;
1164 }
1165 
1166 /*
1167 ** Return the number of characters in the utf-8 string in the nIn byte
1168 ** buffer pointed to by zIn.
1169 */
1170 static int utf8Charlen(const char *zIn, int nIn){
1171   int i;
1172   int nChar = 0;
1173   for(i=0; i<nIn; nChar++){
1174     int sz;
1175     utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1176     i += sz;
1177   }
1178   return nChar;
1179 }
1180 
1181 /*
1182 ** Table of translations from unicode characters into ASCII.
1183 */
1184 static const struct {
1185  unsigned short int cFrom;
1186  unsigned char cTo0, cTo1;
1187 } translit[] = {
1188   { 0x00A0,  0x20, 0x00 },  /*   to   */
1189   { 0x00B5,  0x75, 0x00 },  /* µ to u */
1190   { 0x00C0,  0x41, 0x00 },  /* À to A */
1191   { 0x00C1,  0x41, 0x00 },  /* Á to A */
1192   { 0x00C2,  0x41, 0x00 },  /* Â to A */
1193   { 0x00C3,  0x41, 0x00 },  /* Ã to A */
1194   { 0x00C4,  0x41, 0x65 },  /* Ä to Ae */
1195   { 0x00C5,  0x41, 0x61 },  /* Å to Aa */
1196   { 0x00C6,  0x41, 0x45 },  /* Æ to AE */
1197   { 0x00C7,  0x43, 0x00 },  /* Ç to C */
1198   { 0x00C8,  0x45, 0x00 },  /* È to E */
1199   { 0x00C9,  0x45, 0x00 },  /* É to E */
1200   { 0x00CA,  0x45, 0x00 },  /* Ê to E */
1201   { 0x00CB,  0x45, 0x00 },  /* Ë to E */
1202   { 0x00CC,  0x49, 0x00 },  /* Ì to I */
1203   { 0x00CD,  0x49, 0x00 },  /* Í to I */
1204   { 0x00CE,  0x49, 0x00 },  /* Î to I */
1205   { 0x00CF,  0x49, 0x00 },  /* Ï to I */
1206   { 0x00D0,  0x44, 0x00 },  /* Ð to D */
1207   { 0x00D1,  0x4E, 0x00 },  /* Ñ to N */
1208   { 0x00D2,  0x4F, 0x00 },  /* Ò to O */
1209   { 0x00D3,  0x4F, 0x00 },  /* Ó to O */
1210   { 0x00D4,  0x4F, 0x00 },  /* Ô to O */
1211   { 0x00D5,  0x4F, 0x00 },  /* Õ to O */
1212   { 0x00D6,  0x4F, 0x65 },  /* Ö to Oe */
1213   { 0x00D7,  0x78, 0x00 },  /* × to x */
1214   { 0x00D8,  0x4F, 0x00 },  /* Ø to O */
1215   { 0x00D9,  0x55, 0x00 },  /* Ù to U */
1216   { 0x00DA,  0x55, 0x00 },  /* Ú to U */
1217   { 0x00DB,  0x55, 0x00 },  /* Û to U */
1218   { 0x00DC,  0x55, 0x65 },  /* Ü to Ue */
1219   { 0x00DD,  0x59, 0x00 },  /* Ý to Y */
1220   { 0x00DE,  0x54, 0x68 },  /* Þ to Th */
1221   { 0x00DF,  0x73, 0x73 },  /* ß to ss */
1222   { 0x00E0,  0x61, 0x00 },  /* à to a */
1223   { 0x00E1,  0x61, 0x00 },  /* á to a */
1224   { 0x00E2,  0x61, 0x00 },  /* â to a */
1225   { 0x00E3,  0x61, 0x00 },  /* ã to a */
1226   { 0x00E4,  0x61, 0x65 },  /* ä to ae */
1227   { 0x00E5,  0x61, 0x61 },  /* å to aa */
1228   { 0x00E6,  0x61, 0x65 },  /* æ to ae */
1229   { 0x00E7,  0x63, 0x00 },  /* ç to c */
1230   { 0x00E8,  0x65, 0x00 },  /* è to e */
1231   { 0x00E9,  0x65, 0x00 },  /* é to e */
1232   { 0x00EA,  0x65, 0x00 },  /* ê to e */
1233   { 0x00EB,  0x65, 0x00 },  /* ë to e */
1234   { 0x00EC,  0x69, 0x00 },  /* ì to i */
1235   { 0x00ED,  0x69, 0x00 },  /* í to i */
1236   { 0x00EE,  0x69, 0x00 },  /* î to i */
1237   { 0x00EF,  0x69, 0x00 },  /* ï to i */
1238   { 0x00F0,  0x64, 0x00 },  /* ð to d */
1239   { 0x00F1,  0x6E, 0x00 },  /* ñ to n */
1240   { 0x00F2,  0x6F, 0x00 },  /* ò to o */
1241   { 0x00F3,  0x6F, 0x00 },  /* ó to o */
1242   { 0x00F4,  0x6F, 0x00 },  /* ô to o */
1243   { 0x00F5,  0x6F, 0x00 },  /* õ to o */
1244   { 0x00F6,  0x6F, 0x65 },  /* ö to oe */
1245   { 0x00F7,  0x3A, 0x00 },  /* ÷ to : */
1246   { 0x00F8,  0x6F, 0x00 },  /* ø to o */
1247   { 0x00F9,  0x75, 0x00 },  /* ù to u */
1248   { 0x00FA,  0x75, 0x00 },  /* ú to u */
1249   { 0x00FB,  0x75, 0x00 },  /* û to u */
1250   { 0x00FC,  0x75, 0x65 },  /* ü to ue */
1251   { 0x00FD,  0x79, 0x00 },  /* ý to y */
1252   { 0x00FE,  0x74, 0x68 },  /* þ to th */
1253   { 0x00FF,  0x79, 0x00 },  /* ÿ to y */
1254   { 0x0100,  0x41, 0x00 },  /* Ā to A */
1255   { 0x0101,  0x61, 0x00 },  /* ā to a */
1256   { 0x0102,  0x41, 0x00 },  /* Ă to A */
1257   { 0x0103,  0x61, 0x00 },  /* ă to a */
1258   { 0x0104,  0x41, 0x00 },  /* Ą to A */
1259   { 0x0105,  0x61, 0x00 },  /* ą to a */
1260   { 0x0106,  0x43, 0x00 },  /* Ć to C */
1261   { 0x0107,  0x63, 0x00 },  /* ć to c */
1262   { 0x0108,  0x43, 0x68 },  /* Ĉ to Ch */
1263   { 0x0109,  0x63, 0x68 },  /* ĉ to ch */
1264   { 0x010A,  0x43, 0x00 },  /* Ċ to C */
1265   { 0x010B,  0x63, 0x00 },  /* ċ to c */
1266   { 0x010C,  0x43, 0x00 },  /* Č to C */
1267   { 0x010D,  0x63, 0x00 },  /* č to c */
1268   { 0x010E,  0x44, 0x00 },  /* Ď to D */
1269   { 0x010F,  0x64, 0x00 },  /* ď to d */
1270   { 0x0110,  0x44, 0x00 },  /* Đ to D */
1271   { 0x0111,  0x64, 0x00 },  /* đ to d */
1272   { 0x0112,  0x45, 0x00 },  /* Ē to E */
1273   { 0x0113,  0x65, 0x00 },  /* ē to e */
1274   { 0x0114,  0x45, 0x00 },  /* Ĕ to E */
1275   { 0x0115,  0x65, 0x00 },  /* ĕ to e */
1276   { 0x0116,  0x45, 0x00 },  /* Ė to E */
1277   { 0x0117,  0x65, 0x00 },  /* ė to e */
1278   { 0x0118,  0x45, 0x00 },  /* Ę to E */
1279   { 0x0119,  0x65, 0x00 },  /* ę to e */
1280   { 0x011A,  0x45, 0x00 },  /* Ě to E */
1281   { 0x011B,  0x65, 0x00 },  /* ě to e */
1282   { 0x011C,  0x47, 0x68 },  /* Ĝ to Gh */
1283   { 0x011D,  0x67, 0x68 },  /* ĝ to gh */
1284   { 0x011E,  0x47, 0x00 },  /* Ğ to G */
1285   { 0x011F,  0x67, 0x00 },  /* ğ to g */
1286   { 0x0120,  0x47, 0x00 },  /* Ġ to G */
1287   { 0x0121,  0x67, 0x00 },  /* ġ to g */
1288   { 0x0122,  0x47, 0x00 },  /* Ģ to G */
1289   { 0x0123,  0x67, 0x00 },  /* ģ to g */
1290   { 0x0124,  0x48, 0x68 },  /* Ĥ to Hh */
1291   { 0x0125,  0x68, 0x68 },  /* ĥ to hh */
1292   { 0x0126,  0x48, 0x00 },  /* Ħ to H */
1293   { 0x0127,  0x68, 0x00 },  /* ħ to h */
1294   { 0x0128,  0x49, 0x00 },  /* Ĩ to I */
1295   { 0x0129,  0x69, 0x00 },  /* ĩ to i */
1296   { 0x012A,  0x49, 0x00 },  /* Ī to I */
1297   { 0x012B,  0x69, 0x00 },  /* ī to i */
1298   { 0x012C,  0x49, 0x00 },  /* Ĭ to I */
1299   { 0x012D,  0x69, 0x00 },  /* ĭ to i */
1300   { 0x012E,  0x49, 0x00 },  /* Į to I */
1301   { 0x012F,  0x69, 0x00 },  /* į to i */
1302   { 0x0130,  0x49, 0x00 },  /* İ to I */
1303   { 0x0131,  0x69, 0x00 },  /* ı to i */
1304   { 0x0132,  0x49, 0x4A },  /* IJ to IJ */
1305   { 0x0133,  0x69, 0x6A },  /* ij to ij */
1306   { 0x0134,  0x4A, 0x68 },  /* Ĵ to Jh */
1307   { 0x0135,  0x6A, 0x68 },  /* ĵ to jh */
1308   { 0x0136,  0x4B, 0x00 },  /* Ķ to K */
1309   { 0x0137,  0x6B, 0x00 },  /* ķ to k */
1310   { 0x0138,  0x6B, 0x00 },  /* ĸ to k */
1311   { 0x0139,  0x4C, 0x00 },  /* Ĺ to L */
1312   { 0x013A,  0x6C, 0x00 },  /* ĺ to l */
1313   { 0x013B,  0x4C, 0x00 },  /* Ļ to L */
1314   { 0x013C,  0x6C, 0x00 },  /* ļ to l */
1315   { 0x013D,  0x4C, 0x00 },  /* Ľ to L */
1316   { 0x013E,  0x6C, 0x00 },  /* ľ to l */
1317   { 0x013F,  0x4C, 0x2E },  /* Ŀ to L. */
1318   { 0x0140,  0x6C, 0x2E },  /* ŀ to l. */
1319   { 0x0141,  0x4C, 0x00 },  /* Ł to L */
1320   { 0x0142,  0x6C, 0x00 },  /* ł to l */
1321   { 0x0143,  0x4E, 0x00 },  /* Ń to N */
1322   { 0x0144,  0x6E, 0x00 },  /* ń to n */
1323   { 0x0145,  0x4E, 0x00 },  /* Ņ to N */
1324   { 0x0146,  0x6E, 0x00 },  /* ņ to n */
1325   { 0x0147,  0x4E, 0x00 },  /* Ň to N */
1326   { 0x0148,  0x6E, 0x00 },  /* ň to n */
1327   { 0x0149,  0x27, 0x6E },  /* ʼn to 'n */
1328   { 0x014A,  0x4E, 0x47 },  /* Ŋ to NG */
1329   { 0x014B,  0x6E, 0x67 },  /* ŋ to ng */
1330   { 0x014C,  0x4F, 0x00 },  /* Ō to O */
1331   { 0x014D,  0x6F, 0x00 },  /* ō to o */
1332   { 0x014E,  0x4F, 0x00 },  /* Ŏ to O */
1333   { 0x014F,  0x6F, 0x00 },  /* ŏ to o */
1334   { 0x0150,  0x4F, 0x00 },  /* Ő to O */
1335   { 0x0151,  0x6F, 0x00 },  /* ő to o */
1336   { 0x0152,  0x4F, 0x45 },  /* Œ to OE */
1337   { 0x0153,  0x6F, 0x65 },  /* œ to oe */
1338   { 0x0154,  0x52, 0x00 },  /* Ŕ to R */
1339   { 0x0155,  0x72, 0x00 },  /* ŕ to r */
1340   { 0x0156,  0x52, 0x00 },  /* Ŗ to R */
1341   { 0x0157,  0x72, 0x00 },  /* ŗ to r */
1342   { 0x0158,  0x52, 0x00 },  /* Ř to R */
1343   { 0x0159,  0x72, 0x00 },  /* ř to r */
1344   { 0x015A,  0x53, 0x00 },  /* Ś to S */
1345   { 0x015B,  0x73, 0x00 },  /* ś to s */
1346   { 0x015C,  0x53, 0x68 },  /* Ŝ to Sh */
1347   { 0x015D,  0x73, 0x68 },  /* ŝ to sh */
1348   { 0x015E,  0x53, 0x00 },  /* Ş to S */
1349   { 0x015F,  0x73, 0x00 },  /* ş to s */
1350   { 0x0160,  0x53, 0x00 },  /* Š to S */
1351   { 0x0161,  0x73, 0x00 },  /* š to s */
1352   { 0x0162,  0x54, 0x00 },  /* Ţ to T */
1353   { 0x0163,  0x74, 0x00 },  /* ţ to t */
1354   { 0x0164,  0x54, 0x00 },  /* Ť to T */
1355   { 0x0165,  0x74, 0x00 },  /* ť to t */
1356   { 0x0166,  0x54, 0x00 },  /* Ŧ to T */
1357   { 0x0167,  0x74, 0x00 },  /* ŧ to t */
1358   { 0x0168,  0x55, 0x00 },  /* Ũ to U */
1359   { 0x0169,  0x75, 0x00 },  /* ũ to u */
1360   { 0x016A,  0x55, 0x00 },  /* Ū to U */
1361   { 0x016B,  0x75, 0x00 },  /* ū to u */
1362   { 0x016C,  0x55, 0x00 },  /* Ŭ to U */
1363   { 0x016D,  0x75, 0x00 },  /* ŭ to u */
1364   { 0x016E,  0x55, 0x00 },  /* Ů to U */
1365   { 0x016F,  0x75, 0x00 },  /* ů to u */
1366   { 0x0170,  0x55, 0x00 },  /* Ű to U */
1367   { 0x0171,  0x75, 0x00 },  /* ű to u */
1368   { 0x0172,  0x55, 0x00 },  /* Ų to U */
1369   { 0x0173,  0x75, 0x00 },  /* ų to u */
1370   { 0x0174,  0x57, 0x00 },  /* Ŵ to W */
1371   { 0x0175,  0x77, 0x00 },  /* ŵ to w */
1372   { 0x0176,  0x59, 0x00 },  /* Ŷ to Y */
1373   { 0x0177,  0x79, 0x00 },  /* ŷ to y */
1374   { 0x0178,  0x59, 0x00 },  /* Ÿ to Y */
1375   { 0x0179,  0x5A, 0x00 },  /* Ź to Z */
1376   { 0x017A,  0x7A, 0x00 },  /* ź to z */
1377   { 0x017B,  0x5A, 0x00 },  /* Ż to Z */
1378   { 0x017C,  0x7A, 0x00 },  /* ż to z */
1379   { 0x017D,  0x5A, 0x00 },  /* Ž to Z */
1380   { 0x017E,  0x7A, 0x00 },  /* ž to z */
1381   { 0x017F,  0x73, 0x00 },  /* ſ to s */
1382   { 0x0192,  0x66, 0x00 },  /* ƒ to f */
1383   { 0x0218,  0x53, 0x00 },  /* Ș to S */
1384   { 0x0219,  0x73, 0x00 },  /* ș to s */
1385   { 0x021A,  0x54, 0x00 },  /* Ț to T */
1386   { 0x021B,  0x74, 0x00 },  /* ț to t */
1387   { 0x0386,  0x41, 0x00 },  /* Ά to A */
1388   { 0x0388,  0x45, 0x00 },  /* Έ to E */
1389   { 0x0389,  0x49, 0x00 },  /* Ή to I */
1390   { 0x038A,  0x49, 0x00 },  /* Ί to I */
1391   { 0x038C,  0x4f, 0x00 },  /* Ό to O */
1392   { 0x038E,  0x59, 0x00 },  /* Ύ to Y */
1393   { 0x038F,  0x4f, 0x00 },  /* Ώ to O */
1394   { 0x0390,  0x69, 0x00 },  /* ΐ to i */
1395   { 0x0391,  0x41, 0x00 },  /* Α to A */
1396   { 0x0392,  0x42, 0x00 },  /* Β to B */
1397   { 0x0393,  0x47, 0x00 },  /* Γ to G */
1398   { 0x0394,  0x44, 0x00 },  /* Δ to D */
1399   { 0x0395,  0x45, 0x00 },  /* Ε to E */
1400   { 0x0396,  0x5a, 0x00 },  /* Ζ to Z */
1401   { 0x0397,  0x49, 0x00 },  /* Η to I */
1402   { 0x0398,  0x54, 0x68 },  /* Θ to Th */
1403   { 0x0399,  0x49, 0x00 },  /* Ι to I */
1404   { 0x039A,  0x4b, 0x00 },  /* Κ to K */
1405   { 0x039B,  0x4c, 0x00 },  /* Λ to L */
1406   { 0x039C,  0x4d, 0x00 },  /* Μ to M */
1407   { 0x039D,  0x4e, 0x00 },  /* Ν to N */
1408   { 0x039E,  0x58, 0x00 },  /* Ξ to X */
1409   { 0x039F,  0x4f, 0x00 },  /* Ο to O */
1410   { 0x03A0,  0x50, 0x00 },  /* Π to P */
1411   { 0x03A1,  0x52, 0x00 },  /* Ρ to R */
1412   { 0x03A3,  0x53, 0x00 },  /* Σ to S */
1413   { 0x03A4,  0x54, 0x00 },  /* Τ to T */
1414   { 0x03A5,  0x59, 0x00 },  /* Υ to Y */
1415   { 0x03A6,  0x46, 0x00 },  /* Φ to F */
1416   { 0x03A7,  0x43, 0x68 },  /* Χ to Ch */
1417   { 0x03A8,  0x50, 0x73 },  /* Ψ to Ps */
1418   { 0x03A9,  0x4f, 0x00 },  /* Ω to O */
1419   { 0x03AA,  0x49, 0x00 },  /* Ϊ to I */
1420   { 0x03AB,  0x59, 0x00 },  /* Ϋ to Y */
1421   { 0x03AC,  0x61, 0x00 },  /* ά to a */
1422   { 0x03AD,  0x65, 0x00 },  /* έ to e */
1423   { 0x03AE,  0x69, 0x00 },  /* ή to i */
1424   { 0x03AF,  0x69, 0x00 },  /* ί to i */
1425   { 0x03B1,  0x61, 0x00 },  /* α to a */
1426   { 0x03B2,  0x62, 0x00 },  /* β to b */
1427   { 0x03B3,  0x67, 0x00 },  /* γ to g */
1428   { 0x03B4,  0x64, 0x00 },  /* δ to d */
1429   { 0x03B5,  0x65, 0x00 },  /* ε to e */
1430   { 0x03B6,  0x7a, 0x00 },  /* ζ to z */
1431   { 0x03B7,  0x69, 0x00 },  /* η to i */
1432   { 0x03B8,  0x74, 0x68 },  /* θ to th */
1433   { 0x03B9,  0x69, 0x00 },  /* ι to i */
1434   { 0x03BA,  0x6b, 0x00 },  /* κ to k */
1435   { 0x03BB,  0x6c, 0x00 },  /* λ to l */
1436   { 0x03BC,  0x6d, 0x00 },  /* μ to m */
1437   { 0x03BD,  0x6e, 0x00 },  /* ν to n */
1438   { 0x03BE,  0x78, 0x00 },  /* ξ to x */
1439   { 0x03BF,  0x6f, 0x00 },  /* ο to o */
1440   { 0x03C0,  0x70, 0x00 },  /* π to p */
1441   { 0x03C1,  0x72, 0x00 },  /* ρ to r */
1442   { 0x03C3,  0x73, 0x00 },  /* σ to s */
1443   { 0x03C4,  0x74, 0x00 },  /* τ to t */
1444   { 0x03C5,  0x79, 0x00 },  /* υ to y */
1445   { 0x03C6,  0x66, 0x00 },  /* φ to f */
1446   { 0x03C7,  0x63, 0x68 },  /* χ to ch */
1447   { 0x03C8,  0x70, 0x73 },  /* ψ to ps */
1448   { 0x03C9,  0x6f, 0x00 },  /* ω to o */
1449   { 0x03CA,  0x69, 0x00 },  /* ϊ to i */
1450   { 0x03CB,  0x79, 0x00 },  /* ϋ to y */
1451   { 0x03CC,  0x6f, 0x00 },  /* ό to o */
1452   { 0x03CD,  0x79, 0x00 },  /* ύ to y */
1453   { 0x03CE,  0x69, 0x00 },  /* ώ to i */
1454   { 0x0400,  0x45, 0x00 },  /* Ѐ to E */
1455   { 0x0401,  0x45, 0x00 },  /* Ё to E */
1456   { 0x0402,  0x44, 0x00 },  /* Ђ to D */
1457   { 0x0403,  0x47, 0x00 },  /* Ѓ to G */
1458   { 0x0404,  0x45, 0x00 },  /* Є to E */
1459   { 0x0405,  0x5a, 0x00 },  /* Ѕ to Z */
1460   { 0x0406,  0x49, 0x00 },  /* І to I */
1461   { 0x0407,  0x49, 0x00 },  /* Ї to I */
1462   { 0x0408,  0x4a, 0x00 },  /* Ј to J */
1463   { 0x0409,  0x49, 0x00 },  /* Љ to I */
1464   { 0x040A,  0x4e, 0x00 },  /* Њ to N */
1465   { 0x040B,  0x44, 0x00 },  /* Ћ to D */
1466   { 0x040C,  0x4b, 0x00 },  /* Ќ to K */
1467   { 0x040D,  0x49, 0x00 },  /* Ѝ to I */
1468   { 0x040E,  0x55, 0x00 },  /* Ў to U */
1469   { 0x040F,  0x44, 0x00 },  /* Џ to D */
1470   { 0x0410,  0x41, 0x00 },  /* А to A */
1471   { 0x0411,  0x42, 0x00 },  /* Б to B */
1472   { 0x0412,  0x56, 0x00 },  /* В to V */
1473   { 0x0413,  0x47, 0x00 },  /* Г to G */
1474   { 0x0414,  0x44, 0x00 },  /* Д to D */
1475   { 0x0415,  0x45, 0x00 },  /* Е to E */
1476   { 0x0416,  0x5a, 0x68 },  /* Ж to Zh */
1477   { 0x0417,  0x5a, 0x00 },  /* З to Z */
1478   { 0x0418,  0x49, 0x00 },  /* И to I */
1479   { 0x0419,  0x49, 0x00 },  /* Й to I */
1480   { 0x041A,  0x4b, 0x00 },  /* К to K */
1481   { 0x041B,  0x4c, 0x00 },  /* Л to L */
1482   { 0x041C,  0x4d, 0x00 },  /* М to M */
1483   { 0x041D,  0x4e, 0x00 },  /* Н to N */
1484   { 0x041E,  0x4f, 0x00 },  /* О to O */
1485   { 0x041F,  0x50, 0x00 },  /* П to P */
1486   { 0x0420,  0x52, 0x00 },  /* Р to R */
1487   { 0x0421,  0x53, 0x00 },  /* С to S */
1488   { 0x0422,  0x54, 0x00 },  /* Т to T */
1489   { 0x0423,  0x55, 0x00 },  /* У to U */
1490   { 0x0424,  0x46, 0x00 },  /* Ф to F */
1491   { 0x0425,  0x4b, 0x68 },  /* Х to Kh */
1492   { 0x0426,  0x54, 0x63 },  /* Ц to Tc */
1493   { 0x0427,  0x43, 0x68 },  /* Ч to Ch */
1494   { 0x0428,  0x53, 0x68 },  /* Ш to Sh */
1495   { 0x0429,  0x53, 0x68 },  /* Щ to Shch */
1496   { 0x042A,  0x61, 0x00 },  /*  to A */
1497   { 0x042B,  0x59, 0x00 },  /* Ы to Y */
1498   { 0x042C,  0x59, 0x00 },  /*  to Y */
1499   { 0x042D,  0x45, 0x00 },  /* Э to E */
1500   { 0x042E,  0x49, 0x75 },  /* Ю to Iu */
1501   { 0x042F,  0x49, 0x61 },  /* Я to Ia */
1502   { 0x0430,  0x61, 0x00 },  /* а to a */
1503   { 0x0431,  0x62, 0x00 },  /* б to b */
1504   { 0x0432,  0x76, 0x00 },  /* в to v */
1505   { 0x0433,  0x67, 0x00 },  /* г to g */
1506   { 0x0434,  0x64, 0x00 },  /* д to d */
1507   { 0x0435,  0x65, 0x00 },  /* е to e */
1508   { 0x0436,  0x7a, 0x68 },  /* ж to zh */
1509   { 0x0437,  0x7a, 0x00 },  /* з to z */
1510   { 0x0438,  0x69, 0x00 },  /* и to i */
1511   { 0x0439,  0x69, 0x00 },  /* й to i */
1512   { 0x043A,  0x6b, 0x00 },  /* к to k */
1513   { 0x043B,  0x6c, 0x00 },  /* л to l */
1514   { 0x043C,  0x6d, 0x00 },  /* м to m */
1515   { 0x043D,  0x6e, 0x00 },  /* н to n */
1516   { 0x043E,  0x6f, 0x00 },  /* о to o */
1517   { 0x043F,  0x70, 0x00 },  /* п to p */
1518   { 0x0440,  0x72, 0x00 },  /* р to r */
1519   { 0x0441,  0x73, 0x00 },  /* с to s */
1520   { 0x0442,  0x74, 0x00 },  /* т to t */
1521   { 0x0443,  0x75, 0x00 },  /* у to u */
1522   { 0x0444,  0x66, 0x00 },  /* ф to f */
1523   { 0x0445,  0x6b, 0x68 },  /* х to kh */
1524   { 0x0446,  0x74, 0x63 },  /* ц to tc */
1525   { 0x0447,  0x63, 0x68 },  /* ч to ch */
1526   { 0x0448,  0x73, 0x68 },  /* ш to sh */
1527   { 0x0449,  0x73, 0x68 },  /* щ to shch */
1528   { 0x044A,  0x61, 0x00 },  /*  to a */
1529   { 0x044B,  0x79, 0x00 },  /* ы to y */
1530   { 0x044C,  0x79, 0x00 },  /*  to y */
1531   { 0x044D,  0x65, 0x00 },  /* э to e */
1532   { 0x044E,  0x69, 0x75 },  /* ю to iu */
1533   { 0x044F,  0x69, 0x61 },  /* я to ia */
1534   { 0x0450,  0x65, 0x00 },  /* ѐ to e */
1535   { 0x0451,  0x65, 0x00 },  /* ё to e */
1536   { 0x0452,  0x64, 0x00 },  /* ђ to d */
1537   { 0x0453,  0x67, 0x00 },  /* ѓ to g */
1538   { 0x0454,  0x65, 0x00 },  /* є to e */
1539   { 0x0455,  0x7a, 0x00 },  /* ѕ to z */
1540   { 0x0456,  0x69, 0x00 },  /* і to i */
1541   { 0x0457,  0x69, 0x00 },  /* ї to i */
1542   { 0x0458,  0x6a, 0x00 },  /* ј to j */
1543   { 0x0459,  0x69, 0x00 },  /* љ to i */
1544   { 0x045A,  0x6e, 0x00 },  /* њ to n */
1545   { 0x045B,  0x64, 0x00 },  /* ћ to d */
1546   { 0x045C,  0x6b, 0x00 },  /* ќ to k */
1547   { 0x045D,  0x69, 0x00 },  /* ѝ to i */
1548   { 0x045E,  0x75, 0x00 },  /* ў to u */
1549   { 0x045F,  0x64, 0x00 },  /* џ to d */
1550   { 0x1E02,  0x42, 0x00 },  /* Ḃ to B */
1551   { 0x1E03,  0x62, 0x00 },  /* ḃ to b */
1552   { 0x1E0A,  0x44, 0x00 },  /* Ḋ to D */
1553   { 0x1E0B,  0x64, 0x00 },  /* ḋ to d */
1554   { 0x1E1E,  0x46, 0x00 },  /* Ḟ to F */
1555   { 0x1E1F,  0x66, 0x00 },  /* ḟ to f */
1556   { 0x1E40,  0x4D, 0x00 },  /* Ṁ to M */
1557   { 0x1E41,  0x6D, 0x00 },  /* ṁ to m */
1558   { 0x1E56,  0x50, 0x00 },  /* Ṗ to P */
1559   { 0x1E57,  0x70, 0x00 },  /* ṗ to p */
1560   { 0x1E60,  0x53, 0x00 },  /* Ṡ to S */
1561   { 0x1E61,  0x73, 0x00 },  /* ṡ to s */
1562   { 0x1E6A,  0x54, 0x00 },  /* Ṫ to T */
1563   { 0x1E6B,  0x74, 0x00 },  /* ṫ to t */
1564   { 0x1E80,  0x57, 0x00 },  /* Ẁ to W */
1565   { 0x1E81,  0x77, 0x00 },  /* ẁ to w */
1566   { 0x1E82,  0x57, 0x00 },  /* Ẃ to W */
1567   { 0x1E83,  0x77, 0x00 },  /* ẃ to w */
1568   { 0x1E84,  0x57, 0x00 },  /* Ẅ to W */
1569   { 0x1E85,  0x77, 0x00 },  /* ẅ to w */
1570   { 0x1EF2,  0x59, 0x00 },  /* Ỳ to Y */
1571   { 0x1EF3,  0x79, 0x00 },  /* ỳ to y */
1572   { 0xFB00,  0x66, 0x66 },  /* ff to ff */
1573   { 0xFB01,  0x66, 0x69 },  /* fi to fi */
1574   { 0xFB02,  0x66, 0x6C },  /* fl to fl */
1575   { 0xFB05,  0x73, 0x74 },  /* ſt to st */
1576   { 0xFB06,  0x73, 0x74 },  /* st to st */
1577 };
1578 
1579 /*
1580 ** Convert the input string from UTF-8 into pure ASCII by converting
1581 ** all non-ASCII characters to some combination of characters in the
1582 ** ASCII subset.
1583 **
1584 ** The returned string might contain more characters than the input.
1585 **
1586 ** Space to hold the returned string comes from sqlite3_malloc() and
1587 ** should be freed by the caller.
1588 */
1589 static unsigned char *transliterate(const unsigned char *zIn, int nIn){
1590   unsigned char *zOut = sqlite3_malloc( nIn*4 + 1 );
1591   int c, sz, nOut;
1592   if( zOut==0 ) return 0;
1593   nOut = 0;
1594   while( nIn>0 ){
1595     c = utf8Read(zIn, nIn, &sz);
1596     zIn += sz;
1597     nIn -= sz;
1598     if( c<=127 ){
1599       zOut[nOut++] = c;
1600     }else{
1601       int xTop, xBtm, x;
1602       xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1603       xBtm = 0;
1604       while( xTop>=xBtm ){
1605         x = (xTop + xBtm)/2;
1606         if( translit[x].cFrom==c ){
1607           zOut[nOut++] = translit[x].cTo0;
1608           if( translit[x].cTo1 ){
1609             zOut[nOut++] = translit[x].cTo1;
1610             /* Add an extra "ch" after the "sh" for Щ and щ */
1611             if( c==0x0429 || c== 0x0449 ){
1612               zOut[nOut++] = 'c';
1613               zOut[nOut++] = 'h';
1614             }
1615           }
1616           c = 0;
1617           break;
1618         }else if( translit[x].cFrom>c ){
1619           xTop = x-1;
1620         }else{
1621           xBtm = x+1;
1622         }
1623       }
1624       if( c ) zOut[nOut++] = '?';
1625     }
1626   }
1627   zOut[nOut] = 0;
1628   return zOut;
1629 }
1630 
1631 /*
1632 ** Return the number of characters in the shortest prefix of the input
1633 ** string that transliterates to an ASCII string nTrans bytes or longer.
1634 ** Or, if the transliteration of the input string is less than nTrans
1635 ** bytes in size, return the number of characters in the input string.
1636 */
1637 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
1638   int i, c, sz, nOut;
1639   int nChar;
1640 
1641   i = nOut = 0;
1642   for(nChar=0; i<nIn && nOut<nTrans; nChar++){
1643     c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1644     i += sz;
1645 
1646     nOut++;
1647     if( c>=128 ){
1648       int xTop, xBtm, x;
1649       xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1650       xBtm = 0;
1651       while( xTop>=xBtm ){
1652         x = (xTop + xBtm)/2;
1653         if( translit[x].cFrom==c ){
1654           if( translit[x].cTo1 ) nOut++;
1655           if( c==0x0429 || c== 0x0449 ) nOut += 2;
1656           break;
1657         }else if( translit[x].cFrom>c ){
1658           xTop = x-1;
1659         }else{
1660           xBtm = x+1;
1661         }
1662       }
1663     }
1664   }
1665 
1666   return nChar;
1667 }
1668 
1669 
1670 /*
1671 **    spellfix1_translit(X)
1672 **
1673 ** Convert a string that contains non-ASCII Roman characters into
1674 ** pure ASCII.
1675 */
1676 static void transliterateSqlFunc(
1677   sqlite3_context *context,
1678   int argc,
1679   sqlite3_value **argv
1680 ){
1681   const unsigned char *zIn = sqlite3_value_text(argv[0]);
1682   int nIn = sqlite3_value_bytes(argv[0]);
1683   unsigned char *zOut = transliterate(zIn, nIn);
1684   if( zOut==0 ){
1685     sqlite3_result_error_nomem(context);
1686   }else{
1687     sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1688   }
1689 }
1690 
1691 /*
1692 **    spellfix1_scriptcode(X)
1693 **
1694 ** Try to determine the dominant script used by the word X and return
1695 ** its ISO 15924 numeric code.
1696 **
1697 ** The current implementation only understands the following scripts:
1698 **
1699 **    215  (Latin)
1700 **    220  (Cyrillic)
1701 **    200  (Greek)
1702 **
1703 ** This routine will return 998 if the input X contains characters from
1704 ** two or more of the above scripts or 999 if X contains no characters
1705 ** from any of the above scripts.
1706 */
1707 static void scriptCodeSqlFunc(
1708   sqlite3_context *context,
1709   int argc,
1710   sqlite3_value **argv
1711 ){
1712   const unsigned char *zIn = sqlite3_value_text(argv[0]);
1713   int nIn = sqlite3_value_bytes(argv[0]);
1714   int c, sz;
1715   int scriptMask = 0;
1716   int res;
1717 # define SCRIPT_LATIN       0x0001
1718 # define SCRIPT_CYRILLIC    0x0002
1719 # define SCRIPT_GREEK       0x0004
1720 
1721   while( nIn>0 ){
1722     c = utf8Read(zIn, nIn, &sz);
1723     zIn += sz;
1724     nIn -= sz;
1725     if( c<0x02af ){
1726       scriptMask |= SCRIPT_LATIN;
1727     }else if( c>=0x0400 && c<=0x04ff ){
1728       scriptMask |= SCRIPT_CYRILLIC;
1729     }else if( c>=0x0386 && c<=0x03ce ){
1730       scriptMask |= SCRIPT_GREEK;
1731     }
1732   }
1733   switch( scriptMask ){
1734     case 0:                res = 999; break;
1735     case SCRIPT_LATIN:     res = 215; break;
1736     case SCRIPT_CYRILLIC:  res = 220; break;
1737     case SCRIPT_GREEK:     res = 200; break;
1738     default:               res = 998; break;
1739   }
1740   sqlite3_result_int(context, res);
1741 }
1742 
1743 /* End transliterate
1744 ******************************************************************************
1745 ******************************************************************************
1746 ** Begin spellfix1 virtual table.
1747 */
1748 
1749 /* Maximum length of a phonehash used for querying the shadow table */
1750 #define SPELLFIX_MX_HASH  8
1751 
1752 /* Maximum number of hash strings to examine per query */
1753 #define SPELLFIX_MX_RUN   1
1754 
1755 typedef struct spellfix1_vtab spellfix1_vtab;
1756 typedef struct spellfix1_cursor spellfix1_cursor;
1757 
1758 /* Fuzzy-search virtual table object */
1759 struct spellfix1_vtab {
1760   sqlite3_vtab base;         /* Base class - must be first */
1761   sqlite3 *db;               /* Database connection */
1762   char *zDbName;             /* Name of database holding this table */
1763   char *zTableName;          /* Name of the virtual table */
1764   char *zCostTable;          /* Table holding edit-distance cost numbers */
1765   EditDist3Config *pConfig3; /* Parsed edit distance costs */
1766 };
1767 
1768 /* Fuzzy-search cursor object */
1769 struct spellfix1_cursor {
1770   sqlite3_vtab_cursor base;    /* Base class - must be first */
1771   spellfix1_vtab *pVTab;       /* The table to which this cursor belongs */
1772   char *zPattern;              /* rhs of MATCH clause */
1773   int nRow;                    /* Number of rows of content */
1774   int nAlloc;                  /* Number of allocated rows */
1775   int iRow;                    /* Current row of content */
1776   int iLang;                   /* Value of the langid= constraint */
1777   int iTop;                    /* Value of the top= constraint */
1778   int iScope;                  /* Value of the scope= constraint */
1779   int nSearch;                 /* Number of vocabulary items checked */
1780   sqlite3_stmt *pFullScan;     /* Shadow query for a full table scan */
1781   struct spellfix1_row {       /* For each row of content */
1782     sqlite3_int64 iRowid;         /* Rowid for this row */
1783     char *zWord;                  /* Text for this row */
1784     int iRank;                    /* Rank for this row */
1785     int iDistance;                /* Distance from pattern for this row */
1786     int iScore;                   /* Score for sorting */
1787     int iMatchlen;                /* Value of matchlen column (or -1) */
1788     char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
1789   } *a;
1790 };
1791 
1792 /*
1793 ** Construct one or more SQL statements from the format string given
1794 ** and then evaluate those statements. The success code is written
1795 ** into *pRc.
1796 **
1797 ** If *pRc is initially non-zero then this routine is a no-op.
1798 */
1799 static void spellfix1DbExec(
1800   int *pRc,              /* Success code */
1801   sqlite3 *db,           /* Database in which to run SQL */
1802   const char *zFormat,   /* Format string for SQL */
1803   ...                    /* Arguments to the format string */
1804 ){
1805   va_list ap;
1806   char *zSql;
1807   if( *pRc ) return;
1808   va_start(ap, zFormat);
1809   zSql = sqlite3_vmprintf(zFormat, ap);
1810   va_end(ap);
1811   if( zSql==0 ){
1812     *pRc = SQLITE_NOMEM;
1813   }else{
1814     *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
1815     sqlite3_free(zSql);
1816   }
1817 }
1818 
1819 /*
1820 ** xDisconnect/xDestroy method for the fuzzy-search module.
1821 */
1822 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){
1823   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1824   int rc = SQLITE_OK;
1825   if( isDestroy ){
1826     sqlite3 *db = p->db;
1827     spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1828                   p->zDbName, p->zTableName);
1829   }
1830   if( rc==SQLITE_OK ){
1831     sqlite3_free(p->zTableName);
1832     editDist3ConfigDelete(p->pConfig3);
1833     sqlite3_free(p->zCostTable);
1834     sqlite3_free(p);
1835   }
1836   return rc;
1837 }
1838 static int spellfix1Disconnect(sqlite3_vtab *pVTab){
1839   return spellfix1Uninit(0, pVTab);
1840 }
1841 static int spellfix1Destroy(sqlite3_vtab *pVTab){
1842   return spellfix1Uninit(1, pVTab);
1843 }
1844 
1845 /*
1846 ** Make a copy of a string.  Remove leading and trailing whitespace
1847 ** and dequote it.
1848 */
1849 static char *spellfix1Dequote(const char *zIn){
1850   char *zOut;
1851   int i, j;
1852   char c;
1853   while( isspace(zIn[0]) ) zIn++;
1854   zOut = sqlite3_mprintf("%s", zIn);
1855   if( zOut==0 ) return 0;
1856   i = (int)strlen(zOut);
1857 #if 0  /* The parser will never leave spaces at the end */
1858   while( i>0 && isspace(zOut[i-1]) ){ i--; }
1859 #endif
1860   zOut[i] = 0;
1861   c = zOut[0];
1862   if( c=='\'' || c=='"' ){
1863     for(i=1, j=0; ALWAYS(zOut[i]); i++){
1864       zOut[j++] = zOut[i];
1865       if( zOut[i]==c ){
1866         if( zOut[i+1]==c ){
1867           i++;
1868         }else{
1869           zOut[j-1] = 0;
1870           break;
1871         }
1872       }
1873     }
1874   }
1875   return zOut;
1876 }
1877 
1878 
1879 /*
1880 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
1881 **
1882 **   argv[0]   -> module name  ("spellfix1")
1883 **   argv[1]   -> database name
1884 **   argv[2]   -> table name
1885 **   argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
1886 */
1887 static int spellfix1Init(
1888   int isCreate,
1889   sqlite3 *db,
1890   void *pAux,
1891   int argc, const char *const*argv,
1892   sqlite3_vtab **ppVTab,
1893   char **pzErr
1894 ){
1895   spellfix1_vtab *pNew = 0;
1896   /* const char *zModule = argv[0]; // not used */
1897   const char *zDbName = argv[1];
1898   const char *zTableName = argv[2];
1899   int nDbName;
1900   int rc = SQLITE_OK;
1901   int i;
1902 
1903   nDbName = (int)strlen(zDbName);
1904   pNew = sqlite3_malloc( sizeof(*pNew) + nDbName + 1);
1905   if( pNew==0 ){
1906     rc = SQLITE_NOMEM;
1907   }else{
1908     memset(pNew, 0, sizeof(*pNew));
1909     pNew->zDbName = (char*)&pNew[1];
1910     memcpy(pNew->zDbName, zDbName, nDbName+1);
1911     pNew->zTableName = sqlite3_mprintf("%s", zTableName);
1912     pNew->db = db;
1913     if( pNew->zTableName==0 ){
1914       rc = SQLITE_NOMEM;
1915     }else{
1916       rc = sqlite3_declare_vtab(db,
1917            "CREATE TABLE x(word,rank,distance,langid, "
1918            "score, matchlen, phonehash HIDDEN, "
1919            "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
1920            "soundslike HIDDEN, command HIDDEN)"
1921       );
1922 #define SPELLFIX_COL_WORD            0
1923 #define SPELLFIX_COL_RANK            1
1924 #define SPELLFIX_COL_DISTANCE        2
1925 #define SPELLFIX_COL_LANGID          3
1926 #define SPELLFIX_COL_SCORE           4
1927 #define SPELLFIX_COL_MATCHLEN        5
1928 #define SPELLFIX_COL_PHONEHASH       6
1929 #define SPELLFIX_COL_TOP             7
1930 #define SPELLFIX_COL_SCOPE           8
1931 #define SPELLFIX_COL_SRCHCNT         9
1932 #define SPELLFIX_COL_SOUNDSLIKE     10
1933 #define SPELLFIX_COL_COMMAND        11
1934     }
1935     if( rc==SQLITE_OK && isCreate ){
1936       spellfix1DbExec(&rc, db,
1937          "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
1938          "  id INTEGER PRIMARY KEY,\n"
1939          "  rank INT,\n"
1940          "  langid INT,\n"
1941          "  word TEXT,\n"
1942          "  k1 TEXT,\n"
1943          "  k2 TEXT\n"
1944          ");\n",
1945          zDbName, zTableName
1946       );
1947       spellfix1DbExec(&rc, db,
1948          "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" "
1949             "ON \"%w_vocab\"(langid,k2);",
1950          zDbName, zTableName, zTableName
1951       );
1952     }
1953     for(i=3; rc==SQLITE_OK && i<argc; i++){
1954       if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
1955         pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
1956         if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
1957         continue;
1958       }
1959       *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
1960       rc = SQLITE_ERROR;
1961     }
1962   }
1963 
1964   if( rc && pNew ){
1965     *ppVTab = 0;
1966     spellfix1Uninit(0, &pNew->base);
1967   }else{
1968     *ppVTab = (sqlite3_vtab *)pNew;
1969   }
1970   return rc;
1971 }
1972 
1973 /*
1974 ** The xConnect and xCreate methods
1975 */
1976 static int spellfix1Connect(
1977   sqlite3 *db,
1978   void *pAux,
1979   int argc, const char *const*argv,
1980   sqlite3_vtab **ppVTab,
1981   char **pzErr
1982 ){
1983   return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
1984 }
1985 static int spellfix1Create(
1986   sqlite3 *db,
1987   void *pAux,
1988   int argc, const char *const*argv,
1989   sqlite3_vtab **ppVTab,
1990   char **pzErr
1991 ){
1992   return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
1993 }
1994 
1995 /*
1996 ** Clear all of the content from a cursor.
1997 */
1998 static void spellfix1ResetCursor(spellfix1_cursor *pCur){
1999   int i;
2000   for(i=0; i<pCur->nRow; i++){
2001     sqlite3_free(pCur->a[i].zWord);
2002   }
2003   pCur->nRow = 0;
2004   pCur->iRow = 0;
2005   pCur->nSearch = 0;
2006   if( pCur->pFullScan ){
2007     sqlite3_finalize(pCur->pFullScan);
2008     pCur->pFullScan = 0;
2009   }
2010 }
2011 
2012 /*
2013 ** Resize the cursor to hold up to N rows of content
2014 */
2015 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){
2016   struct spellfix1_row *aNew;
2017   assert( N>=pCur->nRow );
2018   aNew = sqlite3_realloc(pCur->a, sizeof(pCur->a[0])*N);
2019   if( aNew==0 && N>0 ){
2020     spellfix1ResetCursor(pCur);
2021     sqlite3_free(pCur->a);
2022     pCur->nAlloc = 0;
2023     pCur->a = 0;
2024   }else{
2025     pCur->nAlloc = N;
2026     pCur->a = aNew;
2027   }
2028 }
2029 
2030 
2031 /*
2032 ** Close a fuzzy-search cursor.
2033 */
2034 static int spellfix1Close(sqlite3_vtab_cursor *cur){
2035   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2036   spellfix1ResetCursor(pCur);
2037   spellfix1ResizeCursor(pCur, 0);
2038   sqlite3_free(pCur->zPattern);
2039   sqlite3_free(pCur);
2040   return SQLITE_OK;
2041 }
2042 
2043 /*
2044 ** Search for terms of these forms:
2045 **
2046 **   (A)    word MATCH $str
2047 **   (B)    langid == $langid
2048 **   (C)    top = $top
2049 **   (D)    scope = $scope
2050 **   (E)    distance < $distance
2051 **   (F)    distance <= $distance
2052 **   (G)    rowid = $rowid
2053 **
2054 ** The plan number is a bit mask formed with these bits:
2055 **
2056 **   0x01   (A) is found
2057 **   0x02   (B) is found
2058 **   0x04   (C) is found
2059 **   0x08   (D) is found
2060 **   0x10   (E) is found
2061 **   0x20   (F) is found
2062 **   0x40   (G) is found
2063 **
2064 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2065 ** if specified and in that order.
2066 */
2067 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
2068   int iPlan = 0;
2069   int iLangTerm = -1;
2070   int iTopTerm = -1;
2071   int iScopeTerm = -1;
2072   int iDistTerm = -1;
2073   int iRowidTerm = -1;
2074   int i;
2075   const struct sqlite3_index_constraint *pConstraint;
2076   pConstraint = pIdxInfo->aConstraint;
2077   for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){
2078     if( pConstraint->usable==0 ) continue;
2079 
2080     /* Terms of the form:  word MATCH $str */
2081     if( (iPlan & 1)==0
2082      && pConstraint->iColumn==SPELLFIX_COL_WORD
2083      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
2084     ){
2085       iPlan |= 1;
2086       pIdxInfo->aConstraintUsage[i].argvIndex = 1;
2087       pIdxInfo->aConstraintUsage[i].omit = 1;
2088     }
2089 
2090     /* Terms of the form:  langid = $langid  */
2091     if( (iPlan & 2)==0
2092      && pConstraint->iColumn==SPELLFIX_COL_LANGID
2093      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2094     ){
2095       iPlan |= 2;
2096       iLangTerm = i;
2097     }
2098 
2099     /* Terms of the form:  top = $top */
2100     if( (iPlan & 4)==0
2101      && pConstraint->iColumn==SPELLFIX_COL_TOP
2102      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2103     ){
2104       iPlan |= 4;
2105       iTopTerm = i;
2106     }
2107 
2108     /* Terms of the form:  scope = $scope */
2109     if( (iPlan & 8)==0
2110      && pConstraint->iColumn==SPELLFIX_COL_SCOPE
2111      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2112     ){
2113       iPlan |= 8;
2114       iScopeTerm = i;
2115     }
2116 
2117     /* Terms of the form:  distance < $dist or distance <= $dist */
2118     if( (iPlan & (16|32))==0
2119      && pConstraint->iColumn==SPELLFIX_COL_DISTANCE
2120      && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT
2121           || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE)
2122     ){
2123       iPlan |= pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ? 16 : 32;
2124       iDistTerm = i;
2125     }
2126 
2127     /* Terms of the form:  distance < $dist or distance <= $dist */
2128     if( (iPlan & 64)==0
2129      && pConstraint->iColumn<0
2130      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2131     ){
2132       iPlan |= 64;
2133       iRowidTerm = i;
2134     }
2135   }
2136   if( iPlan&1 ){
2137     int idx = 2;
2138     pIdxInfo->idxNum = iPlan;
2139     if( pIdxInfo->nOrderBy==1
2140      && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
2141      && pIdxInfo->aOrderBy[0].desc==0
2142     ){
2143       pIdxInfo->orderByConsumed = 1;  /* Default order by iScore */
2144     }
2145     if( iPlan&2 ){
2146       pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
2147       pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
2148     }
2149     if( iPlan&4 ){
2150       pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
2151       pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
2152     }
2153     if( iPlan&8 ){
2154       pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
2155       pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
2156     }
2157     if( iPlan&(16|32) ){
2158       pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++;
2159       pIdxInfo->aConstraintUsage[iDistTerm].omit = 1;
2160     }
2161     pIdxInfo->estimatedCost = 1e5;
2162   }else if( (iPlan & 64) ){
2163     pIdxInfo->idxNum = 64;
2164     pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
2165     pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
2166     pIdxInfo->estimatedCost = 5;
2167   }else{
2168     pIdxInfo->idxNum = 0;
2169     pIdxInfo->estimatedCost = 1e50;
2170   }
2171   return SQLITE_OK;
2172 }
2173 
2174 /*
2175 ** Open a new fuzzy-search cursor.
2176 */
2177 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2178   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2179   spellfix1_cursor *pCur;
2180   pCur = sqlite3_malloc( sizeof(*pCur) );
2181   if( pCur==0 ) return SQLITE_NOMEM;
2182   memset(pCur, 0, sizeof(*pCur));
2183   pCur->pVTab = p;
2184   *ppCursor = &pCur->base;
2185   return SQLITE_OK;
2186 }
2187 
2188 /*
2189 ** Adjust a distance measurement by the words rank in order to show
2190 ** preference to common words.
2191 */
2192 static int spellfix1Score(int iDistance, int iRank){
2193   int iLog2;
2194   for(iLog2=0; iRank>0; iLog2++, iRank>>=1){}
2195   return iDistance + 32 - iLog2;
2196 }
2197 
2198 /*
2199 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
2200 ** that they sort in order of increasing distance.
2201 */
2202 static int spellfix1RowCompare(const void *A, const void *B){
2203   const struct spellfix1_row *a = (const struct spellfix1_row*)A;
2204   const struct spellfix1_row *b = (const struct spellfix1_row*)B;
2205   return a->iScore - b->iScore;
2206 }
2207 
2208 /*
2209 ** A structure used to pass information from spellfix1FilterForMatch()
2210 ** into spellfix1RunQuery().
2211 */
2212 typedef struct MatchQuery {
2213   spellfix1_cursor *pCur;          /* The cursor being queried */
2214   sqlite3_stmt *pStmt;             /* shadow table query statment */
2215   char zHash[SPELLFIX_MX_HASH];    /* The current phonehash for zPattern */
2216   const char *zPattern;            /* Transliterated input string */
2217   int nPattern;                    /* Length of zPattern */
2218   EditDist3FromString *pMatchStr3; /* Original unicode string */
2219   EditDist3Config *pConfig3;       /* Edit-distance cost coefficients */
2220   const EditDist3Lang *pLang;      /* The selected language coefficients */
2221   int iLang;                       /* The language id */
2222   int iScope;                      /* Default scope */
2223   int iMaxDist;                    /* Maximum allowed edit distance, or -1 */
2224   int rc;                          /* Error code */
2225   int nRun;                  /* Number of prior runs for the same zPattern */
2226   char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH];  /* Prior hashes */
2227 } MatchQuery;
2228 
2229 /*
2230 ** Run a query looking for the best matches against zPattern using
2231 ** zHash as the character class seed hash.
2232 */
2233 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
2234   const char *zK1;
2235   const char *zWord;
2236   int iDist;
2237   int iRank;
2238   int iScore;
2239   int iWorst = 0;
2240   int idx;
2241   int idxWorst = -1;
2242   int i;
2243   int iScope = p->iScope;
2244   spellfix1_cursor *pCur = p->pCur;
2245   sqlite3_stmt *pStmt = p->pStmt;
2246   char zHash1[SPELLFIX_MX_HASH];
2247   char zHash2[SPELLFIX_MX_HASH];
2248   char *zClass;
2249   int nClass;
2250   int rc;
2251 
2252   if( pCur->a==0 || p->rc ) return;   /* Prior memory allocation failure */
2253   zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
2254   if( zClass==0 ){
2255     p->rc = SQLITE_NOMEM;
2256     return;
2257   }
2258   nClass = (int)strlen(zClass);
2259   if( nClass>SPELLFIX_MX_HASH-2 ){
2260     nClass = SPELLFIX_MX_HASH-2;
2261     zClass[nClass] = 0;
2262   }
2263   if( nClass<=iScope ){
2264     if( nClass>2 ){
2265       iScope = nClass-1;
2266     }else{
2267       iScope = nClass;
2268     }
2269   }
2270   memcpy(zHash1, zClass, iScope);
2271   sqlite3_free(zClass);
2272   zHash1[iScope] = 0;
2273   memcpy(zHash2, zHash1, iScope);
2274   zHash2[iScope] = 'Z';
2275   zHash2[iScope+1] = 0;
2276 #if SPELLFIX_MX_RUN>1
2277   for(i=0; i<p->nRun; i++){
2278     if( strcmp(p->azPrior[i], zHash1)==0 ) return;
2279   }
2280 #endif
2281   assert( p->nRun<SPELLFIX_MX_RUN );
2282   memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
2283   if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
2284    || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
2285   ){
2286     p->rc = SQLITE_NOMEM;
2287     return;
2288   }
2289 #if SPELLFIX_MX_RUN>1
2290   for(i=0; i<pCur->nRow; i++){
2291     if( pCur->a[i].iScore>iWorst ){
2292       iWorst = pCur->a[i].iScore;
2293       idxWorst = i;
2294     }
2295   }
2296 #endif
2297   while( sqlite3_step(pStmt)==SQLITE_ROW ){
2298     int iMatchlen = -1;
2299     iRank = sqlite3_column_int(pStmt, 2);
2300     if( p->pMatchStr3 ){
2301       int nWord = sqlite3_column_bytes(pStmt, 1);
2302       zWord = (const char*)sqlite3_column_text(pStmt, 1);
2303       iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
2304     }else{
2305       zK1 = (const char*)sqlite3_column_text(pStmt, 3);
2306       if( zK1==0 ) continue;
2307       iDist = editdist1(p->zPattern, zK1, 0);
2308     }
2309     if( iDist<0 ){
2310       p->rc = SQLITE_NOMEM;
2311       break;
2312     }
2313     pCur->nSearch++;
2314     iScore = spellfix1Score(iDist,iRank);
2315     if( p->iMaxDist>=0 ){
2316       if( iDist>p->iMaxDist ) continue;
2317       if( pCur->nRow>=pCur->nAlloc-1 ){
2318         spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
2319         if( pCur->a==0 ) break;
2320       }
2321       idx = pCur->nRow;
2322     }else if( pCur->nRow<pCur->nAlloc ){
2323       idx = pCur->nRow;
2324     }else if( iScore<iWorst ){
2325       idx = idxWorst;
2326       sqlite3_free(pCur->a[idx].zWord);
2327     }else{
2328       continue;
2329     }
2330     pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
2331     if( pCur->a[idx].zWord==0 ){
2332       p->rc = SQLITE_NOMEM;
2333       break;
2334     }
2335     pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
2336     pCur->a[idx].iRank = iRank;
2337     pCur->a[idx].iDistance = iDist;
2338     pCur->a[idx].iScore = iScore;
2339     pCur->a[idx].iMatchlen = iMatchlen;
2340     memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
2341     if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
2342     if( pCur->nRow==pCur->nAlloc ){
2343       iWorst = pCur->a[0].iScore;
2344       idxWorst = 0;
2345       for(i=1; i<pCur->nRow; i++){
2346         iScore = pCur->a[i].iScore;
2347         if( iWorst<iScore ){
2348           iWorst = iScore;
2349           idxWorst = i;
2350         }
2351       }
2352     }
2353   }
2354   rc = sqlite3_reset(pStmt);
2355   if( rc ) p->rc = rc;
2356 }
2357 
2358 /*
2359 ** This version of the xFilter method work if the MATCH term is present
2360 ** and we are doing a scan.
2361 */
2362 static int spellfix1FilterForMatch(
2363   spellfix1_cursor *pCur,
2364   int idxNum,
2365   int argc,
2366   sqlite3_value **argv
2367 ){
2368   const unsigned char *zMatchThis;   /* RHS of the MATCH operator */
2369   EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
2370   char *zPattern;                    /* Transliteration of zMatchThis */
2371   int nPattern;                      /* Length of zPattern */
2372   int iLimit = 20;                   /* Max number of rows of output */
2373   int iScope = 3;                    /* Use this many characters of zClass */
2374   int iLang = 0;                     /* Language code */
2375   char *zSql;                        /* SQL of shadow table query */
2376   sqlite3_stmt *pStmt = 0;           /* Shadow table query */
2377   int rc;                            /* Result code */
2378   int idx = 1;                       /* Next available filter parameter */
2379   spellfix1_vtab *p = pCur->pVTab;   /* The virtual table that owns pCur */
2380   MatchQuery x;                      /* For passing info to RunQuery() */
2381 
2382   /* Load the cost table if we have not already done so */
2383   if( p->zCostTable!=0 && p->pConfig3==0 ){
2384     p->pConfig3 = sqlite3_malloc( sizeof(p->pConfig3[0]) );
2385     if( p->pConfig3==0 ) return SQLITE_NOMEM;
2386     memset(p->pConfig3, 0, sizeof(p->pConfig3[0]));
2387     rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable);
2388     if( rc ) return rc;
2389   }
2390   memset(&x, 0, sizeof(x));
2391   x.iScope = 3;  /* Default scope if none specified by "WHERE scope=N" */
2392   x.iMaxDist = -1;   /* Maximum allowed edit distance */
2393 
2394   if( idxNum&2 ){
2395     iLang = sqlite3_value_int(argv[idx++]);
2396   }
2397   if( idxNum&4 ){
2398     iLimit = sqlite3_value_int(argv[idx++]);
2399     if( iLimit<1 ) iLimit = 1;
2400   }
2401   if( idxNum&8 ){
2402     x.iScope = sqlite3_value_int(argv[idx++]);
2403     if( x.iScope<1 ) x.iScope = 1;
2404     if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2;
2405   }
2406   if( idxNum&(16|32) ){
2407     x.iMaxDist = sqlite3_value_int(argv[idx++]);
2408     if( idxNum&16 ) x.iMaxDist--;
2409     if( x.iMaxDist<0 ) x.iMaxDist = 0;
2410   }
2411   spellfix1ResetCursor(pCur);
2412   spellfix1ResizeCursor(pCur, iLimit);
2413   zMatchThis = sqlite3_value_text(argv[0]);
2414   if( zMatchThis==0 ) return SQLITE_OK;
2415   if( p->pConfig3 ){
2416     x.pLang = editDist3FindLang(p->pConfig3, iLang);
2417     pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
2418     if( pMatchStr3==0 ){
2419       x.rc = SQLITE_NOMEM;
2420       goto filter_exit;
2421     }
2422   }else{
2423     x.pLang = 0;
2424   }
2425   zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
2426   sqlite3_free(pCur->zPattern);
2427   pCur->zPattern = zPattern;
2428   if( zPattern==0 ){
2429     x.rc = SQLITE_NOMEM;
2430     goto filter_exit;
2431   }
2432   nPattern = (int)strlen(zPattern);
2433   if( zPattern[nPattern-1]=='*' ) nPattern--;
2434   zSql = sqlite3_mprintf(
2435      "SELECT id, word, rank, k1"
2436      "  FROM \"%w\".\"%w_vocab\""
2437      " WHERE langid=%d AND k2>=?1 AND k2<?2",
2438      p->zDbName, p->zTableName, iLang
2439   );
2440   if( zSql==0 ){
2441     x.rc = SQLITE_NOMEM;
2442     pStmt = 0;
2443     goto filter_exit;
2444   }
2445   rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
2446   sqlite3_free(zSql);
2447   pCur->iLang = iLang;
2448   x.pCur = pCur;
2449   x.pStmt = pStmt;
2450   x.zPattern = zPattern;
2451   x.nPattern = nPattern;
2452   x.pMatchStr3 = pMatchStr3;
2453   x.iLang = iLang;
2454   x.rc = rc;
2455   x.pConfig3 = p->pConfig3;
2456   if( x.rc==SQLITE_OK ){
2457     spellfix1RunQuery(&x, zPattern, nPattern);
2458   }
2459 
2460   if( pCur->a ){
2461     qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
2462     pCur->iTop = iLimit;
2463     pCur->iScope = iScope;
2464   }else{
2465     x.rc = SQLITE_NOMEM;
2466   }
2467 
2468 filter_exit:
2469   sqlite3_finalize(pStmt);
2470   editDist3FromStringDelete(pMatchStr3);
2471   return x.rc;
2472 }
2473 
2474 /*
2475 ** This version of xFilter handles a full-table scan case
2476 */
2477 static int spellfix1FilterForFullScan(
2478   spellfix1_cursor *pCur,
2479   int idxNum,
2480   int argc,
2481   sqlite3_value **argv
2482 ){
2483   int rc = SQLITE_OK;
2484   char *zSql;
2485   spellfix1_vtab *pVTab = pCur->pVTab;
2486   spellfix1ResetCursor(pCur);
2487   assert( idxNum==0 || idxNum==64 );
2488   zSql = sqlite3_mprintf(
2489      "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2490      pVTab->zDbName, pVTab->zTableName,
2491      ((idxNum & 64) ? " WHERE rowid=?" : "")
2492   );
2493   if( zSql==0 ) return SQLITE_NOMEM;
2494   rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
2495   sqlite3_free(zSql);
2496   if( rc==SQLITE_OK && (idxNum & 64) ){
2497     assert( argc==1 );
2498     rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]);
2499   }
2500   pCur->nRow = pCur->iRow = 0;
2501   if( rc==SQLITE_OK ){
2502     rc = sqlite3_step(pCur->pFullScan);
2503     if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; }
2504     if( rc==SQLITE_DONE ){ rc = SQLITE_OK; }
2505   }else{
2506     pCur->iRow = 0;
2507   }
2508   return rc;
2509 }
2510 
2511 
2512 /*
2513 ** Called to "rewind" a cursor back to the beginning so that
2514 ** it starts its output over again.  Always called at least once
2515 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2516 */
2517 static int spellfix1Filter(
2518   sqlite3_vtab_cursor *cur,
2519   int idxNum, const char *idxStr,
2520   int argc, sqlite3_value **argv
2521 ){
2522   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2523   int rc;
2524   if( idxNum & 1 ){
2525     rc = spellfix1FilterForMatch(pCur, idxNum, argc, argv);
2526   }else{
2527     rc = spellfix1FilterForFullScan(pCur, idxNum, argc, argv);
2528   }
2529   return rc;
2530 }
2531 
2532 
2533 /*
2534 ** Advance a cursor to its next row of output
2535 */
2536 static int spellfix1Next(sqlite3_vtab_cursor *cur){
2537   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2538   int rc = SQLITE_OK;
2539   if( pCur->iRow < pCur->nRow ){
2540     if( pCur->pFullScan ){
2541       rc = sqlite3_step(pCur->pFullScan);
2542       if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
2543       if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
2544     }else{
2545       pCur->iRow++;
2546     }
2547   }
2548   return rc;
2549 }
2550 
2551 /*
2552 ** Return TRUE if we are at the end-of-file
2553 */
2554 static int spellfix1Eof(sqlite3_vtab_cursor *cur){
2555   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2556   return pCur->iRow>=pCur->nRow;
2557 }
2558 
2559 /*
2560 ** Return columns from the current row.
2561 */
2562 static int spellfix1Column(
2563   sqlite3_vtab_cursor *cur,
2564   sqlite3_context *ctx,
2565   int i
2566 ){
2567   spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2568   if( pCur->pFullScan ){
2569     if( i<=SPELLFIX_COL_LANGID ){
2570       sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
2571     }else{
2572       sqlite3_result_null(ctx);
2573     }
2574     return SQLITE_OK;
2575   }
2576   switch( i ){
2577     case SPELLFIX_COL_WORD: {
2578       sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
2579       break;
2580     }
2581     case SPELLFIX_COL_RANK: {
2582       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
2583       break;
2584     }
2585     case SPELLFIX_COL_DISTANCE: {
2586       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
2587       break;
2588     }
2589     case SPELLFIX_COL_LANGID: {
2590       sqlite3_result_int(ctx, pCur->iLang);
2591       break;
2592     }
2593     case SPELLFIX_COL_SCORE: {
2594       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
2595       break;
2596     }
2597     case SPELLFIX_COL_MATCHLEN: {
2598       int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
2599       if( iMatchlen<0 ){
2600         int nPattern = (int)strlen(pCur->zPattern);
2601         char *zWord = pCur->a[pCur->iRow].zWord;
2602         int nWord = (int)strlen(zWord);
2603 
2604         if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
2605           char *zTranslit;
2606           int res;
2607           zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
2608           if( !zTranslit ) return SQLITE_NOMEM;
2609           res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
2610           sqlite3_free(zTranslit);
2611           if( res<0 ) return SQLITE_NOMEM;
2612           iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
2613         }else{
2614           iMatchlen = utf8Charlen(zWord, nWord);
2615         }
2616       }
2617 
2618       sqlite3_result_int(ctx, iMatchlen);
2619       break;
2620     }
2621     case SPELLFIX_COL_PHONEHASH: {
2622       sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
2623       break;
2624     }
2625     case SPELLFIX_COL_TOP: {
2626       sqlite3_result_int(ctx, pCur->iTop);
2627       break;
2628     }
2629     case SPELLFIX_COL_SCOPE: {
2630       sqlite3_result_int(ctx, pCur->iScope);
2631       break;
2632     }
2633     case SPELLFIX_COL_SRCHCNT: {
2634       sqlite3_result_int(ctx, pCur->nSearch);
2635       break;
2636     }
2637     default: {
2638       sqlite3_result_null(ctx);
2639       break;
2640     }
2641   }
2642   return SQLITE_OK;
2643 }
2644 
2645 /*
2646 ** The rowid.
2647 */
2648 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
2649   spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2650   if( pCur->pFullScan ){
2651     *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
2652   }else{
2653     *pRowid = pCur->a[pCur->iRow].iRowid;
2654   }
2655   return SQLITE_OK;
2656 }
2657 
2658 /*
2659 ** This function is called by the xUpdate() method. It returns a string
2660 ** containing the conflict mode that xUpdate() should use for the current
2661 ** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE".
2662 */
2663 static const char *spellfix1GetConflict(sqlite3 *db){
2664   static const char *azConflict[] = {
2665     /* Note: Instead of "FAIL" - "ABORT". */
2666     "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE"
2667   };
2668   int eConflict = sqlite3_vtab_on_conflict(db);
2669 
2670   assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE
2671        || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT
2672        || eConflict==SQLITE_REPLACE
2673   );
2674   assert( SQLITE_ROLLBACK==1 );
2675   assert( SQLITE_IGNORE==2 );
2676   assert( SQLITE_FAIL==3 );
2677   assert( SQLITE_ABORT==4 );
2678   assert( SQLITE_REPLACE==5 );
2679 
2680   return azConflict[eConflict-1];
2681 }
2682 
2683 /*
2684 ** The xUpdate() method.
2685 */
2686 static int spellfix1Update(
2687   sqlite3_vtab *pVTab,
2688   int argc,
2689   sqlite3_value **argv,
2690   sqlite_int64 *pRowid
2691 ){
2692   int rc = SQLITE_OK;
2693   sqlite3_int64 rowid, newRowid;
2694   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2695   sqlite3 *db = p->db;
2696 
2697   if( argc==1 ){
2698     /* A delete operation on the rowid given by argv[0] */
2699     rowid = *pRowid = sqlite3_value_int64(argv[0]);
2700     spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
2701                            " WHERE id=%lld",
2702                   p->zDbName, p->zTableName, rowid);
2703   }else{
2704     const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]);
2705     int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]);
2706     int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]);
2707     int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]);
2708     const unsigned char *zSoundslike =
2709            sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2710     int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2711     char *zK1, *zK2;
2712     int i;
2713     char c;
2714     const char *zConflict = spellfix1GetConflict(db);
2715 
2716     if( zWord==0 ){
2717       /* Inserts of the form:  INSERT INTO table(command) VALUES('xyzzy');
2718       ** cause zWord to be NULL, so we look at the "command" column to see
2719       ** what special actions to take */
2720       const char *zCmd =
2721          (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]);
2722       if( zCmd==0 ){
2723         pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2724                                          p->zTableName);
2725         return SQLITE_CONSTRAINT_NOTNULL;
2726       }
2727       if( strcmp(zCmd,"reset")==0 ){
2728         /* Reset the  edit cost table (if there is one). */
2729         editDist3ConfigDelete(p->pConfig3);
2730         p->pConfig3 = 0;
2731         return SQLITE_OK;
2732       }
2733       if( strncmp(zCmd,"edit_cost_table=",16)==0 ){
2734         editDist3ConfigDelete(p->pConfig3);
2735         p->pConfig3 = 0;
2736         sqlite3_free(p->zCostTable);
2737         p->zCostTable = spellfix1Dequote(zCmd+16);
2738         if( p->zCostTable==0 ) return SQLITE_NOMEM;
2739         if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){
2740           sqlite3_free(p->zCostTable);
2741           p->zCostTable = 0;
2742         }
2743         return SQLITE_OK;
2744       }
2745       pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2746                                        p->zTableName, zCmd);
2747       return SQLITE_ERROR;
2748     }
2749     if( iRank<1 ) iRank = 1;
2750     if( zSoundslike ){
2751       zK1 = (char*)transliterate(zSoundslike, nSoundslike);
2752     }else{
2753       zK1 = (char*)transliterate(zWord, nWord);
2754     }
2755     if( zK1==0 ) return SQLITE_NOMEM;
2756     for(i=0; (c = zK1[i])!=0; i++){
2757        if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
2758     }
2759     zK2 = (char*)phoneticHash((const unsigned char*)zK1, i);
2760     if( zK2==0 ){
2761       sqlite3_free(zK1);
2762       return SQLITE_NOMEM;
2763     }
2764     if( sqlite3_value_type(argv[0])==SQLITE_NULL ){
2765       if( sqlite3_value_type(argv[1])==SQLITE_NULL ){
2766         spellfix1DbExec(&rc, db,
2767                "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
2768                "VALUES(%d,%d,%Q,%Q,%Q)",
2769                p->zDbName, p->zTableName,
2770                iRank, iLang, zWord, zK1, zK2
2771         );
2772       }else{
2773         newRowid = sqlite3_value_int64(argv[1]);
2774         spellfix1DbExec(&rc, db,
2775             "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) "
2776             "VALUES(%lld,%d,%d,%Q,%Q,%Q)",
2777             zConflict, p->zDbName, p->zTableName,
2778             newRowid, iRank, iLang, zWord, zK1, zK2
2779         );
2780       }
2781       *pRowid = sqlite3_last_insert_rowid(db);
2782     }else{
2783       rowid = sqlite3_value_int64(argv[0]);
2784       newRowid = *pRowid = sqlite3_value_int64(argv[1]);
2785       spellfix1DbExec(&rc, db,
2786              "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
2787              " word=%Q, k1=%Q, k2=%Q WHERE id=%lld",
2788              zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang,
2789              zWord, zK1, zK2, rowid
2790       );
2791     }
2792     sqlite3_free(zK1);
2793     sqlite3_free(zK2);
2794   }
2795   return rc;
2796 }
2797 
2798 /*
2799 ** Rename the spellfix1 table.
2800 */
2801 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){
2802   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2803   sqlite3 *db = p->db;
2804   int rc = SQLITE_OK;
2805   char *zNewName = sqlite3_mprintf("%s", zNew);
2806   if( zNewName==0 ){
2807     return SQLITE_NOMEM;
2808   }
2809   spellfix1DbExec(&rc, db,
2810      "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
2811      p->zDbName, p->zTableName, zNewName
2812   );
2813   if( rc==SQLITE_OK ){
2814     sqlite3_free(p->zTableName);
2815     p->zTableName = zNewName;
2816   }else{
2817     sqlite3_free(zNewName);
2818   }
2819   return rc;
2820 }
2821 
2822 
2823 /*
2824 ** A virtual table module that provides fuzzy search.
2825 */
2826 static sqlite3_module spellfix1Module = {
2827   0,                       /* iVersion */
2828   spellfix1Create,         /* xCreate - handle CREATE VIRTUAL TABLE */
2829   spellfix1Connect,        /* xConnect - reconnected to an existing table */
2830   spellfix1BestIndex,      /* xBestIndex - figure out how to do a query */
2831   spellfix1Disconnect,     /* xDisconnect - close a connection */
2832   spellfix1Destroy,        /* xDestroy - handle DROP TABLE */
2833   spellfix1Open,           /* xOpen - open a cursor */
2834   spellfix1Close,          /* xClose - close a cursor */
2835   spellfix1Filter,         /* xFilter - configure scan constraints */
2836   spellfix1Next,           /* xNext - advance a cursor */
2837   spellfix1Eof,            /* xEof - check for end of scan */
2838   spellfix1Column,         /* xColumn - read data */
2839   spellfix1Rowid,          /* xRowid - read data */
2840   spellfix1Update,         /* xUpdate */
2841   0,                       /* xBegin */
2842   0,                       /* xSync */
2843   0,                       /* xCommit */
2844   0,                       /* xRollback */
2845   0,                       /* xFindMethod */
2846   spellfix1Rename,         /* xRename */
2847 };
2848 
2849 /*
2850 ** Register the various functions and the virtual table.
2851 */
2852 static int spellfix1Register(sqlite3 *db){
2853   int rc = SQLITE_OK;
2854   int i;
2855   rc = sqlite3_create_function(db, "spellfix1_translit", 1, SQLITE_UTF8, 0,
2856                                   transliterateSqlFunc, 0, 0);
2857   if( rc==SQLITE_OK ){
2858     rc = sqlite3_create_function(db, "spellfix1_editdist", 2, SQLITE_UTF8, 0,
2859                                   editdistSqlFunc, 0, 0);
2860   }
2861   if( rc==SQLITE_OK ){
2862     rc = sqlite3_create_function(db, "spellfix1_phonehash", 1, SQLITE_UTF8, 0,
2863                                   phoneticHashSqlFunc, 0, 0);
2864   }
2865   if( rc==SQLITE_OK ){
2866     rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1, SQLITE_UTF8, 0,
2867                                   scriptCodeSqlFunc, 0, 0);
2868   }
2869   if( rc==SQLITE_OK ){
2870     rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
2871   }
2872   if( rc==SQLITE_OK ){
2873     rc = editDist3Install(db);
2874   }
2875 
2876   /* Verify sanity of the translit[] table */
2877   for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
2878     assert( translit[i].cFrom<translit[i+1].cFrom );
2879   }
2880 
2881   return rc;
2882 }
2883 
2884 #endif /* SQLITE_OMIT_VIRTUALTABLE */
2885 
2886 /*
2887 ** Extension load function.
2888 */
2889 #ifdef _WIN32
2890 __declspec(dllexport)
2891 #endif
2892 int sqlite3_spellfix_init(
2893   sqlite3 *db,
2894   char **pzErrMsg,
2895   const sqlite3_api_routines *pApi
2896 ){
2897   SQLITE_EXTENSION_INIT2(pApi);
2898 #ifndef SQLITE_OMIT_VIRTUALTABLE
2899   return spellfix1Register(db);
2900 #endif
2901   return SQLITE_OK;
2902 }
2903