xref: /sqlite-3.40.0/ext/misc/spellfix.c (revision a3fdec71)
1 /*
2 ** 2012 April 10
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used
14 ** to search a large vocabulary for close matches.  See separate
15 ** documentation (http://www.sqlite.org/spellfix1.html) for details.
16 */
17 #include "sqlite3ext.h"
18 SQLITE_EXTENSION_INIT1
19 
20 #ifndef SQLITE_AMALGAMATION
21 # include <string.h>
22 # include <stdio.h>
23 # include <stdlib.h>
24 # include <assert.h>
25 # define ALWAYS(X)  1
26 # define NEVER(X)   0
27   typedef unsigned char u8;
28   typedef unsigned short u16;
29 # include <ctype.h>
30 #endif
31 
32 #ifndef SQLITE_OMIT_VIRTUALTABLE
33 
34 /*
35 ** Character classes for ASCII characters:
36 **
37 **   0   ''        Silent letters:   H W
38 **   1   'A'       Any vowel:   A E I O U (Y)
39 **   2   'B'       A bilabeal stop or fricative:  B F P V W
40 **   3   'C'       Other fricatives or back stops:  C G J K Q S X Z
41 **   4   'D'       Alveolar stops:  D T
42 **   5   'H'       Letter H at the beginning of a word
43 **   6   'L'       Glide:  L
44 **   7   'R'       Semivowel:  R
45 **   8   'M'       Nasals:  M N
46 **   9   'Y'       Letter Y at the beginning of a word.
47 **   10  '9'       Digits: 0 1 2 3 4 5 6 7 8 9
48 **   11  ' '       White space
49 **   12  '?'       Other.
50 */
51 #define CCLASS_SILENT         0
52 #define CCLASS_VOWEL          1
53 #define CCLASS_B              2
54 #define CCLASS_C              3
55 #define CCLASS_D              4
56 #define CCLASS_H              5
57 #define CCLASS_L              6
58 #define CCLASS_R              7
59 #define CCLASS_M              8
60 #define CCLASS_Y              9
61 #define CCLASS_DIGIT         10
62 #define CCLASS_SPACE         11
63 #define CCLASS_OTHER         12
64 
65 /*
66 ** The following table gives the character class for non-initial ASCII
67 ** characters.
68 */
69 static const unsigned char midClass[] = {
70  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
71  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
72  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
73  /*   */ CCLASS_SPACE,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
74  /*   */ CCLASS_SPACE,    /*   */ CCLASS_SPACE,   /*   */ CCLASS_OTHER,
75  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
76  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
77  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
78  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
79  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
80  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_SPACE,
81  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,   /* # */ CCLASS_OTHER,
82  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,   /* & */ CCLASS_OTHER,
83  /* ' */ CCLASS_SILENT,   /* ( */ CCLASS_OTHER,   /* ) */ CCLASS_OTHER,
84  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,   /* , */ CCLASS_OTHER,
85  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,   /* / */ CCLASS_OTHER,
86  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,   /* 2 */ CCLASS_DIGIT,
87  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,   /* 5 */ CCLASS_DIGIT,
88  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,   /* 8 */ CCLASS_DIGIT,
89  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,   /* ; */ CCLASS_OTHER,
90  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,   /* > */ CCLASS_OTHER,
91  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,   /* A */ CCLASS_VOWEL,
92  /* B */ CCLASS_B,        /* C */ CCLASS_C,       /* D */ CCLASS_D,
93  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,       /* G */ CCLASS_C,
94  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,   /* J */ CCLASS_C,
95  /* K */ CCLASS_C,        /* L */ CCLASS_L,       /* M */ CCLASS_M,
96  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,   /* P */ CCLASS_B,
97  /* Q */ CCLASS_C,        /* R */ CCLASS_R,       /* S */ CCLASS_C,
98  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,   /* V */ CCLASS_B,
99  /* W */ CCLASS_B,        /* X */ CCLASS_C,       /* Y */ CCLASS_VOWEL,
100  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,   /* \ */ CCLASS_OTHER,
101  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,   /* _ */ CCLASS_OTHER,
102  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,   /* b */ CCLASS_B,
103  /* c */ CCLASS_C,        /* d */ CCLASS_D,       /* e */ CCLASS_VOWEL,
104  /* f */ CCLASS_B,        /* g */ CCLASS_C,       /* h */ CCLASS_SILENT,
105  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,       /* k */ CCLASS_C,
106  /* l */ CCLASS_L,        /* m */ CCLASS_M,       /* n */ CCLASS_M,
107  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,       /* q */ CCLASS_C,
108  /* r */ CCLASS_R,        /* s */ CCLASS_C,       /* t */ CCLASS_D,
109  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,       /* w */ CCLASS_B,
110  /* x */ CCLASS_C,        /* y */ CCLASS_VOWEL,   /* z */ CCLASS_C,
111  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,   /* } */ CCLASS_OTHER,
112  /* ~ */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,
113 };
114 /*
115 ** This tables gives the character class for ASCII characters that form the
116 ** initial character of a word.  The only difference from midClass is with
117 ** the letters H, W, and Y.
118 */
119 static const unsigned char initClass[] = {
120  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
121  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
122  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
123  /*   */ CCLASS_SPACE,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
124  /*   */ CCLASS_SPACE,    /*   */ CCLASS_SPACE,   /*   */ CCLASS_OTHER,
125  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
126  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
127  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
128  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
129  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
130  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_SPACE,
131  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,   /* # */ CCLASS_OTHER,
132  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,   /* & */ CCLASS_OTHER,
133  /* ' */ CCLASS_OTHER,    /* ( */ CCLASS_OTHER,   /* ) */ CCLASS_OTHER,
134  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,   /* , */ CCLASS_OTHER,
135  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,   /* / */ CCLASS_OTHER,
136  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,   /* 2 */ CCLASS_DIGIT,
137  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,   /* 5 */ CCLASS_DIGIT,
138  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,   /* 8 */ CCLASS_DIGIT,
139  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,   /* ; */ CCLASS_OTHER,
140  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,   /* > */ CCLASS_OTHER,
141  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,   /* A */ CCLASS_VOWEL,
142  /* B */ CCLASS_B,        /* C */ CCLASS_C,       /* D */ CCLASS_D,
143  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,       /* G */ CCLASS_C,
144  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,   /* J */ CCLASS_C,
145  /* K */ CCLASS_C,        /* L */ CCLASS_L,       /* M */ CCLASS_M,
146  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,   /* P */ CCLASS_B,
147  /* Q */ CCLASS_C,        /* R */ CCLASS_R,       /* S */ CCLASS_C,
148  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,   /* V */ CCLASS_B,
149  /* W */ CCLASS_B,        /* X */ CCLASS_C,       /* Y */ CCLASS_Y,
150  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,   /* \ */ CCLASS_OTHER,
151  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,   /* _ */ CCLASS_OTHER,
152  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,   /* b */ CCLASS_B,
153  /* c */ CCLASS_C,        /* d */ CCLASS_D,       /* e */ CCLASS_VOWEL,
154  /* f */ CCLASS_B,        /* g */ CCLASS_C,       /* h */ CCLASS_SILENT,
155  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,       /* k */ CCLASS_C,
156  /* l */ CCLASS_L,        /* m */ CCLASS_M,       /* n */ CCLASS_M,
157  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,       /* q */ CCLASS_C,
158  /* r */ CCLASS_R,        /* s */ CCLASS_C,       /* t */ CCLASS_D,
159  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,       /* w */ CCLASS_B,
160  /* x */ CCLASS_C,        /* y */ CCLASS_Y,       /* z */ CCLASS_C,
161  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,   /* } */ CCLASS_OTHER,
162  /* ~ */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,
163 };
164 
165 /*
166 ** Mapping from the character class number (0-13) to a symbol for each
167 ** character class.  Note that initClass[] can be used to map the class
168 ** symbol back into the class number.
169 */
170 static const unsigned char className[] = ".ABCDHLRMY9 ?";
171 
172 /*
173 ** Generate a "phonetic hash" from a string of ASCII characters
174 ** in zIn[0..nIn-1].
175 **
176 **   * Map characters by character class as defined above.
177 **   * Omit double-letters
178 **   * Omit vowels beside R and L
179 **   * Omit T when followed by CH
180 **   * Omit W when followed by R
181 **   * Omit D when followed by J or G
182 **   * Omit K in KN or G in GN at the beginning of a word
183 **
184 ** Space to hold the result is obtained from sqlite3_malloc()
185 **
186 ** Return NULL if memory allocation fails.
187 */
188 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){
189   unsigned char *zOut = sqlite3_malloc( nIn + 1 );
190   int i;
191   int nOut = 0;
192   char cPrev = 0x77;
193   char cPrevX = 0x77;
194   const unsigned char *aClass = initClass;
195 
196   if( zOut==0 ) return 0;
197   if( nIn>2 ){
198     switch( zIn[0] ){
199       case 'g':
200       case 'k': {
201         if( zIn[1]=='n' ){ zIn++; nIn--; }
202         break;
203       }
204     }
205   }
206   for(i=0; i<nIn; i++){
207     unsigned char c = zIn[i];
208     if( i+1<nIn ){
209       if( c=='w' && zIn[i+1]=='r' ) continue;
210       if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
211       if( i+2<nIn ){
212         if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
213       }
214     }
215     c = aClass[c&0x7f];
216     if( c==CCLASS_SPACE ) continue;
217     if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
218     aClass = midClass;
219     if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){
220        continue; /* No vowels beside L or R */
221     }
222     if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){
223        nOut--;   /* No vowels beside L or R */
224     }
225     cPrev = c;
226     if( c==CCLASS_SILENT ) continue;
227     cPrevX = c;
228     c = className[c];
229     assert( nOut>=0 );
230     if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c;
231   }
232   zOut[nOut] = 0;
233   return zOut;
234 }
235 
236 /*
237 ** This is an SQL function wrapper around phoneticHash().  See
238 ** the description of phoneticHash() for additional information.
239 */
240 static void phoneticHashSqlFunc(
241   sqlite3_context *context,
242   int argc,
243   sqlite3_value **argv
244 ){
245   const unsigned char *zIn;
246   unsigned char *zOut;
247 
248   zIn = sqlite3_value_text(argv[0]);
249   if( zIn==0 ) return;
250   zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0]));
251   if( zOut==0 ){
252     sqlite3_result_error_nomem(context);
253   }else{
254     sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
255   }
256 }
257 
258 /*
259 ** Return the character class number for a character given its
260 ** context.
261 */
262 static char characterClass(char cPrev, char c){
263   return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
264 }
265 
266 /*
267 ** Return the cost of inserting or deleting character c immediately
268 ** following character cPrev.  If cPrev==0, that means c is the first
269 ** character of the word.
270 */
271 static int insertOrDeleteCost(char cPrev, char c, char cNext){
272   char classC = characterClass(cPrev, c);
273   char classCprev;
274 
275   if( classC==CCLASS_SILENT ){
276     /* Insert or delete "silent" characters such as H or W */
277     return 1;
278   }
279   if( cPrev==c ){
280     /* Repeated characters, or miss a repeat */
281     return 10;
282   }
283   if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){
284     return 20;  /* Insert a vowel before or after 'r' */
285   }
286   classCprev = characterClass(cPrev, cPrev);
287   if( classC==classCprev ){
288     if( classC==CCLASS_VOWEL ){
289       /* Remove or add a new vowel to a vowel cluster */
290       return 15;
291     }else{
292       /* Remove or add a consonant not in the same class */
293       return 50;
294     }
295   }
296 
297   /* any other character insertion or deletion */
298   return 100;
299 }
300 
301 /*
302 ** Divide the insertion cost by this factor when appending to the
303 ** end of the word.
304 */
305 #define FINAL_INS_COST_DIV  4
306 
307 /*
308 ** Return the cost of substituting cTo in place of cFrom assuming
309 ** the previous character is cPrev.  If cPrev==0 then cTo is the first
310 ** character of the word.
311 */
312 static int substituteCost(char cPrev, char cFrom, char cTo){
313   char classFrom, classTo;
314   if( cFrom==cTo ){
315     /* Exact match */
316     return 0;
317   }
318   if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){
319     /* differ only in case */
320     return 0;
321   }
322   classFrom = characterClass(cPrev, cFrom);
323   classTo = characterClass(cPrev, cTo);
324   if( classFrom==classTo ){
325     /* Same character class */
326     return 40;
327   }
328   if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
329       && classTo>=CCLASS_B && classTo<=CCLASS_Y ){
330     /* Convert from one consonant to another, but in a different class */
331     return 75;
332   }
333   /* Any other subsitution */
334   return 100;
335 }
336 
337 /*
338 ** Given two strings zA and zB which are pure ASCII, return the cost
339 ** of transforming zA into zB.  If zA ends with '*' assume that it is
340 ** a prefix of zB and give only minimal penalty for extra characters
341 ** on the end of zB.
342 **
343 ** Smaller numbers mean a closer match.
344 **
345 ** Negative values indicate an error:
346 **    -1  One of the inputs is NULL
347 **    -2  Non-ASCII characters on input
348 **    -3  Unable to allocate memory
349 **
350 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
351 ** of zB that matched the pattern in zA. If zA does not end with a '*',
352 ** then this value is always the number of bytes in zB (i.e. strlen(zB)).
353 ** If zA does end in a '*', then it is the number of bytes in the prefix
354 ** of zB that was deemed to match zA.
355 */
356 static int editdist1(const char *zA, const char *zB, int *pnMatch){
357   int nA, nB;            /* Number of characters in zA[] and zB[] */
358   int xA, xB;            /* Loop counters for zA[] and zB[] */
359   char cA, cB;           /* Current character of zA and zB */
360   char cAprev, cBprev;   /* Previous character of zA and zB */
361   char cAnext, cBnext;   /* Next character in zA and zB */
362   int d;                 /* North-west cost value */
363   int dc = 0;            /* North-west character value */
364   int res;               /* Final result */
365   int *m;                /* The cost matrix */
366   char *cx;              /* Corresponding character values */
367   int *toFree = 0;       /* Malloced space */
368   int mStack[60+15];     /* Stack space to use if not too much is needed */
369   int nMatch = 0;
370 
371   /* Early out if either input is NULL */
372   if( zA==0 || zB==0 ) return -1;
373 
374   /* Skip any common prefix */
375   while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
376   if( pnMatch ) *pnMatch = nMatch;
377   if( zA[0]==0 && zB[0]==0 ) return 0;
378 
379 #if 0
380   printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
381 #endif
382 
383   /* Verify input strings and measure their lengths */
384   for(nA=0; zA[nA]; nA++){
385     if( zA[nA]&0x80 ) return -2;
386   }
387   for(nB=0; zB[nB]; nB++){
388     if( zB[nB]&0x80 ) return -2;
389   }
390 
391   /* Special processing if either string is empty */
392   if( nA==0 ){
393     cBprev = dc;
394     for(xB=res=0; (cB = zB[xB])!=0; xB++){
395       res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
396       cBprev = cB;
397     }
398     return res;
399   }
400   if( nB==0 ){
401     cAprev = dc;
402     for(xA=res=0; (cA = zA[xA])!=0; xA++){
403       res += insertOrDeleteCost(cAprev, cA, zA[xA+1]);
404       cAprev = cA;
405     }
406     return res;
407   }
408 
409   /* A is a prefix of B */
410   if( zA[0]=='*' && zA[1]==0 ) return 0;
411 
412   /* Allocate and initialize the Wagner matrix */
413   if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
414     m = mStack;
415   }else{
416     m = toFree = sqlite3_malloc( (nB+1)*5*sizeof(m[0])/4 );
417     if( m==0 ) return -3;
418   }
419   cx = (char*)&m[nB+1];
420 
421   /* Compute the Wagner edit distance */
422   m[0] = 0;
423   cx[0] = dc;
424   cBprev = dc;
425   for(xB=1; xB<=nB; xB++){
426     cBnext = zB[xB];
427     cB = zB[xB-1];
428     cx[xB] = cB;
429     m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext);
430     cBprev = cB;
431   }
432   cAprev = dc;
433   for(xA=1; xA<=nA; xA++){
434     int lastA = (xA==nA);
435     cA = zA[xA-1];
436     cAnext = zA[xA];
437     if( cA=='*' && lastA ) break;
438     d = m[0];
439     dc = cx[0];
440     m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
441     cBprev = 0;
442     for(xB=1; xB<=nB; xB++){
443       int totalCost, insCost, delCost, subCost, ncx;
444       cB = zB[xB-1];
445       cBnext = zB[xB];
446 
447       /* Cost to insert cB */
448       insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext);
449       if( lastA ) insCost /= FINAL_INS_COST_DIV;
450 
451       /* Cost to delete cA */
452       delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
453 
454       /* Cost to substitute cA->cB */
455       subCost = substituteCost(cx[xB-1], cA, cB);
456 
457       /* Best cost */
458       totalCost = insCost + m[xB-1];
459       ncx = cB;
460       if( (delCost + m[xB])<totalCost ){
461         totalCost = delCost + m[xB];
462         ncx = cA;
463       }
464       if( (subCost + d)<totalCost ){
465         totalCost = subCost + d;
466       }
467 
468 #if 0
469       printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
470              " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
471              xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
472              insCost, delCost, subCost, totalCost, ncx?ncx:' ');
473 #endif
474 
475       /* Update the matrix */
476       d = m[xB];
477       dc = cx[xB];
478       m[xB] = totalCost;
479       cx[xB] = ncx;
480       cBprev = cB;
481     }
482     cAprev = cA;
483   }
484 
485   /* Free the wagner matrix and return the result */
486   if( cA=='*' ){
487     res = m[1];
488     for(xB=1; xB<=nB; xB++){
489       if( m[xB]<res ){
490         res = m[xB];
491         if( pnMatch ) *pnMatch = xB+nMatch;
492       }
493     }
494   }else{
495     res = m[nB];
496     /* In the current implementation, pnMatch is always NULL if zA does
497     ** not end in "*" */
498     assert( pnMatch==0 );
499   }
500   sqlite3_free(toFree);
501   return res;
502 }
503 
504 /*
505 ** Function:    editdist(A,B)
506 **
507 ** Return the cost of transforming string A into string B.  Both strings
508 ** must be pure ASCII text.  If A ends with '*' then it is assumed to be
509 ** a prefix of B and extra characters on the end of B have minimal additional
510 ** cost.
511 */
512 static void editdistSqlFunc(
513   sqlite3_context *context,
514   int argc,
515   sqlite3_value **argv
516 ){
517   int res = editdist1(
518                     (const char*)sqlite3_value_text(argv[0]),
519                     (const char*)sqlite3_value_text(argv[1]),
520                     0);
521   if( res<0 ){
522     if( res==(-3) ){
523       sqlite3_result_error_nomem(context);
524     }else if( res==(-2) ){
525       sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
526     }else{
527       sqlite3_result_error(context, "NULL input to editdist()", -1);
528     }
529   }else{
530     sqlite3_result_int(context, res);
531   }
532 }
533 
534 /* End of the fixed-cost edit distance implementation
535 ******************************************************************************
536 *****************************************************************************
537 ** Begin: Configurable cost unicode edit distance routines
538 */
539 /* Forward declaration of structures */
540 typedef struct EditDist3Cost EditDist3Cost;
541 typedef struct EditDist3Config EditDist3Config;
542 typedef struct EditDist3Point EditDist3Point;
543 typedef struct EditDist3From EditDist3From;
544 typedef struct EditDist3FromString EditDist3FromString;
545 typedef struct EditDist3To EditDist3To;
546 typedef struct EditDist3ToString EditDist3ToString;
547 typedef struct EditDist3Lang EditDist3Lang;
548 
549 
550 /*
551 ** An entry in the edit cost table
552 */
553 struct EditDist3Cost {
554   EditDist3Cost *pNext;     /* Next cost element */
555   u8 nFrom;                 /* Number of bytes in aFrom */
556   u8 nTo;                   /* Number of bytes in aTo */
557   u16 iCost;                /* Cost of this transformation */
558   char a[4]    ;            /* FROM string followed by TO string */
559   /* Additional TO and FROM string bytes appended as necessary */
560 };
561 
562 /*
563 ** Edit costs for a particular language ID
564 */
565 struct EditDist3Lang {
566   int iLang;             /* Language ID */
567   int iInsCost;          /* Default insertion cost */
568   int iDelCost;          /* Default deletion cost */
569   int iSubCost;          /* Default substitution cost */
570   EditDist3Cost *pCost;  /* Costs */
571 };
572 
573 
574 /*
575 ** The default EditDist3Lang object, with default costs.
576 */
577 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 };
578 
579 /*
580 ** Complete configuration
581 */
582 struct EditDist3Config {
583   int nLang;             /* Number of language IDs.  Size of a[] */
584   EditDist3Lang *a;      /* One for each distinct language ID */
585 };
586 
587 /*
588 ** Extra information about each character in the FROM string.
589 */
590 struct EditDist3From {
591   int nSubst;              /* Number of substitution cost entries */
592   int nDel;                /* Number of deletion cost entries */
593   int nByte;               /* Number of bytes in this character */
594   EditDist3Cost **apSubst; /* Array of substitution costs for this element */
595   EditDist3Cost **apDel;   /* Array of deletion cost entries */
596 };
597 
598 /*
599 ** A precompiled FROM string.
600 *
601 ** In the common case we expect the FROM string to be reused multiple times.
602 ** In other words, the common case will be to measure the edit distance
603 ** from a single origin string to multiple target strings.
604 */
605 struct EditDist3FromString {
606   char *z;                 /* The complete text of the FROM string */
607   int n;                   /* Number of characters in the FROM string */
608   int isPrefix;            /* True if ends with '*' character */
609   EditDist3From *a;        /* Extra info about each char of the FROM string */
610 };
611 
612 /*
613 ** Extra information about each character in the TO string.
614 */
615 struct EditDist3To {
616   int nIns;                /* Number of insertion cost entries */
617   int nByte;               /* Number of bytes in this character */
618   EditDist3Cost **apIns;   /* Array of deletion cost entries */
619 };
620 
621 /*
622 ** A precompiled FROM string
623 */
624 struct EditDist3ToString {
625   char *z;                 /* The complete text of the TO string */
626   int n;                   /* Number of characters in the TO string */
627   EditDist3To *a;          /* Extra info about each char of the TO string */
628 };
629 
630 /*
631 ** Clear or delete an instance of the object that records all edit-distance
632 ** weights.
633 */
634 static void editDist3ConfigClear(EditDist3Config *p){
635   int i;
636   if( p==0 ) return;
637   for(i=0; i<p->nLang; i++){
638     EditDist3Cost *pCost, *pNext;
639     pCost = p->a[i].pCost;
640     while( pCost ){
641       pNext = pCost->pNext;
642       sqlite3_free(pCost);
643       pCost = pNext;
644     }
645   }
646   sqlite3_free(p->a);
647   memset(p, 0, sizeof(*p));
648 }
649 static void editDist3ConfigDelete(void *pIn){
650   EditDist3Config *p = (EditDist3Config*)pIn;
651   editDist3ConfigClear(p);
652   sqlite3_free(p);
653 }
654 
655 /*
656 ** Load all edit-distance weights from a table.
657 */
658 static int editDist3ConfigLoad(
659   EditDist3Config *p,      /* The edit distance configuration to load */
660   sqlite3 *db,            /* Load from this database */
661   const char *zTable      /* Name of the table from which to load */
662 ){
663   sqlite3_stmt *pStmt;
664   int rc, rc2;
665   char *zSql;
666   int iLangPrev = -9999;
667   EditDist3Lang *pLang = 0;
668 
669   zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
670                          " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
671   if( zSql==0 ) return SQLITE_NOMEM;
672   rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
673   sqlite3_free(zSql);
674   if( rc ) return rc;
675   editDist3ConfigClear(p);
676   while( sqlite3_step(pStmt)==SQLITE_ROW ){
677     int iLang = sqlite3_column_int(pStmt, 0);
678     const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
679     int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
680     const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
681     int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
682     int iCost = sqlite3_column_int(pStmt, 3);
683 
684     assert( zFrom!=0 || nFrom==0 );
685     assert( zTo!=0 || nTo==0 );
686     if( nFrom>100 || nTo>100 ) continue;
687     if( iCost<0 ) continue;
688     if( pLang==0 || iLang!=iLangPrev ){
689       EditDist3Lang *pNew;
690       pNew = sqlite3_realloc(p->a, (p->nLang+1)*sizeof(p->a[0]));
691       if( pNew==0 ){ rc = SQLITE_NOMEM; break; }
692       p->a = pNew;
693       pLang = &p->a[p->nLang];
694       p->nLang++;
695       pLang->iLang = iLang;
696       pLang->iInsCost = 100;
697       pLang->iDelCost = 100;
698       pLang->iSubCost = 150;
699       pLang->pCost = 0;
700       iLangPrev = iLang;
701     }
702     if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){
703       pLang->iDelCost = iCost;
704     }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){
705       pLang->iInsCost = iCost;
706     }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){
707       pLang->iSubCost = iCost;
708     }else{
709       EditDist3Cost *pCost;
710       int nExtra = nFrom + nTo - 4;
711       if( nExtra<0 ) nExtra = 0;
712       pCost = sqlite3_malloc( sizeof(*pCost) + nExtra );
713       if( pCost==0 ){ rc = SQLITE_NOMEM; break; }
714       pCost->nFrom = nFrom;
715       pCost->nTo = nTo;
716       pCost->iCost = iCost;
717       memcpy(pCost->a, zFrom, nFrom);
718       memcpy(pCost->a + nFrom, zTo, nTo);
719       pCost->pNext = pLang->pCost;
720       pLang->pCost = pCost;
721     }
722   }
723   rc2 = sqlite3_finalize(pStmt);
724   if( rc==SQLITE_OK ) rc = rc2;
725   return rc;
726 }
727 
728 /*
729 ** Return the length (in bytes) of a utf-8 character.  Or return a maximum
730 ** of N.
731 */
732 static int utf8Len(unsigned char c, int N){
733   int len = 1;
734   if( c>0x7f ){
735     if( (c&0xe0)==0xc0 ){
736       len = 2;
737     }else if( (c&0xf0)==0xe0 ){
738       len = 3;
739     }else{
740       len = 4;
741     }
742   }
743   if( len>N ) len = N;
744   return len;
745 }
746 
747 /*
748 ** Return TRUE (non-zero) if the To side of the given cost matches
749 ** the given string.
750 */
751 static int matchTo(EditDist3Cost *p, const char *z, int n){
752   if( p->nTo>n ) return 0;
753   if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0;
754   return 1;
755 }
756 
757 /*
758 ** Return TRUE (non-zero) if the From side of the given cost matches
759 ** the given string.
760 */
761 static int matchFrom(EditDist3Cost *p, const char *z, int n){
762   assert( p->nFrom<=n );
763   if( strncmp(p->a, z, p->nFrom)!=0 ) return 0;
764   return 1;
765 }
766 
767 /*
768 ** Return TRUE (non-zero) of the next FROM character and the next TO
769 ** character are the same.
770 */
771 static int matchFromTo(
772   EditDist3FromString *pStr,  /* Left hand string */
773   int n1,                     /* Index of comparison character on the left */
774   const char *z2,             /* Right-handl comparison character */
775   int n2                      /* Bytes remaining in z2[] */
776 ){
777   int b1 = pStr->a[n1].nByte;
778   if( b1>n2 ) return 0;
779   if( memcmp(pStr->z+n1, z2, b1)!=0 ) return 0;
780   return 1;
781 }
782 
783 /*
784 ** Delete an EditDist3FromString objecct
785 */
786 static void editDist3FromStringDelete(EditDist3FromString *p){
787   int i;
788   if( p ){
789     for(i=0; i<p->n; i++){
790       sqlite3_free(p->a[i].apDel);
791       sqlite3_free(p->a[i].apSubst);
792     }
793     sqlite3_free(p);
794   }
795 }
796 
797 /*
798 ** Create a EditDist3FromString object.
799 */
800 static EditDist3FromString *editDist3FromStringNew(
801   const EditDist3Lang *pLang,
802   const char *z,
803   int n
804 ){
805   EditDist3FromString *pStr;
806   EditDist3Cost *p;
807   int i;
808 
809   if( z==0 ) return 0;
810   if( n<0 ) n = (int)strlen(z);
811   pStr = sqlite3_malloc( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
812   if( pStr==0 ) return 0;
813   pStr->a = (EditDist3From*)&pStr[1];
814   memset(pStr->a, 0, sizeof(pStr->a[0])*n);
815   pStr->n = n;
816   pStr->z = (char*)&pStr->a[n];
817   memcpy(pStr->z, z, n+1);
818   if( n && z[n-1]=='*' ){
819     pStr->isPrefix = 1;
820     n--;
821     pStr->n--;
822     pStr->z[n] = 0;
823   }else{
824     pStr->isPrefix = 0;
825   }
826 
827   for(i=0; i<n; i++){
828     EditDist3From *pFrom = &pStr->a[i];
829     memset(pFrom, 0, sizeof(*pFrom));
830     pFrom->nByte = utf8Len((unsigned char)z[i], n-i);
831     for(p=pLang->pCost; p; p=p->pNext){
832       EditDist3Cost **apNew;
833       if( i+p->nFrom>n ) continue;
834       if( matchFrom(p, z+i, n-i)==0 ) continue;
835       if( p->nTo==0 ){
836         apNew = sqlite3_realloc(pFrom->apDel,
837                                 sizeof(*apNew)*(pFrom->nDel+1));
838         if( apNew==0 ) break;
839         pFrom->apDel = apNew;
840         apNew[pFrom->nDel++] = p;
841       }else{
842         apNew = sqlite3_realloc(pFrom->apSubst,
843                                 sizeof(*apNew)*(pFrom->nSubst+1));
844         if( apNew==0 ) break;
845         pFrom->apSubst = apNew;
846         apNew[pFrom->nSubst++] = p;
847       }
848     }
849     if( p ){
850       editDist3FromStringDelete(pStr);
851       pStr = 0;
852       break;
853     }
854   }
855   return pStr;
856 }
857 
858 /*
859 ** Update entry m[i] such that it is the minimum of its current value
860 ** and m[j]+iCost.
861 **
862 ** If the iCost is 1,000,000 or greater, then consider the cost to be
863 ** infinite and skip the update.
864 */
865 static void updateCost(
866   unsigned int *m,
867   int i,
868   int j,
869   int iCost
870 ){
871   assert( iCost>=0 );
872   if( iCost<10000 ){
873     unsigned int b = m[j] + iCost;
874     if( b<m[i] ) m[i] = b;
875   }
876 }
877 
878 /* Compute the edit distance between two strings.
879 **
880 ** If an error occurs, return a negative number which is the error code.
881 **
882 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters
883 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
884 ** not contain the pattern for a prefix-search, then this is always the number
885 ** of characters in z2. If pFrom does contain a prefix search pattern, then
886 ** it is the number of characters in the prefix of z2 that was deemed to
887 ** match pFrom.
888 */
889 static int editDist3Core(
890   EditDist3FromString *pFrom,  /* The FROM string */
891   const char *z2,              /* The TO string */
892   int n2,                      /* Length of the TO string */
893   const EditDist3Lang *pLang,  /* Edit weights for a particular language ID */
894   int *pnMatch                 /* OUT: Characters in matched prefix */
895 ){
896   int k, n;
897   int i1, b1;
898   int i2, b2;
899   EditDist3FromString f = *pFrom;
900   EditDist3To *a2;
901   unsigned int *m;
902   int szRow;
903   EditDist3Cost *p;
904   int res;
905 
906   /* allocate the Wagner matrix and the aTo[] array for the TO string */
907   n = (f.n+1)*(n2+1);
908   n = (n+1)&~1;
909   m = sqlite3_malloc( n*sizeof(m[0]) + sizeof(a2[0])*n2 );
910   if( m==0 ) return -1;            /* Out of memory */
911   a2 = (EditDist3To*)&m[n];
912   memset(a2, 0, sizeof(a2[0])*n2);
913 
914   /* Fill in the a1[] matrix for all characters of the TO string */
915   for(i2=0; i2<n2; i2++){
916     a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2);
917     for(p=pLang->pCost; p; p=p->pNext){
918       EditDist3Cost **apNew;
919       if( p->nFrom>0 ) continue;
920       if( i2+p->nTo>n2 ) continue;
921       if( matchTo(p, z2+i2, n2-i2)==0 ) continue;
922       a2[i2].nIns++;
923       apNew = sqlite3_realloc(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns);
924       if( apNew==0 ){
925         res = -1;  /* Out of memory */
926         goto editDist3Abort;
927       }
928       a2[i2].apIns = apNew;
929       a2[i2].apIns[a2[i2].nIns-1] = p;
930     }
931   }
932 
933   /* Prepare to compute the minimum edit distance */
934   szRow = f.n+1;
935   memset(m, 0x01, (n2+1)*szRow*sizeof(m[0]));
936   m[0] = 0;
937 
938   /* First fill in the top-row of the matrix with FROM deletion costs */
939   for(i1=0; i1<f.n; i1 += b1){
940     b1 = f.a[i1].nByte;
941     updateCost(m, i1+b1, i1, pLang->iDelCost);
942     for(k=0; k<f.a[i1].nDel; k++){
943       p = f.a[i1].apDel[k];
944       updateCost(m, i1+p->nFrom, i1, p->iCost);
945     }
946   }
947 
948   /* Fill in all subsequent rows, top-to-bottom, left-to-right */
949   for(i2=0; i2<n2; i2 += b2){
950     int rx;      /* Starting index for current row */
951     int rxp;     /* Starting index for previous row */
952     b2 = a2[i2].nByte;
953     rx = szRow*(i2+b2);
954     rxp = szRow*i2;
955     updateCost(m, rx, rxp, pLang->iInsCost);
956     for(k=0; k<a2[i2].nIns; k++){
957       p = a2[i2].apIns[k];
958       updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost);
959     }
960     for(i1=0; i1<f.n; i1+=b1){
961       int cx;    /* Index of current cell */
962       int cxp;   /* Index of cell immediately to the left */
963       int cxd;   /* Index of cell to the left and one row above */
964       int cxu;   /* Index of cell immediately above */
965       b1 = f.a[i1].nByte;
966       cxp = rx + i1;
967       cx = cxp + b1;
968       cxd = rxp + i1;
969       cxu = cxd + b1;
970       updateCost(m, cx, cxp, pLang->iDelCost);
971       for(k=0; k<f.a[i1].nDel; k++){
972         p = f.a[i1].apDel[k];
973         updateCost(m, cxp+p->nFrom, cxp, p->iCost);
974       }
975       updateCost(m, cx, cxu, pLang->iInsCost);
976       if( matchFromTo(&f, i1, z2+i2, n2-i2) ){
977         updateCost(m, cx, cxd, 0);
978       }
979       updateCost(m, cx, cxd, pLang->iSubCost);
980       for(k=0; k<f.a[i1].nSubst; k++){
981         p = f.a[i1].apSubst[k];
982         if( matchTo(p, z2+i2, n2-i2) ){
983           updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
984         }
985       }
986     }
987   }
988 
989 #if 0  /* Enable for debugging */
990   printf("         ^");
991   for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
992   printf("\n   ^:");
993   for(i1=0; i1<szRow; i1++){
994     int v = m[i1];
995     if( v>9999 ) printf(" ****");
996     else         printf(" %4d", v);
997   }
998   printf("\n");
999   for(i2=0; i2<n2; i2++){
1000     printf("%c-%02x:", z2[i2], z2[i2]&0xff);
1001     for(i1=0; i1<szRow; i1++){
1002       int v = m[(i2+1)*szRow+i1];
1003       if( v>9999 ) printf(" ****");
1004       else         printf(" %4d", v);
1005     }
1006     printf("\n");
1007   }
1008 #endif
1009 
1010   /* Free memory allocations and return the result */
1011   res = (int)m[szRow*(n2+1)-1];
1012   n = n2;
1013   if( f.isPrefix ){
1014     for(i2=1; i2<=n2; i2++){
1015       int b = m[szRow*i2-1];
1016       if( b<=res ){
1017         res = b;
1018         n = i2 - 1;
1019       }
1020     }
1021   }
1022   if( pnMatch ){
1023     int nExtra = 0;
1024     for(k=0; k<n; k++){
1025       if( (z2[k] & 0xc0)==0x80 ) nExtra++;
1026     }
1027     *pnMatch = n - nExtra;
1028   }
1029 
1030 editDist3Abort:
1031   for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
1032   sqlite3_free(m);
1033   return res;
1034 }
1035 
1036 /*
1037 ** Get an appropriate EditDist3Lang object.
1038 */
1039 static const EditDist3Lang *editDist3FindLang(
1040   EditDist3Config *pConfig,
1041   int iLang
1042 ){
1043   int i;
1044   for(i=0; i<pConfig->nLang; i++){
1045     if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i];
1046   }
1047   return &editDist3Lang;
1048 }
1049 
1050 /*
1051 ** Function:    editdist3(A,B,iLang)
1052 **              editdist3(tablename)
1053 **
1054 ** Return the cost of transforming string A into string B using edit
1055 ** weights for iLang.
1056 **
1057 ** The second form loads edit weights into memory from a table.
1058 */
1059 static void editDist3SqlFunc(
1060   sqlite3_context *context,
1061   int argc,
1062   sqlite3_value **argv
1063 ){
1064   EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context);
1065   sqlite3 *db = sqlite3_context_db_handle(context);
1066   int rc;
1067   if( argc==1 ){
1068     const char *zTable = (const char*)sqlite3_value_text(argv[0]);
1069     rc = editDist3ConfigLoad(pConfig, db, zTable);
1070     if( rc ) sqlite3_result_error_code(context, rc);
1071   }else{
1072     const char *zA = (const char*)sqlite3_value_text(argv[0]);
1073     const char *zB = (const char*)sqlite3_value_text(argv[1]);
1074     int nA = sqlite3_value_bytes(argv[0]);
1075     int nB = sqlite3_value_bytes(argv[1]);
1076     int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0;
1077     const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang);
1078     EditDist3FromString *pFrom;
1079     int dist;
1080 
1081     pFrom = editDist3FromStringNew(pLang, zA, nA);
1082     if( pFrom==0 ){
1083       sqlite3_result_error_nomem(context);
1084       return;
1085     }
1086     dist = editDist3Core(pFrom, zB, nB, pLang, 0);
1087     editDist3FromStringDelete(pFrom);
1088     if( dist==(-1) ){
1089       sqlite3_result_error_nomem(context);
1090     }else{
1091       sqlite3_result_int(context, dist);
1092     }
1093   }
1094 }
1095 
1096 /*
1097 ** Register the editDist3 function with SQLite
1098 */
1099 static int editDist3Install(sqlite3 *db){
1100   int rc;
1101   EditDist3Config *pConfig = sqlite3_malloc( sizeof(*pConfig) );
1102   if( pConfig==0 ) return SQLITE_NOMEM;
1103   memset(pConfig, 0, sizeof(*pConfig));
1104   rc = sqlite3_create_function_v2(db, "editdist3",
1105               2, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1106   if( rc==SQLITE_OK ){
1107     rc = sqlite3_create_function_v2(db, "editdist3",
1108                 3, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1109   }
1110   if( rc==SQLITE_OK ){
1111     rc = sqlite3_create_function_v2(db, "editdist3",
1112                 1, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0,
1113                 editDist3ConfigDelete);
1114   }else{
1115     sqlite3_free(pConfig);
1116   }
1117   return rc;
1118 }
1119 /* End configurable cost unicode edit distance routines
1120 ******************************************************************************
1121 ******************************************************************************
1122 ** Begin transliterate unicode-to-ascii implementation
1123 */
1124 
1125 #if !SQLITE_AMALGAMATION
1126 /*
1127 ** This lookup table is used to help decode the first byte of
1128 ** a multi-byte UTF8 character.
1129 */
1130 static const unsigned char sqlite3Utf8Trans1[] = {
1131   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1132   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1133   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1134   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1135   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1136   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1137   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1138   0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
1139 };
1140 #endif
1141 
1142 /*
1143 ** Return the value of the first UTF-8 character in the string.
1144 */
1145 static int utf8Read(const unsigned char *z, int n, int *pSize){
1146   int c, i;
1147 
1148   /* All callers to this routine (in the current implementation)
1149   ** always have n>0. */
1150   if( NEVER(n==0) ){
1151     c = i = 0;
1152   }else{
1153     c = z[0];
1154     i = 1;
1155     if( c>=0xc0 ){
1156       c = sqlite3Utf8Trans1[c-0xc0];
1157       while( i<n && (z[i] & 0xc0)==0x80 ){
1158         c = (c<<6) + (0x3f & z[i++]);
1159       }
1160     }
1161   }
1162   *pSize = i;
1163   return c;
1164 }
1165 
1166 /*
1167 ** Return the number of characters in the utf-8 string in the nIn byte
1168 ** buffer pointed to by zIn.
1169 */
1170 static int utf8Charlen(const char *zIn, int nIn){
1171   int i;
1172   int nChar = 0;
1173   for(i=0; i<nIn; nChar++){
1174     int sz;
1175     utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1176     i += sz;
1177   }
1178   return nChar;
1179 }
1180 
1181 /*
1182 ** Table of translations from unicode characters into ASCII.
1183 */
1184 static const struct {
1185  unsigned short int cFrom;
1186  unsigned char cTo0, cTo1;
1187 } translit[] = {
1188   { 0x00A0,  0x20, 0x00 },  /*   to   */
1189   { 0x00B5,  0x75, 0x00 },  /* µ to u */
1190   { 0x00C0,  0x41, 0x00 },  /* À to A */
1191   { 0x00C1,  0x41, 0x00 },  /* Á to A */
1192   { 0x00C2,  0x41, 0x00 },  /* Â to A */
1193   { 0x00C3,  0x41, 0x00 },  /* Ã to A */
1194   { 0x00C4,  0x41, 0x65 },  /* Ä to Ae */
1195   { 0x00C5,  0x41, 0x61 },  /* Å to Aa */
1196   { 0x00C6,  0x41, 0x45 },  /* Æ to AE */
1197   { 0x00C7,  0x43, 0x00 },  /* Ç to C */
1198   { 0x00C8,  0x45, 0x00 },  /* È to E */
1199   { 0x00C9,  0x45, 0x00 },  /* É to E */
1200   { 0x00CA,  0x45, 0x00 },  /* Ê to E */
1201   { 0x00CB,  0x45, 0x00 },  /* Ë to E */
1202   { 0x00CC,  0x49, 0x00 },  /* Ì to I */
1203   { 0x00CD,  0x49, 0x00 },  /* Í to I */
1204   { 0x00CE,  0x49, 0x00 },  /* Î to I */
1205   { 0x00CF,  0x49, 0x00 },  /* Ï to I */
1206   { 0x00D0,  0x44, 0x00 },  /* Ð to D */
1207   { 0x00D1,  0x4E, 0x00 },  /* Ñ to N */
1208   { 0x00D2,  0x4F, 0x00 },  /* Ò to O */
1209   { 0x00D3,  0x4F, 0x00 },  /* Ó to O */
1210   { 0x00D4,  0x4F, 0x00 },  /* Ô to O */
1211   { 0x00D5,  0x4F, 0x00 },  /* Õ to O */
1212   { 0x00D6,  0x4F, 0x65 },  /* Ö to Oe */
1213   { 0x00D7,  0x78, 0x00 },  /* × to x */
1214   { 0x00D8,  0x4F, 0x00 },  /* Ø to O */
1215   { 0x00D9,  0x55, 0x00 },  /* Ù to U */
1216   { 0x00DA,  0x55, 0x00 },  /* Ú to U */
1217   { 0x00DB,  0x55, 0x00 },  /* Û to U */
1218   { 0x00DC,  0x55, 0x65 },  /* Ü to Ue */
1219   { 0x00DD,  0x59, 0x00 },  /* Ý to Y */
1220   { 0x00DE,  0x54, 0x68 },  /* Þ to Th */
1221   { 0x00DF,  0x73, 0x73 },  /* ß to ss */
1222   { 0x00E0,  0x61, 0x00 },  /* à to a */
1223   { 0x00E1,  0x61, 0x00 },  /* á to a */
1224   { 0x00E2,  0x61, 0x00 },  /* â to a */
1225   { 0x00E3,  0x61, 0x00 },  /* ã to a */
1226   { 0x00E4,  0x61, 0x65 },  /* ä to ae */
1227   { 0x00E5,  0x61, 0x61 },  /* å to aa */
1228   { 0x00E6,  0x61, 0x65 },  /* æ to ae */
1229   { 0x00E7,  0x63, 0x00 },  /* ç to c */
1230   { 0x00E8,  0x65, 0x00 },  /* è to e */
1231   { 0x00E9,  0x65, 0x00 },  /* é to e */
1232   { 0x00EA,  0x65, 0x00 },  /* ê to e */
1233   { 0x00EB,  0x65, 0x00 },  /* ë to e */
1234   { 0x00EC,  0x69, 0x00 },  /* ì to i */
1235   { 0x00ED,  0x69, 0x00 },  /* í to i */
1236   { 0x00EE,  0x69, 0x00 },  /* î to i */
1237   { 0x00EF,  0x69, 0x00 },  /* ï to i */
1238   { 0x00F0,  0x64, 0x00 },  /* ð to d */
1239   { 0x00F1,  0x6E, 0x00 },  /* ñ to n */
1240   { 0x00F2,  0x6F, 0x00 },  /* ò to o */
1241   { 0x00F3,  0x6F, 0x00 },  /* ó to o */
1242   { 0x00F4,  0x6F, 0x00 },  /* ô to o */
1243   { 0x00F5,  0x6F, 0x00 },  /* õ to o */
1244   { 0x00F6,  0x6F, 0x65 },  /* ö to oe */
1245   { 0x00F7,  0x3A, 0x00 },  /* ÷ to : */
1246   { 0x00F8,  0x6F, 0x00 },  /* ø to o */
1247   { 0x00F9,  0x75, 0x00 },  /* ù to u */
1248   { 0x00FA,  0x75, 0x00 },  /* ú to u */
1249   { 0x00FB,  0x75, 0x00 },  /* û to u */
1250   { 0x00FC,  0x75, 0x65 },  /* ü to ue */
1251   { 0x00FD,  0x79, 0x00 },  /* ý to y */
1252   { 0x00FE,  0x74, 0x68 },  /* þ to th */
1253   { 0x00FF,  0x79, 0x00 },  /* ÿ to y */
1254   { 0x0100,  0x41, 0x00 },  /* Ā to A */
1255   { 0x0101,  0x61, 0x00 },  /* ā to a */
1256   { 0x0102,  0x41, 0x00 },  /* Ă to A */
1257   { 0x0103,  0x61, 0x00 },  /* ă to a */
1258   { 0x0104,  0x41, 0x00 },  /* Ą to A */
1259   { 0x0105,  0x61, 0x00 },  /* ą to a */
1260   { 0x0106,  0x43, 0x00 },  /* Ć to C */
1261   { 0x0107,  0x63, 0x00 },  /* ć to c */
1262   { 0x0108,  0x43, 0x68 },  /* Ĉ to Ch */
1263   { 0x0109,  0x63, 0x68 },  /* ĉ to ch */
1264   { 0x010A,  0x43, 0x00 },  /* Ċ to C */
1265   { 0x010B,  0x63, 0x00 },  /* ċ to c */
1266   { 0x010C,  0x43, 0x00 },  /* Č to C */
1267   { 0x010D,  0x63, 0x00 },  /* č to c */
1268   { 0x010E,  0x44, 0x00 },  /* Ď to D */
1269   { 0x010F,  0x64, 0x00 },  /* ď to d */
1270   { 0x0110,  0x44, 0x00 },  /* Đ to D */
1271   { 0x0111,  0x64, 0x00 },  /* đ to d */
1272   { 0x0112,  0x45, 0x00 },  /* Ē to E */
1273   { 0x0113,  0x65, 0x00 },  /* ē to e */
1274   { 0x0114,  0x45, 0x00 },  /* Ĕ to E */
1275   { 0x0115,  0x65, 0x00 },  /* ĕ to e */
1276   { 0x0116,  0x45, 0x00 },  /* Ė to E */
1277   { 0x0117,  0x65, 0x00 },  /* ė to e */
1278   { 0x0118,  0x45, 0x00 },  /* Ę to E */
1279   { 0x0119,  0x65, 0x00 },  /* ę to e */
1280   { 0x011A,  0x45, 0x00 },  /* Ě to E */
1281   { 0x011B,  0x65, 0x00 },  /* ě to e */
1282   { 0x011C,  0x47, 0x68 },  /* Ĝ to Gh */
1283   { 0x011D,  0x67, 0x68 },  /* ĝ to gh */
1284   { 0x011E,  0x47, 0x00 },  /* Ğ to G */
1285   { 0x011F,  0x67, 0x00 },  /* ğ to g */
1286   { 0x0120,  0x47, 0x00 },  /* Ġ to G */
1287   { 0x0121,  0x67, 0x00 },  /* ġ to g */
1288   { 0x0122,  0x47, 0x00 },  /* Ģ to G */
1289   { 0x0123,  0x67, 0x00 },  /* ģ to g */
1290   { 0x0124,  0x48, 0x68 },  /* Ĥ to Hh */
1291   { 0x0125,  0x68, 0x68 },  /* ĥ to hh */
1292   { 0x0126,  0x48, 0x00 },  /* Ħ to H */
1293   { 0x0127,  0x68, 0x00 },  /* ħ to h */
1294   { 0x0128,  0x49, 0x00 },  /* Ĩ to I */
1295   { 0x0129,  0x69, 0x00 },  /* ĩ to i */
1296   { 0x012A,  0x49, 0x00 },  /* Ī to I */
1297   { 0x012B,  0x69, 0x00 },  /* ī to i */
1298   { 0x012C,  0x49, 0x00 },  /* Ĭ to I */
1299   { 0x012D,  0x69, 0x00 },  /* ĭ to i */
1300   { 0x012E,  0x49, 0x00 },  /* Į to I */
1301   { 0x012F,  0x69, 0x00 },  /* į to i */
1302   { 0x0130,  0x49, 0x00 },  /* İ to I */
1303   { 0x0131,  0x69, 0x00 },  /* ı to i */
1304   { 0x0132,  0x49, 0x4A },  /* IJ to IJ */
1305   { 0x0133,  0x69, 0x6A },  /* ij to ij */
1306   { 0x0134,  0x4A, 0x68 },  /* Ĵ to Jh */
1307   { 0x0135,  0x6A, 0x68 },  /* ĵ to jh */
1308   { 0x0136,  0x4B, 0x00 },  /* Ķ to K */
1309   { 0x0137,  0x6B, 0x00 },  /* ķ to k */
1310   { 0x0138,  0x6B, 0x00 },  /* ĸ to k */
1311   { 0x0139,  0x4C, 0x00 },  /* Ĺ to L */
1312   { 0x013A,  0x6C, 0x00 },  /* ĺ to l */
1313   { 0x013B,  0x4C, 0x00 },  /* Ļ to L */
1314   { 0x013C,  0x6C, 0x00 },  /* ļ to l */
1315   { 0x013D,  0x4C, 0x00 },  /* Ľ to L */
1316   { 0x013E,  0x6C, 0x00 },  /* ľ to l */
1317   { 0x013F,  0x4C, 0x2E },  /* Ŀ to L. */
1318   { 0x0140,  0x6C, 0x2E },  /* ŀ to l. */
1319   { 0x0141,  0x4C, 0x00 },  /* Ł to L */
1320   { 0x0142,  0x6C, 0x00 },  /* ł to l */
1321   { 0x0143,  0x4E, 0x00 },  /* Ń to N */
1322   { 0x0144,  0x6E, 0x00 },  /* ń to n */
1323   { 0x0145,  0x4E, 0x00 },  /* Ņ to N */
1324   { 0x0146,  0x6E, 0x00 },  /* ņ to n */
1325   { 0x0147,  0x4E, 0x00 },  /* Ň to N */
1326   { 0x0148,  0x6E, 0x00 },  /* ň to n */
1327   { 0x0149,  0x27, 0x6E },  /* ʼn to 'n */
1328   { 0x014A,  0x4E, 0x47 },  /* Ŋ to NG */
1329   { 0x014B,  0x6E, 0x67 },  /* ŋ to ng */
1330   { 0x014C,  0x4F, 0x00 },  /* Ō to O */
1331   { 0x014D,  0x6F, 0x00 },  /* ō to o */
1332   { 0x014E,  0x4F, 0x00 },  /* Ŏ to O */
1333   { 0x014F,  0x6F, 0x00 },  /* ŏ to o */
1334   { 0x0150,  0x4F, 0x00 },  /* Ő to O */
1335   { 0x0151,  0x6F, 0x00 },  /* ő to o */
1336   { 0x0152,  0x4F, 0x45 },  /* Œ to OE */
1337   { 0x0153,  0x6F, 0x65 },  /* œ to oe */
1338   { 0x0154,  0x52, 0x00 },  /* Ŕ to R */
1339   { 0x0155,  0x72, 0x00 },  /* ŕ to r */
1340   { 0x0156,  0x52, 0x00 },  /* Ŗ to R */
1341   { 0x0157,  0x72, 0x00 },  /* ŗ to r */
1342   { 0x0158,  0x52, 0x00 },  /* Ř to R */
1343   { 0x0159,  0x72, 0x00 },  /* ř to r */
1344   { 0x015A,  0x53, 0x00 },  /* Ś to S */
1345   { 0x015B,  0x73, 0x00 },  /* ś to s */
1346   { 0x015C,  0x53, 0x68 },  /* Ŝ to Sh */
1347   { 0x015D,  0x73, 0x68 },  /* ŝ to sh */
1348   { 0x015E,  0x53, 0x00 },  /* Ş to S */
1349   { 0x015F,  0x73, 0x00 },  /* ş to s */
1350   { 0x0160,  0x53, 0x00 },  /* Š to S */
1351   { 0x0161,  0x73, 0x00 },  /* š to s */
1352   { 0x0162,  0x54, 0x00 },  /* Ţ to T */
1353   { 0x0163,  0x74, 0x00 },  /* ţ to t */
1354   { 0x0164,  0x54, 0x00 },  /* Ť to T */
1355   { 0x0165,  0x74, 0x00 },  /* ť to t */
1356   { 0x0166,  0x54, 0x00 },  /* Ŧ to T */
1357   { 0x0167,  0x74, 0x00 },  /* ŧ to t */
1358   { 0x0168,  0x55, 0x00 },  /* Ũ to U */
1359   { 0x0169,  0x75, 0x00 },  /* ũ to u */
1360   { 0x016A,  0x55, 0x00 },  /* Ū to U */
1361   { 0x016B,  0x75, 0x00 },  /* ū to u */
1362   { 0x016C,  0x55, 0x00 },  /* Ŭ to U */
1363   { 0x016D,  0x75, 0x00 },  /* ŭ to u */
1364   { 0x016E,  0x55, 0x00 },  /* Ů to U */
1365   { 0x016F,  0x75, 0x00 },  /* ů to u */
1366   { 0x0170,  0x55, 0x00 },  /* Ű to U */
1367   { 0x0171,  0x75, 0x00 },  /* ű to u */
1368   { 0x0172,  0x55, 0x00 },  /* Ų to U */
1369   { 0x0173,  0x75, 0x00 },  /* ų to u */
1370   { 0x0174,  0x57, 0x00 },  /* Ŵ to W */
1371   { 0x0175,  0x77, 0x00 },  /* ŵ to w */
1372   { 0x0176,  0x59, 0x00 },  /* Ŷ to Y */
1373   { 0x0177,  0x79, 0x00 },  /* ŷ to y */
1374   { 0x0178,  0x59, 0x00 },  /* Ÿ to Y */
1375   { 0x0179,  0x5A, 0x00 },  /* Ź to Z */
1376   { 0x017A,  0x7A, 0x00 },  /* ź to z */
1377   { 0x017B,  0x5A, 0x00 },  /* Ż to Z */
1378   { 0x017C,  0x7A, 0x00 },  /* ż to z */
1379   { 0x017D,  0x5A, 0x00 },  /* Ž to Z */
1380   { 0x017E,  0x7A, 0x00 },  /* ž to z */
1381   { 0x017F,  0x73, 0x00 },  /* ſ to s */
1382   { 0x0192,  0x66, 0x00 },  /* ƒ to f */
1383   { 0x0218,  0x53, 0x00 },  /* Ș to S */
1384   { 0x0219,  0x73, 0x00 },  /* ș to s */
1385   { 0x021A,  0x54, 0x00 },  /* Ț to T */
1386   { 0x021B,  0x74, 0x00 },  /* ț to t */
1387   { 0x0386,  0x41, 0x00 },  /* Ά to A */
1388   { 0x0388,  0x45, 0x00 },  /* Έ to E */
1389   { 0x0389,  0x49, 0x00 },  /* Ή to I */
1390   { 0x038A,  0x49, 0x00 },  /* Ί to I */
1391   { 0x038C,  0x4f, 0x00 },  /* Ό to O */
1392   { 0x038E,  0x59, 0x00 },  /* Ύ to Y */
1393   { 0x038F,  0x4f, 0x00 },  /* Ώ to O */
1394   { 0x0390,  0x69, 0x00 },  /* ΐ to i */
1395   { 0x0391,  0x41, 0x00 },  /* Α to A */
1396   { 0x0392,  0x42, 0x00 },  /* Β to B */
1397   { 0x0393,  0x47, 0x00 },  /* Γ to G */
1398   { 0x0394,  0x44, 0x00 },  /* Δ to D */
1399   { 0x0395,  0x45, 0x00 },  /* Ε to E */
1400   { 0x0396,  0x5a, 0x00 },  /* Ζ to Z */
1401   { 0x0397,  0x49, 0x00 },  /* Η to I */
1402   { 0x0398,  0x54, 0x68 },  /* Θ to Th */
1403   { 0x0399,  0x49, 0x00 },  /* Ι to I */
1404   { 0x039A,  0x4b, 0x00 },  /* Κ to K */
1405   { 0x039B,  0x4c, 0x00 },  /* Λ to L */
1406   { 0x039C,  0x4d, 0x00 },  /* Μ to M */
1407   { 0x039D,  0x4e, 0x00 },  /* Ν to N */
1408   { 0x039E,  0x58, 0x00 },  /* Ξ to X */
1409   { 0x039F,  0x4f, 0x00 },  /* Ο to O */
1410   { 0x03A0,  0x50, 0x00 },  /* Π to P */
1411   { 0x03A1,  0x52, 0x00 },  /* Ρ to R */
1412   { 0x03A3,  0x53, 0x00 },  /* Σ to S */
1413   { 0x03A4,  0x54, 0x00 },  /* Τ to T */
1414   { 0x03A5,  0x59, 0x00 },  /* Υ to Y */
1415   { 0x03A6,  0x46, 0x00 },  /* Φ to F */
1416   { 0x03A7,  0x43, 0x68 },  /* Χ to Ch */
1417   { 0x03A8,  0x50, 0x73 },  /* Ψ to Ps */
1418   { 0x03A9,  0x4f, 0x00 },  /* Ω to O */
1419   { 0x03AA,  0x49, 0x00 },  /* Ϊ to I */
1420   { 0x03AB,  0x59, 0x00 },  /* Ϋ to Y */
1421   { 0x03AC,  0x61, 0x00 },  /* ά to a */
1422   { 0x03AD,  0x65, 0x00 },  /* έ to e */
1423   { 0x03AE,  0x69, 0x00 },  /* ή to i */
1424   { 0x03AF,  0x69, 0x00 },  /* ί to i */
1425   { 0x03B1,  0x61, 0x00 },  /* α to a */
1426   { 0x03B2,  0x62, 0x00 },  /* β to b */
1427   { 0x03B3,  0x67, 0x00 },  /* γ to g */
1428   { 0x03B4,  0x64, 0x00 },  /* δ to d */
1429   { 0x03B5,  0x65, 0x00 },  /* ε to e */
1430   { 0x03B6,  0x7a, 0x00 },  /* ζ to z */
1431   { 0x03B7,  0x69, 0x00 },  /* η to i */
1432   { 0x03B8,  0x74, 0x68 },  /* θ to th */
1433   { 0x03B9,  0x69, 0x00 },  /* ι to i */
1434   { 0x03BA,  0x6b, 0x00 },  /* κ to k */
1435   { 0x03BB,  0x6c, 0x00 },  /* λ to l */
1436   { 0x03BC,  0x6d, 0x00 },  /* μ to m */
1437   { 0x03BD,  0x6e, 0x00 },  /* ν to n */
1438   { 0x03BE,  0x78, 0x00 },  /* ξ to x */
1439   { 0x03BF,  0x6f, 0x00 },  /* ο to o */
1440   { 0x03C0,  0x70, 0x00 },  /* π to p */
1441   { 0x03C1,  0x72, 0x00 },  /* ρ to r */
1442   { 0x03C3,  0x73, 0x00 },  /* σ to s */
1443   { 0x03C4,  0x74, 0x00 },  /* τ to t */
1444   { 0x03C5,  0x79, 0x00 },  /* υ to y */
1445   { 0x03C6,  0x66, 0x00 },  /* φ to f */
1446   { 0x03C7,  0x63, 0x68 },  /* χ to ch */
1447   { 0x03C8,  0x70, 0x73 },  /* ψ to ps */
1448   { 0x03C9,  0x6f, 0x00 },  /* ω to o */
1449   { 0x03CA,  0x69, 0x00 },  /* ϊ to i */
1450   { 0x03CB,  0x79, 0x00 },  /* ϋ to y */
1451   { 0x03CC,  0x6f, 0x00 },  /* ό to o */
1452   { 0x03CD,  0x79, 0x00 },  /* ύ to y */
1453   { 0x03CE,  0x69, 0x00 },  /* ώ to i */
1454   { 0x0400,  0x45, 0x00 },  /* Ѐ to E */
1455   { 0x0401,  0x45, 0x00 },  /* Ё to E */
1456   { 0x0402,  0x44, 0x00 },  /* Ђ to D */
1457   { 0x0403,  0x47, 0x00 },  /* Ѓ to G */
1458   { 0x0404,  0x45, 0x00 },  /* Є to E */
1459   { 0x0405,  0x5a, 0x00 },  /* Ѕ to Z */
1460   { 0x0406,  0x49, 0x00 },  /* І to I */
1461   { 0x0407,  0x49, 0x00 },  /* Ї to I */
1462   { 0x0408,  0x4a, 0x00 },  /* Ј to J */
1463   { 0x0409,  0x49, 0x00 },  /* Љ to I */
1464   { 0x040A,  0x4e, 0x00 },  /* Њ to N */
1465   { 0x040B,  0x44, 0x00 },  /* Ћ to D */
1466   { 0x040C,  0x4b, 0x00 },  /* Ќ to K */
1467   { 0x040D,  0x49, 0x00 },  /* Ѝ to I */
1468   { 0x040E,  0x55, 0x00 },  /* Ў to U */
1469   { 0x040F,  0x44, 0x00 },  /* Џ to D */
1470   { 0x0410,  0x41, 0x00 },  /* А to A */
1471   { 0x0411,  0x42, 0x00 },  /* Б to B */
1472   { 0x0412,  0x56, 0x00 },  /* В to V */
1473   { 0x0413,  0x47, 0x00 },  /* Г to G */
1474   { 0x0414,  0x44, 0x00 },  /* Д to D */
1475   { 0x0415,  0x45, 0x00 },  /* Е to E */
1476   { 0x0416,  0x5a, 0x68 },  /* Ж to Zh */
1477   { 0x0417,  0x5a, 0x00 },  /* З to Z */
1478   { 0x0418,  0x49, 0x00 },  /* И to I */
1479   { 0x0419,  0x49, 0x00 },  /* Й to I */
1480   { 0x041A,  0x4b, 0x00 },  /* К to K */
1481   { 0x041B,  0x4c, 0x00 },  /* Л to L */
1482   { 0x041C,  0x4d, 0x00 },  /* М to M */
1483   { 0x041D,  0x4e, 0x00 },  /* Н to N */
1484   { 0x041E,  0x4f, 0x00 },  /* О to O */
1485   { 0x041F,  0x50, 0x00 },  /* П to P */
1486   { 0x0420,  0x52, 0x00 },  /* Р to R */
1487   { 0x0421,  0x53, 0x00 },  /* С to S */
1488   { 0x0422,  0x54, 0x00 },  /* Т to T */
1489   { 0x0423,  0x55, 0x00 },  /* У to U */
1490   { 0x0424,  0x46, 0x00 },  /* Ф to F */
1491   { 0x0425,  0x4b, 0x68 },  /* Х to Kh */
1492   { 0x0426,  0x54, 0x63 },  /* Ц to Tc */
1493   { 0x0427,  0x43, 0x68 },  /* Ч to Ch */
1494   { 0x0428,  0x53, 0x68 },  /* Ш to Sh */
1495   { 0x0429,  0x53, 0x68 },  /* Щ to Shch */
1496   { 0x042A,  0x61, 0x00 },  /*  to A */
1497   { 0x042B,  0x59, 0x00 },  /* Ы to Y */
1498   { 0x042C,  0x59, 0x00 },  /*  to Y */
1499   { 0x042D,  0x45, 0x00 },  /* Э to E */
1500   { 0x042E,  0x49, 0x75 },  /* Ю to Iu */
1501   { 0x042F,  0x49, 0x61 },  /* Я to Ia */
1502   { 0x0430,  0x61, 0x00 },  /* а to a */
1503   { 0x0431,  0x62, 0x00 },  /* б to b */
1504   { 0x0432,  0x76, 0x00 },  /* в to v */
1505   { 0x0433,  0x67, 0x00 },  /* г to g */
1506   { 0x0434,  0x64, 0x00 },  /* д to d */
1507   { 0x0435,  0x65, 0x00 },  /* е to e */
1508   { 0x0436,  0x7a, 0x68 },  /* ж to zh */
1509   { 0x0437,  0x7a, 0x00 },  /* з to z */
1510   { 0x0438,  0x69, 0x00 },  /* и to i */
1511   { 0x0439,  0x69, 0x00 },  /* й to i */
1512   { 0x043A,  0x6b, 0x00 },  /* к to k */
1513   { 0x043B,  0x6c, 0x00 },  /* л to l */
1514   { 0x043C,  0x6d, 0x00 },  /* м to m */
1515   { 0x043D,  0x6e, 0x00 },  /* н to n */
1516   { 0x043E,  0x6f, 0x00 },  /* о to o */
1517   { 0x043F,  0x70, 0x00 },  /* п to p */
1518   { 0x0440,  0x72, 0x00 },  /* р to r */
1519   { 0x0441,  0x73, 0x00 },  /* с to s */
1520   { 0x0442,  0x74, 0x00 },  /* т to t */
1521   { 0x0443,  0x75, 0x00 },  /* у to u */
1522   { 0x0444,  0x66, 0x00 },  /* ф to f */
1523   { 0x0445,  0x6b, 0x68 },  /* х to kh */
1524   { 0x0446,  0x74, 0x63 },  /* ц to tc */
1525   { 0x0447,  0x63, 0x68 },  /* ч to ch */
1526   { 0x0448,  0x73, 0x68 },  /* ш to sh */
1527   { 0x0449,  0x73, 0x68 },  /* щ to shch */
1528   { 0x044A,  0x61, 0x00 },  /*  to a */
1529   { 0x044B,  0x79, 0x00 },  /* ы to y */
1530   { 0x044C,  0x79, 0x00 },  /*  to y */
1531   { 0x044D,  0x65, 0x00 },  /* э to e */
1532   { 0x044E,  0x69, 0x75 },  /* ю to iu */
1533   { 0x044F,  0x69, 0x61 },  /* я to ia */
1534   { 0x0450,  0x65, 0x00 },  /* ѐ to e */
1535   { 0x0451,  0x65, 0x00 },  /* ё to e */
1536   { 0x0452,  0x64, 0x00 },  /* ђ to d */
1537   { 0x0453,  0x67, 0x00 },  /* ѓ to g */
1538   { 0x0454,  0x65, 0x00 },  /* є to e */
1539   { 0x0455,  0x7a, 0x00 },  /* ѕ to z */
1540   { 0x0456,  0x69, 0x00 },  /* і to i */
1541   { 0x0457,  0x69, 0x00 },  /* ї to i */
1542   { 0x0458,  0x6a, 0x00 },  /* ј to j */
1543   { 0x0459,  0x69, 0x00 },  /* љ to i */
1544   { 0x045A,  0x6e, 0x00 },  /* њ to n */
1545   { 0x045B,  0x64, 0x00 },  /* ћ to d */
1546   { 0x045C,  0x6b, 0x00 },  /* ќ to k */
1547   { 0x045D,  0x69, 0x00 },  /* ѝ to i */
1548   { 0x045E,  0x75, 0x00 },  /* ў to u */
1549   { 0x045F,  0x64, 0x00 },  /* џ to d */
1550   { 0x1E02,  0x42, 0x00 },  /* Ḃ to B */
1551   { 0x1E03,  0x62, 0x00 },  /* ḃ to b */
1552   { 0x1E0A,  0x44, 0x00 },  /* Ḋ to D */
1553   { 0x1E0B,  0x64, 0x00 },  /* ḋ to d */
1554   { 0x1E1E,  0x46, 0x00 },  /* Ḟ to F */
1555   { 0x1E1F,  0x66, 0x00 },  /* ḟ to f */
1556   { 0x1E40,  0x4D, 0x00 },  /* Ṁ to M */
1557   { 0x1E41,  0x6D, 0x00 },  /* ṁ to m */
1558   { 0x1E56,  0x50, 0x00 },  /* Ṗ to P */
1559   { 0x1E57,  0x70, 0x00 },  /* ṗ to p */
1560   { 0x1E60,  0x53, 0x00 },  /* Ṡ to S */
1561   { 0x1E61,  0x73, 0x00 },  /* ṡ to s */
1562   { 0x1E6A,  0x54, 0x00 },  /* Ṫ to T */
1563   { 0x1E6B,  0x74, 0x00 },  /* ṫ to t */
1564   { 0x1E80,  0x57, 0x00 },  /* Ẁ to W */
1565   { 0x1E81,  0x77, 0x00 },  /* ẁ to w */
1566   { 0x1E82,  0x57, 0x00 },  /* Ẃ to W */
1567   { 0x1E83,  0x77, 0x00 },  /* ẃ to w */
1568   { 0x1E84,  0x57, 0x00 },  /* Ẅ to W */
1569   { 0x1E85,  0x77, 0x00 },  /* ẅ to w */
1570   { 0x1EF2,  0x59, 0x00 },  /* Ỳ to Y */
1571   { 0x1EF3,  0x79, 0x00 },  /* ỳ to y */
1572   { 0xFB00,  0x66, 0x66 },  /* ff to ff */
1573   { 0xFB01,  0x66, 0x69 },  /* fi to fi */
1574   { 0xFB02,  0x66, 0x6C },  /* fl to fl */
1575   { 0xFB05,  0x73, 0x74 },  /* ſt to st */
1576   { 0xFB06,  0x73, 0x74 },  /* st to st */
1577 };
1578 
1579 /*
1580 ** Convert the input string from UTF-8 into pure ASCII by converting
1581 ** all non-ASCII characters to some combination of characters in the
1582 ** ASCII subset.
1583 **
1584 ** The returned string might contain more characters than the input.
1585 **
1586 ** Space to hold the returned string comes from sqlite3_malloc() and
1587 ** should be freed by the caller.
1588 */
1589 static unsigned char *transliterate(const unsigned char *zIn, int nIn){
1590   unsigned char *zOut = sqlite3_malloc( nIn*4 + 1 );
1591   int c, sz, nOut;
1592   if( zOut==0 ) return 0;
1593   nOut = 0;
1594   while( nIn>0 ){
1595     c = utf8Read(zIn, nIn, &sz);
1596     zIn += sz;
1597     nIn -= sz;
1598     if( c<=127 ){
1599       zOut[nOut++] = c;
1600     }else{
1601       int xTop, xBtm, x;
1602       xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1603       xBtm = 0;
1604       while( xTop>=xBtm ){
1605         x = (xTop + xBtm)/2;
1606         if( translit[x].cFrom==c ){
1607           zOut[nOut++] = translit[x].cTo0;
1608           if( translit[x].cTo1 ){
1609             zOut[nOut++] = translit[x].cTo1;
1610             /* Add an extra "ch" after the "sh" for Щ and щ */
1611             if( c==0x0429 || c== 0x0449 ){
1612               zOut[nOut++] = 'c';
1613               zOut[nOut++] = 'h';
1614             }
1615           }
1616           c = 0;
1617           break;
1618         }else if( translit[x].cFrom>c ){
1619           xTop = x-1;
1620         }else{
1621           xBtm = x+1;
1622         }
1623       }
1624       if( c ) zOut[nOut++] = '?';
1625     }
1626   }
1627   zOut[nOut] = 0;
1628   return zOut;
1629 }
1630 
1631 /*
1632 ** Return the number of characters in the shortest prefix of the input
1633 ** string that transliterates to an ASCII string nTrans bytes or longer.
1634 ** Or, if the transliteration of the input string is less than nTrans
1635 ** bytes in size, return the number of characters in the input string.
1636 */
1637 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
1638   int i, c, sz, nOut;
1639   int nChar;
1640 
1641   i = nOut = 0;
1642   for(nChar=0; i<nIn && nOut<nTrans; nChar++){
1643     c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1644     i += sz;
1645 
1646     nOut++;
1647     if( c>=128 ){
1648       int xTop, xBtm, x;
1649       xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1650       xBtm = 0;
1651       while( xTop>=xBtm ){
1652         x = (xTop + xBtm)/2;
1653         if( translit[x].cFrom==c ){
1654           if( translit[x].cTo1 ) nOut++;
1655           if( c==0x0429 || c== 0x0449 ) nOut += 2;
1656           break;
1657         }else if( translit[x].cFrom>c ){
1658           xTop = x-1;
1659         }else{
1660           xBtm = x+1;
1661         }
1662       }
1663     }
1664   }
1665 
1666   return nChar;
1667 }
1668 
1669 
1670 /*
1671 **    spellfix1_translit(X)
1672 **
1673 ** Convert a string that contains non-ASCII Roman characters into
1674 ** pure ASCII.
1675 */
1676 static void transliterateSqlFunc(
1677   sqlite3_context *context,
1678   int argc,
1679   sqlite3_value **argv
1680 ){
1681   const unsigned char *zIn = sqlite3_value_text(argv[0]);
1682   int nIn = sqlite3_value_bytes(argv[0]);
1683   unsigned char *zOut = transliterate(zIn, nIn);
1684   if( zOut==0 ){
1685     sqlite3_result_error_nomem(context);
1686   }else{
1687     sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1688   }
1689 }
1690 
1691 /*
1692 **    spellfix1_scriptcode(X)
1693 **
1694 ** Try to determine the dominant script used by the word X and return
1695 ** its ISO 15924 numeric code.
1696 **
1697 ** The current implementation only understands the following scripts:
1698 **
1699 **    215  (Latin)
1700 **    220  (Cyrillic)
1701 **    200  (Greek)
1702 **
1703 ** This routine will return 998 if the input X contains characters from
1704 ** two or more of the above scripts or 999 if X contains no characters
1705 ** from any of the above scripts.
1706 */
1707 static void scriptCodeSqlFunc(
1708   sqlite3_context *context,
1709   int argc,
1710   sqlite3_value **argv
1711 ){
1712   const unsigned char *zIn = sqlite3_value_text(argv[0]);
1713   int nIn = sqlite3_value_bytes(argv[0]);
1714   int c, sz;
1715   int scriptMask = 0;
1716   int res;
1717 # define SCRIPT_LATIN       0x0001
1718 # define SCRIPT_CYRILLIC    0x0002
1719 # define SCRIPT_GREEK       0x0004
1720 
1721   while( nIn>0 ){
1722     c = utf8Read(zIn, nIn, &sz);
1723     zIn += sz;
1724     nIn -= sz;
1725     if( c<0x02af ){
1726       scriptMask |= SCRIPT_LATIN;
1727     }else if( c>=0x0400 && c<=0x04ff ){
1728       scriptMask |= SCRIPT_CYRILLIC;
1729     }else if( c>=0x0386 && c<=0x03ce ){
1730       scriptMask |= SCRIPT_GREEK;
1731     }
1732   }
1733   switch( scriptMask ){
1734     case 0:                res = 999; break;
1735     case SCRIPT_LATIN:     res = 215; break;
1736     case SCRIPT_CYRILLIC:  res = 220; break;
1737     case SCRIPT_GREEK:     res = 200; break;
1738     default:               res = 998; break;
1739   }
1740   sqlite3_result_int(context, res);
1741 }
1742 
1743 /* End transliterate
1744 ******************************************************************************
1745 ******************************************************************************
1746 ** Begin spellfix1 virtual table.
1747 */
1748 
1749 /* Maximum length of a phonehash used for querying the shadow table */
1750 #define SPELLFIX_MX_HASH  8
1751 
1752 /* Maximum number of hash strings to examine per query */
1753 #define SPELLFIX_MX_RUN   1
1754 
1755 typedef struct spellfix1_vtab spellfix1_vtab;
1756 typedef struct spellfix1_cursor spellfix1_cursor;
1757 
1758 /* Fuzzy-search virtual table object */
1759 struct spellfix1_vtab {
1760   sqlite3_vtab base;         /* Base class - must be first */
1761   sqlite3 *db;               /* Database connection */
1762   char *zDbName;             /* Name of database holding this table */
1763   char *zTableName;          /* Name of the virtual table */
1764   char *zCostTable;          /* Table holding edit-distance cost numbers */
1765   EditDist3Config *pConfig3; /* Parsed edit distance costs */
1766 };
1767 
1768 /* Fuzzy-search cursor object */
1769 struct spellfix1_cursor {
1770   sqlite3_vtab_cursor base;    /* Base class - must be first */
1771   spellfix1_vtab *pVTab;       /* The table to which this cursor belongs */
1772   char *zPattern;              /* rhs of MATCH clause */
1773   int nRow;                    /* Number of rows of content */
1774   int nAlloc;                  /* Number of allocated rows */
1775   int iRow;                    /* Current row of content */
1776   int iLang;                   /* Value of the langid= constraint */
1777   int iTop;                    /* Value of the top= constraint */
1778   int iScope;                  /* Value of the scope= constraint */
1779   int nSearch;                 /* Number of vocabulary items checked */
1780   sqlite3_stmt *pFullScan;     /* Shadow query for a full table scan */
1781   struct spellfix1_row {       /* For each row of content */
1782     sqlite3_int64 iRowid;         /* Rowid for this row */
1783     char *zWord;                  /* Text for this row */
1784     int iRank;                    /* Rank for this row */
1785     int iDistance;                /* Distance from pattern for this row */
1786     int iScore;                   /* Score for sorting */
1787     int iMatchlen;                /* Value of matchlen column (or -1) */
1788     char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
1789   } *a;
1790 };
1791 
1792 /*
1793 ** Construct one or more SQL statements from the format string given
1794 ** and then evaluate those statements. The success code is written
1795 ** into *pRc.
1796 **
1797 ** If *pRc is initially non-zero then this routine is a no-op.
1798 */
1799 static void spellfix1DbExec(
1800   int *pRc,              /* Success code */
1801   sqlite3 *db,           /* Database in which to run SQL */
1802   const char *zFormat,   /* Format string for SQL */
1803   ...                    /* Arguments to the format string */
1804 ){
1805   va_list ap;
1806   char *zSql;
1807   if( *pRc ) return;
1808   va_start(ap, zFormat);
1809   zSql = sqlite3_vmprintf(zFormat, ap);
1810   va_end(ap);
1811   if( zSql==0 ){
1812     *pRc = SQLITE_NOMEM;
1813   }else{
1814     *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
1815     sqlite3_free(zSql);
1816   }
1817 }
1818 
1819 /*
1820 ** xDisconnect/xDestroy method for the fuzzy-search module.
1821 */
1822 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){
1823   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1824   int rc = SQLITE_OK;
1825   if( isDestroy ){
1826     sqlite3 *db = p->db;
1827     spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1828                   p->zDbName, p->zTableName);
1829   }
1830   if( rc==SQLITE_OK ){
1831     sqlite3_free(p->zTableName);
1832     editDist3ConfigDelete(p->pConfig3);
1833     sqlite3_free(p->zCostTable);
1834     sqlite3_free(p);
1835   }
1836   return rc;
1837 }
1838 static int spellfix1Disconnect(sqlite3_vtab *pVTab){
1839   return spellfix1Uninit(0, pVTab);
1840 }
1841 static int spellfix1Destroy(sqlite3_vtab *pVTab){
1842   return spellfix1Uninit(1, pVTab);
1843 }
1844 
1845 /*
1846 ** Make a copy of a string.  Remove leading and trailing whitespace
1847 ** and dequote it.
1848 */
1849 static char *spellfix1Dequote(const char *zIn){
1850   char *zOut;
1851   int i, j;
1852   char c;
1853   while( isspace(zIn[0]) ) zIn++;
1854   zOut = sqlite3_mprintf("%s", zIn);
1855   if( zOut==0 ) return 0;
1856   i = (int)strlen(zOut);
1857 #if 0  /* The parser will never leave spaces at the end */
1858   while( i>0 && isspace(zOut[i-1]) ){ i--; }
1859 #endif
1860   zOut[i] = 0;
1861   c = zOut[0];
1862   if( c=='\'' || c=='"' ){
1863     for(i=1, j=0; ALWAYS(zOut[i]); i++){
1864       zOut[j++] = zOut[i];
1865       if( zOut[i]==c ){
1866         if( zOut[i+1]==c ){
1867           i++;
1868         }else{
1869           zOut[j-1] = 0;
1870           break;
1871         }
1872       }
1873     }
1874   }
1875   return zOut;
1876 }
1877 
1878 
1879 /*
1880 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
1881 **
1882 **   argv[0]   -> module name  ("spellfix1")
1883 **   argv[1]   -> database name
1884 **   argv[2]   -> table name
1885 **   argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
1886 */
1887 static int spellfix1Init(
1888   int isCreate,
1889   sqlite3 *db,
1890   void *pAux,
1891   int argc, const char *const*argv,
1892   sqlite3_vtab **ppVTab,
1893   char **pzErr
1894 ){
1895   spellfix1_vtab *pNew = 0;
1896   const char *zModule = argv[0];
1897   const char *zDbName = argv[1];
1898   const char *zTableName = argv[2];
1899   int nDbName;
1900   int rc = SQLITE_OK;
1901   int i;
1902 
1903   nDbName = (int)strlen(zDbName);
1904   pNew = sqlite3_malloc( sizeof(*pNew) + nDbName + 1);
1905   if( pNew==0 ){
1906     rc = SQLITE_NOMEM;
1907   }else{
1908     memset(pNew, 0, sizeof(*pNew));
1909     pNew->zDbName = (char*)&pNew[1];
1910     memcpy(pNew->zDbName, zDbName, nDbName+1);
1911     pNew->zTableName = sqlite3_mprintf("%s", zTableName);
1912     pNew->db = db;
1913     if( pNew->zTableName==0 ){
1914       rc = SQLITE_NOMEM;
1915     }else{
1916       rc = sqlite3_declare_vtab(db,
1917            "CREATE TABLE x(word,rank,distance,langid, "
1918            "score, matchlen, phonehash HIDDEN, "
1919            "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
1920            "soundslike HIDDEN, command HIDDEN)"
1921       );
1922 #define SPELLFIX_COL_WORD            0
1923 #define SPELLFIX_COL_RANK            1
1924 #define SPELLFIX_COL_DISTANCE        2
1925 #define SPELLFIX_COL_LANGID          3
1926 #define SPELLFIX_COL_SCORE           4
1927 #define SPELLFIX_COL_MATCHLEN        5
1928 #define SPELLFIX_COL_PHONEHASH       6
1929 #define SPELLFIX_COL_TOP             7
1930 #define SPELLFIX_COL_SCOPE           8
1931 #define SPELLFIX_COL_SRCHCNT         9
1932 #define SPELLFIX_COL_SOUNDSLIKE     10
1933 #define SPELLFIX_COL_COMMAND        11
1934     }
1935     if( rc==SQLITE_OK && isCreate ){
1936       sqlite3_uint64 r;
1937       spellfix1DbExec(&rc, db,
1938          "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
1939          "  id INTEGER PRIMARY KEY,\n"
1940          "  rank INT,\n"
1941          "  langid INT,\n"
1942          "  word TEXT,\n"
1943          "  k1 TEXT,\n"
1944          "  k2 TEXT\n"
1945          ");\n",
1946          zDbName, zTableName
1947       );
1948       sqlite3_randomness(sizeof(r), &r);
1949       spellfix1DbExec(&rc, db,
1950          "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_index_%llx\" "
1951             "ON \"%w_vocab\"(langid,k2);",
1952          zDbName, zModule, r, zTableName
1953       );
1954     }
1955     for(i=3; rc==SQLITE_OK && i<argc; i++){
1956       if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
1957         pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
1958         if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
1959         continue;
1960       }
1961       *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
1962       rc = SQLITE_ERROR;
1963     }
1964   }
1965 
1966   if( rc && pNew ){
1967     *ppVTab = 0;
1968     spellfix1Uninit(0, &pNew->base);
1969   }else{
1970     *ppVTab = (sqlite3_vtab *)pNew;
1971   }
1972   return rc;
1973 }
1974 
1975 /*
1976 ** The xConnect and xCreate methods
1977 */
1978 static int spellfix1Connect(
1979   sqlite3 *db,
1980   void *pAux,
1981   int argc, const char *const*argv,
1982   sqlite3_vtab **ppVTab,
1983   char **pzErr
1984 ){
1985   return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
1986 }
1987 static int spellfix1Create(
1988   sqlite3 *db,
1989   void *pAux,
1990   int argc, const char *const*argv,
1991   sqlite3_vtab **ppVTab,
1992   char **pzErr
1993 ){
1994   return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
1995 }
1996 
1997 /*
1998 ** Clear all of the content from a cursor.
1999 */
2000 static void spellfix1ResetCursor(spellfix1_cursor *pCur){
2001   int i;
2002   for(i=0; i<pCur->nRow; i++){
2003     sqlite3_free(pCur->a[i].zWord);
2004   }
2005   pCur->nRow = 0;
2006   pCur->iRow = 0;
2007   pCur->nSearch = 0;
2008   if( pCur->pFullScan ){
2009     sqlite3_finalize(pCur->pFullScan);
2010     pCur->pFullScan = 0;
2011   }
2012 }
2013 
2014 /*
2015 ** Resize the cursor to hold up to N rows of content
2016 */
2017 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){
2018   struct spellfix1_row *aNew;
2019   assert( N>=pCur->nRow );
2020   aNew = sqlite3_realloc(pCur->a, sizeof(pCur->a[0])*N);
2021   if( aNew==0 && N>0 ){
2022     spellfix1ResetCursor(pCur);
2023     sqlite3_free(pCur->a);
2024     pCur->nAlloc = 0;
2025     pCur->a = 0;
2026   }else{
2027     pCur->nAlloc = N;
2028     pCur->a = aNew;
2029   }
2030 }
2031 
2032 
2033 /*
2034 ** Close a fuzzy-search cursor.
2035 */
2036 static int spellfix1Close(sqlite3_vtab_cursor *cur){
2037   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2038   spellfix1ResetCursor(pCur);
2039   spellfix1ResizeCursor(pCur, 0);
2040   sqlite3_free(pCur->zPattern);
2041   sqlite3_free(pCur);
2042   return SQLITE_OK;
2043 }
2044 
2045 /*
2046 ** Search for terms of these forms:
2047 **
2048 **   (A)    word MATCH $str
2049 **   (B)    langid == $langid
2050 **   (C)    top = $top
2051 **   (D)    scope = $scope
2052 **   (E)    distance < $distance
2053 **   (F)    distance <= $distance
2054 **   (G)    rowid = $rowid
2055 **
2056 ** The plan number is a bit mask formed with these bits:
2057 **
2058 **   0x01   (A) is found
2059 **   0x02   (B) is found
2060 **   0x04   (C) is found
2061 **   0x08   (D) is found
2062 **   0x10   (E) is found
2063 **   0x20   (F) is found
2064 **   0x40   (G) is found
2065 **
2066 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2067 ** if specified and in that order.
2068 */
2069 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
2070   int iPlan = 0;
2071   int iLangTerm = -1;
2072   int iTopTerm = -1;
2073   int iScopeTerm = -1;
2074   int iDistTerm = -1;
2075   int iRowidTerm = -1;
2076   int i;
2077   const struct sqlite3_index_constraint *pConstraint;
2078   pConstraint = pIdxInfo->aConstraint;
2079   for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){
2080     if( pConstraint->usable==0 ) continue;
2081 
2082     /* Terms of the form:  word MATCH $str */
2083     if( (iPlan & 1)==0
2084      && pConstraint->iColumn==SPELLFIX_COL_WORD
2085      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
2086     ){
2087       iPlan |= 1;
2088       pIdxInfo->aConstraintUsage[i].argvIndex = 1;
2089       pIdxInfo->aConstraintUsage[i].omit = 1;
2090     }
2091 
2092     /* Terms of the form:  langid = $langid  */
2093     if( (iPlan & 2)==0
2094      && pConstraint->iColumn==SPELLFIX_COL_LANGID
2095      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2096     ){
2097       iPlan |= 2;
2098       iLangTerm = i;
2099     }
2100 
2101     /* Terms of the form:  top = $top */
2102     if( (iPlan & 4)==0
2103      && pConstraint->iColumn==SPELLFIX_COL_TOP
2104      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2105     ){
2106       iPlan |= 4;
2107       iTopTerm = i;
2108     }
2109 
2110     /* Terms of the form:  scope = $scope */
2111     if( (iPlan & 8)==0
2112      && pConstraint->iColumn==SPELLFIX_COL_SCOPE
2113      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2114     ){
2115       iPlan |= 8;
2116       iScopeTerm = i;
2117     }
2118 
2119     /* Terms of the form:  distance < $dist or distance <= $dist */
2120     if( (iPlan & (16|32))==0
2121      && pConstraint->iColumn==SPELLFIX_COL_DISTANCE
2122      && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT
2123           || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE)
2124     ){
2125       iPlan |= pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ? 16 : 32;
2126       iDistTerm = i;
2127     }
2128 
2129     /* Terms of the form:  distance < $dist or distance <= $dist */
2130     if( (iPlan & 64)==0
2131      && pConstraint->iColumn<0
2132      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2133     ){
2134       iPlan |= 64;
2135       iRowidTerm = i;
2136     }
2137   }
2138   if( iPlan&1 ){
2139     int idx = 2;
2140     pIdxInfo->idxNum = iPlan;
2141     if( pIdxInfo->nOrderBy==1
2142      && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
2143      && pIdxInfo->aOrderBy[0].desc==0
2144     ){
2145       pIdxInfo->orderByConsumed = 1;  /* Default order by iScore */
2146     }
2147     if( iPlan&2 ){
2148       pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
2149       pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
2150     }
2151     if( iPlan&4 ){
2152       pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
2153       pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
2154     }
2155     if( iPlan&8 ){
2156       pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
2157       pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
2158     }
2159     if( iPlan&(16|32) ){
2160       pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++;
2161       pIdxInfo->aConstraintUsage[iDistTerm].omit = 1;
2162     }
2163     pIdxInfo->estimatedCost = 1e5;
2164   }else if( (iPlan & 64) ){
2165     pIdxInfo->idxNum = 64;
2166     pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
2167     pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
2168     pIdxInfo->estimatedCost = 5;
2169   }else{
2170     pIdxInfo->idxNum = 0;
2171     pIdxInfo->estimatedCost = 1e50;
2172   }
2173   return SQLITE_OK;
2174 }
2175 
2176 /*
2177 ** Open a new fuzzy-search cursor.
2178 */
2179 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2180   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2181   spellfix1_cursor *pCur;
2182   pCur = sqlite3_malloc( sizeof(*pCur) );
2183   if( pCur==0 ) return SQLITE_NOMEM;
2184   memset(pCur, 0, sizeof(*pCur));
2185   pCur->pVTab = p;
2186   *ppCursor = &pCur->base;
2187   return SQLITE_OK;
2188 }
2189 
2190 /*
2191 ** Adjust a distance measurement by the words rank in order to show
2192 ** preference to common words.
2193 */
2194 static int spellfix1Score(int iDistance, int iRank){
2195   int iLog2;
2196   for(iLog2=0; iRank>0; iLog2++, iRank>>=1){}
2197   return iDistance + 32 - iLog2;
2198 }
2199 
2200 /*
2201 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
2202 ** that they sort in order of increasing distance.
2203 */
2204 static int spellfix1RowCompare(const void *A, const void *B){
2205   const struct spellfix1_row *a = (const struct spellfix1_row*)A;
2206   const struct spellfix1_row *b = (const struct spellfix1_row*)B;
2207   return a->iScore - b->iScore;
2208 }
2209 
2210 /*
2211 ** A structure used to pass information from spellfix1FilterForMatch()
2212 ** into spellfix1RunQuery().
2213 */
2214 typedef struct MatchQuery {
2215   spellfix1_cursor *pCur;          /* The cursor being queried */
2216   sqlite3_stmt *pStmt;             /* shadow table query statment */
2217   char zHash[SPELLFIX_MX_HASH];    /* The current phonehash for zPattern */
2218   const char *zPattern;            /* Transliterated input string */
2219   int nPattern;                    /* Length of zPattern */
2220   EditDist3FromString *pMatchStr3; /* Original unicode string */
2221   EditDist3Config *pConfig3;       /* Edit-distance cost coefficients */
2222   const EditDist3Lang *pLang;      /* The selected language coefficients */
2223   int iLang;                       /* The language id */
2224   int iScope;                      /* Default scope */
2225   int iMaxDist;                    /* Maximum allowed edit distance, or -1 */
2226   int rc;                          /* Error code */
2227   int nRun;                  /* Number of prior runs for the same zPattern */
2228   char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH];  /* Prior hashes */
2229 } MatchQuery;
2230 
2231 /*
2232 ** Run a query looking for the best matches against zPattern using
2233 ** zHash as the character class seed hash.
2234 */
2235 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
2236   const char *zK1;
2237   const char *zWord;
2238   int iDist;
2239   int iRank;
2240   int iScore;
2241   int iWorst = 0;
2242   int idx;
2243   int idxWorst = -1;
2244   int i;
2245   int iScope = p->iScope;
2246   spellfix1_cursor *pCur = p->pCur;
2247   sqlite3_stmt *pStmt = p->pStmt;
2248   char zHash1[SPELLFIX_MX_HASH];
2249   char zHash2[SPELLFIX_MX_HASH];
2250   char *zClass;
2251   int nClass;
2252   int rc;
2253 
2254   if( pCur->a==0 || p->rc ) return;   /* Prior memory allocation failure */
2255   zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
2256   if( zClass==0 ){
2257     p->rc = SQLITE_NOMEM;
2258     return;
2259   }
2260   nClass = (int)strlen(zClass);
2261   if( nClass>SPELLFIX_MX_HASH-2 ){
2262     nClass = SPELLFIX_MX_HASH-2;
2263     zClass[nClass] = 0;
2264   }
2265   if( nClass<=iScope ){
2266     if( nClass>2 ){
2267       iScope = nClass-1;
2268     }else{
2269       iScope = nClass;
2270     }
2271   }
2272   memcpy(zHash1, zClass, iScope);
2273   sqlite3_free(zClass);
2274   zHash1[iScope] = 0;
2275   memcpy(zHash2, zHash1, iScope);
2276   zHash2[iScope] = 'Z';
2277   zHash2[iScope+1] = 0;
2278 #if SPELLFIX_MX_RUN>1
2279   for(i=0; i<p->nRun; i++){
2280     if( strcmp(p->azPrior[i], zHash1)==0 ) return;
2281   }
2282 #endif
2283   assert( p->nRun<SPELLFIX_MX_RUN );
2284   memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
2285   if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
2286    || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
2287   ){
2288     p->rc = SQLITE_NOMEM;
2289     return;
2290   }
2291 #if SPELLFIX_MX_RUN>1
2292   for(i=0; i<pCur->nRow; i++){
2293     if( pCur->a[i].iScore>iWorst ){
2294       iWorst = pCur->a[i].iScore;
2295       idxWorst = i;
2296     }
2297   }
2298 #endif
2299   while( sqlite3_step(pStmt)==SQLITE_ROW ){
2300     int iMatchlen = -1;
2301     iRank = sqlite3_column_int(pStmt, 2);
2302     if( p->pMatchStr3 ){
2303       int nWord = sqlite3_column_bytes(pStmt, 1);
2304       zWord = (const char*)sqlite3_column_text(pStmt, 1);
2305       iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
2306     }else{
2307       zK1 = (const char*)sqlite3_column_text(pStmt, 3);
2308       if( zK1==0 ) continue;
2309       iDist = editdist1(p->zPattern, zK1, 0);
2310     }
2311     if( iDist<0 ){
2312       p->rc = SQLITE_NOMEM;
2313       break;
2314     }
2315     pCur->nSearch++;
2316     iScore = spellfix1Score(iDist,iRank);
2317     if( p->iMaxDist>=0 ){
2318       if( iDist>p->iMaxDist ) continue;
2319       if( pCur->nRow>=pCur->nAlloc-1 ){
2320         spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
2321         if( pCur->a==0 ) break;
2322       }
2323       idx = pCur->nRow;
2324     }else if( pCur->nRow<pCur->nAlloc ){
2325       idx = pCur->nRow;
2326     }else if( iScore<iWorst ){
2327       idx = idxWorst;
2328       sqlite3_free(pCur->a[idx].zWord);
2329     }else{
2330       continue;
2331     }
2332     pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
2333     if( pCur->a[idx].zWord==0 ){
2334       p->rc = SQLITE_NOMEM;
2335       break;
2336     }
2337     pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
2338     pCur->a[idx].iRank = iRank;
2339     pCur->a[idx].iDistance = iDist;
2340     pCur->a[idx].iScore = iScore;
2341     pCur->a[idx].iMatchlen = iMatchlen;
2342     memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
2343     if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
2344     if( pCur->nRow==pCur->nAlloc ){
2345       iWorst = pCur->a[0].iScore;
2346       idxWorst = 0;
2347       for(i=1; i<pCur->nRow; i++){
2348         iScore = pCur->a[i].iScore;
2349         if( iWorst<iScore ){
2350           iWorst = iScore;
2351           idxWorst = i;
2352         }
2353       }
2354     }
2355   }
2356   rc = sqlite3_reset(pStmt);
2357   if( rc ) p->rc = rc;
2358 }
2359 
2360 /*
2361 ** This version of the xFilter method work if the MATCH term is present
2362 ** and we are doing a scan.
2363 */
2364 static int spellfix1FilterForMatch(
2365   spellfix1_cursor *pCur,
2366   int idxNum,
2367   int argc,
2368   sqlite3_value **argv
2369 ){
2370   const unsigned char *zMatchThis;   /* RHS of the MATCH operator */
2371   EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
2372   char *zPattern;                    /* Transliteration of zMatchThis */
2373   int nPattern;                      /* Length of zPattern */
2374   int iLimit = 20;                   /* Max number of rows of output */
2375   int iScope = 3;                    /* Use this many characters of zClass */
2376   int iLang = 0;                     /* Language code */
2377   char *zSql;                        /* SQL of shadow table query */
2378   sqlite3_stmt *pStmt = 0;           /* Shadow table query */
2379   int rc;                            /* Result code */
2380   int idx = 1;                       /* Next available filter parameter */
2381   spellfix1_vtab *p = pCur->pVTab;   /* The virtual table that owns pCur */
2382   MatchQuery x;                      /* For passing info to RunQuery() */
2383 
2384   /* Load the cost table if we have not already done so */
2385   if( p->zCostTable!=0 && p->pConfig3==0 ){
2386     p->pConfig3 = sqlite3_malloc( sizeof(p->pConfig3[0]) );
2387     if( p->pConfig3==0 ) return SQLITE_NOMEM;
2388     memset(p->pConfig3, 0, sizeof(p->pConfig3[0]));
2389     rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable);
2390     if( rc ) return rc;
2391   }
2392   memset(&x, 0, sizeof(x));
2393   x.iScope = 3;  /* Default scope if none specified by "WHERE scope=N" */
2394   x.iMaxDist = -1;   /* Maximum allowed edit distance */
2395 
2396   if( idxNum&2 ){
2397     iLang = sqlite3_value_int(argv[idx++]);
2398   }
2399   if( idxNum&4 ){
2400     iLimit = sqlite3_value_int(argv[idx++]);
2401     if( iLimit<1 ) iLimit = 1;
2402   }
2403   if( idxNum&8 ){
2404     x.iScope = sqlite3_value_int(argv[idx++]);
2405     if( x.iScope<1 ) x.iScope = 1;
2406     if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2;
2407   }
2408   if( idxNum&(16|32) ){
2409     x.iMaxDist = sqlite3_value_int(argv[idx++]);
2410     if( idxNum&16 ) x.iMaxDist--;
2411     if( x.iMaxDist<0 ) x.iMaxDist = 0;
2412   }
2413   spellfix1ResetCursor(pCur);
2414   spellfix1ResizeCursor(pCur, iLimit);
2415   zMatchThis = sqlite3_value_text(argv[0]);
2416   if( zMatchThis==0 ) return SQLITE_OK;
2417   if( p->pConfig3 ){
2418     x.pLang = editDist3FindLang(p->pConfig3, iLang);
2419     pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
2420     if( pMatchStr3==0 ){
2421       x.rc = SQLITE_NOMEM;
2422       goto filter_exit;
2423     }
2424   }else{
2425     x.pLang = 0;
2426   }
2427   zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
2428   sqlite3_free(pCur->zPattern);
2429   pCur->zPattern = zPattern;
2430   if( zPattern==0 ){
2431     x.rc = SQLITE_NOMEM;
2432     goto filter_exit;
2433   }
2434   nPattern = (int)strlen(zPattern);
2435   if( zPattern[nPattern-1]=='*' ) nPattern--;
2436   zSql = sqlite3_mprintf(
2437      "SELECT id, word, rank, k1"
2438      "  FROM \"%w\".\"%w_vocab\""
2439      " WHERE langid=%d AND k2>=?1 AND k2<?2",
2440      p->zDbName, p->zTableName, iLang
2441   );
2442   if( zSql==0 ){
2443     x.rc = SQLITE_NOMEM;
2444     pStmt = 0;
2445     goto filter_exit;
2446   }
2447   rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
2448   sqlite3_free(zSql);
2449   pCur->iLang = iLang;
2450   x.pCur = pCur;
2451   x.pStmt = pStmt;
2452   x.zPattern = zPattern;
2453   x.nPattern = nPattern;
2454   x.pMatchStr3 = pMatchStr3;
2455   x.iLang = iLang;
2456   x.rc = rc;
2457   x.pConfig3 = p->pConfig3;
2458   if( x.rc==SQLITE_OK ){
2459     spellfix1RunQuery(&x, zPattern, nPattern);
2460   }
2461 
2462   if( pCur->a ){
2463     qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
2464     pCur->iTop = iLimit;
2465     pCur->iScope = iScope;
2466   }else{
2467     x.rc = SQLITE_NOMEM;
2468   }
2469 
2470 filter_exit:
2471   sqlite3_finalize(pStmt);
2472   editDist3FromStringDelete(pMatchStr3);
2473   return x.rc;
2474 }
2475 
2476 /*
2477 ** This version of xFilter handles a full-table scan case
2478 */
2479 static int spellfix1FilterForFullScan(
2480   spellfix1_cursor *pCur,
2481   int idxNum,
2482   int argc,
2483   sqlite3_value **argv
2484 ){
2485   int rc = SQLITE_OK;
2486   char *zSql;
2487   spellfix1_vtab *pVTab = pCur->pVTab;
2488   spellfix1ResetCursor(pCur);
2489   assert( idxNum==0 || idxNum==64 );
2490   zSql = sqlite3_mprintf(
2491      "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2492      pVTab->zDbName, pVTab->zTableName,
2493      ((idxNum & 64) ? " WHERE rowid=?" : "")
2494   );
2495   if( zSql==0 ) return SQLITE_NOMEM;
2496   rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
2497   sqlite3_free(zSql);
2498   if( rc==SQLITE_OK && (idxNum & 64) ){
2499     assert( argc==1 );
2500     rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]);
2501   }
2502   pCur->nRow = pCur->iRow = 0;
2503   if( rc==SQLITE_OK ){
2504     rc = sqlite3_step(pCur->pFullScan);
2505     if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; }
2506     if( rc==SQLITE_DONE ){ rc = SQLITE_OK; }
2507   }else{
2508     pCur->iRow = 0;
2509   }
2510   return rc;
2511 }
2512 
2513 
2514 /*
2515 ** Called to "rewind" a cursor back to the beginning so that
2516 ** it starts its output over again.  Always called at least once
2517 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2518 */
2519 static int spellfix1Filter(
2520   sqlite3_vtab_cursor *cur,
2521   int idxNum, const char *idxStr,
2522   int argc, sqlite3_value **argv
2523 ){
2524   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2525   int rc;
2526   if( idxNum & 1 ){
2527     rc = spellfix1FilterForMatch(pCur, idxNum, argc, argv);
2528   }else{
2529     rc = spellfix1FilterForFullScan(pCur, idxNum, argc, argv);
2530   }
2531   return rc;
2532 }
2533 
2534 
2535 /*
2536 ** Advance a cursor to its next row of output
2537 */
2538 static int spellfix1Next(sqlite3_vtab_cursor *cur){
2539   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2540   int rc = SQLITE_OK;
2541   if( pCur->iRow < pCur->nRow ){
2542     if( pCur->pFullScan ){
2543       rc = sqlite3_step(pCur->pFullScan);
2544       if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
2545       if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
2546     }else{
2547       pCur->iRow++;
2548     }
2549   }
2550   return rc;
2551 }
2552 
2553 /*
2554 ** Return TRUE if we are at the end-of-file
2555 */
2556 static int spellfix1Eof(sqlite3_vtab_cursor *cur){
2557   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2558   return pCur->iRow>=pCur->nRow;
2559 }
2560 
2561 /*
2562 ** Return columns from the current row.
2563 */
2564 static int spellfix1Column(
2565   sqlite3_vtab_cursor *cur,
2566   sqlite3_context *ctx,
2567   int i
2568 ){
2569   spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2570   if( pCur->pFullScan ){
2571     if( i<=SPELLFIX_COL_LANGID ){
2572       sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
2573     }else{
2574       sqlite3_result_null(ctx);
2575     }
2576     return SQLITE_OK;
2577   }
2578   switch( i ){
2579     case SPELLFIX_COL_WORD: {
2580       sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
2581       break;
2582     }
2583     case SPELLFIX_COL_RANK: {
2584       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
2585       break;
2586     }
2587     case SPELLFIX_COL_DISTANCE: {
2588       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
2589       break;
2590     }
2591     case SPELLFIX_COL_LANGID: {
2592       sqlite3_result_int(ctx, pCur->iLang);
2593       break;
2594     }
2595     case SPELLFIX_COL_SCORE: {
2596       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
2597       break;
2598     }
2599     case SPELLFIX_COL_MATCHLEN: {
2600       int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
2601       if( iMatchlen<0 ){
2602         int nPattern = (int)strlen(pCur->zPattern);
2603         char *zWord = pCur->a[pCur->iRow].zWord;
2604         int nWord = (int)strlen(zWord);
2605 
2606         if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
2607           char *zTranslit;
2608           int res;
2609           zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
2610           if( !zTranslit ) return SQLITE_NOMEM;
2611           res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
2612           sqlite3_free(zTranslit);
2613           if( res<0 ) return SQLITE_NOMEM;
2614           iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
2615         }else{
2616           iMatchlen = utf8Charlen(zWord, nWord);
2617         }
2618       }
2619 
2620       sqlite3_result_int(ctx, iMatchlen);
2621       break;
2622     }
2623     case SPELLFIX_COL_PHONEHASH: {
2624       sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
2625       break;
2626     }
2627     case SPELLFIX_COL_TOP: {
2628       sqlite3_result_int(ctx, pCur->iTop);
2629       break;
2630     }
2631     case SPELLFIX_COL_SCOPE: {
2632       sqlite3_result_int(ctx, pCur->iScope);
2633       break;
2634     }
2635     case SPELLFIX_COL_SRCHCNT: {
2636       sqlite3_result_int(ctx, pCur->nSearch);
2637       break;
2638     }
2639     default: {
2640       sqlite3_result_null(ctx);
2641       break;
2642     }
2643   }
2644   return SQLITE_OK;
2645 }
2646 
2647 /*
2648 ** The rowid.
2649 */
2650 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
2651   spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2652   if( pCur->pFullScan ){
2653     *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
2654   }else{
2655     *pRowid = pCur->a[pCur->iRow].iRowid;
2656   }
2657   return SQLITE_OK;
2658 }
2659 
2660 /*
2661 ** The xUpdate() method.
2662 */
2663 static int spellfix1Update(
2664   sqlite3_vtab *pVTab,
2665   int argc,
2666   sqlite3_value **argv,
2667   sqlite_int64 *pRowid
2668 ){
2669   int rc = SQLITE_OK;
2670   sqlite3_int64 rowid, newRowid;
2671   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2672   sqlite3 *db = p->db;
2673 
2674   if( argc==1 ){
2675     /* A delete operation on the rowid given by argv[0] */
2676     rowid = *pRowid = sqlite3_value_int64(argv[0]);
2677     spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
2678                            " WHERE id=%lld",
2679                   p->zDbName, p->zTableName, rowid);
2680   }else{
2681     const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]);
2682     int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]);
2683     int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]);
2684     int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]);
2685     const unsigned char *zSoundslike =
2686            sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2687     int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2688     char *zK1, *zK2;
2689     int i;
2690     char c;
2691 
2692     if( zWord==0 ){
2693       /* Inserts of the form:  INSERT INTO table(command) VALUES('xyzzy');
2694       ** cause zWord to be NULL, so we look at the "command" column to see
2695       ** what special actions to take */
2696       const char *zCmd =
2697          (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]);
2698       if( zCmd==0 ){
2699         pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2700                                          p->zTableName);
2701         return SQLITE_CONSTRAINT_NOTNULL;
2702       }
2703       if( strcmp(zCmd,"reset")==0 ){
2704         /* Reset the  edit cost table (if there is one). */
2705         editDist3ConfigDelete(p->pConfig3);
2706         p->pConfig3 = 0;
2707         return SQLITE_OK;
2708       }
2709       if( strncmp(zCmd,"edit_cost_table=",16)==0 ){
2710         editDist3ConfigDelete(p->pConfig3);
2711         p->pConfig3 = 0;
2712         sqlite3_free(p->zCostTable);
2713         p->zCostTable = spellfix1Dequote(zCmd+16);
2714         if( p->zCostTable==0 ) return SQLITE_NOMEM;
2715         if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){
2716           sqlite3_free(p->zCostTable);
2717           p->zCostTable = 0;
2718         }
2719         return SQLITE_OK;
2720       }
2721       pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2722                                        p->zTableName, zCmd);
2723       return SQLITE_ERROR;
2724     }
2725     if( iRank<1 ) iRank = 1;
2726     if( zSoundslike ){
2727       zK1 = (char*)transliterate(zSoundslike, nSoundslike);
2728     }else{
2729       zK1 = (char*)transliterate(zWord, nWord);
2730     }
2731     if( zK1==0 ) return SQLITE_NOMEM;
2732     for(i=0; (c = zK1[i])!=0; i++){
2733        if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
2734     }
2735     zK2 = (char*)phoneticHash((const unsigned char*)zK1, i);
2736     if( zK2==0 ){
2737       sqlite3_free(zK1);
2738       return SQLITE_NOMEM;
2739     }
2740     if( sqlite3_value_type(argv[0])==SQLITE_NULL ){
2741       spellfix1DbExec(&rc, db,
2742              "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
2743              "VALUES(%d,%d,%Q,%Q,%Q)",
2744              p->zDbName, p->zTableName,
2745              iRank, iLang, zWord, zK1, zK2
2746       );
2747       *pRowid = sqlite3_last_insert_rowid(db);
2748     }else{
2749       rowid = sqlite3_value_int64(argv[0]);
2750       newRowid = *pRowid = sqlite3_value_int64(argv[1]);
2751       spellfix1DbExec(&rc, db,
2752              "UPDATE \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
2753              " word=%Q, k1=%Q, k2=%Q WHERE id=%lld",
2754              p->zDbName, p->zTableName, newRowid, iRank, iLang,
2755              zWord, zK1, zK2, rowid
2756       );
2757     }
2758     sqlite3_free(zK1);
2759     sqlite3_free(zK2);
2760   }
2761   return rc;
2762 }
2763 
2764 /*
2765 ** Rename the spellfix1 table.
2766 */
2767 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){
2768   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2769   sqlite3 *db = p->db;
2770   int rc = SQLITE_OK;
2771   char *zNewName = sqlite3_mprintf("%s", zNew);
2772   if( zNewName==0 ){
2773     return SQLITE_NOMEM;
2774   }
2775   spellfix1DbExec(&rc, db,
2776      "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
2777      p->zDbName, p->zTableName, zNewName
2778   );
2779   if( rc==SQLITE_OK ){
2780     sqlite3_free(p->zTableName);
2781     p->zTableName = zNewName;
2782   }else{
2783     sqlite3_free(zNewName);
2784   }
2785   return rc;
2786 }
2787 
2788 
2789 /*
2790 ** A virtual table module that provides fuzzy search.
2791 */
2792 static sqlite3_module spellfix1Module = {
2793   0,                       /* iVersion */
2794   spellfix1Create,         /* xCreate - handle CREATE VIRTUAL TABLE */
2795   spellfix1Connect,        /* xConnect - reconnected to an existing table */
2796   spellfix1BestIndex,      /* xBestIndex - figure out how to do a query */
2797   spellfix1Disconnect,     /* xDisconnect - close a connection */
2798   spellfix1Destroy,        /* xDestroy - handle DROP TABLE */
2799   spellfix1Open,           /* xOpen - open a cursor */
2800   spellfix1Close,          /* xClose - close a cursor */
2801   spellfix1Filter,         /* xFilter - configure scan constraints */
2802   spellfix1Next,           /* xNext - advance a cursor */
2803   spellfix1Eof,            /* xEof - check for end of scan */
2804   spellfix1Column,         /* xColumn - read data */
2805   spellfix1Rowid,          /* xRowid - read data */
2806   spellfix1Update,         /* xUpdate */
2807   0,                       /* xBegin */
2808   0,                       /* xSync */
2809   0,                       /* xCommit */
2810   0,                       /* xRollback */
2811   0,                       /* xFindMethod */
2812   spellfix1Rename,         /* xRename */
2813 };
2814 
2815 /*
2816 ** Register the various functions and the virtual table.
2817 */
2818 static int spellfix1Register(sqlite3 *db){
2819   int rc = SQLITE_OK;
2820   int i;
2821   rc = sqlite3_create_function(db, "spellfix1_translit", 1, SQLITE_UTF8, 0,
2822                                   transliterateSqlFunc, 0, 0);
2823   if( rc==SQLITE_OK ){
2824     rc = sqlite3_create_function(db, "spellfix1_editdist", 2, SQLITE_UTF8, 0,
2825                                   editdistSqlFunc, 0, 0);
2826   }
2827   if( rc==SQLITE_OK ){
2828     rc = sqlite3_create_function(db, "spellfix1_phonehash", 1, SQLITE_UTF8, 0,
2829                                   phoneticHashSqlFunc, 0, 0);
2830   }
2831   if( rc==SQLITE_OK ){
2832     rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1, SQLITE_UTF8, 0,
2833                                   scriptCodeSqlFunc, 0, 0);
2834   }
2835   if( rc==SQLITE_OK ){
2836     rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
2837   }
2838   if( rc==SQLITE_OK ){
2839     rc = editDist3Install(db);
2840   }
2841 
2842   /* Verify sanity of the translit[] table */
2843   for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
2844     assert( translit[i].cFrom<translit[i+1].cFrom );
2845   }
2846 
2847   return rc;
2848 }
2849 
2850 #endif /* SQLITE_OMIT_VIRTUALTABLE */
2851 
2852 /*
2853 ** Extension load function.
2854 */
2855 #ifdef _WIN32
2856 __declspec(dllexport)
2857 #endif
2858 int sqlite3_spellfix_init(
2859   sqlite3 *db,
2860   char **pzErrMsg,
2861   const sqlite3_api_routines *pApi
2862 ){
2863   SQLITE_EXTENSION_INIT2(pApi);
2864 #ifndef SQLITE_OMIT_VIRTUALTABLE
2865   return spellfix1Register(db);
2866 #endif
2867   return SQLITE_OK;
2868 }
2869