1 /* 2 ** 2012 April 10 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** 13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used 14 ** to search a large vocabulary for close matches. See separate 15 ** documentation (http://www.sqlite.org/spellfix1.html) for details. 16 */ 17 #include "sqlite3ext.h" 18 SQLITE_EXTENSION_INIT1 19 20 #ifndef SQLITE_AMALGAMATION 21 # if !defined(NDEBUG) && !defined(SQLITE_DEBUG) 22 # define NDEBUG 1 23 # endif 24 # if defined(NDEBUG) && defined(SQLITE_DEBUG) 25 # undef NDEBUG 26 # endif 27 # include <string.h> 28 # include <stdio.h> 29 # include <stdlib.h> 30 # include <assert.h> 31 # define ALWAYS(X) 1 32 # define NEVER(X) 0 33 typedef unsigned char u8; 34 typedef unsigned short u16; 35 #endif 36 #include <ctype.h> 37 38 #ifndef SQLITE_OMIT_VIRTUALTABLE 39 40 /* 41 ** Character classes for ASCII characters: 42 ** 43 ** 0 '' Silent letters: H W 44 ** 1 'A' Any vowel: A E I O U (Y) 45 ** 2 'B' A bilabeal stop or fricative: B F P V W 46 ** 3 'C' Other fricatives or back stops: C G J K Q S X Z 47 ** 4 'D' Alveolar stops: D T 48 ** 5 'H' Letter H at the beginning of a word 49 ** 6 'L' Glide: L 50 ** 7 'R' Semivowel: R 51 ** 8 'M' Nasals: M N 52 ** 9 'Y' Letter Y at the beginning of a word. 53 ** 10 '9' Digits: 0 1 2 3 4 5 6 7 8 9 54 ** 11 ' ' White space 55 ** 12 '?' Other. 56 */ 57 #define CCLASS_SILENT 0 58 #define CCLASS_VOWEL 1 59 #define CCLASS_B 2 60 #define CCLASS_C 3 61 #define CCLASS_D 4 62 #define CCLASS_H 5 63 #define CCLASS_L 6 64 #define CCLASS_R 7 65 #define CCLASS_M 8 66 #define CCLASS_Y 9 67 #define CCLASS_DIGIT 10 68 #define CCLASS_SPACE 11 69 #define CCLASS_OTHER 12 70 71 /* 72 ** The following table gives the character class for non-initial ASCII 73 ** characters. 74 */ 75 static const unsigned char midClass[] = { 76 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 77 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 78 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 79 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 80 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, 81 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 82 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 83 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 84 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 85 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 86 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE, 87 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER, 88 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER, 89 /* ' */ CCLASS_SILENT, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER, 90 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER, 91 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER, 92 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT, 93 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT, 94 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT, 95 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER, 96 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER, 97 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL, 98 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D, 99 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C, 100 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C, 101 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M, 102 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B, 103 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C, 104 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B, 105 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_VOWEL, 106 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER, 107 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER, 108 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B, 109 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL, 110 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT, 111 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C, 112 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M, 113 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C, 114 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D, 115 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B, 116 /* x */ CCLASS_C, /* y */ CCLASS_VOWEL, /* z */ CCLASS_C, 117 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER, 118 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER, 119 }; 120 /* 121 ** This tables gives the character class for ASCII characters that form the 122 ** initial character of a word. The only difference from midClass is with 123 ** the letters H, W, and Y. 124 */ 125 static const unsigned char initClass[] = { 126 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 127 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 128 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 129 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 130 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, 131 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 132 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 133 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 134 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 135 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 136 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE, 137 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER, 138 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER, 139 /* ' */ CCLASS_OTHER, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER, 140 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER, 141 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER, 142 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT, 143 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT, 144 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT, 145 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER, 146 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER, 147 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL, 148 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D, 149 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C, 150 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C, 151 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M, 152 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B, 153 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C, 154 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B, 155 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_Y, 156 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER, 157 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER, 158 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B, 159 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL, 160 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT, 161 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C, 162 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M, 163 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C, 164 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D, 165 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B, 166 /* x */ CCLASS_C, /* y */ CCLASS_Y, /* z */ CCLASS_C, 167 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER, 168 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER, 169 }; 170 171 /* 172 ** Mapping from the character class number (0-13) to a symbol for each 173 ** character class. Note that initClass[] can be used to map the class 174 ** symbol back into the class number. 175 */ 176 static const unsigned char className[] = ".ABCDHLRMY9 ?"; 177 178 /* 179 ** Generate a "phonetic hash" from a string of ASCII characters 180 ** in zIn[0..nIn-1]. 181 ** 182 ** * Map characters by character class as defined above. 183 ** * Omit double-letters 184 ** * Omit vowels beside R and L 185 ** * Omit T when followed by CH 186 ** * Omit W when followed by R 187 ** * Omit D when followed by J or G 188 ** * Omit K in KN or G in GN at the beginning of a word 189 ** 190 ** Space to hold the result is obtained from sqlite3_malloc() 191 ** 192 ** Return NULL if memory allocation fails. 193 */ 194 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){ 195 unsigned char *zOut = sqlite3_malloc64( nIn + 1 ); 196 int i; 197 int nOut = 0; 198 char cPrev = 0x77; 199 char cPrevX = 0x77; 200 const unsigned char *aClass = initClass; 201 202 if( zOut==0 ) return 0; 203 if( nIn>2 ){ 204 switch( zIn[0] ){ 205 case 'g': 206 case 'k': { 207 if( zIn[1]=='n' ){ zIn++; nIn--; } 208 break; 209 } 210 } 211 } 212 for(i=0; i<nIn; i++){ 213 unsigned char c = zIn[i]; 214 if( i+1<nIn ){ 215 if( c=='w' && zIn[i+1]=='r' ) continue; 216 if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue; 217 if( i+2<nIn ){ 218 if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue; 219 } 220 } 221 c = aClass[c&0x7f]; 222 if( c==CCLASS_SPACE ) continue; 223 if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue; 224 aClass = midClass; 225 if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){ 226 continue; /* No vowels beside L or R */ 227 } 228 if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){ 229 nOut--; /* No vowels beside L or R */ 230 } 231 cPrev = c; 232 if( c==CCLASS_SILENT ) continue; 233 cPrevX = c; 234 c = className[c]; 235 assert( nOut>=0 ); 236 if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c; 237 } 238 zOut[nOut] = 0; 239 return zOut; 240 } 241 242 /* 243 ** This is an SQL function wrapper around phoneticHash(). See 244 ** the description of phoneticHash() for additional information. 245 */ 246 static void phoneticHashSqlFunc( 247 sqlite3_context *context, 248 int argc, 249 sqlite3_value **argv 250 ){ 251 const unsigned char *zIn; 252 unsigned char *zOut; 253 254 zIn = sqlite3_value_text(argv[0]); 255 if( zIn==0 ) return; 256 zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0])); 257 if( zOut==0 ){ 258 sqlite3_result_error_nomem(context); 259 }else{ 260 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free); 261 } 262 } 263 264 /* 265 ** Return the character class number for a character given its 266 ** context. 267 */ 268 static char characterClass(char cPrev, char c){ 269 return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f]; 270 } 271 272 /* 273 ** Return the cost of inserting or deleting character c immediately 274 ** following character cPrev. If cPrev==0, that means c is the first 275 ** character of the word. 276 */ 277 static int insertOrDeleteCost(char cPrev, char c, char cNext){ 278 char classC = characterClass(cPrev, c); 279 char classCprev; 280 281 if( classC==CCLASS_SILENT ){ 282 /* Insert or delete "silent" characters such as H or W */ 283 return 1; 284 } 285 if( cPrev==c ){ 286 /* Repeated characters, or miss a repeat */ 287 return 10; 288 } 289 if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){ 290 return 20; /* Insert a vowel before or after 'r' */ 291 } 292 classCprev = characterClass(cPrev, cPrev); 293 if( classC==classCprev ){ 294 if( classC==CCLASS_VOWEL ){ 295 /* Remove or add a new vowel to a vowel cluster */ 296 return 15; 297 }else{ 298 /* Remove or add a consonant not in the same class */ 299 return 50; 300 } 301 } 302 303 /* any other character insertion or deletion */ 304 return 100; 305 } 306 307 /* 308 ** Divide the insertion cost by this factor when appending to the 309 ** end of the word. 310 */ 311 #define FINAL_INS_COST_DIV 4 312 313 /* 314 ** Return the cost of substituting cTo in place of cFrom assuming 315 ** the previous character is cPrev. If cPrev==0 then cTo is the first 316 ** character of the word. 317 */ 318 static int substituteCost(char cPrev, char cFrom, char cTo){ 319 char classFrom, classTo; 320 if( cFrom==cTo ){ 321 /* Exact match */ 322 return 0; 323 } 324 if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){ 325 /* differ only in case */ 326 return 0; 327 } 328 classFrom = characterClass(cPrev, cFrom); 329 classTo = characterClass(cPrev, cTo); 330 if( classFrom==classTo ){ 331 /* Same character class */ 332 return 40; 333 } 334 if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y 335 && classTo>=CCLASS_B && classTo<=CCLASS_Y ){ 336 /* Convert from one consonant to another, but in a different class */ 337 return 75; 338 } 339 /* Any other subsitution */ 340 return 100; 341 } 342 343 /* 344 ** Given two strings zA and zB which are pure ASCII, return the cost 345 ** of transforming zA into zB. If zA ends with '*' assume that it is 346 ** a prefix of zB and give only minimal penalty for extra characters 347 ** on the end of zB. 348 ** 349 ** Smaller numbers mean a closer match. 350 ** 351 ** Negative values indicate an error: 352 ** -1 One of the inputs is NULL 353 ** -2 Non-ASCII characters on input 354 ** -3 Unable to allocate memory 355 ** 356 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes 357 ** of zB that matched the pattern in zA. If zA does not end with a '*', 358 ** then this value is always the number of bytes in zB (i.e. strlen(zB)). 359 ** If zA does end in a '*', then it is the number of bytes in the prefix 360 ** of zB that was deemed to match zA. 361 */ 362 static int editdist1(const char *zA, const char *zB, int *pnMatch){ 363 int nA, nB; /* Number of characters in zA[] and zB[] */ 364 int xA, xB; /* Loop counters for zA[] and zB[] */ 365 char cA = 0, cB; /* Current character of zA and zB */ 366 char cAprev, cBprev; /* Previous character of zA and zB */ 367 char cAnext, cBnext; /* Next character in zA and zB */ 368 int d; /* North-west cost value */ 369 int dc = 0; /* North-west character value */ 370 int res; /* Final result */ 371 int *m; /* The cost matrix */ 372 char *cx; /* Corresponding character values */ 373 int *toFree = 0; /* Malloced space */ 374 int nMatch = 0; 375 int mStack[60+15]; /* Stack space to use if not too much is needed */ 376 377 /* Early out if either input is NULL */ 378 if( zA==0 || zB==0 ) return -1; 379 380 /* Skip any common prefix */ 381 while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; } 382 if( pnMatch ) *pnMatch = nMatch; 383 if( zA[0]==0 && zB[0]==0 ) return 0; 384 385 #if 0 386 printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' '); 387 #endif 388 389 /* Verify input strings and measure their lengths */ 390 for(nA=0; zA[nA]; nA++){ 391 if( zA[nA]&0x80 ) return -2; 392 } 393 for(nB=0; zB[nB]; nB++){ 394 if( zB[nB]&0x80 ) return -2; 395 } 396 397 /* Special processing if either string is empty */ 398 if( nA==0 ){ 399 cBprev = (char)dc; 400 for(xB=res=0; (cB = zB[xB])!=0; xB++){ 401 res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV; 402 cBprev = cB; 403 } 404 return res; 405 } 406 if( nB==0 ){ 407 cAprev = (char)dc; 408 for(xA=res=0; (cA = zA[xA])!=0; xA++){ 409 res += insertOrDeleteCost(cAprev, cA, zA[xA+1]); 410 cAprev = cA; 411 } 412 return res; 413 } 414 415 /* A is a prefix of B */ 416 if( zA[0]=='*' && zA[1]==0 ) return 0; 417 418 /* Allocate and initialize the Wagner matrix */ 419 if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){ 420 m = mStack; 421 }else{ 422 m = toFree = sqlite3_malloc64( (nB+1)*5*sizeof(m[0])/4 ); 423 if( m==0 ) return -3; 424 } 425 cx = (char*)&m[nB+1]; 426 427 /* Compute the Wagner edit distance */ 428 m[0] = 0; 429 cx[0] = (char)dc; 430 cBprev = (char)dc; 431 for(xB=1; xB<=nB; xB++){ 432 cBnext = zB[xB]; 433 cB = zB[xB-1]; 434 cx[xB] = cB; 435 m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext); 436 cBprev = cB; 437 } 438 cAprev = (char)dc; 439 for(xA=1; xA<=nA; xA++){ 440 int lastA = (xA==nA); 441 cA = zA[xA-1]; 442 cAnext = zA[xA]; 443 if( cA=='*' && lastA ) break; 444 d = m[0]; 445 dc = cx[0]; 446 m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext); 447 cBprev = 0; 448 for(xB=1; xB<=nB; xB++){ 449 int totalCost, insCost, delCost, subCost, ncx; 450 cB = zB[xB-1]; 451 cBnext = zB[xB]; 452 453 /* Cost to insert cB */ 454 insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext); 455 if( lastA ) insCost /= FINAL_INS_COST_DIV; 456 457 /* Cost to delete cA */ 458 delCost = insertOrDeleteCost(cx[xB], cA, cBnext); 459 460 /* Cost to substitute cA->cB */ 461 subCost = substituteCost(cx[xB-1], cA, cB); 462 463 /* Best cost */ 464 totalCost = insCost + m[xB-1]; 465 ncx = cB; 466 if( (delCost + m[xB])<totalCost ){ 467 totalCost = delCost + m[xB]; 468 ncx = cA; 469 } 470 if( (subCost + d)<totalCost ){ 471 totalCost = subCost + d; 472 } 473 474 #if 0 475 printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c" 476 " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n", 477 xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB, 478 insCost, delCost, subCost, totalCost, ncx?ncx:' '); 479 #endif 480 481 /* Update the matrix */ 482 d = m[xB]; 483 dc = cx[xB]; 484 m[xB] = totalCost; 485 cx[xB] = (char)ncx; 486 cBprev = cB; 487 } 488 cAprev = cA; 489 } 490 491 /* Free the wagner matrix and return the result */ 492 if( cA=='*' ){ 493 res = m[1]; 494 for(xB=1; xB<=nB; xB++){ 495 if( m[xB]<res ){ 496 res = m[xB]; 497 if( pnMatch ) *pnMatch = xB+nMatch; 498 } 499 } 500 }else{ 501 res = m[nB]; 502 /* In the current implementation, pnMatch is always NULL if zA does 503 ** not end in "*" */ 504 assert( pnMatch==0 ); 505 } 506 sqlite3_free(toFree); 507 return res; 508 } 509 510 /* 511 ** Function: editdist(A,B) 512 ** 513 ** Return the cost of transforming string A into string B. Both strings 514 ** must be pure ASCII text. If A ends with '*' then it is assumed to be 515 ** a prefix of B and extra characters on the end of B have minimal additional 516 ** cost. 517 */ 518 static void editdistSqlFunc( 519 sqlite3_context *context, 520 int argc, 521 sqlite3_value **argv 522 ){ 523 int res = editdist1( 524 (const char*)sqlite3_value_text(argv[0]), 525 (const char*)sqlite3_value_text(argv[1]), 526 0); 527 if( res<0 ){ 528 if( res==(-3) ){ 529 sqlite3_result_error_nomem(context); 530 }else if( res==(-2) ){ 531 sqlite3_result_error(context, "non-ASCII input to editdist()", -1); 532 }else{ 533 sqlite3_result_error(context, "NULL input to editdist()", -1); 534 } 535 }else{ 536 sqlite3_result_int(context, res); 537 } 538 } 539 540 /* End of the fixed-cost edit distance implementation 541 ****************************************************************************** 542 ***************************************************************************** 543 ** Begin: Configurable cost unicode edit distance routines 544 */ 545 /* Forward declaration of structures */ 546 typedef struct EditDist3Cost EditDist3Cost; 547 typedef struct EditDist3Config EditDist3Config; 548 typedef struct EditDist3Point EditDist3Point; 549 typedef struct EditDist3From EditDist3From; 550 typedef struct EditDist3FromString EditDist3FromString; 551 typedef struct EditDist3To EditDist3To; 552 typedef struct EditDist3ToString EditDist3ToString; 553 typedef struct EditDist3Lang EditDist3Lang; 554 555 556 /* 557 ** An entry in the edit cost table 558 */ 559 struct EditDist3Cost { 560 EditDist3Cost *pNext; /* Next cost element */ 561 u8 nFrom; /* Number of bytes in aFrom */ 562 u8 nTo; /* Number of bytes in aTo */ 563 u16 iCost; /* Cost of this transformation */ 564 char a[4] ; /* FROM string followed by TO string */ 565 /* Additional TO and FROM string bytes appended as necessary */ 566 }; 567 568 /* 569 ** Edit costs for a particular language ID 570 */ 571 struct EditDist3Lang { 572 int iLang; /* Language ID */ 573 int iInsCost; /* Default insertion cost */ 574 int iDelCost; /* Default deletion cost */ 575 int iSubCost; /* Default substitution cost */ 576 EditDist3Cost *pCost; /* Costs */ 577 }; 578 579 580 /* 581 ** The default EditDist3Lang object, with default costs. 582 */ 583 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 }; 584 585 /* 586 ** Complete configuration 587 */ 588 struct EditDist3Config { 589 int nLang; /* Number of language IDs. Size of a[] */ 590 EditDist3Lang *a; /* One for each distinct language ID */ 591 }; 592 593 /* 594 ** Extra information about each character in the FROM string. 595 */ 596 struct EditDist3From { 597 int nSubst; /* Number of substitution cost entries */ 598 int nDel; /* Number of deletion cost entries */ 599 int nByte; /* Number of bytes in this character */ 600 EditDist3Cost **apSubst; /* Array of substitution costs for this element */ 601 EditDist3Cost **apDel; /* Array of deletion cost entries */ 602 }; 603 604 /* 605 ** A precompiled FROM string. 606 * 607 ** In the common case we expect the FROM string to be reused multiple times. 608 ** In other words, the common case will be to measure the edit distance 609 ** from a single origin string to multiple target strings. 610 */ 611 struct EditDist3FromString { 612 char *z; /* The complete text of the FROM string */ 613 int n; /* Number of characters in the FROM string */ 614 int isPrefix; /* True if ends with '*' character */ 615 EditDist3From *a; /* Extra info about each char of the FROM string */ 616 }; 617 618 /* 619 ** Extra information about each character in the TO string. 620 */ 621 struct EditDist3To { 622 int nIns; /* Number of insertion cost entries */ 623 int nByte; /* Number of bytes in this character */ 624 EditDist3Cost **apIns; /* Array of deletion cost entries */ 625 }; 626 627 /* 628 ** A precompiled FROM string 629 */ 630 struct EditDist3ToString { 631 char *z; /* The complete text of the TO string */ 632 int n; /* Number of characters in the TO string */ 633 EditDist3To *a; /* Extra info about each char of the TO string */ 634 }; 635 636 /* 637 ** Clear or delete an instance of the object that records all edit-distance 638 ** weights. 639 */ 640 static void editDist3ConfigClear(EditDist3Config *p){ 641 int i; 642 if( p==0 ) return; 643 for(i=0; i<p->nLang; i++){ 644 EditDist3Cost *pCost, *pNext; 645 pCost = p->a[i].pCost; 646 while( pCost ){ 647 pNext = pCost->pNext; 648 sqlite3_free(pCost); 649 pCost = pNext; 650 } 651 } 652 sqlite3_free(p->a); 653 memset(p, 0, sizeof(*p)); 654 } 655 static void editDist3ConfigDelete(void *pIn){ 656 EditDist3Config *p = (EditDist3Config*)pIn; 657 editDist3ConfigClear(p); 658 sqlite3_free(p); 659 } 660 661 /* Compare the FROM values of two EditDist3Cost objects, for sorting. 662 ** Return negative, zero, or positive if the A is less than, equal to, 663 ** or greater than B. 664 */ 665 static int editDist3CostCompare(EditDist3Cost *pA, EditDist3Cost *pB){ 666 int n = pA->nFrom; 667 int rc; 668 if( n>pB->nFrom ) n = pB->nFrom; 669 rc = strncmp(pA->a, pB->a, n); 670 if( rc==0 ) rc = pA->nFrom - pB->nFrom; 671 return rc; 672 } 673 674 /* 675 ** Merge together two sorted lists of EditDist3Cost objects, in order 676 ** of increasing FROM. 677 */ 678 static EditDist3Cost *editDist3CostMerge( 679 EditDist3Cost *pA, 680 EditDist3Cost *pB 681 ){ 682 EditDist3Cost *pHead = 0; 683 EditDist3Cost **ppTail = &pHead; 684 EditDist3Cost *p; 685 while( pA && pB ){ 686 if( editDist3CostCompare(pA,pB)<=0 ){ 687 p = pA; 688 pA = pA->pNext; 689 }else{ 690 p = pB; 691 pB = pB->pNext; 692 } 693 *ppTail = p; 694 ppTail = &p->pNext; 695 } 696 if( pA ){ 697 *ppTail = pA; 698 }else{ 699 *ppTail = pB; 700 } 701 return pHead; 702 } 703 704 /* 705 ** Sort a list of EditDist3Cost objects into order of increasing FROM 706 */ 707 static EditDist3Cost *editDist3CostSort(EditDist3Cost *pList){ 708 EditDist3Cost *ap[60], *p; 709 int i; 710 int mx = 0; 711 ap[0] = 0; 712 ap[1] = 0; 713 while( pList ){ 714 p = pList; 715 pList = p->pNext; 716 p->pNext = 0; 717 for(i=0; ap[i]; i++){ 718 p = editDist3CostMerge(ap[i],p); 719 ap[i] = 0; 720 } 721 ap[i] = p; 722 if( i>mx ){ 723 mx = i; 724 ap[i+1] = 0; 725 } 726 } 727 p = 0; 728 for(i=0; i<=mx; i++){ 729 if( ap[i] ) p = editDist3CostMerge(p,ap[i]); 730 } 731 return p; 732 } 733 734 /* 735 ** Load all edit-distance weights from a table. 736 */ 737 static int editDist3ConfigLoad( 738 EditDist3Config *p, /* The edit distance configuration to load */ 739 sqlite3 *db, /* Load from this database */ 740 const char *zTable /* Name of the table from which to load */ 741 ){ 742 sqlite3_stmt *pStmt; 743 int rc, rc2; 744 char *zSql; 745 int iLangPrev = -9999; 746 EditDist3Lang *pLang = 0; 747 748 zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost" 749 " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable); 750 if( zSql==0 ) return SQLITE_NOMEM; 751 rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0); 752 sqlite3_free(zSql); 753 if( rc ) return rc; 754 editDist3ConfigClear(p); 755 while( sqlite3_step(pStmt)==SQLITE_ROW ){ 756 int iLang = sqlite3_column_int(pStmt, 0); 757 const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1); 758 int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0; 759 const char *zTo = (const char*)sqlite3_column_text(pStmt, 2); 760 int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0; 761 int iCost = sqlite3_column_int(pStmt, 3); 762 763 assert( zFrom!=0 || nFrom==0 ); 764 assert( zTo!=0 || nTo==0 ); 765 if( nFrom>100 || nTo>100 ) continue; 766 if( iCost<0 ) continue; 767 if( iCost>=10000 ) continue; /* Costs above 10K are considered infinite */ 768 if( pLang==0 || iLang!=iLangPrev ){ 769 EditDist3Lang *pNew; 770 pNew = sqlite3_realloc64(p->a, (p->nLang+1)*sizeof(p->a[0])); 771 if( pNew==0 ){ rc = SQLITE_NOMEM; break; } 772 p->a = pNew; 773 pLang = &p->a[p->nLang]; 774 p->nLang++; 775 pLang->iLang = iLang; 776 pLang->iInsCost = 100; 777 pLang->iDelCost = 100; 778 pLang->iSubCost = 150; 779 pLang->pCost = 0; 780 iLangPrev = iLang; 781 } 782 if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){ 783 pLang->iDelCost = iCost; 784 }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){ 785 pLang->iInsCost = iCost; 786 }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){ 787 pLang->iSubCost = iCost; 788 }else{ 789 EditDist3Cost *pCost; 790 int nExtra = nFrom + nTo - 4; 791 if( nExtra<0 ) nExtra = 0; 792 pCost = sqlite3_malloc64( sizeof(*pCost) + nExtra ); 793 if( pCost==0 ){ rc = SQLITE_NOMEM; break; } 794 pCost->nFrom = (u8)nFrom; 795 pCost->nTo = (u8)nTo; 796 pCost->iCost = (u16)iCost; 797 memcpy(pCost->a, zFrom, nFrom); 798 memcpy(pCost->a + nFrom, zTo, nTo); 799 pCost->pNext = pLang->pCost; 800 pLang->pCost = pCost; 801 } 802 } 803 rc2 = sqlite3_finalize(pStmt); 804 if( rc==SQLITE_OK ) rc = rc2; 805 if( rc==SQLITE_OK ){ 806 int iLang; 807 for(iLang=0; iLang<p->nLang; iLang++){ 808 p->a[iLang].pCost = editDist3CostSort(p->a[iLang].pCost); 809 } 810 } 811 return rc; 812 } 813 814 /* 815 ** Return the length (in bytes) of a utf-8 character. Or return a maximum 816 ** of N. 817 */ 818 static int utf8Len(unsigned char c, int N){ 819 int len = 1; 820 if( c>0x7f ){ 821 if( (c&0xe0)==0xc0 ){ 822 len = 2; 823 }else if( (c&0xf0)==0xe0 ){ 824 len = 3; 825 }else{ 826 len = 4; 827 } 828 } 829 if( len>N ) len = N; 830 return len; 831 } 832 833 /* 834 ** Return TRUE (non-zero) if the To side of the given cost matches 835 ** the given string. 836 */ 837 static int matchTo(EditDist3Cost *p, const char *z, int n){ 838 assert( n>0 ); 839 if( p->a[p->nFrom]!=z[0] ) return 0; 840 if( p->nTo>n ) return 0; 841 if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0; 842 return 1; 843 } 844 845 /* 846 ** Return TRUE (non-zero) if the From side of the given cost matches 847 ** the given string. 848 */ 849 static int matchFrom(EditDist3Cost *p, const char *z, int n){ 850 assert( p->nFrom<=n ); 851 if( p->nFrom ){ 852 if( p->a[0]!=z[0] ) return 0; 853 if( strncmp(p->a, z, p->nFrom)!=0 ) return 0; 854 } 855 return 1; 856 } 857 858 /* 859 ** Return TRUE (non-zero) of the next FROM character and the next TO 860 ** character are the same. 861 */ 862 static int matchFromTo( 863 EditDist3FromString *pStr, /* Left hand string */ 864 int n1, /* Index of comparison character on the left */ 865 const char *z2, /* Right-handl comparison character */ 866 int n2 /* Bytes remaining in z2[] */ 867 ){ 868 int b1 = pStr->a[n1].nByte; 869 if( b1>n2 ) return 0; 870 assert( b1>0 ); 871 if( pStr->z[n1]!=z2[0] ) return 0; 872 if( strncmp(pStr->z+n1, z2, b1)!=0 ) return 0; 873 return 1; 874 } 875 876 /* 877 ** Delete an EditDist3FromString objecct 878 */ 879 static void editDist3FromStringDelete(EditDist3FromString *p){ 880 int i; 881 if( p ){ 882 for(i=0; i<p->n; i++){ 883 sqlite3_free(p->a[i].apDel); 884 sqlite3_free(p->a[i].apSubst); 885 } 886 sqlite3_free(p); 887 } 888 } 889 890 /* 891 ** Create a EditDist3FromString object. 892 */ 893 static EditDist3FromString *editDist3FromStringNew( 894 const EditDist3Lang *pLang, 895 const char *z, 896 int n 897 ){ 898 EditDist3FromString *pStr; 899 EditDist3Cost *p; 900 int i; 901 902 if( z==0 ) return 0; 903 if( n<0 ) n = (int)strlen(z); 904 pStr = sqlite3_malloc64( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 ); 905 if( pStr==0 ) return 0; 906 pStr->a = (EditDist3From*)&pStr[1]; 907 memset(pStr->a, 0, sizeof(pStr->a[0])*n); 908 pStr->n = n; 909 pStr->z = (char*)&pStr->a[n]; 910 memcpy(pStr->z, z, n+1); 911 if( n && z[n-1]=='*' ){ 912 pStr->isPrefix = 1; 913 n--; 914 pStr->n--; 915 pStr->z[n] = 0; 916 }else{ 917 pStr->isPrefix = 0; 918 } 919 920 for(i=0; i<n; i++){ 921 EditDist3From *pFrom = &pStr->a[i]; 922 memset(pFrom, 0, sizeof(*pFrom)); 923 pFrom->nByte = utf8Len((unsigned char)z[i], n-i); 924 for(p=pLang->pCost; p; p=p->pNext){ 925 EditDist3Cost **apNew; 926 if( i+p->nFrom>n ) continue; 927 if( matchFrom(p, z+i, n-i)==0 ) continue; 928 if( p->nTo==0 ){ 929 apNew = sqlite3_realloc64(pFrom->apDel, 930 sizeof(*apNew)*(pFrom->nDel+1)); 931 if( apNew==0 ) break; 932 pFrom->apDel = apNew; 933 apNew[pFrom->nDel++] = p; 934 }else{ 935 apNew = sqlite3_realloc64(pFrom->apSubst, 936 sizeof(*apNew)*(pFrom->nSubst+1)); 937 if( apNew==0 ) break; 938 pFrom->apSubst = apNew; 939 apNew[pFrom->nSubst++] = p; 940 } 941 } 942 if( p ){ 943 editDist3FromStringDelete(pStr); 944 pStr = 0; 945 break; 946 } 947 } 948 return pStr; 949 } 950 951 /* 952 ** Update entry m[i] such that it is the minimum of its current value 953 ** and m[j]+iCost. 954 */ 955 static void updateCost( 956 unsigned int *m, 957 int i, 958 int j, 959 int iCost 960 ){ 961 unsigned int b; 962 assert( iCost>=0 ); 963 assert( iCost<10000 ); 964 b = m[j] + iCost; 965 if( b<m[i] ) m[i] = b; 966 } 967 968 /* 969 ** How much stack space (int bytes) to use for Wagner matrix in 970 ** editDist3Core(). If more space than this is required, the entire 971 ** matrix is taken from the heap. To reduce the load on the memory 972 ** allocator, make this value as large as practical for the 973 ** architecture in use. 974 */ 975 #ifndef SQLITE_SPELLFIX_STACKALLOC_SZ 976 # define SQLITE_SPELLFIX_STACKALLOC_SZ (1024) 977 #endif 978 979 /* Compute the edit distance between two strings. 980 ** 981 ** If an error occurs, return a negative number which is the error code. 982 ** 983 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters 984 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does 985 ** not contain the pattern for a prefix-search, then this is always the number 986 ** of characters in z2. If pFrom does contain a prefix search pattern, then 987 ** it is the number of characters in the prefix of z2 that was deemed to 988 ** match pFrom. 989 */ 990 static int editDist3Core( 991 EditDist3FromString *pFrom, /* The FROM string */ 992 const char *z2, /* The TO string */ 993 int n2, /* Length of the TO string */ 994 const EditDist3Lang *pLang, /* Edit weights for a particular language ID */ 995 int *pnMatch /* OUT: Characters in matched prefix */ 996 ){ 997 int k, n; 998 int i1, b1; 999 int i2, b2; 1000 EditDist3FromString f = *pFrom; 1001 EditDist3To *a2; 1002 unsigned int *m; 1003 unsigned int *pToFree; 1004 int szRow; 1005 EditDist3Cost *p; 1006 int res; 1007 sqlite3_uint64 nByte; 1008 unsigned int stackSpace[SQLITE_SPELLFIX_STACKALLOC_SZ/sizeof(unsigned int)]; 1009 1010 /* allocate the Wagner matrix and the aTo[] array for the TO string */ 1011 n = (f.n+1)*(n2+1); 1012 n = (n+1)&~1; 1013 nByte = n*sizeof(m[0]) + sizeof(a2[0])*n2; 1014 if( nByte<=sizeof(stackSpace) ){ 1015 m = stackSpace; 1016 pToFree = 0; 1017 }else{ 1018 m = pToFree = sqlite3_malloc64( nByte ); 1019 if( m==0 ) return -1; /* Out of memory */ 1020 } 1021 a2 = (EditDist3To*)&m[n]; 1022 memset(a2, 0, sizeof(a2[0])*n2); 1023 1024 /* Fill in the a1[] matrix for all characters of the TO string */ 1025 for(i2=0; i2<n2; i2++){ 1026 a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2); 1027 for(p=pLang->pCost; p; p=p->pNext){ 1028 EditDist3Cost **apNew; 1029 if( p->nFrom>0 ) break; 1030 if( i2+p->nTo>n2 ) continue; 1031 if( p->a[0]>z2[i2] ) break; 1032 if( matchTo(p, z2+i2, n2-i2)==0 ) continue; 1033 a2[i2].nIns++; 1034 apNew = sqlite3_realloc64(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns); 1035 if( apNew==0 ){ 1036 res = -1; /* Out of memory */ 1037 goto editDist3Abort; 1038 } 1039 a2[i2].apIns = apNew; 1040 a2[i2].apIns[a2[i2].nIns-1] = p; 1041 } 1042 } 1043 1044 /* Prepare to compute the minimum edit distance */ 1045 szRow = f.n+1; 1046 memset(m, 0x01, (n2+1)*szRow*sizeof(m[0])); 1047 m[0] = 0; 1048 1049 /* First fill in the top-row of the matrix with FROM deletion costs */ 1050 for(i1=0; i1<f.n; i1 += b1){ 1051 b1 = f.a[i1].nByte; 1052 updateCost(m, i1+b1, i1, pLang->iDelCost); 1053 for(k=0; k<f.a[i1].nDel; k++){ 1054 p = f.a[i1].apDel[k]; 1055 updateCost(m, i1+p->nFrom, i1, p->iCost); 1056 } 1057 } 1058 1059 /* Fill in all subsequent rows, top-to-bottom, left-to-right */ 1060 for(i2=0; i2<n2; i2 += b2){ 1061 int rx; /* Starting index for current row */ 1062 int rxp; /* Starting index for previous row */ 1063 b2 = a2[i2].nByte; 1064 rx = szRow*(i2+b2); 1065 rxp = szRow*i2; 1066 updateCost(m, rx, rxp, pLang->iInsCost); 1067 for(k=0; k<a2[i2].nIns; k++){ 1068 p = a2[i2].apIns[k]; 1069 updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost); 1070 } 1071 for(i1=0; i1<f.n; i1+=b1){ 1072 int cx; /* Index of current cell */ 1073 int cxp; /* Index of cell immediately to the left */ 1074 int cxd; /* Index of cell to the left and one row above */ 1075 int cxu; /* Index of cell immediately above */ 1076 b1 = f.a[i1].nByte; 1077 cxp = rx + i1; 1078 cx = cxp + b1; 1079 cxd = rxp + i1; 1080 cxu = cxd + b1; 1081 updateCost(m, cx, cxp, pLang->iDelCost); 1082 for(k=0; k<f.a[i1].nDel; k++){ 1083 p = f.a[i1].apDel[k]; 1084 updateCost(m, cxp+p->nFrom, cxp, p->iCost); 1085 } 1086 updateCost(m, cx, cxu, pLang->iInsCost); 1087 if( matchFromTo(&f, i1, z2+i2, n2-i2) ){ 1088 updateCost(m, cx, cxd, 0); 1089 } 1090 updateCost(m, cx, cxd, pLang->iSubCost); 1091 for(k=0; k<f.a[i1].nSubst; k++){ 1092 p = f.a[i1].apSubst[k]; 1093 if( matchTo(p, z2+i2, n2-i2) ){ 1094 updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost); 1095 } 1096 } 1097 } 1098 } 1099 1100 #if 0 /* Enable for debugging */ 1101 printf(" ^"); 1102 for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff); 1103 printf("\n ^:"); 1104 for(i1=0; i1<szRow; i1++){ 1105 int v = m[i1]; 1106 if( v>9999 ) printf(" ****"); 1107 else printf(" %4d", v); 1108 } 1109 printf("\n"); 1110 for(i2=0; i2<n2; i2++){ 1111 printf("%c-%02x:", z2[i2], z2[i2]&0xff); 1112 for(i1=0; i1<szRow; i1++){ 1113 int v = m[(i2+1)*szRow+i1]; 1114 if( v>9999 ) printf(" ****"); 1115 else printf(" %4d", v); 1116 } 1117 printf("\n"); 1118 } 1119 #endif 1120 1121 /* Free memory allocations and return the result */ 1122 res = (int)m[szRow*(n2+1)-1]; 1123 n = n2; 1124 if( f.isPrefix ){ 1125 for(i2=1; i2<=n2; i2++){ 1126 int b = m[szRow*i2-1]; 1127 if( b<=res ){ 1128 res = b; 1129 n = i2 - 1; 1130 } 1131 } 1132 } 1133 if( pnMatch ){ 1134 int nExtra = 0; 1135 for(k=0; k<n; k++){ 1136 if( (z2[k] & 0xc0)==0x80 ) nExtra++; 1137 } 1138 *pnMatch = n - nExtra; 1139 } 1140 1141 editDist3Abort: 1142 for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns); 1143 sqlite3_free(pToFree); 1144 return res; 1145 } 1146 1147 /* 1148 ** Get an appropriate EditDist3Lang object. 1149 */ 1150 static const EditDist3Lang *editDist3FindLang( 1151 EditDist3Config *pConfig, 1152 int iLang 1153 ){ 1154 int i; 1155 for(i=0; i<pConfig->nLang; i++){ 1156 if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i]; 1157 } 1158 return &editDist3Lang; 1159 } 1160 1161 /* 1162 ** Function: editdist3(A,B,iLang) 1163 ** editdist3(tablename) 1164 ** 1165 ** Return the cost of transforming string A into string B using edit 1166 ** weights for iLang. 1167 ** 1168 ** The second form loads edit weights into memory from a table. 1169 */ 1170 static void editDist3SqlFunc( 1171 sqlite3_context *context, 1172 int argc, 1173 sqlite3_value **argv 1174 ){ 1175 EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context); 1176 sqlite3 *db = sqlite3_context_db_handle(context); 1177 int rc; 1178 if( argc==1 ){ 1179 const char *zTable = (const char*)sqlite3_value_text(argv[0]); 1180 rc = editDist3ConfigLoad(pConfig, db, zTable); 1181 if( rc ) sqlite3_result_error_code(context, rc); 1182 }else{ 1183 const char *zA = (const char*)sqlite3_value_text(argv[0]); 1184 const char *zB = (const char*)sqlite3_value_text(argv[1]); 1185 int nA = sqlite3_value_bytes(argv[0]); 1186 int nB = sqlite3_value_bytes(argv[1]); 1187 int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0; 1188 const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang); 1189 EditDist3FromString *pFrom; 1190 int dist; 1191 1192 pFrom = editDist3FromStringNew(pLang, zA, nA); 1193 if( pFrom==0 ){ 1194 sqlite3_result_error_nomem(context); 1195 return; 1196 } 1197 dist = editDist3Core(pFrom, zB, nB, pLang, 0); 1198 editDist3FromStringDelete(pFrom); 1199 if( dist==(-1) ){ 1200 sqlite3_result_error_nomem(context); 1201 }else{ 1202 sqlite3_result_int(context, dist); 1203 } 1204 } 1205 } 1206 1207 /* 1208 ** Register the editDist3 function with SQLite 1209 */ 1210 static int editDist3Install(sqlite3 *db){ 1211 int rc; 1212 EditDist3Config *pConfig = sqlite3_malloc64( sizeof(*pConfig) ); 1213 if( pConfig==0 ) return SQLITE_NOMEM; 1214 memset(pConfig, 0, sizeof(*pConfig)); 1215 rc = sqlite3_create_function_v2(db, "editdist3", 1216 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig, 1217 editDist3SqlFunc, 0, 0, 0); 1218 if( rc==SQLITE_OK ){ 1219 rc = sqlite3_create_function_v2(db, "editdist3", 1220 3, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig, 1221 editDist3SqlFunc, 0, 0, 0); 1222 } 1223 if( rc==SQLITE_OK ){ 1224 rc = sqlite3_create_function_v2(db, "editdist3", 1225 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig, 1226 editDist3SqlFunc, 0, 0, editDist3ConfigDelete); 1227 }else{ 1228 sqlite3_free(pConfig); 1229 } 1230 return rc; 1231 } 1232 /* End configurable cost unicode edit distance routines 1233 ****************************************************************************** 1234 ****************************************************************************** 1235 ** Begin transliterate unicode-to-ascii implementation 1236 */ 1237 1238 #if !SQLITE_AMALGAMATION 1239 /* 1240 ** This lookup table is used to help decode the first byte of 1241 ** a multi-byte UTF8 character. 1242 */ 1243 static const unsigned char sqlite3Utf8Trans1[] = { 1244 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1245 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 1246 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 1247 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 1248 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1249 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 1250 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1251 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, 1252 }; 1253 #endif 1254 1255 /* 1256 ** Return the value of the first UTF-8 character in the string. 1257 */ 1258 static int utf8Read(const unsigned char *z, int n, int *pSize){ 1259 int c, i; 1260 1261 /* All callers to this routine (in the current implementation) 1262 ** always have n>0. */ 1263 if( NEVER(n==0) ){ 1264 c = i = 0; 1265 }else{ 1266 c = z[0]; 1267 i = 1; 1268 if( c>=0xc0 ){ 1269 c = sqlite3Utf8Trans1[c-0xc0]; 1270 while( i<n && (z[i] & 0xc0)==0x80 ){ 1271 c = (c<<6) + (0x3f & z[i++]); 1272 } 1273 } 1274 } 1275 *pSize = i; 1276 return c; 1277 } 1278 1279 /* 1280 ** Return the number of characters in the utf-8 string in the nIn byte 1281 ** buffer pointed to by zIn. 1282 */ 1283 static int utf8Charlen(const char *zIn, int nIn){ 1284 int i; 1285 int nChar = 0; 1286 for(i=0; i<nIn; nChar++){ 1287 int sz; 1288 utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz); 1289 i += sz; 1290 } 1291 return nChar; 1292 } 1293 1294 typedef struct Transliteration Transliteration; 1295 struct Transliteration { 1296 unsigned short int cFrom; 1297 unsigned char cTo0, cTo1, cTo2, cTo3; 1298 #ifdef SQLITE_SPELLFIX_5BYTE_MAPPINGS 1299 unsigned char cTo4; 1300 #endif 1301 }; 1302 1303 /* 1304 ** Table of translations from unicode characters into ASCII. 1305 */ 1306 static const Transliteration translit[] = { 1307 { 0x00A0, 0x20, 0x00, 0x00, 0x00 }, /* to */ 1308 { 0x00B5, 0x75, 0x00, 0x00, 0x00 }, /* µ to u */ 1309 { 0x00C0, 0x41, 0x00, 0x00, 0x00 }, /* À to A */ 1310 { 0x00C1, 0x41, 0x00, 0x00, 0x00 }, /* Á to A */ 1311 { 0x00C2, 0x41, 0x00, 0x00, 0x00 }, /*  to A */ 1312 { 0x00C3, 0x41, 0x00, 0x00, 0x00 }, /* à to A */ 1313 { 0x00C4, 0x41, 0x65, 0x00, 0x00 }, /* Ä to Ae */ 1314 { 0x00C5, 0x41, 0x61, 0x00, 0x00 }, /* Å to Aa */ 1315 { 0x00C6, 0x41, 0x45, 0x00, 0x00 }, /* Æ to AE */ 1316 { 0x00C7, 0x43, 0x00, 0x00, 0x00 }, /* Ç to C */ 1317 { 0x00C8, 0x45, 0x00, 0x00, 0x00 }, /* È to E */ 1318 { 0x00C9, 0x45, 0x00, 0x00, 0x00 }, /* É to E */ 1319 { 0x00CA, 0x45, 0x00, 0x00, 0x00 }, /* Ê to E */ 1320 { 0x00CB, 0x45, 0x00, 0x00, 0x00 }, /* Ë to E */ 1321 { 0x00CC, 0x49, 0x00, 0x00, 0x00 }, /* Ì to I */ 1322 { 0x00CD, 0x49, 0x00, 0x00, 0x00 }, /* Í to I */ 1323 { 0x00CE, 0x49, 0x00, 0x00, 0x00 }, /* Î to I */ 1324 { 0x00CF, 0x49, 0x00, 0x00, 0x00 }, /* Ï to I */ 1325 { 0x00D0, 0x44, 0x00, 0x00, 0x00 }, /* Ð to D */ 1326 { 0x00D1, 0x4E, 0x00, 0x00, 0x00 }, /* Ñ to N */ 1327 { 0x00D2, 0x4F, 0x00, 0x00, 0x00 }, /* Ò to O */ 1328 { 0x00D3, 0x4F, 0x00, 0x00, 0x00 }, /* Ó to O */ 1329 { 0x00D4, 0x4F, 0x00, 0x00, 0x00 }, /* Ô to O */ 1330 { 0x00D5, 0x4F, 0x00, 0x00, 0x00 }, /* Õ to O */ 1331 { 0x00D6, 0x4F, 0x65, 0x00, 0x00 }, /* Ö to Oe */ 1332 { 0x00D7, 0x78, 0x00, 0x00, 0x00 }, /* × to x */ 1333 { 0x00D8, 0x4F, 0x00, 0x00, 0x00 }, /* Ø to O */ 1334 { 0x00D9, 0x55, 0x00, 0x00, 0x00 }, /* Ù to U */ 1335 { 0x00DA, 0x55, 0x00, 0x00, 0x00 }, /* Ú to U */ 1336 { 0x00DB, 0x55, 0x00, 0x00, 0x00 }, /* Û to U */ 1337 { 0x00DC, 0x55, 0x65, 0x00, 0x00 }, /* Ü to Ue */ 1338 { 0x00DD, 0x59, 0x00, 0x00, 0x00 }, /* Ý to Y */ 1339 { 0x00DE, 0x54, 0x68, 0x00, 0x00 }, /* Þ to Th */ 1340 { 0x00DF, 0x73, 0x73, 0x00, 0x00 }, /* ß to ss */ 1341 { 0x00E0, 0x61, 0x00, 0x00, 0x00 }, /* à to a */ 1342 { 0x00E1, 0x61, 0x00, 0x00, 0x00 }, /* á to a */ 1343 { 0x00E2, 0x61, 0x00, 0x00, 0x00 }, /* â to a */ 1344 { 0x00E3, 0x61, 0x00, 0x00, 0x00 }, /* ã to a */ 1345 { 0x00E4, 0x61, 0x65, 0x00, 0x00 }, /* ä to ae */ 1346 { 0x00E5, 0x61, 0x61, 0x00, 0x00 }, /* å to aa */ 1347 { 0x00E6, 0x61, 0x65, 0x00, 0x00 }, /* æ to ae */ 1348 { 0x00E7, 0x63, 0x00, 0x00, 0x00 }, /* ç to c */ 1349 { 0x00E8, 0x65, 0x00, 0x00, 0x00 }, /* è to e */ 1350 { 0x00E9, 0x65, 0x00, 0x00, 0x00 }, /* é to e */ 1351 { 0x00EA, 0x65, 0x00, 0x00, 0x00 }, /* ê to e */ 1352 { 0x00EB, 0x65, 0x00, 0x00, 0x00 }, /* ë to e */ 1353 { 0x00EC, 0x69, 0x00, 0x00, 0x00 }, /* ì to i */ 1354 { 0x00ED, 0x69, 0x00, 0x00, 0x00 }, /* í to i */ 1355 { 0x00EE, 0x69, 0x00, 0x00, 0x00 }, /* î to i */ 1356 { 0x00EF, 0x69, 0x00, 0x00, 0x00 }, /* ï to i */ 1357 { 0x00F0, 0x64, 0x00, 0x00, 0x00 }, /* ð to d */ 1358 { 0x00F1, 0x6E, 0x00, 0x00, 0x00 }, /* ñ to n */ 1359 { 0x00F2, 0x6F, 0x00, 0x00, 0x00 }, /* ò to o */ 1360 { 0x00F3, 0x6F, 0x00, 0x00, 0x00 }, /* ó to o */ 1361 { 0x00F4, 0x6F, 0x00, 0x00, 0x00 }, /* ô to o */ 1362 { 0x00F5, 0x6F, 0x00, 0x00, 0x00 }, /* õ to o */ 1363 { 0x00F6, 0x6F, 0x65, 0x00, 0x00 }, /* ö to oe */ 1364 { 0x00F7, 0x3A, 0x00, 0x00, 0x00 }, /* ÷ to : */ 1365 { 0x00F8, 0x6F, 0x00, 0x00, 0x00 }, /* ø to o */ 1366 { 0x00F9, 0x75, 0x00, 0x00, 0x00 }, /* ù to u */ 1367 { 0x00FA, 0x75, 0x00, 0x00, 0x00 }, /* ú to u */ 1368 { 0x00FB, 0x75, 0x00, 0x00, 0x00 }, /* û to u */ 1369 { 0x00FC, 0x75, 0x65, 0x00, 0x00 }, /* ü to ue */ 1370 { 0x00FD, 0x79, 0x00, 0x00, 0x00 }, /* ý to y */ 1371 { 0x00FE, 0x74, 0x68, 0x00, 0x00 }, /* þ to th */ 1372 { 0x00FF, 0x79, 0x00, 0x00, 0x00 }, /* ÿ to y */ 1373 { 0x0100, 0x41, 0x00, 0x00, 0x00 }, /* Ā to A */ 1374 { 0x0101, 0x61, 0x00, 0x00, 0x00 }, /* ā to a */ 1375 { 0x0102, 0x41, 0x00, 0x00, 0x00 }, /* Ă to A */ 1376 { 0x0103, 0x61, 0x00, 0x00, 0x00 }, /* ă to a */ 1377 { 0x0104, 0x41, 0x00, 0x00, 0x00 }, /* Ą to A */ 1378 { 0x0105, 0x61, 0x00, 0x00, 0x00 }, /* ą to a */ 1379 { 0x0106, 0x43, 0x00, 0x00, 0x00 }, /* Ć to C */ 1380 { 0x0107, 0x63, 0x00, 0x00, 0x00 }, /* ć to c */ 1381 { 0x0108, 0x43, 0x68, 0x00, 0x00 }, /* Ĉ to Ch */ 1382 { 0x0109, 0x63, 0x68, 0x00, 0x00 }, /* ĉ to ch */ 1383 { 0x010A, 0x43, 0x00, 0x00, 0x00 }, /* Ċ to C */ 1384 { 0x010B, 0x63, 0x00, 0x00, 0x00 }, /* ċ to c */ 1385 { 0x010C, 0x43, 0x00, 0x00, 0x00 }, /* Č to C */ 1386 { 0x010D, 0x63, 0x00, 0x00, 0x00 }, /* č to c */ 1387 { 0x010E, 0x44, 0x00, 0x00, 0x00 }, /* Ď to D */ 1388 { 0x010F, 0x64, 0x00, 0x00, 0x00 }, /* ď to d */ 1389 { 0x0110, 0x44, 0x00, 0x00, 0x00 }, /* Đ to D */ 1390 { 0x0111, 0x64, 0x00, 0x00, 0x00 }, /* đ to d */ 1391 { 0x0112, 0x45, 0x00, 0x00, 0x00 }, /* Ē to E */ 1392 { 0x0113, 0x65, 0x00, 0x00, 0x00 }, /* ē to e */ 1393 { 0x0114, 0x45, 0x00, 0x00, 0x00 }, /* Ĕ to E */ 1394 { 0x0115, 0x65, 0x00, 0x00, 0x00 }, /* ĕ to e */ 1395 { 0x0116, 0x45, 0x00, 0x00, 0x00 }, /* Ė to E */ 1396 { 0x0117, 0x65, 0x00, 0x00, 0x00 }, /* ė to e */ 1397 { 0x0118, 0x45, 0x00, 0x00, 0x00 }, /* Ę to E */ 1398 { 0x0119, 0x65, 0x00, 0x00, 0x00 }, /* ę to e */ 1399 { 0x011A, 0x45, 0x00, 0x00, 0x00 }, /* Ě to E */ 1400 { 0x011B, 0x65, 0x00, 0x00, 0x00 }, /* ě to e */ 1401 { 0x011C, 0x47, 0x68, 0x00, 0x00 }, /* Ĝ to Gh */ 1402 { 0x011D, 0x67, 0x68, 0x00, 0x00 }, /* ĝ to gh */ 1403 { 0x011E, 0x47, 0x00, 0x00, 0x00 }, /* Ğ to G */ 1404 { 0x011F, 0x67, 0x00, 0x00, 0x00 }, /* ğ to g */ 1405 { 0x0120, 0x47, 0x00, 0x00, 0x00 }, /* Ġ to G */ 1406 { 0x0121, 0x67, 0x00, 0x00, 0x00 }, /* ġ to g */ 1407 { 0x0122, 0x47, 0x00, 0x00, 0x00 }, /* Ģ to G */ 1408 { 0x0123, 0x67, 0x00, 0x00, 0x00 }, /* ģ to g */ 1409 { 0x0124, 0x48, 0x68, 0x00, 0x00 }, /* Ĥ to Hh */ 1410 { 0x0125, 0x68, 0x68, 0x00, 0x00 }, /* ĥ to hh */ 1411 { 0x0126, 0x48, 0x00, 0x00, 0x00 }, /* Ħ to H */ 1412 { 0x0127, 0x68, 0x00, 0x00, 0x00 }, /* ħ to h */ 1413 { 0x0128, 0x49, 0x00, 0x00, 0x00 }, /* Ĩ to I */ 1414 { 0x0129, 0x69, 0x00, 0x00, 0x00 }, /* ĩ to i */ 1415 { 0x012A, 0x49, 0x00, 0x00, 0x00 }, /* Ī to I */ 1416 { 0x012B, 0x69, 0x00, 0x00, 0x00 }, /* ī to i */ 1417 { 0x012C, 0x49, 0x00, 0x00, 0x00 }, /* Ĭ to I */ 1418 { 0x012D, 0x69, 0x00, 0x00, 0x00 }, /* ĭ to i */ 1419 { 0x012E, 0x49, 0x00, 0x00, 0x00 }, /* Į to I */ 1420 { 0x012F, 0x69, 0x00, 0x00, 0x00 }, /* į to i */ 1421 { 0x0130, 0x49, 0x00, 0x00, 0x00 }, /* İ to I */ 1422 { 0x0131, 0x69, 0x00, 0x00, 0x00 }, /* ı to i */ 1423 { 0x0132, 0x49, 0x4A, 0x00, 0x00 }, /* IJ to IJ */ 1424 { 0x0133, 0x69, 0x6A, 0x00, 0x00 }, /* ij to ij */ 1425 { 0x0134, 0x4A, 0x68, 0x00, 0x00 }, /* Ĵ to Jh */ 1426 { 0x0135, 0x6A, 0x68, 0x00, 0x00 }, /* ĵ to jh */ 1427 { 0x0136, 0x4B, 0x00, 0x00, 0x00 }, /* Ķ to K */ 1428 { 0x0137, 0x6B, 0x00, 0x00, 0x00 }, /* ķ to k */ 1429 { 0x0138, 0x6B, 0x00, 0x00, 0x00 }, /* ĸ to k */ 1430 { 0x0139, 0x4C, 0x00, 0x00, 0x00 }, /* Ĺ to L */ 1431 { 0x013A, 0x6C, 0x00, 0x00, 0x00 }, /* ĺ to l */ 1432 { 0x013B, 0x4C, 0x00, 0x00, 0x00 }, /* Ļ to L */ 1433 { 0x013C, 0x6C, 0x00, 0x00, 0x00 }, /* ļ to l */ 1434 { 0x013D, 0x4C, 0x00, 0x00, 0x00 }, /* Ľ to L */ 1435 { 0x013E, 0x6C, 0x00, 0x00, 0x00 }, /* ľ to l */ 1436 { 0x013F, 0x4C, 0x2E, 0x00, 0x00 }, /* Ŀ to L. */ 1437 { 0x0140, 0x6C, 0x2E, 0x00, 0x00 }, /* ŀ to l. */ 1438 { 0x0141, 0x4C, 0x00, 0x00, 0x00 }, /* Ł to L */ 1439 { 0x0142, 0x6C, 0x00, 0x00, 0x00 }, /* ł to l */ 1440 { 0x0143, 0x4E, 0x00, 0x00, 0x00 }, /* Ń to N */ 1441 { 0x0144, 0x6E, 0x00, 0x00, 0x00 }, /* ń to n */ 1442 { 0x0145, 0x4E, 0x00, 0x00, 0x00 }, /* Ņ to N */ 1443 { 0x0146, 0x6E, 0x00, 0x00, 0x00 }, /* ņ to n */ 1444 { 0x0147, 0x4E, 0x00, 0x00, 0x00 }, /* Ň to N */ 1445 { 0x0148, 0x6E, 0x00, 0x00, 0x00 }, /* ň to n */ 1446 { 0x0149, 0x27, 0x6E, 0x00, 0x00 }, /* ʼn to 'n */ 1447 { 0x014A, 0x4E, 0x47, 0x00, 0x00 }, /* Ŋ to NG */ 1448 { 0x014B, 0x6E, 0x67, 0x00, 0x00 }, /* ŋ to ng */ 1449 { 0x014C, 0x4F, 0x00, 0x00, 0x00 }, /* Ō to O */ 1450 { 0x014D, 0x6F, 0x00, 0x00, 0x00 }, /* ō to o */ 1451 { 0x014E, 0x4F, 0x00, 0x00, 0x00 }, /* Ŏ to O */ 1452 { 0x014F, 0x6F, 0x00, 0x00, 0x00 }, /* ŏ to o */ 1453 { 0x0150, 0x4F, 0x00, 0x00, 0x00 }, /* Ő to O */ 1454 { 0x0151, 0x6F, 0x00, 0x00, 0x00 }, /* ő to o */ 1455 { 0x0152, 0x4F, 0x45, 0x00, 0x00 }, /* Œ to OE */ 1456 { 0x0153, 0x6F, 0x65, 0x00, 0x00 }, /* œ to oe */ 1457 { 0x0154, 0x52, 0x00, 0x00, 0x00 }, /* Ŕ to R */ 1458 { 0x0155, 0x72, 0x00, 0x00, 0x00 }, /* ŕ to r */ 1459 { 0x0156, 0x52, 0x00, 0x00, 0x00 }, /* Ŗ to R */ 1460 { 0x0157, 0x72, 0x00, 0x00, 0x00 }, /* ŗ to r */ 1461 { 0x0158, 0x52, 0x00, 0x00, 0x00 }, /* Ř to R */ 1462 { 0x0159, 0x72, 0x00, 0x00, 0x00 }, /* ř to r */ 1463 { 0x015A, 0x53, 0x00, 0x00, 0x00 }, /* Ś to S */ 1464 { 0x015B, 0x73, 0x00, 0x00, 0x00 }, /* ś to s */ 1465 { 0x015C, 0x53, 0x68, 0x00, 0x00 }, /* Ŝ to Sh */ 1466 { 0x015D, 0x73, 0x68, 0x00, 0x00 }, /* ŝ to sh */ 1467 { 0x015E, 0x53, 0x00, 0x00, 0x00 }, /* Ş to S */ 1468 { 0x015F, 0x73, 0x00, 0x00, 0x00 }, /* ş to s */ 1469 { 0x0160, 0x53, 0x00, 0x00, 0x00 }, /* Š to S */ 1470 { 0x0161, 0x73, 0x00, 0x00, 0x00 }, /* š to s */ 1471 { 0x0162, 0x54, 0x00, 0x00, 0x00 }, /* Ţ to T */ 1472 { 0x0163, 0x74, 0x00, 0x00, 0x00 }, /* ţ to t */ 1473 { 0x0164, 0x54, 0x00, 0x00, 0x00 }, /* Ť to T */ 1474 { 0x0165, 0x74, 0x00, 0x00, 0x00 }, /* ť to t */ 1475 { 0x0166, 0x54, 0x00, 0x00, 0x00 }, /* Ŧ to T */ 1476 { 0x0167, 0x74, 0x00, 0x00, 0x00 }, /* ŧ to t */ 1477 { 0x0168, 0x55, 0x00, 0x00, 0x00 }, /* Ũ to U */ 1478 { 0x0169, 0x75, 0x00, 0x00, 0x00 }, /* ũ to u */ 1479 { 0x016A, 0x55, 0x00, 0x00, 0x00 }, /* Ū to U */ 1480 { 0x016B, 0x75, 0x00, 0x00, 0x00 }, /* ū to u */ 1481 { 0x016C, 0x55, 0x00, 0x00, 0x00 }, /* Ŭ to U */ 1482 { 0x016D, 0x75, 0x00, 0x00, 0x00 }, /* ŭ to u */ 1483 { 0x016E, 0x55, 0x00, 0x00, 0x00 }, /* Ů to U */ 1484 { 0x016F, 0x75, 0x00, 0x00, 0x00 }, /* ů to u */ 1485 { 0x0170, 0x55, 0x00, 0x00, 0x00 }, /* Ű to U */ 1486 { 0x0171, 0x75, 0x00, 0x00, 0x00 }, /* ű to u */ 1487 { 0x0172, 0x55, 0x00, 0x00, 0x00 }, /* Ų to U */ 1488 { 0x0173, 0x75, 0x00, 0x00, 0x00 }, /* ų to u */ 1489 { 0x0174, 0x57, 0x00, 0x00, 0x00 }, /* Ŵ to W */ 1490 { 0x0175, 0x77, 0x00, 0x00, 0x00 }, /* ŵ to w */ 1491 { 0x0176, 0x59, 0x00, 0x00, 0x00 }, /* Ŷ to Y */ 1492 { 0x0177, 0x79, 0x00, 0x00, 0x00 }, /* ŷ to y */ 1493 { 0x0178, 0x59, 0x00, 0x00, 0x00 }, /* Ÿ to Y */ 1494 { 0x0179, 0x5A, 0x00, 0x00, 0x00 }, /* Ź to Z */ 1495 { 0x017A, 0x7A, 0x00, 0x00, 0x00 }, /* ź to z */ 1496 { 0x017B, 0x5A, 0x00, 0x00, 0x00 }, /* Ż to Z */ 1497 { 0x017C, 0x7A, 0x00, 0x00, 0x00 }, /* ż to z */ 1498 { 0x017D, 0x5A, 0x00, 0x00, 0x00 }, /* Ž to Z */ 1499 { 0x017E, 0x7A, 0x00, 0x00, 0x00 }, /* ž to z */ 1500 { 0x017F, 0x73, 0x00, 0x00, 0x00 }, /* ſ to s */ 1501 { 0x0192, 0x66, 0x00, 0x00, 0x00 }, /* ƒ to f */ 1502 { 0x0218, 0x53, 0x00, 0x00, 0x00 }, /* Ș to S */ 1503 { 0x0219, 0x73, 0x00, 0x00, 0x00 }, /* ș to s */ 1504 { 0x021A, 0x54, 0x00, 0x00, 0x00 }, /* Ț to T */ 1505 { 0x021B, 0x74, 0x00, 0x00, 0x00 }, /* ț to t */ 1506 { 0x0386, 0x41, 0x00, 0x00, 0x00 }, /* Ά to A */ 1507 { 0x0388, 0x45, 0x00, 0x00, 0x00 }, /* Έ to E */ 1508 { 0x0389, 0x49, 0x00, 0x00, 0x00 }, /* Ή to I */ 1509 { 0x038A, 0x49, 0x00, 0x00, 0x00 }, /* Ί to I */ 1510 { 0x038C, 0x4f, 0x00, 0x00, 0x00 }, /* Ό to O */ 1511 { 0x038E, 0x59, 0x00, 0x00, 0x00 }, /* Ύ to Y */ 1512 { 0x038F, 0x4f, 0x00, 0x00, 0x00 }, /* Ώ to O */ 1513 { 0x0390, 0x69, 0x00, 0x00, 0x00 }, /* ΐ to i */ 1514 { 0x0391, 0x41, 0x00, 0x00, 0x00 }, /* Α to A */ 1515 { 0x0392, 0x42, 0x00, 0x00, 0x00 }, /* Β to B */ 1516 { 0x0393, 0x47, 0x00, 0x00, 0x00 }, /* Γ to G */ 1517 { 0x0394, 0x44, 0x00, 0x00, 0x00 }, /* Δ to D */ 1518 { 0x0395, 0x45, 0x00, 0x00, 0x00 }, /* Ε to E */ 1519 { 0x0396, 0x5a, 0x00, 0x00, 0x00 }, /* Ζ to Z */ 1520 { 0x0397, 0x49, 0x00, 0x00, 0x00 }, /* Η to I */ 1521 { 0x0398, 0x54, 0x68, 0x00, 0x00 }, /* Θ to Th */ 1522 { 0x0399, 0x49, 0x00, 0x00, 0x00 }, /* Ι to I */ 1523 { 0x039A, 0x4b, 0x00, 0x00, 0x00 }, /* Κ to K */ 1524 { 0x039B, 0x4c, 0x00, 0x00, 0x00 }, /* Λ to L */ 1525 { 0x039C, 0x4d, 0x00, 0x00, 0x00 }, /* Μ to M */ 1526 { 0x039D, 0x4e, 0x00, 0x00, 0x00 }, /* Ν to N */ 1527 { 0x039E, 0x58, 0x00, 0x00, 0x00 }, /* Ξ to X */ 1528 { 0x039F, 0x4f, 0x00, 0x00, 0x00 }, /* Ο to O */ 1529 { 0x03A0, 0x50, 0x00, 0x00, 0x00 }, /* Π to P */ 1530 { 0x03A1, 0x52, 0x00, 0x00, 0x00 }, /* Ρ to R */ 1531 { 0x03A3, 0x53, 0x00, 0x00, 0x00 }, /* Σ to S */ 1532 { 0x03A4, 0x54, 0x00, 0x00, 0x00 }, /* Τ to T */ 1533 { 0x03A5, 0x59, 0x00, 0x00, 0x00 }, /* Υ to Y */ 1534 { 0x03A6, 0x46, 0x00, 0x00, 0x00 }, /* Φ to F */ 1535 { 0x03A7, 0x43, 0x68, 0x00, 0x00 }, /* Χ to Ch */ 1536 { 0x03A8, 0x50, 0x73, 0x00, 0x00 }, /* Ψ to Ps */ 1537 { 0x03A9, 0x4f, 0x00, 0x00, 0x00 }, /* Ω to O */ 1538 { 0x03AA, 0x49, 0x00, 0x00, 0x00 }, /* Ϊ to I */ 1539 { 0x03AB, 0x59, 0x00, 0x00, 0x00 }, /* Ϋ to Y */ 1540 { 0x03AC, 0x61, 0x00, 0x00, 0x00 }, /* ά to a */ 1541 { 0x03AD, 0x65, 0x00, 0x00, 0x00 }, /* έ to e */ 1542 { 0x03AE, 0x69, 0x00, 0x00, 0x00 }, /* ή to i */ 1543 { 0x03AF, 0x69, 0x00, 0x00, 0x00 }, /* ί to i */ 1544 { 0x03B1, 0x61, 0x00, 0x00, 0x00 }, /* α to a */ 1545 { 0x03B2, 0x62, 0x00, 0x00, 0x00 }, /* β to b */ 1546 { 0x03B3, 0x67, 0x00, 0x00, 0x00 }, /* γ to g */ 1547 { 0x03B4, 0x64, 0x00, 0x00, 0x00 }, /* δ to d */ 1548 { 0x03B5, 0x65, 0x00, 0x00, 0x00 }, /* ε to e */ 1549 { 0x03B6, 0x7a, 0x00, 0x00, 0x00 }, /* ζ to z */ 1550 { 0x03B7, 0x69, 0x00, 0x00, 0x00 }, /* η to i */ 1551 { 0x03B8, 0x74, 0x68, 0x00, 0x00 }, /* θ to th */ 1552 { 0x03B9, 0x69, 0x00, 0x00, 0x00 }, /* ι to i */ 1553 { 0x03BA, 0x6b, 0x00, 0x00, 0x00 }, /* κ to k */ 1554 { 0x03BB, 0x6c, 0x00, 0x00, 0x00 }, /* λ to l */ 1555 { 0x03BC, 0x6d, 0x00, 0x00, 0x00 }, /* μ to m */ 1556 { 0x03BD, 0x6e, 0x00, 0x00, 0x00 }, /* ν to n */ 1557 { 0x03BE, 0x78, 0x00, 0x00, 0x00 }, /* ξ to x */ 1558 { 0x03BF, 0x6f, 0x00, 0x00, 0x00 }, /* ο to o */ 1559 { 0x03C0, 0x70, 0x00, 0x00, 0x00 }, /* π to p */ 1560 { 0x03C1, 0x72, 0x00, 0x00, 0x00 }, /* ρ to r */ 1561 { 0x03C3, 0x73, 0x00, 0x00, 0x00 }, /* σ to s */ 1562 { 0x03C4, 0x74, 0x00, 0x00, 0x00 }, /* τ to t */ 1563 { 0x03C5, 0x79, 0x00, 0x00, 0x00 }, /* υ to y */ 1564 { 0x03C6, 0x66, 0x00, 0x00, 0x00 }, /* φ to f */ 1565 { 0x03C7, 0x63, 0x68, 0x00, 0x00 }, /* χ to ch */ 1566 { 0x03C8, 0x70, 0x73, 0x00, 0x00 }, /* ψ to ps */ 1567 { 0x03C9, 0x6f, 0x00, 0x00, 0x00 }, /* ω to o */ 1568 { 0x03CA, 0x69, 0x00, 0x00, 0x00 }, /* ϊ to i */ 1569 { 0x03CB, 0x79, 0x00, 0x00, 0x00 }, /* ϋ to y */ 1570 { 0x03CC, 0x6f, 0x00, 0x00, 0x00 }, /* ό to o */ 1571 { 0x03CD, 0x79, 0x00, 0x00, 0x00 }, /* ύ to y */ 1572 { 0x03CE, 0x69, 0x00, 0x00, 0x00 }, /* ώ to i */ 1573 { 0x0400, 0x45, 0x00, 0x00, 0x00 }, /* Ѐ to E */ 1574 { 0x0401, 0x45, 0x00, 0x00, 0x00 }, /* Ё to E */ 1575 { 0x0402, 0x44, 0x00, 0x00, 0x00 }, /* Ђ to D */ 1576 { 0x0403, 0x47, 0x00, 0x00, 0x00 }, /* Ѓ to G */ 1577 { 0x0404, 0x45, 0x00, 0x00, 0x00 }, /* Є to E */ 1578 { 0x0405, 0x5a, 0x00, 0x00, 0x00 }, /* Ѕ to Z */ 1579 { 0x0406, 0x49, 0x00, 0x00, 0x00 }, /* І to I */ 1580 { 0x0407, 0x49, 0x00, 0x00, 0x00 }, /* Ї to I */ 1581 { 0x0408, 0x4a, 0x00, 0x00, 0x00 }, /* Ј to J */ 1582 { 0x0409, 0x49, 0x00, 0x00, 0x00 }, /* Љ to I */ 1583 { 0x040A, 0x4e, 0x00, 0x00, 0x00 }, /* Њ to N */ 1584 { 0x040B, 0x44, 0x00, 0x00, 0x00 }, /* Ћ to D */ 1585 { 0x040C, 0x4b, 0x00, 0x00, 0x00 }, /* Ќ to K */ 1586 { 0x040D, 0x49, 0x00, 0x00, 0x00 }, /* Ѝ to I */ 1587 { 0x040E, 0x55, 0x00, 0x00, 0x00 }, /* Ў to U */ 1588 { 0x040F, 0x44, 0x00, 0x00, 0x00 }, /* Џ to D */ 1589 { 0x0410, 0x41, 0x00, 0x00, 0x00 }, /* А to A */ 1590 { 0x0411, 0x42, 0x00, 0x00, 0x00 }, /* Б to B */ 1591 { 0x0412, 0x56, 0x00, 0x00, 0x00 }, /* В to V */ 1592 { 0x0413, 0x47, 0x00, 0x00, 0x00 }, /* Г to G */ 1593 { 0x0414, 0x44, 0x00, 0x00, 0x00 }, /* Д to D */ 1594 { 0x0415, 0x45, 0x00, 0x00, 0x00 }, /* Е to E */ 1595 { 0x0416, 0x5a, 0x68, 0x00, 0x00 }, /* Ж to Zh */ 1596 { 0x0417, 0x5a, 0x00, 0x00, 0x00 }, /* З to Z */ 1597 { 0x0418, 0x49, 0x00, 0x00, 0x00 }, /* И to I */ 1598 { 0x0419, 0x49, 0x00, 0x00, 0x00 }, /* Й to I */ 1599 { 0x041A, 0x4b, 0x00, 0x00, 0x00 }, /* К to K */ 1600 { 0x041B, 0x4c, 0x00, 0x00, 0x00 }, /* Л to L */ 1601 { 0x041C, 0x4d, 0x00, 0x00, 0x00 }, /* М to M */ 1602 { 0x041D, 0x4e, 0x00, 0x00, 0x00 }, /* Н to N */ 1603 { 0x041E, 0x4f, 0x00, 0x00, 0x00 }, /* О to O */ 1604 { 0x041F, 0x50, 0x00, 0x00, 0x00 }, /* П to P */ 1605 { 0x0420, 0x52, 0x00, 0x00, 0x00 }, /* Р to R */ 1606 { 0x0421, 0x53, 0x00, 0x00, 0x00 }, /* С to S */ 1607 { 0x0422, 0x54, 0x00, 0x00, 0x00 }, /* Т to T */ 1608 { 0x0423, 0x55, 0x00, 0x00, 0x00 }, /* У to U */ 1609 { 0x0424, 0x46, 0x00, 0x00, 0x00 }, /* Ф to F */ 1610 { 0x0425, 0x4b, 0x68, 0x00, 0x00 }, /* Х to Kh */ 1611 { 0x0426, 0x54, 0x63, 0x00, 0x00 }, /* Ц to Tc */ 1612 { 0x0427, 0x43, 0x68, 0x00, 0x00 }, /* Ч to Ch */ 1613 { 0x0428, 0x53, 0x68, 0x00, 0x00 }, /* Ш to Sh */ 1614 { 0x0429, 0x53, 0x68, 0x63, 0x68 }, /* Щ to Shch */ 1615 { 0x042A, 0x61, 0x00, 0x00, 0x00 }, /* to A */ 1616 { 0x042B, 0x59, 0x00, 0x00, 0x00 }, /* Ы to Y */ 1617 { 0x042C, 0x59, 0x00, 0x00, 0x00 }, /* to Y */ 1618 { 0x042D, 0x45, 0x00, 0x00, 0x00 }, /* Э to E */ 1619 { 0x042E, 0x49, 0x75, 0x00, 0x00 }, /* Ю to Iu */ 1620 { 0x042F, 0x49, 0x61, 0x00, 0x00 }, /* Я to Ia */ 1621 { 0x0430, 0x61, 0x00, 0x00, 0x00 }, /* а to a */ 1622 { 0x0431, 0x62, 0x00, 0x00, 0x00 }, /* б to b */ 1623 { 0x0432, 0x76, 0x00, 0x00, 0x00 }, /* в to v */ 1624 { 0x0433, 0x67, 0x00, 0x00, 0x00 }, /* г to g */ 1625 { 0x0434, 0x64, 0x00, 0x00, 0x00 }, /* д to d */ 1626 { 0x0435, 0x65, 0x00, 0x00, 0x00 }, /* е to e */ 1627 { 0x0436, 0x7a, 0x68, 0x00, 0x00 }, /* ж to zh */ 1628 { 0x0437, 0x7a, 0x00, 0x00, 0x00 }, /* з to z */ 1629 { 0x0438, 0x69, 0x00, 0x00, 0x00 }, /* и to i */ 1630 { 0x0439, 0x69, 0x00, 0x00, 0x00 }, /* й to i */ 1631 { 0x043A, 0x6b, 0x00, 0x00, 0x00 }, /* к to k */ 1632 { 0x043B, 0x6c, 0x00, 0x00, 0x00 }, /* л to l */ 1633 { 0x043C, 0x6d, 0x00, 0x00, 0x00 }, /* м to m */ 1634 { 0x043D, 0x6e, 0x00, 0x00, 0x00 }, /* н to n */ 1635 { 0x043E, 0x6f, 0x00, 0x00, 0x00 }, /* о to o */ 1636 { 0x043F, 0x70, 0x00, 0x00, 0x00 }, /* п to p */ 1637 { 0x0440, 0x72, 0x00, 0x00, 0x00 }, /* р to r */ 1638 { 0x0441, 0x73, 0x00, 0x00, 0x00 }, /* с to s */ 1639 { 0x0442, 0x74, 0x00, 0x00, 0x00 }, /* т to t */ 1640 { 0x0443, 0x75, 0x00, 0x00, 0x00 }, /* у to u */ 1641 { 0x0444, 0x66, 0x00, 0x00, 0x00 }, /* ф to f */ 1642 { 0x0445, 0x6b, 0x68, 0x00, 0x00 }, /* х to kh */ 1643 { 0x0446, 0x74, 0x63, 0x00, 0x00 }, /* ц to tc */ 1644 { 0x0447, 0x63, 0x68, 0x00, 0x00 }, /* ч to ch */ 1645 { 0x0448, 0x73, 0x68, 0x00, 0x00 }, /* ш to sh */ 1646 { 0x0449, 0x73, 0x68, 0x63, 0x68 }, /* щ to shch */ 1647 { 0x044A, 0x61, 0x00, 0x00, 0x00 }, /* to a */ 1648 { 0x044B, 0x79, 0x00, 0x00, 0x00 }, /* ы to y */ 1649 { 0x044C, 0x79, 0x00, 0x00, 0x00 }, /* to y */ 1650 { 0x044D, 0x65, 0x00, 0x00, 0x00 }, /* э to e */ 1651 { 0x044E, 0x69, 0x75, 0x00, 0x00 }, /* ю to iu */ 1652 { 0x044F, 0x69, 0x61, 0x00, 0x00 }, /* я to ia */ 1653 { 0x0450, 0x65, 0x00, 0x00, 0x00 }, /* ѐ to e */ 1654 { 0x0451, 0x65, 0x00, 0x00, 0x00 }, /* ё to e */ 1655 { 0x0452, 0x64, 0x00, 0x00, 0x00 }, /* ђ to d */ 1656 { 0x0453, 0x67, 0x00, 0x00, 0x00 }, /* ѓ to g */ 1657 { 0x0454, 0x65, 0x00, 0x00, 0x00 }, /* є to e */ 1658 { 0x0455, 0x7a, 0x00, 0x00, 0x00 }, /* ѕ to z */ 1659 { 0x0456, 0x69, 0x00, 0x00, 0x00 }, /* і to i */ 1660 { 0x0457, 0x69, 0x00, 0x00, 0x00 }, /* ї to i */ 1661 { 0x0458, 0x6a, 0x00, 0x00, 0x00 }, /* ј to j */ 1662 { 0x0459, 0x69, 0x00, 0x00, 0x00 }, /* љ to i */ 1663 { 0x045A, 0x6e, 0x00, 0x00, 0x00 }, /* њ to n */ 1664 { 0x045B, 0x64, 0x00, 0x00, 0x00 }, /* ћ to d */ 1665 { 0x045C, 0x6b, 0x00, 0x00, 0x00 }, /* ќ to k */ 1666 { 0x045D, 0x69, 0x00, 0x00, 0x00 }, /* ѝ to i */ 1667 { 0x045E, 0x75, 0x00, 0x00, 0x00 }, /* ў to u */ 1668 { 0x045F, 0x64, 0x00, 0x00, 0x00 }, /* џ to d */ 1669 { 0x1E02, 0x42, 0x00, 0x00, 0x00 }, /* Ḃ to B */ 1670 { 0x1E03, 0x62, 0x00, 0x00, 0x00 }, /* ḃ to b */ 1671 { 0x1E0A, 0x44, 0x00, 0x00, 0x00 }, /* Ḋ to D */ 1672 { 0x1E0B, 0x64, 0x00, 0x00, 0x00 }, /* ḋ to d */ 1673 { 0x1E1E, 0x46, 0x00, 0x00, 0x00 }, /* Ḟ to F */ 1674 { 0x1E1F, 0x66, 0x00, 0x00, 0x00 }, /* ḟ to f */ 1675 { 0x1E40, 0x4D, 0x00, 0x00, 0x00 }, /* Ṁ to M */ 1676 { 0x1E41, 0x6D, 0x00, 0x00, 0x00 }, /* ṁ to m */ 1677 { 0x1E56, 0x50, 0x00, 0x00, 0x00 }, /* Ṗ to P */ 1678 { 0x1E57, 0x70, 0x00, 0x00, 0x00 }, /* ṗ to p */ 1679 { 0x1E60, 0x53, 0x00, 0x00, 0x00 }, /* Ṡ to S */ 1680 { 0x1E61, 0x73, 0x00, 0x00, 0x00 }, /* ṡ to s */ 1681 { 0x1E6A, 0x54, 0x00, 0x00, 0x00 }, /* Ṫ to T */ 1682 { 0x1E6B, 0x74, 0x00, 0x00, 0x00 }, /* ṫ to t */ 1683 { 0x1E80, 0x57, 0x00, 0x00, 0x00 }, /* Ẁ to W */ 1684 { 0x1E81, 0x77, 0x00, 0x00, 0x00 }, /* ẁ to w */ 1685 { 0x1E82, 0x57, 0x00, 0x00, 0x00 }, /* Ẃ to W */ 1686 { 0x1E83, 0x77, 0x00, 0x00, 0x00 }, /* ẃ to w */ 1687 { 0x1E84, 0x57, 0x00, 0x00, 0x00 }, /* Ẅ to W */ 1688 { 0x1E85, 0x77, 0x00, 0x00, 0x00 }, /* ẅ to w */ 1689 { 0x1EF2, 0x59, 0x00, 0x00, 0x00 }, /* Ỳ to Y */ 1690 { 0x1EF3, 0x79, 0x00, 0x00, 0x00 }, /* ỳ to y */ 1691 { 0xFB00, 0x66, 0x66, 0x00, 0x00 }, /* ff to ff */ 1692 { 0xFB01, 0x66, 0x69, 0x00, 0x00 }, /* fi to fi */ 1693 { 0xFB02, 0x66, 0x6C, 0x00, 0x00 }, /* fl to fl */ 1694 { 0xFB05, 0x73, 0x74, 0x00, 0x00 }, /* ſt to st */ 1695 { 0xFB06, 0x73, 0x74, 0x00, 0x00 }, /* st to st */ 1696 }; 1697 1698 static const Transliteration *spellfixFindTranslit(int c, int *pxTop){ 1699 *pxTop = (sizeof(translit)/sizeof(translit[0])) - 1; 1700 return translit; 1701 } 1702 1703 /* 1704 ** Convert the input string from UTF-8 into pure ASCII by converting 1705 ** all non-ASCII characters to some combination of characters in the 1706 ** ASCII subset. 1707 ** 1708 ** The returned string might contain more characters than the input. 1709 ** 1710 ** Space to hold the returned string comes from sqlite3_malloc() and 1711 ** should be freed by the caller. 1712 */ 1713 static unsigned char *transliterate(const unsigned char *zIn, int nIn){ 1714 #ifdef SQLITE_SPELLFIX_5BYTE_MAPPINGS 1715 unsigned char *zOut = sqlite3_malloc64( nIn*5 + 1 ); 1716 #else 1717 unsigned char *zOut = sqlite3_malloc64( nIn*4 + 1 ); 1718 #endif 1719 int c, sz, nOut; 1720 if( zOut==0 ) return 0; 1721 nOut = 0; 1722 while( nIn>0 ){ 1723 c = utf8Read(zIn, nIn, &sz); 1724 zIn += sz; 1725 nIn -= sz; 1726 if( c<=127 ){ 1727 zOut[nOut++] = (unsigned char)c; 1728 }else{ 1729 int xTop, xBtm, x; 1730 const Transliteration *tbl = spellfixFindTranslit(c, &xTop); 1731 xBtm = 0; 1732 while( xTop>=xBtm ){ 1733 x = (xTop + xBtm)/2; 1734 if( tbl[x].cFrom==c ){ 1735 zOut[nOut++] = tbl[x].cTo0; 1736 if( tbl[x].cTo1 ){ 1737 zOut[nOut++] = tbl[x].cTo1; 1738 if( tbl[x].cTo2 ){ 1739 zOut[nOut++] = tbl[x].cTo2; 1740 if( tbl[x].cTo3 ){ 1741 zOut[nOut++] = tbl[x].cTo3; 1742 #ifdef SQLITE_SPELLFIX_5BYTE_MAPPINGS 1743 if( tbl[x].cTo4 ){ 1744 zOut[nOut++] = tbl[x].cTo4; 1745 } 1746 #endif /* SQLITE_SPELLFIX_5BYTE_MAPPINGS */ 1747 } 1748 } 1749 } 1750 c = 0; 1751 break; 1752 }else if( tbl[x].cFrom>c ){ 1753 xTop = x-1; 1754 }else{ 1755 xBtm = x+1; 1756 } 1757 } 1758 if( c ) zOut[nOut++] = '?'; 1759 } 1760 } 1761 zOut[nOut] = 0; 1762 return zOut; 1763 } 1764 1765 /* 1766 ** Return the number of characters in the shortest prefix of the input 1767 ** string that transliterates to an ASCII string nTrans bytes or longer. 1768 ** Or, if the transliteration of the input string is less than nTrans 1769 ** bytes in size, return the number of characters in the input string. 1770 */ 1771 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){ 1772 int i, c, sz, nOut; 1773 int nChar; 1774 1775 i = nOut = 0; 1776 for(nChar=0; i<nIn && nOut<nTrans; nChar++){ 1777 c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz); 1778 i += sz; 1779 1780 nOut++; 1781 if( c>=128 ){ 1782 int xTop, xBtm, x; 1783 const Transliteration *tbl = spellfixFindTranslit(c, &xTop); 1784 xBtm = 0; 1785 while( xTop>=xBtm ){ 1786 x = (xTop + xBtm)/2; 1787 if( tbl[x].cFrom==c ){ 1788 if( tbl[x].cTo1 ){ 1789 nOut++; 1790 if( tbl[x].cTo2 ){ 1791 nOut++; 1792 if( tbl[x].cTo3 ){ 1793 nOut++; 1794 } 1795 } 1796 } 1797 break; 1798 }else if( tbl[x].cFrom>c ){ 1799 xTop = x-1; 1800 }else{ 1801 xBtm = x+1; 1802 } 1803 } 1804 } 1805 } 1806 1807 return nChar; 1808 } 1809 1810 1811 /* 1812 ** spellfix1_translit(X) 1813 ** 1814 ** Convert a string that contains non-ASCII Roman characters into 1815 ** pure ASCII. 1816 */ 1817 static void transliterateSqlFunc( 1818 sqlite3_context *context, 1819 int argc, 1820 sqlite3_value **argv 1821 ){ 1822 const unsigned char *zIn = sqlite3_value_text(argv[0]); 1823 int nIn = sqlite3_value_bytes(argv[0]); 1824 unsigned char *zOut = transliterate(zIn, nIn); 1825 if( zOut==0 ){ 1826 sqlite3_result_error_nomem(context); 1827 }else{ 1828 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free); 1829 } 1830 } 1831 1832 /* 1833 ** spellfix1_scriptcode(X) 1834 ** 1835 ** Try to determine the dominant script used by the word X and return 1836 ** its ISO 15924 numeric code. 1837 ** 1838 ** The current implementation only understands the following scripts: 1839 ** 1840 ** 215 (Latin) 1841 ** 220 (Cyrillic) 1842 ** 200 (Greek) 1843 ** 1844 ** This routine will return 998 if the input X contains characters from 1845 ** two or more of the above scripts or 999 if X contains no characters 1846 ** from any of the above scripts. 1847 */ 1848 static void scriptCodeSqlFunc( 1849 sqlite3_context *context, 1850 int argc, 1851 sqlite3_value **argv 1852 ){ 1853 const unsigned char *zIn = sqlite3_value_text(argv[0]); 1854 int nIn = sqlite3_value_bytes(argv[0]); 1855 int c, sz; 1856 int scriptMask = 0; 1857 int res; 1858 int seenDigit = 0; 1859 # define SCRIPT_LATIN 0x0001 1860 # define SCRIPT_CYRILLIC 0x0002 1861 # define SCRIPT_GREEK 0x0004 1862 # define SCRIPT_HEBREW 0x0008 1863 # define SCRIPT_ARABIC 0x0010 1864 1865 while( nIn>0 ){ 1866 c = utf8Read(zIn, nIn, &sz); 1867 zIn += sz; 1868 nIn -= sz; 1869 if( c<0x02af ){ 1870 if( c>=0x80 || midClass[c&0x7f]<CCLASS_DIGIT ){ 1871 scriptMask |= SCRIPT_LATIN; 1872 }else if( c>='0' && c<='9' ){ 1873 seenDigit = 1; 1874 } 1875 }else if( c>=0x0400 && c<=0x04ff ){ 1876 scriptMask |= SCRIPT_CYRILLIC; 1877 }else if( c>=0x0386 && c<=0x03ce ){ 1878 scriptMask |= SCRIPT_GREEK; 1879 }else if( c>=0x0590 && c<=0x05ff ){ 1880 scriptMask |= SCRIPT_HEBREW; 1881 }else if( c>=0x0600 && c<=0x06ff ){ 1882 scriptMask |= SCRIPT_ARABIC; 1883 } 1884 } 1885 if( scriptMask==0 && seenDigit ) scriptMask = SCRIPT_LATIN; 1886 switch( scriptMask ){ 1887 case 0: res = 999; break; 1888 case SCRIPT_LATIN: res = 215; break; 1889 case SCRIPT_CYRILLIC: res = 220; break; 1890 case SCRIPT_GREEK: res = 200; break; 1891 case SCRIPT_HEBREW: res = 125; break; 1892 case SCRIPT_ARABIC: res = 160; break; 1893 default: res = 998; break; 1894 } 1895 sqlite3_result_int(context, res); 1896 } 1897 1898 /* End transliterate 1899 ****************************************************************************** 1900 ****************************************************************************** 1901 ** Begin spellfix1 virtual table. 1902 */ 1903 1904 /* Maximum length of a phonehash used for querying the shadow table */ 1905 #define SPELLFIX_MX_HASH 32 1906 1907 /* Maximum number of hash strings to examine per query */ 1908 #define SPELLFIX_MX_RUN 1 1909 1910 typedef struct spellfix1_vtab spellfix1_vtab; 1911 typedef struct spellfix1_cursor spellfix1_cursor; 1912 1913 /* Fuzzy-search virtual table object */ 1914 struct spellfix1_vtab { 1915 sqlite3_vtab base; /* Base class - must be first */ 1916 sqlite3 *db; /* Database connection */ 1917 char *zDbName; /* Name of database holding this table */ 1918 char *zTableName; /* Name of the virtual table */ 1919 char *zCostTable; /* Table holding edit-distance cost numbers */ 1920 EditDist3Config *pConfig3; /* Parsed edit distance costs */ 1921 }; 1922 1923 /* Fuzzy-search cursor object */ 1924 struct spellfix1_cursor { 1925 sqlite3_vtab_cursor base; /* Base class - must be first */ 1926 spellfix1_vtab *pVTab; /* The table to which this cursor belongs */ 1927 char *zPattern; /* rhs of MATCH clause */ 1928 int idxNum; /* idxNum value passed to xFilter() */ 1929 int nRow; /* Number of rows of content */ 1930 int nAlloc; /* Number of allocated rows */ 1931 int iRow; /* Current row of content */ 1932 int iLang; /* Value of the langid= constraint */ 1933 int iTop; /* Value of the top= constraint */ 1934 int iScope; /* Value of the scope= constraint */ 1935 int nSearch; /* Number of vocabulary items checked */ 1936 sqlite3_stmt *pFullScan; /* Shadow query for a full table scan */ 1937 struct spellfix1_row { /* For each row of content */ 1938 sqlite3_int64 iRowid; /* Rowid for this row */ 1939 char *zWord; /* Text for this row */ 1940 int iRank; /* Rank for this row */ 1941 int iDistance; /* Distance from pattern for this row */ 1942 int iScore; /* Score for sorting */ 1943 int iMatchlen; /* Value of matchlen column (or -1) */ 1944 char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */ 1945 } *a; 1946 }; 1947 1948 /* 1949 ** Construct one or more SQL statements from the format string given 1950 ** and then evaluate those statements. The success code is written 1951 ** into *pRc. 1952 ** 1953 ** If *pRc is initially non-zero then this routine is a no-op. 1954 */ 1955 static void spellfix1DbExec( 1956 int *pRc, /* Success code */ 1957 sqlite3 *db, /* Database in which to run SQL */ 1958 const char *zFormat, /* Format string for SQL */ 1959 ... /* Arguments to the format string */ 1960 ){ 1961 va_list ap; 1962 char *zSql; 1963 if( *pRc ) return; 1964 va_start(ap, zFormat); 1965 zSql = sqlite3_vmprintf(zFormat, ap); 1966 va_end(ap); 1967 if( zSql==0 ){ 1968 *pRc = SQLITE_NOMEM; 1969 }else{ 1970 *pRc = sqlite3_exec(db, zSql, 0, 0, 0); 1971 sqlite3_free(zSql); 1972 } 1973 } 1974 1975 /* 1976 ** xDisconnect/xDestroy method for the fuzzy-search module. 1977 */ 1978 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){ 1979 spellfix1_vtab *p = (spellfix1_vtab*)pVTab; 1980 int rc = SQLITE_OK; 1981 if( isDestroy ){ 1982 sqlite3 *db = p->db; 1983 spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"", 1984 p->zDbName, p->zTableName); 1985 } 1986 if( rc==SQLITE_OK ){ 1987 sqlite3_free(p->zTableName); 1988 editDist3ConfigDelete(p->pConfig3); 1989 sqlite3_free(p->zCostTable); 1990 sqlite3_free(p); 1991 } 1992 return rc; 1993 } 1994 static int spellfix1Disconnect(sqlite3_vtab *pVTab){ 1995 return spellfix1Uninit(0, pVTab); 1996 } 1997 static int spellfix1Destroy(sqlite3_vtab *pVTab){ 1998 return spellfix1Uninit(1, pVTab); 1999 } 2000 2001 /* 2002 ** Make a copy of a string. Remove leading and trailing whitespace 2003 ** and dequote it. 2004 */ 2005 static char *spellfix1Dequote(const char *zIn){ 2006 char *zOut; 2007 int i, j; 2008 char c; 2009 while( isspace((unsigned char)zIn[0]) ) zIn++; 2010 zOut = sqlite3_mprintf("%s", zIn); 2011 if( zOut==0 ) return 0; 2012 i = (int)strlen(zOut); 2013 #if 0 /* The parser will never leave spaces at the end */ 2014 while( i>0 && isspace(zOut[i-1]) ){ i--; } 2015 #endif 2016 zOut[i] = 0; 2017 c = zOut[0]; 2018 if( c=='\'' || c=='"' ){ 2019 for(i=1, j=0; ALWAYS(zOut[i]); i++){ 2020 zOut[j++] = zOut[i]; 2021 if( zOut[i]==c ){ 2022 if( zOut[i+1]==c ){ 2023 i++; 2024 }else{ 2025 zOut[j-1] = 0; 2026 break; 2027 } 2028 } 2029 } 2030 } 2031 return zOut; 2032 } 2033 2034 2035 /* 2036 ** xConnect/xCreate method for the spellfix1 module. Arguments are: 2037 ** 2038 ** argv[0] -> module name ("spellfix1") 2039 ** argv[1] -> database name 2040 ** argv[2] -> table name 2041 ** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter) 2042 */ 2043 static int spellfix1Init( 2044 int isCreate, 2045 sqlite3 *db, 2046 void *pAux, 2047 int argc, const char *const*argv, 2048 sqlite3_vtab **ppVTab, 2049 char **pzErr 2050 ){ 2051 spellfix1_vtab *pNew = 0; 2052 /* const char *zModule = argv[0]; // not used */ 2053 const char *zDbName = argv[1]; 2054 const char *zTableName = argv[2]; 2055 int nDbName; 2056 int rc = SQLITE_OK; 2057 int i; 2058 2059 nDbName = (int)strlen(zDbName); 2060 pNew = sqlite3_malloc64( sizeof(*pNew) + nDbName + 1); 2061 if( pNew==0 ){ 2062 rc = SQLITE_NOMEM; 2063 }else{ 2064 memset(pNew, 0, sizeof(*pNew)); 2065 pNew->zDbName = (char*)&pNew[1]; 2066 memcpy(pNew->zDbName, zDbName, nDbName+1); 2067 pNew->zTableName = sqlite3_mprintf("%s", zTableName); 2068 pNew->db = db; 2069 if( pNew->zTableName==0 ){ 2070 rc = SQLITE_NOMEM; 2071 }else{ 2072 rc = sqlite3_declare_vtab(db, 2073 "CREATE TABLE x(word,rank,distance,langid, " 2074 "score, matchlen, phonehash HIDDEN, " 2075 "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, " 2076 "soundslike HIDDEN, command HIDDEN)" 2077 ); 2078 #define SPELLFIX_COL_WORD 0 2079 #define SPELLFIX_COL_RANK 1 2080 #define SPELLFIX_COL_DISTANCE 2 2081 #define SPELLFIX_COL_LANGID 3 2082 #define SPELLFIX_COL_SCORE 4 2083 #define SPELLFIX_COL_MATCHLEN 5 2084 #define SPELLFIX_COL_PHONEHASH 6 2085 #define SPELLFIX_COL_TOP 7 2086 #define SPELLFIX_COL_SCOPE 8 2087 #define SPELLFIX_COL_SRCHCNT 9 2088 #define SPELLFIX_COL_SOUNDSLIKE 10 2089 #define SPELLFIX_COL_COMMAND 11 2090 } 2091 if( rc==SQLITE_OK && isCreate ){ 2092 spellfix1DbExec(&rc, db, 2093 "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n" 2094 " id INTEGER PRIMARY KEY,\n" 2095 " rank INT,\n" 2096 " langid INT,\n" 2097 " word TEXT,\n" 2098 " k1 TEXT,\n" 2099 " k2 TEXT\n" 2100 ");\n", 2101 zDbName, zTableName 2102 ); 2103 spellfix1DbExec(&rc, db, 2104 "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" " 2105 "ON \"%w_vocab\"(langid,k2);", 2106 zDbName, zTableName, zTableName 2107 ); 2108 } 2109 for(i=3; rc==SQLITE_OK && i<argc; i++){ 2110 if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){ 2111 pNew->zCostTable = spellfix1Dequote(&argv[i][16]); 2112 if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM; 2113 continue; 2114 } 2115 *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]); 2116 rc = SQLITE_ERROR; 2117 } 2118 } 2119 2120 if( rc && pNew ){ 2121 *ppVTab = 0; 2122 spellfix1Uninit(0, &pNew->base); 2123 }else{ 2124 *ppVTab = (sqlite3_vtab *)pNew; 2125 } 2126 return rc; 2127 } 2128 2129 /* 2130 ** The xConnect and xCreate methods 2131 */ 2132 static int spellfix1Connect( 2133 sqlite3 *db, 2134 void *pAux, 2135 int argc, const char *const*argv, 2136 sqlite3_vtab **ppVTab, 2137 char **pzErr 2138 ){ 2139 return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr); 2140 } 2141 static int spellfix1Create( 2142 sqlite3 *db, 2143 void *pAux, 2144 int argc, const char *const*argv, 2145 sqlite3_vtab **ppVTab, 2146 char **pzErr 2147 ){ 2148 return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr); 2149 } 2150 2151 /* 2152 ** Clear all of the content from a cursor. 2153 */ 2154 static void spellfix1ResetCursor(spellfix1_cursor *pCur){ 2155 int i; 2156 for(i=0; i<pCur->nRow; i++){ 2157 sqlite3_free(pCur->a[i].zWord); 2158 } 2159 pCur->nRow = 0; 2160 pCur->iRow = 0; 2161 pCur->nSearch = 0; 2162 if( pCur->pFullScan ){ 2163 sqlite3_finalize(pCur->pFullScan); 2164 pCur->pFullScan = 0; 2165 } 2166 } 2167 2168 /* 2169 ** Resize the cursor to hold up to N rows of content 2170 */ 2171 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){ 2172 struct spellfix1_row *aNew; 2173 assert( N>=pCur->nRow ); 2174 aNew = sqlite3_realloc64(pCur->a, sizeof(pCur->a[0])*N); 2175 if( aNew==0 && N>0 ){ 2176 spellfix1ResetCursor(pCur); 2177 sqlite3_free(pCur->a); 2178 pCur->nAlloc = 0; 2179 pCur->a = 0; 2180 }else{ 2181 pCur->nAlloc = N; 2182 pCur->a = aNew; 2183 } 2184 } 2185 2186 2187 /* 2188 ** Close a fuzzy-search cursor. 2189 */ 2190 static int spellfix1Close(sqlite3_vtab_cursor *cur){ 2191 spellfix1_cursor *pCur = (spellfix1_cursor *)cur; 2192 spellfix1ResetCursor(pCur); 2193 spellfix1ResizeCursor(pCur, 0); 2194 sqlite3_free(pCur->zPattern); 2195 sqlite3_free(pCur); 2196 return SQLITE_OK; 2197 } 2198 2199 #define SPELLFIX_IDXNUM_MATCH 0x01 /* word MATCH $str */ 2200 #define SPELLFIX_IDXNUM_LANGID 0x02 /* langid == $langid */ 2201 #define SPELLFIX_IDXNUM_TOP 0x04 /* top = $top */ 2202 #define SPELLFIX_IDXNUM_SCOPE 0x08 /* scope = $scope */ 2203 #define SPELLFIX_IDXNUM_DISTLT 0x10 /* distance < $distance */ 2204 #define SPELLFIX_IDXNUM_DISTLE 0x20 /* distance <= $distance */ 2205 #define SPELLFIX_IDXNUM_ROWID 0x40 /* rowid = $rowid */ 2206 #define SPELLFIX_IDXNUM_DIST (0x10|0x20) /* DISTLT and DISTLE */ 2207 2208 /* 2209 ** 2210 ** The plan number is a bitmask of the SPELLFIX_IDXNUM_* values defined 2211 ** above. 2212 ** 2213 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid 2214 ** if specified and in that order. 2215 */ 2216 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){ 2217 int iPlan = 0; 2218 int iLangTerm = -1; 2219 int iTopTerm = -1; 2220 int iScopeTerm = -1; 2221 int iDistTerm = -1; 2222 int iRowidTerm = -1; 2223 int i; 2224 const struct sqlite3_index_constraint *pConstraint; 2225 pConstraint = pIdxInfo->aConstraint; 2226 for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){ 2227 if( pConstraint->usable==0 ) continue; 2228 2229 /* Terms of the form: word MATCH $str */ 2230 if( (iPlan & SPELLFIX_IDXNUM_MATCH)==0 2231 && pConstraint->iColumn==SPELLFIX_COL_WORD 2232 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH 2233 ){ 2234 iPlan |= SPELLFIX_IDXNUM_MATCH; 2235 pIdxInfo->aConstraintUsage[i].argvIndex = 1; 2236 pIdxInfo->aConstraintUsage[i].omit = 1; 2237 } 2238 2239 /* Terms of the form: langid = $langid */ 2240 if( (iPlan & SPELLFIX_IDXNUM_LANGID)==0 2241 && pConstraint->iColumn==SPELLFIX_COL_LANGID 2242 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ 2243 ){ 2244 iPlan |= SPELLFIX_IDXNUM_LANGID; 2245 iLangTerm = i; 2246 } 2247 2248 /* Terms of the form: top = $top */ 2249 if( (iPlan & SPELLFIX_IDXNUM_TOP)==0 2250 && pConstraint->iColumn==SPELLFIX_COL_TOP 2251 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ 2252 ){ 2253 iPlan |= SPELLFIX_IDXNUM_TOP; 2254 iTopTerm = i; 2255 } 2256 2257 /* Terms of the form: scope = $scope */ 2258 if( (iPlan & SPELLFIX_IDXNUM_SCOPE)==0 2259 && pConstraint->iColumn==SPELLFIX_COL_SCOPE 2260 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ 2261 ){ 2262 iPlan |= SPELLFIX_IDXNUM_SCOPE; 2263 iScopeTerm = i; 2264 } 2265 2266 /* Terms of the form: distance < $dist or distance <= $dist */ 2267 if( (iPlan & SPELLFIX_IDXNUM_DIST)==0 2268 && pConstraint->iColumn==SPELLFIX_COL_DISTANCE 2269 && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT 2270 || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE) 2271 ){ 2272 if( pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ){ 2273 iPlan |= SPELLFIX_IDXNUM_DISTLT; 2274 }else{ 2275 iPlan |= SPELLFIX_IDXNUM_DISTLE; 2276 } 2277 iDistTerm = i; 2278 } 2279 2280 /* Terms of the form: distance < $dist or distance <= $dist */ 2281 if( (iPlan & SPELLFIX_IDXNUM_ROWID)==0 2282 && pConstraint->iColumn<0 2283 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ 2284 ){ 2285 iPlan |= SPELLFIX_IDXNUM_ROWID; 2286 iRowidTerm = i; 2287 } 2288 } 2289 if( iPlan&SPELLFIX_IDXNUM_MATCH ){ 2290 int idx = 2; 2291 pIdxInfo->idxNum = iPlan; 2292 if( pIdxInfo->nOrderBy==1 2293 && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE 2294 && pIdxInfo->aOrderBy[0].desc==0 2295 ){ 2296 pIdxInfo->orderByConsumed = 1; /* Default order by iScore */ 2297 } 2298 if( iPlan&SPELLFIX_IDXNUM_LANGID ){ 2299 pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++; 2300 pIdxInfo->aConstraintUsage[iLangTerm].omit = 1; 2301 } 2302 if( iPlan&SPELLFIX_IDXNUM_TOP ){ 2303 pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++; 2304 pIdxInfo->aConstraintUsage[iTopTerm].omit = 1; 2305 } 2306 if( iPlan&SPELLFIX_IDXNUM_SCOPE ){ 2307 pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++; 2308 pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1; 2309 } 2310 if( iPlan&SPELLFIX_IDXNUM_DIST ){ 2311 pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++; 2312 pIdxInfo->aConstraintUsage[iDistTerm].omit = 1; 2313 } 2314 pIdxInfo->estimatedCost = 1e5; 2315 }else if( (iPlan & SPELLFIX_IDXNUM_ROWID) ){ 2316 pIdxInfo->idxNum = SPELLFIX_IDXNUM_ROWID; 2317 pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1; 2318 pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1; 2319 pIdxInfo->estimatedCost = 5; 2320 }else{ 2321 pIdxInfo->idxNum = 0; 2322 pIdxInfo->estimatedCost = 1e50; 2323 } 2324 return SQLITE_OK; 2325 } 2326 2327 /* 2328 ** Open a new fuzzy-search cursor. 2329 */ 2330 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ 2331 spellfix1_vtab *p = (spellfix1_vtab*)pVTab; 2332 spellfix1_cursor *pCur; 2333 pCur = sqlite3_malloc64( sizeof(*pCur) ); 2334 if( pCur==0 ) return SQLITE_NOMEM; 2335 memset(pCur, 0, sizeof(*pCur)); 2336 pCur->pVTab = p; 2337 *ppCursor = &pCur->base; 2338 return SQLITE_OK; 2339 } 2340 2341 /* 2342 ** Adjust a distance measurement by the words rank in order to show 2343 ** preference to common words. 2344 */ 2345 static int spellfix1Score(int iDistance, int iRank){ 2346 int iLog2; 2347 for(iLog2=0; iRank>0; iLog2++, iRank>>=1){} 2348 return iDistance + 32 - iLog2; 2349 } 2350 2351 /* 2352 ** Compare two spellfix1_row objects for sorting purposes in qsort() such 2353 ** that they sort in order of increasing distance. 2354 */ 2355 static int SQLITE_CDECL spellfix1RowCompare(const void *A, const void *B){ 2356 const struct spellfix1_row *a = (const struct spellfix1_row*)A; 2357 const struct spellfix1_row *b = (const struct spellfix1_row*)B; 2358 return a->iScore - b->iScore; 2359 } 2360 2361 /* 2362 ** A structure used to pass information from spellfix1FilterForMatch() 2363 ** into spellfix1RunQuery(). 2364 */ 2365 typedef struct MatchQuery { 2366 spellfix1_cursor *pCur; /* The cursor being queried */ 2367 sqlite3_stmt *pStmt; /* shadow table query statment */ 2368 char zHash[SPELLFIX_MX_HASH]; /* The current phonehash for zPattern */ 2369 const char *zPattern; /* Transliterated input string */ 2370 int nPattern; /* Length of zPattern */ 2371 EditDist3FromString *pMatchStr3; /* Original unicode string */ 2372 EditDist3Config *pConfig3; /* Edit-distance cost coefficients */ 2373 const EditDist3Lang *pLang; /* The selected language coefficients */ 2374 int iLang; /* The language id */ 2375 int iScope; /* Default scope */ 2376 int iMaxDist; /* Maximum allowed edit distance, or -1 */ 2377 int rc; /* Error code */ 2378 int nRun; /* Number of prior runs for the same zPattern */ 2379 char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH]; /* Prior hashes */ 2380 } MatchQuery; 2381 2382 /* 2383 ** Run a query looking for the best matches against zPattern using 2384 ** zHash as the character class seed hash. 2385 */ 2386 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){ 2387 const char *zK1; 2388 const char *zWord; 2389 int iDist; 2390 int iRank; 2391 int iScore; 2392 int iWorst = 0; 2393 int idx; 2394 int idxWorst = -1; 2395 int i; 2396 int iScope = p->iScope; 2397 spellfix1_cursor *pCur = p->pCur; 2398 sqlite3_stmt *pStmt = p->pStmt; 2399 char zHash1[SPELLFIX_MX_HASH]; 2400 char zHash2[SPELLFIX_MX_HASH]; 2401 char *zClass; 2402 int nClass; 2403 int rc; 2404 2405 if( pCur->a==0 || p->rc ) return; /* Prior memory allocation failure */ 2406 zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery); 2407 if( zClass==0 ){ 2408 p->rc = SQLITE_NOMEM; 2409 return; 2410 } 2411 nClass = (int)strlen(zClass); 2412 if( nClass>SPELLFIX_MX_HASH-2 ){ 2413 nClass = SPELLFIX_MX_HASH-2; 2414 zClass[nClass] = 0; 2415 } 2416 if( nClass<=iScope ){ 2417 if( nClass>2 ){ 2418 iScope = nClass-1; 2419 }else{ 2420 iScope = nClass; 2421 } 2422 } 2423 memcpy(zHash1, zClass, iScope); 2424 sqlite3_free(zClass); 2425 zHash1[iScope] = 0; 2426 memcpy(zHash2, zHash1, iScope); 2427 zHash2[iScope] = 'Z'; 2428 zHash2[iScope+1] = 0; 2429 #if SPELLFIX_MX_RUN>1 2430 for(i=0; i<p->nRun; i++){ 2431 if( strcmp(p->azPrior[i], zHash1)==0 ) return; 2432 } 2433 #endif 2434 assert( p->nRun<SPELLFIX_MX_RUN ); 2435 memcpy(p->azPrior[p->nRun++], zHash1, iScope+1); 2436 if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM 2437 || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM 2438 ){ 2439 p->rc = SQLITE_NOMEM; 2440 return; 2441 } 2442 #if SPELLFIX_MX_RUN>1 2443 for(i=0; i<pCur->nRow; i++){ 2444 if( pCur->a[i].iScore>iWorst ){ 2445 iWorst = pCur->a[i].iScore; 2446 idxWorst = i; 2447 } 2448 } 2449 #endif 2450 while( sqlite3_step(pStmt)==SQLITE_ROW ){ 2451 int iMatchlen = -1; 2452 iRank = sqlite3_column_int(pStmt, 2); 2453 if( p->pMatchStr3 ){ 2454 int nWord = sqlite3_column_bytes(pStmt, 1); 2455 zWord = (const char*)sqlite3_column_text(pStmt, 1); 2456 iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen); 2457 }else{ 2458 zK1 = (const char*)sqlite3_column_text(pStmt, 3); 2459 if( zK1==0 ) continue; 2460 iDist = editdist1(p->zPattern, zK1, 0); 2461 } 2462 if( iDist<0 ){ 2463 p->rc = SQLITE_NOMEM; 2464 break; 2465 } 2466 pCur->nSearch++; 2467 2468 /* If there is a "distance < $dist" or "distance <= $dist" constraint, 2469 ** check if this row meets it. If not, jump back up to the top of the 2470 ** loop to process the next row. Otherwise, if the row does match the 2471 ** distance constraint, check if the pCur->a[] array is already full. 2472 ** If it is and no explicit "top = ?" constraint was present in the 2473 ** query, grow the array to ensure there is room for the new entry. */ 2474 assert( (p->iMaxDist>=0)==((pCur->idxNum & SPELLFIX_IDXNUM_DIST) ? 1 : 0) ); 2475 if( p->iMaxDist>=0 ){ 2476 if( iDist>p->iMaxDist ) continue; 2477 if( pCur->nRow>=pCur->nAlloc && (pCur->idxNum & SPELLFIX_IDXNUM_TOP)==0 ){ 2478 spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10); 2479 if( pCur->a==0 ) break; 2480 } 2481 } 2482 2483 iScore = spellfix1Score(iDist,iRank); 2484 if( pCur->nRow<pCur->nAlloc ){ 2485 idx = pCur->nRow; 2486 }else if( iScore<iWorst ){ 2487 idx = idxWorst; 2488 sqlite3_free(pCur->a[idx].zWord); 2489 }else{ 2490 continue; 2491 } 2492 2493 pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1)); 2494 if( pCur->a[idx].zWord==0 ){ 2495 p->rc = SQLITE_NOMEM; 2496 break; 2497 } 2498 pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0); 2499 pCur->a[idx].iRank = iRank; 2500 pCur->a[idx].iDistance = iDist; 2501 pCur->a[idx].iScore = iScore; 2502 pCur->a[idx].iMatchlen = iMatchlen; 2503 memcpy(pCur->a[idx].zHash, zHash1, iScope+1); 2504 if( pCur->nRow<pCur->nAlloc ) pCur->nRow++; 2505 if( pCur->nRow==pCur->nAlloc ){ 2506 iWorst = pCur->a[0].iScore; 2507 idxWorst = 0; 2508 for(i=1; i<pCur->nRow; i++){ 2509 iScore = pCur->a[i].iScore; 2510 if( iWorst<iScore ){ 2511 iWorst = iScore; 2512 idxWorst = i; 2513 } 2514 } 2515 } 2516 } 2517 rc = sqlite3_reset(pStmt); 2518 if( rc ) p->rc = rc; 2519 } 2520 2521 /* 2522 ** This version of the xFilter method work if the MATCH term is present 2523 ** and we are doing a scan. 2524 */ 2525 static int spellfix1FilterForMatch( 2526 spellfix1_cursor *pCur, 2527 int argc, 2528 sqlite3_value **argv 2529 ){ 2530 int idxNum = pCur->idxNum; 2531 const unsigned char *zMatchThis; /* RHS of the MATCH operator */ 2532 EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */ 2533 char *zPattern; /* Transliteration of zMatchThis */ 2534 int nPattern; /* Length of zPattern */ 2535 int iLimit = 20; /* Max number of rows of output */ 2536 int iScope = 3; /* Use this many characters of zClass */ 2537 int iLang = 0; /* Language code */ 2538 char *zSql; /* SQL of shadow table query */ 2539 sqlite3_stmt *pStmt = 0; /* Shadow table query */ 2540 int rc; /* Result code */ 2541 int idx = 1; /* Next available filter parameter */ 2542 spellfix1_vtab *p = pCur->pVTab; /* The virtual table that owns pCur */ 2543 MatchQuery x; /* For passing info to RunQuery() */ 2544 2545 /* Load the cost table if we have not already done so */ 2546 if( p->zCostTable!=0 && p->pConfig3==0 ){ 2547 p->pConfig3 = sqlite3_malloc64( sizeof(p->pConfig3[0]) ); 2548 if( p->pConfig3==0 ) return SQLITE_NOMEM; 2549 memset(p->pConfig3, 0, sizeof(p->pConfig3[0])); 2550 rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable); 2551 if( rc ) return rc; 2552 } 2553 memset(&x, 0, sizeof(x)); 2554 x.iScope = 3; /* Default scope if none specified by "WHERE scope=N" */ 2555 x.iMaxDist = -1; /* Maximum allowed edit distance */ 2556 2557 if( idxNum&2 ){ 2558 iLang = sqlite3_value_int(argv[idx++]); 2559 } 2560 if( idxNum&4 ){ 2561 iLimit = sqlite3_value_int(argv[idx++]); 2562 if( iLimit<1 ) iLimit = 1; 2563 } 2564 if( idxNum&8 ){ 2565 x.iScope = sqlite3_value_int(argv[idx++]); 2566 if( x.iScope<1 ) x.iScope = 1; 2567 if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2; 2568 } 2569 if( idxNum&(16|32) ){ 2570 x.iMaxDist = sqlite3_value_int(argv[idx++]); 2571 if( idxNum&16 ) x.iMaxDist--; 2572 if( x.iMaxDist<0 ) x.iMaxDist = 0; 2573 } 2574 spellfix1ResetCursor(pCur); 2575 spellfix1ResizeCursor(pCur, iLimit); 2576 zMatchThis = sqlite3_value_text(argv[0]); 2577 if( zMatchThis==0 ) return SQLITE_OK; 2578 if( p->pConfig3 ){ 2579 x.pLang = editDist3FindLang(p->pConfig3, iLang); 2580 pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1); 2581 if( pMatchStr3==0 ){ 2582 x.rc = SQLITE_NOMEM; 2583 goto filter_exit; 2584 } 2585 }else{ 2586 x.pLang = 0; 2587 } 2588 zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0])); 2589 sqlite3_free(pCur->zPattern); 2590 pCur->zPattern = zPattern; 2591 if( zPattern==0 ){ 2592 x.rc = SQLITE_NOMEM; 2593 goto filter_exit; 2594 } 2595 nPattern = (int)strlen(zPattern); 2596 if( zPattern[nPattern-1]=='*' ) nPattern--; 2597 zSql = sqlite3_mprintf( 2598 "SELECT id, word, rank, coalesce(k1,word)" 2599 " FROM \"%w\".\"%w_vocab\"" 2600 " WHERE langid=%d AND k2>=?1 AND k2<?2", 2601 p->zDbName, p->zTableName, iLang 2602 ); 2603 if( zSql==0 ){ 2604 x.rc = SQLITE_NOMEM; 2605 pStmt = 0; 2606 goto filter_exit; 2607 } 2608 rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0); 2609 sqlite3_free(zSql); 2610 pCur->iLang = iLang; 2611 x.pCur = pCur; 2612 x.pStmt = pStmt; 2613 x.zPattern = zPattern; 2614 x.nPattern = nPattern; 2615 x.pMatchStr3 = pMatchStr3; 2616 x.iLang = iLang; 2617 x.rc = rc; 2618 x.pConfig3 = p->pConfig3; 2619 if( x.rc==SQLITE_OK ){ 2620 spellfix1RunQuery(&x, zPattern, nPattern); 2621 } 2622 2623 if( pCur->a ){ 2624 qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare); 2625 pCur->iTop = iLimit; 2626 pCur->iScope = iScope; 2627 }else{ 2628 x.rc = SQLITE_NOMEM; 2629 } 2630 2631 filter_exit: 2632 sqlite3_finalize(pStmt); 2633 editDist3FromStringDelete(pMatchStr3); 2634 return x.rc; 2635 } 2636 2637 /* 2638 ** This version of xFilter handles a full-table scan case 2639 */ 2640 static int spellfix1FilterForFullScan( 2641 spellfix1_cursor *pCur, 2642 int argc, 2643 sqlite3_value **argv 2644 ){ 2645 int rc = SQLITE_OK; 2646 int idxNum = pCur->idxNum; 2647 char *zSql; 2648 spellfix1_vtab *pVTab = pCur->pVTab; 2649 spellfix1ResetCursor(pCur); 2650 assert( idxNum==0 || idxNum==64 ); 2651 zSql = sqlite3_mprintf( 2652 "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s", 2653 pVTab->zDbName, pVTab->zTableName, 2654 ((idxNum & 64) ? " WHERE rowid=?" : "") 2655 ); 2656 if( zSql==0 ) return SQLITE_NOMEM; 2657 rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0); 2658 sqlite3_free(zSql); 2659 if( rc==SQLITE_OK && (idxNum & 64) ){ 2660 assert( argc==1 ); 2661 rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]); 2662 } 2663 pCur->nRow = pCur->iRow = 0; 2664 if( rc==SQLITE_OK ){ 2665 rc = sqlite3_step(pCur->pFullScan); 2666 if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; } 2667 if( rc==SQLITE_DONE ){ rc = SQLITE_OK; } 2668 }else{ 2669 pCur->iRow = 0; 2670 } 2671 return rc; 2672 } 2673 2674 2675 /* 2676 ** Called to "rewind" a cursor back to the beginning so that 2677 ** it starts its output over again. Always called at least once 2678 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call. 2679 */ 2680 static int spellfix1Filter( 2681 sqlite3_vtab_cursor *cur, 2682 int idxNum, const char *idxStr, 2683 int argc, sqlite3_value **argv 2684 ){ 2685 spellfix1_cursor *pCur = (spellfix1_cursor *)cur; 2686 int rc; 2687 pCur->idxNum = idxNum; 2688 if( idxNum & 1 ){ 2689 rc = spellfix1FilterForMatch(pCur, argc, argv); 2690 }else{ 2691 rc = spellfix1FilterForFullScan(pCur, argc, argv); 2692 } 2693 return rc; 2694 } 2695 2696 2697 /* 2698 ** Advance a cursor to its next row of output 2699 */ 2700 static int spellfix1Next(sqlite3_vtab_cursor *cur){ 2701 spellfix1_cursor *pCur = (spellfix1_cursor *)cur; 2702 int rc = SQLITE_OK; 2703 if( pCur->iRow < pCur->nRow ){ 2704 if( pCur->pFullScan ){ 2705 rc = sqlite3_step(pCur->pFullScan); 2706 if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow; 2707 if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK; 2708 }else{ 2709 pCur->iRow++; 2710 } 2711 } 2712 return rc; 2713 } 2714 2715 /* 2716 ** Return TRUE if we are at the end-of-file 2717 */ 2718 static int spellfix1Eof(sqlite3_vtab_cursor *cur){ 2719 spellfix1_cursor *pCur = (spellfix1_cursor *)cur; 2720 return pCur->iRow>=pCur->nRow; 2721 } 2722 2723 /* 2724 ** Return columns from the current row. 2725 */ 2726 static int spellfix1Column( 2727 sqlite3_vtab_cursor *cur, 2728 sqlite3_context *ctx, 2729 int i 2730 ){ 2731 spellfix1_cursor *pCur = (spellfix1_cursor*)cur; 2732 if( pCur->pFullScan ){ 2733 if( i<=SPELLFIX_COL_LANGID ){ 2734 sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i)); 2735 }else{ 2736 sqlite3_result_null(ctx); 2737 } 2738 return SQLITE_OK; 2739 } 2740 switch( i ){ 2741 case SPELLFIX_COL_WORD: { 2742 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC); 2743 break; 2744 } 2745 case SPELLFIX_COL_RANK: { 2746 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank); 2747 break; 2748 } 2749 case SPELLFIX_COL_DISTANCE: { 2750 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance); 2751 break; 2752 } 2753 case SPELLFIX_COL_LANGID: { 2754 sqlite3_result_int(ctx, pCur->iLang); 2755 break; 2756 } 2757 case SPELLFIX_COL_SCORE: { 2758 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore); 2759 break; 2760 } 2761 case SPELLFIX_COL_MATCHLEN: { 2762 int iMatchlen = pCur->a[pCur->iRow].iMatchlen; 2763 if( iMatchlen<0 ){ 2764 int nPattern = (int)strlen(pCur->zPattern); 2765 char *zWord = pCur->a[pCur->iRow].zWord; 2766 int nWord = (int)strlen(zWord); 2767 2768 if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){ 2769 char *zTranslit; 2770 int res; 2771 zTranslit = (char *)transliterate((unsigned char *)zWord, nWord); 2772 if( !zTranslit ) return SQLITE_NOMEM; 2773 res = editdist1(pCur->zPattern, zTranslit, &iMatchlen); 2774 sqlite3_free(zTranslit); 2775 if( res<0 ) return SQLITE_NOMEM; 2776 iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen); 2777 }else{ 2778 iMatchlen = utf8Charlen(zWord, nWord); 2779 } 2780 } 2781 2782 sqlite3_result_int(ctx, iMatchlen); 2783 break; 2784 } 2785 case SPELLFIX_COL_PHONEHASH: { 2786 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC); 2787 break; 2788 } 2789 case SPELLFIX_COL_TOP: { 2790 sqlite3_result_int(ctx, pCur->iTop); 2791 break; 2792 } 2793 case SPELLFIX_COL_SCOPE: { 2794 sqlite3_result_int(ctx, pCur->iScope); 2795 break; 2796 } 2797 case SPELLFIX_COL_SRCHCNT: { 2798 sqlite3_result_int(ctx, pCur->nSearch); 2799 break; 2800 } 2801 default: { 2802 sqlite3_result_null(ctx); 2803 break; 2804 } 2805 } 2806 return SQLITE_OK; 2807 } 2808 2809 /* 2810 ** The rowid. 2811 */ 2812 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){ 2813 spellfix1_cursor *pCur = (spellfix1_cursor*)cur; 2814 if( pCur->pFullScan ){ 2815 *pRowid = sqlite3_column_int64(pCur->pFullScan, 4); 2816 }else{ 2817 *pRowid = pCur->a[pCur->iRow].iRowid; 2818 } 2819 return SQLITE_OK; 2820 } 2821 2822 /* 2823 ** This function is called by the xUpdate() method. It returns a string 2824 ** containing the conflict mode that xUpdate() should use for the current 2825 ** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE". 2826 */ 2827 static const char *spellfix1GetConflict(sqlite3 *db){ 2828 static const char *azConflict[] = { 2829 /* Note: Instead of "FAIL" - "ABORT". */ 2830 "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE" 2831 }; 2832 int eConflict = sqlite3_vtab_on_conflict(db); 2833 2834 assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE 2835 || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT 2836 || eConflict==SQLITE_REPLACE 2837 ); 2838 assert( SQLITE_ROLLBACK==1 ); 2839 assert( SQLITE_IGNORE==2 ); 2840 assert( SQLITE_FAIL==3 ); 2841 assert( SQLITE_ABORT==4 ); 2842 assert( SQLITE_REPLACE==5 ); 2843 2844 return azConflict[eConflict-1]; 2845 } 2846 2847 /* 2848 ** The xUpdate() method. 2849 */ 2850 static int spellfix1Update( 2851 sqlite3_vtab *pVTab, 2852 int argc, 2853 sqlite3_value **argv, 2854 sqlite_int64 *pRowid 2855 ){ 2856 int rc = SQLITE_OK; 2857 sqlite3_int64 rowid, newRowid; 2858 spellfix1_vtab *p = (spellfix1_vtab*)pVTab; 2859 sqlite3 *db = p->db; 2860 2861 if( argc==1 ){ 2862 /* A delete operation on the rowid given by argv[0] */ 2863 rowid = *pRowid = sqlite3_value_int64(argv[0]); 2864 spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" " 2865 " WHERE id=%lld", 2866 p->zDbName, p->zTableName, rowid); 2867 }else{ 2868 const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]); 2869 int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]); 2870 int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]); 2871 int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]); 2872 const unsigned char *zSoundslike = 2873 sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]); 2874 int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]); 2875 char *zK1, *zK2; 2876 int i; 2877 char c; 2878 const char *zConflict = spellfix1GetConflict(db); 2879 2880 if( zWord==0 ){ 2881 /* Inserts of the form: INSERT INTO table(command) VALUES('xyzzy'); 2882 ** cause zWord to be NULL, so we look at the "command" column to see 2883 ** what special actions to take */ 2884 const char *zCmd = 2885 (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]); 2886 if( zCmd==0 ){ 2887 pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word", 2888 p->zTableName); 2889 return SQLITE_CONSTRAINT_NOTNULL; 2890 } 2891 if( strcmp(zCmd,"reset")==0 ){ 2892 /* Reset the edit cost table (if there is one). */ 2893 editDist3ConfigDelete(p->pConfig3); 2894 p->pConfig3 = 0; 2895 return SQLITE_OK; 2896 } 2897 if( strncmp(zCmd,"edit_cost_table=",16)==0 ){ 2898 editDist3ConfigDelete(p->pConfig3); 2899 p->pConfig3 = 0; 2900 sqlite3_free(p->zCostTable); 2901 p->zCostTable = spellfix1Dequote(zCmd+16); 2902 if( p->zCostTable==0 ) return SQLITE_NOMEM; 2903 if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){ 2904 sqlite3_free(p->zCostTable); 2905 p->zCostTable = 0; 2906 } 2907 return SQLITE_OK; 2908 } 2909 pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"", 2910 p->zTableName, zCmd); 2911 return SQLITE_ERROR; 2912 } 2913 if( iRank<1 ) iRank = 1; 2914 if( zSoundslike ){ 2915 zK1 = (char*)transliterate(zSoundslike, nSoundslike); 2916 }else{ 2917 zK1 = (char*)transliterate(zWord, nWord); 2918 } 2919 if( zK1==0 ) return SQLITE_NOMEM; 2920 for(i=0; (c = zK1[i])!=0; i++){ 2921 if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A'; 2922 } 2923 zK2 = (char*)phoneticHash((const unsigned char*)zK1, i); 2924 if( zK2==0 ){ 2925 sqlite3_free(zK1); 2926 return SQLITE_NOMEM; 2927 } 2928 if( sqlite3_value_type(argv[0])==SQLITE_NULL ){ 2929 if( sqlite3_value_type(argv[1])==SQLITE_NULL ){ 2930 spellfix1DbExec(&rc, db, 2931 "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) " 2932 "VALUES(%d,%d,%Q,nullif(%Q,%Q),%Q)", 2933 p->zDbName, p->zTableName, 2934 iRank, iLang, zWord, zK1, zWord, zK2 2935 ); 2936 }else{ 2937 newRowid = sqlite3_value_int64(argv[1]); 2938 spellfix1DbExec(&rc, db, 2939 "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) " 2940 "VALUES(%lld,%d,%d,%Q,nullif(%Q,%Q),%Q)", 2941 zConflict, p->zDbName, p->zTableName, 2942 newRowid, iRank, iLang, zWord, zK1, zWord, zK2 2943 ); 2944 } 2945 *pRowid = sqlite3_last_insert_rowid(db); 2946 }else{ 2947 rowid = sqlite3_value_int64(argv[0]); 2948 newRowid = *pRowid = sqlite3_value_int64(argv[1]); 2949 spellfix1DbExec(&rc, db, 2950 "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d," 2951 " word=%Q, k1=nullif(%Q,%Q), k2=%Q WHERE id=%lld", 2952 zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang, 2953 zWord, zK1, zWord, zK2, rowid 2954 ); 2955 } 2956 sqlite3_free(zK1); 2957 sqlite3_free(zK2); 2958 } 2959 return rc; 2960 } 2961 2962 /* 2963 ** Rename the spellfix1 table. 2964 */ 2965 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){ 2966 spellfix1_vtab *p = (spellfix1_vtab*)pVTab; 2967 sqlite3 *db = p->db; 2968 int rc = SQLITE_OK; 2969 char *zNewName = sqlite3_mprintf("%s", zNew); 2970 if( zNewName==0 ){ 2971 return SQLITE_NOMEM; 2972 } 2973 spellfix1DbExec(&rc, db, 2974 "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"", 2975 p->zDbName, p->zTableName, zNewName 2976 ); 2977 if( rc==SQLITE_OK ){ 2978 sqlite3_free(p->zTableName); 2979 p->zTableName = zNewName; 2980 }else{ 2981 sqlite3_free(zNewName); 2982 } 2983 return rc; 2984 } 2985 2986 2987 /* 2988 ** A virtual table module that provides fuzzy search. 2989 */ 2990 static sqlite3_module spellfix1Module = { 2991 0, /* iVersion */ 2992 spellfix1Create, /* xCreate - handle CREATE VIRTUAL TABLE */ 2993 spellfix1Connect, /* xConnect - reconnected to an existing table */ 2994 spellfix1BestIndex, /* xBestIndex - figure out how to do a query */ 2995 spellfix1Disconnect, /* xDisconnect - close a connection */ 2996 spellfix1Destroy, /* xDestroy - handle DROP TABLE */ 2997 spellfix1Open, /* xOpen - open a cursor */ 2998 spellfix1Close, /* xClose - close a cursor */ 2999 spellfix1Filter, /* xFilter - configure scan constraints */ 3000 spellfix1Next, /* xNext - advance a cursor */ 3001 spellfix1Eof, /* xEof - check for end of scan */ 3002 spellfix1Column, /* xColumn - read data */ 3003 spellfix1Rowid, /* xRowid - read data */ 3004 spellfix1Update, /* xUpdate */ 3005 0, /* xBegin */ 3006 0, /* xSync */ 3007 0, /* xCommit */ 3008 0, /* xRollback */ 3009 0, /* xFindMethod */ 3010 spellfix1Rename, /* xRename */ 3011 }; 3012 3013 /* 3014 ** Register the various functions and the virtual table. 3015 */ 3016 static int spellfix1Register(sqlite3 *db){ 3017 int rc = SQLITE_OK; 3018 int i; 3019 rc = sqlite3_create_function(db, "spellfix1_translit", 1, 3020 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, 3021 transliterateSqlFunc, 0, 0); 3022 if( rc==SQLITE_OK ){ 3023 rc = sqlite3_create_function(db, "spellfix1_editdist", 2, 3024 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, 3025 editdistSqlFunc, 0, 0); 3026 } 3027 if( rc==SQLITE_OK ){ 3028 rc = sqlite3_create_function(db, "spellfix1_phonehash", 1, 3029 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, 3030 phoneticHashSqlFunc, 0, 0); 3031 } 3032 if( rc==SQLITE_OK ){ 3033 rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1, 3034 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, 3035 scriptCodeSqlFunc, 0, 0); 3036 } 3037 if( rc==SQLITE_OK ){ 3038 rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0); 3039 } 3040 if( rc==SQLITE_OK ){ 3041 rc = editDist3Install(db); 3042 } 3043 3044 /* Verify sanity of the translit[] table */ 3045 for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){ 3046 assert( translit[i].cFrom<translit[i+1].cFrom ); 3047 } 3048 3049 return rc; 3050 } 3051 3052 #endif /* SQLITE_OMIT_VIRTUALTABLE */ 3053 3054 /* 3055 ** Extension load function. 3056 */ 3057 #ifdef _WIN32 3058 __declspec(dllexport) 3059 #endif 3060 int sqlite3_spellfix_init( 3061 sqlite3 *db, 3062 char **pzErrMsg, 3063 const sqlite3_api_routines *pApi 3064 ){ 3065 SQLITE_EXTENSION_INIT2(pApi); 3066 #ifndef SQLITE_OMIT_VIRTUALTABLE 3067 return spellfix1Register(db); 3068 #endif 3069 return SQLITE_OK; 3070 } 3071