1 /* 2 ** 2012 April 10 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** 13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used 14 ** to search a large vocabulary for close matches. See separate 15 ** documentation (http://www.sqlite.org/spellfix1.html) for details. 16 */ 17 #include "sqlite3ext.h" 18 SQLITE_EXTENSION_INIT1 19 20 #ifndef SQLITE_AMALGAMATION 21 # include <string.h> 22 # include <stdio.h> 23 # include <stdlib.h> 24 # include <assert.h> 25 # define ALWAYS(X) 1 26 # define NEVER(X) 0 27 typedef unsigned char u8; 28 typedef unsigned short u16; 29 #endif 30 #include <ctype.h> 31 32 #ifndef SQLITE_OMIT_VIRTUALTABLE 33 34 /* 35 ** Character classes for ASCII characters: 36 ** 37 ** 0 '' Silent letters: H W 38 ** 1 'A' Any vowel: A E I O U (Y) 39 ** 2 'B' A bilabeal stop or fricative: B F P V W 40 ** 3 'C' Other fricatives or back stops: C G J K Q S X Z 41 ** 4 'D' Alveolar stops: D T 42 ** 5 'H' Letter H at the beginning of a word 43 ** 6 'L' Glide: L 44 ** 7 'R' Semivowel: R 45 ** 8 'M' Nasals: M N 46 ** 9 'Y' Letter Y at the beginning of a word. 47 ** 10 '9' Digits: 0 1 2 3 4 5 6 7 8 9 48 ** 11 ' ' White space 49 ** 12 '?' Other. 50 */ 51 #define CCLASS_SILENT 0 52 #define CCLASS_VOWEL 1 53 #define CCLASS_B 2 54 #define CCLASS_C 3 55 #define CCLASS_D 4 56 #define CCLASS_H 5 57 #define CCLASS_L 6 58 #define CCLASS_R 7 59 #define CCLASS_M 8 60 #define CCLASS_Y 9 61 #define CCLASS_DIGIT 10 62 #define CCLASS_SPACE 11 63 #define CCLASS_OTHER 12 64 65 /* 66 ** The following table gives the character class for non-initial ASCII 67 ** characters. 68 */ 69 static const unsigned char midClass[] = { 70 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 71 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 72 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 73 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 74 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, 75 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 76 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 77 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 78 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 79 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 80 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE, 81 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER, 82 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER, 83 /* ' */ CCLASS_SILENT, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER, 84 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER, 85 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER, 86 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT, 87 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT, 88 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT, 89 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER, 90 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER, 91 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL, 92 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D, 93 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C, 94 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C, 95 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M, 96 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B, 97 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C, 98 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B, 99 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_VOWEL, 100 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER, 101 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER, 102 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B, 103 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL, 104 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT, 105 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C, 106 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M, 107 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C, 108 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D, 109 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B, 110 /* x */ CCLASS_C, /* y */ CCLASS_VOWEL, /* z */ CCLASS_C, 111 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER, 112 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER, 113 }; 114 /* 115 ** This tables gives the character class for ASCII characters that form the 116 ** initial character of a word. The only difference from midClass is with 117 ** the letters H, W, and Y. 118 */ 119 static const unsigned char initClass[] = { 120 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 121 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 122 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 123 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 124 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, 125 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 126 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 127 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 128 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 129 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, 130 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE, 131 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER, 132 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER, 133 /* ' */ CCLASS_OTHER, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER, 134 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER, 135 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER, 136 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT, 137 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT, 138 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT, 139 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER, 140 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER, 141 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL, 142 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D, 143 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C, 144 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C, 145 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M, 146 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B, 147 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C, 148 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B, 149 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_Y, 150 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER, 151 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER, 152 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B, 153 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL, 154 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT, 155 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C, 156 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M, 157 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C, 158 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D, 159 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B, 160 /* x */ CCLASS_C, /* y */ CCLASS_Y, /* z */ CCLASS_C, 161 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER, 162 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER, 163 }; 164 165 /* 166 ** Mapping from the character class number (0-13) to a symbol for each 167 ** character class. Note that initClass[] can be used to map the class 168 ** symbol back into the class number. 169 */ 170 static const unsigned char className[] = ".ABCDHLRMY9 ?"; 171 172 /* 173 ** Generate a "phonetic hash" from a string of ASCII characters 174 ** in zIn[0..nIn-1]. 175 ** 176 ** * Map characters by character class as defined above. 177 ** * Omit double-letters 178 ** * Omit vowels beside R and L 179 ** * Omit T when followed by CH 180 ** * Omit W when followed by R 181 ** * Omit D when followed by J or G 182 ** * Omit K in KN or G in GN at the beginning of a word 183 ** 184 ** Space to hold the result is obtained from sqlite3_malloc() 185 ** 186 ** Return NULL if memory allocation fails. 187 */ 188 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){ 189 unsigned char *zOut = sqlite3_malloc( nIn + 1 ); 190 int i; 191 int nOut = 0; 192 char cPrev = 0x77; 193 char cPrevX = 0x77; 194 const unsigned char *aClass = initClass; 195 196 if( zOut==0 ) return 0; 197 if( nIn>2 ){ 198 switch( zIn[0] ){ 199 case 'g': 200 case 'k': { 201 if( zIn[1]=='n' ){ zIn++; nIn--; } 202 break; 203 } 204 } 205 } 206 for(i=0; i<nIn; i++){ 207 unsigned char c = zIn[i]; 208 if( i+1<nIn ){ 209 if( c=='w' && zIn[i+1]=='r' ) continue; 210 if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue; 211 if( i+2<nIn ){ 212 if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue; 213 } 214 } 215 c = aClass[c&0x7f]; 216 if( c==CCLASS_SPACE ) continue; 217 if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue; 218 aClass = midClass; 219 if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){ 220 continue; /* No vowels beside L or R */ 221 } 222 if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){ 223 nOut--; /* No vowels beside L or R */ 224 } 225 cPrev = c; 226 if( c==CCLASS_SILENT ) continue; 227 cPrevX = c; 228 c = className[c]; 229 assert( nOut>=0 ); 230 if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c; 231 } 232 zOut[nOut] = 0; 233 return zOut; 234 } 235 236 /* 237 ** This is an SQL function wrapper around phoneticHash(). See 238 ** the description of phoneticHash() for additional information. 239 */ 240 static void phoneticHashSqlFunc( 241 sqlite3_context *context, 242 int argc, 243 sqlite3_value **argv 244 ){ 245 const unsigned char *zIn; 246 unsigned char *zOut; 247 248 zIn = sqlite3_value_text(argv[0]); 249 if( zIn==0 ) return; 250 zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0])); 251 if( zOut==0 ){ 252 sqlite3_result_error_nomem(context); 253 }else{ 254 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free); 255 } 256 } 257 258 /* 259 ** Return the character class number for a character given its 260 ** context. 261 */ 262 static char characterClass(char cPrev, char c){ 263 return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f]; 264 } 265 266 /* 267 ** Return the cost of inserting or deleting character c immediately 268 ** following character cPrev. If cPrev==0, that means c is the first 269 ** character of the word. 270 */ 271 static int insertOrDeleteCost(char cPrev, char c, char cNext){ 272 char classC = characterClass(cPrev, c); 273 char classCprev; 274 275 if( classC==CCLASS_SILENT ){ 276 /* Insert or delete "silent" characters such as H or W */ 277 return 1; 278 } 279 if( cPrev==c ){ 280 /* Repeated characters, or miss a repeat */ 281 return 10; 282 } 283 if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){ 284 return 20; /* Insert a vowel before or after 'r' */ 285 } 286 classCprev = characterClass(cPrev, cPrev); 287 if( classC==classCprev ){ 288 if( classC==CCLASS_VOWEL ){ 289 /* Remove or add a new vowel to a vowel cluster */ 290 return 15; 291 }else{ 292 /* Remove or add a consonant not in the same class */ 293 return 50; 294 } 295 } 296 297 /* any other character insertion or deletion */ 298 return 100; 299 } 300 301 /* 302 ** Divide the insertion cost by this factor when appending to the 303 ** end of the word. 304 */ 305 #define FINAL_INS_COST_DIV 4 306 307 /* 308 ** Return the cost of substituting cTo in place of cFrom assuming 309 ** the previous character is cPrev. If cPrev==0 then cTo is the first 310 ** character of the word. 311 */ 312 static int substituteCost(char cPrev, char cFrom, char cTo){ 313 char classFrom, classTo; 314 if( cFrom==cTo ){ 315 /* Exact match */ 316 return 0; 317 } 318 if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){ 319 /* differ only in case */ 320 return 0; 321 } 322 classFrom = characterClass(cPrev, cFrom); 323 classTo = characterClass(cPrev, cTo); 324 if( classFrom==classTo ){ 325 /* Same character class */ 326 return 40; 327 } 328 if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y 329 && classTo>=CCLASS_B && classTo<=CCLASS_Y ){ 330 /* Convert from one consonant to another, but in a different class */ 331 return 75; 332 } 333 /* Any other subsitution */ 334 return 100; 335 } 336 337 /* 338 ** Given two strings zA and zB which are pure ASCII, return the cost 339 ** of transforming zA into zB. If zA ends with '*' assume that it is 340 ** a prefix of zB and give only minimal penalty for extra characters 341 ** on the end of zB. 342 ** 343 ** Smaller numbers mean a closer match. 344 ** 345 ** Negative values indicate an error: 346 ** -1 One of the inputs is NULL 347 ** -2 Non-ASCII characters on input 348 ** -3 Unable to allocate memory 349 ** 350 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes 351 ** of zB that matched the pattern in zA. If zA does not end with a '*', 352 ** then this value is always the number of bytes in zB (i.e. strlen(zB)). 353 ** If zA does end in a '*', then it is the number of bytes in the prefix 354 ** of zB that was deemed to match zA. 355 */ 356 static int editdist1(const char *zA, const char *zB, int *pnMatch){ 357 int nA, nB; /* Number of characters in zA[] and zB[] */ 358 int xA, xB; /* Loop counters for zA[] and zB[] */ 359 char cA = 0, cB; /* Current character of zA and zB */ 360 char cAprev, cBprev; /* Previous character of zA and zB */ 361 char cAnext, cBnext; /* Next character in zA and zB */ 362 int d; /* North-west cost value */ 363 int dc = 0; /* North-west character value */ 364 int res; /* Final result */ 365 int *m; /* The cost matrix */ 366 char *cx; /* Corresponding character values */ 367 int *toFree = 0; /* Malloced space */ 368 int mStack[60+15]; /* Stack space to use if not too much is needed */ 369 int nMatch = 0; 370 371 /* Early out if either input is NULL */ 372 if( zA==0 || zB==0 ) return -1; 373 374 /* Skip any common prefix */ 375 while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; } 376 if( pnMatch ) *pnMatch = nMatch; 377 if( zA[0]==0 && zB[0]==0 ) return 0; 378 379 #if 0 380 printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' '); 381 #endif 382 383 /* Verify input strings and measure their lengths */ 384 for(nA=0; zA[nA]; nA++){ 385 if( zA[nA]&0x80 ) return -2; 386 } 387 for(nB=0; zB[nB]; nB++){ 388 if( zB[nB]&0x80 ) return -2; 389 } 390 391 /* Special processing if either string is empty */ 392 if( nA==0 ){ 393 cBprev = dc; 394 for(xB=res=0; (cB = zB[xB])!=0; xB++){ 395 res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV; 396 cBprev = cB; 397 } 398 return res; 399 } 400 if( nB==0 ){ 401 cAprev = dc; 402 for(xA=res=0; (cA = zA[xA])!=0; xA++){ 403 res += insertOrDeleteCost(cAprev, cA, zA[xA+1]); 404 cAprev = cA; 405 } 406 return res; 407 } 408 409 /* A is a prefix of B */ 410 if( zA[0]=='*' && zA[1]==0 ) return 0; 411 412 /* Allocate and initialize the Wagner matrix */ 413 if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){ 414 m = mStack; 415 }else{ 416 m = toFree = sqlite3_malloc( (nB+1)*5*sizeof(m[0])/4 ); 417 if( m==0 ) return -3; 418 } 419 cx = (char*)&m[nB+1]; 420 421 /* Compute the Wagner edit distance */ 422 m[0] = 0; 423 cx[0] = dc; 424 cBprev = dc; 425 for(xB=1; xB<=nB; xB++){ 426 cBnext = zB[xB]; 427 cB = zB[xB-1]; 428 cx[xB] = cB; 429 m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext); 430 cBprev = cB; 431 } 432 cAprev = dc; 433 for(xA=1; xA<=nA; xA++){ 434 int lastA = (xA==nA); 435 cA = zA[xA-1]; 436 cAnext = zA[xA]; 437 if( cA=='*' && lastA ) break; 438 d = m[0]; 439 dc = cx[0]; 440 m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext); 441 cBprev = 0; 442 for(xB=1; xB<=nB; xB++){ 443 int totalCost, insCost, delCost, subCost, ncx; 444 cB = zB[xB-1]; 445 cBnext = zB[xB]; 446 447 /* Cost to insert cB */ 448 insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext); 449 if( lastA ) insCost /= FINAL_INS_COST_DIV; 450 451 /* Cost to delete cA */ 452 delCost = insertOrDeleteCost(cx[xB], cA, cBnext); 453 454 /* Cost to substitute cA->cB */ 455 subCost = substituteCost(cx[xB-1], cA, cB); 456 457 /* Best cost */ 458 totalCost = insCost + m[xB-1]; 459 ncx = cB; 460 if( (delCost + m[xB])<totalCost ){ 461 totalCost = delCost + m[xB]; 462 ncx = cA; 463 } 464 if( (subCost + d)<totalCost ){ 465 totalCost = subCost + d; 466 } 467 468 #if 0 469 printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c" 470 " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n", 471 xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB, 472 insCost, delCost, subCost, totalCost, ncx?ncx:' '); 473 #endif 474 475 /* Update the matrix */ 476 d = m[xB]; 477 dc = cx[xB]; 478 m[xB] = totalCost; 479 cx[xB] = ncx; 480 cBprev = cB; 481 } 482 cAprev = cA; 483 } 484 485 /* Free the wagner matrix and return the result */ 486 if( cA=='*' ){ 487 res = m[1]; 488 for(xB=1; xB<=nB; xB++){ 489 if( m[xB]<res ){ 490 res = m[xB]; 491 if( pnMatch ) *pnMatch = xB+nMatch; 492 } 493 } 494 }else{ 495 res = m[nB]; 496 /* In the current implementation, pnMatch is always NULL if zA does 497 ** not end in "*" */ 498 assert( pnMatch==0 ); 499 } 500 sqlite3_free(toFree); 501 return res; 502 } 503 504 /* 505 ** Function: editdist(A,B) 506 ** 507 ** Return the cost of transforming string A into string B. Both strings 508 ** must be pure ASCII text. If A ends with '*' then it is assumed to be 509 ** a prefix of B and extra characters on the end of B have minimal additional 510 ** cost. 511 */ 512 static void editdistSqlFunc( 513 sqlite3_context *context, 514 int argc, 515 sqlite3_value **argv 516 ){ 517 int res = editdist1( 518 (const char*)sqlite3_value_text(argv[0]), 519 (const char*)sqlite3_value_text(argv[1]), 520 0); 521 if( res<0 ){ 522 if( res==(-3) ){ 523 sqlite3_result_error_nomem(context); 524 }else if( res==(-2) ){ 525 sqlite3_result_error(context, "non-ASCII input to editdist()", -1); 526 }else{ 527 sqlite3_result_error(context, "NULL input to editdist()", -1); 528 } 529 }else{ 530 sqlite3_result_int(context, res); 531 } 532 } 533 534 /* End of the fixed-cost edit distance implementation 535 ****************************************************************************** 536 ***************************************************************************** 537 ** Begin: Configurable cost unicode edit distance routines 538 */ 539 /* Forward declaration of structures */ 540 typedef struct EditDist3Cost EditDist3Cost; 541 typedef struct EditDist3Config EditDist3Config; 542 typedef struct EditDist3Point EditDist3Point; 543 typedef struct EditDist3From EditDist3From; 544 typedef struct EditDist3FromString EditDist3FromString; 545 typedef struct EditDist3To EditDist3To; 546 typedef struct EditDist3ToString EditDist3ToString; 547 typedef struct EditDist3Lang EditDist3Lang; 548 549 550 /* 551 ** An entry in the edit cost table 552 */ 553 struct EditDist3Cost { 554 EditDist3Cost *pNext; /* Next cost element */ 555 u8 nFrom; /* Number of bytes in aFrom */ 556 u8 nTo; /* Number of bytes in aTo */ 557 u16 iCost; /* Cost of this transformation */ 558 char a[4] ; /* FROM string followed by TO string */ 559 /* Additional TO and FROM string bytes appended as necessary */ 560 }; 561 562 /* 563 ** Edit costs for a particular language ID 564 */ 565 struct EditDist3Lang { 566 int iLang; /* Language ID */ 567 int iInsCost; /* Default insertion cost */ 568 int iDelCost; /* Default deletion cost */ 569 int iSubCost; /* Default substitution cost */ 570 EditDist3Cost *pCost; /* Costs */ 571 }; 572 573 574 /* 575 ** The default EditDist3Lang object, with default costs. 576 */ 577 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 }; 578 579 /* 580 ** Complete configuration 581 */ 582 struct EditDist3Config { 583 int nLang; /* Number of language IDs. Size of a[] */ 584 EditDist3Lang *a; /* One for each distinct language ID */ 585 }; 586 587 /* 588 ** Extra information about each character in the FROM string. 589 */ 590 struct EditDist3From { 591 int nSubst; /* Number of substitution cost entries */ 592 int nDel; /* Number of deletion cost entries */ 593 int nByte; /* Number of bytes in this character */ 594 EditDist3Cost **apSubst; /* Array of substitution costs for this element */ 595 EditDist3Cost **apDel; /* Array of deletion cost entries */ 596 }; 597 598 /* 599 ** A precompiled FROM string. 600 * 601 ** In the common case we expect the FROM string to be reused multiple times. 602 ** In other words, the common case will be to measure the edit distance 603 ** from a single origin string to multiple target strings. 604 */ 605 struct EditDist3FromString { 606 char *z; /* The complete text of the FROM string */ 607 int n; /* Number of characters in the FROM string */ 608 int isPrefix; /* True if ends with '*' character */ 609 EditDist3From *a; /* Extra info about each char of the FROM string */ 610 }; 611 612 /* 613 ** Extra information about each character in the TO string. 614 */ 615 struct EditDist3To { 616 int nIns; /* Number of insertion cost entries */ 617 int nByte; /* Number of bytes in this character */ 618 EditDist3Cost **apIns; /* Array of deletion cost entries */ 619 }; 620 621 /* 622 ** A precompiled FROM string 623 */ 624 struct EditDist3ToString { 625 char *z; /* The complete text of the TO string */ 626 int n; /* Number of characters in the TO string */ 627 EditDist3To *a; /* Extra info about each char of the TO string */ 628 }; 629 630 /* 631 ** Clear or delete an instance of the object that records all edit-distance 632 ** weights. 633 */ 634 static void editDist3ConfigClear(EditDist3Config *p){ 635 int i; 636 if( p==0 ) return; 637 for(i=0; i<p->nLang; i++){ 638 EditDist3Cost *pCost, *pNext; 639 pCost = p->a[i].pCost; 640 while( pCost ){ 641 pNext = pCost->pNext; 642 sqlite3_free(pCost); 643 pCost = pNext; 644 } 645 } 646 sqlite3_free(p->a); 647 memset(p, 0, sizeof(*p)); 648 } 649 static void editDist3ConfigDelete(void *pIn){ 650 EditDist3Config *p = (EditDist3Config*)pIn; 651 editDist3ConfigClear(p); 652 sqlite3_free(p); 653 } 654 655 /* 656 ** Load all edit-distance weights from a table. 657 */ 658 static int editDist3ConfigLoad( 659 EditDist3Config *p, /* The edit distance configuration to load */ 660 sqlite3 *db, /* Load from this database */ 661 const char *zTable /* Name of the table from which to load */ 662 ){ 663 sqlite3_stmt *pStmt; 664 int rc, rc2; 665 char *zSql; 666 int iLangPrev = -9999; 667 EditDist3Lang *pLang = 0; 668 669 zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost" 670 " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable); 671 if( zSql==0 ) return SQLITE_NOMEM; 672 rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0); 673 sqlite3_free(zSql); 674 if( rc ) return rc; 675 editDist3ConfigClear(p); 676 while( sqlite3_step(pStmt)==SQLITE_ROW ){ 677 int iLang = sqlite3_column_int(pStmt, 0); 678 const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1); 679 int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0; 680 const char *zTo = (const char*)sqlite3_column_text(pStmt, 2); 681 int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0; 682 int iCost = sqlite3_column_int(pStmt, 3); 683 684 assert( zFrom!=0 || nFrom==0 ); 685 assert( zTo!=0 || nTo==0 ); 686 if( nFrom>100 || nTo>100 ) continue; 687 if( iCost<0 ) continue; 688 if( pLang==0 || iLang!=iLangPrev ){ 689 EditDist3Lang *pNew; 690 pNew = sqlite3_realloc(p->a, (p->nLang+1)*sizeof(p->a[0])); 691 if( pNew==0 ){ rc = SQLITE_NOMEM; break; } 692 p->a = pNew; 693 pLang = &p->a[p->nLang]; 694 p->nLang++; 695 pLang->iLang = iLang; 696 pLang->iInsCost = 100; 697 pLang->iDelCost = 100; 698 pLang->iSubCost = 150; 699 pLang->pCost = 0; 700 iLangPrev = iLang; 701 } 702 if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){ 703 pLang->iDelCost = iCost; 704 }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){ 705 pLang->iInsCost = iCost; 706 }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){ 707 pLang->iSubCost = iCost; 708 }else{ 709 EditDist3Cost *pCost; 710 int nExtra = nFrom + nTo - 4; 711 if( nExtra<0 ) nExtra = 0; 712 pCost = sqlite3_malloc( sizeof(*pCost) + nExtra ); 713 if( pCost==0 ){ rc = SQLITE_NOMEM; break; } 714 pCost->nFrom = nFrom; 715 pCost->nTo = nTo; 716 pCost->iCost = iCost; 717 memcpy(pCost->a, zFrom, nFrom); 718 memcpy(pCost->a + nFrom, zTo, nTo); 719 pCost->pNext = pLang->pCost; 720 pLang->pCost = pCost; 721 } 722 } 723 rc2 = sqlite3_finalize(pStmt); 724 if( rc==SQLITE_OK ) rc = rc2; 725 return rc; 726 } 727 728 /* 729 ** Return the length (in bytes) of a utf-8 character. Or return a maximum 730 ** of N. 731 */ 732 static int utf8Len(unsigned char c, int N){ 733 int len = 1; 734 if( c>0x7f ){ 735 if( (c&0xe0)==0xc0 ){ 736 len = 2; 737 }else if( (c&0xf0)==0xe0 ){ 738 len = 3; 739 }else{ 740 len = 4; 741 } 742 } 743 if( len>N ) len = N; 744 return len; 745 } 746 747 /* 748 ** Return TRUE (non-zero) if the To side of the given cost matches 749 ** the given string. 750 */ 751 static int matchTo(EditDist3Cost *p, const char *z, int n){ 752 if( p->nTo>n ) return 0; 753 if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0; 754 return 1; 755 } 756 757 /* 758 ** Return TRUE (non-zero) if the From side of the given cost matches 759 ** the given string. 760 */ 761 static int matchFrom(EditDist3Cost *p, const char *z, int n){ 762 assert( p->nFrom<=n ); 763 if( strncmp(p->a, z, p->nFrom)!=0 ) return 0; 764 return 1; 765 } 766 767 /* 768 ** Return TRUE (non-zero) of the next FROM character and the next TO 769 ** character are the same. 770 */ 771 static int matchFromTo( 772 EditDist3FromString *pStr, /* Left hand string */ 773 int n1, /* Index of comparison character on the left */ 774 const char *z2, /* Right-handl comparison character */ 775 int n2 /* Bytes remaining in z2[] */ 776 ){ 777 int b1 = pStr->a[n1].nByte; 778 if( b1>n2 ) return 0; 779 if( memcmp(pStr->z+n1, z2, b1)!=0 ) return 0; 780 return 1; 781 } 782 783 /* 784 ** Delete an EditDist3FromString objecct 785 */ 786 static void editDist3FromStringDelete(EditDist3FromString *p){ 787 int i; 788 if( p ){ 789 for(i=0; i<p->n; i++){ 790 sqlite3_free(p->a[i].apDel); 791 sqlite3_free(p->a[i].apSubst); 792 } 793 sqlite3_free(p); 794 } 795 } 796 797 /* 798 ** Create a EditDist3FromString object. 799 */ 800 static EditDist3FromString *editDist3FromStringNew( 801 const EditDist3Lang *pLang, 802 const char *z, 803 int n 804 ){ 805 EditDist3FromString *pStr; 806 EditDist3Cost *p; 807 int i; 808 809 if( z==0 ) return 0; 810 if( n<0 ) n = (int)strlen(z); 811 pStr = sqlite3_malloc( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 ); 812 if( pStr==0 ) return 0; 813 pStr->a = (EditDist3From*)&pStr[1]; 814 memset(pStr->a, 0, sizeof(pStr->a[0])*n); 815 pStr->n = n; 816 pStr->z = (char*)&pStr->a[n]; 817 memcpy(pStr->z, z, n+1); 818 if( n && z[n-1]=='*' ){ 819 pStr->isPrefix = 1; 820 n--; 821 pStr->n--; 822 pStr->z[n] = 0; 823 }else{ 824 pStr->isPrefix = 0; 825 } 826 827 for(i=0; i<n; i++){ 828 EditDist3From *pFrom = &pStr->a[i]; 829 memset(pFrom, 0, sizeof(*pFrom)); 830 pFrom->nByte = utf8Len((unsigned char)z[i], n-i); 831 for(p=pLang->pCost; p; p=p->pNext){ 832 EditDist3Cost **apNew; 833 if( i+p->nFrom>n ) continue; 834 if( matchFrom(p, z+i, n-i)==0 ) continue; 835 if( p->nTo==0 ){ 836 apNew = sqlite3_realloc(pFrom->apDel, 837 sizeof(*apNew)*(pFrom->nDel+1)); 838 if( apNew==0 ) break; 839 pFrom->apDel = apNew; 840 apNew[pFrom->nDel++] = p; 841 }else{ 842 apNew = sqlite3_realloc(pFrom->apSubst, 843 sizeof(*apNew)*(pFrom->nSubst+1)); 844 if( apNew==0 ) break; 845 pFrom->apSubst = apNew; 846 apNew[pFrom->nSubst++] = p; 847 } 848 } 849 if( p ){ 850 editDist3FromStringDelete(pStr); 851 pStr = 0; 852 break; 853 } 854 } 855 return pStr; 856 } 857 858 /* 859 ** Update entry m[i] such that it is the minimum of its current value 860 ** and m[j]+iCost. 861 ** 862 ** If the iCost is 1,000,000 or greater, then consider the cost to be 863 ** infinite and skip the update. 864 */ 865 static void updateCost( 866 unsigned int *m, 867 int i, 868 int j, 869 int iCost 870 ){ 871 assert( iCost>=0 ); 872 if( iCost<10000 ){ 873 unsigned int b = m[j] + iCost; 874 if( b<m[i] ) m[i] = b; 875 } 876 } 877 878 /* Compute the edit distance between two strings. 879 ** 880 ** If an error occurs, return a negative number which is the error code. 881 ** 882 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters 883 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does 884 ** not contain the pattern for a prefix-search, then this is always the number 885 ** of characters in z2. If pFrom does contain a prefix search pattern, then 886 ** it is the number of characters in the prefix of z2 that was deemed to 887 ** match pFrom. 888 */ 889 static int editDist3Core( 890 EditDist3FromString *pFrom, /* The FROM string */ 891 const char *z2, /* The TO string */ 892 int n2, /* Length of the TO string */ 893 const EditDist3Lang *pLang, /* Edit weights for a particular language ID */ 894 int *pnMatch /* OUT: Characters in matched prefix */ 895 ){ 896 int k, n; 897 int i1, b1; 898 int i2, b2; 899 EditDist3FromString f = *pFrom; 900 EditDist3To *a2; 901 unsigned int *m; 902 int szRow; 903 EditDist3Cost *p; 904 int res; 905 906 /* allocate the Wagner matrix and the aTo[] array for the TO string */ 907 n = (f.n+1)*(n2+1); 908 n = (n+1)&~1; 909 m = sqlite3_malloc( n*sizeof(m[0]) + sizeof(a2[0])*n2 ); 910 if( m==0 ) return -1; /* Out of memory */ 911 a2 = (EditDist3To*)&m[n]; 912 memset(a2, 0, sizeof(a2[0])*n2); 913 914 /* Fill in the a1[] matrix for all characters of the TO string */ 915 for(i2=0; i2<n2; i2++){ 916 a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2); 917 for(p=pLang->pCost; p; p=p->pNext){ 918 EditDist3Cost **apNew; 919 if( p->nFrom>0 ) continue; 920 if( i2+p->nTo>n2 ) continue; 921 if( matchTo(p, z2+i2, n2-i2)==0 ) continue; 922 a2[i2].nIns++; 923 apNew = sqlite3_realloc(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns); 924 if( apNew==0 ){ 925 res = -1; /* Out of memory */ 926 goto editDist3Abort; 927 } 928 a2[i2].apIns = apNew; 929 a2[i2].apIns[a2[i2].nIns-1] = p; 930 } 931 } 932 933 /* Prepare to compute the minimum edit distance */ 934 szRow = f.n+1; 935 memset(m, 0x01, (n2+1)*szRow*sizeof(m[0])); 936 m[0] = 0; 937 938 /* First fill in the top-row of the matrix with FROM deletion costs */ 939 for(i1=0; i1<f.n; i1 += b1){ 940 b1 = f.a[i1].nByte; 941 updateCost(m, i1+b1, i1, pLang->iDelCost); 942 for(k=0; k<f.a[i1].nDel; k++){ 943 p = f.a[i1].apDel[k]; 944 updateCost(m, i1+p->nFrom, i1, p->iCost); 945 } 946 } 947 948 /* Fill in all subsequent rows, top-to-bottom, left-to-right */ 949 for(i2=0; i2<n2; i2 += b2){ 950 int rx; /* Starting index for current row */ 951 int rxp; /* Starting index for previous row */ 952 b2 = a2[i2].nByte; 953 rx = szRow*(i2+b2); 954 rxp = szRow*i2; 955 updateCost(m, rx, rxp, pLang->iInsCost); 956 for(k=0; k<a2[i2].nIns; k++){ 957 p = a2[i2].apIns[k]; 958 updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost); 959 } 960 for(i1=0; i1<f.n; i1+=b1){ 961 int cx; /* Index of current cell */ 962 int cxp; /* Index of cell immediately to the left */ 963 int cxd; /* Index of cell to the left and one row above */ 964 int cxu; /* Index of cell immediately above */ 965 b1 = f.a[i1].nByte; 966 cxp = rx + i1; 967 cx = cxp + b1; 968 cxd = rxp + i1; 969 cxu = cxd + b1; 970 updateCost(m, cx, cxp, pLang->iDelCost); 971 for(k=0; k<f.a[i1].nDel; k++){ 972 p = f.a[i1].apDel[k]; 973 updateCost(m, cxp+p->nFrom, cxp, p->iCost); 974 } 975 updateCost(m, cx, cxu, pLang->iInsCost); 976 if( matchFromTo(&f, i1, z2+i2, n2-i2) ){ 977 updateCost(m, cx, cxd, 0); 978 } 979 updateCost(m, cx, cxd, pLang->iSubCost); 980 for(k=0; k<f.a[i1].nSubst; k++){ 981 p = f.a[i1].apSubst[k]; 982 if( matchTo(p, z2+i2, n2-i2) ){ 983 updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost); 984 } 985 } 986 } 987 } 988 989 #if 0 /* Enable for debugging */ 990 printf(" ^"); 991 for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff); 992 printf("\n ^:"); 993 for(i1=0; i1<szRow; i1++){ 994 int v = m[i1]; 995 if( v>9999 ) printf(" ****"); 996 else printf(" %4d", v); 997 } 998 printf("\n"); 999 for(i2=0; i2<n2; i2++){ 1000 printf("%c-%02x:", z2[i2], z2[i2]&0xff); 1001 for(i1=0; i1<szRow; i1++){ 1002 int v = m[(i2+1)*szRow+i1]; 1003 if( v>9999 ) printf(" ****"); 1004 else printf(" %4d", v); 1005 } 1006 printf("\n"); 1007 } 1008 #endif 1009 1010 /* Free memory allocations and return the result */ 1011 res = (int)m[szRow*(n2+1)-1]; 1012 n = n2; 1013 if( f.isPrefix ){ 1014 for(i2=1; i2<=n2; i2++){ 1015 int b = m[szRow*i2-1]; 1016 if( b<=res ){ 1017 res = b; 1018 n = i2 - 1; 1019 } 1020 } 1021 } 1022 if( pnMatch ){ 1023 int nExtra = 0; 1024 for(k=0; k<n; k++){ 1025 if( (z2[k] & 0xc0)==0x80 ) nExtra++; 1026 } 1027 *pnMatch = n - nExtra; 1028 } 1029 1030 editDist3Abort: 1031 for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns); 1032 sqlite3_free(m); 1033 return res; 1034 } 1035 1036 /* 1037 ** Get an appropriate EditDist3Lang object. 1038 */ 1039 static const EditDist3Lang *editDist3FindLang( 1040 EditDist3Config *pConfig, 1041 int iLang 1042 ){ 1043 int i; 1044 for(i=0; i<pConfig->nLang; i++){ 1045 if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i]; 1046 } 1047 return &editDist3Lang; 1048 } 1049 1050 /* 1051 ** Function: editdist3(A,B,iLang) 1052 ** editdist3(tablename) 1053 ** 1054 ** Return the cost of transforming string A into string B using edit 1055 ** weights for iLang. 1056 ** 1057 ** The second form loads edit weights into memory from a table. 1058 */ 1059 static void editDist3SqlFunc( 1060 sqlite3_context *context, 1061 int argc, 1062 sqlite3_value **argv 1063 ){ 1064 EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context); 1065 sqlite3 *db = sqlite3_context_db_handle(context); 1066 int rc; 1067 if( argc==1 ){ 1068 const char *zTable = (const char*)sqlite3_value_text(argv[0]); 1069 rc = editDist3ConfigLoad(pConfig, db, zTable); 1070 if( rc ) sqlite3_result_error_code(context, rc); 1071 }else{ 1072 const char *zA = (const char*)sqlite3_value_text(argv[0]); 1073 const char *zB = (const char*)sqlite3_value_text(argv[1]); 1074 int nA = sqlite3_value_bytes(argv[0]); 1075 int nB = sqlite3_value_bytes(argv[1]); 1076 int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0; 1077 const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang); 1078 EditDist3FromString *pFrom; 1079 int dist; 1080 1081 pFrom = editDist3FromStringNew(pLang, zA, nA); 1082 if( pFrom==0 ){ 1083 sqlite3_result_error_nomem(context); 1084 return; 1085 } 1086 dist = editDist3Core(pFrom, zB, nB, pLang, 0); 1087 editDist3FromStringDelete(pFrom); 1088 if( dist==(-1) ){ 1089 sqlite3_result_error_nomem(context); 1090 }else{ 1091 sqlite3_result_int(context, dist); 1092 } 1093 } 1094 } 1095 1096 /* 1097 ** Register the editDist3 function with SQLite 1098 */ 1099 static int editDist3Install(sqlite3 *db){ 1100 int rc; 1101 EditDist3Config *pConfig = sqlite3_malloc( sizeof(*pConfig) ); 1102 if( pConfig==0 ) return SQLITE_NOMEM; 1103 memset(pConfig, 0, sizeof(*pConfig)); 1104 rc = sqlite3_create_function_v2(db, "editdist3", 1105 2, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0); 1106 if( rc==SQLITE_OK ){ 1107 rc = sqlite3_create_function_v2(db, "editdist3", 1108 3, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0); 1109 } 1110 if( rc==SQLITE_OK ){ 1111 rc = sqlite3_create_function_v2(db, "editdist3", 1112 1, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 1113 editDist3ConfigDelete); 1114 }else{ 1115 sqlite3_free(pConfig); 1116 } 1117 return rc; 1118 } 1119 /* End configurable cost unicode edit distance routines 1120 ****************************************************************************** 1121 ****************************************************************************** 1122 ** Begin transliterate unicode-to-ascii implementation 1123 */ 1124 1125 #if !SQLITE_AMALGAMATION 1126 /* 1127 ** This lookup table is used to help decode the first byte of 1128 ** a multi-byte UTF8 character. 1129 */ 1130 static const unsigned char sqlite3Utf8Trans1[] = { 1131 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1132 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 1133 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 1134 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 1135 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1136 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 1137 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1138 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, 1139 }; 1140 #endif 1141 1142 /* 1143 ** Return the value of the first UTF-8 character in the string. 1144 */ 1145 static int utf8Read(const unsigned char *z, int n, int *pSize){ 1146 int c, i; 1147 1148 /* All callers to this routine (in the current implementation) 1149 ** always have n>0. */ 1150 if( NEVER(n==0) ){ 1151 c = i = 0; 1152 }else{ 1153 c = z[0]; 1154 i = 1; 1155 if( c>=0xc0 ){ 1156 c = sqlite3Utf8Trans1[c-0xc0]; 1157 while( i<n && (z[i] & 0xc0)==0x80 ){ 1158 c = (c<<6) + (0x3f & z[i++]); 1159 } 1160 } 1161 } 1162 *pSize = i; 1163 return c; 1164 } 1165 1166 /* 1167 ** Return the number of characters in the utf-8 string in the nIn byte 1168 ** buffer pointed to by zIn. 1169 */ 1170 static int utf8Charlen(const char *zIn, int nIn){ 1171 int i; 1172 int nChar = 0; 1173 for(i=0; i<nIn; nChar++){ 1174 int sz; 1175 utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz); 1176 i += sz; 1177 } 1178 return nChar; 1179 } 1180 1181 /* 1182 ** Table of translations from unicode characters into ASCII. 1183 */ 1184 static const struct { 1185 unsigned short int cFrom; 1186 unsigned char cTo0, cTo1; 1187 } translit[] = { 1188 { 0x00A0, 0x20, 0x00 }, /* to */ 1189 { 0x00B5, 0x75, 0x00 }, /* µ to u */ 1190 { 0x00C0, 0x41, 0x00 }, /* À to A */ 1191 { 0x00C1, 0x41, 0x00 }, /* Á to A */ 1192 { 0x00C2, 0x41, 0x00 }, /*  to A */ 1193 { 0x00C3, 0x41, 0x00 }, /* à to A */ 1194 { 0x00C4, 0x41, 0x65 }, /* Ä to Ae */ 1195 { 0x00C5, 0x41, 0x61 }, /* Å to Aa */ 1196 { 0x00C6, 0x41, 0x45 }, /* Æ to AE */ 1197 { 0x00C7, 0x43, 0x00 }, /* Ç to C */ 1198 { 0x00C8, 0x45, 0x00 }, /* È to E */ 1199 { 0x00C9, 0x45, 0x00 }, /* É to E */ 1200 { 0x00CA, 0x45, 0x00 }, /* Ê to E */ 1201 { 0x00CB, 0x45, 0x00 }, /* Ë to E */ 1202 { 0x00CC, 0x49, 0x00 }, /* Ì to I */ 1203 { 0x00CD, 0x49, 0x00 }, /* Í to I */ 1204 { 0x00CE, 0x49, 0x00 }, /* Î to I */ 1205 { 0x00CF, 0x49, 0x00 }, /* Ï to I */ 1206 { 0x00D0, 0x44, 0x00 }, /* Ð to D */ 1207 { 0x00D1, 0x4E, 0x00 }, /* Ñ to N */ 1208 { 0x00D2, 0x4F, 0x00 }, /* Ò to O */ 1209 { 0x00D3, 0x4F, 0x00 }, /* Ó to O */ 1210 { 0x00D4, 0x4F, 0x00 }, /* Ô to O */ 1211 { 0x00D5, 0x4F, 0x00 }, /* Õ to O */ 1212 { 0x00D6, 0x4F, 0x65 }, /* Ö to Oe */ 1213 { 0x00D7, 0x78, 0x00 }, /* × to x */ 1214 { 0x00D8, 0x4F, 0x00 }, /* Ø to O */ 1215 { 0x00D9, 0x55, 0x00 }, /* Ù to U */ 1216 { 0x00DA, 0x55, 0x00 }, /* Ú to U */ 1217 { 0x00DB, 0x55, 0x00 }, /* Û to U */ 1218 { 0x00DC, 0x55, 0x65 }, /* Ü to Ue */ 1219 { 0x00DD, 0x59, 0x00 }, /* Ý to Y */ 1220 { 0x00DE, 0x54, 0x68 }, /* Þ to Th */ 1221 { 0x00DF, 0x73, 0x73 }, /* ß to ss */ 1222 { 0x00E0, 0x61, 0x00 }, /* à to a */ 1223 { 0x00E1, 0x61, 0x00 }, /* á to a */ 1224 { 0x00E2, 0x61, 0x00 }, /* â to a */ 1225 { 0x00E3, 0x61, 0x00 }, /* ã to a */ 1226 { 0x00E4, 0x61, 0x65 }, /* ä to ae */ 1227 { 0x00E5, 0x61, 0x61 }, /* å to aa */ 1228 { 0x00E6, 0x61, 0x65 }, /* æ to ae */ 1229 { 0x00E7, 0x63, 0x00 }, /* ç to c */ 1230 { 0x00E8, 0x65, 0x00 }, /* è to e */ 1231 { 0x00E9, 0x65, 0x00 }, /* é to e */ 1232 { 0x00EA, 0x65, 0x00 }, /* ê to e */ 1233 { 0x00EB, 0x65, 0x00 }, /* ë to e */ 1234 { 0x00EC, 0x69, 0x00 }, /* ì to i */ 1235 { 0x00ED, 0x69, 0x00 }, /* í to i */ 1236 { 0x00EE, 0x69, 0x00 }, /* î to i */ 1237 { 0x00EF, 0x69, 0x00 }, /* ï to i */ 1238 { 0x00F0, 0x64, 0x00 }, /* ð to d */ 1239 { 0x00F1, 0x6E, 0x00 }, /* ñ to n */ 1240 { 0x00F2, 0x6F, 0x00 }, /* ò to o */ 1241 { 0x00F3, 0x6F, 0x00 }, /* ó to o */ 1242 { 0x00F4, 0x6F, 0x00 }, /* ô to o */ 1243 { 0x00F5, 0x6F, 0x00 }, /* õ to o */ 1244 { 0x00F6, 0x6F, 0x65 }, /* ö to oe */ 1245 { 0x00F7, 0x3A, 0x00 }, /* ÷ to : */ 1246 { 0x00F8, 0x6F, 0x00 }, /* ø to o */ 1247 { 0x00F9, 0x75, 0x00 }, /* ù to u */ 1248 { 0x00FA, 0x75, 0x00 }, /* ú to u */ 1249 { 0x00FB, 0x75, 0x00 }, /* û to u */ 1250 { 0x00FC, 0x75, 0x65 }, /* ü to ue */ 1251 { 0x00FD, 0x79, 0x00 }, /* ý to y */ 1252 { 0x00FE, 0x74, 0x68 }, /* þ to th */ 1253 { 0x00FF, 0x79, 0x00 }, /* ÿ to y */ 1254 { 0x0100, 0x41, 0x00 }, /* Ā to A */ 1255 { 0x0101, 0x61, 0x00 }, /* ā to a */ 1256 { 0x0102, 0x41, 0x00 }, /* Ă to A */ 1257 { 0x0103, 0x61, 0x00 }, /* ă to a */ 1258 { 0x0104, 0x41, 0x00 }, /* Ą to A */ 1259 { 0x0105, 0x61, 0x00 }, /* ą to a */ 1260 { 0x0106, 0x43, 0x00 }, /* Ć to C */ 1261 { 0x0107, 0x63, 0x00 }, /* ć to c */ 1262 { 0x0108, 0x43, 0x68 }, /* Ĉ to Ch */ 1263 { 0x0109, 0x63, 0x68 }, /* ĉ to ch */ 1264 { 0x010A, 0x43, 0x00 }, /* Ċ to C */ 1265 { 0x010B, 0x63, 0x00 }, /* ċ to c */ 1266 { 0x010C, 0x43, 0x00 }, /* Č to C */ 1267 { 0x010D, 0x63, 0x00 }, /* č to c */ 1268 { 0x010E, 0x44, 0x00 }, /* Ď to D */ 1269 { 0x010F, 0x64, 0x00 }, /* ď to d */ 1270 { 0x0110, 0x44, 0x00 }, /* Đ to D */ 1271 { 0x0111, 0x64, 0x00 }, /* đ to d */ 1272 { 0x0112, 0x45, 0x00 }, /* Ē to E */ 1273 { 0x0113, 0x65, 0x00 }, /* ē to e */ 1274 { 0x0114, 0x45, 0x00 }, /* Ĕ to E */ 1275 { 0x0115, 0x65, 0x00 }, /* ĕ to e */ 1276 { 0x0116, 0x45, 0x00 }, /* Ė to E */ 1277 { 0x0117, 0x65, 0x00 }, /* ė to e */ 1278 { 0x0118, 0x45, 0x00 }, /* Ę to E */ 1279 { 0x0119, 0x65, 0x00 }, /* ę to e */ 1280 { 0x011A, 0x45, 0x00 }, /* Ě to E */ 1281 { 0x011B, 0x65, 0x00 }, /* ě to e */ 1282 { 0x011C, 0x47, 0x68 }, /* Ĝ to Gh */ 1283 { 0x011D, 0x67, 0x68 }, /* ĝ to gh */ 1284 { 0x011E, 0x47, 0x00 }, /* Ğ to G */ 1285 { 0x011F, 0x67, 0x00 }, /* ğ to g */ 1286 { 0x0120, 0x47, 0x00 }, /* Ġ to G */ 1287 { 0x0121, 0x67, 0x00 }, /* ġ to g */ 1288 { 0x0122, 0x47, 0x00 }, /* Ģ to G */ 1289 { 0x0123, 0x67, 0x00 }, /* ģ to g */ 1290 { 0x0124, 0x48, 0x68 }, /* Ĥ to Hh */ 1291 { 0x0125, 0x68, 0x68 }, /* ĥ to hh */ 1292 { 0x0126, 0x48, 0x00 }, /* Ħ to H */ 1293 { 0x0127, 0x68, 0x00 }, /* ħ to h */ 1294 { 0x0128, 0x49, 0x00 }, /* Ĩ to I */ 1295 { 0x0129, 0x69, 0x00 }, /* ĩ to i */ 1296 { 0x012A, 0x49, 0x00 }, /* Ī to I */ 1297 { 0x012B, 0x69, 0x00 }, /* ī to i */ 1298 { 0x012C, 0x49, 0x00 }, /* Ĭ to I */ 1299 { 0x012D, 0x69, 0x00 }, /* ĭ to i */ 1300 { 0x012E, 0x49, 0x00 }, /* Į to I */ 1301 { 0x012F, 0x69, 0x00 }, /* į to i */ 1302 { 0x0130, 0x49, 0x00 }, /* İ to I */ 1303 { 0x0131, 0x69, 0x00 }, /* ı to i */ 1304 { 0x0132, 0x49, 0x4A }, /* IJ to IJ */ 1305 { 0x0133, 0x69, 0x6A }, /* ij to ij */ 1306 { 0x0134, 0x4A, 0x68 }, /* Ĵ to Jh */ 1307 { 0x0135, 0x6A, 0x68 }, /* ĵ to jh */ 1308 { 0x0136, 0x4B, 0x00 }, /* Ķ to K */ 1309 { 0x0137, 0x6B, 0x00 }, /* ķ to k */ 1310 { 0x0138, 0x6B, 0x00 }, /* ĸ to k */ 1311 { 0x0139, 0x4C, 0x00 }, /* Ĺ to L */ 1312 { 0x013A, 0x6C, 0x00 }, /* ĺ to l */ 1313 { 0x013B, 0x4C, 0x00 }, /* Ļ to L */ 1314 { 0x013C, 0x6C, 0x00 }, /* ļ to l */ 1315 { 0x013D, 0x4C, 0x00 }, /* Ľ to L */ 1316 { 0x013E, 0x6C, 0x00 }, /* ľ to l */ 1317 { 0x013F, 0x4C, 0x2E }, /* Ŀ to L. */ 1318 { 0x0140, 0x6C, 0x2E }, /* ŀ to l. */ 1319 { 0x0141, 0x4C, 0x00 }, /* Ł to L */ 1320 { 0x0142, 0x6C, 0x00 }, /* ł to l */ 1321 { 0x0143, 0x4E, 0x00 }, /* Ń to N */ 1322 { 0x0144, 0x6E, 0x00 }, /* ń to n */ 1323 { 0x0145, 0x4E, 0x00 }, /* Ņ to N */ 1324 { 0x0146, 0x6E, 0x00 }, /* ņ to n */ 1325 { 0x0147, 0x4E, 0x00 }, /* Ň to N */ 1326 { 0x0148, 0x6E, 0x00 }, /* ň to n */ 1327 { 0x0149, 0x27, 0x6E }, /* ʼn to 'n */ 1328 { 0x014A, 0x4E, 0x47 }, /* Ŋ to NG */ 1329 { 0x014B, 0x6E, 0x67 }, /* ŋ to ng */ 1330 { 0x014C, 0x4F, 0x00 }, /* Ō to O */ 1331 { 0x014D, 0x6F, 0x00 }, /* ō to o */ 1332 { 0x014E, 0x4F, 0x00 }, /* Ŏ to O */ 1333 { 0x014F, 0x6F, 0x00 }, /* ŏ to o */ 1334 { 0x0150, 0x4F, 0x00 }, /* Ő to O */ 1335 { 0x0151, 0x6F, 0x00 }, /* ő to o */ 1336 { 0x0152, 0x4F, 0x45 }, /* Œ to OE */ 1337 { 0x0153, 0x6F, 0x65 }, /* œ to oe */ 1338 { 0x0154, 0x52, 0x00 }, /* Ŕ to R */ 1339 { 0x0155, 0x72, 0x00 }, /* ŕ to r */ 1340 { 0x0156, 0x52, 0x00 }, /* Ŗ to R */ 1341 { 0x0157, 0x72, 0x00 }, /* ŗ to r */ 1342 { 0x0158, 0x52, 0x00 }, /* Ř to R */ 1343 { 0x0159, 0x72, 0x00 }, /* ř to r */ 1344 { 0x015A, 0x53, 0x00 }, /* Ś to S */ 1345 { 0x015B, 0x73, 0x00 }, /* ś to s */ 1346 { 0x015C, 0x53, 0x68 }, /* Ŝ to Sh */ 1347 { 0x015D, 0x73, 0x68 }, /* ŝ to sh */ 1348 { 0x015E, 0x53, 0x00 }, /* Ş to S */ 1349 { 0x015F, 0x73, 0x00 }, /* ş to s */ 1350 { 0x0160, 0x53, 0x00 }, /* Š to S */ 1351 { 0x0161, 0x73, 0x00 }, /* š to s */ 1352 { 0x0162, 0x54, 0x00 }, /* Ţ to T */ 1353 { 0x0163, 0x74, 0x00 }, /* ţ to t */ 1354 { 0x0164, 0x54, 0x00 }, /* Ť to T */ 1355 { 0x0165, 0x74, 0x00 }, /* ť to t */ 1356 { 0x0166, 0x54, 0x00 }, /* Ŧ to T */ 1357 { 0x0167, 0x74, 0x00 }, /* ŧ to t */ 1358 { 0x0168, 0x55, 0x00 }, /* Ũ to U */ 1359 { 0x0169, 0x75, 0x00 }, /* ũ to u */ 1360 { 0x016A, 0x55, 0x00 }, /* Ū to U */ 1361 { 0x016B, 0x75, 0x00 }, /* ū to u */ 1362 { 0x016C, 0x55, 0x00 }, /* Ŭ to U */ 1363 { 0x016D, 0x75, 0x00 }, /* ŭ to u */ 1364 { 0x016E, 0x55, 0x00 }, /* Ů to U */ 1365 { 0x016F, 0x75, 0x00 }, /* ů to u */ 1366 { 0x0170, 0x55, 0x00 }, /* Ű to U */ 1367 { 0x0171, 0x75, 0x00 }, /* ű to u */ 1368 { 0x0172, 0x55, 0x00 }, /* Ų to U */ 1369 { 0x0173, 0x75, 0x00 }, /* ų to u */ 1370 { 0x0174, 0x57, 0x00 }, /* Ŵ to W */ 1371 { 0x0175, 0x77, 0x00 }, /* ŵ to w */ 1372 { 0x0176, 0x59, 0x00 }, /* Ŷ to Y */ 1373 { 0x0177, 0x79, 0x00 }, /* ŷ to y */ 1374 { 0x0178, 0x59, 0x00 }, /* Ÿ to Y */ 1375 { 0x0179, 0x5A, 0x00 }, /* Ź to Z */ 1376 { 0x017A, 0x7A, 0x00 }, /* ź to z */ 1377 { 0x017B, 0x5A, 0x00 }, /* Ż to Z */ 1378 { 0x017C, 0x7A, 0x00 }, /* ż to z */ 1379 { 0x017D, 0x5A, 0x00 }, /* Ž to Z */ 1380 { 0x017E, 0x7A, 0x00 }, /* ž to z */ 1381 { 0x017F, 0x73, 0x00 }, /* ſ to s */ 1382 { 0x0192, 0x66, 0x00 }, /* ƒ to f */ 1383 { 0x0218, 0x53, 0x00 }, /* Ș to S */ 1384 { 0x0219, 0x73, 0x00 }, /* ș to s */ 1385 { 0x021A, 0x54, 0x00 }, /* Ț to T */ 1386 { 0x021B, 0x74, 0x00 }, /* ț to t */ 1387 { 0x0386, 0x41, 0x00 }, /* Ά to A */ 1388 { 0x0388, 0x45, 0x00 }, /* Έ to E */ 1389 { 0x0389, 0x49, 0x00 }, /* Ή to I */ 1390 { 0x038A, 0x49, 0x00 }, /* Ί to I */ 1391 { 0x038C, 0x4f, 0x00 }, /* Ό to O */ 1392 { 0x038E, 0x59, 0x00 }, /* Ύ to Y */ 1393 { 0x038F, 0x4f, 0x00 }, /* Ώ to O */ 1394 { 0x0390, 0x69, 0x00 }, /* ΐ to i */ 1395 { 0x0391, 0x41, 0x00 }, /* Α to A */ 1396 { 0x0392, 0x42, 0x00 }, /* Β to B */ 1397 { 0x0393, 0x47, 0x00 }, /* Γ to G */ 1398 { 0x0394, 0x44, 0x00 }, /* Δ to D */ 1399 { 0x0395, 0x45, 0x00 }, /* Ε to E */ 1400 { 0x0396, 0x5a, 0x00 }, /* Ζ to Z */ 1401 { 0x0397, 0x49, 0x00 }, /* Η to I */ 1402 { 0x0398, 0x54, 0x68 }, /* Θ to Th */ 1403 { 0x0399, 0x49, 0x00 }, /* Ι to I */ 1404 { 0x039A, 0x4b, 0x00 }, /* Κ to K */ 1405 { 0x039B, 0x4c, 0x00 }, /* Λ to L */ 1406 { 0x039C, 0x4d, 0x00 }, /* Μ to M */ 1407 { 0x039D, 0x4e, 0x00 }, /* Ν to N */ 1408 { 0x039E, 0x58, 0x00 }, /* Ξ to X */ 1409 { 0x039F, 0x4f, 0x00 }, /* Ο to O */ 1410 { 0x03A0, 0x50, 0x00 }, /* Π to P */ 1411 { 0x03A1, 0x52, 0x00 }, /* Ρ to R */ 1412 { 0x03A3, 0x53, 0x00 }, /* Σ to S */ 1413 { 0x03A4, 0x54, 0x00 }, /* Τ to T */ 1414 { 0x03A5, 0x59, 0x00 }, /* Υ to Y */ 1415 { 0x03A6, 0x46, 0x00 }, /* Φ to F */ 1416 { 0x03A7, 0x43, 0x68 }, /* Χ to Ch */ 1417 { 0x03A8, 0x50, 0x73 }, /* Ψ to Ps */ 1418 { 0x03A9, 0x4f, 0x00 }, /* Ω to O */ 1419 { 0x03AA, 0x49, 0x00 }, /* Ϊ to I */ 1420 { 0x03AB, 0x59, 0x00 }, /* Ϋ to Y */ 1421 { 0x03AC, 0x61, 0x00 }, /* ά to a */ 1422 { 0x03AD, 0x65, 0x00 }, /* έ to e */ 1423 { 0x03AE, 0x69, 0x00 }, /* ή to i */ 1424 { 0x03AF, 0x69, 0x00 }, /* ί to i */ 1425 { 0x03B1, 0x61, 0x00 }, /* α to a */ 1426 { 0x03B2, 0x62, 0x00 }, /* β to b */ 1427 { 0x03B3, 0x67, 0x00 }, /* γ to g */ 1428 { 0x03B4, 0x64, 0x00 }, /* δ to d */ 1429 { 0x03B5, 0x65, 0x00 }, /* ε to e */ 1430 { 0x03B6, 0x7a, 0x00 }, /* ζ to z */ 1431 { 0x03B7, 0x69, 0x00 }, /* η to i */ 1432 { 0x03B8, 0x74, 0x68 }, /* θ to th */ 1433 { 0x03B9, 0x69, 0x00 }, /* ι to i */ 1434 { 0x03BA, 0x6b, 0x00 }, /* κ to k */ 1435 { 0x03BB, 0x6c, 0x00 }, /* λ to l */ 1436 { 0x03BC, 0x6d, 0x00 }, /* μ to m */ 1437 { 0x03BD, 0x6e, 0x00 }, /* ν to n */ 1438 { 0x03BE, 0x78, 0x00 }, /* ξ to x */ 1439 { 0x03BF, 0x6f, 0x00 }, /* ο to o */ 1440 { 0x03C0, 0x70, 0x00 }, /* π to p */ 1441 { 0x03C1, 0x72, 0x00 }, /* ρ to r */ 1442 { 0x03C3, 0x73, 0x00 }, /* σ to s */ 1443 { 0x03C4, 0x74, 0x00 }, /* τ to t */ 1444 { 0x03C5, 0x79, 0x00 }, /* υ to y */ 1445 { 0x03C6, 0x66, 0x00 }, /* φ to f */ 1446 { 0x03C7, 0x63, 0x68 }, /* χ to ch */ 1447 { 0x03C8, 0x70, 0x73 }, /* ψ to ps */ 1448 { 0x03C9, 0x6f, 0x00 }, /* ω to o */ 1449 { 0x03CA, 0x69, 0x00 }, /* ϊ to i */ 1450 { 0x03CB, 0x79, 0x00 }, /* ϋ to y */ 1451 { 0x03CC, 0x6f, 0x00 }, /* ό to o */ 1452 { 0x03CD, 0x79, 0x00 }, /* ύ to y */ 1453 { 0x03CE, 0x69, 0x00 }, /* ώ to i */ 1454 { 0x0400, 0x45, 0x00 }, /* Ѐ to E */ 1455 { 0x0401, 0x45, 0x00 }, /* Ё to E */ 1456 { 0x0402, 0x44, 0x00 }, /* Ђ to D */ 1457 { 0x0403, 0x47, 0x00 }, /* Ѓ to G */ 1458 { 0x0404, 0x45, 0x00 }, /* Є to E */ 1459 { 0x0405, 0x5a, 0x00 }, /* Ѕ to Z */ 1460 { 0x0406, 0x49, 0x00 }, /* І to I */ 1461 { 0x0407, 0x49, 0x00 }, /* Ї to I */ 1462 { 0x0408, 0x4a, 0x00 }, /* Ј to J */ 1463 { 0x0409, 0x49, 0x00 }, /* Љ to I */ 1464 { 0x040A, 0x4e, 0x00 }, /* Њ to N */ 1465 { 0x040B, 0x44, 0x00 }, /* Ћ to D */ 1466 { 0x040C, 0x4b, 0x00 }, /* Ќ to K */ 1467 { 0x040D, 0x49, 0x00 }, /* Ѝ to I */ 1468 { 0x040E, 0x55, 0x00 }, /* Ў to U */ 1469 { 0x040F, 0x44, 0x00 }, /* Џ to D */ 1470 { 0x0410, 0x41, 0x00 }, /* А to A */ 1471 { 0x0411, 0x42, 0x00 }, /* Б to B */ 1472 { 0x0412, 0x56, 0x00 }, /* В to V */ 1473 { 0x0413, 0x47, 0x00 }, /* Г to G */ 1474 { 0x0414, 0x44, 0x00 }, /* Д to D */ 1475 { 0x0415, 0x45, 0x00 }, /* Е to E */ 1476 { 0x0416, 0x5a, 0x68 }, /* Ж to Zh */ 1477 { 0x0417, 0x5a, 0x00 }, /* З to Z */ 1478 { 0x0418, 0x49, 0x00 }, /* И to I */ 1479 { 0x0419, 0x49, 0x00 }, /* Й to I */ 1480 { 0x041A, 0x4b, 0x00 }, /* К to K */ 1481 { 0x041B, 0x4c, 0x00 }, /* Л to L */ 1482 { 0x041C, 0x4d, 0x00 }, /* М to M */ 1483 { 0x041D, 0x4e, 0x00 }, /* Н to N */ 1484 { 0x041E, 0x4f, 0x00 }, /* О to O */ 1485 { 0x041F, 0x50, 0x00 }, /* П to P */ 1486 { 0x0420, 0x52, 0x00 }, /* Р to R */ 1487 { 0x0421, 0x53, 0x00 }, /* С to S */ 1488 { 0x0422, 0x54, 0x00 }, /* Т to T */ 1489 { 0x0423, 0x55, 0x00 }, /* У to U */ 1490 { 0x0424, 0x46, 0x00 }, /* Ф to F */ 1491 { 0x0425, 0x4b, 0x68 }, /* Х to Kh */ 1492 { 0x0426, 0x54, 0x63 }, /* Ц to Tc */ 1493 { 0x0427, 0x43, 0x68 }, /* Ч to Ch */ 1494 { 0x0428, 0x53, 0x68 }, /* Ш to Sh */ 1495 { 0x0429, 0x53, 0x68 }, /* Щ to Shch */ 1496 { 0x042A, 0x61, 0x00 }, /* to A */ 1497 { 0x042B, 0x59, 0x00 }, /* Ы to Y */ 1498 { 0x042C, 0x59, 0x00 }, /* to Y */ 1499 { 0x042D, 0x45, 0x00 }, /* Э to E */ 1500 { 0x042E, 0x49, 0x75 }, /* Ю to Iu */ 1501 { 0x042F, 0x49, 0x61 }, /* Я to Ia */ 1502 { 0x0430, 0x61, 0x00 }, /* а to a */ 1503 { 0x0431, 0x62, 0x00 }, /* б to b */ 1504 { 0x0432, 0x76, 0x00 }, /* в to v */ 1505 { 0x0433, 0x67, 0x00 }, /* г to g */ 1506 { 0x0434, 0x64, 0x00 }, /* д to d */ 1507 { 0x0435, 0x65, 0x00 }, /* е to e */ 1508 { 0x0436, 0x7a, 0x68 }, /* ж to zh */ 1509 { 0x0437, 0x7a, 0x00 }, /* з to z */ 1510 { 0x0438, 0x69, 0x00 }, /* и to i */ 1511 { 0x0439, 0x69, 0x00 }, /* й to i */ 1512 { 0x043A, 0x6b, 0x00 }, /* к to k */ 1513 { 0x043B, 0x6c, 0x00 }, /* л to l */ 1514 { 0x043C, 0x6d, 0x00 }, /* м to m */ 1515 { 0x043D, 0x6e, 0x00 }, /* н to n */ 1516 { 0x043E, 0x6f, 0x00 }, /* о to o */ 1517 { 0x043F, 0x70, 0x00 }, /* п to p */ 1518 { 0x0440, 0x72, 0x00 }, /* р to r */ 1519 { 0x0441, 0x73, 0x00 }, /* с to s */ 1520 { 0x0442, 0x74, 0x00 }, /* т to t */ 1521 { 0x0443, 0x75, 0x00 }, /* у to u */ 1522 { 0x0444, 0x66, 0x00 }, /* ф to f */ 1523 { 0x0445, 0x6b, 0x68 }, /* х to kh */ 1524 { 0x0446, 0x74, 0x63 }, /* ц to tc */ 1525 { 0x0447, 0x63, 0x68 }, /* ч to ch */ 1526 { 0x0448, 0x73, 0x68 }, /* ш to sh */ 1527 { 0x0449, 0x73, 0x68 }, /* щ to shch */ 1528 { 0x044A, 0x61, 0x00 }, /* to a */ 1529 { 0x044B, 0x79, 0x00 }, /* ы to y */ 1530 { 0x044C, 0x79, 0x00 }, /* to y */ 1531 { 0x044D, 0x65, 0x00 }, /* э to e */ 1532 { 0x044E, 0x69, 0x75 }, /* ю to iu */ 1533 { 0x044F, 0x69, 0x61 }, /* я to ia */ 1534 { 0x0450, 0x65, 0x00 }, /* ѐ to e */ 1535 { 0x0451, 0x65, 0x00 }, /* ё to e */ 1536 { 0x0452, 0x64, 0x00 }, /* ђ to d */ 1537 { 0x0453, 0x67, 0x00 }, /* ѓ to g */ 1538 { 0x0454, 0x65, 0x00 }, /* є to e */ 1539 { 0x0455, 0x7a, 0x00 }, /* ѕ to z */ 1540 { 0x0456, 0x69, 0x00 }, /* і to i */ 1541 { 0x0457, 0x69, 0x00 }, /* ї to i */ 1542 { 0x0458, 0x6a, 0x00 }, /* ј to j */ 1543 { 0x0459, 0x69, 0x00 }, /* љ to i */ 1544 { 0x045A, 0x6e, 0x00 }, /* њ to n */ 1545 { 0x045B, 0x64, 0x00 }, /* ћ to d */ 1546 { 0x045C, 0x6b, 0x00 }, /* ќ to k */ 1547 { 0x045D, 0x69, 0x00 }, /* ѝ to i */ 1548 { 0x045E, 0x75, 0x00 }, /* ў to u */ 1549 { 0x045F, 0x64, 0x00 }, /* џ to d */ 1550 { 0x1E02, 0x42, 0x00 }, /* Ḃ to B */ 1551 { 0x1E03, 0x62, 0x00 }, /* ḃ to b */ 1552 { 0x1E0A, 0x44, 0x00 }, /* Ḋ to D */ 1553 { 0x1E0B, 0x64, 0x00 }, /* ḋ to d */ 1554 { 0x1E1E, 0x46, 0x00 }, /* Ḟ to F */ 1555 { 0x1E1F, 0x66, 0x00 }, /* ḟ to f */ 1556 { 0x1E40, 0x4D, 0x00 }, /* Ṁ to M */ 1557 { 0x1E41, 0x6D, 0x00 }, /* ṁ to m */ 1558 { 0x1E56, 0x50, 0x00 }, /* Ṗ to P */ 1559 { 0x1E57, 0x70, 0x00 }, /* ṗ to p */ 1560 { 0x1E60, 0x53, 0x00 }, /* Ṡ to S */ 1561 { 0x1E61, 0x73, 0x00 }, /* ṡ to s */ 1562 { 0x1E6A, 0x54, 0x00 }, /* Ṫ to T */ 1563 { 0x1E6B, 0x74, 0x00 }, /* ṫ to t */ 1564 { 0x1E80, 0x57, 0x00 }, /* Ẁ to W */ 1565 { 0x1E81, 0x77, 0x00 }, /* ẁ to w */ 1566 { 0x1E82, 0x57, 0x00 }, /* Ẃ to W */ 1567 { 0x1E83, 0x77, 0x00 }, /* ẃ to w */ 1568 { 0x1E84, 0x57, 0x00 }, /* Ẅ to W */ 1569 { 0x1E85, 0x77, 0x00 }, /* ẅ to w */ 1570 { 0x1EF2, 0x59, 0x00 }, /* Ỳ to Y */ 1571 { 0x1EF3, 0x79, 0x00 }, /* ỳ to y */ 1572 { 0xFB00, 0x66, 0x66 }, /* ff to ff */ 1573 { 0xFB01, 0x66, 0x69 }, /* fi to fi */ 1574 { 0xFB02, 0x66, 0x6C }, /* fl to fl */ 1575 { 0xFB05, 0x73, 0x74 }, /* ſt to st */ 1576 { 0xFB06, 0x73, 0x74 }, /* st to st */ 1577 }; 1578 1579 /* 1580 ** Convert the input string from UTF-8 into pure ASCII by converting 1581 ** all non-ASCII characters to some combination of characters in the 1582 ** ASCII subset. 1583 ** 1584 ** The returned string might contain more characters than the input. 1585 ** 1586 ** Space to hold the returned string comes from sqlite3_malloc() and 1587 ** should be freed by the caller. 1588 */ 1589 static unsigned char *transliterate(const unsigned char *zIn, int nIn){ 1590 unsigned char *zOut = sqlite3_malloc( nIn*4 + 1 ); 1591 int c, sz, nOut; 1592 if( zOut==0 ) return 0; 1593 nOut = 0; 1594 while( nIn>0 ){ 1595 c = utf8Read(zIn, nIn, &sz); 1596 zIn += sz; 1597 nIn -= sz; 1598 if( c<=127 ){ 1599 zOut[nOut++] = c; 1600 }else{ 1601 int xTop, xBtm, x; 1602 xTop = sizeof(translit)/sizeof(translit[0]) - 1; 1603 xBtm = 0; 1604 while( xTop>=xBtm ){ 1605 x = (xTop + xBtm)/2; 1606 if( translit[x].cFrom==c ){ 1607 zOut[nOut++] = translit[x].cTo0; 1608 if( translit[x].cTo1 ){ 1609 zOut[nOut++] = translit[x].cTo1; 1610 /* Add an extra "ch" after the "sh" for Щ and щ */ 1611 if( c==0x0429 || c== 0x0449 ){ 1612 zOut[nOut++] = 'c'; 1613 zOut[nOut++] = 'h'; 1614 } 1615 } 1616 c = 0; 1617 break; 1618 }else if( translit[x].cFrom>c ){ 1619 xTop = x-1; 1620 }else{ 1621 xBtm = x+1; 1622 } 1623 } 1624 if( c ) zOut[nOut++] = '?'; 1625 } 1626 } 1627 zOut[nOut] = 0; 1628 return zOut; 1629 } 1630 1631 /* 1632 ** Return the number of characters in the shortest prefix of the input 1633 ** string that transliterates to an ASCII string nTrans bytes or longer. 1634 ** Or, if the transliteration of the input string is less than nTrans 1635 ** bytes in size, return the number of characters in the input string. 1636 */ 1637 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){ 1638 int i, c, sz, nOut; 1639 int nChar; 1640 1641 i = nOut = 0; 1642 for(nChar=0; i<nIn && nOut<nTrans; nChar++){ 1643 c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz); 1644 i += sz; 1645 1646 nOut++; 1647 if( c>=128 ){ 1648 int xTop, xBtm, x; 1649 xTop = sizeof(translit)/sizeof(translit[0]) - 1; 1650 xBtm = 0; 1651 while( xTop>=xBtm ){ 1652 x = (xTop + xBtm)/2; 1653 if( translit[x].cFrom==c ){ 1654 if( translit[x].cTo1 ) nOut++; 1655 if( c==0x0429 || c== 0x0449 ) nOut += 2; 1656 break; 1657 }else if( translit[x].cFrom>c ){ 1658 xTop = x-1; 1659 }else{ 1660 xBtm = x+1; 1661 } 1662 } 1663 } 1664 } 1665 1666 return nChar; 1667 } 1668 1669 1670 /* 1671 ** spellfix1_translit(X) 1672 ** 1673 ** Convert a string that contains non-ASCII Roman characters into 1674 ** pure ASCII. 1675 */ 1676 static void transliterateSqlFunc( 1677 sqlite3_context *context, 1678 int argc, 1679 sqlite3_value **argv 1680 ){ 1681 const unsigned char *zIn = sqlite3_value_text(argv[0]); 1682 int nIn = sqlite3_value_bytes(argv[0]); 1683 unsigned char *zOut = transliterate(zIn, nIn); 1684 if( zOut==0 ){ 1685 sqlite3_result_error_nomem(context); 1686 }else{ 1687 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free); 1688 } 1689 } 1690 1691 /* 1692 ** spellfix1_scriptcode(X) 1693 ** 1694 ** Try to determine the dominant script used by the word X and return 1695 ** its ISO 15924 numeric code. 1696 ** 1697 ** The current implementation only understands the following scripts: 1698 ** 1699 ** 215 (Latin) 1700 ** 220 (Cyrillic) 1701 ** 200 (Greek) 1702 ** 1703 ** This routine will return 998 if the input X contains characters from 1704 ** two or more of the above scripts or 999 if X contains no characters 1705 ** from any of the above scripts. 1706 */ 1707 static void scriptCodeSqlFunc( 1708 sqlite3_context *context, 1709 int argc, 1710 sqlite3_value **argv 1711 ){ 1712 const unsigned char *zIn = sqlite3_value_text(argv[0]); 1713 int nIn = sqlite3_value_bytes(argv[0]); 1714 int c, sz; 1715 int scriptMask = 0; 1716 int res; 1717 # define SCRIPT_LATIN 0x0001 1718 # define SCRIPT_CYRILLIC 0x0002 1719 # define SCRIPT_GREEK 0x0004 1720 1721 while( nIn>0 ){ 1722 c = utf8Read(zIn, nIn, &sz); 1723 zIn += sz; 1724 nIn -= sz; 1725 if( c<0x02af ){ 1726 scriptMask |= SCRIPT_LATIN; 1727 }else if( c>=0x0400 && c<=0x04ff ){ 1728 scriptMask |= SCRIPT_CYRILLIC; 1729 }else if( c>=0x0386 && c<=0x03ce ){ 1730 scriptMask |= SCRIPT_GREEK; 1731 } 1732 } 1733 switch( scriptMask ){ 1734 case 0: res = 999; break; 1735 case SCRIPT_LATIN: res = 215; break; 1736 case SCRIPT_CYRILLIC: res = 220; break; 1737 case SCRIPT_GREEK: res = 200; break; 1738 default: res = 998; break; 1739 } 1740 sqlite3_result_int(context, res); 1741 } 1742 1743 /* End transliterate 1744 ****************************************************************************** 1745 ****************************************************************************** 1746 ** Begin spellfix1 virtual table. 1747 */ 1748 1749 /* Maximum length of a phonehash used for querying the shadow table */ 1750 #define SPELLFIX_MX_HASH 8 1751 1752 /* Maximum number of hash strings to examine per query */ 1753 #define SPELLFIX_MX_RUN 1 1754 1755 typedef struct spellfix1_vtab spellfix1_vtab; 1756 typedef struct spellfix1_cursor spellfix1_cursor; 1757 1758 /* Fuzzy-search virtual table object */ 1759 struct spellfix1_vtab { 1760 sqlite3_vtab base; /* Base class - must be first */ 1761 sqlite3 *db; /* Database connection */ 1762 char *zDbName; /* Name of database holding this table */ 1763 char *zTableName; /* Name of the virtual table */ 1764 char *zCostTable; /* Table holding edit-distance cost numbers */ 1765 EditDist3Config *pConfig3; /* Parsed edit distance costs */ 1766 }; 1767 1768 /* Fuzzy-search cursor object */ 1769 struct spellfix1_cursor { 1770 sqlite3_vtab_cursor base; /* Base class - must be first */ 1771 spellfix1_vtab *pVTab; /* The table to which this cursor belongs */ 1772 char *zPattern; /* rhs of MATCH clause */ 1773 int idxNum; /* idxNum value passed to xFilter() */ 1774 int nRow; /* Number of rows of content */ 1775 int nAlloc; /* Number of allocated rows */ 1776 int iRow; /* Current row of content */ 1777 int iLang; /* Value of the langid= constraint */ 1778 int iTop; /* Value of the top= constraint */ 1779 int iScope; /* Value of the scope= constraint */ 1780 int nSearch; /* Number of vocabulary items checked */ 1781 sqlite3_stmt *pFullScan; /* Shadow query for a full table scan */ 1782 struct spellfix1_row { /* For each row of content */ 1783 sqlite3_int64 iRowid; /* Rowid for this row */ 1784 char *zWord; /* Text for this row */ 1785 int iRank; /* Rank for this row */ 1786 int iDistance; /* Distance from pattern for this row */ 1787 int iScore; /* Score for sorting */ 1788 int iMatchlen; /* Value of matchlen column (or -1) */ 1789 char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */ 1790 } *a; 1791 }; 1792 1793 /* 1794 ** Construct one or more SQL statements from the format string given 1795 ** and then evaluate those statements. The success code is written 1796 ** into *pRc. 1797 ** 1798 ** If *pRc is initially non-zero then this routine is a no-op. 1799 */ 1800 static void spellfix1DbExec( 1801 int *pRc, /* Success code */ 1802 sqlite3 *db, /* Database in which to run SQL */ 1803 const char *zFormat, /* Format string for SQL */ 1804 ... /* Arguments to the format string */ 1805 ){ 1806 va_list ap; 1807 char *zSql; 1808 if( *pRc ) return; 1809 va_start(ap, zFormat); 1810 zSql = sqlite3_vmprintf(zFormat, ap); 1811 va_end(ap); 1812 if( zSql==0 ){ 1813 *pRc = SQLITE_NOMEM; 1814 }else{ 1815 *pRc = sqlite3_exec(db, zSql, 0, 0, 0); 1816 sqlite3_free(zSql); 1817 } 1818 } 1819 1820 /* 1821 ** xDisconnect/xDestroy method for the fuzzy-search module. 1822 */ 1823 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){ 1824 spellfix1_vtab *p = (spellfix1_vtab*)pVTab; 1825 int rc = SQLITE_OK; 1826 if( isDestroy ){ 1827 sqlite3 *db = p->db; 1828 spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"", 1829 p->zDbName, p->zTableName); 1830 } 1831 if( rc==SQLITE_OK ){ 1832 sqlite3_free(p->zTableName); 1833 editDist3ConfigDelete(p->pConfig3); 1834 sqlite3_free(p->zCostTable); 1835 sqlite3_free(p); 1836 } 1837 return rc; 1838 } 1839 static int spellfix1Disconnect(sqlite3_vtab *pVTab){ 1840 return spellfix1Uninit(0, pVTab); 1841 } 1842 static int spellfix1Destroy(sqlite3_vtab *pVTab){ 1843 return spellfix1Uninit(1, pVTab); 1844 } 1845 1846 /* 1847 ** Make a copy of a string. Remove leading and trailing whitespace 1848 ** and dequote it. 1849 */ 1850 static char *spellfix1Dequote(const char *zIn){ 1851 char *zOut; 1852 int i, j; 1853 char c; 1854 while( isspace(zIn[0]) ) zIn++; 1855 zOut = sqlite3_mprintf("%s", zIn); 1856 if( zOut==0 ) return 0; 1857 i = (int)strlen(zOut); 1858 #if 0 /* The parser will never leave spaces at the end */ 1859 while( i>0 && isspace(zOut[i-1]) ){ i--; } 1860 #endif 1861 zOut[i] = 0; 1862 c = zOut[0]; 1863 if( c=='\'' || c=='"' ){ 1864 for(i=1, j=0; ALWAYS(zOut[i]); i++){ 1865 zOut[j++] = zOut[i]; 1866 if( zOut[i]==c ){ 1867 if( zOut[i+1]==c ){ 1868 i++; 1869 }else{ 1870 zOut[j-1] = 0; 1871 break; 1872 } 1873 } 1874 } 1875 } 1876 return zOut; 1877 } 1878 1879 1880 /* 1881 ** xConnect/xCreate method for the spellfix1 module. Arguments are: 1882 ** 1883 ** argv[0] -> module name ("spellfix1") 1884 ** argv[1] -> database name 1885 ** argv[2] -> table name 1886 ** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter) 1887 */ 1888 static int spellfix1Init( 1889 int isCreate, 1890 sqlite3 *db, 1891 void *pAux, 1892 int argc, const char *const*argv, 1893 sqlite3_vtab **ppVTab, 1894 char **pzErr 1895 ){ 1896 spellfix1_vtab *pNew = 0; 1897 /* const char *zModule = argv[0]; // not used */ 1898 const char *zDbName = argv[1]; 1899 const char *zTableName = argv[2]; 1900 int nDbName; 1901 int rc = SQLITE_OK; 1902 int i; 1903 1904 nDbName = (int)strlen(zDbName); 1905 pNew = sqlite3_malloc( sizeof(*pNew) + nDbName + 1); 1906 if( pNew==0 ){ 1907 rc = SQLITE_NOMEM; 1908 }else{ 1909 memset(pNew, 0, sizeof(*pNew)); 1910 pNew->zDbName = (char*)&pNew[1]; 1911 memcpy(pNew->zDbName, zDbName, nDbName+1); 1912 pNew->zTableName = sqlite3_mprintf("%s", zTableName); 1913 pNew->db = db; 1914 if( pNew->zTableName==0 ){ 1915 rc = SQLITE_NOMEM; 1916 }else{ 1917 rc = sqlite3_declare_vtab(db, 1918 "CREATE TABLE x(word,rank,distance,langid, " 1919 "score, matchlen, phonehash HIDDEN, " 1920 "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, " 1921 "soundslike HIDDEN, command HIDDEN)" 1922 ); 1923 #define SPELLFIX_COL_WORD 0 1924 #define SPELLFIX_COL_RANK 1 1925 #define SPELLFIX_COL_DISTANCE 2 1926 #define SPELLFIX_COL_LANGID 3 1927 #define SPELLFIX_COL_SCORE 4 1928 #define SPELLFIX_COL_MATCHLEN 5 1929 #define SPELLFIX_COL_PHONEHASH 6 1930 #define SPELLFIX_COL_TOP 7 1931 #define SPELLFIX_COL_SCOPE 8 1932 #define SPELLFIX_COL_SRCHCNT 9 1933 #define SPELLFIX_COL_SOUNDSLIKE 10 1934 #define SPELLFIX_COL_COMMAND 11 1935 } 1936 if( rc==SQLITE_OK && isCreate ){ 1937 spellfix1DbExec(&rc, db, 1938 "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n" 1939 " id INTEGER PRIMARY KEY,\n" 1940 " rank INT,\n" 1941 " langid INT,\n" 1942 " word TEXT,\n" 1943 " k1 TEXT,\n" 1944 " k2 TEXT\n" 1945 ");\n", 1946 zDbName, zTableName 1947 ); 1948 spellfix1DbExec(&rc, db, 1949 "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" " 1950 "ON \"%w_vocab\"(langid,k2);", 1951 zDbName, zTableName, zTableName 1952 ); 1953 } 1954 for(i=3; rc==SQLITE_OK && i<argc; i++){ 1955 if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){ 1956 pNew->zCostTable = spellfix1Dequote(&argv[i][16]); 1957 if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM; 1958 continue; 1959 } 1960 *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]); 1961 rc = SQLITE_ERROR; 1962 } 1963 } 1964 1965 if( rc && pNew ){ 1966 *ppVTab = 0; 1967 spellfix1Uninit(0, &pNew->base); 1968 }else{ 1969 *ppVTab = (sqlite3_vtab *)pNew; 1970 } 1971 return rc; 1972 } 1973 1974 /* 1975 ** The xConnect and xCreate methods 1976 */ 1977 static int spellfix1Connect( 1978 sqlite3 *db, 1979 void *pAux, 1980 int argc, const char *const*argv, 1981 sqlite3_vtab **ppVTab, 1982 char **pzErr 1983 ){ 1984 return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr); 1985 } 1986 static int spellfix1Create( 1987 sqlite3 *db, 1988 void *pAux, 1989 int argc, const char *const*argv, 1990 sqlite3_vtab **ppVTab, 1991 char **pzErr 1992 ){ 1993 return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr); 1994 } 1995 1996 /* 1997 ** Clear all of the content from a cursor. 1998 */ 1999 static void spellfix1ResetCursor(spellfix1_cursor *pCur){ 2000 int i; 2001 for(i=0; i<pCur->nRow; i++){ 2002 sqlite3_free(pCur->a[i].zWord); 2003 } 2004 pCur->nRow = 0; 2005 pCur->iRow = 0; 2006 pCur->nSearch = 0; 2007 if( pCur->pFullScan ){ 2008 sqlite3_finalize(pCur->pFullScan); 2009 pCur->pFullScan = 0; 2010 } 2011 } 2012 2013 /* 2014 ** Resize the cursor to hold up to N rows of content 2015 */ 2016 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){ 2017 struct spellfix1_row *aNew; 2018 assert( N>=pCur->nRow ); 2019 aNew = sqlite3_realloc(pCur->a, sizeof(pCur->a[0])*N); 2020 if( aNew==0 && N>0 ){ 2021 spellfix1ResetCursor(pCur); 2022 sqlite3_free(pCur->a); 2023 pCur->nAlloc = 0; 2024 pCur->a = 0; 2025 }else{ 2026 pCur->nAlloc = N; 2027 pCur->a = aNew; 2028 } 2029 } 2030 2031 2032 /* 2033 ** Close a fuzzy-search cursor. 2034 */ 2035 static int spellfix1Close(sqlite3_vtab_cursor *cur){ 2036 spellfix1_cursor *pCur = (spellfix1_cursor *)cur; 2037 spellfix1ResetCursor(pCur); 2038 spellfix1ResizeCursor(pCur, 0); 2039 sqlite3_free(pCur->zPattern); 2040 sqlite3_free(pCur); 2041 return SQLITE_OK; 2042 } 2043 2044 #define SPELLFIX_IDXNUM_MATCH 0x01 /* word MATCH $str */ 2045 #define SPELLFIX_IDXNUM_LANGID 0x02 /* langid == $langid */ 2046 #define SPELLFIX_IDXNUM_TOP 0x04 /* top = $top */ 2047 #define SPELLFIX_IDXNUM_SCOPE 0x08 /* scope = $scope */ 2048 #define SPELLFIX_IDXNUM_DISTLT 0x10 /* distance < $distance */ 2049 #define SPELLFIX_IDXNUM_DISTLE 0x20 /* distance <= $distance */ 2050 #define SPELLFIX_IDXNUM_ROWID 0x40 /* rowid = $rowid */ 2051 #define SPELLFIX_IDXNUM_DIST (0x10|0x20) /* DISTLT and DISTLE */ 2052 2053 /* 2054 ** 2055 ** The plan number is a bitmask of the SPELLFIX_IDXNUM_* values defined 2056 ** above. 2057 ** 2058 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid 2059 ** if specified and in that order. 2060 */ 2061 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){ 2062 int iPlan = 0; 2063 int iLangTerm = -1; 2064 int iTopTerm = -1; 2065 int iScopeTerm = -1; 2066 int iDistTerm = -1; 2067 int iRowidTerm = -1; 2068 int i; 2069 const struct sqlite3_index_constraint *pConstraint; 2070 pConstraint = pIdxInfo->aConstraint; 2071 for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){ 2072 if( pConstraint->usable==0 ) continue; 2073 2074 /* Terms of the form: word MATCH $str */ 2075 if( (iPlan & SPELLFIX_IDXNUM_MATCH)==0 2076 && pConstraint->iColumn==SPELLFIX_COL_WORD 2077 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH 2078 ){ 2079 iPlan |= SPELLFIX_IDXNUM_MATCH; 2080 pIdxInfo->aConstraintUsage[i].argvIndex = 1; 2081 pIdxInfo->aConstraintUsage[i].omit = 1; 2082 } 2083 2084 /* Terms of the form: langid = $langid */ 2085 if( (iPlan & SPELLFIX_IDXNUM_LANGID)==0 2086 && pConstraint->iColumn==SPELLFIX_COL_LANGID 2087 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ 2088 ){ 2089 iPlan |= SPELLFIX_IDXNUM_LANGID; 2090 iLangTerm = i; 2091 } 2092 2093 /* Terms of the form: top = $top */ 2094 if( (iPlan & SPELLFIX_IDXNUM_TOP)==0 2095 && pConstraint->iColumn==SPELLFIX_COL_TOP 2096 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ 2097 ){ 2098 iPlan |= SPELLFIX_IDXNUM_TOP; 2099 iTopTerm = i; 2100 } 2101 2102 /* Terms of the form: scope = $scope */ 2103 if( (iPlan & SPELLFIX_IDXNUM_SCOPE)==0 2104 && pConstraint->iColumn==SPELLFIX_COL_SCOPE 2105 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ 2106 ){ 2107 iPlan |= SPELLFIX_IDXNUM_SCOPE; 2108 iScopeTerm = i; 2109 } 2110 2111 /* Terms of the form: distance < $dist or distance <= $dist */ 2112 if( (iPlan & SPELLFIX_IDXNUM_DIST)==0 2113 && pConstraint->iColumn==SPELLFIX_COL_DISTANCE 2114 && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT 2115 || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE) 2116 ){ 2117 if( pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ){ 2118 iPlan |= SPELLFIX_IDXNUM_DISTLT; 2119 }else{ 2120 iPlan |= SPELLFIX_IDXNUM_DISTLE; 2121 } 2122 iDistTerm = i; 2123 } 2124 2125 /* Terms of the form: distance < $dist or distance <= $dist */ 2126 if( (iPlan & SPELLFIX_IDXNUM_ROWID)==0 2127 && pConstraint->iColumn<0 2128 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ 2129 ){ 2130 iPlan |= SPELLFIX_IDXNUM_ROWID; 2131 iRowidTerm = i; 2132 } 2133 } 2134 if( iPlan&SPELLFIX_IDXNUM_MATCH ){ 2135 int idx = 2; 2136 pIdxInfo->idxNum = iPlan; 2137 if( pIdxInfo->nOrderBy==1 2138 && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE 2139 && pIdxInfo->aOrderBy[0].desc==0 2140 ){ 2141 pIdxInfo->orderByConsumed = 1; /* Default order by iScore */ 2142 } 2143 if( iPlan&SPELLFIX_IDXNUM_LANGID ){ 2144 pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++; 2145 pIdxInfo->aConstraintUsage[iLangTerm].omit = 1; 2146 } 2147 if( iPlan&SPELLFIX_IDXNUM_TOP ){ 2148 pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++; 2149 pIdxInfo->aConstraintUsage[iTopTerm].omit = 1; 2150 } 2151 if( iPlan&SPELLFIX_IDXNUM_SCOPE ){ 2152 pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++; 2153 pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1; 2154 } 2155 if( iPlan&SPELLFIX_IDXNUM_DIST ){ 2156 pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++; 2157 pIdxInfo->aConstraintUsage[iDistTerm].omit = 1; 2158 } 2159 pIdxInfo->estimatedCost = 1e5; 2160 }else if( (iPlan & SPELLFIX_IDXNUM_ROWID) ){ 2161 pIdxInfo->idxNum = SPELLFIX_IDXNUM_ROWID; 2162 pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1; 2163 pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1; 2164 pIdxInfo->estimatedCost = 5; 2165 }else{ 2166 pIdxInfo->idxNum = 0; 2167 pIdxInfo->estimatedCost = 1e50; 2168 } 2169 return SQLITE_OK; 2170 } 2171 2172 /* 2173 ** Open a new fuzzy-search cursor. 2174 */ 2175 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ 2176 spellfix1_vtab *p = (spellfix1_vtab*)pVTab; 2177 spellfix1_cursor *pCur; 2178 pCur = sqlite3_malloc( sizeof(*pCur) ); 2179 if( pCur==0 ) return SQLITE_NOMEM; 2180 memset(pCur, 0, sizeof(*pCur)); 2181 pCur->pVTab = p; 2182 *ppCursor = &pCur->base; 2183 return SQLITE_OK; 2184 } 2185 2186 /* 2187 ** Adjust a distance measurement by the words rank in order to show 2188 ** preference to common words. 2189 */ 2190 static int spellfix1Score(int iDistance, int iRank){ 2191 int iLog2; 2192 for(iLog2=0; iRank>0; iLog2++, iRank>>=1){} 2193 return iDistance + 32 - iLog2; 2194 } 2195 2196 /* 2197 ** Compare two spellfix1_row objects for sorting purposes in qsort() such 2198 ** that they sort in order of increasing distance. 2199 */ 2200 static int spellfix1RowCompare(const void *A, const void *B){ 2201 const struct spellfix1_row *a = (const struct spellfix1_row*)A; 2202 const struct spellfix1_row *b = (const struct spellfix1_row*)B; 2203 return a->iScore - b->iScore; 2204 } 2205 2206 /* 2207 ** A structure used to pass information from spellfix1FilterForMatch() 2208 ** into spellfix1RunQuery(). 2209 */ 2210 typedef struct MatchQuery { 2211 spellfix1_cursor *pCur; /* The cursor being queried */ 2212 sqlite3_stmt *pStmt; /* shadow table query statment */ 2213 char zHash[SPELLFIX_MX_HASH]; /* The current phonehash for zPattern */ 2214 const char *zPattern; /* Transliterated input string */ 2215 int nPattern; /* Length of zPattern */ 2216 EditDist3FromString *pMatchStr3; /* Original unicode string */ 2217 EditDist3Config *pConfig3; /* Edit-distance cost coefficients */ 2218 const EditDist3Lang *pLang; /* The selected language coefficients */ 2219 int iLang; /* The language id */ 2220 int iScope; /* Default scope */ 2221 int iMaxDist; /* Maximum allowed edit distance, or -1 */ 2222 int rc; /* Error code */ 2223 int nRun; /* Number of prior runs for the same zPattern */ 2224 char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH]; /* Prior hashes */ 2225 } MatchQuery; 2226 2227 /* 2228 ** Run a query looking for the best matches against zPattern using 2229 ** zHash as the character class seed hash. 2230 */ 2231 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){ 2232 const char *zK1; 2233 const char *zWord; 2234 int iDist; 2235 int iRank; 2236 int iScore; 2237 int iWorst = 0; 2238 int idx; 2239 int idxWorst = -1; 2240 int i; 2241 int iScope = p->iScope; 2242 spellfix1_cursor *pCur = p->pCur; 2243 sqlite3_stmt *pStmt = p->pStmt; 2244 char zHash1[SPELLFIX_MX_HASH]; 2245 char zHash2[SPELLFIX_MX_HASH]; 2246 char *zClass; 2247 int nClass; 2248 int rc; 2249 2250 if( pCur->a==0 || p->rc ) return; /* Prior memory allocation failure */ 2251 zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery); 2252 if( zClass==0 ){ 2253 p->rc = SQLITE_NOMEM; 2254 return; 2255 } 2256 nClass = (int)strlen(zClass); 2257 if( nClass>SPELLFIX_MX_HASH-2 ){ 2258 nClass = SPELLFIX_MX_HASH-2; 2259 zClass[nClass] = 0; 2260 } 2261 if( nClass<=iScope ){ 2262 if( nClass>2 ){ 2263 iScope = nClass-1; 2264 }else{ 2265 iScope = nClass; 2266 } 2267 } 2268 memcpy(zHash1, zClass, iScope); 2269 sqlite3_free(zClass); 2270 zHash1[iScope] = 0; 2271 memcpy(zHash2, zHash1, iScope); 2272 zHash2[iScope] = 'Z'; 2273 zHash2[iScope+1] = 0; 2274 #if SPELLFIX_MX_RUN>1 2275 for(i=0; i<p->nRun; i++){ 2276 if( strcmp(p->azPrior[i], zHash1)==0 ) return; 2277 } 2278 #endif 2279 assert( p->nRun<SPELLFIX_MX_RUN ); 2280 memcpy(p->azPrior[p->nRun++], zHash1, iScope+1); 2281 if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM 2282 || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM 2283 ){ 2284 p->rc = SQLITE_NOMEM; 2285 return; 2286 } 2287 #if SPELLFIX_MX_RUN>1 2288 for(i=0; i<pCur->nRow; i++){ 2289 if( pCur->a[i].iScore>iWorst ){ 2290 iWorst = pCur->a[i].iScore; 2291 idxWorst = i; 2292 } 2293 } 2294 #endif 2295 while( sqlite3_step(pStmt)==SQLITE_ROW ){ 2296 int iMatchlen = -1; 2297 iRank = sqlite3_column_int(pStmt, 2); 2298 if( p->pMatchStr3 ){ 2299 int nWord = sqlite3_column_bytes(pStmt, 1); 2300 zWord = (const char*)sqlite3_column_text(pStmt, 1); 2301 iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen); 2302 }else{ 2303 zK1 = (const char*)sqlite3_column_text(pStmt, 3); 2304 if( zK1==0 ) continue; 2305 iDist = editdist1(p->zPattern, zK1, 0); 2306 } 2307 if( iDist<0 ){ 2308 p->rc = SQLITE_NOMEM; 2309 break; 2310 } 2311 pCur->nSearch++; 2312 2313 /* If there is a "distance < $dist" or "distance <= $dist" constraint, 2314 ** check if this row meets it. If not, jump back up to the top of the 2315 ** loop to process the next row. Otherwise, if the row does match the 2316 ** distance constraint, check if the pCur->a[] array is already full. 2317 ** If it is and no explicit "top = ?" constraint was present in the 2318 ** query, grow the array to ensure there is room for the new entry. */ 2319 assert( (p->iMaxDist>=0)==((pCur->idxNum & SPELLFIX_IDXNUM_DIST) ? 1 : 0) ); 2320 if( p->iMaxDist>=0 ){ 2321 if( iDist>p->iMaxDist ) continue; 2322 if( pCur->nRow>=pCur->nAlloc && (pCur->idxNum & SPELLFIX_IDXNUM_TOP)==0 ){ 2323 spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10); 2324 if( pCur->a==0 ) break; 2325 } 2326 } 2327 2328 iScore = spellfix1Score(iDist,iRank); 2329 if( pCur->nRow<pCur->nAlloc ){ 2330 idx = pCur->nRow; 2331 }else if( iScore<iWorst ){ 2332 idx = idxWorst; 2333 sqlite3_free(pCur->a[idx].zWord); 2334 }else{ 2335 continue; 2336 } 2337 2338 pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1)); 2339 if( pCur->a[idx].zWord==0 ){ 2340 p->rc = SQLITE_NOMEM; 2341 break; 2342 } 2343 pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0); 2344 pCur->a[idx].iRank = iRank; 2345 pCur->a[idx].iDistance = iDist; 2346 pCur->a[idx].iScore = iScore; 2347 pCur->a[idx].iMatchlen = iMatchlen; 2348 memcpy(pCur->a[idx].zHash, zHash1, iScope+1); 2349 if( pCur->nRow<pCur->nAlloc ) pCur->nRow++; 2350 if( pCur->nRow==pCur->nAlloc ){ 2351 iWorst = pCur->a[0].iScore; 2352 idxWorst = 0; 2353 for(i=1; i<pCur->nRow; i++){ 2354 iScore = pCur->a[i].iScore; 2355 if( iWorst<iScore ){ 2356 iWorst = iScore; 2357 idxWorst = i; 2358 } 2359 } 2360 } 2361 } 2362 rc = sqlite3_reset(pStmt); 2363 if( rc ) p->rc = rc; 2364 } 2365 2366 /* 2367 ** This version of the xFilter method work if the MATCH term is present 2368 ** and we are doing a scan. 2369 */ 2370 static int spellfix1FilterForMatch( 2371 spellfix1_cursor *pCur, 2372 int argc, 2373 sqlite3_value **argv 2374 ){ 2375 int idxNum = pCur->idxNum; 2376 const unsigned char *zMatchThis; /* RHS of the MATCH operator */ 2377 EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */ 2378 char *zPattern; /* Transliteration of zMatchThis */ 2379 int nPattern; /* Length of zPattern */ 2380 int iLimit = 20; /* Max number of rows of output */ 2381 int iScope = 3; /* Use this many characters of zClass */ 2382 int iLang = 0; /* Language code */ 2383 char *zSql; /* SQL of shadow table query */ 2384 sqlite3_stmt *pStmt = 0; /* Shadow table query */ 2385 int rc; /* Result code */ 2386 int idx = 1; /* Next available filter parameter */ 2387 spellfix1_vtab *p = pCur->pVTab; /* The virtual table that owns pCur */ 2388 MatchQuery x; /* For passing info to RunQuery() */ 2389 2390 /* Load the cost table if we have not already done so */ 2391 if( p->zCostTable!=0 && p->pConfig3==0 ){ 2392 p->pConfig3 = sqlite3_malloc( sizeof(p->pConfig3[0]) ); 2393 if( p->pConfig3==0 ) return SQLITE_NOMEM; 2394 memset(p->pConfig3, 0, sizeof(p->pConfig3[0])); 2395 rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable); 2396 if( rc ) return rc; 2397 } 2398 memset(&x, 0, sizeof(x)); 2399 x.iScope = 3; /* Default scope if none specified by "WHERE scope=N" */ 2400 x.iMaxDist = -1; /* Maximum allowed edit distance */ 2401 2402 if( idxNum&2 ){ 2403 iLang = sqlite3_value_int(argv[idx++]); 2404 } 2405 if( idxNum&4 ){ 2406 iLimit = sqlite3_value_int(argv[idx++]); 2407 if( iLimit<1 ) iLimit = 1; 2408 } 2409 if( idxNum&8 ){ 2410 x.iScope = sqlite3_value_int(argv[idx++]); 2411 if( x.iScope<1 ) x.iScope = 1; 2412 if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2; 2413 } 2414 if( idxNum&(16|32) ){ 2415 x.iMaxDist = sqlite3_value_int(argv[idx++]); 2416 if( idxNum&16 ) x.iMaxDist--; 2417 if( x.iMaxDist<0 ) x.iMaxDist = 0; 2418 } 2419 spellfix1ResetCursor(pCur); 2420 spellfix1ResizeCursor(pCur, iLimit); 2421 zMatchThis = sqlite3_value_text(argv[0]); 2422 if( zMatchThis==0 ) return SQLITE_OK; 2423 if( p->pConfig3 ){ 2424 x.pLang = editDist3FindLang(p->pConfig3, iLang); 2425 pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1); 2426 if( pMatchStr3==0 ){ 2427 x.rc = SQLITE_NOMEM; 2428 goto filter_exit; 2429 } 2430 }else{ 2431 x.pLang = 0; 2432 } 2433 zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0])); 2434 sqlite3_free(pCur->zPattern); 2435 pCur->zPattern = zPattern; 2436 if( zPattern==0 ){ 2437 x.rc = SQLITE_NOMEM; 2438 goto filter_exit; 2439 } 2440 nPattern = (int)strlen(zPattern); 2441 if( zPattern[nPattern-1]=='*' ) nPattern--; 2442 zSql = sqlite3_mprintf( 2443 "SELECT id, word, rank, k1" 2444 " FROM \"%w\".\"%w_vocab\"" 2445 " WHERE langid=%d AND k2>=?1 AND k2<?2", 2446 p->zDbName, p->zTableName, iLang 2447 ); 2448 if( zSql==0 ){ 2449 x.rc = SQLITE_NOMEM; 2450 pStmt = 0; 2451 goto filter_exit; 2452 } 2453 rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0); 2454 sqlite3_free(zSql); 2455 pCur->iLang = iLang; 2456 x.pCur = pCur; 2457 x.pStmt = pStmt; 2458 x.zPattern = zPattern; 2459 x.nPattern = nPattern; 2460 x.pMatchStr3 = pMatchStr3; 2461 x.iLang = iLang; 2462 x.rc = rc; 2463 x.pConfig3 = p->pConfig3; 2464 if( x.rc==SQLITE_OK ){ 2465 spellfix1RunQuery(&x, zPattern, nPattern); 2466 } 2467 2468 if( pCur->a ){ 2469 qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare); 2470 pCur->iTop = iLimit; 2471 pCur->iScope = iScope; 2472 }else{ 2473 x.rc = SQLITE_NOMEM; 2474 } 2475 2476 filter_exit: 2477 sqlite3_finalize(pStmt); 2478 editDist3FromStringDelete(pMatchStr3); 2479 return x.rc; 2480 } 2481 2482 /* 2483 ** This version of xFilter handles a full-table scan case 2484 */ 2485 static int spellfix1FilterForFullScan( 2486 spellfix1_cursor *pCur, 2487 int argc, 2488 sqlite3_value **argv 2489 ){ 2490 int rc = SQLITE_OK; 2491 int idxNum = pCur->idxNum; 2492 char *zSql; 2493 spellfix1_vtab *pVTab = pCur->pVTab; 2494 spellfix1ResetCursor(pCur); 2495 assert( idxNum==0 || idxNum==64 ); 2496 zSql = sqlite3_mprintf( 2497 "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s", 2498 pVTab->zDbName, pVTab->zTableName, 2499 ((idxNum & 64) ? " WHERE rowid=?" : "") 2500 ); 2501 if( zSql==0 ) return SQLITE_NOMEM; 2502 rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0); 2503 sqlite3_free(zSql); 2504 if( rc==SQLITE_OK && (idxNum & 64) ){ 2505 assert( argc==1 ); 2506 rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]); 2507 } 2508 pCur->nRow = pCur->iRow = 0; 2509 if( rc==SQLITE_OK ){ 2510 rc = sqlite3_step(pCur->pFullScan); 2511 if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; } 2512 if( rc==SQLITE_DONE ){ rc = SQLITE_OK; } 2513 }else{ 2514 pCur->iRow = 0; 2515 } 2516 return rc; 2517 } 2518 2519 2520 /* 2521 ** Called to "rewind" a cursor back to the beginning so that 2522 ** it starts its output over again. Always called at least once 2523 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call. 2524 */ 2525 static int spellfix1Filter( 2526 sqlite3_vtab_cursor *cur, 2527 int idxNum, const char *idxStr, 2528 int argc, sqlite3_value **argv 2529 ){ 2530 spellfix1_cursor *pCur = (spellfix1_cursor *)cur; 2531 int rc; 2532 pCur->idxNum = idxNum; 2533 if( idxNum & 1 ){ 2534 rc = spellfix1FilterForMatch(pCur, argc, argv); 2535 }else{ 2536 rc = spellfix1FilterForFullScan(pCur, argc, argv); 2537 } 2538 return rc; 2539 } 2540 2541 2542 /* 2543 ** Advance a cursor to its next row of output 2544 */ 2545 static int spellfix1Next(sqlite3_vtab_cursor *cur){ 2546 spellfix1_cursor *pCur = (spellfix1_cursor *)cur; 2547 int rc = SQLITE_OK; 2548 if( pCur->iRow < pCur->nRow ){ 2549 if( pCur->pFullScan ){ 2550 rc = sqlite3_step(pCur->pFullScan); 2551 if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow; 2552 if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK; 2553 }else{ 2554 pCur->iRow++; 2555 } 2556 } 2557 return rc; 2558 } 2559 2560 /* 2561 ** Return TRUE if we are at the end-of-file 2562 */ 2563 static int spellfix1Eof(sqlite3_vtab_cursor *cur){ 2564 spellfix1_cursor *pCur = (spellfix1_cursor *)cur; 2565 return pCur->iRow>=pCur->nRow; 2566 } 2567 2568 /* 2569 ** Return columns from the current row. 2570 */ 2571 static int spellfix1Column( 2572 sqlite3_vtab_cursor *cur, 2573 sqlite3_context *ctx, 2574 int i 2575 ){ 2576 spellfix1_cursor *pCur = (spellfix1_cursor*)cur; 2577 if( pCur->pFullScan ){ 2578 if( i<=SPELLFIX_COL_LANGID ){ 2579 sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i)); 2580 }else{ 2581 sqlite3_result_null(ctx); 2582 } 2583 return SQLITE_OK; 2584 } 2585 switch( i ){ 2586 case SPELLFIX_COL_WORD: { 2587 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC); 2588 break; 2589 } 2590 case SPELLFIX_COL_RANK: { 2591 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank); 2592 break; 2593 } 2594 case SPELLFIX_COL_DISTANCE: { 2595 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance); 2596 break; 2597 } 2598 case SPELLFIX_COL_LANGID: { 2599 sqlite3_result_int(ctx, pCur->iLang); 2600 break; 2601 } 2602 case SPELLFIX_COL_SCORE: { 2603 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore); 2604 break; 2605 } 2606 case SPELLFIX_COL_MATCHLEN: { 2607 int iMatchlen = pCur->a[pCur->iRow].iMatchlen; 2608 if( iMatchlen<0 ){ 2609 int nPattern = (int)strlen(pCur->zPattern); 2610 char *zWord = pCur->a[pCur->iRow].zWord; 2611 int nWord = (int)strlen(zWord); 2612 2613 if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){ 2614 char *zTranslit; 2615 int res; 2616 zTranslit = (char *)transliterate((unsigned char *)zWord, nWord); 2617 if( !zTranslit ) return SQLITE_NOMEM; 2618 res = editdist1(pCur->zPattern, zTranslit, &iMatchlen); 2619 sqlite3_free(zTranslit); 2620 if( res<0 ) return SQLITE_NOMEM; 2621 iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen); 2622 }else{ 2623 iMatchlen = utf8Charlen(zWord, nWord); 2624 } 2625 } 2626 2627 sqlite3_result_int(ctx, iMatchlen); 2628 break; 2629 } 2630 case SPELLFIX_COL_PHONEHASH: { 2631 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC); 2632 break; 2633 } 2634 case SPELLFIX_COL_TOP: { 2635 sqlite3_result_int(ctx, pCur->iTop); 2636 break; 2637 } 2638 case SPELLFIX_COL_SCOPE: { 2639 sqlite3_result_int(ctx, pCur->iScope); 2640 break; 2641 } 2642 case SPELLFIX_COL_SRCHCNT: { 2643 sqlite3_result_int(ctx, pCur->nSearch); 2644 break; 2645 } 2646 default: { 2647 sqlite3_result_null(ctx); 2648 break; 2649 } 2650 } 2651 return SQLITE_OK; 2652 } 2653 2654 /* 2655 ** The rowid. 2656 */ 2657 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){ 2658 spellfix1_cursor *pCur = (spellfix1_cursor*)cur; 2659 if( pCur->pFullScan ){ 2660 *pRowid = sqlite3_column_int64(pCur->pFullScan, 4); 2661 }else{ 2662 *pRowid = pCur->a[pCur->iRow].iRowid; 2663 } 2664 return SQLITE_OK; 2665 } 2666 2667 /* 2668 ** This function is called by the xUpdate() method. It returns a string 2669 ** containing the conflict mode that xUpdate() should use for the current 2670 ** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE". 2671 */ 2672 static const char *spellfix1GetConflict(sqlite3 *db){ 2673 static const char *azConflict[] = { 2674 /* Note: Instead of "FAIL" - "ABORT". */ 2675 "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE" 2676 }; 2677 int eConflict = sqlite3_vtab_on_conflict(db); 2678 2679 assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE 2680 || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT 2681 || eConflict==SQLITE_REPLACE 2682 ); 2683 assert( SQLITE_ROLLBACK==1 ); 2684 assert( SQLITE_IGNORE==2 ); 2685 assert( SQLITE_FAIL==3 ); 2686 assert( SQLITE_ABORT==4 ); 2687 assert( SQLITE_REPLACE==5 ); 2688 2689 return azConflict[eConflict-1]; 2690 } 2691 2692 /* 2693 ** The xUpdate() method. 2694 */ 2695 static int spellfix1Update( 2696 sqlite3_vtab *pVTab, 2697 int argc, 2698 sqlite3_value **argv, 2699 sqlite_int64 *pRowid 2700 ){ 2701 int rc = SQLITE_OK; 2702 sqlite3_int64 rowid, newRowid; 2703 spellfix1_vtab *p = (spellfix1_vtab*)pVTab; 2704 sqlite3 *db = p->db; 2705 2706 if( argc==1 ){ 2707 /* A delete operation on the rowid given by argv[0] */ 2708 rowid = *pRowid = sqlite3_value_int64(argv[0]); 2709 spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" " 2710 " WHERE id=%lld", 2711 p->zDbName, p->zTableName, rowid); 2712 }else{ 2713 const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]); 2714 int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]); 2715 int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]); 2716 int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]); 2717 const unsigned char *zSoundslike = 2718 sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]); 2719 int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]); 2720 char *zK1, *zK2; 2721 int i; 2722 char c; 2723 const char *zConflict = spellfix1GetConflict(db); 2724 2725 if( zWord==0 ){ 2726 /* Inserts of the form: INSERT INTO table(command) VALUES('xyzzy'); 2727 ** cause zWord to be NULL, so we look at the "command" column to see 2728 ** what special actions to take */ 2729 const char *zCmd = 2730 (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]); 2731 if( zCmd==0 ){ 2732 pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word", 2733 p->zTableName); 2734 return SQLITE_CONSTRAINT_NOTNULL; 2735 } 2736 if( strcmp(zCmd,"reset")==0 ){ 2737 /* Reset the edit cost table (if there is one). */ 2738 editDist3ConfigDelete(p->pConfig3); 2739 p->pConfig3 = 0; 2740 return SQLITE_OK; 2741 } 2742 if( strncmp(zCmd,"edit_cost_table=",16)==0 ){ 2743 editDist3ConfigDelete(p->pConfig3); 2744 p->pConfig3 = 0; 2745 sqlite3_free(p->zCostTable); 2746 p->zCostTable = spellfix1Dequote(zCmd+16); 2747 if( p->zCostTable==0 ) return SQLITE_NOMEM; 2748 if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){ 2749 sqlite3_free(p->zCostTable); 2750 p->zCostTable = 0; 2751 } 2752 return SQLITE_OK; 2753 } 2754 pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"", 2755 p->zTableName, zCmd); 2756 return SQLITE_ERROR; 2757 } 2758 if( iRank<1 ) iRank = 1; 2759 if( zSoundslike ){ 2760 zK1 = (char*)transliterate(zSoundslike, nSoundslike); 2761 }else{ 2762 zK1 = (char*)transliterate(zWord, nWord); 2763 } 2764 if( zK1==0 ) return SQLITE_NOMEM; 2765 for(i=0; (c = zK1[i])!=0; i++){ 2766 if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A'; 2767 } 2768 zK2 = (char*)phoneticHash((const unsigned char*)zK1, i); 2769 if( zK2==0 ){ 2770 sqlite3_free(zK1); 2771 return SQLITE_NOMEM; 2772 } 2773 if( sqlite3_value_type(argv[0])==SQLITE_NULL ){ 2774 if( sqlite3_value_type(argv[1])==SQLITE_NULL ){ 2775 spellfix1DbExec(&rc, db, 2776 "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) " 2777 "VALUES(%d,%d,%Q,%Q,%Q)", 2778 p->zDbName, p->zTableName, 2779 iRank, iLang, zWord, zK1, zK2 2780 ); 2781 }else{ 2782 newRowid = sqlite3_value_int64(argv[1]); 2783 spellfix1DbExec(&rc, db, 2784 "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) " 2785 "VALUES(%lld,%d,%d,%Q,%Q,%Q)", 2786 zConflict, p->zDbName, p->zTableName, 2787 newRowid, iRank, iLang, zWord, zK1, zK2 2788 ); 2789 } 2790 *pRowid = sqlite3_last_insert_rowid(db); 2791 }else{ 2792 rowid = sqlite3_value_int64(argv[0]); 2793 newRowid = *pRowid = sqlite3_value_int64(argv[1]); 2794 spellfix1DbExec(&rc, db, 2795 "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d," 2796 " word=%Q, k1=%Q, k2=%Q WHERE id=%lld", 2797 zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang, 2798 zWord, zK1, zK2, rowid 2799 ); 2800 } 2801 sqlite3_free(zK1); 2802 sqlite3_free(zK2); 2803 } 2804 return rc; 2805 } 2806 2807 /* 2808 ** Rename the spellfix1 table. 2809 */ 2810 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){ 2811 spellfix1_vtab *p = (spellfix1_vtab*)pVTab; 2812 sqlite3 *db = p->db; 2813 int rc = SQLITE_OK; 2814 char *zNewName = sqlite3_mprintf("%s", zNew); 2815 if( zNewName==0 ){ 2816 return SQLITE_NOMEM; 2817 } 2818 spellfix1DbExec(&rc, db, 2819 "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"", 2820 p->zDbName, p->zTableName, zNewName 2821 ); 2822 if( rc==SQLITE_OK ){ 2823 sqlite3_free(p->zTableName); 2824 p->zTableName = zNewName; 2825 }else{ 2826 sqlite3_free(zNewName); 2827 } 2828 return rc; 2829 } 2830 2831 2832 /* 2833 ** A virtual table module that provides fuzzy search. 2834 */ 2835 static sqlite3_module spellfix1Module = { 2836 0, /* iVersion */ 2837 spellfix1Create, /* xCreate - handle CREATE VIRTUAL TABLE */ 2838 spellfix1Connect, /* xConnect - reconnected to an existing table */ 2839 spellfix1BestIndex, /* xBestIndex - figure out how to do a query */ 2840 spellfix1Disconnect, /* xDisconnect - close a connection */ 2841 spellfix1Destroy, /* xDestroy - handle DROP TABLE */ 2842 spellfix1Open, /* xOpen - open a cursor */ 2843 spellfix1Close, /* xClose - close a cursor */ 2844 spellfix1Filter, /* xFilter - configure scan constraints */ 2845 spellfix1Next, /* xNext - advance a cursor */ 2846 spellfix1Eof, /* xEof - check for end of scan */ 2847 spellfix1Column, /* xColumn - read data */ 2848 spellfix1Rowid, /* xRowid - read data */ 2849 spellfix1Update, /* xUpdate */ 2850 0, /* xBegin */ 2851 0, /* xSync */ 2852 0, /* xCommit */ 2853 0, /* xRollback */ 2854 0, /* xFindMethod */ 2855 spellfix1Rename, /* xRename */ 2856 }; 2857 2858 /* 2859 ** Register the various functions and the virtual table. 2860 */ 2861 static int spellfix1Register(sqlite3 *db){ 2862 int rc = SQLITE_OK; 2863 int i; 2864 rc = sqlite3_create_function(db, "spellfix1_translit", 1, SQLITE_UTF8, 0, 2865 transliterateSqlFunc, 0, 0); 2866 if( rc==SQLITE_OK ){ 2867 rc = sqlite3_create_function(db, "spellfix1_editdist", 2, SQLITE_UTF8, 0, 2868 editdistSqlFunc, 0, 0); 2869 } 2870 if( rc==SQLITE_OK ){ 2871 rc = sqlite3_create_function(db, "spellfix1_phonehash", 1, SQLITE_UTF8, 0, 2872 phoneticHashSqlFunc, 0, 0); 2873 } 2874 if( rc==SQLITE_OK ){ 2875 rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1, SQLITE_UTF8, 0, 2876 scriptCodeSqlFunc, 0, 0); 2877 } 2878 if( rc==SQLITE_OK ){ 2879 rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0); 2880 } 2881 if( rc==SQLITE_OK ){ 2882 rc = editDist3Install(db); 2883 } 2884 2885 /* Verify sanity of the translit[] table */ 2886 for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){ 2887 assert( translit[i].cFrom<translit[i+1].cFrom ); 2888 } 2889 2890 return rc; 2891 } 2892 2893 #endif /* SQLITE_OMIT_VIRTUALTABLE */ 2894 2895 /* 2896 ** Extension load function. 2897 */ 2898 #ifdef _WIN32 2899 __declspec(dllexport) 2900 #endif 2901 int sqlite3_spellfix_init( 2902 sqlite3 *db, 2903 char **pzErrMsg, 2904 const sqlite3_api_routines *pApi 2905 ){ 2906 SQLITE_EXTENSION_INIT2(pApi); 2907 #ifndef SQLITE_OMIT_VIRTUALTABLE 2908 return spellfix1Register(db); 2909 #endif 2910 return SQLITE_OK; 2911 } 2912