1 /* 2 ** 2007 May 6 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $ 13 ** 14 ** This file implements an integration between the ICU library 15 ** ("International Components for Unicode", an open-source library 16 ** for handling unicode data) and SQLite. The integration uses 17 ** ICU to provide the following to SQLite: 18 ** 19 ** * An implementation of the SQL regexp() function (and hence REGEXP 20 ** operator) using the ICU uregex_XX() APIs. 21 ** 22 ** * Implementations of the SQL scalar upper() and lower() functions 23 ** for case mapping. 24 ** 25 ** * Integration of ICU and SQLite collation sequences. 26 ** 27 ** * An implementation of the LIKE operator that uses ICU to 28 ** provide case-independent matching. 29 */ 30 31 #if !defined(SQLITE_CORE) \ 32 || defined(SQLITE_ENABLE_ICU) \ 33 || defined(SQLITE_ENABLE_ICU_COLLATIONS) 34 35 /* Include ICU headers */ 36 #include <unicode/utypes.h> 37 #include <unicode/uregex.h> 38 #include <unicode/ustring.h> 39 #include <unicode/ucol.h> 40 41 #include <assert.h> 42 43 #ifndef SQLITE_CORE 44 #include "sqlite3ext.h" 45 SQLITE_EXTENSION_INIT1 46 #else 47 #include "sqlite3.h" 48 #endif 49 50 /* 51 ** This function is called when an ICU function called from within 52 ** the implementation of an SQL scalar function returns an error. 53 ** 54 ** The scalar function context passed as the first argument is 55 ** loaded with an error message based on the following two args. 56 */ 57 static void icuFunctionError( 58 sqlite3_context *pCtx, /* SQLite scalar function context */ 59 const char *zName, /* Name of ICU function that failed */ 60 UErrorCode e /* Error code returned by ICU function */ 61 ){ 62 char zBuf[128]; 63 sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e)); 64 zBuf[127] = '\0'; 65 sqlite3_result_error(pCtx, zBuf, -1); 66 } 67 68 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) 69 70 /* 71 ** Maximum length (in bytes) of the pattern in a LIKE or GLOB 72 ** operator. 73 */ 74 #ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH 75 # define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000 76 #endif 77 78 /* 79 ** Version of sqlite3_free() that is always a function, never a macro. 80 */ 81 static void xFree(void *p){ 82 sqlite3_free(p); 83 } 84 85 /* 86 ** This lookup table is used to help decode the first byte of 87 ** a multi-byte UTF8 character. It is copied here from SQLite source 88 ** code file utf8.c. 89 */ 90 static const unsigned char icuUtf8Trans1[] = { 91 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 92 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 93 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 94 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 95 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 96 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 97 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 98 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, 99 }; 100 101 #define SQLITE_ICU_READ_UTF8(zIn, c) \ 102 c = *(zIn++); \ 103 if( c>=0xc0 ){ \ 104 c = icuUtf8Trans1[c-0xc0]; \ 105 while( (*zIn & 0xc0)==0x80 ){ \ 106 c = (c<<6) + (0x3f & *(zIn++)); \ 107 } \ 108 } 109 110 #define SQLITE_ICU_SKIP_UTF8(zIn) \ 111 assert( *zIn ); \ 112 if( *(zIn++)>=0xc0 ){ \ 113 while( (*zIn & 0xc0)==0x80 ){zIn++;} \ 114 } 115 116 117 /* 118 ** Compare two UTF-8 strings for equality where the first string is 119 ** a "LIKE" expression. Return true (1) if they are the same and 120 ** false (0) if they are different. 121 */ 122 static int icuLikeCompare( 123 const uint8_t *zPattern, /* LIKE pattern */ 124 const uint8_t *zString, /* The UTF-8 string to compare against */ 125 const UChar32 uEsc /* The escape character */ 126 ){ 127 static const uint32_t MATCH_ONE = (uint32_t)'_'; 128 static const uint32_t MATCH_ALL = (uint32_t)'%'; 129 130 int prevEscape = 0; /* True if the previous character was uEsc */ 131 132 while( 1 ){ 133 134 /* Read (and consume) the next character from the input pattern. */ 135 uint32_t uPattern; 136 SQLITE_ICU_READ_UTF8(zPattern, uPattern); 137 if( uPattern==0 ) break; 138 139 /* There are now 4 possibilities: 140 ** 141 ** 1. uPattern is an unescaped match-all character "%", 142 ** 2. uPattern is an unescaped match-one character "_", 143 ** 3. uPattern is an unescaped escape character, or 144 ** 4. uPattern is to be handled as an ordinary character 145 */ 146 if( uPattern==MATCH_ALL && !prevEscape && uPattern!=(uint32_t)uEsc ){ 147 /* Case 1. */ 148 uint8_t c; 149 150 /* Skip any MATCH_ALL or MATCH_ONE characters that follow a 151 ** MATCH_ALL. For each MATCH_ONE, skip one character in the 152 ** test string. 153 */ 154 while( (c=*zPattern) == MATCH_ALL || c == MATCH_ONE ){ 155 if( c==MATCH_ONE ){ 156 if( *zString==0 ) return 0; 157 SQLITE_ICU_SKIP_UTF8(zString); 158 } 159 zPattern++; 160 } 161 162 if( *zPattern==0 ) return 1; 163 164 while( *zString ){ 165 if( icuLikeCompare(zPattern, zString, uEsc) ){ 166 return 1; 167 } 168 SQLITE_ICU_SKIP_UTF8(zString); 169 } 170 return 0; 171 172 }else if( uPattern==MATCH_ONE && !prevEscape && uPattern!=(uint32_t)uEsc ){ 173 /* Case 2. */ 174 if( *zString==0 ) return 0; 175 SQLITE_ICU_SKIP_UTF8(zString); 176 177 }else if( uPattern==(uint32_t)uEsc && !prevEscape ){ 178 /* Case 3. */ 179 prevEscape = 1; 180 181 }else{ 182 /* Case 4. */ 183 uint32_t uString; 184 SQLITE_ICU_READ_UTF8(zString, uString); 185 uString = (uint32_t)u_foldCase((UChar32)uString, U_FOLD_CASE_DEFAULT); 186 uPattern = (uint32_t)u_foldCase((UChar32)uPattern, U_FOLD_CASE_DEFAULT); 187 if( uString!=uPattern ){ 188 return 0; 189 } 190 prevEscape = 0; 191 } 192 } 193 194 return *zString==0; 195 } 196 197 /* 198 ** Implementation of the like() SQL function. This function implements 199 ** the build-in LIKE operator. The first argument to the function is the 200 ** pattern and the second argument is the string. So, the SQL statements: 201 ** 202 ** A LIKE B 203 ** 204 ** is implemented as like(B, A). If there is an escape character E, 205 ** 206 ** A LIKE B ESCAPE E 207 ** 208 ** is mapped to like(B, A, E). 209 */ 210 static void icuLikeFunc( 211 sqlite3_context *context, 212 int argc, 213 sqlite3_value **argv 214 ){ 215 const unsigned char *zA = sqlite3_value_text(argv[0]); 216 const unsigned char *zB = sqlite3_value_text(argv[1]); 217 UChar32 uEsc = 0; 218 219 /* Limit the length of the LIKE or GLOB pattern to avoid problems 220 ** of deep recursion and N*N behavior in patternCompare(). 221 */ 222 if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){ 223 sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1); 224 return; 225 } 226 227 228 if( argc==3 ){ 229 /* The escape character string must consist of a single UTF-8 character. 230 ** Otherwise, return an error. 231 */ 232 int nE= sqlite3_value_bytes(argv[2]); 233 const unsigned char *zE = sqlite3_value_text(argv[2]); 234 int i = 0; 235 if( zE==0 ) return; 236 U8_NEXT(zE, i, nE, uEsc); 237 if( i!=nE){ 238 sqlite3_result_error(context, 239 "ESCAPE expression must be a single character", -1); 240 return; 241 } 242 } 243 244 if( zA && zB ){ 245 sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc)); 246 } 247 } 248 249 /* 250 ** Function to delete compiled regexp objects. Registered as 251 ** a destructor function with sqlite3_set_auxdata(). 252 */ 253 static void icuRegexpDelete(void *p){ 254 URegularExpression *pExpr = (URegularExpression *)p; 255 uregex_close(pExpr); 256 } 257 258 /* 259 ** Implementation of SQLite REGEXP operator. This scalar function takes 260 ** two arguments. The first is a regular expression pattern to compile 261 ** the second is a string to match against that pattern. If either 262 ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result 263 ** is 1 if the string matches the pattern, or 0 otherwise. 264 ** 265 ** SQLite maps the regexp() function to the regexp() operator such 266 ** that the following two are equivalent: 267 ** 268 ** zString REGEXP zPattern 269 ** regexp(zPattern, zString) 270 ** 271 ** Uses the following ICU regexp APIs: 272 ** 273 ** uregex_open() 274 ** uregex_matches() 275 ** uregex_close() 276 */ 277 static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){ 278 UErrorCode status = U_ZERO_ERROR; 279 URegularExpression *pExpr; 280 UBool res; 281 const UChar *zString = sqlite3_value_text16(apArg[1]); 282 283 (void)nArg; /* Unused parameter */ 284 285 /* If the left hand side of the regexp operator is NULL, 286 ** then the result is also NULL. 287 */ 288 if( !zString ){ 289 return; 290 } 291 292 pExpr = sqlite3_get_auxdata(p, 0); 293 if( !pExpr ){ 294 const UChar *zPattern = sqlite3_value_text16(apArg[0]); 295 if( !zPattern ){ 296 return; 297 } 298 pExpr = uregex_open(zPattern, -1, 0, 0, &status); 299 300 if( U_SUCCESS(status) ){ 301 sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete); 302 pExpr = sqlite3_get_auxdata(p, 0); 303 } 304 if( !pExpr ){ 305 icuFunctionError(p, "uregex_open", status); 306 return; 307 } 308 } 309 310 /* Configure the text that the regular expression operates on. */ 311 uregex_setText(pExpr, zString, -1, &status); 312 if( !U_SUCCESS(status) ){ 313 icuFunctionError(p, "uregex_setText", status); 314 return; 315 } 316 317 /* Attempt the match */ 318 res = uregex_matches(pExpr, 0, &status); 319 if( !U_SUCCESS(status) ){ 320 icuFunctionError(p, "uregex_matches", status); 321 return; 322 } 323 324 /* Set the text that the regular expression operates on to a NULL 325 ** pointer. This is not really necessary, but it is tidier than 326 ** leaving the regular expression object configured with an invalid 327 ** pointer after this function returns. 328 */ 329 uregex_setText(pExpr, 0, 0, &status); 330 331 /* Return 1 or 0. */ 332 sqlite3_result_int(p, res ? 1 : 0); 333 } 334 335 /* 336 ** Implementations of scalar functions for case mapping - upper() and 337 ** lower(). Function upper() converts its input to upper-case (ABC). 338 ** Function lower() converts to lower-case (abc). 339 ** 340 ** ICU provides two types of case mapping, "general" case mapping and 341 ** "language specific". Refer to ICU documentation for the differences 342 ** between the two. 343 ** 344 ** To utilise "general" case mapping, the upper() or lower() scalar 345 ** functions are invoked with one argument: 346 ** 347 ** upper('ABC') -> 'abc' 348 ** lower('abc') -> 'ABC' 349 ** 350 ** To access ICU "language specific" case mapping, upper() or lower() 351 ** should be invoked with two arguments. The second argument is the name 352 ** of the locale to use. Passing an empty string ("") or SQL NULL value 353 ** as the second argument is the same as invoking the 1 argument version 354 ** of upper() or lower(). 355 ** 356 ** lower('I', 'en_us') -> 'i' 357 ** lower('I', 'tr_tr') -> '\u131' (small dotless i) 358 ** 359 ** http://www.icu-project.org/userguide/posix.html#case_mappings 360 */ 361 static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){ 362 const UChar *zInput; /* Pointer to input string */ 363 UChar *zOutput = 0; /* Pointer to output buffer */ 364 int nInput; /* Size of utf-16 input string in bytes */ 365 int nOut; /* Size of output buffer in bytes */ 366 int cnt; 367 int bToUpper; /* True for toupper(), false for tolower() */ 368 UErrorCode status; 369 const char *zLocale = 0; 370 371 assert(nArg==1 || nArg==2); 372 bToUpper = (sqlite3_user_data(p)!=0); 373 if( nArg==2 ){ 374 zLocale = (const char *)sqlite3_value_text(apArg[1]); 375 } 376 377 zInput = sqlite3_value_text16(apArg[0]); 378 if( !zInput ){ 379 return; 380 } 381 nOut = nInput = sqlite3_value_bytes16(apArg[0]); 382 if( nOut==0 ){ 383 sqlite3_result_text16(p, "", 0, SQLITE_STATIC); 384 return; 385 } 386 387 for(cnt=0; cnt<2; cnt++){ 388 UChar *zNew = sqlite3_realloc(zOutput, nOut); 389 if( zNew==0 ){ 390 sqlite3_free(zOutput); 391 sqlite3_result_error_nomem(p); 392 return; 393 } 394 zOutput = zNew; 395 status = U_ZERO_ERROR; 396 if( bToUpper ){ 397 nOut = 2*u_strToUpper(zOutput,nOut/2,zInput,nInput/2,zLocale,&status); 398 }else{ 399 nOut = 2*u_strToLower(zOutput,nOut/2,zInput,nInput/2,zLocale,&status); 400 } 401 402 if( U_SUCCESS(status) ){ 403 sqlite3_result_text16(p, zOutput, nOut, xFree); 404 }else if( status==U_BUFFER_OVERFLOW_ERROR ){ 405 assert( cnt==0 ); 406 continue; 407 }else{ 408 icuFunctionError(p, bToUpper ? "u_strToUpper" : "u_strToLower", status); 409 } 410 return; 411 } 412 assert( 0 ); /* Unreachable */ 413 } 414 415 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */ 416 417 /* 418 ** Collation sequence destructor function. The pCtx argument points to 419 ** a UCollator structure previously allocated using ucol_open(). 420 */ 421 static void icuCollationDel(void *pCtx){ 422 UCollator *p = (UCollator *)pCtx; 423 ucol_close(p); 424 } 425 426 /* 427 ** Collation sequence comparison function. The pCtx argument points to 428 ** a UCollator structure previously allocated using ucol_open(). 429 */ 430 static int icuCollationColl( 431 void *pCtx, 432 int nLeft, 433 const void *zLeft, 434 int nRight, 435 const void *zRight 436 ){ 437 UCollationResult res; 438 UCollator *p = (UCollator *)pCtx; 439 res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2); 440 switch( res ){ 441 case UCOL_LESS: return -1; 442 case UCOL_GREATER: return +1; 443 case UCOL_EQUAL: return 0; 444 } 445 assert(!"Unexpected return value from ucol_strcoll()"); 446 return 0; 447 } 448 449 /* 450 ** Implementation of the scalar function icu_load_collation(). 451 ** 452 ** This scalar function is used to add ICU collation based collation 453 ** types to an SQLite database connection. It is intended to be called 454 ** as follows: 455 ** 456 ** SELECT icu_load_collation(<locale>, <collation-name>); 457 ** 458 ** Where <locale> is a string containing an ICU locale identifier (i.e. 459 ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the 460 ** collation sequence to create. 461 */ 462 static void icuLoadCollation( 463 sqlite3_context *p, 464 int nArg, 465 sqlite3_value **apArg 466 ){ 467 sqlite3 *db = (sqlite3 *)sqlite3_user_data(p); 468 UErrorCode status = U_ZERO_ERROR; 469 const char *zLocale; /* Locale identifier - (eg. "jp_JP") */ 470 const char *zName; /* SQL Collation sequence name (eg. "japanese") */ 471 UCollator *pUCollator; /* ICU library collation object */ 472 int rc; /* Return code from sqlite3_create_collation_x() */ 473 474 assert(nArg==2); 475 (void)nArg; /* Unused parameter */ 476 zLocale = (const char *)sqlite3_value_text(apArg[0]); 477 zName = (const char *)sqlite3_value_text(apArg[1]); 478 479 if( !zLocale || !zName ){ 480 return; 481 } 482 483 pUCollator = ucol_open(zLocale, &status); 484 if( !U_SUCCESS(status) ){ 485 icuFunctionError(p, "ucol_open", status); 486 return; 487 } 488 assert(p); 489 490 rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator, 491 icuCollationColl, icuCollationDel 492 ); 493 if( rc!=SQLITE_OK ){ 494 ucol_close(pUCollator); 495 sqlite3_result_error(p, "Error registering collation function", -1); 496 } 497 } 498 499 /* 500 ** Register the ICU extension functions with database db. 501 */ 502 int sqlite3IcuInit(sqlite3 *db){ 503 # define SQLITEICU_EXTRAFLAGS (SQLITE_DETERMINISTIC|SQLITE_INNOCUOUS) 504 static const struct IcuScalar { 505 const char *zName; /* Function name */ 506 unsigned char nArg; /* Number of arguments */ 507 unsigned int enc; /* Optimal text encoding */ 508 unsigned char iContext; /* sqlite3_user_data() context */ 509 void (*xFunc)(sqlite3_context*,int,sqlite3_value**); 510 } scalars[] = { 511 {"icu_load_collation",2,SQLITE_UTF8|SQLITE_DIRECTONLY,1, icuLoadCollation}, 512 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) 513 {"regexp", 2, SQLITE_ANY|SQLITEICU_EXTRAFLAGS, 0, icuRegexpFunc}, 514 {"lower", 1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 515 {"lower", 2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 516 {"upper", 1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 517 {"upper", 2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 518 {"lower", 1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 519 {"lower", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 520 {"upper", 1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 521 {"upper", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 522 {"like", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuLikeFunc}, 523 {"like", 3, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuLikeFunc}, 524 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */ 525 }; 526 int rc = SQLITE_OK; 527 int i; 528 529 for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){ 530 const struct IcuScalar *p = &scalars[i]; 531 rc = sqlite3_create_function( 532 db, p->zName, p->nArg, p->enc, 533 p->iContext ? (void*)db : (void*)0, 534 p->xFunc, 0, 0 535 ); 536 } 537 538 return rc; 539 } 540 541 #if !SQLITE_CORE 542 #ifdef _WIN32 543 __declspec(dllexport) 544 #endif 545 int sqlite3_icu_init( 546 sqlite3 *db, 547 char **pzErrMsg, 548 const sqlite3_api_routines *pApi 549 ){ 550 SQLITE_EXTENSION_INIT2(pApi) 551 return sqlite3IcuInit(db); 552 } 553 #endif 554 555 #endif 556