xref: /sqlite-3.40.0/ext/icu/icu.c (revision c9099d2d)
1 /*
2 ** 2007 May 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
13 **
14 ** This file implements an integration between the ICU library
15 ** ("International Components for Unicode", an open-source library
16 ** for handling unicode data) and SQLite. The integration uses
17 ** ICU to provide the following to SQLite:
18 **
19 **   * An implementation of the SQL regexp() function (and hence REGEXP
20 **     operator) using the ICU uregex_XX() APIs.
21 **
22 **   * Implementations of the SQL scalar upper() and lower() functions
23 **     for case mapping.
24 **
25 **   * Integration of ICU and SQLite collation sequences.
26 **
27 **   * An implementation of the LIKE operator that uses ICU to
28 **     provide case-independent matching.
29 */
30 
31 #if !defined(SQLITE_CORE)                  \
32  || defined(SQLITE_ENABLE_ICU)             \
33  || defined(SQLITE_ENABLE_ICU_COLLATIONS)
34 
35 /* Include ICU headers */
36 #include <unicode/utypes.h>
37 #include <unicode/uregex.h>
38 #include <unicode/ustring.h>
39 #include <unicode/ucol.h>
40 
41 #include <assert.h>
42 
43 #ifndef SQLITE_CORE
44   #include "sqlite3ext.h"
45   SQLITE_EXTENSION_INIT1
46 #else
47   #include "sqlite3.h"
48 #endif
49 
50 /*
51 ** This function is called when an ICU function called from within
52 ** the implementation of an SQL scalar function returns an error.
53 **
54 ** The scalar function context passed as the first argument is
55 ** loaded with an error message based on the following two args.
56 */
icuFunctionError(sqlite3_context * pCtx,const char * zName,UErrorCode e)57 static void icuFunctionError(
58   sqlite3_context *pCtx,       /* SQLite scalar function context */
59   const char *zName,           /* Name of ICU function that failed */
60   UErrorCode e                 /* Error code returned by ICU function */
61 ){
62   char zBuf[128];
63   sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
64   zBuf[127] = '\0';
65   sqlite3_result_error(pCtx, zBuf, -1);
66 }
67 
68 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
69 
/*
** Upper bound, in bytes, on the length of a LIKE pattern accepted by
** the ICU like() implementation in this file. A build may override the
** default by defining the macro before this point.
*/
#if !defined(SQLITE_MAX_LIKE_PATTERN_LENGTH)
# define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
#endif
77 
/*
** A real function (never a macro) that forwards to sqlite3_free(), so
** that its address can be handed to SQLite as a destructor callback.
*/
static void xFree(void *p)
{
  sqlite3_free(p);
}
84 
/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character. It is copied here from SQLite source
** code file utf8.c.
**
** The table is indexed by (first byte - 0xc0) and yields the payload
** bits carried by that lead byte.
*/
static const unsigned char icuUtf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Decode the UTF-8 character at zIn into c and advance zIn past it.
** zIn must be a modifiable pointer to unsigned bytes and c an unsigned
** integer lvalue; both arguments are evaluated more than once. No
** validation is performed: every continuation byte (10xxxxxx) that
** follows a lead byte >= 0xc0 is folded into the result.
**
** The body is wrapped in do{...}while(0) so that an invocation followed
** by ';' is a single statement and is safe inside an unbraced if/else.
*/
#define SQLITE_ICU_READ_UTF8(zIn, c)                       \
  do{                                                      \
    (c) = *((zIn)++);                                      \
    if( (c)>=0xc0 ){                                       \
      (c) = icuUtf8Trans1[(c)-0xc0];                       \
      while( (*(zIn) & 0xc0)==0x80 ){                      \
        (c) = ((c)<<6) + (0x3f & *((zIn)++));              \
      }                                                    \
    }                                                      \
  }while( 0 )

/*
** Advance zIn past one UTF-8 character without decoding it. zIn must
** not point at the nul-terminator (checked by assert()). Like
** SQLITE_ICU_READ_UTF8, an invocation followed by ';' is a single
** statement.
*/
#define SQLITE_ICU_SKIP_UTF8(zIn)                          \
  do{                                                      \
    assert( *(zIn) );                                      \
    if( *((zIn)++)>=0xc0 ){                                \
      while( (*(zIn) & 0xc0)==0x80 ){ (zIn)++; }           \
    }                                                      \
  }while( 0 )
115 
116 
117 /*
118 ** Compare two UTF-8 strings for equality where the first string is
119 ** a "LIKE" expression. Return true (1) if they are the same and
120 ** false (0) if they are different.
121 */
icuLikeCompare(const uint8_t * zPattern,const uint8_t * zString,const UChar32 uEsc)122 static int icuLikeCompare(
123   const uint8_t *zPattern,   /* LIKE pattern */
124   const uint8_t *zString,    /* The UTF-8 string to compare against */
125   const UChar32 uEsc         /* The escape character */
126 ){
127   static const uint32_t MATCH_ONE = (uint32_t)'_';
128   static const uint32_t MATCH_ALL = (uint32_t)'%';
129 
130   int prevEscape = 0;     /* True if the previous character was uEsc */
131 
132   while( 1 ){
133 
134     /* Read (and consume) the next character from the input pattern. */
135     uint32_t uPattern;
136     SQLITE_ICU_READ_UTF8(zPattern, uPattern);
137     if( uPattern==0 ) break;
138 
139     /* There are now 4 possibilities:
140     **
141     **     1. uPattern is an unescaped match-all character "%",
142     **     2. uPattern is an unescaped match-one character "_",
143     **     3. uPattern is an unescaped escape character, or
144     **     4. uPattern is to be handled as an ordinary character
145     */
146     if( uPattern==MATCH_ALL && !prevEscape && uPattern!=(uint32_t)uEsc ){
147       /* Case 1. */
148       uint8_t c;
149 
150       /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
151       ** MATCH_ALL. For each MATCH_ONE, skip one character in the
152       ** test string.
153       */
154       while( (c=*zPattern) == MATCH_ALL || c == MATCH_ONE ){
155         if( c==MATCH_ONE ){
156           if( *zString==0 ) return 0;
157           SQLITE_ICU_SKIP_UTF8(zString);
158         }
159         zPattern++;
160       }
161 
162       if( *zPattern==0 ) return 1;
163 
164       while( *zString ){
165         if( icuLikeCompare(zPattern, zString, uEsc) ){
166           return 1;
167         }
168         SQLITE_ICU_SKIP_UTF8(zString);
169       }
170       return 0;
171 
172     }else if( uPattern==MATCH_ONE && !prevEscape && uPattern!=(uint32_t)uEsc ){
173       /* Case 2. */
174       if( *zString==0 ) return 0;
175       SQLITE_ICU_SKIP_UTF8(zString);
176 
177     }else if( uPattern==(uint32_t)uEsc && !prevEscape ){
178       /* Case 3. */
179       prevEscape = 1;
180 
181     }else{
182       /* Case 4. */
183       uint32_t uString;
184       SQLITE_ICU_READ_UTF8(zString, uString);
185       uString = (uint32_t)u_foldCase((UChar32)uString, U_FOLD_CASE_DEFAULT);
186       uPattern = (uint32_t)u_foldCase((UChar32)uPattern, U_FOLD_CASE_DEFAULT);
187       if( uString!=uPattern ){
188         return 0;
189       }
190       prevEscape = 0;
191     }
192   }
193 
194   return *zString==0;
195 }
196 
197 /*
198 ** Implementation of the like() SQL function.  This function implements
199 ** the build-in LIKE operator.  The first argument to the function is the
200 ** pattern and the second argument is the string.  So, the SQL statements:
201 **
202 **       A LIKE B
203 **
204 ** is implemented as like(B, A). If there is an escape character E,
205 **
206 **       A LIKE B ESCAPE E
207 **
208 ** is mapped to like(B, A, E).
209 */
icuLikeFunc(sqlite3_context * context,int argc,sqlite3_value ** argv)210 static void icuLikeFunc(
211   sqlite3_context *context,
212   int argc,
213   sqlite3_value **argv
214 ){
215   const unsigned char *zA = sqlite3_value_text(argv[0]);
216   const unsigned char *zB = sqlite3_value_text(argv[1]);
217   UChar32 uEsc = 0;
218 
219   /* Limit the length of the LIKE or GLOB pattern to avoid problems
220   ** of deep recursion and N*N behavior in patternCompare().
221   */
222   if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
223     sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
224     return;
225   }
226 
227 
228   if( argc==3 ){
229     /* The escape character string must consist of a single UTF-8 character.
230     ** Otherwise, return an error.
231     */
232     int nE= sqlite3_value_bytes(argv[2]);
233     const unsigned char *zE = sqlite3_value_text(argv[2]);
234     int i = 0;
235     if( zE==0 ) return;
236     U8_NEXT(zE, i, nE, uEsc);
237     if( i!=nE){
238       sqlite3_result_error(context,
239           "ESCAPE expression must be a single character", -1);
240       return;
241     }
242   }
243 
244   if( zA && zB ){
245     sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
246   }
247 }
248 
249 /*
250 ** Function to delete compiled regexp objects. Registered as
251 ** a destructor function with sqlite3_set_auxdata().
252 */
icuRegexpDelete(void * p)253 static void icuRegexpDelete(void *p){
254   URegularExpression *pExpr = (URegularExpression *)p;
255   uregex_close(pExpr);
256 }
257 
258 /*
259 ** Implementation of SQLite REGEXP operator. This scalar function takes
260 ** two arguments. The first is a regular expression pattern to compile
261 ** the second is a string to match against that pattern. If either
262 ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
263 ** is 1 if the string matches the pattern, or 0 otherwise.
264 **
265 ** SQLite maps the regexp() function to the regexp() operator such
266 ** that the following two are equivalent:
267 **
268 **     zString REGEXP zPattern
269 **     regexp(zPattern, zString)
270 **
271 ** Uses the following ICU regexp APIs:
272 **
273 **     uregex_open()
274 **     uregex_matches()
275 **     uregex_close()
276 */
icuRegexpFunc(sqlite3_context * p,int nArg,sqlite3_value ** apArg)277 static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
278   UErrorCode status = U_ZERO_ERROR;
279   URegularExpression *pExpr;
280   UBool res;
281   const UChar *zString = sqlite3_value_text16(apArg[1]);
282 
283   (void)nArg;  /* Unused parameter */
284 
285   /* If the left hand side of the regexp operator is NULL,
286   ** then the result is also NULL.
287   */
288   if( !zString ){
289     return;
290   }
291 
292   pExpr = sqlite3_get_auxdata(p, 0);
293   if( !pExpr ){
294     const UChar *zPattern = sqlite3_value_text16(apArg[0]);
295     if( !zPattern ){
296       return;
297     }
298     pExpr = uregex_open(zPattern, -1, 0, 0, &status);
299 
300     if( U_SUCCESS(status) ){
301       sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
302       pExpr = sqlite3_get_auxdata(p, 0);
303     }
304     if( !pExpr ){
305       icuFunctionError(p, "uregex_open", status);
306       return;
307     }
308   }
309 
310   /* Configure the text that the regular expression operates on. */
311   uregex_setText(pExpr, zString, -1, &status);
312   if( !U_SUCCESS(status) ){
313     icuFunctionError(p, "uregex_setText", status);
314     return;
315   }
316 
317   /* Attempt the match */
318   res = uregex_matches(pExpr, 0, &status);
319   if( !U_SUCCESS(status) ){
320     icuFunctionError(p, "uregex_matches", status);
321     return;
322   }
323 
324   /* Set the text that the regular expression operates on to a NULL
325   ** pointer. This is not really necessary, but it is tidier than
326   ** leaving the regular expression object configured with an invalid
327   ** pointer after this function returns.
328   */
329   uregex_setText(pExpr, 0, 0, &status);
330 
331   /* Return 1 or 0. */
332   sqlite3_result_int(p, res ? 1 : 0);
333 }
334 
335 /*
336 ** Implementations of scalar functions for case mapping - upper() and
337 ** lower(). Function upper() converts its input to upper-case (ABC).
338 ** Function lower() converts to lower-case (abc).
339 **
340 ** ICU provides two types of case mapping, "general" case mapping and
341 ** "language specific". Refer to ICU documentation for the differences
342 ** between the two.
343 **
344 ** To utilise "general" case mapping, the upper() or lower() scalar
345 ** functions are invoked with one argument:
346 **
347 **     upper('ABC') -> 'abc'
348 **     lower('abc') -> 'ABC'
349 **
350 ** To access ICU "language specific" case mapping, upper() or lower()
351 ** should be invoked with two arguments. The second argument is the name
352 ** of the locale to use. Passing an empty string ("") or SQL NULL value
353 ** as the second argument is the same as invoking the 1 argument version
354 ** of upper() or lower().
355 **
356 **     lower('I', 'en_us') -> 'i'
357 **     lower('I', 'tr_tr') -> '\u131' (small dotless i)
358 **
359 ** http://www.icu-project.org/userguide/posix.html#case_mappings
360 */
icuCaseFunc16(sqlite3_context * p,int nArg,sqlite3_value ** apArg)361 static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
362   const UChar *zInput;            /* Pointer to input string */
363   UChar *zOutput = 0;             /* Pointer to output buffer */
364   int nInput;                     /* Size of utf-16 input string in bytes */
365   int nOut;                       /* Size of output buffer in bytes */
366   int cnt;
367   int bToUpper;                   /* True for toupper(), false for tolower() */
368   UErrorCode status;
369   const char *zLocale = 0;
370 
371   assert(nArg==1 || nArg==2);
372   bToUpper = (sqlite3_user_data(p)!=0);
373   if( nArg==2 ){
374     zLocale = (const char *)sqlite3_value_text(apArg[1]);
375   }
376 
377   zInput = sqlite3_value_text16(apArg[0]);
378   if( !zInput ){
379     return;
380   }
381   nOut = nInput = sqlite3_value_bytes16(apArg[0]);
382   if( nOut==0 ){
383     sqlite3_result_text16(p, "", 0, SQLITE_STATIC);
384     return;
385   }
386 
387   for(cnt=0; cnt<2; cnt++){
388     UChar *zNew = sqlite3_realloc(zOutput, nOut);
389     if( zNew==0 ){
390       sqlite3_free(zOutput);
391       sqlite3_result_error_nomem(p);
392       return;
393     }
394     zOutput = zNew;
395     status = U_ZERO_ERROR;
396     if( bToUpper ){
397       nOut = 2*u_strToUpper(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
398     }else{
399       nOut = 2*u_strToLower(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
400     }
401 
402     if( U_SUCCESS(status) ){
403       sqlite3_result_text16(p, zOutput, nOut, xFree);
404     }else if( status==U_BUFFER_OVERFLOW_ERROR ){
405       assert( cnt==0 );
406       continue;
407     }else{
408       icuFunctionError(p, bToUpper ? "u_strToUpper" : "u_strToLower", status);
409     }
410     return;
411   }
412   assert( 0 );     /* Unreachable */
413 }
414 
415 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */
416 
417 /*
418 ** Collation sequence destructor function. The pCtx argument points to
419 ** a UCollator structure previously allocated using ucol_open().
420 */
icuCollationDel(void * pCtx)421 static void icuCollationDel(void *pCtx){
422   UCollator *p = (UCollator *)pCtx;
423   ucol_close(p);
424 }
425 
426 /*
427 ** Collation sequence comparison function. The pCtx argument points to
428 ** a UCollator structure previously allocated using ucol_open().
429 */
icuCollationColl(void * pCtx,int nLeft,const void * zLeft,int nRight,const void * zRight)430 static int icuCollationColl(
431   void *pCtx,
432   int nLeft,
433   const void *zLeft,
434   int nRight,
435   const void *zRight
436 ){
437   UCollationResult res;
438   UCollator *p = (UCollator *)pCtx;
439   res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
440   switch( res ){
441     case UCOL_LESS:    return -1;
442     case UCOL_GREATER: return +1;
443     case UCOL_EQUAL:   return 0;
444   }
445   assert(!"Unexpected return value from ucol_strcoll()");
446   return 0;
447 }
448 
449 /*
450 ** Implementation of the scalar function icu_load_collation().
451 **
452 ** This scalar function is used to add ICU collation based collation
453 ** types to an SQLite database connection. It is intended to be called
454 ** as follows:
455 **
456 **     SELECT icu_load_collation(<locale>, <collation-name>);
457 **
458 ** Where <locale> is a string containing an ICU locale identifier (i.e.
459 ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
460 ** collation sequence to create.
461 */
icuLoadCollation(sqlite3_context * p,int nArg,sqlite3_value ** apArg)462 static void icuLoadCollation(
463   sqlite3_context *p,
464   int nArg,
465   sqlite3_value **apArg
466 ){
467   sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
468   UErrorCode status = U_ZERO_ERROR;
469   const char *zLocale;      /* Locale identifier - (eg. "jp_JP") */
470   const char *zName;        /* SQL Collation sequence name (eg. "japanese") */
471   UCollator *pUCollator;    /* ICU library collation object */
472   int rc;                   /* Return code from sqlite3_create_collation_x() */
473 
474   assert(nArg==2);
475   (void)nArg; /* Unused parameter */
476   zLocale = (const char *)sqlite3_value_text(apArg[0]);
477   zName = (const char *)sqlite3_value_text(apArg[1]);
478 
479   if( !zLocale || !zName ){
480     return;
481   }
482 
483   pUCollator = ucol_open(zLocale, &status);
484   if( !U_SUCCESS(status) ){
485     icuFunctionError(p, "ucol_open", status);
486     return;
487   }
488   assert(p);
489 
490   rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator,
491       icuCollationColl, icuCollationDel
492   );
493   if( rc!=SQLITE_OK ){
494     ucol_close(pUCollator);
495     sqlite3_result_error(p, "Error registering collation function", -1);
496   }
497 }
498 
499 /*
500 ** Register the ICU extension functions with database db.
501 */
sqlite3IcuInit(sqlite3 * db)502 int sqlite3IcuInit(sqlite3 *db){
503 # define SQLITEICU_EXTRAFLAGS (SQLITE_DETERMINISTIC|SQLITE_INNOCUOUS)
504   static const struct IcuScalar {
505     const char *zName;                        /* Function name */
506     unsigned char nArg;                       /* Number of arguments */
507     unsigned int enc;                         /* Optimal text encoding */
508     unsigned char iContext;                   /* sqlite3_user_data() context */
509     void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
510   } scalars[] = {
511     {"icu_load_collation",2,SQLITE_UTF8|SQLITE_DIRECTONLY,1, icuLoadCollation},
512 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
513     {"regexp", 2, SQLITE_ANY|SQLITEICU_EXTRAFLAGS,         0, icuRegexpFunc},
514     {"lower",  1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS,       0, icuCaseFunc16},
515     {"lower",  2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS,       0, icuCaseFunc16},
516     {"upper",  1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS,       1, icuCaseFunc16},
517     {"upper",  2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS,       1, icuCaseFunc16},
518     {"lower",  1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS,        0, icuCaseFunc16},
519     {"lower",  2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS,        0, icuCaseFunc16},
520     {"upper",  1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS,        1, icuCaseFunc16},
521     {"upper",  2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS,        1, icuCaseFunc16},
522     {"like",   2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS,        0, icuLikeFunc},
523     {"like",   3, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS,        0, icuLikeFunc},
524 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */
525   };
526   int rc = SQLITE_OK;
527   int i;
528 
529   for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
530     const struct IcuScalar *p = &scalars[i];
531     rc = sqlite3_create_function(
532         db, p->zName, p->nArg, p->enc,
533         p->iContext ? (void*)db : (void*)0,
534         p->xFunc, 0, 0
535     );
536   }
537 
538   return rc;
539 }
540 
541 #if !SQLITE_CORE
542 #ifdef _WIN32
543 __declspec(dllexport)
544 #endif
sqlite3_icu_init(sqlite3 * db,char ** pzErrMsg,const sqlite3_api_routines * pApi)545 int sqlite3_icu_init(
546   sqlite3 *db,
547   char **pzErrMsg,
548   const sqlite3_api_routines *pApi
549 ){
550   SQLITE_EXTENSION_INIT2(pApi)
551   return sqlite3IcuInit(db);
552 }
553 #endif
554 
555 #endif
556