xref: /sqlite-3.40.0/ext/fts3/fts3_icu.c (revision 2d77d80a)
1 /*
2 ** 2007 June 22
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements a tokenizer for fts3 based on the ICU library.
13 */
14 #include "fts3Int.h"
15 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
16 #ifdef SQLITE_ENABLE_ICU
17 
18 #include <assert.h>
19 #include <string.h>
20 #include "fts3_tokenizer.h"
21 
22 #include <unicode/ubrk.h>
23 #include <unicode/ucol.h>
24 #include <unicode/ustring.h>
25 #include <unicode/utf16.h>
26 
/* Short names for the tokenizer and cursor structures defined below. */
typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;
29 
/*
** An ICU tokenizer instance.  The functions in this file cast between
** sqlite3_tokenizer* and IcuTokenizer*, so "base" must remain the first
** member.
*/
struct IcuTokenizer {
  sqlite3_tokenizer base;     /* Base tokenizer object; must be first */
  char *zLocale;              /* Locale passed to ubrk_open(), or NULL if
                              ** no argument was given to icuCreate() */
};
34 
/*
** State of one tokenization pass over a single input string.  The functions
** in this file cast between sqlite3_tokenizer_cursor* and IcuCursor*, so
** "base" must remain the first member.
*/
struct IcuCursor {
  sqlite3_tokenizer_cursor base;

  UBreakIterator *pIter;      /* ICU break-iterator object */
  int nChar;                  /* Number of UChar elements in pInput */
  UChar *aChar;               /* Copy of input using utf-16 encoding */
  int *aOffset;               /* Offsets of each character in utf-8 input */

  int nBuffer;                /* Size in bytes of the zBuffer allocation */
  char *zBuffer;              /* utf-8 text of the token last returned by
                              ** icuNext(); grown with sqlite3_realloc() */

  int iToken;                 /* Position value for the next token */
};
48 
49 /*
50 ** Create a new tokenizer instance.
51 */
icuCreate(int argc,const char * const * argv,sqlite3_tokenizer ** ppTokenizer)52 static int icuCreate(
53   int argc,                            /* Number of entries in argv[] */
54   const char * const *argv,            /* Tokenizer creation arguments */
55   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
56 ){
57   IcuTokenizer *p;
58   int n = 0;
59 
60   if( argc>0 ){
61     n = strlen(argv[0])+1;
62   }
63   p = (IcuTokenizer *)sqlite3_malloc64(sizeof(IcuTokenizer)+n);
64   if( !p ){
65     return SQLITE_NOMEM;
66   }
67   memset(p, 0, sizeof(IcuTokenizer));
68 
69   if( n ){
70     p->zLocale = (char *)&p[1];
71     memcpy(p->zLocale, argv[0], n);
72   }
73 
74   *ppTokenizer = (sqlite3_tokenizer *)p;
75 
76   return SQLITE_OK;
77 }
78 
79 /*
80 ** Destroy a tokenizer
81 */
icuDestroy(sqlite3_tokenizer * pTokenizer)82 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
83   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
84   sqlite3_free(p);
85   return SQLITE_OK;
86 }
87 
88 /*
89 ** Prepare to begin tokenizing a particular string.  The input
90 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
91 ** used to incrementally tokenize this string is returned in
92 ** *ppCursor.
93 */
icuOpen(sqlite3_tokenizer * pTokenizer,const char * zInput,int nInput,sqlite3_tokenizer_cursor ** ppCursor)94 static int icuOpen(
95   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
96   const char *zInput,                    /* Input string */
97   int nInput,                            /* Length of zInput in bytes */
98   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
99 ){
100   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
101   IcuCursor *pCsr;
102 
103   const int32_t opt = U_FOLD_CASE_DEFAULT;
104   UErrorCode status = U_ZERO_ERROR;
105   int nChar;
106 
107   UChar32 c;
108   int iInput = 0;
109   int iOut = 0;
110 
111   *ppCursor = 0;
112 
113   if( zInput==0 ){
114     nInput = 0;
115     zInput = "";
116   }else if( nInput<0 ){
117     nInput = strlen(zInput);
118   }
119   nChar = nInput+1;
120   pCsr = (IcuCursor *)sqlite3_malloc64(
121       sizeof(IcuCursor) +                /* IcuCursor */
122       ((nChar+3)&~3) * sizeof(UChar) +   /* IcuCursor.aChar[] */
123       (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
124   );
125   if( !pCsr ){
126     return SQLITE_NOMEM;
127   }
128   memset(pCsr, 0, sizeof(IcuCursor));
129   pCsr->aChar = (UChar *)&pCsr[1];
130   pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
131 
132   pCsr->aOffset[iOut] = iInput;
133   U8_NEXT(zInput, iInput, nInput, c);
134   while( c>0 ){
135     int isError = 0;
136     c = u_foldCase(c, opt);
137     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
138     if( isError ){
139       sqlite3_free(pCsr);
140       return SQLITE_ERROR;
141     }
142     pCsr->aOffset[iOut] = iInput;
143 
144     if( iInput<nInput ){
145       U8_NEXT(zInput, iInput, nInput, c);
146     }else{
147       c = 0;
148     }
149   }
150 
151   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
152   if( !U_SUCCESS(status) ){
153     sqlite3_free(pCsr);
154     return SQLITE_ERROR;
155   }
156   pCsr->nChar = iOut;
157 
158   ubrk_first(pCsr->pIter);
159   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
160   return SQLITE_OK;
161 }
162 
163 /*
164 ** Close a tokenization cursor previously opened by a call to icuOpen().
165 */
icuClose(sqlite3_tokenizer_cursor * pCursor)166 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
167   IcuCursor *pCsr = (IcuCursor *)pCursor;
168   ubrk_close(pCsr->pIter);
169   sqlite3_free(pCsr->zBuffer);
170   sqlite3_free(pCsr);
171   return SQLITE_OK;
172 }
173 
174 /*
175 ** Extract the next token from a tokenization cursor.
176 */
icuNext(sqlite3_tokenizer_cursor * pCursor,const char ** ppToken,int * pnBytes,int * piStartOffset,int * piEndOffset,int * piPosition)177 static int icuNext(
178   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
179   const char **ppToken,               /* OUT: *ppToken is the token text */
180   int *pnBytes,                       /* OUT: Number of bytes in token */
181   int *piStartOffset,                 /* OUT: Starting offset of token */
182   int *piEndOffset,                   /* OUT: Ending offset of token */
183   int *piPosition                     /* OUT: Position integer of token */
184 ){
185   IcuCursor *pCsr = (IcuCursor *)pCursor;
186 
187   int iStart = 0;
188   int iEnd = 0;
189   int nByte = 0;
190 
191   while( iStart==iEnd ){
192     UChar32 c;
193 
194     iStart = ubrk_current(pCsr->pIter);
195     iEnd = ubrk_next(pCsr->pIter);
196     if( iEnd==UBRK_DONE ){
197       return SQLITE_DONE;
198     }
199 
200     while( iStart<iEnd ){
201       int iWhite = iStart;
202       U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
203       if( u_isspace(c) ){
204         iStart = iWhite;
205       }else{
206         break;
207       }
208     }
209     assert(iStart<=iEnd);
210   }
211 
212   do {
213     UErrorCode status = U_ZERO_ERROR;
214     if( nByte ){
215       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
216       if( !zNew ){
217         return SQLITE_NOMEM;
218       }
219       pCsr->zBuffer = zNew;
220       pCsr->nBuffer = nByte;
221     }
222 
223     u_strToUTF8(
224         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
225         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
226         &status                                  /* Output success/failure */
227     );
228   } while( nByte>pCsr->nBuffer );
229 
230   *ppToken = pCsr->zBuffer;
231   *pnBytes = nByte;
232   *piStartOffset = pCsr->aOffset[iStart];
233   *piEndOffset = pCsr->aOffset[iEnd];
234   *piPosition = pCsr->iToken++;
235 
236   return SQLITE_OK;
237 }
238 
/*
** The set of routines that implement the ICU tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion    */
  icuCreate,                   /* xCreate     */
  icuDestroy,                  /* xDestroy    */
  icuOpen,                     /* xOpen       */
  icuClose,                    /* xClose      */
  icuNext,                     /* xNext       */
  0,                           /* xLanguageid */
};
251 
252 /*
253 ** Set *ppModule to point at the implementation of the ICU tokenizer.
254 */
sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const ** ppModule)255 void sqlite3Fts3IcuTokenizerModule(
256   sqlite3_tokenizer_module const**ppModule
257 ){
258   *ppModule = &icuTokenizerModule;
259 }
260 
261 #endif /* defined(SQLITE_ENABLE_ICU) */
262 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
263