xref: /sqlite-3.40.0/ext/fts3/fts3_unicode2.c (revision 8fc4a11c)
1 /*
2 ** 2012-05-25
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13 
14 /*
15 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
16 */
17 
18 #ifndef SQLITE_DISABLE_FTS3_UNICODE
19 #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
20 
21 #include <assert.h>
22 
/*
** Classify a unicode codepoint. The return value is 1 if codepoint c is
** a letter or a number, and 0 if it is anything else (whitespace,
** punctuation, symbols and so on).
**
** Codepoints of (1<<22) or greater are not covered by the lookup table
** and are always reported as alphanumeric.
**
** Callers must not pass a negative value; the result in that case is
** undefined.
*/
int sqlite3FtsUnicodeIsalnum(int c){
  /* Sorted table of "not alphanumeric" ranges. Every element describes
  ** one contiguous run of codepoints for which this function returns 0.
  **
  ** An element is packed as ((C<<10) + N): the upper 22 bits hold C, the
  ** first codepoint of the run, and the lower 10 bits hold N, the number
  ** of codepoints in the run (never zero). A single element can therefore
  ** describe at most 1023 codepoints.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
    0x380400F0,
  };

  /* One bit per ASCII codepoint (four 32-bit words cover 0..127). A set
  ** bit marks a codepoint that is NOT a letter or digit. */
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  const unsigned int uCode = (unsigned int)c;
  unsigned int uKey;
  unsigned int uFirst;
  unsigned int uCount;
  int iBest = 0;
  int iLow = 0;
  int iHigh = sizeof(aEntry)/sizeof(aEntry[0]) - 1;

  /* ASCII is answered straight from the bitmap. */
  if( uCode<128 ){
    unsigned int uBit = (unsigned int)1 << (uCode & 0x001F);
    return (aAscii[uCode >> 5] & uBit)==0;
  }

  /* Anything too large to be packed into 22 bits is outside the table. */
  if( uCode>=((unsigned int)1<<22) ){
    return 1;
  }

  /* Find the last table element whose first codepoint is <= c. Filling
  ** the low 10 bits of the key with ones makes the comparison ignore the
  ** length field of the table elements. */
  uKey = (uCode<<10) | 0x000003FF;
  while( iLow<=iHigh ){
    int iMid = (iLow + iHigh) / 2;
    if( aEntry[iMid]<=uKey ){
      iBest = iMid;
      iLow = iMid+1;
    }else{
      iHigh = iMid-1;
    }
  }
  assert( aEntry[0]<uKey );
  assert( uKey>=aEntry[iBest] );

  /* c is alphanumeric unless it falls inside the run that was found. */
  uFirst = aEntry[iBest]>>10;
  uCount = aEntry[iBest] & 0x3FF;
  return uCode >= (uFirst + uCount);
}
152 
153 
/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int remove_diacritic(int c, int bComplex){
  /* Sorted table of codepoint ranges that map to a plain ASCII letter.
  ** Each 16-bit element packs one range: (element>>3) is the first
  ** codepoint of the range and (element&0x07) is the number of additional
  ** codepoints that follow it, so the range covers
  ** (element>>3) .. (element>>3)+(element&0x07) inclusive.
  **
  ** The table is constant, so it is given static storage rather than being
  ** rebuilt on the stack each time this function is called.
  */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3766,  3832,  3896,
     3912,  3928,  3944,  3968,  4008,  4040,  4056,  4106,
     4138,  4170,  4202,  4234,  4266,  4296,  4312,  4344,
     4408,  4424,  4442,  4472,  4488,  4504,  6148,  6198,
     6264,  6280,  6360,  6429,  6505,  6529, 61448, 61468,
    61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704,
    61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914,
    61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218,
    62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554,
    62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766,
    62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118,
    63182, 63242, 63274, 63310, 63368, 63390,
  };

  /* aChar[i] is the replacement for every codepoint in range aDia[i].
  ** The low 7 bits hold the ASCII result ('\0' entries yield 0). If the
  ** HIBIT flag is set, the mapping is only applied when bComplex is
  ** non-zero.
  */
#define HIBIT ((unsigned char)0x80)
  static const unsigned char aChar[] = {
    '\0',      'a',       'c',       'e',       'i',       'n',
    'o',       'u',       'y',       'y',       'a',       'c',
    'd',       'e',       'e',       'g',       'h',       'i',
    'j',       'k',       'l',       'n',       'o',       'r',
    's',       't',       'u',       'u',       'w',       'y',
    'z',       'o',       'u',       'a',       'i',       'o',
    'u',       'u'|HIBIT, 'a'|HIBIT, 'g',       'k',       'o',
    'o'|HIBIT, 'j',       'g',       'n',       'a'|HIBIT, 'a',
    'e',       'i',       'o',       'r',       'u',       's',
    't',       'h',       'a',       'e',       'o'|HIBIT, 'o',
    'o'|HIBIT, 'y',       '\0',      '\0',      '\0',      '\0',
    '\0',      '\0',      '\0',      '\0',      'a',       'b',
    'c'|HIBIT, 'd',       'd',       'e'|HIBIT, 'e',       'e'|HIBIT,
    'f',       'g',       'h',       'h',       'i',       'i'|HIBIT,
    'k',       'l',       'l'|HIBIT, 'l',       'm',       'n',
    'o'|HIBIT, 'p',       'r',       'r'|HIBIT, 'r',       's',
    's'|HIBIT, 't',       'u',       'u'|HIBIT, 'v',       'w',
    'w',       'x',       'y',       'z',       'h',       't',
    'w',       'y',       'a',       'a'|HIBIT, 'a'|HIBIT, 'a'|HIBIT,
    'e',       'e'|HIBIT, 'e'|HIBIT, 'i',       'o',       'o'|HIBIT,
    'o'|HIBIT, 'o'|HIBIT, 'u',       'u'|HIBIT, 'u'|HIBIT, 'y',
  };

  /* Setting the low three bits of the key makes the comparison below
  ** depend only on the first-codepoint part of each table element. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;

  /* Binary search for the last range that starts at or before c. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  /* Flagged mappings are skipped unless the caller asked for them. */
  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;

  /* c is returned unchanged if it lies beyond the end of the range. */
  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
}
223 
224 
/*
** Test whether codepoint c is a diacritical modifier character. Only
** codepoints in the range 768..817 can qualify; membership within that
** range is recorded in a pair of 32-bit masks.
**
** The return value is zero if c is not a diacritic. Otherwise it is the
** (non-zero) mask bit that matched, not necessarily 1.
*/
int sqlite3FtsUnicodeIsdiacritic(int c){
  unsigned int uMask;
  int iBit;

  if( c<768 || c>817 ){
    return 0;
  }

  /* Bit position of c relative to 768. The first 32 codepoints are
  ** covered by one mask and the remaining ones by the other. */
  iBit = c - 768;
  if( iBit<32 ){
    uMask = 0x08029FDF;
  }else{
    uMask = 0x000361F8;
    iBit -= 32;
  }
  return uMask & ((unsigned int)1 << iBit);
}
237 
238 
/*
** Fold unicode codepoint c to lower case. If c is an upper case character
** with a lower case equivalent, the lower case codepoint is returned;
** any other codepoint is returned unchanged.
**
** For codepoints in the range 128..65535, a non-zero eRemoveDiacritic
** additionally passes the folded value through remove_diacritic(). The
** bComplex argument of that call is true only when eRemoveDiacritic==2.
**
** Callers must not pass a negative value for c; the result in that case
** is undefined.
*/
int sqlite3FtsUnicodeFold(int c, int eRemoveDiacritic){
  /* Case folding rules, sorted by iCode. Each rule covers the nRange
  ** codepoints that begin at codepoint iCode.
  **
  ** When the least significant bit of flags is 0, every codepoint in the
  ** range is upper case and gets folded. When it is 1, only every second
  ** codepoint is folded, beginning with iCode itself.
  **
  ** The remaining (upper 7) bits of flags select an element of aiOff[].
  ** The lower case form of a codepoint C that needs folding is
  ** ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The table is derived from the CaseFolding.txt file that is part of
  ** the "Unicode Character Database". See http://www.unicode.org for
  ** details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char flags;
    unsigned char nRange;
  } aEntry[] = {
    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
    {65313, 14, 26},
  };

  /* Offsets added (modulo 65536) to an upper case codepoint to obtain
  ** its lower case form. Indexed by (flags>>1) of the matching rule. */
  static const unsigned short aiOff[] = {
   1,     2,     8,     15,    16,    26,    28,    32,
   37,    38,    40,    48,    63,    64,    69,    71,
   79,    80,    116,   202,   203,   205,   206,   207,
   209,   210,   211,   213,   214,   217,   218,   219,
   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721,
   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
   65514, 65521, 65527, 65528, 65529,
  };

  const struct TableEntry *pRule;
  int iLower = 0;
  int iUpper = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
  int iMatch = -1;
  int iFolded = c;

  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  /* ASCII: only 'A'..'Z' change. Diacritics are never removed here. */
  if( c<128 ){
    if( c>='A' && c<='Z' ){
      iFolded = c + ('a' - 'A');
    }
    return iFolded;
  }

  /* Beyond 16 bits the only folding performed is a fixed shift of +40
  ** for the block 66560..66599. Diacritics are never removed here. */
  if( c>=65536 ){
    if( c>=66560 && c<66600 ){
      iFolded = c + 40;
    }
    return iFolded;
  }

  /* 128..65535: locate the last rule that starts at or before c. */
  assert( c>aEntry[0].iCode );
  while( iLower<=iUpper ){
    int iMid = (iLower + iUpper) / 2;
    if( c<(int)aEntry[iMid].iCode ){
      iUpper = iMid-1;
    }else{
      iMatch = iMid;
      iLower = iMid+1;
    }
  }
  assert( iMatch>=0 && c>=aEntry[iMatch].iCode );
  pRule = &aEntry[iMatch];

  /* Apply the rule if c lies inside its range and, for every-second-
  ** codepoint rules, c has the same parity as the start of the range. */
  if( c<(pRule->iCode + pRule->nRange) ){
    int bSkip = (pRule->flags & 0x01) && ((pRule->iCode ^ c) & 0x01);
    if( !bSkip ){
      iFolded = (c + (aiOff[pRule->flags>>1])) & 0x0000FFFF;
      assert( iFolded>0 );
    }
  }

  if( eRemoveDiacritic ){
    iFolded = remove_diacritic(iFolded, eRemoveDiacritic==2);
  }
  return iFolded;
}
382 #endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
383 #endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */
384