xref: /sqlite-3.40.0/ext/misc/normalize.c (revision a41f2855)
1 /*
2 ** 2018-01-08
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 **
13 ** This file contains code to implement the sqlite3_normalize() function.
14 **
15 **    char *sqlite3_normalize(const char *zSql);
16 **
17 ** This function takes an SQL string as input and returns a "normalized"
18 ** version of that string in memory obtained from sqlite3_malloc64().  The
19 ** caller is responsible for ensuring that the returned memory is freed.
20 **
21 ** If a memory allocation error occurs, this routine returns NULL.
22 **
23 ** The normalization consists of the following transformations:
24 **
25 **   (1)  Convert every literal (string, blob literal, numeric constant,
26 **        or "NULL" constant) into a ?
27 **
28 **   (2)  Remove all superfluous whitespace, including comments.  Change
29 **        all required whitespace to a single space character.
30 **
31 **   (3)  Lowercase all ASCII characters.
32 **
33 **   (4)  If an IN or NOT IN operator is followed by a list of 1 or more
34 **        values, convert that list into "(?,?,?)".
35 **
36 ** The purpose of normalization is two-fold:
37 **
38 **   (1)  Sanitize queries by removing possibly sensitive information contained
39 **        in literals.
40 **
41 **   (2)  Identify structurally identical queries by comparing their
42 **        normalized forms.
43 */
44 #include <sqlite3.h>
45 #include <string.h>
46 
47 /*
48 ** Implementation note:
49 **
50 ** Much of the tokenizer logic is copied out of the tokenize.c source file
51 ** of SQLite.  This logic could be simplified for this particular application,
52 ** but that would impose a risk of introducing subtle errors.  It is best to
53 ** keep the code as close to the original as possible.
54 */
55 
56 
57 /* Character classes for tokenizing
58 **
59 ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
60 ** using a lookup table, whereas a switch() directly on c uses a binary search.
61 ** The lookup table is much faster.  To maximize speed, and to ensure that
62 ** a lookup table is used, all of the classes need to be small integers and
63 ** all of them need to be used within the switch.
64 */
65 #define CC_X          0    /* The letter 'x', or start of BLOB literal */
66 #define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
67 #define CC_ID         2    /* unicode characters usable in IDs */
68 #define CC_DIGIT      3    /* Digits */
69 #define CC_DOLLAR     4    /* '$' */
70 #define CC_VARALPHA   5    /* '@', '#', ':'.  Alphabetic SQL variables */
71 #define CC_VARNUM     6    /* '?'.  Numeric SQL variables */
72 #define CC_SPACE      7    /* Space characters */
73 #define CC_QUOTE      8    /* '"', '\'', or '`'.  String literals, quoted ids */
74 #define CC_QUOTE2     9    /* '['.   [...] style quoted ids */
75 #define CC_PIPE      10    /* '|'.   Bitwise OR or concatenate */
76 #define CC_MINUS     11    /* '-'.  Minus or SQL-style comment */
77 #define CC_LT        12    /* '<'.  Part of < or <= or <> */
78 #define CC_GT        13    /* '>'.  Part of > or >= */
79 #define CC_EQ        14    /* '='.  Part of = or == */
80 #define CC_BANG      15    /* '!'.  Part of != */
81 #define CC_SLASH     16    /* '/'.  / or c-style comment */
82 #define CC_LP        17    /* '(' */
83 #define CC_RP        18    /* ')' */
84 #define CC_SEMI      19    /* ';' */
85 #define CC_PLUS      20    /* '+' */
86 #define CC_STAR      21    /* '*' */
87 #define CC_PERCENT   22    /* '%' */
88 #define CC_COMMA     23    /* ',' */
89 #define CC_AND       24    /* '&' */
90 #define CC_TILDA     25    /* '~' */
91 #define CC_DOT       26    /* '.' */
92 #define CC_ILLEGAL   27    /* Illegal character */
93 
94 static const unsigned char aiClass[] = {
95 /*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
96 /* 0x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
97 /* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
98 /* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
99 /* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
100 /* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
101 /* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  9, 27, 27, 27,  1,
102 /* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
103 /* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 10, 27, 25, 27,
104 /* 8x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
105 /* 9x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
106 /* Ax */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
107 /* Bx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
108 /* Cx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
109 /* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
110 /* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
111 /* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
112 };
113 
114 /* An array to map all upper-case characters into their corresponding
115 ** lower-case character.
116 **
117 ** SQLite only considers US-ASCII (or EBCDIC) characters.  We do not
118 ** handle case conversions for the UTF character set since the tables
119 ** involved are nearly as big or bigger than SQLite itself.
120 */
121 const unsigned char sqlite3UpperToLower[] = {
122       0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
123      18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
124      36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
125      54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
126     104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
127     122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
128     108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
129     126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
130     144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
131     162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
132     180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
133     198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
134     216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
135     234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
136     252,253,254,255
137 };
138 
139 /*
140 ** The following 256 byte lookup table is used to support SQLites built-in
141 ** equivalents to the following standard library functions:
142 **
143 **   isspace()                        0x01
144 **   isalpha()                        0x02
145 **   isdigit()                        0x04
146 **   isalnum()                        0x06
147 **   isxdigit()                       0x08
148 **   toupper()                        0x20
149 **   SQLite identifier character      0x40
150 **   Quote character                  0x80
151 **
152 ** Bit 0x20 is set if the mapped character requires translation to upper
153 ** case. i.e. if the character is a lower-case ASCII character.
154 ** If x is a lower-case ASCII character, then its upper-case equivalent
155 ** is (x - 0x20). Therefore toupper() can be implemented as:
156 **
157 **   (x & ~(map[x]&0x20))
158 **
159 ** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
160 ** array. tolower() is used more often than toupper() by SQLite.
161 **
162 ** Bit 0x40 is set if the character is non-alphanumeric and can be used in an
163 ** SQLite identifier.  Identifiers are alphanumerics, "_", "$", and any
164 ** non-ASCII UTF character. Hence the test for whether or not a character is
165 ** part of an identifier is 0x46.
166 */
167 const unsigned char sqlite3CtypeMap[256] = {
168   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 00..07    ........ */
169   0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,  /* 08..0f    ........ */
170   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 10..17    ........ */
171   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 18..1f    ........ */
172   0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80,  /* 20..27     !"#$%&' */
173   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 28..2f    ()*+,-./ */
174   0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,  /* 30..37    01234567 */
175   0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 38..3f    89:;<=>? */
176 
177   0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02,  /* 40..47    @ABCDEFG */
178   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 48..4f    HIJKLMNO */
179   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 50..57    PQRSTUVW */
180   0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40,  /* 58..5f    XYZ[\]^_ */
181   0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22,  /* 60..67    `abcdefg */
182   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 68..6f    hijklmno */
183   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 70..77    pqrstuvw */
184   0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 78..7f    xyz{|}~. */
185 
186   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 80..87    ........ */
187   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 88..8f    ........ */
188   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 90..97    ........ */
189   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 98..9f    ........ */
190   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a0..a7    ........ */
191   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a8..af    ........ */
192   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b0..b7    ........ */
193   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b8..bf    ........ */
194 
195   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c0..c7    ........ */
196   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c8..cf    ........ */
197   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d0..d7    ........ */
198   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d8..df    ........ */
199   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e0..e7    ........ */
200   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e8..ef    ........ */
201   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* f0..f7    ........ */
202   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40   /* f8..ff    ........ */
203 };
204 #define sqlite3Toupper(x)   ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
205 #define sqlite3Isspace(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
206 #define sqlite3Isalnum(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
207 #define sqlite3Isalpha(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
208 #define sqlite3Isdigit(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
209 #define sqlite3Isxdigit(x)  (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
210 #define sqlite3Tolower(x)   (sqlite3UpperToLower[(unsigned char)(x)])
211 #define sqlite3Isquote(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
212 
213 
214 /*
215 ** If X is a character that can be used in an identifier then
216 ** IdChar(X) will be true.  Otherwise it is false.
217 **
218 ** For ASCII, any character with the high-order bit set is
219 ** allowed in an identifier.  For 7-bit characters,
220 ** sqlite3IsIdChar[X] must be 1.
221 **
222 ** For EBCDIC, the rules are more complex but have the same
223 ** end result.
224 **
225 ** Ticket #1066.  the SQL standard does not allow '$' in the
226 ** middle of identifiers.  But many SQL implementations do.
227 ** SQLite will allow '$' in identifiers for compatibility.
228 ** But the feature is undocumented.
229 */
230 #define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
231 
232 /*
233 ** Ignore testcase() macros
234 */
235 #define testcase(X)
236 
237 /*
238 ** Token values
239 */
240 #define TK_SPACE    0
241 #define TK_NAME     1
242 #define TK_LITERAL  2
243 #define TK_PUNCT    3
244 #define TK_ERROR    4
245 
246 #define TK_MINUS    TK_PUNCT
247 #define TK_LP       TK_PUNCT
248 #define TK_RP       TK_PUNCT
249 #define TK_SEMI     TK_PUNCT
250 #define TK_PLUS     TK_PUNCT
251 #define TK_STAR     TK_PUNCT
252 #define TK_SLASH    TK_PUNCT
253 #define TK_REM      TK_PUNCT
254 #define TK_EQ       TK_PUNCT
255 #define TK_LE       TK_PUNCT
256 #define TK_NE       TK_PUNCT
257 #define TK_LSHIFT   TK_PUNCT
258 #define TK_LT       TK_PUNCT
259 #define TK_GE       TK_PUNCT
260 #define TK_RSHIFT   TK_PUNCT
261 #define TK_GT       TK_PUNCT
262 #define TK_GE       TK_PUNCT
263 #define TK_BITOR    TK_PUNCT
264 #define TK_CONCAT   TK_PUNCT
265 #define TK_COMMA    TK_PUNCT
266 #define TK_BITAND   TK_PUNCT
267 #define TK_BITNOT   TK_PUNCT
268 #define TK_STRING   TK_LITERAL
269 #define TK_ID       TK_NAME
270 #define TK_ILLEGAL  TK_ERROR
271 #define TK_DOT      TK_PUNCT
272 #define TK_INTEGER  TK_LITERAL
273 #define TK_FLOAT    TK_LITERAL
274 #define TK_VARIABLE TK_LITERAL
275 #define TK_BLOB     TK_LITERAL
276 
277 /*
278 ** Return the length (in bytes) of the token that begins at z[0].
279 ** Store the token type in *tokenType before returning.
280 */
281 static int sqlite3GetToken(const unsigned char *z, int *tokenType){
282   int i, c;
283   switch( aiClass[*z] ){  /* Switch on the character-class of the first byte
284                           ** of the token. See the comment on the CC_ defines
285                           ** above. */
286     case CC_SPACE: {
287       for(i=1; sqlite3Isspace(z[i]); i++){}
288       *tokenType = TK_SPACE;
289       return i;
290     }
291     case CC_MINUS: {
292       if( z[1]=='-' ){
293         for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
294         *tokenType = TK_SPACE;
295         return i;
296       }
297       *tokenType = TK_MINUS;
298       return 1;
299     }
300     case CC_LP: {
301       *tokenType = TK_LP;
302       return 1;
303     }
304     case CC_RP: {
305       *tokenType = TK_RP;
306       return 1;
307     }
308     case CC_SEMI: {
309       *tokenType = TK_SEMI;
310       return 1;
311     }
312     case CC_PLUS: {
313       *tokenType = TK_PLUS;
314       return 1;
315     }
316     case CC_STAR: {
317       *tokenType = TK_STAR;
318       return 1;
319     }
320     case CC_SLASH: {
321       if( z[1]!='*' || z[2]==0 ){
322         *tokenType = TK_SLASH;
323         return 1;
324       }
325       for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
326       if( c ) i++;
327       *tokenType = TK_SPACE;
328       return i;
329     }
330     case CC_PERCENT: {
331       *tokenType = TK_REM;
332       return 1;
333     }
334     case CC_EQ: {
335       *tokenType = TK_EQ;
336       return 1 + (z[1]=='=');
337     }
338     case CC_LT: {
339       if( (c=z[1])=='=' ){
340         *tokenType = TK_LE;
341         return 2;
342       }else if( c=='>' ){
343         *tokenType = TK_NE;
344         return 2;
345       }else if( c=='<' ){
346         *tokenType = TK_LSHIFT;
347         return 2;
348       }else{
349         *tokenType = TK_LT;
350         return 1;
351       }
352     }
353     case CC_GT: {
354       if( (c=z[1])=='=' ){
355         *tokenType = TK_GE;
356         return 2;
357       }else if( c=='>' ){
358         *tokenType = TK_RSHIFT;
359         return 2;
360       }else{
361         *tokenType = TK_GT;
362         return 1;
363       }
364     }
365     case CC_BANG: {
366       if( z[1]!='=' ){
367         *tokenType = TK_ILLEGAL;
368         return 1;
369       }else{
370         *tokenType = TK_NE;
371         return 2;
372       }
373     }
374     case CC_PIPE: {
375       if( z[1]!='|' ){
376         *tokenType = TK_BITOR;
377         return 1;
378       }else{
379         *tokenType = TK_CONCAT;
380         return 2;
381       }
382     }
383     case CC_COMMA: {
384       *tokenType = TK_COMMA;
385       return 1;
386     }
387     case CC_AND: {
388       *tokenType = TK_BITAND;
389       return 1;
390     }
391     case CC_TILDA: {
392       *tokenType = TK_BITNOT;
393       return 1;
394     }
395     case CC_QUOTE: {
396       int delim = z[0];
397       testcase( delim=='`' );
398       testcase( delim=='\'' );
399       testcase( delim=='"' );
400       for(i=1; (c=z[i])!=0; i++){
401         if( c==delim ){
402           if( z[i+1]==delim ){
403             i++;
404           }else{
405             break;
406           }
407         }
408       }
409       if( c=='\'' ){
410         *tokenType = TK_STRING;
411         return i+1;
412       }else if( c!=0 ){
413         *tokenType = TK_ID;
414         return i+1;
415       }else{
416         *tokenType = TK_ILLEGAL;
417         return i;
418       }
419     }
420     case CC_DOT: {
421       if( !sqlite3Isdigit(z[1]) ){
422         *tokenType = TK_DOT;
423         return 1;
424       }
425       /* If the next character is a digit, this is a floating point
426       ** number that begins with ".".  Fall thru into the next case */
427     }
428     case CC_DIGIT: {
429       *tokenType = TK_INTEGER;
430       if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
431         for(i=3; sqlite3Isxdigit(z[i]); i++){}
432         return i;
433       }
434       for(i=0; sqlite3Isdigit(z[i]); i++){}
435       if( z[i]=='.' ){
436         i++;
437         while( sqlite3Isdigit(z[i]) ){ i++; }
438         *tokenType = TK_FLOAT;
439       }
440       if( (z[i]=='e' || z[i]=='E') &&
441            ( sqlite3Isdigit(z[i+1])
442             || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
443            )
444       ){
445         i += 2;
446         while( sqlite3Isdigit(z[i]) ){ i++; }
447         *tokenType = TK_FLOAT;
448       }
449       while( IdChar(z[i]) ){
450         *tokenType = TK_ILLEGAL;
451         i++;
452       }
453       return i;
454     }
455     case CC_QUOTE2: {
456       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
457       *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
458       return i;
459     }
460     case CC_VARNUM: {
461       *tokenType = TK_VARIABLE;
462       for(i=1; sqlite3Isdigit(z[i]); i++){}
463       return i;
464     }
465     case CC_DOLLAR:
466     case CC_VARALPHA: {
467       int n = 0;
468       testcase( z[0]=='$' );  testcase( z[0]=='@' );
469       testcase( z[0]==':' );  testcase( z[0]=='#' );
470       *tokenType = TK_VARIABLE;
471       for(i=1; (c=z[i])!=0; i++){
472         if( IdChar(c) ){
473           n++;
474         }else if( c=='(' && n>0 ){
475           do{
476             i++;
477           }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
478           if( c==')' ){
479             i++;
480           }else{
481             *tokenType = TK_ILLEGAL;
482           }
483           break;
484         }else if( c==':' && z[i+1]==':' ){
485           i++;
486         }else{
487           break;
488         }
489       }
490       if( n==0 ) *tokenType = TK_ILLEGAL;
491       return i;
492     }
493     case CC_KYWD: {
494       for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
495       if( IdChar(z[i]) ){
496         /* This token started out using characters that can appear in keywords,
497         ** but z[i] is a character not allowed within keywords, so this must
498         ** be an identifier instead */
499         i++;
500         break;
501       }
502       *tokenType = TK_ID;
503       return i;
504     }
505     case CC_X: {
506       testcase( z[0]=='x' ); testcase( z[0]=='X' );
507       if( z[1]=='\'' ){
508         *tokenType = TK_BLOB;
509         for(i=2; sqlite3Isxdigit(z[i]); i++){}
510         if( z[i]!='\'' || i%2 ){
511           *tokenType = TK_ILLEGAL;
512           while( z[i] && z[i]!='\'' ){ i++; }
513         }
514         if( z[i] ) i++;
515         return i;
516       }
517       /* If it is not a BLOB literal, then it must be an ID, since no
518       ** SQL keywords start with the letter 'x'.  Fall through */
519     }
520     case CC_ID: {
521       i = 1;
522       break;
523     }
524     default: {
525       *tokenType = TK_ILLEGAL;
526       return 1;
527     }
528   }
529   while( IdChar(z[i]) ){ i++; }
530   *tokenType = TK_ID;
531   return i;
532 }
533 
534 char *sqlite3_normalize(const char *zSql){
535   char *z;              /* The output string */
536   sqlite3_int64 nZ;     /* Size of the output string in bytes */
537   sqlite3_int64 nSql;   /* Size of the input string in bytes */
538   int i;                /* Next character to read from zSql[] */
539   int j;                /* Next slot to fill in on z[] */
540   int tokenType;        /* Type of the next token */
541   int n;                /* Size of the next token */
542   int k;                /* Loop counter */
543 
544   nSql = strlen(zSql);
545   nZ = nSql;
546   z = sqlite3_malloc64( nZ+2 );
547   if( z==0 ) return 0;
548   for(i=j=0; zSql[i]; i += n){
549     n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
550     switch( tokenType ){
551       case TK_SPACE: {
552         break;
553       }
554       case TK_ERROR: {
555         sqlite3_free(z);
556         return 0;
557       }
558       case TK_LITERAL: {
559         z[j++] = '?';
560         break;
561       }
562       case TK_PUNCT:
563       case TK_NAME: {
564         if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
565         for(k=0; k<n; k++){
566           z[j++] = sqlite3Tolower(zSql[i+k]);
567         }
568         break;
569       }
570     }
571   }
572   while( j>0 && z[j-1]==' ' ){ j--; }
573   if( i>0 && z[j-1]!=';' ){ z[j++] = ';'; }
574   z[j] = 0;
575   return z;
576 }
577 
578 #ifdef NORMALIZE_TEST
579 #include <stdio.h>
580 #include <stdlib.h>
581 
582 static void normalizeFile(char *zIn){
583   int i;
584   if( zIn==0 ) return;
585   for(i=0; zIn[i]; i++){
586     char cSaved;
587     if( zIn[i]!=';' ) continue;
588     cSaved = zIn[i+1];
589     zIn[i+1] = 0;
590     if( sqlite3_complete(zIn) ){
591       char *zOut = sqlite3_normalize(zIn);
592       if( zOut ){
593         printf("%s\n", zOut);
594         sqlite3_free(zOut);
595       }else{
596         fprintf(stderr, "ERROR: %s\n", zIn);
597       }
598       zIn[i+1] = cSaved;
599       zIn += i+1;
600       i = -1;
601     }else{
602       zIn[i+1] = cSaved;
603     }
604   }
605 }
606 
607 int main(int argc, char **argv){
608   int i;
609   FILE *in;
610   char *zBuf = 0;
611   sqlite3_int64 sz, got;
612 
613   for(i=1; i<argc; i++){
614     in = fopen(argv[i], "rb");
615     if( in==0 ){
616       fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
617       continue;
618     }
619     fseek(in, 0, SEEK_END);
620     sz = ftell(in);
621     rewind(in);
622     zBuf = sqlite3_realloc64(zBuf, sz+1);
623     if( zBuf==0 ){
624       fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
625       exit(1);
626     }
627     got = fread(zBuf, 1, sz, in);
628     fclose(in);
629     if( got!=sz ){
630       fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
631               got, sz, argv[i]);
632     }else{
633       zBuf[got] = 0;
634       normalizeFile(zBuf);
635     }
636   }
637   sqlite3_free(zBuf);
638 }
639 #endif /* NORMALIZE_TEST */
640