xref: /sqlite-3.40.0/ext/misc/normalize.c (revision e7952263)
1 /*
2 ** 2018-01-08
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 **
13 ** This file contains code to implement the sqlite3_normalize() function.
14 **
15 **    char *sqlite3_normalize(const char *zSql);
16 **
17 ** This function takes an SQL string as input and returns a "normalized"
18 ** version of that string in memory obtained from sqlite3_malloc64().  The
19 ** caller is responsible for ensuring that the returned memory is freed.
20 **
21 ** If a memory allocation error occurs, this routine returns NULL.
22 **
23 ** The normalization consists of the following transformations:
24 **
25 **   (1)  Convert every literal (string, blob literal, numeric constant,
26 **        or "NULL" constant) into a ?
27 **
28 **   (2)  Remove all superfluous whitespace, including comments.  Change
29 **        all required whitespace to a single space character.
30 **
31 **   (3)  Lowercase all ASCII characters.
32 **
33 **   (4)  If an IN or NOT IN operator is followed by a list of 1 or more
34 **        values, convert that list into "(?,?,?)".
35 **
36 ** The purpose of normalization is two-fold:
37 **
38 **   (1)  Sanitize queries by removing potentially private or sensitive
39 **        information contained in literals.
40 **
41 **   (2)  Identify structurally identical queries by comparing their
42 **        normalized forms.
43 **
44 ** Command-Line Utility
45 ** --------------------
46 **
47 ** This file also contains code for a command-line utility that converts
48 ** SQL queries in text files into their normalized forms.  To build the
49 ** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI
50 ** and link it against the SQLite library.
51 */
52 #include <sqlite3.h>
53 #include <string.h>
54 
55 /*
56 ** Implementation note:
57 **
58 ** Much of the tokenizer logic is copied out of the tokenize.c source file
59 ** of SQLite.  That logic could be simplified for this particular application,
60 ** but that would impose a risk of introducing subtle errors.  It is best to
61 ** keep the code as close to the original as possible.
62 **
63 ** The tokenize code is in sync with the SQLite core as of 2018-01-08.
64 ** Any future changes to the core tokenizer might require corresponding
65 ** adjustments to the tokenizer logic in this module.
66 */
67 
68 
69 /* Character classes for tokenizing
70 **
71 ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
72 ** using a lookup table, whereas a switch() directly on c uses a binary search.
73 ** The lookup table is much faster.  To maximize speed, and to ensure that
74 ** a lookup table is used, all of the classes need to be small integers and
75 ** all of them need to be used within the switch.
76 */
77 #define CC_X          0    /* The letter 'x', or start of BLOB literal */
78 #define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
79 #define CC_ID         2    /* unicode characters usable in IDs */
80 #define CC_DIGIT      3    /* Digits */
81 #define CC_DOLLAR     4    /* '$' */
82 #define CC_VARALPHA   5    /* '@', '#', ':'.  Alphabetic SQL variables */
83 #define CC_VARNUM     6    /* '?'.  Numeric SQL variables */
84 #define CC_SPACE      7    /* Space characters */
85 #define CC_QUOTE      8    /* '"', '\'', or '`'.  String literals, quoted ids */
86 #define CC_QUOTE2     9    /* '['.   [...] style quoted ids */
87 #define CC_PIPE      10    /* '|'.   Bitwise OR or concatenate */
88 #define CC_MINUS     11    /* '-'.  Minus or SQL-style comment */
89 #define CC_LT        12    /* '<'.  Part of < or <= or <> */
90 #define CC_GT        13    /* '>'.  Part of > or >= */
91 #define CC_EQ        14    /* '='.  Part of = or == */
92 #define CC_BANG      15    /* '!'.  Part of != */
93 #define CC_SLASH     16    /* '/'.  / or c-style comment */
94 #define CC_LP        17    /* '(' */
95 #define CC_RP        18    /* ')' */
96 #define CC_SEMI      19    /* ';' */
97 #define CC_PLUS      20    /* '+' */
98 #define CC_STAR      21    /* '*' */
99 #define CC_PERCENT   22    /* '%' */
100 #define CC_COMMA     23    /* ',' */
101 #define CC_AND       24    /* '&' */
102 #define CC_TILDA     25    /* '~' */
103 #define CC_DOT       26    /* '.' */
104 #define CC_ILLEGAL   27    /* Illegal character */
105 
106 static const unsigned char aiClass[] = {
107 /*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
108 /* 0x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
109 /* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
110 /* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
111 /* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
112 /* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
113 /* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  9, 27, 27, 27,  1,
114 /* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
115 /* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 10, 27, 25, 27,
116 /* 8x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
117 /* 9x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
118 /* Ax */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
119 /* Bx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
120 /* Cx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
121 /* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
122 /* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
123 /* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
124 };
125 
126 /* An array to map all upper-case characters into their corresponding
127 ** lower-case character.
128 **
129 ** SQLite only considers US-ASCII (or EBCDIC) characters.  We do not
130 ** handle case conversions for the UTF character set since the tables
131 ** involved are nearly as big or bigger than SQLite itself.
132 */
133 static const unsigned char sqlite3UpperToLower[] = {
134       0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
135      18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
136      36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
137      54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
138     104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
139     122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
140     108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
141     126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
142     144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
143     162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
144     180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
145     198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
146     216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
147     234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
148     252,253,254,255
149 };
150 
151 /*
152 ** The following 256 byte lookup table is used to support SQLites built-in
153 ** equivalents to the following standard library functions:
154 **
155 **   isspace()                        0x01
156 **   isalpha()                        0x02
157 **   isdigit()                        0x04
158 **   isalnum()                        0x06
159 **   isxdigit()                       0x08
160 **   toupper()                        0x20
161 **   SQLite identifier character      0x40
162 **   Quote character                  0x80
163 **
164 ** Bit 0x20 is set if the mapped character requires translation to upper
165 ** case. i.e. if the character is a lower-case ASCII character.
166 ** If x is a lower-case ASCII character, then its upper-case equivalent
167 ** is (x - 0x20). Therefore toupper() can be implemented as:
168 **
169 **   (x & ~(map[x]&0x20))
170 **
171 ** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
172 ** array. tolower() is used more often than toupper() by SQLite.
173 **
174 ** Bit 0x40 is set if the character is non-alphanumeric and can be used in an
175 ** SQLite identifier.  Identifiers are alphanumerics, "_", "$", and any
176 ** non-ASCII UTF character. Hence the test for whether or not a character is
177 ** part of an identifier is 0x46.
178 */
179 static const unsigned char sqlite3CtypeMap[256] = {
180   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 00..07    ........ */
181   0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,  /* 08..0f    ........ */
182   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 10..17    ........ */
183   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 18..1f    ........ */
184   0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80,  /* 20..27     !"#$%&' */
185   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 28..2f    ()*+,-./ */
186   0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,  /* 30..37    01234567 */
187   0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 38..3f    89:;<=>? */
188 
189   0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02,  /* 40..47    @ABCDEFG */
190   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 48..4f    HIJKLMNO */
191   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 50..57    PQRSTUVW */
192   0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40,  /* 58..5f    XYZ[\]^_ */
193   0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22,  /* 60..67    `abcdefg */
194   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 68..6f    hijklmno */
195   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 70..77    pqrstuvw */
196   0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 78..7f    xyz{|}~. */
197 
198   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 80..87    ........ */
199   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 88..8f    ........ */
200   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 90..97    ........ */
201   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 98..9f    ........ */
202   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a0..a7    ........ */
203   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a8..af    ........ */
204   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b0..b7    ........ */
205   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b8..bf    ........ */
206 
207   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c0..c7    ........ */
208   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c8..cf    ........ */
209   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d0..d7    ........ */
210   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d8..df    ........ */
211   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e0..e7    ........ */
212   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e8..ef    ........ */
213   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* f0..f7    ........ */
214   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40   /* f8..ff    ........ */
215 };
216 #define sqlite3Toupper(x)   ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
217 #define sqlite3Isspace(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
218 #define sqlite3Isalnum(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
219 #define sqlite3Isalpha(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
220 #define sqlite3Isdigit(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
221 #define sqlite3Isxdigit(x)  (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
222 #define sqlite3Tolower(x)   (sqlite3UpperToLower[(unsigned char)(x)])
223 #define sqlite3Isquote(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
224 
225 
226 /*
227 ** If X is a character that can be used in an identifier then
228 ** IdChar(X) will be true.  Otherwise it is false.
229 **
230 ** For ASCII, any character with the high-order bit set is
231 ** allowed in an identifier.  For 7-bit characters,
232 ** sqlite3IsIdChar[X] must be 1.
233 **
234 ** For EBCDIC, the rules are more complex but have the same
235 ** end result.
236 **
237 ** Ticket #1066.  the SQL standard does not allow '$' in the
238 ** middle of identifiers.  But many SQL implementations do.
239 ** SQLite will allow '$' in identifiers for compatibility.
240 ** But the feature is undocumented.
241 */
242 #define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
243 
244 /*
245 ** Ignore testcase() macros
246 */
247 #define testcase(X)
248 
249 /*
250 ** Token values
251 */
252 #define TK_SPACE    0
253 #define TK_NAME     1
254 #define TK_LITERAL  2
255 #define TK_PUNCT    3
256 #define TK_ERROR    4
257 
258 #define TK_MINUS    TK_PUNCT
259 #define TK_LP       TK_PUNCT
260 #define TK_RP       TK_PUNCT
261 #define TK_SEMI     TK_PUNCT
262 #define TK_PLUS     TK_PUNCT
263 #define TK_STAR     TK_PUNCT
264 #define TK_SLASH    TK_PUNCT
265 #define TK_REM      TK_PUNCT
266 #define TK_EQ       TK_PUNCT
267 #define TK_LE       TK_PUNCT
268 #define TK_NE       TK_PUNCT
269 #define TK_LSHIFT   TK_PUNCT
270 #define TK_LT       TK_PUNCT
271 #define TK_GE       TK_PUNCT
272 #define TK_RSHIFT   TK_PUNCT
273 #define TK_GT       TK_PUNCT
274 #define TK_GE       TK_PUNCT
275 #define TK_BITOR    TK_PUNCT
276 #define TK_CONCAT   TK_PUNCT
277 #define TK_COMMA    TK_PUNCT
278 #define TK_BITAND   TK_PUNCT
279 #define TK_BITNOT   TK_PUNCT
280 #define TK_STRING   TK_LITERAL
281 #define TK_ID       TK_NAME
282 #define TK_ILLEGAL  TK_ERROR
283 #define TK_DOT      TK_PUNCT
284 #define TK_INTEGER  TK_LITERAL
285 #define TK_FLOAT    TK_LITERAL
286 #define TK_VARIABLE TK_LITERAL
287 #define TK_BLOB     TK_LITERAL
288 
289 /*
290 ** Return the length (in bytes) of the token that begins at z[0].
291 ** Store the token type in *tokenType before returning.
292 */
293 static int sqlite3GetToken(const unsigned char *z, int *tokenType){
294   int i, c;
295   switch( aiClass[*z] ){  /* Switch on the character-class of the first byte
296                           ** of the token. See the comment on the CC_ defines
297                           ** above. */
298     case CC_SPACE: {
299       for(i=1; sqlite3Isspace(z[i]); i++){}
300       *tokenType = TK_SPACE;
301       return i;
302     }
303     case CC_MINUS: {
304       if( z[1]=='-' ){
305         for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
306         *tokenType = TK_SPACE;
307         return i;
308       }
309       *tokenType = TK_MINUS;
310       return 1;
311     }
312     case CC_LP: {
313       *tokenType = TK_LP;
314       return 1;
315     }
316     case CC_RP: {
317       *tokenType = TK_RP;
318       return 1;
319     }
320     case CC_SEMI: {
321       *tokenType = TK_SEMI;
322       return 1;
323     }
324     case CC_PLUS: {
325       *tokenType = TK_PLUS;
326       return 1;
327     }
328     case CC_STAR: {
329       *tokenType = TK_STAR;
330       return 1;
331     }
332     case CC_SLASH: {
333       if( z[1]!='*' || z[2]==0 ){
334         *tokenType = TK_SLASH;
335         return 1;
336       }
337       for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
338       if( c ) i++;
339       *tokenType = TK_SPACE;
340       return i;
341     }
342     case CC_PERCENT: {
343       *tokenType = TK_REM;
344       return 1;
345     }
346     case CC_EQ: {
347       *tokenType = TK_EQ;
348       return 1 + (z[1]=='=');
349     }
350     case CC_LT: {
351       if( (c=z[1])=='=' ){
352         *tokenType = TK_LE;
353         return 2;
354       }else if( c=='>' ){
355         *tokenType = TK_NE;
356         return 2;
357       }else if( c=='<' ){
358         *tokenType = TK_LSHIFT;
359         return 2;
360       }else{
361         *tokenType = TK_LT;
362         return 1;
363       }
364     }
365     case CC_GT: {
366       if( (c=z[1])=='=' ){
367         *tokenType = TK_GE;
368         return 2;
369       }else if( c=='>' ){
370         *tokenType = TK_RSHIFT;
371         return 2;
372       }else{
373         *tokenType = TK_GT;
374         return 1;
375       }
376     }
377     case CC_BANG: {
378       if( z[1]!='=' ){
379         *tokenType = TK_ILLEGAL;
380         return 1;
381       }else{
382         *tokenType = TK_NE;
383         return 2;
384       }
385     }
386     case CC_PIPE: {
387       if( z[1]!='|' ){
388         *tokenType = TK_BITOR;
389         return 1;
390       }else{
391         *tokenType = TK_CONCAT;
392         return 2;
393       }
394     }
395     case CC_COMMA: {
396       *tokenType = TK_COMMA;
397       return 1;
398     }
399     case CC_AND: {
400       *tokenType = TK_BITAND;
401       return 1;
402     }
403     case CC_TILDA: {
404       *tokenType = TK_BITNOT;
405       return 1;
406     }
407     case CC_QUOTE: {
408       int delim = z[0];
409       testcase( delim=='`' );
410       testcase( delim=='\'' );
411       testcase( delim=='"' );
412       for(i=1; (c=z[i])!=0; i++){
413         if( c==delim ){
414           if( z[i+1]==delim ){
415             i++;
416           }else{
417             break;
418           }
419         }
420       }
421       if( c=='\'' ){
422         *tokenType = TK_STRING;
423         return i+1;
424       }else if( c!=0 ){
425         *tokenType = TK_ID;
426         return i+1;
427       }else{
428         *tokenType = TK_ILLEGAL;
429         return i;
430       }
431     }
432     case CC_DOT: {
433       if( !sqlite3Isdigit(z[1]) ){
434         *tokenType = TK_DOT;
435         return 1;
436       }
437       /* If the next character is a digit, this is a floating point
438       ** number that begins with ".".  Fall thru into the next case */
439     }
440     case CC_DIGIT: {
441       *tokenType = TK_INTEGER;
442       if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
443         for(i=3; sqlite3Isxdigit(z[i]); i++){}
444         return i;
445       }
446       for(i=0; sqlite3Isdigit(z[i]); i++){}
447       if( z[i]=='.' ){
448         i++;
449         while( sqlite3Isdigit(z[i]) ){ i++; }
450         *tokenType = TK_FLOAT;
451       }
452       if( (z[i]=='e' || z[i]=='E') &&
453            ( sqlite3Isdigit(z[i+1])
454             || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
455            )
456       ){
457         i += 2;
458         while( sqlite3Isdigit(z[i]) ){ i++; }
459         *tokenType = TK_FLOAT;
460       }
461       while( IdChar(z[i]) ){
462         *tokenType = TK_ILLEGAL;
463         i++;
464       }
465       return i;
466     }
467     case CC_QUOTE2: {
468       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
469       *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
470       return i;
471     }
472     case CC_VARNUM: {
473       *tokenType = TK_VARIABLE;
474       for(i=1; sqlite3Isdigit(z[i]); i++){}
475       return i;
476     }
477     case CC_DOLLAR:
478     case CC_VARALPHA: {
479       int n = 0;
480       testcase( z[0]=='$' );  testcase( z[0]=='@' );
481       testcase( z[0]==':' );  testcase( z[0]=='#' );
482       *tokenType = TK_VARIABLE;
483       for(i=1; (c=z[i])!=0; i++){
484         if( IdChar(c) ){
485           n++;
486         }else if( c=='(' && n>0 ){
487           do{
488             i++;
489           }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
490           if( c==')' ){
491             i++;
492           }else{
493             *tokenType = TK_ILLEGAL;
494           }
495           break;
496         }else if( c==':' && z[i+1]==':' ){
497           i++;
498         }else{
499           break;
500         }
501       }
502       if( n==0 ) *tokenType = TK_ILLEGAL;
503       return i;
504     }
505     case CC_KYWD: {
506       for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
507       if( IdChar(z[i]) ){
508         /* This token started out using characters that can appear in keywords,
509         ** but z[i] is a character not allowed within keywords, so this must
510         ** be an identifier instead */
511         i++;
512         break;
513       }
514       *tokenType = TK_ID;
515       return i;
516     }
517     case CC_X: {
518       testcase( z[0]=='x' ); testcase( z[0]=='X' );
519       if( z[1]=='\'' ){
520         *tokenType = TK_BLOB;
521         for(i=2; sqlite3Isxdigit(z[i]); i++){}
522         if( z[i]!='\'' || i%2 ){
523           *tokenType = TK_ILLEGAL;
524           while( z[i] && z[i]!='\'' ){ i++; }
525         }
526         if( z[i] ) i++;
527         return i;
528       }
529       /* If it is not a BLOB literal, then it must be an ID, since no
530       ** SQL keywords start with the letter 'x'.  Fall through */
531     }
532     case CC_ID: {
533       i = 1;
534       break;
535     }
536     default: {
537       *tokenType = TK_ILLEGAL;
538       return 1;
539     }
540   }
541   while( IdChar(z[i]) ){ i++; }
542   *tokenType = TK_ID;
543   return i;
544 }
545 
546 char *sqlite3_normalize(const char *zSql){
547   char *z;              /* The output string */
548   sqlite3_int64 nZ;     /* Size of the output string in bytes */
549   sqlite3_int64 nSql;   /* Size of the input string in bytes */
550   int i;                /* Next character to read from zSql[] */
551   int j;                /* Next slot to fill in on z[] */
552   int tokenType;        /* Type of the next token */
553   int n;                /* Size of the next token */
554   int k;                /* Loop counter */
555 
556   nSql = strlen(zSql);
557   nZ = nSql;
558   z = sqlite3_malloc64( nZ+2 );
559   if( z==0 ) return 0;
560   for(i=j=0; zSql[i]; i += n){
561     n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
562     switch( tokenType ){
563       case TK_SPACE: {
564         break;
565       }
566       case TK_ERROR: {
567         sqlite3_free(z);
568         return 0;
569       }
570       case TK_LITERAL: {
571         z[j++] = '?';
572         break;
573       }
574       case TK_PUNCT:
575       case TK_NAME: {
576         if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){
577           if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3]))
578            || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4]))
579           ){
580             /* NULL is a keyword in this case, not a literal value */
581           }else{
582             /* Here the NULL is a literal value */
583             z[j++] = '?';
584             break;
585           }
586         }
587         if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
588         for(k=0; k<n; k++){
589           z[j++] = sqlite3Tolower(zSql[i+k]);
590         }
591         break;
592       }
593     }
594   }
595   while( j>0 && z[j-1]==' ' ){ j--; }
596   if( j>0 && z[j-1]!=';' ){ z[j++] = ';'; }
597   z[j] = 0;
598 
599   /* Make a second pass converting "in(...)" where the "..." is not a
600   ** SELECT statement into "in(?,?,?)" */
601   for(i=0; i<j; i=n){
602     char *zIn = strstr(z+i, "in(");
603     int nParen;
604     if( zIn==0 ) break;
605     n = (int)(zIn-z)+3;  /* Index of first char past "in(" */
606     if( n && IdChar(zIn[-1]) ) continue;
607     if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue;
608     if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue;
609     for(nParen=1, k=0; z[n+k]; k++){
610       if( z[n+k]=='(' ) nParen++;
611       if( z[n+k]==')' ){
612         nParen--;
613         if( nParen==0 ) break;
614       }
615     }
616     /* k is the number of bytes in the "..." within "in(...)" */
617     if( k<5 ){
618       z = sqlite3_realloc64(z, j+(5-k)+1);
619       if( z==0 ) return 0;
620       memmove(z+n+5, z+n+k, j-(n+k));
621     }else if( k>5 ){
622       memmove(z+n+5, z+n+k, j-(n+k));
623     }
624     j = j-k+5;
625     z[j] = 0;
626     memcpy(z+n, "?,?,?", 5);
627   }
628   return z;
629 }
630 
631 /*
632 ** For testing purposes, or to build a stand-alone SQL normalizer program,
633 ** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link
634 ** it against any SQLite library.  The resulting command-line program will
635 ** run sqlite3_normalize() over the text of all files named on the command-
636 ** line and show the result on standard output.
637 */
638 #ifdef SQLITE_NORMALIZE_CLI
639 #include <stdio.h>
640 #include <stdlib.h>
641 
642 /*
643 ** Break zIn up into separate SQL statements and run sqlite3_normalize()
644 ** on each one.  Print the result of each run.
645 */
646 static void normalizeFile(char *zIn){
647   int i;
648   if( zIn==0 ) return;
649   for(i=0; zIn[i]; i++){
650     char cSaved;
651     if( zIn[i]!=';' ) continue;
652     cSaved = zIn[i+1];
653     zIn[i+1] = 0;
654     if( sqlite3_complete(zIn) ){
655       char *zOut = sqlite3_normalize(zIn);
656       if( zOut ){
657         printf("%s\n", zOut);
658         sqlite3_free(zOut);
659       }else{
660         fprintf(stderr, "ERROR: %s\n", zIn);
661       }
662       zIn[i+1] = cSaved;
663       zIn += i+1;
664       i = -1;
665     }else{
666       zIn[i+1] = cSaved;
667     }
668   }
669 }
670 
671 /*
672 ** The main routine for "sql_normalize".  Read files named on the
673 ** command-line and run the text of each through sqlite3_normalize().
674 */
675 int main(int argc, char **argv){
676   int i;
677   FILE *in;
678   char *zBuf = 0;
679   sqlite3_int64 sz, got;
680 
681   for(i=1; i<argc; i++){
682     in = fopen(argv[i], "rb");
683     if( in==0 ){
684       fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
685       continue;
686     }
687     fseek(in, 0, SEEK_END);
688     sz = ftell(in);
689     rewind(in);
690     zBuf = sqlite3_realloc64(zBuf, sz+1);
691     if( zBuf==0 ){
692       fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
693       exit(1);
694     }
695     got = fread(zBuf, 1, sz, in);
696     fclose(in);
697     if( got!=sz ){
698       fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
699               got, sz, argv[i]);
700     }else{
701       zBuf[got] = 0;
702       normalizeFile(zBuf);
703     }
704   }
705   sqlite3_free(zBuf);
706 }
707 #endif /* SQLITE_NORMALIZE_CLI */
708