1a41f2855Sdrh /*
2a41f2855Sdrh ** 2018-01-08
3a41f2855Sdrh **
4a41f2855Sdrh ** The author disclaims copyright to this source code. In place of
5a41f2855Sdrh ** a legal notice, here is a blessing:
6a41f2855Sdrh **
7a41f2855Sdrh ** May you do good and not evil.
8a41f2855Sdrh ** May you find forgiveness for yourself and forgive others.
9a41f2855Sdrh ** May you share freely, never taking more than you give.
10a41f2855Sdrh **
11a41f2855Sdrh ******************************************************************************
12a41f2855Sdrh **
13a41f2855Sdrh ** This file contains code to implement the sqlite3_normalize() function.
14a41f2855Sdrh **
15a41f2855Sdrh ** char *sqlite3_normalize(const char *zSql);
16a41f2855Sdrh **
17a41f2855Sdrh ** This function takes an SQL string as input and returns a "normalized"
18a41f2855Sdrh ** version of that string in memory obtained from sqlite3_malloc64(). The
19a41f2855Sdrh ** caller is responsible for ensuring that the returned memory is freed.
20a41f2855Sdrh **
21a41f2855Sdrh ** If a memory allocation error occurs, this routine returns NULL.
22a41f2855Sdrh **
23a41f2855Sdrh ** The normalization consists of the following transformations:
24a41f2855Sdrh **
25a41f2855Sdrh ** (1) Convert every literal (string, blob literal, numeric constant,
26a41f2855Sdrh ** or "NULL" constant) into a ?
27a41f2855Sdrh **
28a41f2855Sdrh ** (2) Remove all superfluous whitespace, including comments. Change
29a41f2855Sdrh ** all required whitespace to a single space character.
30a41f2855Sdrh **
31a41f2855Sdrh ** (3) Lowercase all ASCII characters.
32a41f2855Sdrh **
33a41f2855Sdrh ** (4) If an IN or NOT IN operator is followed by a list of 1 or more
34a41f2855Sdrh ** values, convert that list into "(?,?,?)".
35a41f2855Sdrh **
36a41f2855Sdrh ** The purpose of normalization is two-fold:
37a41f2855Sdrh **
38ef424812Sdrh ** (1) Sanitize queries by removing potentially private or sensitive
39ef424812Sdrh ** information contained in literals.
40a41f2855Sdrh **
41a41f2855Sdrh ** (2) Identify structurally identical queries by comparing their
42a41f2855Sdrh ** normalized forms.
43ef424812Sdrh **
44ef424812Sdrh ** Command-Line Utility
45ef424812Sdrh ** --------------------
46ef424812Sdrh **
47ef424812Sdrh ** This file also contains code for a command-line utility that converts
48ef424812Sdrh ** SQL queries in text files into their normalized forms. To build the
49ef424812Sdrh ** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI
50ef424812Sdrh ** and link it against the SQLite library.
51a41f2855Sdrh */
52a41f2855Sdrh #include <sqlite3.h>
53a41f2855Sdrh #include <string.h>
54a41f2855Sdrh
55a41f2855Sdrh /*
56a41f2855Sdrh ** Implementation note:
57a41f2855Sdrh **
58a41f2855Sdrh ** Much of the tokenizer logic is copied out of the tokenize.c source file
59ef424812Sdrh ** of SQLite. That logic could be simplified for this particular application,
60a41f2855Sdrh ** but that would impose a risk of introducing subtle errors. It is best to
61a41f2855Sdrh ** keep the code as close to the original as possible.
62ef424812Sdrh **
63ef424812Sdrh ** The tokenize code is in sync with the SQLite core as of 2018-01-08.
64ef424812Sdrh ** Any future changes to the core tokenizer might require corresponding
65ef424812Sdrh ** adjustments to the tokenizer logic in this module.
66a41f2855Sdrh */
67a41f2855Sdrh
68a41f2855Sdrh
69a41f2855Sdrh /* Character classes for tokenizing
70a41f2855Sdrh **
71a41f2855Sdrh ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
72a41f2855Sdrh ** using a lookup table, whereas a switch() directly on c uses a binary search.
73a41f2855Sdrh ** The lookup table is much faster. To maximize speed, and to ensure that
74a41f2855Sdrh ** a lookup table is used, all of the classes need to be small integers and
75a41f2855Sdrh ** all of them need to be used within the switch.
76a41f2855Sdrh */
77a41f2855Sdrh #define CC_X 0 /* The letter 'x', or start of BLOB literal */
78a41f2855Sdrh #define CC_KYWD 1 /* Alphabetics or '_'. Usable in a keyword */
79a41f2855Sdrh #define CC_ID 2 /* unicode characters usable in IDs */
80a41f2855Sdrh #define CC_DIGIT 3 /* Digits */
81a41f2855Sdrh #define CC_DOLLAR 4 /* '$' */
82a41f2855Sdrh #define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */
83a41f2855Sdrh #define CC_VARNUM 6 /* '?'. Numeric SQL variables */
84a41f2855Sdrh #define CC_SPACE 7 /* Space characters */
85a41f2855Sdrh #define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */
86a41f2855Sdrh #define CC_QUOTE2 9 /* '['. [...] style quoted ids */
87a41f2855Sdrh #define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */
88a41f2855Sdrh #define CC_MINUS 11 /* '-'. Minus or SQL-style comment */
89a41f2855Sdrh #define CC_LT 12 /* '<'. Part of < or <= or <> */
90a41f2855Sdrh #define CC_GT 13 /* '>'. Part of > or >= */
91a41f2855Sdrh #define CC_EQ 14 /* '='. Part of = or == */
92a41f2855Sdrh #define CC_BANG 15 /* '!'. Part of != */
93a41f2855Sdrh #define CC_SLASH 16 /* '/'. / or c-style comment */
94a41f2855Sdrh #define CC_LP 17 /* '(' */
95a41f2855Sdrh #define CC_RP 18 /* ')' */
96a41f2855Sdrh #define CC_SEMI 19 /* ';' */
97a41f2855Sdrh #define CC_PLUS 20 /* '+' */
98a41f2855Sdrh #define CC_STAR 21 /* '*' */
99a41f2855Sdrh #define CC_PERCENT 22 /* '%' */
100a41f2855Sdrh #define CC_COMMA 23 /* ',' */
101a41f2855Sdrh #define CC_AND 24 /* '&' */
102a41f2855Sdrh #define CC_TILDA 25 /* '~' */
103a41f2855Sdrh #define CC_DOT 26 /* '.' */
104a41f2855Sdrh #define CC_ILLEGAL 27 /* Illegal character */
105a41f2855Sdrh
106a41f2855Sdrh static const unsigned char aiClass[] = {
107a41f2855Sdrh /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */
108a41f2855Sdrh /* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27,
109a41f2855Sdrh /* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
110a41f2855Sdrh /* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16,
111a41f2855Sdrh /* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6,
112a41f2855Sdrh /* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113a41f2855Sdrh /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 9, 27, 27, 27, 1,
114a41f2855Sdrh /* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
115a41f2855Sdrh /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 10, 27, 25, 27,
116a41f2855Sdrh /* 8x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
117a41f2855Sdrh /* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
118a41f2855Sdrh /* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
119a41f2855Sdrh /* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
120a41f2855Sdrh /* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
121a41f2855Sdrh /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
122a41f2855Sdrh /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
123a41f2855Sdrh /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
124a41f2855Sdrh };
125a41f2855Sdrh
126a41f2855Sdrh /* An array to map all upper-case characters into their corresponding
127a41f2855Sdrh ** lower-case character.
128a41f2855Sdrh **
129a41f2855Sdrh ** SQLite only considers US-ASCII (or EBCDIC) characters. We do not
130a41f2855Sdrh ** handle case conversions for the UTF character set since the tables
131a41f2855Sdrh ** involved are nearly as big or bigger than SQLite itself.
132a41f2855Sdrh */
13391694dbdSdrh static const unsigned char sqlite3UpperToLower[] = {
134a41f2855Sdrh 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
135a41f2855Sdrh 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
136a41f2855Sdrh 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
137a41f2855Sdrh 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
138a41f2855Sdrh 104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
139a41f2855Sdrh 122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
140a41f2855Sdrh 108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
141a41f2855Sdrh 126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
142a41f2855Sdrh 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
143a41f2855Sdrh 162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
144a41f2855Sdrh 180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
145a41f2855Sdrh 198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
146a41f2855Sdrh 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
147a41f2855Sdrh 234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
148a41f2855Sdrh 252,253,254,255
149a41f2855Sdrh };
150a41f2855Sdrh
151a41f2855Sdrh /*
152a41f2855Sdrh ** The following 256 byte lookup table is used to support SQLites built-in
153a41f2855Sdrh ** equivalents to the following standard library functions:
154a41f2855Sdrh **
155a41f2855Sdrh ** isspace() 0x01
156a41f2855Sdrh ** isalpha() 0x02
157a41f2855Sdrh ** isdigit() 0x04
158a41f2855Sdrh ** isalnum() 0x06
159a41f2855Sdrh ** isxdigit() 0x08
160a41f2855Sdrh ** toupper() 0x20
161a41f2855Sdrh ** SQLite identifier character 0x40
162a41f2855Sdrh ** Quote character 0x80
163a41f2855Sdrh **
164a41f2855Sdrh ** Bit 0x20 is set if the mapped character requires translation to upper
165a41f2855Sdrh ** case. i.e. if the character is a lower-case ASCII character.
166a41f2855Sdrh ** If x is a lower-case ASCII character, then its upper-case equivalent
167a41f2855Sdrh ** is (x - 0x20). Therefore toupper() can be implemented as:
168a41f2855Sdrh **
169a41f2855Sdrh ** (x & ~(map[x]&0x20))
170a41f2855Sdrh **
171a41f2855Sdrh ** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
172a41f2855Sdrh ** array. tolower() is used more often than toupper() by SQLite.
173a41f2855Sdrh **
174a41f2855Sdrh ** Bit 0x40 is set if the character is non-alphanumeric and can be used in an
175a41f2855Sdrh ** SQLite identifier. Identifiers are alphanumerics, "_", "$", and any
176a41f2855Sdrh ** non-ASCII UTF character. Hence the test for whether or not a character is
177a41f2855Sdrh ** part of an identifier is 0x46.
178a41f2855Sdrh */
17991694dbdSdrh static const unsigned char sqlite3CtypeMap[256] = {
180a41f2855Sdrh 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00..07 ........ */
181a41f2855Sdrh 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 08..0f ........ */
182a41f2855Sdrh 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10..17 ........ */
183a41f2855Sdrh 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18..1f ........ */
184a41f2855Sdrh 0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80, /* 20..27 !"#$%&' */
185a41f2855Sdrh 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 28..2f ()*+,-./ */
186a41f2855Sdrh 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 30..37 01234567 */
187a41f2855Sdrh 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 38..3f 89:;<=>? */
188a41f2855Sdrh
189a41f2855Sdrh 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02, /* 40..47 @ABCDEFG */
190a41f2855Sdrh 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 48..4f HIJKLMNO */
191a41f2855Sdrh 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 50..57 PQRSTUVW */
192a41f2855Sdrh 0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40, /* 58..5f XYZ[\]^_ */
193a41f2855Sdrh 0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22, /* 60..67 `abcdefg */
194a41f2855Sdrh 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 68..6f hijklmno */
195a41f2855Sdrh 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 70..77 pqrstuvw */
196a41f2855Sdrh 0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, /* 78..7f xyz{|}~. */
197a41f2855Sdrh
198a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 80..87 ........ */
199a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 88..8f ........ */
200a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 90..97 ........ */
201a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 98..9f ........ */
202a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a0..a7 ........ */
203a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a8..af ........ */
204a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b0..b7 ........ */
205a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b8..bf ........ */
206a41f2855Sdrh
207a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c0..c7 ........ */
208a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c8..cf ........ */
209a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d0..d7 ........ */
210a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d8..df ........ */
211a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e0..e7 ........ */
212a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e8..ef ........ */
213a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* f0..f7 ........ */
214a41f2855Sdrh 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* f8..ff ........ */
215a41f2855Sdrh };
216a41f2855Sdrh #define sqlite3Toupper(x) ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
217a41f2855Sdrh #define sqlite3Isspace(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
218a41f2855Sdrh #define sqlite3Isalnum(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
219a41f2855Sdrh #define sqlite3Isalpha(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
220a41f2855Sdrh #define sqlite3Isdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
221a41f2855Sdrh #define sqlite3Isxdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
222a41f2855Sdrh #define sqlite3Tolower(x) (sqlite3UpperToLower[(unsigned char)(x)])
223a41f2855Sdrh #define sqlite3Isquote(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
224a41f2855Sdrh
225a41f2855Sdrh
226a41f2855Sdrh /*
227a41f2855Sdrh ** If X is a character that can be used in an identifier then
228a41f2855Sdrh ** IdChar(X) will be true. Otherwise it is false.
229a41f2855Sdrh **
230a41f2855Sdrh ** For ASCII, any character with the high-order bit set is
231a41f2855Sdrh ** allowed in an identifier. For 7-bit characters,
232a41f2855Sdrh ** sqlite3IsIdChar[X] must be 1.
233a41f2855Sdrh **
234a41f2855Sdrh ** For EBCDIC, the rules are more complex but have the same
235a41f2855Sdrh ** end result.
236a41f2855Sdrh **
237a41f2855Sdrh ** Ticket #1066. the SQL standard does not allow '$' in the
238a41f2855Sdrh ** middle of identifiers. But many SQL implementations do.
239a41f2855Sdrh ** SQLite will allow '$' in identifiers for compatibility.
240a41f2855Sdrh ** But the feature is undocumented.
241a41f2855Sdrh */
242a41f2855Sdrh #define IdChar(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
243a41f2855Sdrh
244a41f2855Sdrh /*
245a41f2855Sdrh ** Ignore testcase() macros
246a41f2855Sdrh */
247a41f2855Sdrh #define testcase(X)
248a41f2855Sdrh
249a41f2855Sdrh /*
250a41f2855Sdrh ** Token values
251a41f2855Sdrh */
252a41f2855Sdrh #define TK_SPACE 0
253a41f2855Sdrh #define TK_NAME 1
254a41f2855Sdrh #define TK_LITERAL 2
255a41f2855Sdrh #define TK_PUNCT 3
256a41f2855Sdrh #define TK_ERROR 4
257a41f2855Sdrh
258a41f2855Sdrh #define TK_MINUS TK_PUNCT
259a41f2855Sdrh #define TK_LP TK_PUNCT
260a41f2855Sdrh #define TK_RP TK_PUNCT
261a41f2855Sdrh #define TK_SEMI TK_PUNCT
262a41f2855Sdrh #define TK_PLUS TK_PUNCT
263a41f2855Sdrh #define TK_STAR TK_PUNCT
264a41f2855Sdrh #define TK_SLASH TK_PUNCT
265a41f2855Sdrh #define TK_REM TK_PUNCT
266a41f2855Sdrh #define TK_EQ TK_PUNCT
267a41f2855Sdrh #define TK_LE TK_PUNCT
268a41f2855Sdrh #define TK_NE TK_PUNCT
269a41f2855Sdrh #define TK_LSHIFT TK_PUNCT
270a41f2855Sdrh #define TK_LT TK_PUNCT
271a41f2855Sdrh #define TK_GE TK_PUNCT
272a41f2855Sdrh #define TK_RSHIFT TK_PUNCT
273a41f2855Sdrh #define TK_GT TK_PUNCT
274a41f2855Sdrh #define TK_GE TK_PUNCT
275a41f2855Sdrh #define TK_BITOR TK_PUNCT
276a41f2855Sdrh #define TK_CONCAT TK_PUNCT
277a41f2855Sdrh #define TK_COMMA TK_PUNCT
278a41f2855Sdrh #define TK_BITAND TK_PUNCT
279a41f2855Sdrh #define TK_BITNOT TK_PUNCT
280a41f2855Sdrh #define TK_STRING TK_LITERAL
281a41f2855Sdrh #define TK_ID TK_NAME
282a41f2855Sdrh #define TK_ILLEGAL TK_ERROR
283a41f2855Sdrh #define TK_DOT TK_PUNCT
284a41f2855Sdrh #define TK_INTEGER TK_LITERAL
285a41f2855Sdrh #define TK_FLOAT TK_LITERAL
286a41f2855Sdrh #define TK_VARIABLE TK_LITERAL
287a41f2855Sdrh #define TK_BLOB TK_LITERAL
288a41f2855Sdrh
289*08b92086Sdrh /* Disable nuisence warnings about case fall-through */
290*08b92086Sdrh #if !defined(deliberate_fall_through) && defined(__GCC__) && __GCC__>=7
291*08b92086Sdrh # define deliberate_fall_through __attribute__((fallthrough));
292*08b92086Sdrh #else
293*08b92086Sdrh # define deliberate_fall_through
294*08b92086Sdrh #endif
295*08b92086Sdrh
296a41f2855Sdrh /*
297a41f2855Sdrh ** Return the length (in bytes) of the token that begins at z[0].
298a41f2855Sdrh ** Store the token type in *tokenType before returning.
299a41f2855Sdrh */
sqlite3GetToken(const unsigned char * z,int * tokenType)300a41f2855Sdrh static int sqlite3GetToken(const unsigned char *z, int *tokenType){
301a41f2855Sdrh int i, c;
302a41f2855Sdrh switch( aiClass[*z] ){ /* Switch on the character-class of the first byte
303a41f2855Sdrh ** of the token. See the comment on the CC_ defines
304a41f2855Sdrh ** above. */
305a41f2855Sdrh case CC_SPACE: {
306a41f2855Sdrh for(i=1; sqlite3Isspace(z[i]); i++){}
307a41f2855Sdrh *tokenType = TK_SPACE;
308a41f2855Sdrh return i;
309a41f2855Sdrh }
310a41f2855Sdrh case CC_MINUS: {
311a41f2855Sdrh if( z[1]=='-' ){
312a41f2855Sdrh for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
313a41f2855Sdrh *tokenType = TK_SPACE;
314a41f2855Sdrh return i;
315a41f2855Sdrh }
316a41f2855Sdrh *tokenType = TK_MINUS;
317a41f2855Sdrh return 1;
318a41f2855Sdrh }
319a41f2855Sdrh case CC_LP: {
320a41f2855Sdrh *tokenType = TK_LP;
321a41f2855Sdrh return 1;
322a41f2855Sdrh }
323a41f2855Sdrh case CC_RP: {
324a41f2855Sdrh *tokenType = TK_RP;
325a41f2855Sdrh return 1;
326a41f2855Sdrh }
327a41f2855Sdrh case CC_SEMI: {
328a41f2855Sdrh *tokenType = TK_SEMI;
329a41f2855Sdrh return 1;
330a41f2855Sdrh }
331a41f2855Sdrh case CC_PLUS: {
332a41f2855Sdrh *tokenType = TK_PLUS;
333a41f2855Sdrh return 1;
334a41f2855Sdrh }
335a41f2855Sdrh case CC_STAR: {
336a41f2855Sdrh *tokenType = TK_STAR;
337a41f2855Sdrh return 1;
338a41f2855Sdrh }
339a41f2855Sdrh case CC_SLASH: {
340a41f2855Sdrh if( z[1]!='*' || z[2]==0 ){
341a41f2855Sdrh *tokenType = TK_SLASH;
342a41f2855Sdrh return 1;
343a41f2855Sdrh }
344a41f2855Sdrh for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
345a41f2855Sdrh if( c ) i++;
346a41f2855Sdrh *tokenType = TK_SPACE;
347a41f2855Sdrh return i;
348a41f2855Sdrh }
349a41f2855Sdrh case CC_PERCENT: {
350a41f2855Sdrh *tokenType = TK_REM;
351a41f2855Sdrh return 1;
352a41f2855Sdrh }
353a41f2855Sdrh case CC_EQ: {
354a41f2855Sdrh *tokenType = TK_EQ;
355a41f2855Sdrh return 1 + (z[1]=='=');
356a41f2855Sdrh }
357a41f2855Sdrh case CC_LT: {
358a41f2855Sdrh if( (c=z[1])=='=' ){
359a41f2855Sdrh *tokenType = TK_LE;
360a41f2855Sdrh return 2;
361a41f2855Sdrh }else if( c=='>' ){
362a41f2855Sdrh *tokenType = TK_NE;
363a41f2855Sdrh return 2;
364a41f2855Sdrh }else if( c=='<' ){
365a41f2855Sdrh *tokenType = TK_LSHIFT;
366a41f2855Sdrh return 2;
367a41f2855Sdrh }else{
368a41f2855Sdrh *tokenType = TK_LT;
369a41f2855Sdrh return 1;
370a41f2855Sdrh }
371a41f2855Sdrh }
372a41f2855Sdrh case CC_GT: {
373a41f2855Sdrh if( (c=z[1])=='=' ){
374a41f2855Sdrh *tokenType = TK_GE;
375a41f2855Sdrh return 2;
376a41f2855Sdrh }else if( c=='>' ){
377a41f2855Sdrh *tokenType = TK_RSHIFT;
378a41f2855Sdrh return 2;
379a41f2855Sdrh }else{
380a41f2855Sdrh *tokenType = TK_GT;
381a41f2855Sdrh return 1;
382a41f2855Sdrh }
383a41f2855Sdrh }
384a41f2855Sdrh case CC_BANG: {
385a41f2855Sdrh if( z[1]!='=' ){
386a41f2855Sdrh *tokenType = TK_ILLEGAL;
387a41f2855Sdrh return 1;
388a41f2855Sdrh }else{
389a41f2855Sdrh *tokenType = TK_NE;
390a41f2855Sdrh return 2;
391a41f2855Sdrh }
392a41f2855Sdrh }
393a41f2855Sdrh case CC_PIPE: {
394a41f2855Sdrh if( z[1]!='|' ){
395a41f2855Sdrh *tokenType = TK_BITOR;
396a41f2855Sdrh return 1;
397a41f2855Sdrh }else{
398a41f2855Sdrh *tokenType = TK_CONCAT;
399a41f2855Sdrh return 2;
400a41f2855Sdrh }
401a41f2855Sdrh }
402a41f2855Sdrh case CC_COMMA: {
403a41f2855Sdrh *tokenType = TK_COMMA;
404a41f2855Sdrh return 1;
405a41f2855Sdrh }
406a41f2855Sdrh case CC_AND: {
407a41f2855Sdrh *tokenType = TK_BITAND;
408a41f2855Sdrh return 1;
409a41f2855Sdrh }
410a41f2855Sdrh case CC_TILDA: {
411a41f2855Sdrh *tokenType = TK_BITNOT;
412a41f2855Sdrh return 1;
413a41f2855Sdrh }
414a41f2855Sdrh case CC_QUOTE: {
415a41f2855Sdrh int delim = z[0];
416a41f2855Sdrh testcase( delim=='`' );
417a41f2855Sdrh testcase( delim=='\'' );
418a41f2855Sdrh testcase( delim=='"' );
419a41f2855Sdrh for(i=1; (c=z[i])!=0; i++){
420a41f2855Sdrh if( c==delim ){
421a41f2855Sdrh if( z[i+1]==delim ){
422a41f2855Sdrh i++;
423a41f2855Sdrh }else{
424a41f2855Sdrh break;
425a41f2855Sdrh }
426a41f2855Sdrh }
427a41f2855Sdrh }
428a41f2855Sdrh if( c=='\'' ){
429a41f2855Sdrh *tokenType = TK_STRING;
430a41f2855Sdrh return i+1;
431a41f2855Sdrh }else if( c!=0 ){
432a41f2855Sdrh *tokenType = TK_ID;
433a41f2855Sdrh return i+1;
434a41f2855Sdrh }else{
435a41f2855Sdrh *tokenType = TK_ILLEGAL;
436a41f2855Sdrh return i;
437a41f2855Sdrh }
438a41f2855Sdrh }
439a41f2855Sdrh case CC_DOT: {
440a41f2855Sdrh if( !sqlite3Isdigit(z[1]) ){
441a41f2855Sdrh *tokenType = TK_DOT;
442a41f2855Sdrh return 1;
443a41f2855Sdrh }
444a41f2855Sdrh /* If the next character is a digit, this is a floating point
445a41f2855Sdrh ** number that begins with ".". Fall thru into the next case */
446*08b92086Sdrh /* no break */ deliberate_fall_through
447a41f2855Sdrh }
448a41f2855Sdrh case CC_DIGIT: {
449a41f2855Sdrh *tokenType = TK_INTEGER;
450a41f2855Sdrh if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
451a41f2855Sdrh for(i=3; sqlite3Isxdigit(z[i]); i++){}
452a41f2855Sdrh return i;
453a41f2855Sdrh }
454a41f2855Sdrh for(i=0; sqlite3Isdigit(z[i]); i++){}
455a41f2855Sdrh if( z[i]=='.' ){
456a41f2855Sdrh i++;
457a41f2855Sdrh while( sqlite3Isdigit(z[i]) ){ i++; }
458a41f2855Sdrh *tokenType = TK_FLOAT;
459a41f2855Sdrh }
460a41f2855Sdrh if( (z[i]=='e' || z[i]=='E') &&
461a41f2855Sdrh ( sqlite3Isdigit(z[i+1])
462a41f2855Sdrh || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
463a41f2855Sdrh )
464a41f2855Sdrh ){
465a41f2855Sdrh i += 2;
466a41f2855Sdrh while( sqlite3Isdigit(z[i]) ){ i++; }
467a41f2855Sdrh *tokenType = TK_FLOAT;
468a41f2855Sdrh }
469a41f2855Sdrh while( IdChar(z[i]) ){
470a41f2855Sdrh *tokenType = TK_ILLEGAL;
471a41f2855Sdrh i++;
472a41f2855Sdrh }
473a41f2855Sdrh return i;
474a41f2855Sdrh }
475a41f2855Sdrh case CC_QUOTE2: {
476a41f2855Sdrh for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
477a41f2855Sdrh *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
478a41f2855Sdrh return i;
479a41f2855Sdrh }
480a41f2855Sdrh case CC_VARNUM: {
481a41f2855Sdrh *tokenType = TK_VARIABLE;
482a41f2855Sdrh for(i=1; sqlite3Isdigit(z[i]); i++){}
483a41f2855Sdrh return i;
484a41f2855Sdrh }
485a41f2855Sdrh case CC_DOLLAR:
486a41f2855Sdrh case CC_VARALPHA: {
487a41f2855Sdrh int n = 0;
488a41f2855Sdrh testcase( z[0]=='$' ); testcase( z[0]=='@' );
489a41f2855Sdrh testcase( z[0]==':' ); testcase( z[0]=='#' );
490a41f2855Sdrh *tokenType = TK_VARIABLE;
491a41f2855Sdrh for(i=1; (c=z[i])!=0; i++){
492a41f2855Sdrh if( IdChar(c) ){
493a41f2855Sdrh n++;
494a41f2855Sdrh }else if( c=='(' && n>0 ){
495a41f2855Sdrh do{
496a41f2855Sdrh i++;
497a41f2855Sdrh }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
498a41f2855Sdrh if( c==')' ){
499a41f2855Sdrh i++;
500a41f2855Sdrh }else{
501a41f2855Sdrh *tokenType = TK_ILLEGAL;
502a41f2855Sdrh }
503a41f2855Sdrh break;
504a41f2855Sdrh }else if( c==':' && z[i+1]==':' ){
505a41f2855Sdrh i++;
506a41f2855Sdrh }else{
507a41f2855Sdrh break;
508a41f2855Sdrh }
509a41f2855Sdrh }
510a41f2855Sdrh if( n==0 ) *tokenType = TK_ILLEGAL;
511a41f2855Sdrh return i;
512a41f2855Sdrh }
513a41f2855Sdrh case CC_KYWD: {
514a41f2855Sdrh for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
515a41f2855Sdrh if( IdChar(z[i]) ){
516a41f2855Sdrh /* This token started out using characters that can appear in keywords,
517a41f2855Sdrh ** but z[i] is a character not allowed within keywords, so this must
518a41f2855Sdrh ** be an identifier instead */
519a41f2855Sdrh i++;
520a41f2855Sdrh break;
521a41f2855Sdrh }
522a41f2855Sdrh *tokenType = TK_ID;
523a41f2855Sdrh return i;
524a41f2855Sdrh }
525a41f2855Sdrh case CC_X: {
526a41f2855Sdrh testcase( z[0]=='x' ); testcase( z[0]=='X' );
527a41f2855Sdrh if( z[1]=='\'' ){
528a41f2855Sdrh *tokenType = TK_BLOB;
529a41f2855Sdrh for(i=2; sqlite3Isxdigit(z[i]); i++){}
530a41f2855Sdrh if( z[i]!='\'' || i%2 ){
531a41f2855Sdrh *tokenType = TK_ILLEGAL;
532a41f2855Sdrh while( z[i] && z[i]!='\'' ){ i++; }
533a41f2855Sdrh }
534a41f2855Sdrh if( z[i] ) i++;
535a41f2855Sdrh return i;
536a41f2855Sdrh }
537a41f2855Sdrh /* If it is not a BLOB literal, then it must be an ID, since no
538a41f2855Sdrh ** SQL keywords start with the letter 'x'. Fall through */
539*08b92086Sdrh /* no break */ deliberate_fall_through
540a41f2855Sdrh }
541a41f2855Sdrh case CC_ID: {
542a41f2855Sdrh i = 1;
543a41f2855Sdrh break;
544a41f2855Sdrh }
545a41f2855Sdrh default: {
546a41f2855Sdrh *tokenType = TK_ILLEGAL;
547a41f2855Sdrh return 1;
548a41f2855Sdrh }
549a41f2855Sdrh }
550a41f2855Sdrh while( IdChar(z[i]) ){ i++; }
551a41f2855Sdrh *tokenType = TK_ID;
552a41f2855Sdrh return i;
553a41f2855Sdrh }
554a41f2855Sdrh
sqlite3_normalize(const char * zSql)555a41f2855Sdrh char *sqlite3_normalize(const char *zSql){
556a41f2855Sdrh char *z; /* The output string */
557a41f2855Sdrh sqlite3_int64 nZ; /* Size of the output string in bytes */
558a41f2855Sdrh sqlite3_int64 nSql; /* Size of the input string in bytes */
559a41f2855Sdrh int i; /* Next character to read from zSql[] */
560a41f2855Sdrh int j; /* Next slot to fill in on z[] */
561a41f2855Sdrh int tokenType; /* Type of the next token */
562a41f2855Sdrh int n; /* Size of the next token */
563a41f2855Sdrh int k; /* Loop counter */
564a41f2855Sdrh
565a41f2855Sdrh nSql = strlen(zSql);
566a41f2855Sdrh nZ = nSql;
567a41f2855Sdrh z = sqlite3_malloc64( nZ+2 );
568a41f2855Sdrh if( z==0 ) return 0;
569a41f2855Sdrh for(i=j=0; zSql[i]; i += n){
570a41f2855Sdrh n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
571a41f2855Sdrh switch( tokenType ){
572a41f2855Sdrh case TK_SPACE: {
573a41f2855Sdrh break;
574a41f2855Sdrh }
575a41f2855Sdrh case TK_ERROR: {
576a41f2855Sdrh sqlite3_free(z);
577a41f2855Sdrh return 0;
578a41f2855Sdrh }
579a41f2855Sdrh case TK_LITERAL: {
580a41f2855Sdrh z[j++] = '?';
581a41f2855Sdrh break;
582a41f2855Sdrh }
583a41f2855Sdrh case TK_PUNCT:
584a41f2855Sdrh case TK_NAME: {
585d19866bbSdrh if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){
586d19866bbSdrh if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3]))
587d19866bbSdrh || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4]))
588d19866bbSdrh ){
589d19866bbSdrh /* NULL is a keyword in this case, not a literal value */
590d19866bbSdrh }else{
591d19866bbSdrh /* Here the NULL is a literal value */
592d19866bbSdrh z[j++] = '?';
593d19866bbSdrh break;
594d19866bbSdrh }
595d19866bbSdrh }
596a41f2855Sdrh if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
597a41f2855Sdrh for(k=0; k<n; k++){
598a41f2855Sdrh z[j++] = sqlite3Tolower(zSql[i+k]);
599a41f2855Sdrh }
600a41f2855Sdrh break;
601a41f2855Sdrh }
602a41f2855Sdrh }
603a41f2855Sdrh }
604a41f2855Sdrh while( j>0 && z[j-1]==' ' ){ j--; }
6055bce7d8fSmistachkin if( j>0 && z[j-1]!=';' ){ z[j++] = ';'; }
606a41f2855Sdrh z[j] = 0;
607ef424812Sdrh
608ef424812Sdrh /* Make a second pass converting "in(...)" where the "..." is not a
609ef424812Sdrh ** SELECT statement into "in(?,?,?)" */
610ef424812Sdrh for(i=0; i<j; i=n){
611ef424812Sdrh char *zIn = strstr(z+i, "in(");
612ef424812Sdrh int nParen;
613ef424812Sdrh if( zIn==0 ) break;
614ef424812Sdrh n = (int)(zIn-z)+3; /* Index of first char past "in(" */
615ef424812Sdrh if( n && IdChar(zIn[-1]) ) continue;
616ef424812Sdrh if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue;
617ef424812Sdrh if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue;
618ef424812Sdrh for(nParen=1, k=0; z[n+k]; k++){
619ef424812Sdrh if( z[n+k]=='(' ) nParen++;
620ef424812Sdrh if( z[n+k]==')' ){
621ef424812Sdrh nParen--;
622ef424812Sdrh if( nParen==0 ) break;
623ef424812Sdrh }
624ef424812Sdrh }
625ef424812Sdrh /* k is the number of bytes in the "..." within "in(...)" */
626ef424812Sdrh if( k<5 ){
627ef424812Sdrh z = sqlite3_realloc64(z, j+(5-k)+1);
628ef424812Sdrh if( z==0 ) return 0;
629ef424812Sdrh memmove(z+n+5, z+n+k, j-(n+k));
630ef424812Sdrh }else if( k>5 ){
631ef424812Sdrh memmove(z+n+5, z+n+k, j-(n+k));
632ef424812Sdrh }
633ef424812Sdrh j = j-k+5;
634ef424812Sdrh z[j] = 0;
635ef424812Sdrh memcpy(z+n, "?,?,?", 5);
636ef424812Sdrh }
637a41f2855Sdrh return z;
638a41f2855Sdrh }
639a41f2855Sdrh
640ef424812Sdrh /*
641ef424812Sdrh ** For testing purposes, or to build a stand-alone SQL normalizer program,
642ef424812Sdrh ** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link
643ef424812Sdrh ** it against any SQLite library. The resulting command-line program will
644ef424812Sdrh ** run sqlite3_normalize() over the text of all files named on the command-
645ef424812Sdrh ** line and show the result on standard output.
646ef424812Sdrh */
647ef424812Sdrh #ifdef SQLITE_NORMALIZE_CLI
648a41f2855Sdrh #include <stdio.h>
649a41f2855Sdrh #include <stdlib.h>
650a41f2855Sdrh
651ef424812Sdrh /*
652ef424812Sdrh ** Break zIn up into separate SQL statements and run sqlite3_normalize()
653ef424812Sdrh ** on each one. Print the result of each run.
654ef424812Sdrh */
normalizeFile(char * zIn)655a41f2855Sdrh static void normalizeFile(char *zIn){
656a41f2855Sdrh int i;
657a41f2855Sdrh if( zIn==0 ) return;
658a41f2855Sdrh for(i=0; zIn[i]; i++){
659a41f2855Sdrh char cSaved;
660a41f2855Sdrh if( zIn[i]!=';' ) continue;
661a41f2855Sdrh cSaved = zIn[i+1];
662a41f2855Sdrh zIn[i+1] = 0;
663a41f2855Sdrh if( sqlite3_complete(zIn) ){
664a41f2855Sdrh char *zOut = sqlite3_normalize(zIn);
665a41f2855Sdrh if( zOut ){
666a41f2855Sdrh printf("%s\n", zOut);
667a41f2855Sdrh sqlite3_free(zOut);
668a41f2855Sdrh }else{
669a41f2855Sdrh fprintf(stderr, "ERROR: %s\n", zIn);
670a41f2855Sdrh }
671a41f2855Sdrh zIn[i+1] = cSaved;
672a41f2855Sdrh zIn += i+1;
673a41f2855Sdrh i = -1;
674a41f2855Sdrh }else{
675a41f2855Sdrh zIn[i+1] = cSaved;
676a41f2855Sdrh }
677a41f2855Sdrh }
678a41f2855Sdrh }
679a41f2855Sdrh
680ef424812Sdrh /*
681ef424812Sdrh ** The main routine for "sql_normalize". Read files named on the
682ef424812Sdrh ** command-line and run the text of each through sqlite3_normalize().
683ef424812Sdrh */
main(int argc,char ** argv)684a41f2855Sdrh int main(int argc, char **argv){
685a41f2855Sdrh int i;
686a41f2855Sdrh FILE *in;
687a41f2855Sdrh char *zBuf = 0;
688a41f2855Sdrh sqlite3_int64 sz, got;
689a41f2855Sdrh
690a41f2855Sdrh for(i=1; i<argc; i++){
691a41f2855Sdrh in = fopen(argv[i], "rb");
692a41f2855Sdrh if( in==0 ){
693a41f2855Sdrh fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
694a41f2855Sdrh continue;
695a41f2855Sdrh }
696a41f2855Sdrh fseek(in, 0, SEEK_END);
697a41f2855Sdrh sz = ftell(in);
698a41f2855Sdrh rewind(in);
699a41f2855Sdrh zBuf = sqlite3_realloc64(zBuf, sz+1);
700a41f2855Sdrh if( zBuf==0 ){
701a41f2855Sdrh fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
702a41f2855Sdrh exit(1);
703a41f2855Sdrh }
704a41f2855Sdrh got = fread(zBuf, 1, sz, in);
705a41f2855Sdrh fclose(in);
706a41f2855Sdrh if( got!=sz ){
707a41f2855Sdrh fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
708a41f2855Sdrh got, sz, argv[i]);
709a41f2855Sdrh }else{
710a41f2855Sdrh zBuf[got] = 0;
711a41f2855Sdrh normalizeFile(zBuf);
712a41f2855Sdrh }
713a41f2855Sdrh }
714a41f2855Sdrh sqlite3_free(zBuf);
715a41f2855Sdrh }
716ef424812Sdrh #endif /* SQLITE_NORMALIZE_CLI */
717