xref: /sqlite-3.40.0/ext/misc/normalize.c (revision 08b92086)
1a41f2855Sdrh /*
2a41f2855Sdrh ** 2018-01-08
3a41f2855Sdrh **
4a41f2855Sdrh ** The author disclaims copyright to this source code.  In place of
5a41f2855Sdrh ** a legal notice, here is a blessing:
6a41f2855Sdrh **
7a41f2855Sdrh **    May you do good and not evil.
8a41f2855Sdrh **    May you find forgiveness for yourself and forgive others.
9a41f2855Sdrh **    May you share freely, never taking more than you give.
10a41f2855Sdrh **
11a41f2855Sdrh ******************************************************************************
12a41f2855Sdrh **
13a41f2855Sdrh ** This file contains code to implement the sqlite3_normalize() function.
14a41f2855Sdrh **
15a41f2855Sdrh **    char *sqlite3_normalize(const char *zSql);
16a41f2855Sdrh **
17a41f2855Sdrh ** This function takes an SQL string as input and returns a "normalized"
18a41f2855Sdrh ** version of that string in memory obtained from sqlite3_malloc64().  The
19a41f2855Sdrh ** caller is responsible for ensuring that the returned memory is freed.
20a41f2855Sdrh **
21a41f2855Sdrh ** If a memory allocation error occurs, this routine returns NULL.
22a41f2855Sdrh **
23a41f2855Sdrh ** The normalization consists of the following transformations:
24a41f2855Sdrh **
25a41f2855Sdrh **   (1)  Convert every literal (string, blob literal, numeric constant,
26a41f2855Sdrh **        or "NULL" constant) into a ?
27a41f2855Sdrh **
28a41f2855Sdrh **   (2)  Remove all superfluous whitespace, including comments.  Change
29a41f2855Sdrh **        all required whitespace to a single space character.
30a41f2855Sdrh **
31a41f2855Sdrh **   (3)  Lowercase all ASCII characters.
32a41f2855Sdrh **
33a41f2855Sdrh **   (4)  If an IN or NOT IN operator is followed by a list of 1 or more
34a41f2855Sdrh **        values, convert that list into "(?,?,?)".
35a41f2855Sdrh **
36a41f2855Sdrh ** The purpose of normalization is two-fold:
37a41f2855Sdrh **
38ef424812Sdrh **   (1)  Sanitize queries by removing potentially private or sensitive
39ef424812Sdrh **        information contained in literals.
40a41f2855Sdrh **
41a41f2855Sdrh **   (2)  Identify structurally identical queries by comparing their
42a41f2855Sdrh **        normalized forms.
43ef424812Sdrh **
44ef424812Sdrh ** Command-Line Utility
45ef424812Sdrh ** --------------------
46ef424812Sdrh **
47ef424812Sdrh ** This file also contains code for a command-line utility that converts
48ef424812Sdrh ** SQL queries in text files into their normalized forms.  To build the
49ef424812Sdrh ** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI
50ef424812Sdrh ** and link it against the SQLite library.
51a41f2855Sdrh */
52a41f2855Sdrh #include <sqlite3.h>
53a41f2855Sdrh #include <string.h>
54a41f2855Sdrh 
55a41f2855Sdrh /*
56a41f2855Sdrh ** Implementation note:
57a41f2855Sdrh **
58a41f2855Sdrh ** Much of the tokenizer logic is copied out of the tokenize.c source file
59ef424812Sdrh ** of SQLite.  That logic could be simplified for this particular application,
60a41f2855Sdrh ** but that would impose a risk of introducing subtle errors.  It is best to
61a41f2855Sdrh ** keep the code as close to the original as possible.
62ef424812Sdrh **
63ef424812Sdrh ** The tokenize code is in sync with the SQLite core as of 2018-01-08.
64ef424812Sdrh ** Any future changes to the core tokenizer might require corresponding
65ef424812Sdrh ** adjustments to the tokenizer logic in this module.
66a41f2855Sdrh */
67a41f2855Sdrh 
68a41f2855Sdrh 
69a41f2855Sdrh /* Character classes for tokenizing
70a41f2855Sdrh **
71a41f2855Sdrh ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
72a41f2855Sdrh ** using a lookup table, whereas a switch() directly on c uses a binary search.
73a41f2855Sdrh ** The lookup table is much faster.  To maximize speed, and to ensure that
74a41f2855Sdrh ** a lookup table is used, all of the classes need to be small integers and
75a41f2855Sdrh ** all of them need to be used within the switch.
76a41f2855Sdrh */
77a41f2855Sdrh #define CC_X          0    /* The letter 'x', or start of BLOB literal */
78a41f2855Sdrh #define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
79a41f2855Sdrh #define CC_ID         2    /* unicode characters usable in IDs */
80a41f2855Sdrh #define CC_DIGIT      3    /* Digits */
81a41f2855Sdrh #define CC_DOLLAR     4    /* '$' */
82a41f2855Sdrh #define CC_VARALPHA   5    /* '@', '#', ':'.  Alphabetic SQL variables */
83a41f2855Sdrh #define CC_VARNUM     6    /* '?'.  Numeric SQL variables */
84a41f2855Sdrh #define CC_SPACE      7    /* Space characters */
85a41f2855Sdrh #define CC_QUOTE      8    /* '"', '\'', or '`'.  String literals, quoted ids */
86a41f2855Sdrh #define CC_QUOTE2     9    /* '['.   [...] style quoted ids */
87a41f2855Sdrh #define CC_PIPE      10    /* '|'.   Bitwise OR or concatenate */
88a41f2855Sdrh #define CC_MINUS     11    /* '-'.  Minus or SQL-style comment */
89a41f2855Sdrh #define CC_LT        12    /* '<'.  Part of < or <= or <> */
90a41f2855Sdrh #define CC_GT        13    /* '>'.  Part of > or >= */
91a41f2855Sdrh #define CC_EQ        14    /* '='.  Part of = or == */
92a41f2855Sdrh #define CC_BANG      15    /* '!'.  Part of != */
93a41f2855Sdrh #define CC_SLASH     16    /* '/'.  / or c-style comment */
94a41f2855Sdrh #define CC_LP        17    /* '(' */
95a41f2855Sdrh #define CC_RP        18    /* ')' */
96a41f2855Sdrh #define CC_SEMI      19    /* ';' */
97a41f2855Sdrh #define CC_PLUS      20    /* '+' */
98a41f2855Sdrh #define CC_STAR      21    /* '*' */
99a41f2855Sdrh #define CC_PERCENT   22    /* '%' */
100a41f2855Sdrh #define CC_COMMA     23    /* ',' */
101a41f2855Sdrh #define CC_AND       24    /* '&' */
102a41f2855Sdrh #define CC_TILDA     25    /* '~' */
103a41f2855Sdrh #define CC_DOT       26    /* '.' */
104a41f2855Sdrh #define CC_ILLEGAL   27    /* Illegal character */
105a41f2855Sdrh 
106a41f2855Sdrh static const unsigned char aiClass[] = {
107a41f2855Sdrh /*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
108a41f2855Sdrh /* 0x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
109a41f2855Sdrh /* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
110a41f2855Sdrh /* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
111a41f2855Sdrh /* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
112a41f2855Sdrh /* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
113a41f2855Sdrh /* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  9, 27, 27, 27,  1,
114a41f2855Sdrh /* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
115a41f2855Sdrh /* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 10, 27, 25, 27,
116a41f2855Sdrh /* 8x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
117a41f2855Sdrh /* 9x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
118a41f2855Sdrh /* Ax */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
119a41f2855Sdrh /* Bx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
120a41f2855Sdrh /* Cx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
121a41f2855Sdrh /* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
122a41f2855Sdrh /* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
123a41f2855Sdrh /* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
124a41f2855Sdrh };
125a41f2855Sdrh 
126a41f2855Sdrh /* An array to map all upper-case characters into their corresponding
127a41f2855Sdrh ** lower-case character.
128a41f2855Sdrh **
129a41f2855Sdrh ** SQLite only considers US-ASCII (or EBCDIC) characters.  We do not
130a41f2855Sdrh ** handle case conversions for the UTF character set since the tables
131a41f2855Sdrh ** involved are nearly as big or bigger than SQLite itself.
132a41f2855Sdrh */
13391694dbdSdrh static const unsigned char sqlite3UpperToLower[] = {
134a41f2855Sdrh       0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
135a41f2855Sdrh      18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
136a41f2855Sdrh      36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
137a41f2855Sdrh      54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
138a41f2855Sdrh     104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
139a41f2855Sdrh     122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
140a41f2855Sdrh     108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
141a41f2855Sdrh     126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
142a41f2855Sdrh     144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
143a41f2855Sdrh     162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
144a41f2855Sdrh     180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
145a41f2855Sdrh     198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
146a41f2855Sdrh     216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
147a41f2855Sdrh     234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
148a41f2855Sdrh     252,253,254,255
149a41f2855Sdrh };
150a41f2855Sdrh 
151a41f2855Sdrh /*
152a41f2855Sdrh ** The following 256 byte lookup table is used to support SQLites built-in
153a41f2855Sdrh ** equivalents to the following standard library functions:
154a41f2855Sdrh **
155a41f2855Sdrh **   isspace()                        0x01
156a41f2855Sdrh **   isalpha()                        0x02
157a41f2855Sdrh **   isdigit()                        0x04
158a41f2855Sdrh **   isalnum()                        0x06
159a41f2855Sdrh **   isxdigit()                       0x08
160a41f2855Sdrh **   toupper()                        0x20
161a41f2855Sdrh **   SQLite identifier character      0x40
162a41f2855Sdrh **   Quote character                  0x80
163a41f2855Sdrh **
164a41f2855Sdrh ** Bit 0x20 is set if the mapped character requires translation to upper
165a41f2855Sdrh ** case. i.e. if the character is a lower-case ASCII character.
166a41f2855Sdrh ** If x is a lower-case ASCII character, then its upper-case equivalent
167a41f2855Sdrh ** is (x - 0x20). Therefore toupper() can be implemented as:
168a41f2855Sdrh **
169a41f2855Sdrh **   (x & ~(map[x]&0x20))
170a41f2855Sdrh **
171a41f2855Sdrh ** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
172a41f2855Sdrh ** array. tolower() is used more often than toupper() by SQLite.
173a41f2855Sdrh **
174a41f2855Sdrh ** Bit 0x40 is set if the character is non-alphanumeric and can be used in an
175a41f2855Sdrh ** SQLite identifier.  Identifiers are alphanumerics, "_", "$", and any
176a41f2855Sdrh ** non-ASCII UTF character. Hence the test for whether or not a character is
177a41f2855Sdrh ** part of an identifier is 0x46.
178a41f2855Sdrh */
17991694dbdSdrh static const unsigned char sqlite3CtypeMap[256] = {
180a41f2855Sdrh   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 00..07    ........ */
181a41f2855Sdrh   0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,  /* 08..0f    ........ */
182a41f2855Sdrh   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 10..17    ........ */
183a41f2855Sdrh   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 18..1f    ........ */
184a41f2855Sdrh   0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80,  /* 20..27     !"#$%&' */
185a41f2855Sdrh   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 28..2f    ()*+,-./ */
186a41f2855Sdrh   0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,  /* 30..37    01234567 */
187a41f2855Sdrh   0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 38..3f    89:;<=>? */
188a41f2855Sdrh 
189a41f2855Sdrh   0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02,  /* 40..47    @ABCDEFG */
190a41f2855Sdrh   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 48..4f    HIJKLMNO */
191a41f2855Sdrh   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 50..57    PQRSTUVW */
192a41f2855Sdrh   0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40,  /* 58..5f    XYZ[\]^_ */
193a41f2855Sdrh   0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22,  /* 60..67    `abcdefg */
194a41f2855Sdrh   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 68..6f    hijklmno */
195a41f2855Sdrh   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 70..77    pqrstuvw */
196a41f2855Sdrh   0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 78..7f    xyz{|}~. */
197a41f2855Sdrh 
198a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 80..87    ........ */
199a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 88..8f    ........ */
200a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 90..97    ........ */
201a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 98..9f    ........ */
202a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a0..a7    ........ */
203a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a8..af    ........ */
204a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b0..b7    ........ */
205a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b8..bf    ........ */
206a41f2855Sdrh 
207a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c0..c7    ........ */
208a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c8..cf    ........ */
209a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d0..d7    ........ */
210a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d8..df    ........ */
211a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e0..e7    ........ */
212a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e8..ef    ........ */
213a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* f0..f7    ........ */
214a41f2855Sdrh   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40   /* f8..ff    ........ */
215a41f2855Sdrh };
216a41f2855Sdrh #define sqlite3Toupper(x)   ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
217a41f2855Sdrh #define sqlite3Isspace(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
218a41f2855Sdrh #define sqlite3Isalnum(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
219a41f2855Sdrh #define sqlite3Isalpha(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
220a41f2855Sdrh #define sqlite3Isdigit(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
221a41f2855Sdrh #define sqlite3Isxdigit(x)  (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
222a41f2855Sdrh #define sqlite3Tolower(x)   (sqlite3UpperToLower[(unsigned char)(x)])
223a41f2855Sdrh #define sqlite3Isquote(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
224a41f2855Sdrh 
225a41f2855Sdrh 
226a41f2855Sdrh /*
227a41f2855Sdrh ** If X is a character that can be used in an identifier then
228a41f2855Sdrh ** IdChar(X) will be true.  Otherwise it is false.
229a41f2855Sdrh **
230a41f2855Sdrh ** For ASCII, any character with the high-order bit set is
231a41f2855Sdrh ** allowed in an identifier.  For 7-bit characters,
232a41f2855Sdrh ** sqlite3IsIdChar[X] must be 1.
233a41f2855Sdrh **
234a41f2855Sdrh ** For EBCDIC, the rules are more complex but have the same
235a41f2855Sdrh ** end result.
236a41f2855Sdrh **
237a41f2855Sdrh ** Ticket #1066.  the SQL standard does not allow '$' in the
238a41f2855Sdrh ** middle of identifiers.  But many SQL implementations do.
239a41f2855Sdrh ** SQLite will allow '$' in identifiers for compatibility.
240a41f2855Sdrh ** But the feature is undocumented.
241a41f2855Sdrh */
242a41f2855Sdrh #define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
243a41f2855Sdrh 
244a41f2855Sdrh /*
245a41f2855Sdrh ** Ignore testcase() macros
246a41f2855Sdrh */
247a41f2855Sdrh #define testcase(X)
248a41f2855Sdrh 
249a41f2855Sdrh /*
250a41f2855Sdrh ** Token values
251a41f2855Sdrh */
252a41f2855Sdrh #define TK_SPACE    0
253a41f2855Sdrh #define TK_NAME     1
254a41f2855Sdrh #define TK_LITERAL  2
255a41f2855Sdrh #define TK_PUNCT    3
256a41f2855Sdrh #define TK_ERROR    4
257a41f2855Sdrh 
258a41f2855Sdrh #define TK_MINUS    TK_PUNCT
259a41f2855Sdrh #define TK_LP       TK_PUNCT
260a41f2855Sdrh #define TK_RP       TK_PUNCT
261a41f2855Sdrh #define TK_SEMI     TK_PUNCT
262a41f2855Sdrh #define TK_PLUS     TK_PUNCT
263a41f2855Sdrh #define TK_STAR     TK_PUNCT
264a41f2855Sdrh #define TK_SLASH    TK_PUNCT
265a41f2855Sdrh #define TK_REM      TK_PUNCT
266a41f2855Sdrh #define TK_EQ       TK_PUNCT
267a41f2855Sdrh #define TK_LE       TK_PUNCT
268a41f2855Sdrh #define TK_NE       TK_PUNCT
269a41f2855Sdrh #define TK_LSHIFT   TK_PUNCT
270a41f2855Sdrh #define TK_LT       TK_PUNCT
271a41f2855Sdrh #define TK_GE       TK_PUNCT
272a41f2855Sdrh #define TK_RSHIFT   TK_PUNCT
273a41f2855Sdrh #define TK_GT       TK_PUNCT
274a41f2855Sdrh #define TK_GE       TK_PUNCT
275a41f2855Sdrh #define TK_BITOR    TK_PUNCT
276a41f2855Sdrh #define TK_CONCAT   TK_PUNCT
277a41f2855Sdrh #define TK_COMMA    TK_PUNCT
278a41f2855Sdrh #define TK_BITAND   TK_PUNCT
279a41f2855Sdrh #define TK_BITNOT   TK_PUNCT
280a41f2855Sdrh #define TK_STRING   TK_LITERAL
281a41f2855Sdrh #define TK_ID       TK_NAME
282a41f2855Sdrh #define TK_ILLEGAL  TK_ERROR
283a41f2855Sdrh #define TK_DOT      TK_PUNCT
284a41f2855Sdrh #define TK_INTEGER  TK_LITERAL
285a41f2855Sdrh #define TK_FLOAT    TK_LITERAL
286a41f2855Sdrh #define TK_VARIABLE TK_LITERAL
287a41f2855Sdrh #define TK_BLOB     TK_LITERAL
288a41f2855Sdrh 
289*08b92086Sdrh /* Disable nuisence warnings about case fall-through */
290*08b92086Sdrh #if !defined(deliberate_fall_through) && defined(__GCC__) && __GCC__>=7
291*08b92086Sdrh # define deliberate_fall_through __attribute__((fallthrough));
292*08b92086Sdrh #else
293*08b92086Sdrh # define deliberate_fall_through
294*08b92086Sdrh #endif
295*08b92086Sdrh 
296a41f2855Sdrh /*
297a41f2855Sdrh ** Return the length (in bytes) of the token that begins at z[0].
298a41f2855Sdrh ** Store the token type in *tokenType before returning.
299a41f2855Sdrh */
sqlite3GetToken(const unsigned char * z,int * tokenType)300a41f2855Sdrh static int sqlite3GetToken(const unsigned char *z, int *tokenType){
301a41f2855Sdrh   int i, c;
302a41f2855Sdrh   switch( aiClass[*z] ){  /* Switch on the character-class of the first byte
303a41f2855Sdrh                           ** of the token. See the comment on the CC_ defines
304a41f2855Sdrh                           ** above. */
305a41f2855Sdrh     case CC_SPACE: {
306a41f2855Sdrh       for(i=1; sqlite3Isspace(z[i]); i++){}
307a41f2855Sdrh       *tokenType = TK_SPACE;
308a41f2855Sdrh       return i;
309a41f2855Sdrh     }
310a41f2855Sdrh     case CC_MINUS: {
311a41f2855Sdrh       if( z[1]=='-' ){
312a41f2855Sdrh         for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
313a41f2855Sdrh         *tokenType = TK_SPACE;
314a41f2855Sdrh         return i;
315a41f2855Sdrh       }
316a41f2855Sdrh       *tokenType = TK_MINUS;
317a41f2855Sdrh       return 1;
318a41f2855Sdrh     }
319a41f2855Sdrh     case CC_LP: {
320a41f2855Sdrh       *tokenType = TK_LP;
321a41f2855Sdrh       return 1;
322a41f2855Sdrh     }
323a41f2855Sdrh     case CC_RP: {
324a41f2855Sdrh       *tokenType = TK_RP;
325a41f2855Sdrh       return 1;
326a41f2855Sdrh     }
327a41f2855Sdrh     case CC_SEMI: {
328a41f2855Sdrh       *tokenType = TK_SEMI;
329a41f2855Sdrh       return 1;
330a41f2855Sdrh     }
331a41f2855Sdrh     case CC_PLUS: {
332a41f2855Sdrh       *tokenType = TK_PLUS;
333a41f2855Sdrh       return 1;
334a41f2855Sdrh     }
335a41f2855Sdrh     case CC_STAR: {
336a41f2855Sdrh       *tokenType = TK_STAR;
337a41f2855Sdrh       return 1;
338a41f2855Sdrh     }
339a41f2855Sdrh     case CC_SLASH: {
340a41f2855Sdrh       if( z[1]!='*' || z[2]==0 ){
341a41f2855Sdrh         *tokenType = TK_SLASH;
342a41f2855Sdrh         return 1;
343a41f2855Sdrh       }
344a41f2855Sdrh       for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
345a41f2855Sdrh       if( c ) i++;
346a41f2855Sdrh       *tokenType = TK_SPACE;
347a41f2855Sdrh       return i;
348a41f2855Sdrh     }
349a41f2855Sdrh     case CC_PERCENT: {
350a41f2855Sdrh       *tokenType = TK_REM;
351a41f2855Sdrh       return 1;
352a41f2855Sdrh     }
353a41f2855Sdrh     case CC_EQ: {
354a41f2855Sdrh       *tokenType = TK_EQ;
355a41f2855Sdrh       return 1 + (z[1]=='=');
356a41f2855Sdrh     }
357a41f2855Sdrh     case CC_LT: {
358a41f2855Sdrh       if( (c=z[1])=='=' ){
359a41f2855Sdrh         *tokenType = TK_LE;
360a41f2855Sdrh         return 2;
361a41f2855Sdrh       }else if( c=='>' ){
362a41f2855Sdrh         *tokenType = TK_NE;
363a41f2855Sdrh         return 2;
364a41f2855Sdrh       }else if( c=='<' ){
365a41f2855Sdrh         *tokenType = TK_LSHIFT;
366a41f2855Sdrh         return 2;
367a41f2855Sdrh       }else{
368a41f2855Sdrh         *tokenType = TK_LT;
369a41f2855Sdrh         return 1;
370a41f2855Sdrh       }
371a41f2855Sdrh     }
372a41f2855Sdrh     case CC_GT: {
373a41f2855Sdrh       if( (c=z[1])=='=' ){
374a41f2855Sdrh         *tokenType = TK_GE;
375a41f2855Sdrh         return 2;
376a41f2855Sdrh       }else if( c=='>' ){
377a41f2855Sdrh         *tokenType = TK_RSHIFT;
378a41f2855Sdrh         return 2;
379a41f2855Sdrh       }else{
380a41f2855Sdrh         *tokenType = TK_GT;
381a41f2855Sdrh         return 1;
382a41f2855Sdrh       }
383a41f2855Sdrh     }
384a41f2855Sdrh     case CC_BANG: {
385a41f2855Sdrh       if( z[1]!='=' ){
386a41f2855Sdrh         *tokenType = TK_ILLEGAL;
387a41f2855Sdrh         return 1;
388a41f2855Sdrh       }else{
389a41f2855Sdrh         *tokenType = TK_NE;
390a41f2855Sdrh         return 2;
391a41f2855Sdrh       }
392a41f2855Sdrh     }
393a41f2855Sdrh     case CC_PIPE: {
394a41f2855Sdrh       if( z[1]!='|' ){
395a41f2855Sdrh         *tokenType = TK_BITOR;
396a41f2855Sdrh         return 1;
397a41f2855Sdrh       }else{
398a41f2855Sdrh         *tokenType = TK_CONCAT;
399a41f2855Sdrh         return 2;
400a41f2855Sdrh       }
401a41f2855Sdrh     }
402a41f2855Sdrh     case CC_COMMA: {
403a41f2855Sdrh       *tokenType = TK_COMMA;
404a41f2855Sdrh       return 1;
405a41f2855Sdrh     }
406a41f2855Sdrh     case CC_AND: {
407a41f2855Sdrh       *tokenType = TK_BITAND;
408a41f2855Sdrh       return 1;
409a41f2855Sdrh     }
410a41f2855Sdrh     case CC_TILDA: {
411a41f2855Sdrh       *tokenType = TK_BITNOT;
412a41f2855Sdrh       return 1;
413a41f2855Sdrh     }
414a41f2855Sdrh     case CC_QUOTE: {
415a41f2855Sdrh       int delim = z[0];
416a41f2855Sdrh       testcase( delim=='`' );
417a41f2855Sdrh       testcase( delim=='\'' );
418a41f2855Sdrh       testcase( delim=='"' );
419a41f2855Sdrh       for(i=1; (c=z[i])!=0; i++){
420a41f2855Sdrh         if( c==delim ){
421a41f2855Sdrh           if( z[i+1]==delim ){
422a41f2855Sdrh             i++;
423a41f2855Sdrh           }else{
424a41f2855Sdrh             break;
425a41f2855Sdrh           }
426a41f2855Sdrh         }
427a41f2855Sdrh       }
428a41f2855Sdrh       if( c=='\'' ){
429a41f2855Sdrh         *tokenType = TK_STRING;
430a41f2855Sdrh         return i+1;
431a41f2855Sdrh       }else if( c!=0 ){
432a41f2855Sdrh         *tokenType = TK_ID;
433a41f2855Sdrh         return i+1;
434a41f2855Sdrh       }else{
435a41f2855Sdrh         *tokenType = TK_ILLEGAL;
436a41f2855Sdrh         return i;
437a41f2855Sdrh       }
438a41f2855Sdrh     }
439a41f2855Sdrh     case CC_DOT: {
440a41f2855Sdrh       if( !sqlite3Isdigit(z[1]) ){
441a41f2855Sdrh         *tokenType = TK_DOT;
442a41f2855Sdrh         return 1;
443a41f2855Sdrh       }
444a41f2855Sdrh       /* If the next character is a digit, this is a floating point
445a41f2855Sdrh       ** number that begins with ".".  Fall thru into the next case */
446*08b92086Sdrh       /* no break */ deliberate_fall_through
447a41f2855Sdrh     }
448a41f2855Sdrh     case CC_DIGIT: {
449a41f2855Sdrh       *tokenType = TK_INTEGER;
450a41f2855Sdrh       if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
451a41f2855Sdrh         for(i=3; sqlite3Isxdigit(z[i]); i++){}
452a41f2855Sdrh         return i;
453a41f2855Sdrh       }
454a41f2855Sdrh       for(i=0; sqlite3Isdigit(z[i]); i++){}
455a41f2855Sdrh       if( z[i]=='.' ){
456a41f2855Sdrh         i++;
457a41f2855Sdrh         while( sqlite3Isdigit(z[i]) ){ i++; }
458a41f2855Sdrh         *tokenType = TK_FLOAT;
459a41f2855Sdrh       }
460a41f2855Sdrh       if( (z[i]=='e' || z[i]=='E') &&
461a41f2855Sdrh            ( sqlite3Isdigit(z[i+1])
462a41f2855Sdrh             || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
463a41f2855Sdrh            )
464a41f2855Sdrh       ){
465a41f2855Sdrh         i += 2;
466a41f2855Sdrh         while( sqlite3Isdigit(z[i]) ){ i++; }
467a41f2855Sdrh         *tokenType = TK_FLOAT;
468a41f2855Sdrh       }
469a41f2855Sdrh       while( IdChar(z[i]) ){
470a41f2855Sdrh         *tokenType = TK_ILLEGAL;
471a41f2855Sdrh         i++;
472a41f2855Sdrh       }
473a41f2855Sdrh       return i;
474a41f2855Sdrh     }
475a41f2855Sdrh     case CC_QUOTE2: {
476a41f2855Sdrh       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
477a41f2855Sdrh       *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
478a41f2855Sdrh       return i;
479a41f2855Sdrh     }
480a41f2855Sdrh     case CC_VARNUM: {
481a41f2855Sdrh       *tokenType = TK_VARIABLE;
482a41f2855Sdrh       for(i=1; sqlite3Isdigit(z[i]); i++){}
483a41f2855Sdrh       return i;
484a41f2855Sdrh     }
485a41f2855Sdrh     case CC_DOLLAR:
486a41f2855Sdrh     case CC_VARALPHA: {
487a41f2855Sdrh       int n = 0;
488a41f2855Sdrh       testcase( z[0]=='$' );  testcase( z[0]=='@' );
489a41f2855Sdrh       testcase( z[0]==':' );  testcase( z[0]=='#' );
490a41f2855Sdrh       *tokenType = TK_VARIABLE;
491a41f2855Sdrh       for(i=1; (c=z[i])!=0; i++){
492a41f2855Sdrh         if( IdChar(c) ){
493a41f2855Sdrh           n++;
494a41f2855Sdrh         }else if( c=='(' && n>0 ){
495a41f2855Sdrh           do{
496a41f2855Sdrh             i++;
497a41f2855Sdrh           }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
498a41f2855Sdrh           if( c==')' ){
499a41f2855Sdrh             i++;
500a41f2855Sdrh           }else{
501a41f2855Sdrh             *tokenType = TK_ILLEGAL;
502a41f2855Sdrh           }
503a41f2855Sdrh           break;
504a41f2855Sdrh         }else if( c==':' && z[i+1]==':' ){
505a41f2855Sdrh           i++;
506a41f2855Sdrh         }else{
507a41f2855Sdrh           break;
508a41f2855Sdrh         }
509a41f2855Sdrh       }
510a41f2855Sdrh       if( n==0 ) *tokenType = TK_ILLEGAL;
511a41f2855Sdrh       return i;
512a41f2855Sdrh     }
513a41f2855Sdrh     case CC_KYWD: {
514a41f2855Sdrh       for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
515a41f2855Sdrh       if( IdChar(z[i]) ){
516a41f2855Sdrh         /* This token started out using characters that can appear in keywords,
517a41f2855Sdrh         ** but z[i] is a character not allowed within keywords, so this must
518a41f2855Sdrh         ** be an identifier instead */
519a41f2855Sdrh         i++;
520a41f2855Sdrh         break;
521a41f2855Sdrh       }
522a41f2855Sdrh       *tokenType = TK_ID;
523a41f2855Sdrh       return i;
524a41f2855Sdrh     }
525a41f2855Sdrh     case CC_X: {
526a41f2855Sdrh       testcase( z[0]=='x' ); testcase( z[0]=='X' );
527a41f2855Sdrh       if( z[1]=='\'' ){
528a41f2855Sdrh         *tokenType = TK_BLOB;
529a41f2855Sdrh         for(i=2; sqlite3Isxdigit(z[i]); i++){}
530a41f2855Sdrh         if( z[i]!='\'' || i%2 ){
531a41f2855Sdrh           *tokenType = TK_ILLEGAL;
532a41f2855Sdrh           while( z[i] && z[i]!='\'' ){ i++; }
533a41f2855Sdrh         }
534a41f2855Sdrh         if( z[i] ) i++;
535a41f2855Sdrh         return i;
536a41f2855Sdrh       }
537a41f2855Sdrh       /* If it is not a BLOB literal, then it must be an ID, since no
538a41f2855Sdrh       ** SQL keywords start with the letter 'x'.  Fall through */
539*08b92086Sdrh       /* no break */ deliberate_fall_through
540a41f2855Sdrh     }
541a41f2855Sdrh     case CC_ID: {
542a41f2855Sdrh       i = 1;
543a41f2855Sdrh       break;
544a41f2855Sdrh     }
545a41f2855Sdrh     default: {
546a41f2855Sdrh       *tokenType = TK_ILLEGAL;
547a41f2855Sdrh       return 1;
548a41f2855Sdrh     }
549a41f2855Sdrh   }
550a41f2855Sdrh   while( IdChar(z[i]) ){ i++; }
551a41f2855Sdrh   *tokenType = TK_ID;
552a41f2855Sdrh   return i;
553a41f2855Sdrh }
554a41f2855Sdrh 
sqlite3_normalize(const char * zSql)555a41f2855Sdrh char *sqlite3_normalize(const char *zSql){
556a41f2855Sdrh   char *z;              /* The output string */
557a41f2855Sdrh   sqlite3_int64 nZ;     /* Size of the output string in bytes */
558a41f2855Sdrh   sqlite3_int64 nSql;   /* Size of the input string in bytes */
559a41f2855Sdrh   int i;                /* Next character to read from zSql[] */
560a41f2855Sdrh   int j;                /* Next slot to fill in on z[] */
561a41f2855Sdrh   int tokenType;        /* Type of the next token */
562a41f2855Sdrh   int n;                /* Size of the next token */
563a41f2855Sdrh   int k;                /* Loop counter */
564a41f2855Sdrh 
565a41f2855Sdrh   nSql = strlen(zSql);
566a41f2855Sdrh   nZ = nSql;
567a41f2855Sdrh   z = sqlite3_malloc64( nZ+2 );
568a41f2855Sdrh   if( z==0 ) return 0;
569a41f2855Sdrh   for(i=j=0; zSql[i]; i += n){
570a41f2855Sdrh     n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
571a41f2855Sdrh     switch( tokenType ){
572a41f2855Sdrh       case TK_SPACE: {
573a41f2855Sdrh         break;
574a41f2855Sdrh       }
575a41f2855Sdrh       case TK_ERROR: {
576a41f2855Sdrh         sqlite3_free(z);
577a41f2855Sdrh         return 0;
578a41f2855Sdrh       }
579a41f2855Sdrh       case TK_LITERAL: {
580a41f2855Sdrh         z[j++] = '?';
581a41f2855Sdrh         break;
582a41f2855Sdrh       }
583a41f2855Sdrh       case TK_PUNCT:
584a41f2855Sdrh       case TK_NAME: {
585d19866bbSdrh         if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){
586d19866bbSdrh           if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3]))
587d19866bbSdrh            || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4]))
588d19866bbSdrh           ){
589d19866bbSdrh             /* NULL is a keyword in this case, not a literal value */
590d19866bbSdrh           }else{
591d19866bbSdrh             /* Here the NULL is a literal value */
592d19866bbSdrh             z[j++] = '?';
593d19866bbSdrh             break;
594d19866bbSdrh           }
595d19866bbSdrh         }
596a41f2855Sdrh         if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
597a41f2855Sdrh         for(k=0; k<n; k++){
598a41f2855Sdrh           z[j++] = sqlite3Tolower(zSql[i+k]);
599a41f2855Sdrh         }
600a41f2855Sdrh         break;
601a41f2855Sdrh       }
602a41f2855Sdrh     }
603a41f2855Sdrh   }
604a41f2855Sdrh   while( j>0 && z[j-1]==' ' ){ j--; }
6055bce7d8fSmistachkin   if( j>0 && z[j-1]!=';' ){ z[j++] = ';'; }
606a41f2855Sdrh   z[j] = 0;
607ef424812Sdrh 
608ef424812Sdrh   /* Make a second pass converting "in(...)" where the "..." is not a
609ef424812Sdrh   ** SELECT statement into "in(?,?,?)" */
610ef424812Sdrh   for(i=0; i<j; i=n){
611ef424812Sdrh     char *zIn = strstr(z+i, "in(");
612ef424812Sdrh     int nParen;
613ef424812Sdrh     if( zIn==0 ) break;
614ef424812Sdrh     n = (int)(zIn-z)+3;  /* Index of first char past "in(" */
615ef424812Sdrh     if( n && IdChar(zIn[-1]) ) continue;
616ef424812Sdrh     if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue;
617ef424812Sdrh     if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue;
618ef424812Sdrh     for(nParen=1, k=0; z[n+k]; k++){
619ef424812Sdrh       if( z[n+k]=='(' ) nParen++;
620ef424812Sdrh       if( z[n+k]==')' ){
621ef424812Sdrh         nParen--;
622ef424812Sdrh         if( nParen==0 ) break;
623ef424812Sdrh       }
624ef424812Sdrh     }
625ef424812Sdrh     /* k is the number of bytes in the "..." within "in(...)" */
626ef424812Sdrh     if( k<5 ){
627ef424812Sdrh       z = sqlite3_realloc64(z, j+(5-k)+1);
628ef424812Sdrh       if( z==0 ) return 0;
629ef424812Sdrh       memmove(z+n+5, z+n+k, j-(n+k));
630ef424812Sdrh     }else if( k>5 ){
631ef424812Sdrh       memmove(z+n+5, z+n+k, j-(n+k));
632ef424812Sdrh     }
633ef424812Sdrh     j = j-k+5;
634ef424812Sdrh     z[j] = 0;
635ef424812Sdrh     memcpy(z+n, "?,?,?", 5);
636ef424812Sdrh   }
637a41f2855Sdrh   return z;
638a41f2855Sdrh }
639a41f2855Sdrh 
640ef424812Sdrh /*
641ef424812Sdrh ** For testing purposes, or to build a stand-alone SQL normalizer program,
642ef424812Sdrh ** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link
643ef424812Sdrh ** it against any SQLite library.  The resulting command-line program will
644ef424812Sdrh ** run sqlite3_normalize() over the text of all files named on the command-
645ef424812Sdrh ** line and show the result on standard output.
646ef424812Sdrh */
647ef424812Sdrh #ifdef SQLITE_NORMALIZE_CLI
648a41f2855Sdrh #include <stdio.h>
649a41f2855Sdrh #include <stdlib.h>
650a41f2855Sdrh 
651ef424812Sdrh /*
652ef424812Sdrh ** Break zIn up into separate SQL statements and run sqlite3_normalize()
653ef424812Sdrh ** on each one.  Print the result of each run.
654ef424812Sdrh */
normalizeFile(char * zIn)655a41f2855Sdrh static void normalizeFile(char *zIn){
656a41f2855Sdrh   int i;
657a41f2855Sdrh   if( zIn==0 ) return;
658a41f2855Sdrh   for(i=0; zIn[i]; i++){
659a41f2855Sdrh     char cSaved;
660a41f2855Sdrh     if( zIn[i]!=';' ) continue;
661a41f2855Sdrh     cSaved = zIn[i+1];
662a41f2855Sdrh     zIn[i+1] = 0;
663a41f2855Sdrh     if( sqlite3_complete(zIn) ){
664a41f2855Sdrh       char *zOut = sqlite3_normalize(zIn);
665a41f2855Sdrh       if( zOut ){
666a41f2855Sdrh         printf("%s\n", zOut);
667a41f2855Sdrh         sqlite3_free(zOut);
668a41f2855Sdrh       }else{
669a41f2855Sdrh         fprintf(stderr, "ERROR: %s\n", zIn);
670a41f2855Sdrh       }
671a41f2855Sdrh       zIn[i+1] = cSaved;
672a41f2855Sdrh       zIn += i+1;
673a41f2855Sdrh       i = -1;
674a41f2855Sdrh     }else{
675a41f2855Sdrh       zIn[i+1] = cSaved;
676a41f2855Sdrh     }
677a41f2855Sdrh   }
678a41f2855Sdrh }
679a41f2855Sdrh 
680ef424812Sdrh /*
681ef424812Sdrh ** The main routine for "sql_normalize".  Read files named on the
682ef424812Sdrh ** command-line and run the text of each through sqlite3_normalize().
683ef424812Sdrh */
main(int argc,char ** argv)684a41f2855Sdrh int main(int argc, char **argv){
685a41f2855Sdrh   int i;
686a41f2855Sdrh   FILE *in;
687a41f2855Sdrh   char *zBuf = 0;
688a41f2855Sdrh   sqlite3_int64 sz, got;
689a41f2855Sdrh 
690a41f2855Sdrh   for(i=1; i<argc; i++){
691a41f2855Sdrh     in = fopen(argv[i], "rb");
692a41f2855Sdrh     if( in==0 ){
693a41f2855Sdrh       fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
694a41f2855Sdrh       continue;
695a41f2855Sdrh     }
696a41f2855Sdrh     fseek(in, 0, SEEK_END);
697a41f2855Sdrh     sz = ftell(in);
698a41f2855Sdrh     rewind(in);
699a41f2855Sdrh     zBuf = sqlite3_realloc64(zBuf, sz+1);
700a41f2855Sdrh     if( zBuf==0 ){
701a41f2855Sdrh       fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
702a41f2855Sdrh       exit(1);
703a41f2855Sdrh     }
704a41f2855Sdrh     got = fread(zBuf, 1, sz, in);
705a41f2855Sdrh     fclose(in);
706a41f2855Sdrh     if( got!=sz ){
707a41f2855Sdrh       fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
708a41f2855Sdrh               got, sz, argv[i]);
709a41f2855Sdrh     }else{
710a41f2855Sdrh       zBuf[got] = 0;
711a41f2855Sdrh       normalizeFile(zBuf);
712a41f2855Sdrh     }
713a41f2855Sdrh   }
714a41f2855Sdrh   sqlite3_free(zBuf);
715a41f2855Sdrh }
716ef424812Sdrh #endif /* SQLITE_NORMALIZE_CLI */
717