1 /* 2 ** 2018-01-08 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ****************************************************************************** 12 ** 13 ** This file contains code to implement the sqlite3_normalize() function. 14 ** 15 ** char *sqlite3_normalize(const char *zSql); 16 ** 17 ** This function takes an SQL string as input and returns a "normalized" 18 ** version of that string in memory obtained from sqlite3_malloc64(). The 19 ** caller is responsible for ensuring that the returned memory is freed. 20 ** 21 ** If a memory allocation error occurs, this routine returns NULL. 22 ** 23 ** The normalization consists of the following transformations: 24 ** 25 ** (1) Convert every literal (string, blob literal, numeric constant, 26 ** or "NULL" constant) into a ? 27 ** 28 ** (2) Remove all superfluous whitespace, including comments. Change 29 ** all required whitespace to a single space character. 30 ** 31 ** (3) Lowercase all ASCII characters. 32 ** 33 ** (4) If an IN or NOT IN operator is followed by a list of 1 or more 34 ** values, convert that list into "(?,?,?)". 35 ** 36 ** The purpose of normalization is two-fold: 37 ** 38 ** (1) Sanitize queries by removing potentially private or sensitive 39 ** information contained in literals. 40 ** 41 ** (2) Identify structurally identical queries by comparing their 42 ** normalized forms. 43 ** 44 ** Command-Line Utility 45 ** -------------------- 46 ** 47 ** This file also contains code for a command-line utility that converts 48 ** SQL queries in text files into their normalized forms. To build the 49 ** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI 50 ** and link it against the SQLite library. 51 */ 52 #include <sqlite3.h> 53 #include <string.h> 54 55 /* 56 ** Implementation note: 57 ** 58 ** Much of the tokenizer logic is copied out of the tokenize.c source file 59 ** of SQLite. That logic could be simplified for this particular application, 60 ** but that would impose a risk of introducing subtle errors. It is best to 61 ** keep the code as close to the original as possible. 62 ** 63 ** The tokenize code is in sync with the SQLite core as of 2018-01-08. 64 ** Any future changes to the core tokenizer might require corresponding 65 ** adjustments to the tokenizer logic in this module. 66 */ 67 68 69 /* Character classes for tokenizing 70 ** 71 ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented 72 ** using a lookup table, whereas a switch() directly on c uses a binary search. 73 ** The lookup table is much faster. To maximize speed, and to ensure that 74 ** a lookup table is used, all of the classes need to be small integers and 75 ** all of them need to be used within the switch. 76 */ 77 #define CC_X 0 /* The letter 'x', or start of BLOB literal */ 78 #define CC_KYWD 1 /* Alphabetics or '_'. Usable in a keyword */ 79 #define CC_ID 2 /* unicode characters usable in IDs */ 80 #define CC_DIGIT 3 /* Digits */ 81 #define CC_DOLLAR 4 /* '$' */ 82 #define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */ 83 #define CC_VARNUM 6 /* '?'. Numeric SQL variables */ 84 #define CC_SPACE 7 /* Space characters */ 85 #define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */ 86 #define CC_QUOTE2 9 /* '['. [...] style quoted ids */ 87 #define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */ 88 #define CC_MINUS 11 /* '-'. Minus or SQL-style comment */ 89 #define CC_LT 12 /* '<'. Part of < or <= or <> */ 90 #define CC_GT 13 /* '>'. Part of > or >= */ 91 #define CC_EQ 14 /* '='. Part of = or == */ 92 #define CC_BANG 15 /* '!'. Part of != */ 93 #define CC_SLASH 16 /* '/'. / or c-style comment */ 94 #define CC_LP 17 /* '(' */ 95 #define CC_RP 18 /* ')' */ 96 #define CC_SEMI 19 /* ';' */ 97 #define CC_PLUS 20 /* '+' */ 98 #define CC_STAR 21 /* '*' */ 99 #define CC_PERCENT 22 /* '%' */ 100 #define CC_COMMA 23 /* ',' */ 101 #define CC_AND 24 /* '&' */ 102 #define CC_TILDA 25 /* '~' */ 103 #define CC_DOT 26 /* '.' */ 104 #define CC_ILLEGAL 27 /* Illegal character */ 105 106 static const unsigned char aiClass[] = { 107 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ 108 /* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27, 109 /* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 110 /* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16, 111 /* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6, 112 /* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 113 /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 9, 27, 27, 27, 1, 114 /* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 115 /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 10, 27, 25, 27, 116 /* 8x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 117 /* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 118 /* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 119 /* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 120 /* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 121 /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 122 /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 123 /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 124 }; 125 126 /* An array to map all upper-case characters into their corresponding 127 ** lower-case character. 128 ** 129 ** SQLite only considers US-ASCII (or EBCDIC) characters. We do not 130 ** handle case conversions for the UTF character set since the tables 131 ** involved are nearly as big or bigger than SQLite itself. 132 */ 133 static const unsigned char sqlite3UpperToLower[] = { 134 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 135 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 136 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 137 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103, 138 104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121, 139 122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107, 140 108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125, 141 126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, 142 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161, 143 162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179, 144 180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197, 145 198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215, 146 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233, 147 234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251, 148 252,253,254,255 149 }; 150 151 /* 152 ** The following 256 byte lookup table is used to support SQLites built-in 153 ** equivalents to the following standard library functions: 154 ** 155 ** isspace() 0x01 156 ** isalpha() 0x02 157 ** isdigit() 0x04 158 ** isalnum() 0x06 159 ** isxdigit() 0x08 160 ** toupper() 0x20 161 ** SQLite identifier character 0x40 162 ** Quote character 0x80 163 ** 164 ** Bit 0x20 is set if the mapped character requires translation to upper 165 ** case. i.e. if the character is a lower-case ASCII character. 166 ** If x is a lower-case ASCII character, then its upper-case equivalent 167 ** is (x - 0x20). Therefore toupper() can be implemented as: 168 ** 169 ** (x & ~(map[x]&0x20)) 170 ** 171 ** The equivalent of tolower() is implemented using the sqlite3UpperToLower[] 172 ** array. tolower() is used more often than toupper() by SQLite. 173 ** 174 ** Bit 0x40 is set if the character is non-alphanumeric and can be used in an 175 ** SQLite identifier. Identifiers are alphanumerics, "_", "$", and any 176 ** non-ASCII UTF character. Hence the test for whether or not a character is 177 ** part of an identifier is 0x46. 178 */ 179 static const unsigned char sqlite3CtypeMap[256] = { 180 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00..07 ........ */ 181 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 08..0f ........ */ 182 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10..17 ........ */ 183 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18..1f ........ */ 184 0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80, /* 20..27 !"#$%&' */ 185 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 28..2f ()*+,-./ */ 186 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 30..37 01234567 */ 187 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 38..3f 89:;<=>? */ 188 189 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02, /* 40..47 @ABCDEFG */ 190 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 48..4f HIJKLMNO */ 191 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 50..57 PQRSTUVW */ 192 0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40, /* 58..5f XYZ[\]^_ */ 193 0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22, /* 60..67 `abcdefg */ 194 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 68..6f hijklmno */ 195 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 70..77 pqrstuvw */ 196 0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, /* 78..7f xyz{|}~. */ 197 198 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 80..87 ........ */ 199 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 88..8f ........ */ 200 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 90..97 ........ */ 201 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 98..9f ........ */ 202 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a0..a7 ........ */ 203 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a8..af ........ */ 204 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b0..b7 ........ */ 205 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b8..bf ........ */ 206 207 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c0..c7 ........ */ 208 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c8..cf ........ */ 209 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d0..d7 ........ */ 210 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d8..df ........ */ 211 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e0..e7 ........ */ 212 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e8..ef ........ */ 213 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* f0..f7 ........ */ 214 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* f8..ff ........ */ 215 }; 216 #define sqlite3Toupper(x) ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20)) 217 #define sqlite3Isspace(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x01) 218 #define sqlite3Isalnum(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x06) 219 #define sqlite3Isalpha(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x02) 220 #define sqlite3Isdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x04) 221 #define sqlite3Isxdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x08) 222 #define sqlite3Tolower(x) (sqlite3UpperToLower[(unsigned char)(x)]) 223 #define sqlite3Isquote(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x80) 224 225 226 /* 227 ** If X is a character that can be used in an identifier then 228 ** IdChar(X) will be true. Otherwise it is false. 229 ** 230 ** For ASCII, any character with the high-order bit set is 231 ** allowed in an identifier. For 7-bit characters, 232 ** sqlite3IsIdChar[X] must be 1. 233 ** 234 ** For EBCDIC, the rules are more complex but have the same 235 ** end result. 236 ** 237 ** Ticket #1066. the SQL standard does not allow '$' in the 238 ** middle of identifiers. But many SQL implementations do. 239 ** SQLite will allow '$' in identifiers for compatibility. 240 ** But the feature is undocumented. 241 */ 242 #define IdChar(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0) 243 244 /* 245 ** Ignore testcase() macros 246 */ 247 #define testcase(X) 248 249 /* 250 ** Token values 251 */ 252 #define TK_SPACE 0 253 #define TK_NAME 1 254 #define TK_LITERAL 2 255 #define TK_PUNCT 3 256 #define TK_ERROR 4 257 258 #define TK_MINUS TK_PUNCT 259 #define TK_LP TK_PUNCT 260 #define TK_RP TK_PUNCT 261 #define TK_SEMI TK_PUNCT 262 #define TK_PLUS TK_PUNCT 263 #define TK_STAR TK_PUNCT 264 #define TK_SLASH TK_PUNCT 265 #define TK_REM TK_PUNCT 266 #define TK_EQ TK_PUNCT 267 #define TK_LE TK_PUNCT 268 #define TK_NE TK_PUNCT 269 #define TK_LSHIFT TK_PUNCT 270 #define TK_LT TK_PUNCT 271 #define TK_GE TK_PUNCT 272 #define TK_RSHIFT TK_PUNCT 273 #define TK_GT TK_PUNCT 274 #define TK_GE TK_PUNCT 275 #define TK_BITOR TK_PUNCT 276 #define TK_CONCAT TK_PUNCT 277 #define TK_COMMA TK_PUNCT 278 #define TK_BITAND TK_PUNCT 279 #define TK_BITNOT TK_PUNCT 280 #define TK_STRING TK_LITERAL 281 #define TK_ID TK_NAME 282 #define TK_ILLEGAL TK_ERROR 283 #define TK_DOT TK_PUNCT 284 #define TK_INTEGER TK_LITERAL 285 #define TK_FLOAT TK_LITERAL 286 #define TK_VARIABLE TK_LITERAL 287 #define TK_BLOB TK_LITERAL 288 289 /* 290 ** Return the length (in bytes) of the token that begins at z[0]. 291 ** Store the token type in *tokenType before returning. 292 */ 293 static int sqlite3GetToken(const unsigned char *z, int *tokenType){ 294 int i, c; 295 switch( aiClass[*z] ){ /* Switch on the character-class of the first byte 296 ** of the token. See the comment on the CC_ defines 297 ** above. */ 298 case CC_SPACE: { 299 for(i=1; sqlite3Isspace(z[i]); i++){} 300 *tokenType = TK_SPACE; 301 return i; 302 } 303 case CC_MINUS: { 304 if( z[1]=='-' ){ 305 for(i=2; (c=z[i])!=0 && c!='\n'; i++){} 306 *tokenType = TK_SPACE; 307 return i; 308 } 309 *tokenType = TK_MINUS; 310 return 1; 311 } 312 case CC_LP: { 313 *tokenType = TK_LP; 314 return 1; 315 } 316 case CC_RP: { 317 *tokenType = TK_RP; 318 return 1; 319 } 320 case CC_SEMI: { 321 *tokenType = TK_SEMI; 322 return 1; 323 } 324 case CC_PLUS: { 325 *tokenType = TK_PLUS; 326 return 1; 327 } 328 case CC_STAR: { 329 *tokenType = TK_STAR; 330 return 1; 331 } 332 case CC_SLASH: { 333 if( z[1]!='*' || z[2]==0 ){ 334 *tokenType = TK_SLASH; 335 return 1; 336 } 337 for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){} 338 if( c ) i++; 339 *tokenType = TK_SPACE; 340 return i; 341 } 342 case CC_PERCENT: { 343 *tokenType = TK_REM; 344 return 1; 345 } 346 case CC_EQ: { 347 *tokenType = TK_EQ; 348 return 1 + (z[1]=='='); 349 } 350 case CC_LT: { 351 if( (c=z[1])=='=' ){ 352 *tokenType = TK_LE; 353 return 2; 354 }else if( c=='>' ){ 355 *tokenType = TK_NE; 356 return 2; 357 }else if( c=='<' ){ 358 *tokenType = TK_LSHIFT; 359 return 2; 360 }else{ 361 *tokenType = TK_LT; 362 return 1; 363 } 364 } 365 case CC_GT: { 366 if( (c=z[1])=='=' ){ 367 *tokenType = TK_GE; 368 return 2; 369 }else if( c=='>' ){ 370 *tokenType = TK_RSHIFT; 371 return 2; 372 }else{ 373 *tokenType = TK_GT; 374 return 1; 375 } 376 } 377 case CC_BANG: { 378 if( z[1]!='=' ){ 379 *tokenType = TK_ILLEGAL; 380 return 1; 381 }else{ 382 *tokenType = TK_NE; 383 return 2; 384 } 385 } 386 case CC_PIPE: { 387 if( z[1]!='|' ){ 388 *tokenType = TK_BITOR; 389 return 1; 390 }else{ 391 *tokenType = TK_CONCAT; 392 return 2; 393 } 394 } 395 case CC_COMMA: { 396 *tokenType = TK_COMMA; 397 return 1; 398 } 399 case CC_AND: { 400 *tokenType = TK_BITAND; 401 return 1; 402 } 403 case CC_TILDA: { 404 *tokenType = TK_BITNOT; 405 return 1; 406 } 407 case CC_QUOTE: { 408 int delim = z[0]; 409 testcase( delim=='`' ); 410 testcase( delim=='\'' ); 411 testcase( delim=='"' ); 412 for(i=1; (c=z[i])!=0; i++){ 413 if( c==delim ){ 414 if( z[i+1]==delim ){ 415 i++; 416 }else{ 417 break; 418 } 419 } 420 } 421 if( c=='\'' ){ 422 *tokenType = TK_STRING; 423 return i+1; 424 }else if( c!=0 ){ 425 *tokenType = TK_ID; 426 return i+1; 427 }else{ 428 *tokenType = TK_ILLEGAL; 429 return i; 430 } 431 } 432 case CC_DOT: { 433 if( !sqlite3Isdigit(z[1]) ){ 434 *tokenType = TK_DOT; 435 return 1; 436 } 437 /* If the next character is a digit, this is a floating point 438 ** number that begins with ".". Fall thru into the next case */ 439 } 440 case CC_DIGIT: { 441 *tokenType = TK_INTEGER; 442 if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){ 443 for(i=3; sqlite3Isxdigit(z[i]); i++){} 444 return i; 445 } 446 for(i=0; sqlite3Isdigit(z[i]); i++){} 447 if( z[i]=='.' ){ 448 i++; 449 while( sqlite3Isdigit(z[i]) ){ i++; } 450 *tokenType = TK_FLOAT; 451 } 452 if( (z[i]=='e' || z[i]=='E') && 453 ( sqlite3Isdigit(z[i+1]) 454 || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2])) 455 ) 456 ){ 457 i += 2; 458 while( sqlite3Isdigit(z[i]) ){ i++; } 459 *tokenType = TK_FLOAT; 460 } 461 while( IdChar(z[i]) ){ 462 *tokenType = TK_ILLEGAL; 463 i++; 464 } 465 return i; 466 } 467 case CC_QUOTE2: { 468 for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){} 469 *tokenType = c==']' ? TK_ID : TK_ILLEGAL; 470 return i; 471 } 472 case CC_VARNUM: { 473 *tokenType = TK_VARIABLE; 474 for(i=1; sqlite3Isdigit(z[i]); i++){} 475 return i; 476 } 477 case CC_DOLLAR: 478 case CC_VARALPHA: { 479 int n = 0; 480 testcase( z[0]=='$' ); testcase( z[0]=='@' ); 481 testcase( z[0]==':' ); testcase( z[0]=='#' ); 482 *tokenType = TK_VARIABLE; 483 for(i=1; (c=z[i])!=0; i++){ 484 if( IdChar(c) ){ 485 n++; 486 }else if( c=='(' && n>0 ){ 487 do{ 488 i++; 489 }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' ); 490 if( c==')' ){ 491 i++; 492 }else{ 493 *tokenType = TK_ILLEGAL; 494 } 495 break; 496 }else if( c==':' && z[i+1]==':' ){ 497 i++; 498 }else{ 499 break; 500 } 501 } 502 if( n==0 ) *tokenType = TK_ILLEGAL; 503 return i; 504 } 505 case CC_KYWD: { 506 for(i=1; aiClass[z[i]]<=CC_KYWD; i++){} 507 if( IdChar(z[i]) ){ 508 /* This token started out using characters that can appear in keywords, 509 ** but z[i] is a character not allowed within keywords, so this must 510 ** be an identifier instead */ 511 i++; 512 break; 513 } 514 *tokenType = TK_ID; 515 return i; 516 } 517 case CC_X: { 518 testcase( z[0]=='x' ); testcase( z[0]=='X' ); 519 if( z[1]=='\'' ){ 520 *tokenType = TK_BLOB; 521 for(i=2; sqlite3Isxdigit(z[i]); i++){} 522 if( z[i]!='\'' || i%2 ){ 523 *tokenType = TK_ILLEGAL; 524 while( z[i] && z[i]!='\'' ){ i++; } 525 } 526 if( z[i] ) i++; 527 return i; 528 } 529 /* If it is not a BLOB literal, then it must be an ID, since no 530 ** SQL keywords start with the letter 'x'. Fall through */ 531 } 532 case CC_ID: { 533 i = 1; 534 break; 535 } 536 default: { 537 *tokenType = TK_ILLEGAL; 538 return 1; 539 } 540 } 541 while( IdChar(z[i]) ){ i++; } 542 *tokenType = TK_ID; 543 return i; 544 } 545 546 char *sqlite3_normalize(const char *zSql){ 547 char *z; /* The output string */ 548 sqlite3_int64 nZ; /* Size of the output string in bytes */ 549 sqlite3_int64 nSql; /* Size of the input string in bytes */ 550 int i; /* Next character to read from zSql[] */ 551 int j; /* Next slot to fill in on z[] */ 552 int tokenType; /* Type of the next token */ 553 int n; /* Size of the next token */ 554 int k; /* Loop counter */ 555 556 nSql = strlen(zSql); 557 nZ = nSql; 558 z = sqlite3_malloc64( nZ+2 ); 559 if( z==0 ) return 0; 560 for(i=j=0; zSql[i]; i += n){ 561 n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType); 562 switch( tokenType ){ 563 case TK_SPACE: { 564 break; 565 } 566 case TK_ERROR: { 567 sqlite3_free(z); 568 return 0; 569 } 570 case TK_LITERAL: { 571 z[j++] = '?'; 572 break; 573 } 574 case TK_PUNCT: 575 case TK_NAME: { 576 if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){ 577 if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3])) 578 || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4])) 579 ){ 580 /* NULL is a keyword in this case, not a literal value */ 581 }else{ 582 /* Here the NULL is a literal value */ 583 z[j++] = '?'; 584 break; 585 } 586 } 587 if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' '; 588 for(k=0; k<n; k++){ 589 z[j++] = sqlite3Tolower(zSql[i+k]); 590 } 591 break; 592 } 593 } 594 } 595 while( j>0 && z[j-1]==' ' ){ j--; } 596 if( j>0 && z[j-1]!=';' ){ z[j++] = ';'; } 597 z[j] = 0; 598 599 /* Make a second pass converting "in(...)" where the "..." is not a 600 ** SELECT statement into "in(?,?,?)" */ 601 for(i=0; i<j; i=n){ 602 char *zIn = strstr(z+i, "in("); 603 int nParen; 604 if( zIn==0 ) break; 605 n = (int)(zIn-z)+3; /* Index of first char past "in(" */ 606 if( n && IdChar(zIn[-1]) ) continue; 607 if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue; 608 if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue; 609 for(nParen=1, k=0; z[n+k]; k++){ 610 if( z[n+k]=='(' ) nParen++; 611 if( z[n+k]==')' ){ 612 nParen--; 613 if( nParen==0 ) break; 614 } 615 } 616 /* k is the number of bytes in the "..." within "in(...)" */ 617 if( k<5 ){ 618 z = sqlite3_realloc64(z, j+(5-k)+1); 619 if( z==0 ) return 0; 620 memmove(z+n+5, z+n+k, j-(n+k)); 621 }else if( k>5 ){ 622 memmove(z+n+5, z+n+k, j-(n+k)); 623 } 624 j = j-k+5; 625 z[j] = 0; 626 memcpy(z+n, "?,?,?", 5); 627 } 628 return z; 629 } 630 631 /* 632 ** For testing purposes, or to build a stand-alone SQL normalizer program, 633 ** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link 634 ** it against any SQLite library. The resulting command-line program will 635 ** run sqlite3_normalize() over the text of all files named on the command- 636 ** line and show the result on standard output. 637 */ 638 #ifdef SQLITE_NORMALIZE_CLI 639 #include <stdio.h> 640 #include <stdlib.h> 641 642 /* 643 ** Break zIn up into separate SQL statements and run sqlite3_normalize() 644 ** on each one. Print the result of each run. 645 */ 646 static void normalizeFile(char *zIn){ 647 int i; 648 if( zIn==0 ) return; 649 for(i=0; zIn[i]; i++){ 650 char cSaved; 651 if( zIn[i]!=';' ) continue; 652 cSaved = zIn[i+1]; 653 zIn[i+1] = 0; 654 if( sqlite3_complete(zIn) ){ 655 char *zOut = sqlite3_normalize(zIn); 656 if( zOut ){ 657 printf("%s\n", zOut); 658 sqlite3_free(zOut); 659 }else{ 660 fprintf(stderr, "ERROR: %s\n", zIn); 661 } 662 zIn[i+1] = cSaved; 663 zIn += i+1; 664 i = -1; 665 }else{ 666 zIn[i+1] = cSaved; 667 } 668 } 669 } 670 671 /* 672 ** The main routine for "sql_normalize". Read files named on the 673 ** command-line and run the text of each through sqlite3_normalize(). 674 */ 675 int main(int argc, char **argv){ 676 int i; 677 FILE *in; 678 char *zBuf = 0; 679 sqlite3_int64 sz, got; 680 681 for(i=1; i<argc; i++){ 682 in = fopen(argv[i], "rb"); 683 if( in==0 ){ 684 fprintf(stderr, "cannot open \"%s\"\n", argv[i]); 685 continue; 686 } 687 fseek(in, 0, SEEK_END); 688 sz = ftell(in); 689 rewind(in); 690 zBuf = sqlite3_realloc64(zBuf, sz+1); 691 if( zBuf==0 ){ 692 fprintf(stderr, "failed to malloc for %lld bytes\n", sz); 693 exit(1); 694 } 695 got = fread(zBuf, 1, sz, in); 696 fclose(in); 697 if( got!=sz ){ 698 fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n", 699 got, sz, argv[i]); 700 }else{ 701 zBuf[got] = 0; 702 normalizeFile(zBuf); 703 } 704 } 705 sqlite3_free(zBuf); 706 } 707 #endif /* SQLITE_NORMALIZE_CLI */ 708