xref: /sqlite-3.40.0/src/tokenize.c (revision 4dcbdbff)
1 /*
2 ** 2001 September 15
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** A tokenizer for SQL
13 **
14 ** This file contains C code that splits an SQL input string up into
15 ** individual tokens and sends those tokens one-by-one over to the
16 ** parser for analysis.
17 **
18 ** $Id: tokenize.c,v 1.104 2005/06/22 08:48:06 drh Exp $
19 */
20 #include "sqliteInt.h"
21 #include "os.h"
22 #include <ctype.h>
23 #include <stdlib.h>
24 
25 /*
26 ** The sqlite3KeywordCode function looks up an identifier to determine if
27 ** it is a keyword.  If it is a keyword, the token code of that keyword is
28 ** returned.  If the input is not a keyword, TK_ID is returned.
29 **
30 ** The implementation of this routine was generated by a program,
31 ** mkkeywordhash.c, located in the tool subdirectory of the distribution.
32 ** The output of the mkkeywordhash.c program is written into a file
33 ** named keywordhash.h and then included into this source file by
34 ** the #include below.
35 */
36 #include "keywordhash.h"
37 
38 
39 /*
40 ** If X is a character that can be used in an identifier and
41 ** X&0x80==0 then isIdChar[X] will be 1.  If X&0x80==0x80 then
42 ** X is always an identifier character.  (Hence all UTF-8
43 ** characters can be part of an identifier).  isIdChar[X] will
44 ** be 0 for every character in the lower 128 ASCII characters
45 ** that cannot be used as part of an identifier.
46 **
47 ** In this implementation, an identifier can be a string of
48 ** alphabetic characters, digits, and "_" plus any character
49 ** with the high-order bit set.  The latter rule means that
50 ** any sequence of UTF-8 characters or characters taken from
51 ** an extended ISO8859 character set can form an identifier.
52 **
53 ** Ticket #1066.  The SQL standard does not allow '$' in the
54 ** middle of identifiers.  But many SQL implementations do.
55 ** SQLite will allow '$' in identifiers for compatibility.
56 ** But the feature is undocumented.
57 */
/*
** Lookup table for the ASCII range 0x20..0x7F.  Entry [X-0x20] is 1 when
** character X may appear in an identifier and 0 otherwise.
*/
static const char isIdChar[] = {
    /* 0x20..0x2F: only '$' (0x24) is allowed */
    0, 0, 0, 0, 1, 0, 0, 0,    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30..0x3F: the digits '0'..'9' */
    1, 1, 1, 1, 1, 1, 1, 1,    1, 1, 0, 0, 0, 0, 0, 0,
    /* 0x40..0x4F: 'A'..'O' ('@' is excluded) */
    0, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50..0x5F: 'P'..'Z' and '_' */
    1, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 0, 0, 0, 0, 1,
    /* 0x60..0x6F: 'a'..'o' ('`' is excluded) */
    0, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70..0x7F: 'p'..'z' */
    1, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 0, 0, 0, 0, 0,
};

/*
** True if C may be part of an identifier.  Any value with bit 0x80 set
** qualifies; otherwise the table above decides, and control characters
** below 0x20 never qualify.
**
** NOTE: this macro assigns its argument to a variable named "c", which
** must be declared (as an integer type) in the scope where it is used.
*/
#define IdChar(C)  (((c=(C))&0x80)!=0 || (c>=0x20 && isIdChar[c-0x20]))
69 
70 /*
71 ** Return the length of the token that begins at z[0].
72 ** Store the token type in *tokenType before returning.
73 */
74 static int getToken(const unsigned char *z, int *tokenType){
75   int i, c;
76   switch( *z ){
77     case ' ': case '\t': case '\n': case '\f': case '\r': {
78       for(i=1; isspace(z[i]); i++){}
79       *tokenType = TK_SPACE;
80       return i;
81     }
82     case '-': {
83       if( z[1]=='-' ){
84         for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
85         *tokenType = TK_COMMENT;
86         return i;
87       }
88       *tokenType = TK_MINUS;
89       return 1;
90     }
91     case '(': {
92       *tokenType = TK_LP;
93       return 1;
94     }
95     case ')': {
96       *tokenType = TK_RP;
97       return 1;
98     }
99     case ';': {
100       *tokenType = TK_SEMI;
101       return 1;
102     }
103     case '+': {
104       *tokenType = TK_PLUS;
105       return 1;
106     }
107     case '*': {
108       *tokenType = TK_STAR;
109       return 1;
110     }
111     case '/': {
112       if( z[1]!='*' || z[2]==0 ){
113         *tokenType = TK_SLASH;
114         return 1;
115       }
116       for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
117       if( c ) i++;
118       *tokenType = TK_COMMENT;
119       return i;
120     }
121     case '%': {
122       *tokenType = TK_REM;
123       return 1;
124     }
125     case '=': {
126       *tokenType = TK_EQ;
127       return 1 + (z[1]=='=');
128     }
129     case '<': {
130       if( (c=z[1])=='=' ){
131         *tokenType = TK_LE;
132         return 2;
133       }else if( c=='>' ){
134         *tokenType = TK_NE;
135         return 2;
136       }else if( c=='<' ){
137         *tokenType = TK_LSHIFT;
138         return 2;
139       }else{
140         *tokenType = TK_LT;
141         return 1;
142       }
143     }
144     case '>': {
145       if( (c=z[1])=='=' ){
146         *tokenType = TK_GE;
147         return 2;
148       }else if( c=='>' ){
149         *tokenType = TK_RSHIFT;
150         return 2;
151       }else{
152         *tokenType = TK_GT;
153         return 1;
154       }
155     }
156     case '!': {
157       if( z[1]!='=' ){
158         *tokenType = TK_ILLEGAL;
159         return 2;
160       }else{
161         *tokenType = TK_NE;
162         return 2;
163       }
164     }
165     case '|': {
166       if( z[1]!='|' ){
167         *tokenType = TK_BITOR;
168         return 1;
169       }else{
170         *tokenType = TK_CONCAT;
171         return 2;
172       }
173     }
174     case ',': {
175       *tokenType = TK_COMMA;
176       return 1;
177     }
178     case '&': {
179       *tokenType = TK_BITAND;
180       return 1;
181     }
182     case '~': {
183       *tokenType = TK_BITNOT;
184       return 1;
185     }
186     case '\'': case '"': {
187       int delim = z[0];
188       for(i=1; (c=z[i])!=0; i++){
189         if( c==delim ){
190           if( z[i+1]==delim ){
191             i++;
192           }else{
193             break;
194           }
195         }
196       }
197       if( c ) i++;
198       *tokenType = TK_STRING;
199       return i;
200     }
201     case '.': {
202       *tokenType = TK_DOT;
203       return 1;
204     }
205     case '0': case '1': case '2': case '3': case '4':
206     case '5': case '6': case '7': case '8': case '9': {
207       *tokenType = TK_INTEGER;
208       for(i=1; isdigit(z[i]); i++){}
209 #ifndef SQLITE_OMIT_FLOATING_POINT
210       if( z[i]=='.' && isdigit(z[i+1]) ){
211         i += 2;
212         while( isdigit(z[i]) ){ i++; }
213         *tokenType = TK_FLOAT;
214       }
215       if( (z[i]=='e' || z[i]=='E') &&
216            ( isdigit(z[i+1])
217             || ((z[i+1]=='+' || z[i+1]=='-') && isdigit(z[i+2]))
218            )
219       ){
220         i += 2;
221         while( isdigit(z[i]) ){ i++; }
222         *tokenType = TK_FLOAT;
223       }
224 #endif
225       return i;
226     }
227     case '[': {
228       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
229       *tokenType = TK_ID;
230       return i;
231     }
232     case '?': {
233       *tokenType = TK_VARIABLE;
234       for(i=1; isdigit(z[i]); i++){}
235       return i;
236     }
237     case '#': {
238       for(i=1; isdigit(z[i]); i++){}
239       if( i>1 ){
240         /* Parameters of the form #NNN (where NNN is a number) are used
241         ** internally by sqlite3NestedParse.  */
242         *tokenType = TK_REGISTER;
243         return i;
244       }
245       /* Fall through into the next case if the '#' is not followed by
246       ** a digit. Try to match #AAAA where AAAA is a parameter name. */
247     }
248 #ifndef SQLITE_OMIT_TCL_VARIABLE
249     case '$':
250 #endif
251     case ':': {
252       int n = 0;
253       *tokenType = TK_VARIABLE;
254       for(i=1; (c=z[i])!=0; i++){
255         if( IdChar(c) ){
256           n++;
257 #ifndef SQLITE_OMIT_TCL_VARIABLE
258         }else if( c=='(' && n>0 ){
259           do{
260             i++;
261           }while( (c=z[i])!=0 && !isspace(c) && c!=')' );
262           if( c==')' ){
263             i++;
264           }else{
265             *tokenType = TK_ILLEGAL;
266           }
267           break;
268         }else if( c==':' && z[i+1]==':' ){
269           i++;
270 #endif
271         }else{
272           break;
273         }
274       }
275       if( n==0 ) *tokenType = TK_ILLEGAL;
276       return i;
277     }
278 #ifndef SQLITE_OMIT_BLOB_LITERAL
279     case 'x': case 'X': {
280       if( (c=z[1])=='\'' || c=='"' ){
281         int delim = c;
282         *tokenType = TK_BLOB;
283         for(i=2; (c=z[i])!=0; i++){
284           if( c==delim ){
285             if( i%2 ) *tokenType = TK_ILLEGAL;
286             break;
287           }
288           if( !isxdigit(c) ){
289             *tokenType = TK_ILLEGAL;
290             return i;
291           }
292         }
293         if( c ) i++;
294         return i;
295       }
296       /* Otherwise fall through to the next case */
297     }
298 #endif
299     default: {
300       if( !IdChar(*z) ){
301         break;
302       }
303       for(i=1; IdChar(z[i]); i++){}
304       *tokenType = keywordCode((char*)z, i);
305       return i;
306     }
307   }
308   *tokenType = TK_ILLEGAL;
309   return 1;
310 }
/*
** Externally visible wrapper around the file-local getToken() routine.
** Returns the length of the token starting at z[0] and writes its type
** into *tokenType.
*/
int sqlite3GetToken(const unsigned char *z, int *tokenType){
  int nByte = getToken(z, tokenType);
  return nByte;
}
314 
315 /*
316 ** Run the parser on the given SQL string.  The parser structure is
317 ** passed in.  An SQLITE_ status code is returned.  If an error occurs
318 ** and pzErrMsg!=NULL then an error message might be written into
319 ** memory obtained from malloc() and *pzErrMsg made to point to that
320 ** error message.  Or maybe not.
321 */
322 int sqlite3RunParser(Parse *pParse, const char *zSql, char **pzErrMsg){
323   int nErr = 0;
324   int i;
325   void *pEngine;
326   int tokenType;
327   int lastTokenParsed = -1;
328   sqlite3 *db = pParse->db;
329   extern void *sqlite3ParserAlloc(void*(*)(int));
330   extern void sqlite3ParserFree(void*, void(*)(void*));
331   extern int sqlite3Parser(void*, int, Token, Parse*);
332 
333   db->flags &= ~SQLITE_Interrupt;
334   pParse->rc = SQLITE_OK;
335   i = 0;
336   pEngine = sqlite3ParserAlloc((void*(*)(int))sqlite3MallocX);
337   if( pEngine==0 ){
338     sqlite3SetString(pzErrMsg, "out of memory", (char*)0);
339     return SQLITE_NOMEM;
340   }
341   assert( pParse->sLastToken.dyn==0 );
342   assert( pParse->pNewTable==0 );
343   assert( pParse->pNewTrigger==0 );
344   assert( pParse->nVar==0 );
345   assert( pParse->nVarExpr==0 );
346   assert( pParse->nVarExprAlloc==0 );
347   assert( pParse->apVarExpr==0 );
348   pParse->zTail = pParse->zSql = zSql;
349   while( sqlite3_malloc_failed==0 && zSql[i]!=0 ){
350     assert( i>=0 );
351     pParse->sLastToken.z = &zSql[i];
352     assert( pParse->sLastToken.dyn==0 );
353     pParse->sLastToken.n = getToken((unsigned char*)&zSql[i],&tokenType);
354     i += pParse->sLastToken.n;
355     switch( tokenType ){
356       case TK_SPACE:
357       case TK_COMMENT: {
358         if( (db->flags & SQLITE_Interrupt)!=0 ){
359           pParse->rc = SQLITE_INTERRUPT;
360           sqlite3SetString(pzErrMsg, "interrupt", (char*)0);
361           goto abort_parse;
362         }
363         break;
364       }
365       case TK_ILLEGAL: {
366         if( pzErrMsg ){
367           sqliteFree(*pzErrMsg);
368           *pzErrMsg = sqlite3MPrintf("unrecognized token: \"%T\"",
369                           &pParse->sLastToken);
370         }
371         nErr++;
372         goto abort_parse;
373       }
374       case TK_SEMI: {
375         pParse->zTail = &zSql[i];
376         /* Fall thru into the default case */
377       }
378       default: {
379         sqlite3Parser(pEngine, tokenType, pParse->sLastToken, pParse);
380         lastTokenParsed = tokenType;
381         if( pParse->rc!=SQLITE_OK ){
382           goto abort_parse;
383         }
384         break;
385       }
386     }
387   }
388 abort_parse:
389   if( zSql[i]==0 && nErr==0 && pParse->rc==SQLITE_OK ){
390     if( lastTokenParsed!=TK_SEMI ){
391       sqlite3Parser(pEngine, TK_SEMI, pParse->sLastToken, pParse);
392       pParse->zTail = &zSql[i];
393     }
394     sqlite3Parser(pEngine, 0, pParse->sLastToken, pParse);
395   }
396   sqlite3ParserFree(pEngine, sqlite3FreeX);
397   if( sqlite3_malloc_failed ){
398     pParse->rc = SQLITE_NOMEM;
399   }
400   if( pParse->rc!=SQLITE_OK && pParse->rc!=SQLITE_DONE && pParse->zErrMsg==0 ){
401     sqlite3SetString(&pParse->zErrMsg, sqlite3ErrStr(pParse->rc),
402                     (char*)0);
403   }
404   if( pParse->zErrMsg ){
405     if( pzErrMsg && *pzErrMsg==0 ){
406       *pzErrMsg = pParse->zErrMsg;
407     }else{
408       sqliteFree(pParse->zErrMsg);
409     }
410     pParse->zErrMsg = 0;
411     if( !nErr ) nErr++;
412   }
413   if( pParse->pVdbe && pParse->nErr>0 && pParse->nested==0 ){
414     sqlite3VdbeDelete(pParse->pVdbe);
415     pParse->pVdbe = 0;
416   }
417   sqlite3DeleteTable(pParse->db, pParse->pNewTable);
418   sqlite3DeleteTrigger(pParse->pNewTrigger);
419   sqliteFree(pParse->apVarExpr);
420   if( nErr>0 && (pParse->rc==SQLITE_OK || pParse->rc==SQLITE_DONE) ){
421     pParse->rc = SQLITE_ERROR;
422   }
423   return nErr;
424 }
425 
426 /* The sqlite3_complete() API may be omitted (to save code space) by
427 ** defining the following symbol.
428 */
429 #ifndef SQLITE_OMIT_COMPLETE
430 
431 /*
432 ** Token types used by the sqlite3_complete() routine.  See the header
433 ** comments on that procedure for additional information.
434 */
enum {
  tkSEMI    = 0,   /* A semicolon */
  tkWS      = 1,   /* Whitespace */
  tkOTHER   = 2,   /* Any other SQL token */
  tkEXPLAIN = 3,   /* The "explain" keyword */
  tkCREATE  = 4,   /* The "create" keyword */
  tkTEMP    = 5,   /* The "temp" or "temporary" keyword */
  tkTRIGGER = 6,   /* The "trigger" keyword */
  tkEND     = 7    /* The "end" keyword */
};
443 
444 /*
445 ** Return TRUE if the given SQL string ends in a semicolon.
446 **
447 ** Special handling is required for CREATE TRIGGER statements.
448 ** Whenever the CREATE TRIGGER keywords are seen, the statement
449 ** must end with ";END;".
450 **
451 ** This implementation uses a state machine with 7 states:
452 **
453 **   (0) START     At the beginning or end of an SQL statement.  This routine
454 **                 returns 1 if it ends in the START state and 0 if it ends
455 **                 in any other state.
456 **
457 **   (1) NORMAL    We are in the middle of statement which ends with a single
458 **                 semicolon.
459 **
460 **   (2) EXPLAIN   The keyword EXPLAIN has been seen at the beginning of
461 **                 a statement.
462 **
463 **   (3) CREATE    The keyword CREATE has been seen at the beginning of a
464 **                 statement, possibly preceded by EXPLAIN and/or followed by
465 **                 TEMP or TEMPORARY
466 **
467 **   (4) TRIGGER   We are in the middle of a trigger definition that must be
468 **                 ended by a semicolon, the keyword END, and another semicolon.
469 **
470 **   (5) SEMI      We've seen the first semicolon in the ";END;" that occurs at
471 **                 the end of a trigger definition.
472 **
473 **   (6) END       We've seen the ";END" of the ";END;" that occurs at the end
474 **                 of a trigger definition.
475 **
476 ** Transitions between states above are determined by tokens extracted
477 ** from the input.  The following tokens are significant:
478 **
479 **   (0) tkSEMI      A semicolon.
480 **   (1) tkWS        Whitespace
481 **   (2) tkOTHER     Any other SQL token.
482 **   (3) tkEXPLAIN   The "explain" keyword.
483 **   (4) tkCREATE    The "create" keyword.
484 **   (5) tkTEMP      The "temp" or "temporary" keyword.
485 **   (6) tkTRIGGER   The "trigger" keyword.
486 **   (7) tkEND       The "end" keyword.
487 **
488 ** Whitespace never causes a state transition and is always ignored.
489 **
490 ** If we compile with SQLITE_OMIT_TRIGGER, all of the computation needed
491 ** to recognize the end of a trigger can be omitted.  All we have to do
492 ** is look for a semicolon that is not part of a string or comment.
493 */
494 int sqlite3_complete(const char *zSql){
495   u8 state = 0;   /* Current state, using numbers defined in header comment */
496   u8 token;       /* Value of the next token */
497 
498 #ifndef SQLITE_OMIT_TRIGGER
499   /* A complex statement machine used to detect the end of a CREATE TRIGGER
500   ** statement.  This is the normal case.
501   */
502   static const u8 trans[7][8] = {
503                      /* Token:                                                */
504      /* State:       **  SEMI  WS  OTHER EXPLAIN  CREATE  TEMP  TRIGGER  END  */
505      /* 0   START: */ {    0,  0,     1,      2,      3,    1,       1,   1,  },
506      /* 1  NORMAL: */ {    0,  1,     1,      1,      1,    1,       1,   1,  },
507      /* 2 EXPLAIN: */ {    0,  2,     1,      1,      3,    1,       1,   1,  },
508      /* 3  CREATE: */ {    0,  3,     1,      1,      1,    3,       4,   1,  },
509      /* 4 TRIGGER: */ {    5,  4,     4,      4,      4,    4,       4,   4,  },
510      /* 5    SEMI: */ {    5,  5,     4,      4,      4,    4,       4,   6,  },
511      /* 6     END: */ {    0,  6,     4,      4,      4,    4,       4,   4,  },
512   };
513 #else
514   /* If triggers are not suppored by this compile then the statement machine
515   ** used to detect the end of a statement is much simplier
516   */
517   static const u8 trans[2][3] = {
518                      /* Token:           */
519      /* State:       **  SEMI  WS  OTHER */
520      /* 0   START: */ {    0,  0,     1, },
521      /* 1  NORMAL: */ {    0,  1,     1, },
522   };
523 #endif /* SQLITE_OMIT_TRIGGER */
524 
525   while( *zSql ){
526     switch( *zSql ){
527       case ';': {  /* A semicolon */
528         token = tkSEMI;
529         break;
530       }
531       case ' ':
532       case '\r':
533       case '\t':
534       case '\n':
535       case '\f': {  /* White space is ignored */
536         token = tkWS;
537         break;
538       }
539       case '/': {   /* C-style comments */
540         if( zSql[1]!='*' ){
541           token = tkOTHER;
542           break;
543         }
544         zSql += 2;
545         while( zSql[0] && (zSql[0]!='*' || zSql[1]!='/') ){ zSql++; }
546         if( zSql[0]==0 ) return 0;
547         zSql++;
548         token = tkWS;
549         break;
550       }
551       case '-': {   /* SQL-style comments from "--" to end of line */
552         if( zSql[1]!='-' ){
553           token = tkOTHER;
554           break;
555         }
556         while( *zSql && *zSql!='\n' ){ zSql++; }
557         if( *zSql==0 ) return state==0;
558         token = tkWS;
559         break;
560       }
561       case '[': {   /* Microsoft-style identifiers in [...] */
562         zSql++;
563         while( *zSql && *zSql!=']' ){ zSql++; }
564         if( *zSql==0 ) return 0;
565         token = tkOTHER;
566         break;
567       }
568       case '"':     /* single- and double-quoted strings */
569       case '\'': {
570         int c = *zSql;
571         zSql++;
572         while( *zSql && *zSql!=c ){ zSql++; }
573         if( *zSql==0 ) return 0;
574         token = tkOTHER;
575         break;
576       }
577       default: {
578         int c;
579         if( IdChar((u8)*zSql) ){
580           /* Keywords and unquoted identifiers */
581           int nId;
582           for(nId=1; IdChar(zSql[nId]); nId++){}
583 #ifdef SQLITE_OMIT_TRIGGER
584           token = tkOTHER;
585 #else
586           switch( *zSql ){
587             case 'c': case 'C': {
588               if( nId==6 && sqlite3StrNICmp(zSql, "create", 6)==0 ){
589                 token = tkCREATE;
590               }else{
591                 token = tkOTHER;
592               }
593               break;
594             }
595             case 't': case 'T': {
596               if( nId==7 && sqlite3StrNICmp(zSql, "trigger", 7)==0 ){
597                 token = tkTRIGGER;
598               }else if( nId==4 && sqlite3StrNICmp(zSql, "temp", 4)==0 ){
599                 token = tkTEMP;
600               }else if( nId==9 && sqlite3StrNICmp(zSql, "temporary", 9)==0 ){
601                 token = tkTEMP;
602               }else{
603                 token = tkOTHER;
604               }
605               break;
606             }
607             case 'e':  case 'E': {
608               if( nId==3 && sqlite3StrNICmp(zSql, "end", 3)==0 ){
609                 token = tkEND;
610               }else
611 #ifndef SQLITE_OMIT_EXPLAIN
612               if( nId==7 && sqlite3StrNICmp(zSql, "explain", 7)==0 ){
613                 token = tkEXPLAIN;
614               }else
615 #endif
616               {
617                 token = tkOTHER;
618               }
619               break;
620             }
621             default: {
622               token = tkOTHER;
623               break;
624             }
625           }
626 #endif /* SQLITE_OMIT_TRIGGER */
627           zSql += nId-1;
628         }else{
629           /* Operators and special symbols */
630           token = tkOTHER;
631         }
632         break;
633       }
634     }
635     state = trans[state][token];
636     zSql++;
637   }
638   return state==0;
639 }
640 
641 #ifndef SQLITE_OMIT_UTF16
642 /*
643 ** This routine is the same as the sqlite3_complete() routine described
644 ** above, except that the parameter is required to be UTF-16 encoded, not
645 ** UTF-8.
646 */
647 int sqlite3_complete16(const void *zSql){
648   sqlite3_value *pVal;
649   char const *zSql8;
650   int rc = 0;
651 
652   pVal = sqlite3ValueNew();
653   sqlite3ValueSetStr(pVal, -1, zSql, SQLITE_UTF16NATIVE, SQLITE_STATIC);
654   zSql8 = sqlite3ValueText(pVal, SQLITE_UTF8);
655   if( zSql8 ){
656     rc = sqlite3_complete(zSql8);
657   }
658   sqlite3ValueFree(pVal);
659   return rc;
660 }
661 #endif /* SQLITE_OMIT_UTF16 */
662 #endif /* SQLITE_OMIT_COMPLETE */
663