xref: /sqlite-3.40.0/src/tokenize.c (revision 74217cc0)
1 /*
2 ** 2001 September 15
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** A tokenizer for SQL
13 **
14 ** This file contains C code that splits an SQL input string up into
15 ** individual tokens and sends those tokens one-by-one over to the
16 ** parser for analysis.
17 **
18 ** $Id: tokenize.c,v 1.107 2005/08/23 11:31:26 drh Exp $
19 */
20 #include "sqliteInt.h"
21 #include "os.h"
22 #include <ctype.h>
23 #include <stdlib.h>
24 
25 /*
26 ** The sqlite3KeywordCode function looks up an identifier to determine if
27 ** it is a keyword.  If it is a keyword, the token code of that keyword is
28 ** returned.  If the input is not a keyword, TK_ID is returned.
29 **
30 ** The implementation of this routine was generated by a program,
31 ** mkkeywordhash.c, located in the tool subdirectory of the distribution.
32 ** The output of the mkkeywordhash.c program is written into a file
33 ** named keywordhash.h and then included into this source file by
34 ** the #include below.
35 */
36 #include "keywordhash.h"
37 
38 
39 /*
40 ** If X is a character that can be used in an identifier and
41 ** X&0x80==0 then sqlite3IsIdChar[X] will be 1.  If X&0x80==0x80 then
42 ** X is always an identifier character.  (Hence all UTF-8
43 ** characters can be part of an identifier).  sqlite3IsIdChar[X] will
44 ** be 0 for every character in the lower 128 ASCII characters
45 ** that cannot be used as part of an identifier.
46 **
47 ** In this implementation, an identifier can be a string of
48 ** alphabetic characters, digits, and "_" plus any character
49 ** with the high-order bit set.  The latter rule means that
50 ** any sequence of UTF-8 characters or characters taken from
51 ** an extended ISO8859 character set can form an identifier.
52 **
53 ** Ticket #1066.  The SQL standard does not allow '$' in the
54 ** middle of identifiers.  But many SQL implementations do.
55 ** SQLite will allow '$' in identifiers for compatibility.
56 ** But the feature is undocumented.
57 */
const char sqlite3IsIdChar[] = {
  /* 0x20-0x2f: space and punctuation; only '$' (0x24) is accepted */
  0, 0, 0, 0, 1, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30-0x3f: digits '0'..'9' are accepted; 0x3a-0x3f are not */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,
  /* 0x40-0x4f: '@' is rejected; 'A'..'O' are accepted */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50-0x5f: 'P'..'Z' and '_' (0x5f) are accepted; 0x5b-0x5e are not */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 1,
  /* 0x60-0x6f: backquote is rejected; 'a'..'o' are accepted */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70-0x7f: 'p'..'z' are accepted; 0x7b-0x7f are not */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,
};
67 
/*
** IdChar(C) is true if C may appear inside an identifier: either the
** high-order bit of C is set, or C lies in 0x20..0x7f and its entry in
** sqlite3IsIdChar[] is non-zero.
**
** The macro stores its argument in a variable named "c", which must be
** in scope wherever the macro is expanded.
*/
#define IdChar(C)  (((c=(C))&0x80)!=0 || (c>=0x20 && sqlite3IsIdChar[c-0x20]))
69 
70 /*
71 ** Return the length of the token that begins at z[0].
72 ** Store the token type in *tokenType before returning.
73 */
74 static int getToken(const unsigned char *z, int *tokenType){
75   int i, c;
76   switch( *z ){
77     case ' ': case '\t': case '\n': case '\f': case '\r': {
78       for(i=1; isspace(z[i]); i++){}
79       *tokenType = TK_SPACE;
80       return i;
81     }
82     case '-': {
83       if( z[1]=='-' ){
84         for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
85         *tokenType = TK_COMMENT;
86         return i;
87       }
88       *tokenType = TK_MINUS;
89       return 1;
90     }
91     case '(': {
92       *tokenType = TK_LP;
93       return 1;
94     }
95     case ')': {
96       *tokenType = TK_RP;
97       return 1;
98     }
99     case ';': {
100       *tokenType = TK_SEMI;
101       return 1;
102     }
103     case '+': {
104       *tokenType = TK_PLUS;
105       return 1;
106     }
107     case '*': {
108       *tokenType = TK_STAR;
109       return 1;
110     }
111     case '/': {
112       if( z[1]!='*' || z[2]==0 ){
113         *tokenType = TK_SLASH;
114         return 1;
115       }
116       for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
117       if( c ) i++;
118       *tokenType = TK_COMMENT;
119       return i;
120     }
121     case '%': {
122       *tokenType = TK_REM;
123       return 1;
124     }
125     case '=': {
126       *tokenType = TK_EQ;
127       return 1 + (z[1]=='=');
128     }
129     case '<': {
130       if( (c=z[1])=='=' ){
131         *tokenType = TK_LE;
132         return 2;
133       }else if( c=='>' ){
134         *tokenType = TK_NE;
135         return 2;
136       }else if( c=='<' ){
137         *tokenType = TK_LSHIFT;
138         return 2;
139       }else{
140         *tokenType = TK_LT;
141         return 1;
142       }
143     }
144     case '>': {
145       if( (c=z[1])=='=' ){
146         *tokenType = TK_GE;
147         return 2;
148       }else if( c=='>' ){
149         *tokenType = TK_RSHIFT;
150         return 2;
151       }else{
152         *tokenType = TK_GT;
153         return 1;
154       }
155     }
156     case '!': {
157       if( z[1]!='=' ){
158         *tokenType = TK_ILLEGAL;
159         return 2;
160       }else{
161         *tokenType = TK_NE;
162         return 2;
163       }
164     }
165     case '|': {
166       if( z[1]!='|' ){
167         *tokenType = TK_BITOR;
168         return 1;
169       }else{
170         *tokenType = TK_CONCAT;
171         return 2;
172       }
173     }
174     case ',': {
175       *tokenType = TK_COMMA;
176       return 1;
177     }
178     case '&': {
179       *tokenType = TK_BITAND;
180       return 1;
181     }
182     case '~': {
183       *tokenType = TK_BITNOT;
184       return 1;
185     }
186     case '`':
187     case '\'':
188     case '"': {
189       int delim = z[0];
190       for(i=1; (c=z[i])!=0; i++){
191         if( c==delim ){
192           if( z[i+1]==delim ){
193             i++;
194           }else{
195             break;
196           }
197         }
198       }
199       if( c ) i++;
200       *tokenType = TK_STRING;
201       return i;
202     }
203     case '.': {
204 #ifndef SQLITE_OMIT_FLOATING_POINT
205       if( !isdigit(z[1]) )
206 #endif
207       {
208         *tokenType = TK_DOT;
209         return 1;
210       }
211       /* If the next character is a digit, this is a floating point
212       ** number that begins with ".".  Fall thru into the next case */
213     }
214     case '0': case '1': case '2': case '3': case '4':
215     case '5': case '6': case '7': case '8': case '9': {
216       *tokenType = TK_INTEGER;
217       for(i=0; isdigit(z[i]); i++){}
218 #ifndef SQLITE_OMIT_FLOATING_POINT
219       if( z[i]=='.' ){
220         i++;
221         while( isdigit(z[i]) ){ i++; }
222         *tokenType = TK_FLOAT;
223       }
224       if( (z[i]=='e' || z[i]=='E') &&
225            ( isdigit(z[i+1])
226             || ((z[i+1]=='+' || z[i+1]=='-') && isdigit(z[i+2]))
227            )
228       ){
229         i += 2;
230         while( isdigit(z[i]) ){ i++; }
231         *tokenType = TK_FLOAT;
232       }
233 #endif
234       return i;
235     }
236     case '[': {
237       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
238       *tokenType = TK_ID;
239       return i;
240     }
241     case '?': {
242       *tokenType = TK_VARIABLE;
243       for(i=1; isdigit(z[i]); i++){}
244       return i;
245     }
246     case '#': {
247       for(i=1; isdigit(z[i]); i++){}
248       if( i>1 ){
249         /* Parameters of the form #NNN (where NNN is a number) are used
250         ** internally by sqlite3NestedParse.  */
251         *tokenType = TK_REGISTER;
252         return i;
253       }
254       /* Fall through into the next case if the '#' is not followed by
255       ** a digit. Try to match #AAAA where AAAA is a parameter name. */
256     }
257 #ifndef SQLITE_OMIT_TCL_VARIABLE
258     case '$':
259 #endif
260     case ':': {
261       int n = 0;
262       *tokenType = TK_VARIABLE;
263       for(i=1; (c=z[i])!=0; i++){
264         if( IdChar(c) ){
265           n++;
266 #ifndef SQLITE_OMIT_TCL_VARIABLE
267         }else if( c=='(' && n>0 ){
268           do{
269             i++;
270           }while( (c=z[i])!=0 && !isspace(c) && c!=')' );
271           if( c==')' ){
272             i++;
273           }else{
274             *tokenType = TK_ILLEGAL;
275           }
276           break;
277         }else if( c==':' && z[i+1]==':' ){
278           i++;
279 #endif
280         }else{
281           break;
282         }
283       }
284       if( n==0 ) *tokenType = TK_ILLEGAL;
285       return i;
286     }
287 #ifndef SQLITE_OMIT_BLOB_LITERAL
288     case 'x': case 'X': {
289       if( (c=z[1])=='\'' || c=='"' ){
290         int delim = c;
291         *tokenType = TK_BLOB;
292         for(i=2; (c=z[i])!=0; i++){
293           if( c==delim ){
294             if( i%2 ) *tokenType = TK_ILLEGAL;
295             break;
296           }
297           if( !isxdigit(c) ){
298             *tokenType = TK_ILLEGAL;
299             return i;
300           }
301         }
302         if( c ) i++;
303         return i;
304       }
305       /* Otherwise fall through to the next case */
306     }
307 #endif
308     default: {
309       if( !IdChar(*z) ){
310         break;
311       }
312       for(i=1; IdChar(z[i]); i++){}
313       *tokenType = keywordCode((char*)z, i);
314       return i;
315     }
316   }
317   *tokenType = TK_ILLEGAL;
318   return 1;
319 }
/*
** Externally visible wrapper around the file-local getToken().  The
** arguments are passed through untouched and the length reported by
** getToken() is returned as-is.
*/
int sqlite3GetToken(const unsigned char *z, int *tokenType){
  int nToken = getToken(z, tokenType);
  return nToken;
}
323 
324 /*
325 ** Run the parser on the given SQL string.  The parser structure is
326 ** passed in.  An SQLITE_ status code is returned.  If an error occurs
327 ** and pzErrMsg!=NULL then an error message might be written into
328 ** memory obtained from malloc() and *pzErrMsg made to point to that
329 ** error message.  Or maybe not.
330 */
331 int sqlite3RunParser(Parse *pParse, const char *zSql, char **pzErrMsg){
332   int nErr = 0;
333   int i;
334   void *pEngine;
335   int tokenType;
336   int lastTokenParsed = -1;
337   sqlite3 *db = pParse->db;
338   extern void *sqlite3ParserAlloc(void*(*)(int));
339   extern void sqlite3ParserFree(void*, void(*)(void*));
340   extern int sqlite3Parser(void*, int, Token, Parse*);
341 
342   db->flags &= ~SQLITE_Interrupt;
343   pParse->rc = SQLITE_OK;
344   i = 0;
345   pEngine = sqlite3ParserAlloc((void*(*)(int))sqlite3MallocX);
346   if( pEngine==0 ){
347     sqlite3SetString(pzErrMsg, "out of memory", (char*)0);
348     return SQLITE_NOMEM;
349   }
350   assert( pParse->sLastToken.dyn==0 );
351   assert( pParse->pNewTable==0 );
352   assert( pParse->pNewTrigger==0 );
353   assert( pParse->nVar==0 );
354   assert( pParse->nVarExpr==0 );
355   assert( pParse->nVarExprAlloc==0 );
356   assert( pParse->apVarExpr==0 );
357   pParse->zTail = pParse->zSql = zSql;
358   while( sqlite3_malloc_failed==0 && zSql[i]!=0 ){
359     assert( i>=0 );
360     pParse->sLastToken.z = &zSql[i];
361     assert( pParse->sLastToken.dyn==0 );
362     pParse->sLastToken.n = getToken((unsigned char*)&zSql[i],&tokenType);
363     i += pParse->sLastToken.n;
364     switch( tokenType ){
365       case TK_SPACE:
366       case TK_COMMENT: {
367         if( (db->flags & SQLITE_Interrupt)!=0 ){
368           pParse->rc = SQLITE_INTERRUPT;
369           sqlite3SetString(pzErrMsg, "interrupt", (char*)0);
370           goto abort_parse;
371         }
372         break;
373       }
374       case TK_ILLEGAL: {
375         if( pzErrMsg ){
376           sqliteFree(*pzErrMsg);
377           *pzErrMsg = sqlite3MPrintf("unrecognized token: \"%T\"",
378                           &pParse->sLastToken);
379         }
380         nErr++;
381         goto abort_parse;
382       }
383       case TK_SEMI: {
384         pParse->zTail = &zSql[i];
385         /* Fall thru into the default case */
386       }
387       default: {
388         sqlite3Parser(pEngine, tokenType, pParse->sLastToken, pParse);
389         lastTokenParsed = tokenType;
390         if( pParse->rc!=SQLITE_OK ){
391           goto abort_parse;
392         }
393         break;
394       }
395     }
396   }
397 abort_parse:
398   if( zSql[i]==0 && nErr==0 && pParse->rc==SQLITE_OK ){
399     if( lastTokenParsed!=TK_SEMI ){
400       sqlite3Parser(pEngine, TK_SEMI, pParse->sLastToken, pParse);
401       pParse->zTail = &zSql[i];
402     }
403     sqlite3Parser(pEngine, 0, pParse->sLastToken, pParse);
404   }
405   sqlite3ParserFree(pEngine, sqlite3FreeX);
406   if( sqlite3_malloc_failed ){
407     pParse->rc = SQLITE_NOMEM;
408   }
409   if( pParse->rc!=SQLITE_OK && pParse->rc!=SQLITE_DONE && pParse->zErrMsg==0 ){
410     sqlite3SetString(&pParse->zErrMsg, sqlite3ErrStr(pParse->rc),
411                     (char*)0);
412   }
413   if( pParse->zErrMsg ){
414     if( pzErrMsg && *pzErrMsg==0 ){
415       *pzErrMsg = pParse->zErrMsg;
416     }else{
417       sqliteFree(pParse->zErrMsg);
418     }
419     pParse->zErrMsg = 0;
420     if( !nErr ) nErr++;
421   }
422   if( pParse->pVdbe && pParse->nErr>0 && pParse->nested==0 ){
423     sqlite3VdbeDelete(pParse->pVdbe);
424     pParse->pVdbe = 0;
425   }
426   sqlite3DeleteTable(pParse->db, pParse->pNewTable);
427   sqlite3DeleteTrigger(pParse->pNewTrigger);
428   sqliteFree(pParse->apVarExpr);
429   if( nErr>0 && (pParse->rc==SQLITE_OK || pParse->rc==SQLITE_DONE) ){
430     pParse->rc = SQLITE_ERROR;
431   }
432   return nErr;
433 }
434