xref: /sqlite-3.40.0/ext/misc/csv.c (revision e7ebe0aa)
1 /*
2 ** 2016-05-28
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 **
13 ** This file contains the implementation of an SQLite virtual table for
14 ** reading CSV files.
15 **
16 ** Usage:
17 **
18 **    .load ./csv
19 **    CREATE VIRTUAL TABLE temp.csv USING csv(filename=FILENAME);
20 **    SELECT * FROM csv;
21 **
22 ** The columns are named "c1", "c2", "c3", ... by default.  Or the
23 ** application can define its own CREATE TABLE statement using the
24 ** schema= parameter, like this:
25 **
26 **    CREATE VIRTUAL TABLE temp.csv2 USING csv(
27 **       filename = "../http.log",
28 **       schema = "CREATE TABLE x(date,ipaddr,url,referrer,userAgent)"
29 **    );
30 **
31 ** Instead of specifying a file, the text of the CSV can be loaded using
32 ** the data= parameter.
33 **
34 ** If the columns=N parameter is supplied, then the CSV file is assumed to have
35 ** N columns.  If both the columns= and schema= parameters are omitted, then
36 ** the number and names of the columns is determined by the first line of
37 ** the CSV input.
38 **
39 ** Some extra debugging features (used for testing virtual tables) are available
40 ** if this module is compiled with -DSQLITE_TEST.
41 */
42 #include <sqlite3ext.h>
43 SQLITE_EXTENSION_INIT1
44 #include <string.h>
45 #include <stdlib.h>
46 #include <assert.h>
47 #include <stdarg.h>
48 #include <ctype.h>
49 #include <stdio.h>
50 
51 #ifndef SQLITE_OMIT_VIRTUALTABLE
52 
53 /*
54 ** A macro to hint to the compiler that a function should not be
55 ** inlined.
56 */
57 #if defined(__GNUC__)
58 #  define CSV_NOINLINE  __attribute__((noinline))
59 #elif defined(_MSC_VER) && _MSC_VER>=1310
60 #  define CSV_NOINLINE  __declspec(noinline)
61 #else
62 #  define CSV_NOINLINE
63 #endif
64 
65 
66 /* Max size of the error message in a CsvReader */
67 #define CSV_MXERR 200
68 
69 /* Size of the CsvReader input buffer */
70 #define CSV_INBUFSZ 1024
71 
72 /* A context object used when read a CSV file. */
73 typedef struct CsvReader CsvReader;
74 struct CsvReader {
75   FILE *in;              /* Read the CSV text from this input stream */
76   char *z;               /* Accumulated text for a field */
77   int n;                 /* Number of bytes in z */
78   int nAlloc;            /* Space allocated for z[] */
79   int nLine;             /* Current line number */
80   int bNotFirst;         /* True if prior text has been seen */
81   int cTerm;             /* Character that terminated the most recent field */
82   size_t iIn;            /* Next unread character in the input buffer */
83   size_t nIn;            /* Number of characters in the input buffer */
84   char *zIn;             /* The input buffer */
85   char zErr[CSV_MXERR];  /* Error message */
86 };
87 
88 /* Initialize a CsvReader object */
csv_reader_init(CsvReader * p)89 static void csv_reader_init(CsvReader *p){
90   p->in = 0;
91   p->z = 0;
92   p->n = 0;
93   p->nAlloc = 0;
94   p->nLine = 0;
95   p->bNotFirst = 0;
96   p->nIn = 0;
97   p->zIn = 0;
98   p->zErr[0] = 0;
99 }
100 
101 /* Close and reset a CsvReader object */
csv_reader_reset(CsvReader * p)102 static void csv_reader_reset(CsvReader *p){
103   if( p->in ){
104     fclose(p->in);
105     sqlite3_free(p->zIn);
106   }
107   sqlite3_free(p->z);
108   csv_reader_init(p);
109 }
110 
111 /* Report an error on a CsvReader */
csv_errmsg(CsvReader * p,const char * zFormat,...)112 static void csv_errmsg(CsvReader *p, const char *zFormat, ...){
113   va_list ap;
114   va_start(ap, zFormat);
115   sqlite3_vsnprintf(CSV_MXERR, p->zErr, zFormat, ap);
116   va_end(ap);
117 }
118 
119 /* Open the file associated with a CsvReader
120 ** Return the number of errors.
121 */
csv_reader_open(CsvReader * p,const char * zFilename,const char * zData)122 static int csv_reader_open(
123   CsvReader *p,               /* The reader to open */
124   const char *zFilename,      /* Read from this filename */
125   const char *zData           /*  ... or use this data */
126 ){
127   if( zFilename ){
128     p->zIn = sqlite3_malloc( CSV_INBUFSZ );
129     if( p->zIn==0 ){
130       csv_errmsg(p, "out of memory");
131       return 1;
132     }
133     p->in = fopen(zFilename, "rb");
134     if( p->in==0 ){
135       sqlite3_free(p->zIn);
136       csv_reader_reset(p);
137       csv_errmsg(p, "cannot open '%s' for reading", zFilename);
138       return 1;
139     }
140   }else{
141     assert( p->in==0 );
142     p->zIn = (char*)zData;
143     p->nIn = strlen(zData);
144   }
145   return 0;
146 }
147 
148 /* The input buffer has overflowed.  Refill the input buffer, then
149 ** return the next character
150 */
csv_getc_refill(CsvReader * p)151 static CSV_NOINLINE int csv_getc_refill(CsvReader *p){
152   size_t got;
153 
154   assert( p->iIn>=p->nIn );  /* Only called on an empty input buffer */
155   assert( p->in!=0 );        /* Only called if reading froma file */
156 
157   got = fread(p->zIn, 1, CSV_INBUFSZ, p->in);
158   if( got==0 ) return EOF;
159   p->nIn = got;
160   p->iIn = 1;
161   return p->zIn[0];
162 }
163 
164 /* Return the next character of input.  Return EOF at end of input. */
csv_getc(CsvReader * p)165 static int csv_getc(CsvReader *p){
166   if( p->iIn >= p->nIn ){
167     if( p->in!=0 ) return csv_getc_refill(p);
168     return EOF;
169   }
170   return ((unsigned char*)p->zIn)[p->iIn++];
171 }
172 
173 /* Increase the size of p->z and append character c to the end.
174 ** Return 0 on success and non-zero if there is an OOM error */
csv_resize_and_append(CsvReader * p,char c)175 static CSV_NOINLINE int csv_resize_and_append(CsvReader *p, char c){
176   char *zNew;
177   int nNew = p->nAlloc*2 + 100;
178   zNew = sqlite3_realloc64(p->z, nNew);
179   if( zNew ){
180     p->z = zNew;
181     p->nAlloc = nNew;
182     p->z[p->n++] = c;
183     return 0;
184   }else{
185     csv_errmsg(p, "out of memory");
186     return 1;
187   }
188 }
189 
190 /* Append a single character to the CsvReader.z[] array.
191 ** Return 0 on success and non-zero if there is an OOM error */
csv_append(CsvReader * p,char c)192 static int csv_append(CsvReader *p, char c){
193   if( p->n>=p->nAlloc-1 ) return csv_resize_and_append(p, c);
194   p->z[p->n++] = c;
195   return 0;
196 }
197 
198 /* Read a single field of CSV text.  Compatible with rfc4180 and extended
199 ** with the option of having a separator other than ",".
200 **
201 **   +  Input comes from p->in.
202 **   +  Store results in p->z of length p->n.  Space to hold p->z comes
203 **      from sqlite3_malloc64().
204 **   +  Keep track of the line number in p->nLine.
205 **   +  Store the character that terminates the field in p->cTerm.  Store
206 **      EOF on end-of-file.
207 **
208 ** Return 0 at EOF or on OOM.  On EOF, the p->cTerm character will have
209 ** been set to EOF.
210 */
csv_read_one_field(CsvReader * p)211 static char *csv_read_one_field(CsvReader *p){
212   int c;
213   p->n = 0;
214   c = csv_getc(p);
215   if( c==EOF ){
216     p->cTerm = EOF;
217     return 0;
218   }
219   if( c=='"' ){
220     int pc, ppc;
221     int startLine = p->nLine;
222     pc = ppc = 0;
223     while( 1 ){
224       c = csv_getc(p);
225       if( c<='"' || pc=='"' ){
226         if( c=='\n' ) p->nLine++;
227         if( c=='"' ){
228           if( pc=='"' ){
229             pc = 0;
230             continue;
231           }
232         }
233         if( (c==',' && pc=='"')
234          || (c=='\n' && pc=='"')
235          || (c=='\n' && pc=='\r' && ppc=='"')
236          || (c==EOF && pc=='"')
237         ){
238           do{ p->n--; }while( p->z[p->n]!='"' );
239           p->cTerm = (char)c;
240           break;
241         }
242         if( pc=='"' && c!='\r' ){
243           csv_errmsg(p, "line %d: unescaped %c character", p->nLine, '"');
244           break;
245         }
246         if( c==EOF ){
247           csv_errmsg(p, "line %d: unterminated %c-quoted field\n",
248                      startLine, '"');
249           p->cTerm = (char)c;
250           break;
251         }
252       }
253       if( csv_append(p, (char)c) ) return 0;
254       ppc = pc;
255       pc = c;
256     }
257   }else{
258     /* If this is the first field being parsed and it begins with the
259     ** UTF-8 BOM  (0xEF BB BF) then skip the BOM */
260     if( (c&0xff)==0xef && p->bNotFirst==0 ){
261       csv_append(p, (char)c);
262       c = csv_getc(p);
263       if( (c&0xff)==0xbb ){
264         csv_append(p, (char)c);
265         c = csv_getc(p);
266         if( (c&0xff)==0xbf ){
267           p->bNotFirst = 1;
268           p->n = 0;
269           return csv_read_one_field(p);
270         }
271       }
272     }
273     while( c>',' || (c!=EOF && c!=',' && c!='\n') ){
274       if( csv_append(p, (char)c) ) return 0;
275       c = csv_getc(p);
276     }
277     if( c=='\n' ){
278       p->nLine++;
279       if( p->n>0 && p->z[p->n-1]=='\r' ) p->n--;
280     }
281     p->cTerm = (char)c;
282   }
283   assert( p->z==0 || p->n<p->nAlloc );
284   if( p->z ) p->z[p->n] = 0;
285   p->bNotFirst = 1;
286   return p->z;
287 }
288 
289 
290 /* Forward references to the various virtual table methods implemented
291 ** in this file. */
292 static int csvtabCreate(sqlite3*, void*, int, const char*const*,
293                            sqlite3_vtab**,char**);
294 static int csvtabConnect(sqlite3*, void*, int, const char*const*,
295                            sqlite3_vtab**,char**);
296 static int csvtabBestIndex(sqlite3_vtab*,sqlite3_index_info*);
297 static int csvtabDisconnect(sqlite3_vtab*);
298 static int csvtabOpen(sqlite3_vtab*, sqlite3_vtab_cursor**);
299 static int csvtabClose(sqlite3_vtab_cursor*);
300 static int csvtabFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr,
301                           int argc, sqlite3_value **argv);
302 static int csvtabNext(sqlite3_vtab_cursor*);
303 static int csvtabEof(sqlite3_vtab_cursor*);
304 static int csvtabColumn(sqlite3_vtab_cursor*,sqlite3_context*,int);
305 static int csvtabRowid(sqlite3_vtab_cursor*,sqlite3_int64*);
306 
307 /* An instance of the CSV virtual table */
308 typedef struct CsvTable {
309   sqlite3_vtab base;              /* Base class.  Must be first */
310   char *zFilename;                /* Name of the CSV file */
311   char *zData;                    /* Raw CSV data in lieu of zFilename */
312   long iStart;                    /* Offset to start of data in zFilename */
313   int nCol;                       /* Number of columns in the CSV file */
314   unsigned int tstFlags;          /* Bit values used for testing */
315 } CsvTable;
316 
317 /* Allowed values for tstFlags */
318 #define CSVTEST_FIDX  0x0001      /* Pretend that constrained searchs cost less*/
319 
320 /* A cursor for the CSV virtual table */
321 typedef struct CsvCursor {
322   sqlite3_vtab_cursor base;       /* Base class.  Must be first */
323   CsvReader rdr;                  /* The CsvReader object */
324   char **azVal;                   /* Value of the current row */
325   int *aLen;                      /* Length of each entry */
326   sqlite3_int64 iRowid;           /* The current rowid.  Negative for EOF */
327 } CsvCursor;
328 
329 /* Transfer error message text from a reader into a CsvTable */
csv_xfer_error(CsvTable * pTab,CsvReader * pRdr)330 static void csv_xfer_error(CsvTable *pTab, CsvReader *pRdr){
331   sqlite3_free(pTab->base.zErrMsg);
332   pTab->base.zErrMsg = sqlite3_mprintf("%s", pRdr->zErr);
333 }
334 
335 /*
336 ** This method is the destructor fo a CsvTable object.
337 */
csvtabDisconnect(sqlite3_vtab * pVtab)338 static int csvtabDisconnect(sqlite3_vtab *pVtab){
339   CsvTable *p = (CsvTable*)pVtab;
340   sqlite3_free(p->zFilename);
341   sqlite3_free(p->zData);
342   sqlite3_free(p);
343   return SQLITE_OK;
344 }
345 
346 /* Skip leading whitespace.  Return a pointer to the first non-whitespace
347 ** character, or to the zero terminator if the string has only whitespace */
csv_skip_whitespace(const char * z)348 static const char *csv_skip_whitespace(const char *z){
349   while( isspace((unsigned char)z[0]) ) z++;
350   return z;
351 }
352 
353 /* Remove trailing whitespace from the end of string z[] */
csv_trim_whitespace(char * z)354 static void csv_trim_whitespace(char *z){
355   size_t n = strlen(z);
356   while( n>0 && isspace((unsigned char)z[n]) ) n--;
357   z[n] = 0;
358 }
359 
360 /* Dequote the string */
csv_dequote(char * z)361 static void csv_dequote(char *z){
362   int j;
363   char cQuote = z[0];
364   size_t i, n;
365 
366   if( cQuote!='\'' && cQuote!='"' ) return;
367   n = strlen(z);
368   if( n<2 || z[n-1]!=z[0] ) return;
369   for(i=1, j=0; i<n-1; i++){
370     if( z[i]==cQuote && z[i+1]==cQuote ) i++;
371     z[j++] = z[i];
372   }
373   z[j] = 0;
374 }
375 
376 /* Check to see if the string is of the form:  "TAG = VALUE" with optional
377 ** whitespace before and around tokens.  If it is, return a pointer to the
378 ** first character of VALUE.  If it is not, return NULL.
379 */
csv_parameter(const char * zTag,int nTag,const char * z)380 static const char *csv_parameter(const char *zTag, int nTag, const char *z){
381   z = csv_skip_whitespace(z);
382   if( strncmp(zTag, z, nTag)!=0 ) return 0;
383   z = csv_skip_whitespace(z+nTag);
384   if( z[0]!='=' ) return 0;
385   return csv_skip_whitespace(z+1);
386 }
387 
388 /* Decode a parameter that requires a dequoted string.
389 **
390 ** Return 1 if the parameter is seen, or 0 if not.  1 is returned
391 ** even if there is an error.  If an error occurs, then an error message
392 ** is left in p->zErr.  If there are no errors, p->zErr[0]==0.
393 */
csv_string_parameter(CsvReader * p,const char * zParam,const char * zArg,char ** pzVal)394 static int csv_string_parameter(
395   CsvReader *p,            /* Leave the error message here, if there is one */
396   const char *zParam,      /* Parameter we are checking for */
397   const char *zArg,        /* Raw text of the virtual table argment */
398   char **pzVal             /* Write the dequoted string value here */
399 ){
400   const char *zValue;
401   zValue = csv_parameter(zParam,(int)strlen(zParam),zArg);
402   if( zValue==0 ) return 0;
403   p->zErr[0] = 0;
404   if( *pzVal ){
405     csv_errmsg(p, "more than one '%s' parameter", zParam);
406     return 1;
407   }
408   *pzVal = sqlite3_mprintf("%s", zValue);
409   if( *pzVal==0 ){
410     csv_errmsg(p, "out of memory");
411     return 1;
412   }
413   csv_trim_whitespace(*pzVal);
414   csv_dequote(*pzVal);
415   return 1;
416 }
417 
418 
419 /* Return 0 if the argument is false and 1 if it is true.  Return -1 if
420 ** we cannot really tell.
421 */
csv_boolean(const char * z)422 static int csv_boolean(const char *z){
423   if( sqlite3_stricmp("yes",z)==0
424    || sqlite3_stricmp("on",z)==0
425    || sqlite3_stricmp("true",z)==0
426    || (z[0]=='1' && z[1]==0)
427   ){
428     return 1;
429   }
430   if( sqlite3_stricmp("no",z)==0
431    || sqlite3_stricmp("off",z)==0
432    || sqlite3_stricmp("false",z)==0
433    || (z[0]=='0' && z[1]==0)
434   ){
435     return 0;
436   }
437   return -1;
438 }
439 
440 /* Check to see if the string is of the form:  "TAG = BOOLEAN" or just "TAG".
441 ** If it is, set *pValue to be the value of the boolean ("true" if there is
442 ** not "= BOOLEAN" component) and return non-zero.  If the input string
443 ** does not begin with TAG, return zero.
444 */
csv_boolean_parameter(const char * zTag,int nTag,const char * z,int * pValue)445 static int csv_boolean_parameter(
446   const char *zTag,       /* Tag we are looking for */
447   int nTag,               /* Size of the tag in bytes */
448   const char *z,          /* Input parameter */
449   int *pValue             /* Write boolean value here */
450 ){
451   int b;
452   z = csv_skip_whitespace(z);
453   if( strncmp(zTag, z, nTag)!=0 ) return 0;
454   z = csv_skip_whitespace(z + nTag);
455   if( z[0]==0 ){
456     *pValue = 1;
457     return 1;
458   }
459   if( z[0]!='=' ) return 0;
460   z = csv_skip_whitespace(z+1);
461   b = csv_boolean(z);
462   if( b>=0 ){
463     *pValue = b;
464     return 1;
465   }
466   return 0;
467 }
468 
469 /*
470 ** Parameters:
471 **    filename=FILENAME          Name of file containing CSV content
472 **    data=TEXT                  Direct CSV content.
473 **    schema=SCHEMA              Alternative CSV schema.
474 **    header=YES|NO              First row of CSV defines the names of
475 **                               columns if "yes".  Default "no".
476 **    columns=N                  Assume the CSV file contains N columns.
477 **
478 ** Only available if compiled with SQLITE_TEST:
479 **
480 **    testflags=N                Bitmask of test flags.  Optional
481 **
482 ** If schema= is omitted, then the columns are named "c0", "c1", "c2",
483 ** and so forth.  If columns=N is omitted, then the file is opened and
484 ** the number of columns in the first row is counted to determine the
485 ** column count.  If header=YES, then the first row is skipped.
486 */
csvtabConnect(sqlite3 * db,void * pAux,int argc,const char * const * argv,sqlite3_vtab ** ppVtab,char ** pzErr)487 static int csvtabConnect(
488   sqlite3 *db,
489   void *pAux,
490   int argc, const char *const*argv,
491   sqlite3_vtab **ppVtab,
492   char **pzErr
493 ){
494   CsvTable *pNew = 0;        /* The CsvTable object to construct */
495   int bHeader = -1;          /* header= flags.  -1 means not seen yet */
496   int rc = SQLITE_OK;        /* Result code from this routine */
497   int i, j;                  /* Loop counters */
498 #ifdef SQLITE_TEST
499   int tstFlags = 0;          /* Value for testflags=N parameter */
500 #endif
501   int b;                     /* Value of a boolean parameter */
502   int nCol = -99;            /* Value of the columns= parameter */
503   CsvReader sRdr;            /* A CSV file reader used to store an error
504                              ** message and/or to count the number of columns */
505   static const char *azParam[] = {
506      "filename", "data", "schema",
507   };
508   char *azPValue[3];         /* Parameter values */
509 # define CSV_FILENAME (azPValue[0])
510 # define CSV_DATA     (azPValue[1])
511 # define CSV_SCHEMA   (azPValue[2])
512 
513 
514   assert( sizeof(azPValue)==sizeof(azParam) );
515   memset(&sRdr, 0, sizeof(sRdr));
516   memset(azPValue, 0, sizeof(azPValue));
517   for(i=3; i<argc; i++){
518     const char *z = argv[i];
519     const char *zValue;
520     for(j=0; j<sizeof(azParam)/sizeof(azParam[0]); j++){
521       if( csv_string_parameter(&sRdr, azParam[j], z, &azPValue[j]) ) break;
522     }
523     if( j<sizeof(azParam)/sizeof(azParam[0]) ){
524       if( sRdr.zErr[0] ) goto csvtab_connect_error;
525     }else
526     if( csv_boolean_parameter("header",6,z,&b) ){
527       if( bHeader>=0 ){
528         csv_errmsg(&sRdr, "more than one 'header' parameter");
529         goto csvtab_connect_error;
530       }
531       bHeader = b;
532     }else
533 #ifdef SQLITE_TEST
534     if( (zValue = csv_parameter("testflags",9,z))!=0 ){
535       tstFlags = (unsigned int)atoi(zValue);
536     }else
537 #endif
538     if( (zValue = csv_parameter("columns",7,z))!=0 ){
539       if( nCol>0 ){
540         csv_errmsg(&sRdr, "more than one 'columns' parameter");
541         goto csvtab_connect_error;
542       }
543       nCol = atoi(zValue);
544       if( nCol<=0 ){
545         csv_errmsg(&sRdr, "column= value must be positive");
546         goto csvtab_connect_error;
547       }
548     }else
549     {
550       csv_errmsg(&sRdr, "bad parameter: '%s'", z);
551       goto csvtab_connect_error;
552     }
553   }
554   if( (CSV_FILENAME==0)==(CSV_DATA==0) ){
555     csv_errmsg(&sRdr, "must specify either filename= or data= but not both");
556     goto csvtab_connect_error;
557   }
558 
559   if( (nCol<=0 || bHeader==1)
560    && csv_reader_open(&sRdr, CSV_FILENAME, CSV_DATA)
561   ){
562     goto csvtab_connect_error;
563   }
564   pNew = sqlite3_malloc( sizeof(*pNew) );
565   *ppVtab = (sqlite3_vtab*)pNew;
566   if( pNew==0 ) goto csvtab_connect_oom;
567   memset(pNew, 0, sizeof(*pNew));
568   if( CSV_SCHEMA==0 ){
569     sqlite3_str *pStr = sqlite3_str_new(0);
570     char *zSep = "";
571     int iCol = 0;
572     sqlite3_str_appendf(pStr, "CREATE TABLE x(");
573     if( nCol<0 && bHeader<1 ){
574       nCol = 0;
575       do{
576         csv_read_one_field(&sRdr);
577         nCol++;
578       }while( sRdr.cTerm==',' );
579     }
580     if( nCol>0 && bHeader<1 ){
581       for(iCol=0; iCol<nCol; iCol++){
582         sqlite3_str_appendf(pStr, "%sc%d TEXT", zSep, iCol);
583         zSep = ",";
584       }
585     }else{
586       do{
587         char *z = csv_read_one_field(&sRdr);
588         if( (nCol>0 && iCol<nCol) || (nCol<0 && bHeader) ){
589           sqlite3_str_appendf(pStr,"%s\"%w\" TEXT", zSep, z);
590           zSep = ",";
591           iCol++;
592         }
593       }while( sRdr.cTerm==',' );
594       if( nCol<0 ){
595         nCol = iCol;
596       }else{
597         while( iCol<nCol ){
598           sqlite3_str_appendf(pStr,"%sc%d TEXT", zSep, ++iCol);
599           zSep = ",";
600         }
601       }
602     }
603     pNew->nCol = nCol;
604     sqlite3_str_appendf(pStr, ")");
605     CSV_SCHEMA = sqlite3_str_finish(pStr);
606     if( CSV_SCHEMA==0 ) goto csvtab_connect_oom;
607   }else if( nCol<0 ){
608     do{
609       csv_read_one_field(&sRdr);
610       pNew->nCol++;
611     }while( sRdr.cTerm==',' );
612   }else{
613     pNew->nCol = nCol;
614   }
615   pNew->zFilename = CSV_FILENAME;  CSV_FILENAME = 0;
616   pNew->zData = CSV_DATA;          CSV_DATA = 0;
617 #ifdef SQLITE_TEST
618   pNew->tstFlags = tstFlags;
619 #endif
620   if( bHeader!=1 ){
621     pNew->iStart = 0;
622   }else if( pNew->zData ){
623     pNew->iStart = (int)sRdr.iIn;
624   }else{
625     pNew->iStart = (int)(ftell(sRdr.in) - sRdr.nIn + sRdr.iIn);
626   }
627   csv_reader_reset(&sRdr);
628   rc = sqlite3_declare_vtab(db, CSV_SCHEMA);
629   if( rc ){
630     csv_errmsg(&sRdr, "bad schema: '%s' - %s", CSV_SCHEMA, sqlite3_errmsg(db));
631     goto csvtab_connect_error;
632   }
633   for(i=0; i<sizeof(azPValue)/sizeof(azPValue[0]); i++){
634     sqlite3_free(azPValue[i]);
635   }
636   /* Rationale for DIRECTONLY:
637   ** An attacker who controls a database schema could use this vtab
638   ** to exfiltrate sensitive data from other files in the filesystem.
639   ** And, recommended practice is to put all CSV virtual tables in the
640   ** TEMP namespace, so they should still be usable from within TEMP
641   ** views, so there shouldn't be a serious loss of functionality by
642   ** prohibiting the use of this vtab from persistent triggers and views.
643   */
644   sqlite3_vtab_config(db, SQLITE_VTAB_DIRECTONLY);
645   return SQLITE_OK;
646 
647 csvtab_connect_oom:
648   rc = SQLITE_NOMEM;
649   csv_errmsg(&sRdr, "out of memory");
650 
651 csvtab_connect_error:
652   if( pNew ) csvtabDisconnect(&pNew->base);
653   for(i=0; i<sizeof(azPValue)/sizeof(azPValue[0]); i++){
654     sqlite3_free(azPValue[i]);
655   }
656   if( sRdr.zErr[0] ){
657     sqlite3_free(*pzErr);
658     *pzErr = sqlite3_mprintf("%s", sRdr.zErr);
659   }
660   csv_reader_reset(&sRdr);
661   if( rc==SQLITE_OK ) rc = SQLITE_ERROR;
662   return rc;
663 }
664 
665 /*
666 ** Reset the current row content held by a CsvCursor.
667 */
csvtabCursorRowReset(CsvCursor * pCur)668 static void csvtabCursorRowReset(CsvCursor *pCur){
669   CsvTable *pTab = (CsvTable*)pCur->base.pVtab;
670   int i;
671   for(i=0; i<pTab->nCol; i++){
672     sqlite3_free(pCur->azVal[i]);
673     pCur->azVal[i] = 0;
674     pCur->aLen[i] = 0;
675   }
676 }
677 
678 /*
679 ** The xConnect and xCreate methods do the same thing, but they must be
680 ** different so that the virtual table is not an eponymous virtual table.
681 */
csvtabCreate(sqlite3 * db,void * pAux,int argc,const char * const * argv,sqlite3_vtab ** ppVtab,char ** pzErr)682 static int csvtabCreate(
683   sqlite3 *db,
684   void *pAux,
685   int argc, const char *const*argv,
686   sqlite3_vtab **ppVtab,
687   char **pzErr
688 ){
689  return csvtabConnect(db, pAux, argc, argv, ppVtab, pzErr);
690 }
691 
692 /*
693 ** Destructor for a CsvCursor.
694 */
csvtabClose(sqlite3_vtab_cursor * cur)695 static int csvtabClose(sqlite3_vtab_cursor *cur){
696   CsvCursor *pCur = (CsvCursor*)cur;
697   csvtabCursorRowReset(pCur);
698   csv_reader_reset(&pCur->rdr);
699   sqlite3_free(cur);
700   return SQLITE_OK;
701 }
702 
703 /*
704 ** Constructor for a new CsvTable cursor object.
705 */
csvtabOpen(sqlite3_vtab * p,sqlite3_vtab_cursor ** ppCursor)706 static int csvtabOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){
707   CsvTable *pTab = (CsvTable*)p;
708   CsvCursor *pCur;
709   size_t nByte;
710   nByte = sizeof(*pCur) + (sizeof(char*)+sizeof(int))*pTab->nCol;
711   pCur = sqlite3_malloc64( nByte );
712   if( pCur==0 ) return SQLITE_NOMEM;
713   memset(pCur, 0, nByte);
714   pCur->azVal = (char**)&pCur[1];
715   pCur->aLen = (int*)&pCur->azVal[pTab->nCol];
716   *ppCursor = &pCur->base;
717   if( csv_reader_open(&pCur->rdr, pTab->zFilename, pTab->zData) ){
718     csv_xfer_error(pTab, &pCur->rdr);
719     return SQLITE_ERROR;
720   }
721   return SQLITE_OK;
722 }
723 
724 
725 /*
726 ** Advance a CsvCursor to its next row of input.
727 ** Set the EOF marker if we reach the end of input.
728 */
csvtabNext(sqlite3_vtab_cursor * cur)729 static int csvtabNext(sqlite3_vtab_cursor *cur){
730   CsvCursor *pCur = (CsvCursor*)cur;
731   CsvTable *pTab = (CsvTable*)cur->pVtab;
732   int i = 0;
733   char *z;
734   do{
735     z = csv_read_one_field(&pCur->rdr);
736     if( z==0 ){
737       break;
738     }
739     if( i<pTab->nCol ){
740       if( pCur->aLen[i] < pCur->rdr.n+1 ){
741         char *zNew = sqlite3_realloc64(pCur->azVal[i], pCur->rdr.n+1);
742         if( zNew==0 ){
743           csv_errmsg(&pCur->rdr, "out of memory");
744           csv_xfer_error(pTab, &pCur->rdr);
745           break;
746         }
747         pCur->azVal[i] = zNew;
748         pCur->aLen[i] = pCur->rdr.n+1;
749       }
750       memcpy(pCur->azVal[i], z, pCur->rdr.n+1);
751       i++;
752     }
753   }while( pCur->rdr.cTerm==',' );
754   if( z==0 && i==0 ){
755     pCur->iRowid = -1;
756   }else{
757     pCur->iRowid++;
758     while( i<pTab->nCol ){
759       sqlite3_free(pCur->azVal[i]);
760       pCur->azVal[i] = 0;
761       pCur->aLen[i] = 0;
762       i++;
763     }
764   }
765   return SQLITE_OK;
766 }
767 
768 /*
769 ** Return values of columns for the row at which the CsvCursor
770 ** is currently pointing.
771 */
csvtabColumn(sqlite3_vtab_cursor * cur,sqlite3_context * ctx,int i)772 static int csvtabColumn(
773   sqlite3_vtab_cursor *cur,   /* The cursor */
774   sqlite3_context *ctx,       /* First argument to sqlite3_result_...() */
775   int i                       /* Which column to return */
776 ){
777   CsvCursor *pCur = (CsvCursor*)cur;
778   CsvTable *pTab = (CsvTable*)cur->pVtab;
779   if( i>=0 && i<pTab->nCol && pCur->azVal[i]!=0 ){
780     sqlite3_result_text(ctx, pCur->azVal[i], -1, SQLITE_TRANSIENT);
781   }
782   return SQLITE_OK;
783 }
784 
785 /*
786 ** Return the rowid for the current row.
787 */
csvtabRowid(sqlite3_vtab_cursor * cur,sqlite_int64 * pRowid)788 static int csvtabRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
789   CsvCursor *pCur = (CsvCursor*)cur;
790   *pRowid = pCur->iRowid;
791   return SQLITE_OK;
792 }
793 
794 /*
795 ** Return TRUE if the cursor has been moved off of the last
796 ** row of output.
797 */
csvtabEof(sqlite3_vtab_cursor * cur)798 static int csvtabEof(sqlite3_vtab_cursor *cur){
799   CsvCursor *pCur = (CsvCursor*)cur;
800   return pCur->iRowid<0;
801 }
802 
803 /*
804 ** Only a full table scan is supported.  So xFilter simply rewinds to
805 ** the beginning.
806 */
csvtabFilter(sqlite3_vtab_cursor * pVtabCursor,int idxNum,const char * idxStr,int argc,sqlite3_value ** argv)807 static int csvtabFilter(
808   sqlite3_vtab_cursor *pVtabCursor,
809   int idxNum, const char *idxStr,
810   int argc, sqlite3_value **argv
811 ){
812   CsvCursor *pCur = (CsvCursor*)pVtabCursor;
813   CsvTable *pTab = (CsvTable*)pVtabCursor->pVtab;
814   pCur->iRowid = 0;
815 
816   /* Ensure the field buffer is always allocated. Otherwise, if the
817   ** first field is zero bytes in size, this may be mistaken for an OOM
818   ** error in csvtabNext(). */
819   if( csv_append(&pCur->rdr, 0) ) return SQLITE_NOMEM;
820 
821   if( pCur->rdr.in==0 ){
822     assert( pCur->rdr.zIn==pTab->zData );
823     assert( pTab->iStart>=0 );
824     assert( (size_t)pTab->iStart<=pCur->rdr.nIn );
825     pCur->rdr.iIn = pTab->iStart;
826   }else{
827     fseek(pCur->rdr.in, pTab->iStart, SEEK_SET);
828     pCur->rdr.iIn = 0;
829     pCur->rdr.nIn = 0;
830   }
831   return csvtabNext(pVtabCursor);
832 }
833 
834 /*
835 ** Only a forward full table scan is supported.  xBestIndex is mostly
836 ** a no-op.  If CSVTEST_FIDX is set, then the presence of equality
837 ** constraints lowers the estimated cost, which is fiction, but is useful
838 ** for testing certain kinds of virtual table behavior.
839 */
csvtabBestIndex(sqlite3_vtab * tab,sqlite3_index_info * pIdxInfo)840 static int csvtabBestIndex(
841   sqlite3_vtab *tab,
842   sqlite3_index_info *pIdxInfo
843 ){
844   pIdxInfo->estimatedCost = 1000000;
845 #ifdef SQLITE_TEST
846   if( (((CsvTable*)tab)->tstFlags & CSVTEST_FIDX)!=0 ){
847     /* The usual (and sensible) case is to always do a full table scan.
848     ** The code in this branch only runs when testflags=1.  This code
849     ** generates an artifical and unrealistic plan which is useful
850     ** for testing virtual table logic but is not helpful to real applications.
851     **
852     ** Any ==, LIKE, or GLOB constraint is marked as usable by the virtual
853     ** table (even though it is not) and the cost of running the virtual table
854     ** is reduced from 1 million to just 10.  The constraints are *not* marked
855     ** as omittable, however, so the query planner should still generate a
856     ** plan that gives a correct answer, even if they plan is not optimal.
857     */
858     int i;
859     int nConst = 0;
860     for(i=0; i<pIdxInfo->nConstraint; i++){
861       unsigned char op;
862       if( pIdxInfo->aConstraint[i].usable==0 ) continue;
863       op = pIdxInfo->aConstraint[i].op;
864       if( op==SQLITE_INDEX_CONSTRAINT_EQ
865        || op==SQLITE_INDEX_CONSTRAINT_LIKE
866        || op==SQLITE_INDEX_CONSTRAINT_GLOB
867       ){
868         pIdxInfo->estimatedCost = 10;
869         pIdxInfo->aConstraintUsage[nConst].argvIndex = nConst+1;
870         nConst++;
871       }
872     }
873   }
874 #endif
875   return SQLITE_OK;
876 }
877 
878 
879 static sqlite3_module CsvModule = {
880   0,                       /* iVersion */
881   csvtabCreate,            /* xCreate */
882   csvtabConnect,           /* xConnect */
883   csvtabBestIndex,         /* xBestIndex */
884   csvtabDisconnect,        /* xDisconnect */
885   csvtabDisconnect,        /* xDestroy */
886   csvtabOpen,              /* xOpen - open a cursor */
887   csvtabClose,             /* xClose - close a cursor */
888   csvtabFilter,            /* xFilter - configure scan constraints */
889   csvtabNext,              /* xNext - advance a cursor */
890   csvtabEof,               /* xEof - check for end of scan */
891   csvtabColumn,            /* xColumn - read data */
892   csvtabRowid,             /* xRowid - read data */
893   0,                       /* xUpdate */
894   0,                       /* xBegin */
895   0,                       /* xSync */
896   0,                       /* xCommit */
897   0,                       /* xRollback */
898   0,                       /* xFindMethod */
899   0,                       /* xRename */
900 };
901 
902 #ifdef SQLITE_TEST
903 /*
904 ** For virtual table testing, make a version of the CSV virtual table
905 ** available that has an xUpdate function.  But the xUpdate always returns
906 ** SQLITE_READONLY since the CSV file is not really writable.
907 */
csvtabUpdate(sqlite3_vtab * p,int n,sqlite3_value ** v,sqlite3_int64 * x)908 static int csvtabUpdate(sqlite3_vtab *p,int n,sqlite3_value**v,sqlite3_int64*x){
909   return SQLITE_READONLY;
910 }
911 static sqlite3_module CsvModuleFauxWrite = {
912   0,                       /* iVersion */
913   csvtabCreate,            /* xCreate */
914   csvtabConnect,           /* xConnect */
915   csvtabBestIndex,         /* xBestIndex */
916   csvtabDisconnect,        /* xDisconnect */
917   csvtabDisconnect,        /* xDestroy */
918   csvtabOpen,              /* xOpen - open a cursor */
919   csvtabClose,             /* xClose - close a cursor */
920   csvtabFilter,            /* xFilter - configure scan constraints */
921   csvtabNext,              /* xNext - advance a cursor */
922   csvtabEof,               /* xEof - check for end of scan */
923   csvtabColumn,            /* xColumn - read data */
924   csvtabRowid,             /* xRowid - read data */
925   csvtabUpdate,            /* xUpdate */
926   0,                       /* xBegin */
927   0,                       /* xSync */
928   0,                       /* xCommit */
929   0,                       /* xRollback */
930   0,                       /* xFindMethod */
931   0,                       /* xRename */
932 };
933 #endif /* SQLITE_TEST */
934 
935 #endif /* !defined(SQLITE_OMIT_VIRTUALTABLE) */
936 
937 
938 #ifdef _WIN32
939 __declspec(dllexport)
940 #endif
941 /*
942 ** This routine is called when the extension is loaded.  The new
943 ** CSV virtual table module is registered with the calling database
944 ** connection.
945 */
sqlite3_csv_init(sqlite3 * db,char ** pzErrMsg,const sqlite3_api_routines * pApi)946 int sqlite3_csv_init(
947   sqlite3 *db,
948   char **pzErrMsg,
949   const sqlite3_api_routines *pApi
950 ){
951 #ifndef SQLITE_OMIT_VIRTUALTABLE
952   int rc;
953   SQLITE_EXTENSION_INIT2(pApi);
954   rc = sqlite3_create_module(db, "csv", &CsvModule, 0);
955 #ifdef SQLITE_TEST
956   if( rc==SQLITE_OK ){
957     rc = sqlite3_create_module(db, "csv_wr", &CsvModuleFauxWrite, 0);
958   }
959 #endif
960   return rc;
961 #else
962   return SQLITE_OK;
963 #endif
964 }
965