xref: /sqlite-3.40.0/ext/misc/csv.c (revision abfd272b)
1 /*
2 ** 2016-05-28
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 **
13 ** This file contains the implementation of an SQLite virtual table for
14 ** reading CSV files.
15 **
16 ** Usage:
17 **
18 **    .load ./csv
19 **    CREATE VIRTUAL TABLE temp.csv USING csv(filename=FILENAME);
20 **    SELECT * FROM csv;
21 **
22 ** The columns are named "c1", "c2", "c3", ... by default.  But the
23 ** application can define its own CREATE TABLE statement as an additional
24 ** parameter.  For example:
25 **
26 **    CREATE VIRTUAL TABLE temp.csv2 USING csv(
27 **       filename = "../http.log",
28 **       schema = "CREATE TABLE x(date,ipaddr,url,referrer,userAgent)"
29 **    );
30 */
31 #include <sqlite3ext.h>
32 SQLITE_EXTENSION_INIT1
33 #include <string.h>
34 #include <stdlib.h>
35 #include <assert.h>
36 #include <stdarg.h>
37 #include <ctype.h>
38 #include <stdio.h>
39 
40 /*
41 ** A macro to hint to the compiler that a function should not be
42 ** inlined.
43 */
44 #if defined(__GNUC__)
45 #  define CSV_NOINLINE  __attribute__((noinline))
46 #elif defined(_MSC_VER) && _MSC_VER>=1310
47 #  define CSV_NOINLINE  __declspec(noinline)
48 #else
49 #  define CSV_NOINLINE
50 #endif
51 
52 
53 /* Max size of the error message in a CsvReader */
54 #define CSV_MXERR 200
55 
56 /* A context object used when read a CSV file. */
57 typedef struct CsvReader CsvReader;
58 struct CsvReader {
59   FILE *in;              /* Read the CSV text from this input stream */
60   char *z;               /* Accumulated text for a field */
61   int n;                 /* Number of bytes in z */
62   int nAlloc;            /* Space allocated for z[] */
63   int nLine;             /* Current line number */
64   int cTerm;             /* Character that terminated the most recent field */
65   char zErr[CSV_MXERR];  /* Error message */
66 };
67 
68 /* Initialize a CsvReader object */
69 static void csv_reader_init(CsvReader *p){
70   memset(p, 0, sizeof(*p));
71 }
72 
73 /* Close and reset a CsvReader object */
74 static void csv_reader_reset(CsvReader *p){
75   if( p->in ) fclose(p->in);
76   sqlite3_free(p->z);
77   csv_reader_init(p);
78 }
79 
80 /* Report an error on a CsvReader */
81 static void csv_errmsg(CsvReader *p, const char *zFormat, ...){
82   va_list ap;
83   va_start(ap, zFormat);
84   sqlite3_vsnprintf(CSV_MXERR, p->zErr, zFormat, ap);
85   va_end(ap);
86 }
87 
88 /* Open the file associated with a CsvReader
89 ** Return the number of errors.
90 */
91 static int csv_reader_open(CsvReader *p, const char *zFilename){
92   p->in = fopen(zFilename, "rb");
93   if( p->in==0 ){
94     csv_errmsg(p, "cannot open '%s' for reading", zFilename);
95     return 1;
96   }
97   return 0;
98 }
99 
100 /* Increase the size of p->z and append character c to the end.
101 ** Return 0 on success and non-zero if there is an OOM error */
102 static CSV_NOINLINE int csv_resize_and_append(CsvReader *p, char c){
103   char *zNew;
104   int nNew = p->nAlloc*2 + 100;
105   zNew = sqlite3_realloc64(p->z, nNew);
106   if( zNew ){
107     p->z = zNew;
108     p->nAlloc = nNew;
109     p->z[p->n++] = c;
110     return 0;
111   }else{
112     csv_errmsg(p, "out of memory");
113     return 1;
114   }
115 }
116 
117 /* Append a single character to the CsvReader.z[] array.
118 ** Return 0 on success and non-zero if there is an OOM error */
119 static int csv_append(CsvReader *p, char c){
120   if( p->n>=p->nAlloc-1 ) return csv_resize_and_append(p, c);
121   p->z[p->n++] = c;
122   return 0;
123 }
124 
125 /* Read a single field of CSV text.  Compatible with rfc4180 and extended
126 ** with the option of having a separator other than ",".
127 **
128 **   +  Input comes from p->in.
129 **   +  Store results in p->z of length p->n.  Space to hold p->z comes
130 **      from sqlite3_malloc64().
131 **   +  Keep track of the line number in p->nLine.
132 **   +  Store the character that terminates the field in p->cTerm.  Store
133 **      EOF on end-of-file.
134 **
135 ** Return "" at EOF.  Return 0 on an OOM error.
136 */
137 static char *csv_read_one_field(CsvReader *p){
138   int c;
139   p->n = 0;
140   c = fgetc(p->in);
141   if( c==EOF ){
142     p->cTerm = EOF;
143     return "";
144   }
145   if( c=='"' ){
146     int pc, ppc;
147     int startLine = p->nLine;
148     int cQuote = c;
149     pc = ppc = 0;
150     while( 1 ){
151       c = fgetc(p->in);
152       if( c=='\n' ) p->nLine++;
153       if( c==cQuote ){
154         if( pc==cQuote ){
155           pc = 0;
156           continue;
157         }
158       }
159       if( (c==',' && pc==cQuote)
160        || (c=='\n' && pc==cQuote)
161        || (c=='\n' && pc=='\r' && ppc==cQuote)
162        || (c==EOF && pc==cQuote)
163       ){
164         do{ p->n--; }while( p->z[p->n]!=cQuote );
165         p->cTerm = c;
166         break;
167       }
168       if( pc==cQuote && c!='\r' ){
169         csv_errmsg(p, "line %d: unescaped %c character", p->nLine, cQuote);
170         break;
171       }
172       if( c==EOF ){
173         csv_errmsg(p, "line %d: unterminated %c-quoted field\n",
174                    startLine, cQuote);
175         p->cTerm = c;
176         break;
177       }
178       if( csv_append(p, (char)c) ) return 0;
179       ppc = pc;
180       pc = c;
181     }
182   }else{
183     while( c!=EOF && c!=',' && c!='\n' ){
184       if( csv_append(p, (char)c) ) return 0;
185       c = fgetc(p->in);
186     }
187     if( c=='\n' ){
188       p->nLine++;
189       if( p->n>0 && p->z[p->n-1]=='\r' ) p->n--;
190     }
191     p->cTerm = c;
192   }
193   if( p->z ) p->z[p->n] = 0;
194   return p->z;
195 }
196 
197 
198 /* Forward references to the various virtual table methods implemented
199 ** in this file. */
200 static int csvtabCreate(sqlite3*, void*, int, const char*const*,
201                            sqlite3_vtab**,char**);
202 static int csvtabConnect(sqlite3*, void*, int, const char*const*,
203                            sqlite3_vtab**,char**);
204 static int csvtabBestIndex(sqlite3_vtab*,sqlite3_index_info*);
205 static int csvtabDisconnect(sqlite3_vtab*);
206 static int csvtabOpen(sqlite3_vtab*, sqlite3_vtab_cursor**);
207 static int csvtabClose(sqlite3_vtab_cursor*);
208 static int csvtabFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr,
209                           int argc, sqlite3_value **argv);
210 static int csvtabNext(sqlite3_vtab_cursor*);
211 static int csvtabEof(sqlite3_vtab_cursor*);
212 static int csvtabColumn(sqlite3_vtab_cursor*,sqlite3_context*,int);
213 static int csvtabRowid(sqlite3_vtab_cursor*,sqlite3_int64*);
214 
215 /* An instance of the CSV virtual table */
216 typedef struct CsvTable {
217   sqlite3_vtab base;              /* Base class.  Must be first */
218   char *zFilename;                /* Name of the CSV file */
219   long iStart;                    /* Offset to start of data in zFilename */
220   int nCol;                       /* Number of columns in the CSV file */
221   unsigned int tstFlags;          /* Bit values used for testing */
222 } CsvTable;
223 
224 /* Allowed values for tstFlags */
225 #define CSVTEST_FIDX  0x0001      /* Pretend that constrained searchs cost less*/
226 
227 /* A cursor for the CSV virtual table */
228 typedef struct CsvCursor {
229   sqlite3_vtab_cursor base;       /* Base class.  Must be first */
230   CsvReader rdr;                  /* The CsvReader object */
231   char **azVal;                   /* Value of the current row */
232   sqlite3_int64 iRowid;           /* The current rowid.  Negative for EOF */
233 } CsvCursor;
234 
235 /* Transfer error message text from a reader into a CsvTable */
236 static void csv_xfer_error(CsvTable *pTab, CsvReader *pRdr){
237   sqlite3_free(pTab->base.zErrMsg);
238   pTab->base.zErrMsg = sqlite3_mprintf("%s", pRdr->zErr);
239 }
240 
241 /*
242 ** This method is the destructor fo a CsvTable object.
243 */
244 static int csvtabDisconnect(sqlite3_vtab *pVtab){
245   CsvTable *p = (CsvTable*)pVtab;
246   sqlite3_free(p->zFilename);
247   sqlite3_free(p);
248   return SQLITE_OK;
249 }
250 
251 /* Skip leading whitespace.  Return a pointer to the first non-whitespace
252 ** character, or to the zero terminator if the string has only whitespace */
253 static const char *csv_skip_whitespace(const char *z){
254   while( isspace((unsigned char)z[0]) ) z++;
255   return z;
256 }
257 
258 /* Remove trailing whitespace from the end of string z[] */
259 static void csv_trim_whitespace(char *z){
260   size_t n = strlen(z);
261   while( n>0 && isspace((unsigned char)z[n]) ) n--;
262   z[n] = 0;
263 }
264 
265 /* Dequote the string */
266 static void csv_dequote(char *z){
267   int i, j;
268   char cQuote = z[0];
269   size_t n;
270 
271   if( cQuote!='\'' && cQuote!='"' ) return;
272   n = strlen(z);
273   if( n<2 || z[n-1]!=z[0] ) return;
274   for(i=1, j=0; i<n-1; i++){
275     if( z[i]==cQuote && z[i+1]==cQuote ) i++;
276     z[j++] = z[i];
277   }
278   z[j] = 0;
279 }
280 
281 /* Check to see if the string is of the form:  "TAG = VALUE" with optional
282 ** whitespace before and around tokens.  If it is, return a pointer to the
283 ** first character of VALUE.  If it is not, return NULL.
284 */
285 static const char *csv_parameter(const char *zTag, int nTag, const char *z){
286   z = csv_skip_whitespace(z);
287   if( strncmp(zTag, z, nTag)!=0 ) return 0;
288   z = csv_skip_whitespace(z+nTag);
289   if( z[0]!='=' ) return 0;
290   return csv_skip_whitespace(z+1);
291 }
292 
293 /* Return 0 if the argument is false and 1 if it is true.  Return -1 if
294 ** we cannot really tell.
295 */
296 static int csv_boolean(const char *z){
297   if( sqlite3_stricmp("yes",z)==0
298    || sqlite3_stricmp("on",z)==0
299    || sqlite3_stricmp("true",z)==0
300    || (z[0]=='1' && z[0]==0)
301   ){
302     return 1;
303   }
304   if( sqlite3_stricmp("no",z)==0
305    || sqlite3_stricmp("off",z)==0
306    || sqlite3_stricmp("false",z)==0
307    || (z[0]=='0' && z[1]==0)
308   ){
309     return 0;
310   }
311   return -1;
312 }
313 
314 
315 /*
316 ** Parameters:
317 **    filename=FILENAME          Required
318 **    schema=SCHEMA              Optional
319 **    header=YES|NO              First row of CSV defines the names of
320 **                               columns if "yes".  Default "no".
321 **    testflags=N                Bitmask of test flags.  Optional
322 **
323 ** If header=no and not columns are listed, then the columns are named
324 ** "c0", "c1", "c2", and so forth.
325 */
326 static int csvtabConnect(
327   sqlite3 *db,
328   void *pAux,
329   int argc, const char *const*argv,
330   sqlite3_vtab **ppVtab,
331   char **pzErr
332 ){
333   CsvTable *pNew = 0;
334   int bHeader = -1;
335   int rc = SQLITE_OK;
336   int i;
337   char *zFilename = 0;
338   char *zSchema = 0;
339   int tstFlags = 0;
340   CsvReader sRdr;
341 
342   memset(&sRdr, 0, sizeof(sRdr));
343   for(i=3; i<argc; i++){
344     const char *z = argv[i];
345     const char *zValue;
346     if( (zValue = csv_parameter("filename",8,z))!=0 ){
347       if( zFilename ){
348         csv_errmsg(&sRdr, "more than one 'filename' parameter");
349         goto csvtab_connect_error;
350       }
351       zFilename = sqlite3_mprintf("%s", zValue);
352       if( zFilename==0 ) goto csvtab_connect_oom;
353       csv_trim_whitespace(zFilename);
354       csv_dequote(zFilename);
355     }else
356     if( (zValue = csv_parameter("schema",6,z))!=0 ){
357       if( zSchema ){
358         csv_errmsg(&sRdr, "more than one 'schema' parameter");
359         goto csvtab_connect_error;
360       }
361       zSchema = sqlite3_mprintf("%s", zValue);
362       if( zSchema==0 ) goto csvtab_connect_oom;
363       csv_trim_whitespace(zSchema);
364       csv_dequote(zSchema);
365     }else
366     if( (zValue = csv_parameter("header",6,z))!=0 ){
367       int x;
368       if( bHeader>=0 ){
369         csv_errmsg(&sRdr, "more than one 'header' parameter");
370         goto csvtab_connect_error;
371       }
372       x = csv_boolean(zValue);
373       if( x==1 ){
374         bHeader = 1;
375       }else if( x==0 ){
376         bHeader = 0;
377       }else{
378         csv_errmsg(&sRdr, "unrecognized argument to 'header': %s", zValue);
379         goto csvtab_connect_error;
380       }
381     }else
382     if( (zValue = csv_parameter("testflags",9,z))!=0 ){
383       tstFlags = (unsigned int)atoi(zValue);
384     }else
385     {
386       csv_errmsg(&sRdr, "unrecognized parameter '%s'", z);
387       goto csvtab_connect_error;
388     }
389   }
390   if( zFilename==0 ){
391     csv_errmsg(&sRdr, "missing 'filename' parameter");
392     goto csvtab_connect_error;
393   }
394   if( csv_reader_open(&sRdr, zFilename) ){
395     goto csvtab_connect_error;
396   }
397   pNew = sqlite3_malloc( sizeof(*pNew) );
398   *ppVtab = (sqlite3_vtab*)pNew;
399   if( pNew==0 ) goto csvtab_connect_oom;
400   memset(pNew, 0, sizeof(*pNew));
401   do{
402     const char *z = csv_read_one_field(&sRdr);
403     if( z==0 ) goto csvtab_connect_oom;
404     pNew->nCol++;
405   }while( sRdr.cTerm==',' );
406   pNew->zFilename = zFilename;
407   pNew->tstFlags = tstFlags;
408   zFilename = 0;
409   pNew->iStart = bHeader==1 ? ftell(sRdr.in) : 0;
410   csv_reader_reset(&sRdr);
411   if( zSchema==0 ){
412     char *zSep = "";
413     zSchema = sqlite3_mprintf("CREATE TABLE x(");
414     if( zSchema==0 ) goto csvtab_connect_oom;
415     for(i=0; i<pNew->nCol; i++){
416       zSchema = sqlite3_mprintf("%z%sc%d TEXT",zSchema, zSep, i);
417       zSep = ",";
418     }
419     zSchema = sqlite3_mprintf("%z);", zSchema);
420   }
421   rc = sqlite3_declare_vtab(db, zSchema);
422   if( rc ) goto csvtab_connect_error;
423   sqlite3_free(zSchema);
424   return SQLITE_OK;
425 
426 csvtab_connect_oom:
427   rc = SQLITE_NOMEM;
428   csv_errmsg(&sRdr, "out of memory");
429 
430 csvtab_connect_error:
431   if( pNew ) csvtabDisconnect(&pNew->base);
432   sqlite3_free(zFilename);
433   sqlite3_free(zSchema);
434   if( sRdr.zErr[0] ){
435     sqlite3_free(*pzErr);
436     *pzErr = sqlite3_mprintf("%s", sRdr.zErr);
437   }
438   csv_reader_reset(&sRdr);
439   if( rc==SQLITE_OK ) rc = SQLITE_ERROR;
440   return rc;
441 }
442 
443 /*
444 ** Reset the current row content held by a CsvCursor.
445 */
446 static void csvtabCursorRowReset(CsvCursor *pCur){
447   CsvTable *pTab = (CsvTable*)pCur->base.pVtab;
448   int i;
449   for(i=0; i<pTab->nCol; i++){
450     sqlite3_free(pCur->azVal[i]);
451     pCur->azVal[i] = 0;
452   }
453 }
454 
455 /*
456 ** The xConnect and xCreate methods do the same thing, but they must be
457 ** different so that the virtual table is not an eponymous virtual table.
458 */
459 static int csvtabCreate(
460   sqlite3 *db,
461   void *pAux,
462   int argc, const char *const*argv,
463   sqlite3_vtab **ppVtab,
464   char **pzErr
465 ){
466  return csvtabConnect(db, pAux, argc, argv, ppVtab, pzErr);
467 }
468 
469 /*
470 ** Destructor for a CsvCursor.
471 */
472 static int csvtabClose(sqlite3_vtab_cursor *cur){
473   CsvCursor *pCur = (CsvCursor*)cur;
474   csvtabCursorRowReset(pCur);
475   csv_reader_reset(&pCur->rdr);
476   sqlite3_free(cur);
477   return SQLITE_OK;
478 }
479 
480 /*
481 ** Constructor for a new CsvTable cursor object.
482 */
483 static int csvtabOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){
484   CsvTable *pTab = (CsvTable*)p;
485   CsvCursor *pCur;
486   pCur = sqlite3_malloc( sizeof(*pCur) * sizeof(char*)*pTab->nCol );
487   if( pCur==0 ) return SQLITE_NOMEM;
488   memset(pCur, 0, sizeof(*pCur) + sizeof(char*)*pTab->nCol );
489   pCur->azVal = (char**)&pCur[1];
490   *ppCursor = &pCur->base;
491   if( csv_reader_open(&pCur->rdr, pTab->zFilename) ){
492     csv_xfer_error(pTab, &pCur->rdr);
493     return SQLITE_ERROR;
494   }
495   return SQLITE_OK;
496 }
497 
498 
499 /*
500 ** Advance a CsvCursor to its next row of input.
501 ** Set the EOF marker if we reach the end of input.
502 */
503 static int csvtabNext(sqlite3_vtab_cursor *cur){
504   CsvCursor *pCur = (CsvCursor*)cur;
505   CsvTable *pTab = (CsvTable*)cur->pVtab;
506   int i = 0;
507   char *z;
508   csvtabCursorRowReset(pCur);
509   do{
510     z = csv_read_one_field(&pCur->rdr);
511     if( z==0 ){
512       csv_xfer_error(pTab, &pCur->rdr);
513       break;
514     }
515     z = sqlite3_mprintf("%s", z);
516     if( z==0 ){
517       csv_errmsg(&pCur->rdr, "out of memory");
518       csv_xfer_error(pTab, &pCur->rdr);
519       break;
520     }
521     if( i<pTab->nCol ){
522       pCur->azVal[i++] = z;
523     }
524   }while( z!=0 && pCur->rdr.cTerm==',' );
525   if( z==0 || pCur->rdr.cTerm==EOF ){
526     pCur->iRowid = -1;
527   }else{
528     pCur->iRowid++;
529   }
530   return SQLITE_OK;
531 }
532 
533 /*
534 ** Return values of columns for the row at which the CsvCursor
535 ** is currently pointing.
536 */
537 static int csvtabColumn(
538   sqlite3_vtab_cursor *cur,   /* The cursor */
539   sqlite3_context *ctx,       /* First argument to sqlite3_result_...() */
540   int i                       /* Which column to return */
541 ){
542   CsvCursor *pCur = (CsvCursor*)cur;
543   CsvTable *pTab = (CsvTable*)cur->pVtab;
544   if( i>=0 && i<pTab->nCol && pCur->azVal[i]!=0 ){
545     sqlite3_result_text(ctx, pCur->azVal[i], -1, SQLITE_STATIC);
546   }
547   return SQLITE_OK;
548 }
549 
550 /*
551 ** Return the rowid for the current row.
552 */
553 static int csvtabRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
554   CsvCursor *pCur = (CsvCursor*)cur;
555   *pRowid = pCur->iRowid;
556   return SQLITE_OK;
557 }
558 
559 /*
560 ** Return TRUE if the cursor has been moved off of the last
561 ** row of output.
562 */
563 static int csvtabEof(sqlite3_vtab_cursor *cur){
564   CsvCursor *pCur = (CsvCursor*)cur;
565   return pCur->iRowid<0;
566 }
567 
568 /*
569 ** Only a full table scan is supported.  So xFilter simply rewinds to
570 ** the beginning.
571 */
572 static int csvtabFilter(
573   sqlite3_vtab_cursor *pVtabCursor,
574   int idxNum, const char *idxStr,
575   int argc, sqlite3_value **argv
576 ){
577   CsvCursor *pCur = (CsvCursor*)pVtabCursor;
578   CsvTable *pTab = (CsvTable*)pVtabCursor->pVtab;
579   pCur->iRowid = 0;
580   fseek(pCur->rdr.in, pTab->iStart, SEEK_SET);
581   return csvtabNext(pVtabCursor);
582 }
583 
584 /*
585 ** Only a forwards full table scan is supported.  xBestIndex is mostly
586 ** a no-op.  If CSVTEST_FIDX is set, then the presence of equality
587 ** constraints lowers the estimated cost, which is fiction, but is useful
588 ** for testing certain kinds of virtual table behavior.
589 */
590 static int csvtabBestIndex(
591   sqlite3_vtab *tab,
592   sqlite3_index_info *pIdxInfo
593 ){
594   CsvTable *pTab = (CsvTable*)tab;
595   int i;
596   pIdxInfo->estimatedCost = 1000000;
597   if( (pTab->tstFlags & CSVTEST_FIDX)==0 ){
598     return SQLITE_OK;
599   }
600   for(i=0; i<pIdxInfo->nConstraint; i++){
601     if( pIdxInfo->aConstraint[i].usable==0 ) continue;
602     if( pIdxInfo->aConstraint[i].op==SQLITE_INDEX_CONSTRAINT_EQ ){
603       pIdxInfo->estimatedCost = 10;
604       break;
605     }
606   }
607   return SQLITE_OK;
608 }
609 
610 
611 static sqlite3_module CsvModule = {
612   0,                       /* iVersion */
613   csvtabCreate,            /* xCreate */
614   csvtabConnect,           /* xConnect */
615   csvtabBestIndex,         /* xBestIndex */
616   csvtabDisconnect,        /* xDisconnect */
617   csvtabDisconnect,        /* xDestroy */
618   csvtabOpen,              /* xOpen - open a cursor */
619   csvtabClose,             /* xClose - close a cursor */
620   csvtabFilter,            /* xFilter - configure scan constraints */
621   csvtabNext,              /* xNext - advance a cursor */
622   csvtabEof,               /* xEof - check for end of scan */
623   csvtabColumn,            /* xColumn - read data */
624   csvtabRowid,             /* xRowid - read data */
625   0,                       /* xUpdate */
626   0,                       /* xBegin */
627   0,                       /* xSync */
628   0,                       /* xCommit */
629   0,                       /* xRollback */
630   0,                       /* xFindMethod */
631   0,                       /* xRename */
632 };
633 
634 #ifdef _WIN32
635 __declspec(dllexport)
636 #endif
637 /*
638 ** This routine is called when the extension is loaded.  The new
639 ** CSV virtual table module is registered with the calling database
640 ** connection.
641 */
642 int sqlite3_csv_init(
643   sqlite3 *db,
644   char **pzErrMsg,
645   const sqlite3_api_routines *pApi
646 ){
647   SQLITE_EXTENSION_INIT2(pApi);
648   return sqlite3_create_module(db, "csv", &CsvModule, 0);
649 }
650