1 /* 2 ** 2016-05-28 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ****************************************************************************** 12 ** 13 ** This file contains the implementation of an SQLite virtual table for 14 ** reading CSV files. 15 ** 16 ** Usage: 17 ** 18 ** .load ./csv 19 ** CREATE VIRTUAL TABLE temp.csv USING csv(filename=FILENAME); 20 ** SELECT * FROM csv; 21 ** 22 ** The columns are named "c1", "c2", "c3", ... by default. But the 23 ** application can define its own CREATE TABLE statement as an additional 24 ** parameter. For example: 25 ** 26 ** CREATE VIRTUAL TABLE temp.csv2 USING csv( 27 ** filename = "../http.log", 28 ** schema = "CREATE TABLE x(date,ipaddr,url,referrer,userAgent)" 29 ** ); 30 */ 31 #include <sqlite3ext.h> 32 SQLITE_EXTENSION_INIT1 33 #include <string.h> 34 #include <stdlib.h> 35 #include <assert.h> 36 #include <stdarg.h> 37 #include <ctype.h> 38 #include <stdio.h> 39 40 /* 41 ** A macro to hint to the compiler that a function should not be 42 ** inlined. 43 */ 44 #if defined(__GNUC__) 45 # define CSV_NOINLINE __attribute__((noinline)) 46 #elif defined(_MSC_VER) && _MSC_VER>=1310 47 # define CSV_NOINLINE __declspec(noinline) 48 #else 49 # define CSV_NOINLINE 50 #endif 51 52 53 /* Max size of the error message in a CsvReader */ 54 #define CSV_MXERR 200 55 56 /* A context object used when read a CSV file. */ 57 typedef struct CsvReader CsvReader; 58 struct CsvReader { 59 FILE *in; /* Read the CSV text from this input stream */ 60 char *z; /* Accumulated text for a field */ 61 int n; /* Number of bytes in z */ 62 int nAlloc; /* Space allocated for z[] */ 63 int nLine; /* Current line number */ 64 int cTerm; /* Character that terminated the most recent field */ 65 char zErr[CSV_MXERR]; /* Error message */ 66 }; 67 68 /* Initialize a CsvReader object */ 69 static void csv_reader_init(CsvReader *p){ 70 memset(p, 0, sizeof(*p)); 71 } 72 73 /* Close and reset a CsvReader object */ 74 static void csv_reader_reset(CsvReader *p){ 75 if( p->in ) fclose(p->in); 76 sqlite3_free(p->z); 77 csv_reader_init(p); 78 } 79 80 /* Report an error on a CsvReader */ 81 static void csv_errmsg(CsvReader *p, const char *zFormat, ...){ 82 va_list ap; 83 va_start(ap, zFormat); 84 sqlite3_vsnprintf(CSV_MXERR, p->zErr, zFormat, ap); 85 va_end(ap); 86 } 87 88 /* Open the file associated with a CsvReader 89 ** Return the number of errors. 90 */ 91 static int csv_reader_open(CsvReader *p, const char *zFilename){ 92 p->in = fopen(zFilename, "rb"); 93 if( p->in==0 ){ 94 csv_errmsg(p, "cannot open '%s' for reading", zFilename); 95 return 1; 96 } 97 return 0; 98 } 99 100 /* Increase the size of p->z and append character c to the end. 101 ** Return 0 on success and non-zero if there is an OOM error */ 102 static CSV_NOINLINE int csv_resize_and_append(CsvReader *p, char c){ 103 char *zNew; 104 int nNew = p->nAlloc*2 + 100; 105 zNew = sqlite3_realloc64(p->z, nNew); 106 if( zNew ){ 107 p->z = zNew; 108 p->nAlloc = nNew; 109 p->z[p->n++] = c; 110 return 0; 111 }else{ 112 csv_errmsg(p, "out of memory"); 113 return 1; 114 } 115 } 116 117 /* Append a single character to the CsvReader.z[] array. 118 ** Return 0 on success and non-zero if there is an OOM error */ 119 static int csv_append(CsvReader *p, char c){ 120 if( p->n>=p->nAlloc-1 ) return csv_resize_and_append(p, c); 121 p->z[p->n++] = c; 122 return 0; 123 } 124 125 /* Read a single field of CSV text. Compatible with rfc4180 and extended 126 ** with the option of having a separator other than ",". 127 ** 128 ** + Input comes from p->in. 129 ** + Store results in p->z of length p->n. Space to hold p->z comes 130 ** from sqlite3_malloc64(). 131 ** + Keep track of the line number in p->nLine. 132 ** + Store the character that terminates the field in p->cTerm. Store 133 ** EOF on end-of-file. 134 ** 135 ** Return "" at EOF. Return 0 on an OOM error. 136 */ 137 static char *csv_read_one_field(CsvReader *p){ 138 int c; 139 p->n = 0; 140 c = fgetc(p->in); 141 if( c==EOF ){ 142 p->cTerm = EOF; 143 return ""; 144 } 145 if( c=='"' ){ 146 int pc, ppc; 147 int startLine = p->nLine; 148 int cQuote = c; 149 pc = ppc = 0; 150 while( 1 ){ 151 c = fgetc(p->in); 152 if( c=='\n' ) p->nLine++; 153 if( c==cQuote ){ 154 if( pc==cQuote ){ 155 pc = 0; 156 continue; 157 } 158 } 159 if( (c==',' && pc==cQuote) 160 || (c=='\n' && pc==cQuote) 161 || (c=='\n' && pc=='\r' && ppc==cQuote) 162 || (c==EOF && pc==cQuote) 163 ){ 164 do{ p->n--; }while( p->z[p->n]!=cQuote ); 165 p->cTerm = c; 166 break; 167 } 168 if( pc==cQuote && c!='\r' ){ 169 csv_errmsg(p, "line %d: unescaped %c character", p->nLine, cQuote); 170 break; 171 } 172 if( c==EOF ){ 173 csv_errmsg(p, "line %d: unterminated %c-quoted field\n", 174 startLine, cQuote); 175 p->cTerm = c; 176 break; 177 } 178 if( csv_append(p, (char)c) ) return 0; 179 ppc = pc; 180 pc = c; 181 } 182 }else{ 183 while( c!=EOF && c!=',' && c!='\n' ){ 184 if( csv_append(p, (char)c) ) return 0; 185 c = fgetc(p->in); 186 } 187 if( c=='\n' ){ 188 p->nLine++; 189 if( p->n>0 && p->z[p->n-1]=='\r' ) p->n--; 190 } 191 p->cTerm = c; 192 } 193 if( p->z ) p->z[p->n] = 0; 194 return p->z; 195 } 196 197 198 /* Forward references to the various virtual table methods implemented 199 ** in this file. */ 200 static int csvtabCreate(sqlite3*, void*, int, const char*const*, 201 sqlite3_vtab**,char**); 202 static int csvtabConnect(sqlite3*, void*, int, const char*const*, 203 sqlite3_vtab**,char**); 204 static int csvtabBestIndex(sqlite3_vtab*,sqlite3_index_info*); 205 static int csvtabDisconnect(sqlite3_vtab*); 206 static int csvtabOpen(sqlite3_vtab*, sqlite3_vtab_cursor**); 207 static int csvtabClose(sqlite3_vtab_cursor*); 208 static int csvtabFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr, 209 int argc, sqlite3_value **argv); 210 static int csvtabNext(sqlite3_vtab_cursor*); 211 static int csvtabEof(sqlite3_vtab_cursor*); 212 static int csvtabColumn(sqlite3_vtab_cursor*,sqlite3_context*,int); 213 static int csvtabRowid(sqlite3_vtab_cursor*,sqlite3_int64*); 214 215 /* An instance of the CSV virtual table */ 216 typedef struct CsvTable { 217 sqlite3_vtab base; /* Base class. Must be first */ 218 char *zFilename; /* Name of the CSV file */ 219 long iStart; /* Offset to start of data in zFilename */ 220 int nCol; /* Number of columns in the CSV file */ 221 unsigned int tstFlags; /* Bit values used for testing */ 222 } CsvTable; 223 224 /* Allowed values for tstFlags */ 225 #define CSVTEST_FIDX 0x0001 /* Pretend that constrained searchs cost less*/ 226 227 /* A cursor for the CSV virtual table */ 228 typedef struct CsvCursor { 229 sqlite3_vtab_cursor base; /* Base class. Must be first */ 230 CsvReader rdr; /* The CsvReader object */ 231 char **azVal; /* Value of the current row */ 232 sqlite3_int64 iRowid; /* The current rowid. Negative for EOF */ 233 } CsvCursor; 234 235 /* Transfer error message text from a reader into a CsvTable */ 236 static void csv_xfer_error(CsvTable *pTab, CsvReader *pRdr){ 237 sqlite3_free(pTab->base.zErrMsg); 238 pTab->base.zErrMsg = sqlite3_mprintf("%s", pRdr->zErr); 239 } 240 241 /* 242 ** This method is the destructor fo a CsvTable object. 243 */ 244 static int csvtabDisconnect(sqlite3_vtab *pVtab){ 245 CsvTable *p = (CsvTable*)pVtab; 246 sqlite3_free(p->zFilename); 247 sqlite3_free(p); 248 return SQLITE_OK; 249 } 250 251 /* Skip leading whitespace. Return a pointer to the first non-whitespace 252 ** character, or to the zero terminator if the string has only whitespace */ 253 static const char *csv_skip_whitespace(const char *z){ 254 while( isspace((unsigned char)z[0]) ) z++; 255 return z; 256 } 257 258 /* Remove trailing whitespace from the end of string z[] */ 259 static void csv_trim_whitespace(char *z){ 260 size_t n = strlen(z); 261 while( n>0 && isspace((unsigned char)z[n]) ) n--; 262 z[n] = 0; 263 } 264 265 /* Dequote the string */ 266 static void csv_dequote(char *z){ 267 int i, j; 268 char cQuote = z[0]; 269 size_t n; 270 271 if( cQuote!='\'' && cQuote!='"' ) return; 272 n = strlen(z); 273 if( n<2 || z[n-1]!=z[0] ) return; 274 for(i=1, j=0; i<n-1; i++){ 275 if( z[i]==cQuote && z[i+1]==cQuote ) i++; 276 z[j++] = z[i]; 277 } 278 z[j] = 0; 279 } 280 281 /* Check to see if the string is of the form: "TAG = VALUE" with optional 282 ** whitespace before and around tokens. If it is, return a pointer to the 283 ** first character of VALUE. If it is not, return NULL. 284 */ 285 static const char *csv_parameter(const char *zTag, int nTag, const char *z){ 286 z = csv_skip_whitespace(z); 287 if( strncmp(zTag, z, nTag)!=0 ) return 0; 288 z = csv_skip_whitespace(z+nTag); 289 if( z[0]!='=' ) return 0; 290 return csv_skip_whitespace(z+1); 291 } 292 293 /* Return 0 if the argument is false and 1 if it is true. Return -1 if 294 ** we cannot really tell. 295 */ 296 static int csv_boolean(const char *z){ 297 if( sqlite3_stricmp("yes",z)==0 298 || sqlite3_stricmp("on",z)==0 299 || sqlite3_stricmp("true",z)==0 300 || (z[0]=='1' && z[0]==0) 301 ){ 302 return 1; 303 } 304 if( sqlite3_stricmp("no",z)==0 305 || sqlite3_stricmp("off",z)==0 306 || sqlite3_stricmp("false",z)==0 307 || (z[0]=='0' && z[1]==0) 308 ){ 309 return 0; 310 } 311 return -1; 312 } 313 314 315 /* 316 ** Parameters: 317 ** filename=FILENAME Required 318 ** schema=SCHEMA Optional 319 ** header=YES|NO First row of CSV defines the names of 320 ** columns if "yes". Default "no". 321 ** testflags=N Bitmask of test flags. Optional 322 ** 323 ** If header=no and not columns are listed, then the columns are named 324 ** "c0", "c1", "c2", and so forth. 325 */ 326 static int csvtabConnect( 327 sqlite3 *db, 328 void *pAux, 329 int argc, const char *const*argv, 330 sqlite3_vtab **ppVtab, 331 char **pzErr 332 ){ 333 CsvTable *pNew = 0; 334 int bHeader = -1; 335 int rc = SQLITE_OK; 336 int i; 337 char *zFilename = 0; 338 char *zSchema = 0; 339 int tstFlags = 0; 340 CsvReader sRdr; 341 342 memset(&sRdr, 0, sizeof(sRdr)); 343 for(i=3; i<argc; i++){ 344 const char *z = argv[i]; 345 const char *zValue; 346 if( (zValue = csv_parameter("filename",8,z))!=0 ){ 347 if( zFilename ){ 348 csv_errmsg(&sRdr, "more than one 'filename' parameter"); 349 goto csvtab_connect_error; 350 } 351 zFilename = sqlite3_mprintf("%s", zValue); 352 if( zFilename==0 ) goto csvtab_connect_oom; 353 csv_trim_whitespace(zFilename); 354 csv_dequote(zFilename); 355 }else 356 if( (zValue = csv_parameter("schema",6,z))!=0 ){ 357 if( zSchema ){ 358 csv_errmsg(&sRdr, "more than one 'schema' parameter"); 359 goto csvtab_connect_error; 360 } 361 zSchema = sqlite3_mprintf("%s", zValue); 362 if( zSchema==0 ) goto csvtab_connect_oom; 363 csv_trim_whitespace(zSchema); 364 csv_dequote(zSchema); 365 }else 366 if( (zValue = csv_parameter("header",6,z))!=0 ){ 367 int x; 368 if( bHeader>=0 ){ 369 csv_errmsg(&sRdr, "more than one 'header' parameter"); 370 goto csvtab_connect_error; 371 } 372 x = csv_boolean(zValue); 373 if( x==1 ){ 374 bHeader = 1; 375 }else if( x==0 ){ 376 bHeader = 0; 377 }else{ 378 csv_errmsg(&sRdr, "unrecognized argument to 'header': %s", zValue); 379 goto csvtab_connect_error; 380 } 381 }else 382 if( (zValue = csv_parameter("testflags",9,z))!=0 ){ 383 tstFlags = (unsigned int)atoi(zValue); 384 }else 385 { 386 csv_errmsg(&sRdr, "unrecognized parameter '%s'", z); 387 goto csvtab_connect_error; 388 } 389 } 390 if( zFilename==0 ){ 391 csv_errmsg(&sRdr, "missing 'filename' parameter"); 392 goto csvtab_connect_error; 393 } 394 if( csv_reader_open(&sRdr, zFilename) ){ 395 goto csvtab_connect_error; 396 } 397 pNew = sqlite3_malloc( sizeof(*pNew) ); 398 *ppVtab = (sqlite3_vtab*)pNew; 399 if( pNew==0 ) goto csvtab_connect_oom; 400 memset(pNew, 0, sizeof(*pNew)); 401 do{ 402 const char *z = csv_read_one_field(&sRdr); 403 if( z==0 ) goto csvtab_connect_oom; 404 pNew->nCol++; 405 }while( sRdr.cTerm==',' ); 406 pNew->zFilename = zFilename; 407 pNew->tstFlags = tstFlags; 408 zFilename = 0; 409 pNew->iStart = bHeader==1 ? ftell(sRdr.in) : 0; 410 csv_reader_reset(&sRdr); 411 if( zSchema==0 ){ 412 char *zSep = ""; 413 zSchema = sqlite3_mprintf("CREATE TABLE x("); 414 if( zSchema==0 ) goto csvtab_connect_oom; 415 for(i=0; i<pNew->nCol; i++){ 416 zSchema = sqlite3_mprintf("%z%sc%d TEXT",zSchema, zSep, i); 417 zSep = ","; 418 } 419 zSchema = sqlite3_mprintf("%z);", zSchema); 420 } 421 rc = sqlite3_declare_vtab(db, zSchema); 422 if( rc ) goto csvtab_connect_error; 423 sqlite3_free(zSchema); 424 return SQLITE_OK; 425 426 csvtab_connect_oom: 427 rc = SQLITE_NOMEM; 428 csv_errmsg(&sRdr, "out of memory"); 429 430 csvtab_connect_error: 431 if( pNew ) csvtabDisconnect(&pNew->base); 432 sqlite3_free(zFilename); 433 sqlite3_free(zSchema); 434 if( sRdr.zErr[0] ){ 435 sqlite3_free(*pzErr); 436 *pzErr = sqlite3_mprintf("%s", sRdr.zErr); 437 } 438 csv_reader_reset(&sRdr); 439 if( rc==SQLITE_OK ) rc = SQLITE_ERROR; 440 return rc; 441 } 442 443 /* 444 ** Reset the current row content held by a CsvCursor. 445 */ 446 static void csvtabCursorRowReset(CsvCursor *pCur){ 447 CsvTable *pTab = (CsvTable*)pCur->base.pVtab; 448 int i; 449 for(i=0; i<pTab->nCol; i++){ 450 sqlite3_free(pCur->azVal[i]); 451 pCur->azVal[i] = 0; 452 } 453 } 454 455 /* 456 ** The xConnect and xCreate methods do the same thing, but they must be 457 ** different so that the virtual table is not an eponymous virtual table. 458 */ 459 static int csvtabCreate( 460 sqlite3 *db, 461 void *pAux, 462 int argc, const char *const*argv, 463 sqlite3_vtab **ppVtab, 464 char **pzErr 465 ){ 466 return csvtabConnect(db, pAux, argc, argv, ppVtab, pzErr); 467 } 468 469 /* 470 ** Destructor for a CsvCursor. 471 */ 472 static int csvtabClose(sqlite3_vtab_cursor *cur){ 473 CsvCursor *pCur = (CsvCursor*)cur; 474 csvtabCursorRowReset(pCur); 475 csv_reader_reset(&pCur->rdr); 476 sqlite3_free(cur); 477 return SQLITE_OK; 478 } 479 480 /* 481 ** Constructor for a new CsvTable cursor object. 482 */ 483 static int csvtabOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){ 484 CsvTable *pTab = (CsvTable*)p; 485 CsvCursor *pCur; 486 pCur = sqlite3_malloc( sizeof(*pCur) * sizeof(char*)*pTab->nCol ); 487 if( pCur==0 ) return SQLITE_NOMEM; 488 memset(pCur, 0, sizeof(*pCur) + sizeof(char*)*pTab->nCol ); 489 pCur->azVal = (char**)&pCur[1]; 490 *ppCursor = &pCur->base; 491 if( csv_reader_open(&pCur->rdr, pTab->zFilename) ){ 492 csv_xfer_error(pTab, &pCur->rdr); 493 return SQLITE_ERROR; 494 } 495 return SQLITE_OK; 496 } 497 498 499 /* 500 ** Advance a CsvCursor to its next row of input. 501 ** Set the EOF marker if we reach the end of input. 502 */ 503 static int csvtabNext(sqlite3_vtab_cursor *cur){ 504 CsvCursor *pCur = (CsvCursor*)cur; 505 CsvTable *pTab = (CsvTable*)cur->pVtab; 506 int i = 0; 507 char *z; 508 csvtabCursorRowReset(pCur); 509 do{ 510 z = csv_read_one_field(&pCur->rdr); 511 if( z==0 ){ 512 csv_xfer_error(pTab, &pCur->rdr); 513 break; 514 } 515 z = sqlite3_mprintf("%s", z); 516 if( z==0 ){ 517 csv_errmsg(&pCur->rdr, "out of memory"); 518 csv_xfer_error(pTab, &pCur->rdr); 519 break; 520 } 521 if( i<pTab->nCol ){ 522 pCur->azVal[i++] = z; 523 } 524 }while( z!=0 && pCur->rdr.cTerm==',' ); 525 if( z==0 || pCur->rdr.cTerm==EOF ){ 526 pCur->iRowid = -1; 527 }else{ 528 pCur->iRowid++; 529 } 530 return SQLITE_OK; 531 } 532 533 /* 534 ** Return values of columns for the row at which the CsvCursor 535 ** is currently pointing. 536 */ 537 static int csvtabColumn( 538 sqlite3_vtab_cursor *cur, /* The cursor */ 539 sqlite3_context *ctx, /* First argument to sqlite3_result_...() */ 540 int i /* Which column to return */ 541 ){ 542 CsvCursor *pCur = (CsvCursor*)cur; 543 CsvTable *pTab = (CsvTable*)cur->pVtab; 544 if( i>=0 && i<pTab->nCol && pCur->azVal[i]!=0 ){ 545 sqlite3_result_text(ctx, pCur->azVal[i], -1, SQLITE_STATIC); 546 } 547 return SQLITE_OK; 548 } 549 550 /* 551 ** Return the rowid for the current row. 552 */ 553 static int csvtabRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){ 554 CsvCursor *pCur = (CsvCursor*)cur; 555 *pRowid = pCur->iRowid; 556 return SQLITE_OK; 557 } 558 559 /* 560 ** Return TRUE if the cursor has been moved off of the last 561 ** row of output. 562 */ 563 static int csvtabEof(sqlite3_vtab_cursor *cur){ 564 CsvCursor *pCur = (CsvCursor*)cur; 565 return pCur->iRowid<0; 566 } 567 568 /* 569 ** Only a full table scan is supported. So xFilter simply rewinds to 570 ** the beginning. 571 */ 572 static int csvtabFilter( 573 sqlite3_vtab_cursor *pVtabCursor, 574 int idxNum, const char *idxStr, 575 int argc, sqlite3_value **argv 576 ){ 577 CsvCursor *pCur = (CsvCursor*)pVtabCursor; 578 CsvTable *pTab = (CsvTable*)pVtabCursor->pVtab; 579 pCur->iRowid = 0; 580 fseek(pCur->rdr.in, pTab->iStart, SEEK_SET); 581 return csvtabNext(pVtabCursor); 582 } 583 584 /* 585 ** Only a forwards full table scan is supported. xBestIndex is mostly 586 ** a no-op. If CSVTEST_FIDX is set, then the presence of equality 587 ** constraints lowers the estimated cost, which is fiction, but is useful 588 ** for testing certain kinds of virtual table behavior. 589 */ 590 static int csvtabBestIndex( 591 sqlite3_vtab *tab, 592 sqlite3_index_info *pIdxInfo 593 ){ 594 CsvTable *pTab = (CsvTable*)tab; 595 int i; 596 pIdxInfo->estimatedCost = 1000000; 597 if( (pTab->tstFlags & CSVTEST_FIDX)==0 ){ 598 return SQLITE_OK; 599 } 600 for(i=0; i<pIdxInfo->nConstraint; i++){ 601 if( pIdxInfo->aConstraint[i].usable==0 ) continue; 602 if( pIdxInfo->aConstraint[i].op==SQLITE_INDEX_CONSTRAINT_EQ ){ 603 pIdxInfo->estimatedCost = 10; 604 break; 605 } 606 } 607 return SQLITE_OK; 608 } 609 610 611 static sqlite3_module CsvModule = { 612 0, /* iVersion */ 613 csvtabCreate, /* xCreate */ 614 csvtabConnect, /* xConnect */ 615 csvtabBestIndex, /* xBestIndex */ 616 csvtabDisconnect, /* xDisconnect */ 617 csvtabDisconnect, /* xDestroy */ 618 csvtabOpen, /* xOpen - open a cursor */ 619 csvtabClose, /* xClose - close a cursor */ 620 csvtabFilter, /* xFilter - configure scan constraints */ 621 csvtabNext, /* xNext - advance a cursor */ 622 csvtabEof, /* xEof - check for end of scan */ 623 csvtabColumn, /* xColumn - read data */ 624 csvtabRowid, /* xRowid - read data */ 625 0, /* xUpdate */ 626 0, /* xBegin */ 627 0, /* xSync */ 628 0, /* xCommit */ 629 0, /* xRollback */ 630 0, /* xFindMethod */ 631 0, /* xRename */ 632 }; 633 634 #ifdef _WIN32 635 __declspec(dllexport) 636 #endif 637 /* 638 ** This routine is called when the extension is loaded. The new 639 ** CSV virtual table module is registered with the calling database 640 ** connection. 641 */ 642 int sqlite3_csv_init( 643 sqlite3 *db, 644 char **pzErrMsg, 645 const sqlite3_api_routines *pApi 646 ){ 647 SQLITE_EXTENSION_INIT2(pApi); 648 return sqlite3_create_module(db, "csv", &CsvModule, 0); 649 } 650