blob: 62521d81f780a692805dc1f23c3d629ccbd1ec06 [file] [log] [blame]
drh724b1892016-05-31 16:22:48 +00001/*
2** 2016-05-28
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This file contains the implementation of an SQLite virtual table for
14** reading CSV files.
15**
16** Usage:
17**
18** .load ./csv
19** CREATE VIRTUAL TABLE temp.csv USING csv(filename=FILENAME);
20** SELECT * FROM csv;
21**
22** The columns are named "c1", "c2", "c3", ... by default. But the
23** application can define its own CREATE TABLE statement as an additional
24** parameter. For example:
25**
26** CREATE VIRTUAL TABLE temp.csv2 USING csv(
27** filename = "../http.log",
28** schema = "CREATE TABLE x(date,ipaddr,url,referrer,userAgent)"
29** );
30*/
31#include <sqlite3ext.h>
32SQLITE_EXTENSION_INIT1
33#include <string.h>
34#include <stdlib.h>
35#include <assert.h>
36#include <stdarg.h>
37#include <ctype.h>
38#include <stdio.h>
39
40/*
41** A macro to hint to the compiler that a function should not be
42** inlined.
43*/
44#if defined(__GNUC__)
45# define CSV_NOINLINE __attribute__((noinline))
46#elif defined(_MSC_VER) && _MSC_VER>=1310
47# define CSV_NOINLINE __declspec(noinline)
48#else
49# define CSV_NOINLINE
50#endif
51
52
53/* Max size of the error message in a CsvReader */
54#define CSV_MXERR 200
55
56/* A context object used when read a CSV file. */
57typedef struct CsvReader CsvReader;
58struct CsvReader {
59 FILE *in; /* Read the CSV text from this input stream */
60 char *z; /* Accumulated text for a field */
61 int n; /* Number of bytes in z */
62 int nAlloc; /* Space allocated for z[] */
63 int nLine; /* Current line number */
64 int cTerm; /* Character that terminated the most recent field */
65 char zErr[CSV_MXERR]; /* Error message */
66};
67
68/* Initialize a CsvReader object */
69static void csv_reader_init(CsvReader *p){
70 memset(p, 0, sizeof(*p));
71}
72
73/* Close and reset a CsvReader object */
74static void csv_reader_reset(CsvReader *p){
75 if( p->in ) fclose(p->in);
76 sqlite3_free(p->z);
77 csv_reader_init(p);
78}
79
80/* Report an error on a CsvReader */
81static void csv_errmsg(CsvReader *p, const char *zFormat, ...){
82 va_list ap;
83 va_start(ap, zFormat);
84 sqlite3_vsnprintf(CSV_MXERR, p->zErr, zFormat, ap);
85 va_end(ap);
86}
87
88/* Open the file associated with a CsvReader
89** Return the number of errors.
90*/
91static int csv_reader_open(CsvReader *p, const char *zFilename){
92 p->in = fopen(zFilename, "rb");
93 if( p->in==0 ){
94 csv_errmsg(p, "cannot open '%s' for reading", zFilename);
95 return 1;
96 }
97 return 0;
98}
99
100/* Increase the size of p->z and append character c to the end.
101** Return 0 on success and non-zero if there is an OOM error */
102static CSV_NOINLINE int csv_resize_and_append(CsvReader *p, char c){
103 char *zNew;
104 int nNew = p->nAlloc*2 + 100;
105 zNew = sqlite3_realloc64(p->z, nNew);
106 if( zNew ){
107 p->z = zNew;
108 p->nAlloc = nNew;
109 p->z[p->n++] = c;
110 return 0;
111 }else{
112 csv_errmsg(p, "out of memory");
113 return 1;
114 }
115}
116
117/* Append a single character to the CsvReader.z[] array.
118** Return 0 on success and non-zero if there is an OOM error */
119static int csv_append(CsvReader *p, char c){
120 if( p->n>=p->nAlloc-1 ) return csv_resize_and_append(p, c);
121 p->z[p->n++] = c;
122 return 0;
123}
124
125/* Read a single field of CSV text. Compatible with rfc4180 and extended
126** with the option of having a separator other than ",".
127**
128** + Input comes from p->in.
129** + Store results in p->z of length p->n. Space to hold p->z comes
130** from sqlite3_malloc64().
131** + Keep track of the line number in p->nLine.
132** + Store the character that terminates the field in p->cTerm. Store
133** EOF on end-of-file.
134**
135** Return "" at EOF. Return 0 on an OOM error.
136*/
137static char *csv_read_one_field(CsvReader *p){
138 int c;
139 p->n = 0;
140 c = fgetc(p->in);
141 if( c==EOF ){
142 p->cTerm = EOF;
143 return "";
144 }
145 if( c=='"' ){
146 int pc, ppc;
147 int startLine = p->nLine;
148 int cQuote = c;
149 pc = ppc = 0;
150 while( 1 ){
151 c = fgetc(p->in);
152 if( c=='\n' ) p->nLine++;
153 if( c==cQuote ){
154 if( pc==cQuote ){
155 pc = 0;
156 continue;
157 }
158 }
159 if( (c==',' && pc==cQuote)
160 || (c=='\n' && pc==cQuote)
161 || (c=='\n' && pc=='\r' && ppc==cQuote)
162 || (c==EOF && pc==cQuote)
163 ){
164 do{ p->n--; }while( p->z[p->n]!=cQuote );
165 p->cTerm = c;
166 break;
167 }
168 if( pc==cQuote && c!='\r' ){
169 csv_errmsg(p, "line %d: unescaped %c character", p->nLine, cQuote);
170 break;
171 }
172 if( c==EOF ){
173 csv_errmsg(p, "line %d: unterminated %c-quoted field\n",
174 startLine, cQuote);
175 p->cTerm = c;
176 break;
177 }
178 if( csv_append(p, (char)c) ) return 0;
179 ppc = pc;
180 pc = c;
181 }
182 }else{
183 while( c!=EOF && c!=',' && c!='\n' ){
184 if( csv_append(p, (char)c) ) return 0;
185 c = fgetc(p->in);
186 }
187 if( c=='\n' ){
188 p->nLine++;
189 if( p->n>0 && p->z[p->n-1]=='\r' ) p->n--;
190 }
191 p->cTerm = c;
192 }
193 if( p->z ) p->z[p->n] = 0;
194 return p->z;
195}
196
197
198/* Forward references to the various virtual table methods implemented
199** in this file. */
200static int csvtabCreate(sqlite3*, void*, int, const char*const*,
201 sqlite3_vtab**,char**);
202static int csvtabConnect(sqlite3*, void*, int, const char*const*,
203 sqlite3_vtab**,char**);
204static int csvtabBestIndex(sqlite3_vtab*,sqlite3_index_info*);
205static int csvtabDisconnect(sqlite3_vtab*);
206static int csvtabOpen(sqlite3_vtab*, sqlite3_vtab_cursor**);
207static int csvtabClose(sqlite3_vtab_cursor*);
208static int csvtabFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr,
209 int argc, sqlite3_value **argv);
210static int csvtabNext(sqlite3_vtab_cursor*);
211static int csvtabEof(sqlite3_vtab_cursor*);
212static int csvtabColumn(sqlite3_vtab_cursor*,sqlite3_context*,int);
213static int csvtabRowid(sqlite3_vtab_cursor*,sqlite3_int64*);
214
215/* An instance of the CSV virtual table */
216typedef struct CsvTable {
217 sqlite3_vtab base; /* Base class. Must be first */
218 char *zFilename; /* Name of the CSV file */
219 long iStart; /* Offset to start of data in zFilename */
220 int nCol; /* Number of columns in the CSV file */
221} CsvTable;
222
223/* A cursor for the CSV virtual table */
224typedef struct CsvCursor {
225 sqlite3_vtab_cursor base; /* Base class. Must be first */
226 CsvReader rdr; /* The CsvReader object */
227 char **azVal; /* Value of the current row */
228 sqlite3_int64 iRowid; /* The current rowid. Negative for EOF */
229} CsvCursor;
230
231/* Transfer error message text from a reader into a CsvTable */
232static void csv_xfer_error(CsvTable *pTab, CsvReader *pRdr){
233 sqlite3_free(pTab->base.zErrMsg);
234 pTab->base.zErrMsg = sqlite3_mprintf("%s", pRdr->zErr);
235}
236
237/*
238** This method is the destructor fo a CsvTable object.
239*/
240static int csvtabDisconnect(sqlite3_vtab *pVtab){
241 CsvTable *p = (CsvTable*)pVtab;
242 sqlite3_free(p->zFilename);
243 sqlite3_free(p);
244 return SQLITE_OK;
245}
246
247/* Skip leading whitespace. Return a pointer to the first non-whitespace
248** character, or to the zero terminator if the string has only whitespace */
249static const char *csv_skip_whitespace(const char *z){
250 while( isspace((unsigned char)z[0]) ) z++;
251 return z;
252}
253
254/* Remove trailing whitespace from the end of string z[] */
255static void csv_trim_whitespace(char *z){
256 size_t n = strlen(z);
257 while( n>0 && isspace((unsigned char)z[n]) ) n--;
258 z[n] = 0;
259}
260
261/* Dequote the string */
262static void csv_dequote(char *z){
263 int i, j;
264 char cQuote = z[0];
265 size_t n;
266
267 if( cQuote!='\'' && cQuote!='"' ) return;
268 n = strlen(z);
269 if( n<2 || z[n-1]!=z[0] ) return;
270 for(i=1, j=0; i<n-1; i++){
271 if( z[i]==cQuote && z[i+1]==cQuote ) i++;
272 z[j++] = z[i];
273 }
274 z[j] = 0;
275}
276
277/* Check to see if the string is of the form: "TAG = VALUE" with optional
278** whitespace before and around tokens. If it is, return a pointer to the
279** first character of VALUE. If it is not, return NULL.
280*/
281static const char *csv_parameter(const char *zTag, int nTag, const char *z){
282 z = csv_skip_whitespace(z);
283 if( strncmp(zTag, z, nTag)!=0 ) return 0;
284 z = csv_skip_whitespace(z+nTag);
285 if( z[0]!='=' ) return 0;
286 return csv_skip_whitespace(z+1);
287}
288
289/* Return 0 if the argument is false and 1 if it is true. Return -1 if
290** we cannot really tell.
291*/
292static int csv_boolean(const char *z){
293 if( sqlite3_stricmp("yes",z)==0
294 || sqlite3_stricmp("on",z)==0
295 || sqlite3_stricmp("true",z)==0
296 || (z[0]=='1' && z[0]==0)
297 ){
298 return 1;
299 }
300 if( sqlite3_stricmp("no",z)==0
301 || sqlite3_stricmp("off",z)==0
302 || sqlite3_stricmp("false",z)==0
303 || (z[0]=='0' && z[1]==0)
304 ){
305 return 0;
306 }
307 return -1;
308}
309
310
311/*
312** Parameters:
313** filename=FILENAME Required
314** schema=SCHEMA Optional
315** header=YES|NO First row of CSV defines the names of
316** columns if "yes". Default "no".
317**
318** If header=no and not columns are listed, then the columns are named
319** "c0", "c1", "c2", and so forth.
320*/
321static int csvtabConnect(
322 sqlite3 *db,
323 void *pAux,
324 int argc, const char *const*argv,
325 sqlite3_vtab **ppVtab,
326 char **pzErr
327){
328 CsvTable *pNew = 0;
329 int bHeader = -1;
330 int rc = SQLITE_OK;
331 int i;
332 char *zFilename = 0;
333 char *zSchema = 0;
334 CsvReader sRdr;
335
336 memset(&sRdr, 0, sizeof(sRdr));
337 for(i=3; i<argc; i++){
338 const char *z = argv[i];
339 const char *zValue;
340 if( (zValue = csv_parameter("filename",8,z))!=0 ){
341 if( zFilename ){
342 csv_errmsg(&sRdr, "more than one 'filename' parameter");
343 goto csvtab_connect_error;
344 }
345 zFilename = sqlite3_mprintf("%s", zValue);
346 if( zFilename==0 ) goto csvtab_connect_oom;
347 csv_trim_whitespace(zFilename);
348 csv_dequote(zFilename);
349 }else
350 if( (zValue = csv_parameter("schema",6,z))!=0 ){
351 if( zSchema ){
352 csv_errmsg(&sRdr, "more than one 'schema' parameter");
353 goto csvtab_connect_error;
354 }
355 zSchema = sqlite3_mprintf("%s", zValue);
356 if( zSchema==0 ) goto csvtab_connect_oom;
357 csv_trim_whitespace(zSchema);
358 csv_dequote(zSchema);
359 }else
360 if( (zValue = csv_parameter("header",6,z))!=0 ){
361 int x;
362 if( bHeader>=0 ){
363 csv_errmsg(&sRdr, "more than one 'header' parameter");
364 goto csvtab_connect_error;
365 }
366 x = csv_boolean(zValue);
367 if( x==1 ){
368 bHeader = 1;
369 }else if( x==0 ){
370 bHeader = 0;
371 }else{
372 csv_errmsg(&sRdr, "unrecognized argument to 'header': %s", zValue);
373 goto csvtab_connect_error;
374 }
375 }else
376 {
377 csv_errmsg(&sRdr, "unrecognized parameter '%s'", z);
378 goto csvtab_connect_error;
379 }
380 }
381 if( zFilename==0 ){
382 csv_errmsg(&sRdr, "missing 'filename' parameter");
383 goto csvtab_connect_error;
384 }
385 if( csv_reader_open(&sRdr, zFilename) ){
386 goto csvtab_connect_error;
387 }
388 pNew = sqlite3_malloc( sizeof(*pNew) );
389 *ppVtab = (sqlite3_vtab*)pNew;
390 if( pNew==0 ) goto csvtab_connect_oom;
391 memset(pNew, 0, sizeof(*pNew));
392 do{
393 const char *z = csv_read_one_field(&sRdr);
394 if( z==0 ) goto csvtab_connect_oom;
395 pNew->nCol++;
396 }while( sRdr.cTerm==',' );
397 pNew->zFilename = zFilename;
398 zFilename = 0;
399 pNew->iStart = bHeader==1 ? ftell(sRdr.in) : 0;
400 csv_reader_reset(&sRdr);
401 if( zSchema==0 ){
402 char *zSep = "";
403 zSchema = sqlite3_mprintf("CREATE TABLE x(");
404 if( zSchema==0 ) goto csvtab_connect_oom;
405 for(i=0; i<pNew->nCol; i++){
406 zSchema = sqlite3_mprintf("%z%sc%d TEXT",zSchema, zSep, i);
407 zSep = ",";
408 }
409 zSchema = sqlite3_mprintf("%z);", zSchema);
410 }
411 rc = sqlite3_declare_vtab(db, zSchema);
412 if( rc ) goto csvtab_connect_error;
413 sqlite3_free(zSchema);
414 return SQLITE_OK;
415
416csvtab_connect_oom:
417 rc = SQLITE_NOMEM;
418 csv_errmsg(&sRdr, "out of memory");
419
420csvtab_connect_error:
421 if( pNew ) csvtabDisconnect(&pNew->base);
422 sqlite3_free(zFilename);
423 sqlite3_free(zSchema);
424 if( sRdr.zErr[0] ){
425 sqlite3_free(*pzErr);
426 *pzErr = sqlite3_mprintf("%s", sRdr.zErr);
427 }
428 csv_reader_reset(&sRdr);
429 return rc;
430}
431
432/*
433** Reset the current row content held by a CsvCursor.
434*/
435static void csvtabCursorRowReset(CsvCursor *pCur){
436 CsvTable *pTab = (CsvTable*)pCur->base.pVtab;
437 int i;
438 for(i=0; i<pTab->nCol; i++){
439 sqlite3_free(pCur->azVal[i]);
440 pCur->azVal[i] = 0;
441 }
442}
443
444/*
445** The xConnect and xCreate methods do the same thing, but they must be
446** different so that the virtual table is not an eponymous virtual table.
447*/
448static int csvtabCreate(
449 sqlite3 *db,
450 void *pAux,
451 int argc, const char *const*argv,
452 sqlite3_vtab **ppVtab,
453 char **pzErr
454){
455 return csvtabConnect(db, pAux, argc, argv, ppVtab, pzErr);
456}
457
458/*
459** Destructor for a CsvCursor.
460*/
461static int csvtabClose(sqlite3_vtab_cursor *cur){
462 CsvCursor *pCur = (CsvCursor*)cur;
463 csvtabCursorRowReset(pCur);
464 csv_reader_reset(&pCur->rdr);
465 sqlite3_free(cur);
466 return SQLITE_OK;
467}
468
469/*
470** Constructor for a new CsvTable cursor object.
471*/
472static int csvtabOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){
473 CsvTable *pTab = (CsvTable*)p;
474 CsvCursor *pCur;
475 pCur = sqlite3_malloc( sizeof(*pCur) * sizeof(char*)*pTab->nCol );
476 if( pCur==0 ) return SQLITE_NOMEM;
477 memset(pCur, 0, sizeof(*pCur) + sizeof(char*)*pTab->nCol );
478 pCur->azVal = (char**)&pCur[1];
479 *ppCursor = &pCur->base;
480 if( csv_reader_open(&pCur->rdr, pTab->zFilename) ){
481 csv_xfer_error(pTab, &pCur->rdr);
482 return SQLITE_ERROR;
483 }
484 return SQLITE_OK;
485}
486
487
488/*
489** Advance a CsvCursor to its next row of input.
490** Set the EOF marker if we reach the end of input.
491*/
492static int csvtabNext(sqlite3_vtab_cursor *cur){
493 CsvCursor *pCur = (CsvCursor*)cur;
494 CsvTable *pTab = (CsvTable*)cur->pVtab;
495 int i = 0;
496 char *z;
497 csvtabCursorRowReset(pCur);
498 do{
499 z = csv_read_one_field(&pCur->rdr);
500 if( z==0 ){
501 csv_xfer_error(pTab, &pCur->rdr);
502 break;
503 }
504 z = sqlite3_mprintf("%s", z);
505 if( z==0 ){
506 csv_errmsg(&pCur->rdr, "out of memory");
507 csv_xfer_error(pTab, &pCur->rdr);
508 break;
509 }
510 if( i<pTab->nCol ){
511 pCur->azVal[i++] = z;
512 }
513 }while( z!=0 && pCur->rdr.cTerm==',' );
514 if( z==0 || pCur->rdr.cTerm==EOF ){
515 pCur->iRowid = -1;
516 }else{
517 pCur->iRowid++;
518 }
519 return SQLITE_OK;
520}
521
522/*
523** Return values of columns for the row at which the CsvCursor
524** is currently pointing.
525*/
526static int csvtabColumn(
527 sqlite3_vtab_cursor *cur, /* The cursor */
528 sqlite3_context *ctx, /* First argument to sqlite3_result_...() */
529 int i /* Which column to return */
530){
531 CsvCursor *pCur = (CsvCursor*)cur;
532 CsvTable *pTab = (CsvTable*)cur->pVtab;
533 if( i>=0 && i<pTab->nCol && pCur->azVal[i]!=0 ){
534 sqlite3_result_text(ctx, pCur->azVal[i], -1, SQLITE_STATIC);
535 }
536 return SQLITE_OK;
537}
538
539/*
540** Return the rowid for the current row.
541*/
542static int csvtabRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
543 CsvCursor *pCur = (CsvCursor*)cur;
544 *pRowid = pCur->iRowid;
545 return SQLITE_OK;
546}
547
548/*
549** Return TRUE if the cursor has been moved off of the last
550** row of output.
551*/
552static int csvtabEof(sqlite3_vtab_cursor *cur){
553 CsvCursor *pCur = (CsvCursor*)cur;
554 return pCur->iRowid<0;
555}
556
557/*
558** Only a full table scan is supported. So xFilter simply rewinds to
559** the beginning.
560*/
561static int csvtabFilter(
562 sqlite3_vtab_cursor *pVtabCursor,
563 int idxNum, const char *idxStr,
564 int argc, sqlite3_value **argv
565){
566 CsvCursor *pCur = (CsvCursor*)pVtabCursor;
567 CsvTable *pTab = (CsvTable*)pVtabCursor->pVtab;
568 pCur->iRowid = 0;
569 fseek(pCur->rdr.in, pTab->iStart, SEEK_SET);
570 return csvtabNext(pVtabCursor);
571}
572
573/*
574** Only a forwards full table scan is supported. xBestIndex is a no-op.
575*/
576static int csvtabBestIndex(
577 sqlite3_vtab *tab,
578 sqlite3_index_info *pIdxInfo
579){
580 return SQLITE_OK;
581}
582
583
584static sqlite3_module CsvModule = {
585 0, /* iVersion */
586 csvtabCreate, /* xCreate */
587 csvtabConnect, /* xConnect */
588 csvtabBestIndex, /* xBestIndex */
589 csvtabDisconnect, /* xDisconnect */
590 csvtabDisconnect, /* xDestroy */
591 csvtabOpen, /* xOpen - open a cursor */
592 csvtabClose, /* xClose - close a cursor */
593 csvtabFilter, /* xFilter - configure scan constraints */
594 csvtabNext, /* xNext - advance a cursor */
595 csvtabEof, /* xEof - check for end of scan */
596 csvtabColumn, /* xColumn - read data */
597 csvtabRowid, /* xRowid - read data */
598 0, /* xUpdate */
599 0, /* xBegin */
600 0, /* xSync */
601 0, /* xCommit */
602 0, /* xRollback */
603 0, /* xFindMethod */
604 0, /* xRename */
605};
606
607#ifdef _WIN32
608__declspec(dllexport)
609#endif
610/*
611** This routine is called when the extension is loaded. The new
612** CSV virtual table module is registered with the calling database
613** connection.
614*/
615int sqlite3_csv_init(
616 sqlite3 *db,
617 char **pzErrMsg,
618 const sqlite3_api_routines *pApi
619){
620 SQLITE_EXTENSION_INIT2(pApi);
621 return sqlite3_create_module(db, "csv", &CsvModule, 0);
622}