Skip to content

Commit 3456f97

Browse files
CSV import options for DELIMITER, NULL, and QUOTE (#256)
This branch implements additional CSV parsing options. The goal is for the following SQL statement to be parsed correctly: ``` COPY "{}" FROM '{}' WITH (FORMAT CSV, DELIMITER '|', NULL '', QUOTE '"'); ``` Here, DELIMITER is the field delimiter of the CSV file, NULL is the string that represents null values in the CSV file, and QUOTE is the string used to quote values.
1 parent 0663319 commit 3456f97

File tree

10 files changed

+2042
-1791
lines changed

10 files changed

+2042
-1791
lines changed

src/parser/bison_parser.cpp

Lines changed: 1878 additions & 1780 deletions
Large diffs are not rendered by default.

src/parser/bison_parser.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,10 +345,11 @@ union HSQL_STYPE
345345
hsql::RowLockWaitPolicy lock_wait_policy_t;
346346

347347
hsql::ImportExportOptions* import_export_option_t;
348+
std::pair<hsql::CsvOptionType, char*>* csv_option_t;
348349

349350
// clang-format off
350351

351-
#line 352 "bison_parser.h"
352+
#line 353 "bison_parser.h"
352353

353354
};
354355
typedef union HSQL_STYPE HSQL_STYPE;

src/parser/bison_parser.y

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@
169169
hsql::RowLockWaitPolicy lock_wait_policy_t;
170170

171171
hsql::ImportExportOptions* import_export_option_t;
172+
std::pair<hsql::CsvOptionType, char*>* csv_option_t;
172173

173174
// clang-format off
174175
}
@@ -199,6 +200,10 @@
199200
}
200201
delete ($$);
201202
} <table_vec> <table_element_vec> <update_vec> <expr_vec> <order_vec> <stmt_vec>
203+
// Cleanup for <csv_option_t> values: free the C string held in `second`, then delete the pair itself.
%destructor { free($$->second); delete ($$); } <csv_option_t>
202207
%destructor { delete ($$); } <*>
203208

204209

@@ -294,6 +299,7 @@
294299
// ImportType is used for compatibility reasons
295300
%type <import_type_t> file_type
296301
%type <import_export_option_t> opt_import_export_options import_export_options
302+
%type <csv_option_t> csv_option
297303

298304
%type <str_vec> ident_commalist opt_column_list
299305
%type <expr_vec> expr_list select_list opt_extended_literal_list extended_literal_list hint_list opt_hints opt_partition
@@ -469,6 +475,10 @@ import_statement : IMPORT FROM file_type FILE file_path INTO table_name {
469475
$$->encoding = $5->encoding;
470476
$5->encoding = nullptr;
471477
}
478+
if ($5->csv_options) {
479+
$$->csv_options = $5->csv_options;
480+
$5->csv_options = nullptr;
481+
}
472482
delete $5;
473483
};
474484

@@ -499,6 +509,11 @@ import_export_options : import_export_options ',' FORMAT file_type {
499509
yyerror(&yyloc, result, scanner, "File type must only be provided once.");
500510
YYERROR;
501511
}
512+
if ($1->csv_options && $4 != kImportCSV && $4 != kImportAuto) {
513+
delete $1;
514+
yyerror(&yyloc, result, scanner, "CSV options (DELIMITER, NULL, QUOTE) are only allowed for CSV files.");
515+
YYERROR;
516+
}
502517
$1->format = $4;
503518
$$ = $1;
504519
}
@@ -519,7 +534,53 @@ import_export_options : import_export_options ',' FORMAT file_type {
519534
| ENCODING STRING {
520535
$$ = new ImportExportOptions{};
521536
$$->encoding = $2;
522-
};
537+
}
538+
| import_export_options ',' csv_option {
539+
if ($1->format != kImportAuto && $1->format != kImportCSV) {
540+
delete $1;
541+
free($3->second);
542+
delete $3;
543+
yyerror(&yyloc, result, scanner, "CSV options (DELIMITER, NULL, QUOTE) are only allowed for CSV files.");
544+
YYERROR;
545+
}
546+
547+
if ($1->csv_options == nullptr) {
548+
$1->csv_options = new CsvOptions{};
549+
}
550+
551+
if (!$1->csv_options->accept_csv_option($3)) {
552+
free($3->second);
553+
delete $3;
554+
delete $1;
555+
yyerror(&yyloc, result, scanner, "CSV options (DELIMITER, NULL, QUOTE) cannot be provided more than once.");
556+
YYERROR;
557+
}
558+
559+
delete $3;
560+
$$ = $1;
561+
}
562+
| csv_option {
563+
$$ = new ImportExportOptions{};
564+
$$->csv_options = new CsvOptions{};
565+
$$->csv_options->accept_csv_option($1);
566+
567+
delete $1;
568+
}
569+
570+
// A single CSV option. DELIMITER and QUOTE arrive as plain identifiers and are matched
// case-insensitively; NULL is its own token. The resulting pair carries the option's string value.
csv_option : IDENTIFIER STRING {
    const bool is_delimiter = strcasecmp($1, "DELIMITER") == 0;
    const bool is_quote = !is_delimiter && strcasecmp($1, "QUOTE") == 0;

    // The option name is only needed for the comparison above.
    free($1);

    if (!is_delimiter && !is_quote) {
      // The value string is not handed to a pair on this path, so release it here.
      free($2);
      yyerror(&yyloc, result, scanner, "Unknown CSV option.");
      YYERROR;
    }

    $$ = new std::pair<CsvOptionType, char*>(is_delimiter ? CsvOptionType::Delimiter : CsvOptionType::Quote, $2);
  }
  | NULL STRING {
    $$ = new std::pair<CsvOptionType, char*>(CsvOptionType::Null, $2);
  }
523584

524585
/******************************
525586
* Export Statement
@@ -535,6 +596,10 @@ export_statement : COPY table_name TO file_path opt_import_export_options {
535596
$$->encoding = $5->encoding;
536597
$5->encoding = nullptr;
537598
}
599+
if ($5->csv_options) {
600+
$$->csv_options = $5->csv_options;
601+
$5->csv_options = nullptr;
602+
}
538603
delete $5;
539604
}
540605
| COPY select_with_paren TO file_path opt_import_export_options {
@@ -545,6 +610,10 @@ export_statement : COPY table_name TO file_path opt_import_export_options {
545610
$$->encoding = $5->encoding;
546611
$5->encoding = nullptr;
547612
}
613+
if ($5->csv_options) {
614+
$$->csv_options = $5->csv_options;
615+
$5->csv_options = nullptr;
616+
}
548617
delete $5;
549618
};
550619

src/sql/ExportStatement.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ struct ExportStatement : SQLStatement {
1818
char* tableName;
1919
SelectStatement* select;
2020
char* encoding;
21+
CsvOptions* csv_options;
2122
};
2223

2324
} // namespace hsql

src/sql/ImportExportOptions.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#ifndef SQLPARSER_IMPORT_EXPORT_OPTIONS_H
22
#define SQLPARSER_IMPORT_EXPORT_OPTIONS_H
33

4+
#include <utility>
5+
46
namespace hsql {
57

68
// Name unchanged for compatibility. Historically, this was only used for import statements before we introduced export
@@ -13,12 +15,30 @@ enum ImportType {
1315
kImportAuto
1416
};
1517

18+
// Identifies which CSV option a parsed (type, value) pair carries.
enum CsvOptionType {
  Delimiter,
  Null,
  Quote,
};

// CSV-specific options (DELIMITER, NULL, QUOTE) of an import/export statement.
// Every member starts out as nullptr and, once set, is released with free() by the destructor.
struct CsvOptions {
  CsvOptions();
  ~CsvOptions();

  // The destructor frees the strings below, so a member-wise copy would free them twice.
  CsvOptions(const CsvOptions&) = delete;
  CsvOptions& operator=(const CsvOptions&) = delete;

  char* delimiter;
  char* null;
  char* quote;

  // Stores option->second in the member selected by option->first.
  // Returns false and stores nothing if that member has already been set.
  bool accept_csv_option(std::pair<CsvOptionType, char*>* option);
};
34+
1635
struct ImportExportOptions {
1736
ImportExportOptions();
1837
~ImportExportOptions();
1938

2039
ImportType format;
2140
char* encoding;
41+
CsvOptions* csv_options;
2242
};
2343

2444
} // namespace hsql

src/sql/ImportStatement.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ struct ImportStatement : SQLStatement {
1717
char* tableName;
1818
Expr* whereClause;
1919
char* encoding;
20+
CsvOptions* csv_options;
2021
};
2122

2223
} // namespace hsql

src/sql/statements.cpp

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,19 +140,56 @@ ExportStatement::ExportStatement(ImportType type)
140140
schema(nullptr),
141141
tableName(nullptr),
142142
select(nullptr),
143-
encoding(nullptr) {}
143+
encoding(nullptr),
144+
csv_options(nullptr) {}
144145

145146
// Releases everything the export statement owns.
ExportStatement::~ExportStatement() {
  // C strings are released with free().
  free(filePath);
  free(schema);
  free(tableName);
  free(encoding);

  // Owned objects are released with delete.
  delete select;
  delete csv_options;
}
152154

153-
ImportExportOptions::ImportExportOptions() : format(kImportAuto), encoding(nullptr) {}
155+
// All options start out unset.
CsvOptions::CsvOptions()
    : delimiter(nullptr),
      null(nullptr),
      quote(nullptr) {}

// Releases whichever option strings were set; free(nullptr) is a no-op for the rest.
CsvOptions::~CsvOptions() {
  free(quote);
  free(null);
  free(delimiter);
}
161+
162+
bool CsvOptions::accept_csv_option(std::pair<CsvOptionType, char*>* option) {
163+
switch (option->first) {
164+
case CsvOptionType::Delimiter:
165+
if (delimiter != nullptr) {
166+
return false;
167+
}
168+
delimiter = option->second;
169+
break;
170+
case CsvOptionType::Null:
171+
if (null != nullptr) {
172+
return false;
173+
}
174+
null = option->second;
175+
break;
176+
case CsvOptionType::Quote:
177+
if (quote != nullptr) {
178+
return false;
179+
}
180+
quote = option->second;
181+
break;
182+
}
183+
184+
return true;
185+
}
154186

155-
ImportExportOptions::~ImportExportOptions() { free(encoding); }
187+
// Defaults: format auto-detection, no encoding, no CSV options.
ImportExportOptions::ImportExportOptions()
    : format(kImportAuto),
      encoding(nullptr),
      csv_options(nullptr) {}

// Releases whatever was not handed over to a statement.
ImportExportOptions::~ImportExportOptions() {
  delete csv_options;
  free(encoding);
}
156193

157194
// ImportStatement
158195
ImportStatement::ImportStatement(ImportType type)
@@ -162,14 +199,16 @@ ImportStatement::ImportStatement(ImportType type)
162199
schema(nullptr),
163200
tableName(nullptr),
164201
whereClause(nullptr),
165-
encoding(nullptr) {}
202+
encoding(nullptr),
203+
csv_options(nullptr) {}
166204

167205
// Releases everything the import statement owns.
ImportStatement::~ImportStatement() {
  // C strings are released with free().
  free(filePath);
  free(schema);
  free(tableName);
  free(encoding);

  // Owned objects are released with delete.
  delete whereClause;
  delete csv_options;
}
174213

175214
// InsertStatement

test/queries/queries-bad.sql

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,11 @@
102102
!SELECT * FROM students ORDER BY name FIRST;
103103
!SELECT * FROM students ORDER BY name ASC LAST;
104104
!SELECT * FROM students ORDER BY name DESC NULLS gibberish;
105+
# CSV options
106+
!COPY students FROM 'file_path' WITH (FORMAT TBL, DELIMITER '|', NULL '', QUOTE '"');
107+
!COPY students FROM 'file_path' WITH (DELIMITER '|', NULL '', QUOTE '"', FORMAT TBL);
108+
!COPY students FROM 'file_path' WITH (DELIMITER '|', NULL '', FORMAT TBL, QUOTE '"');
109+
!COPY students FROM 'file_path' WITH (DELIMITER '|', NULL '', QUOTE '"', NULL 'a');
110+
!COPY students FROM 'file_path' WITH (NULL '', QUOTE '"', DELIMITER '|', DELIMITER '/');
111+
# QUOTE given twice; no trailing comma, so the query is rejected for the duplicate option only.
!COPY students FROM 'file_path' WITH (QUOTE '"', NULL '', DELIMITER '/', QUOTE '_');
112+
!COPY students FROM 'file_path' WITH (FORMAT CSV, QUOTE '"', DELIMINIMITER '|');

test/queries/queries-good.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ COPY students FROM 'file_path' WITH (FORMAT TBL);
7474
COPY students FROM 'file_path' WITH (FORMAT CSV);
7575
COPY students FROM 'file_path' WITH (FORMAT BIN);
7676
COPY students FROM 'file_path' WITH (FORMAT BINARY);
77+
COPY students FROM 'file_path' WITH (FORMAT CSV, DELIMITER '|', NULL '', QUOTE '"');
78+
COPY students FROM 'file_path' WITH (DELIMITER '|', NULL '', FORMAT CSV, QUOTE '"');
79+
COPY students FROM 'file_path' WITH (DELIMITER '|', NULL '', QUOTE '"');
80+
COPY students FROM 'file_path' WITH (DELIMITER '|', FORMAT CSV);
7781
COPY students FROM 'file_path' (FORMAT TBL);
7882
COPY good_students FROM 'file_path' WHERE grade > (SELECT AVG(grade) from alumni);
7983
COPY students TO 'student.tbl';

test/sql_tests.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -466,16 +466,23 @@ TEST(ImportStatementTest) {
466466
}
467467

468468
TEST(CopyStatementTest) {
469-
TEST_PARSE_SINGLE_SQL("COPY students FROM 'students_file' WITH (FORMAT BINARY);", kStmtImport, ImportStatement,
470-
import_result, import_stmt);
469+
TEST_PARSE_SINGLE_SQL("COPY students FROM 'students_file' WITH (FORMAT CSV, DELIMITER '|', NULL '', QUOTE '\"');",
470+
kStmtImport, ImportStatement, import_result, import_stmt);
471471

472-
ASSERT_EQ(import_stmt->type, kImportBinary);
472+
ASSERT_EQ(import_stmt->type, kImportCSV);
473473
ASSERT_NOTNULL(import_stmt->tableName);
474474
ASSERT_STREQ(import_stmt->tableName, "students");
475475
ASSERT_NOTNULL(import_stmt->filePath);
476476
ASSERT_STREQ(import_stmt->filePath, "students_file");
477477
ASSERT_NULL(import_stmt->whereClause);
478478
ASSERT_NULL(import_stmt->encoding);
479+
ASSERT_NOTNULL(import_stmt->csv_options);
480+
ASSERT_NOTNULL(import_stmt->csv_options->delimiter);
481+
ASSERT_STREQ(import_stmt->csv_options->delimiter, "|");
482+
ASSERT_NOTNULL(import_stmt->csv_options->null);
483+
ASSERT_STREQ(import_stmt->csv_options->null, "");
484+
ASSERT_NOTNULL(import_stmt->csv_options->quote);
485+
ASSERT_STREQ(import_stmt->csv_options->quote, "\"");
479486

480487
TEST_PARSE_SINGLE_SQL("COPY students FROM 'students_file' WHERE lastname = 'Potter';", kStmtImport, ImportStatement,
481488
import_filter_result, import_filter_stmt);
@@ -492,17 +499,19 @@ TEST(CopyStatementTest) {
492499
ASSERT_EQ(import_filter_stmt->whereClause->expr2->type, kExprLiteralString);
493500
ASSERT_STREQ(import_filter_stmt->whereClause->expr2->name, "Potter");
494501
ASSERT_NULL(import_filter_stmt->encoding);
502+
ASSERT_NULL(import_filter_stmt->csv_options);
495503

496-
TEST_PARSE_SINGLE_SQL("COPY students TO 'students_file' WITH (ENCODING 'FSST', FORMAT CSV);", kStmtExport,
504+
TEST_PARSE_SINGLE_SQL("COPY students TO 'students_file' WITH (ENCODING 'FSST', FORMAT BINARY);", kStmtExport,
497505
ExportStatement, export_table_result, export_table_stmt);
498506

499-
ASSERT_EQ(export_table_stmt->type, kImportCSV);
507+
ASSERT_EQ(export_table_stmt->type, kImportBinary);
500508
ASSERT_NOTNULL(export_table_stmt->tableName);
501509
ASSERT_STREQ(export_table_stmt->tableName, "students");
502510
ASSERT_NOTNULL(export_table_stmt->filePath);
503511
ASSERT_STREQ(export_table_stmt->filePath, "students_file");
504512
ASSERT_NULL(export_table_stmt->select);
505513
ASSERT_STREQ(export_table_stmt->encoding, "FSST");
514+
ASSERT_NULL(export_table_stmt->csv_options);
506515

507516
TEST_PARSE_SINGLE_SQL(
508517
"COPY (SELECT firstname, lastname FROM students) TO 'students_file' WITH (ENCODING 'Dictionary');", kStmtExport,
@@ -513,6 +522,7 @@ TEST(CopyStatementTest) {
513522
ASSERT_NOTNULL(export_select_stmt->filePath);
514523
ASSERT_STREQ(export_select_stmt->filePath, "students_file");
515524
ASSERT_STREQ(export_select_stmt->encoding, "Dictionary");
525+
ASSERT_NULL(export_select_stmt->csv_options);
516526

517527
ASSERT_NOTNULL(export_select_stmt->select);
518528
const auto& select_stmt = export_select_stmt->select;

0 commit comments

Comments
 (0)