Skip to content

Commit 25b5e3e

Browse files
Refactored CSV parsing
1 parent 110271b commit 25b5e3e

File tree

7 files changed

+376
-230
lines changed

7 files changed

+376
-230
lines changed

src/ast/dml.rs

Lines changed: 291 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,10 @@ use sqlparser_derive::{Visit, VisitMut};
2727
use crate::display_utils::{indented_list, Indent, SpaceOrNewline};
2828

2929
use super::{
30-
display_comma_separated, query::InputFormatClause, Assignment, Expr, FromTable, Ident,
31-
InsertAliases, MysqlInsertPriority, ObjectName, OnInsert, OrderByExpr, Query, SelectItem,
32-
Setting, SqliteOnConflict, TableObject, TableWithJoins, UpdateTableFromKind,
30+
display_comma_separated, display_separated, query::InputFormatClause, Assignment,
31+
CopyLegacyCsvOption, CopyLegacyOption, CopyOption, CopySource, CopyTarget, Expr, FromTable,
32+
Ident, InsertAliases, MysqlInsertPriority, ObjectName, OnInsert, OrderByExpr, Query,
33+
SelectItem, Setting, SqliteOnConflict, TableObject, TableWithJoins, UpdateTableFromKind,
3334
};
3435

3536
/// INSERT statement.
@@ -303,3 +304,290 @@ impl Display for Update {
303304
Ok(())
304305
}
305306
}
307+
308+
/// CSV formatting options extracted from COPY options.
///
/// This struct encapsulates the CSV formatting settings used when parsing
/// or formatting COPY statement data. It extracts relevant options from both
/// modern [`CopyOption`] and legacy [`CopyLegacyOption`] variants.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CsvFormatOptions {
    /// The field delimiter character (default: tab)
    pub(crate) delimiter: char,
    /// The quote character used to enclose fields (default: `"`)
    pub(crate) quote: char,
    /// The escape character (default: `\`)
    pub(crate) escape: char,
    /// The string representing NULL values (default: `\N`, i.e. a backslash
    /// followed by `N`)
    pub(crate) null_symbol: String,
}
324+
325+
impl Default for CsvFormatOptions {
326+
fn default() -> Self {
327+
Self {
328+
delimiter: '\t',
329+
quote: '"',
330+
escape: '\\',
331+
null_symbol: "\\N".to_string(),
332+
}
333+
}
334+
}
335+
336+
impl CsvFormatOptions {
337+
/// Extract CSV format options from CopyOption and CopyLegacyOption lists.
338+
///
339+
/// This method processes both modern and legacy COPY options to determine
340+
/// the CSV formatting settings. Later options in the lists override earlier ones.
341+
///
342+
/// # Arguments
343+
///
344+
/// * `options` - Modern COPY options (PostgreSQL 9.0+)
345+
/// * `legacy_options` - Legacy COPY options (pre-PostgreSQL 9.0)
346+
///
347+
/// # Returns
348+
///
349+
/// A `CsvFormatOptions` instance with the extracted settings, using defaults
350+
/// for any options not specified.
351+
pub(crate) fn from_copy_options(
352+
options: &[CopyOption],
353+
legacy_options: &[CopyLegacyOption],
354+
) -> Self {
355+
let mut csv_options = Self::default();
356+
357+
// Apply options
358+
for option in options {
359+
match option {
360+
CopyOption::Delimiter(c) => {
361+
csv_options.delimiter = *c;
362+
}
363+
CopyOption::Quote(c) => {
364+
csv_options.quote = *c;
365+
}
366+
CopyOption::Escape(c) => {
367+
csv_options.escape = *c;
368+
}
369+
CopyOption::Null(null) => {
370+
csv_options.null_symbol = null.clone();
371+
}
372+
// These options don't affect CSV formatting
373+
CopyOption::Format(_)
374+
| CopyOption::Freeze(_)
375+
| CopyOption::Header(_)
376+
| CopyOption::ForceQuote(_)
377+
| CopyOption::ForceNotNull(_)
378+
| CopyOption::ForceNull(_)
379+
| CopyOption::Encoding(_) => {}
380+
}
381+
}
382+
383+
// Apply legacy options
384+
for option in legacy_options {
385+
match option {
386+
CopyLegacyOption::Delimiter(c) => {
387+
csv_options.delimiter = *c;
388+
}
389+
CopyLegacyOption::Null(null) => {
390+
csv_options.null_symbol = null.clone();
391+
}
392+
CopyLegacyOption::Csv(csv_opts) => {
393+
for csv_option in csv_opts {
394+
match csv_option {
395+
CopyLegacyCsvOption::Quote(c) => {
396+
csv_options.quote = *c;
397+
}
398+
CopyLegacyCsvOption::Escape(c) => {
399+
csv_options.escape = *c;
400+
}
401+
// These CSV options don't affect CSV formatting
402+
CopyLegacyCsvOption::Header
403+
| CopyLegacyCsvOption::ForceQuote(_)
404+
| CopyLegacyCsvOption::ForceNotNull(_) => {}
405+
}
406+
}
407+
}
408+
// These legacy options don't affect CSV formatting
409+
CopyLegacyOption::AcceptAnyDate
410+
| CopyLegacyOption::AcceptInvChars(_)
411+
| CopyLegacyOption::AddQuotes
412+
| CopyLegacyOption::AllowOverwrite
413+
| CopyLegacyOption::Binary
414+
| CopyLegacyOption::BlankAsNull
415+
| CopyLegacyOption::Bzip2
416+
| CopyLegacyOption::CleanPath
417+
| CopyLegacyOption::CompUpdate { .. }
418+
| CopyLegacyOption::DateFormat(_)
419+
| CopyLegacyOption::EmptyAsNull
420+
| CopyLegacyOption::Encrypted { .. }
421+
| CopyLegacyOption::Escape
422+
| CopyLegacyOption::Extension(_)
423+
| CopyLegacyOption::FixedWidth(_)
424+
| CopyLegacyOption::Gzip
425+
| CopyLegacyOption::Header
426+
| CopyLegacyOption::IamRole(_)
427+
| CopyLegacyOption::IgnoreHeader(_)
428+
| CopyLegacyOption::Json
429+
| CopyLegacyOption::Manifest { .. }
430+
| CopyLegacyOption::MaxFileSize(_)
431+
| CopyLegacyOption::Parallel(_)
432+
| CopyLegacyOption::Parquet
433+
| CopyLegacyOption::PartitionBy(_)
434+
| CopyLegacyOption::Region(_)
435+
| CopyLegacyOption::RemoveQuotes
436+
| CopyLegacyOption::RowGroupSize(_)
437+
| CopyLegacyOption::StatUpdate(_)
438+
| CopyLegacyOption::TimeFormat(_)
439+
| CopyLegacyOption::TruncateColumns
440+
| CopyLegacyOption::Zstd => {}
441+
}
442+
}
443+
444+
csv_options
445+
}
446+
447+
/// Format a single CSV field, adding quotes and escaping if necessary.
448+
///
449+
/// This method handles CSV field formatting according to the configured options:
450+
/// - Writes NULL values using the configured `null_symbol`
451+
/// - Adds quotes around fields containing delimiters, quotes, or newlines
452+
/// - Escapes quote characters by doubling them
453+
/// - Escapes escape characters
454+
///
455+
/// # Arguments
456+
///
457+
/// * `f` - The formatter to write to
458+
/// * `field` - The field value to format, or `None` for NULL
459+
///
460+
/// # Returns
461+
///
462+
/// A `fmt::Result` indicating success or failure of the write operation.
463+
fn format_csv_field(&self, f: &mut fmt::Formatter, field: Option<&str>) -> fmt::Result {
464+
let field_value = field.unwrap_or(&self.null_symbol);
465+
466+
// Check if field needs quoting
467+
let needs_quoting = field_value.contains(self.delimiter)
468+
|| field_value.contains(self.quote)
469+
|| field_value.contains('\n')
470+
|| field_value.contains('\r');
471+
472+
if needs_quoting {
473+
write!(f, "{}", self.quote)?;
474+
for ch in field_value.chars() {
475+
if ch == self.quote {
476+
// Escape quote by doubling it
477+
write!(f, "{}{}", self.quote, self.quote)?;
478+
} else if ch == self.escape {
479+
// Escape escape character
480+
write!(f, "{}{}", self.escape, self.escape)?;
481+
} else {
482+
write!(f, "{}", ch)?;
483+
}
484+
}
485+
write!(f, "{}", self.quote)?;
486+
} else {
487+
write!(f, "{}", field_value)?;
488+
}
489+
Ok(())
490+
}
491+
}
492+
493+
/// COPY statement.
494+
///
495+
/// Represents a PostgreSQL COPY statement for bulk data transfer between
496+
/// a file and a table. The statement can copy data FROM a file to a table
497+
/// or TO a file from a table or query.
498+
///
499+
/// # Syntax
500+
///
501+
/// ```sql
502+
/// COPY table_name [(column_list)] FROM { 'filename' | STDIN | PROGRAM 'command' }
503+
/// COPY { table_name [(column_list)] | (query) } TO { 'filename' | STDOUT | PROGRAM 'command' }
504+
/// ```
505+
///
506+
/// # Examples
507+
///
508+
/// ```
509+
/// # use sqlparser::ast::{Copy, CopySource, CopyTarget, ObjectName};
510+
/// # use sqlparser::dialect::PostgreSqlDialect;
511+
/// # use sqlparser::parser::Parser;
512+
/// let sql = "COPY users FROM 'data.csv'";
513+
/// let dialect = PostgreSqlDialect {};
514+
/// let ast = Parser::parse_sql(&dialect, sql).unwrap();
515+
/// ```
516+
///
517+
/// See [PostgreSQL documentation](https://www.postgresql.org/docs/current/sql-copy.html)
518+
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
519+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
520+
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
521+
pub struct Copy {
522+
/// The source of 'COPY TO', or the target of 'COPY FROM'.
523+
/// Can be a table name with optional column list, or a query (for COPY TO only).
524+
pub source: CopySource,
525+
/// Direction of the copy operation.
526+
/// - `true` for COPY TO (table/query to file)
527+
/// - `false` for COPY FROM (file to table)
528+
pub to: bool,
529+
/// The target of 'COPY TO', or the source of 'COPY FROM'.
530+
/// Can be a file, STDIN, STDOUT, or a PROGRAM command.
531+
pub target: CopyTarget,
532+
/// Modern COPY options (PostgreSQL 9.0+), specified within parentheses.
533+
/// Examples: FORMAT, DELIMITER, NULL, HEADER, QUOTE, ESCAPE, etc.
534+
pub options: Vec<CopyOption>,
535+
/// Legacy COPY options (pre-PostgreSQL 9.0), specified without parentheses.
536+
/// Also used by AWS Redshift extensions like IAM_ROLE, MANIFEST, etc.
537+
pub legacy_options: Vec<CopyLegacyOption>,
538+
/// CSV data rows for COPY FROM STDIN statements.
539+
/// Each row is a vector of optional strings (None represents NULL).
540+
/// Populated only when copying from STDIN with inline data.
541+
pub values: Vec<Vec<Option<String>>>,
542+
}
543+
544+
impl Display for Copy {
545+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
546+
write!(f, "COPY")?;
547+
match &self.source {
548+
CopySource::Query(query) => write!(f, " ({query})")?,
549+
CopySource::Table {
550+
table_name,
551+
columns,
552+
} => {
553+
write!(f, " {table_name}")?;
554+
if !columns.is_empty() {
555+
write!(f, " ({})", display_comma_separated(columns))?;
556+
}
557+
}
558+
}
559+
write!(
560+
f,
561+
" {} {}",
562+
if self.to { "TO" } else { "FROM" },
563+
self.target
564+
)?;
565+
if !self.options.is_empty() {
566+
write!(f, " ({})", display_comma_separated(&self.options))?;
567+
}
568+
if !self.legacy_options.is_empty() {
569+
write!(f, " {}", display_separated(&self.legacy_options, " "))?;
570+
}
571+
572+
if !self.values.is_empty() {
573+
writeln!(f, ";")?;
574+
575+
let csv_options =
576+
CsvFormatOptions::from_copy_options(&self.options, &self.legacy_options);
577+
578+
// Write CSV data
579+
for row in &self.values {
580+
for (idx, column) in row.iter().enumerate() {
581+
if idx > 0 {
582+
write!(f, "{}", csv_options.delimiter)?;
583+
}
584+
csv_options.format_csv_field(f, column.as_deref())?;
585+
}
586+
writeln!(f)?;
587+
}
588+
589+
write!(f, "\\.")?;
590+
}
591+
Ok(())
592+
}
593+
}

0 commit comments

Comments
 (0)