Skip to content

Commit a075c25

Browse files
committed
Add parse_sql_with_offsets to preserve original source text
Introduces a new API method `Parser::parse_sql_with_offsets()` that returns parsed statements along with byte offsets into the original source string. This allows users to recover the exact original text for each statement, which is useful for preserving case-sensitive identifiers and type names that may be normalized in the AST.

- Add `SourceOffset` type to track byte positions in source text
- Add `Parser::parse_sql_with_offsets()` public API method
- Add `Parser::parse_statements_with_offsets()` internal method
- Add helper function to convert line/column to byte offsets
- Add comprehensive tests covering single/multiple statements, case-sensitive type names, and multiline SQL

This is particularly useful for dialects like ClickHouse where type names are case-sensitive (e.g., `Nullable(Float64)` vs `Nullable(FLOAT64)`).
1 parent 308a723 commit a075c25

File tree

3 files changed

+236
-1
lines changed

3 files changed

+236
-1
lines changed

src/parser/mod.rs

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,44 @@ pub struct Parser<'a> {
344344
recursion_counter: RecursionCounter,
345345
}
346346

347+
/// Helper function to convert a Location (line, column) to a byte offset in the source string.
348+
///
349+
/// Line and column numbers are 1-indexed as per the Location type.
350+
fn span_to_byte_offset(sql: &str, location: Location) -> usize {
351+
if location.line == 0 || location.column == 0 {
352+
// Empty location
353+
return 0;
354+
}
355+
356+
let mut byte_offset = 0;
357+
let mut current_line = 1u64;
358+
359+
for ch in sql.chars() {
360+
if current_line == location.line {
361+
// We're on the target line, now count columns
362+
let mut current_col = 1u64;
363+
let remaining = &sql[byte_offset..];
364+
for ch in remaining.chars() {
365+
if current_col >= location.column {
366+
return byte_offset;
367+
}
368+
if ch == '\n' {
369+
// Don't go past the end of the line
370+
return byte_offset;
371+
}
372+
byte_offset += ch.len_utf8();
373+
current_col += 1;
374+
}
375+
return byte_offset;
376+
}
377+
if ch == '\n' {
378+
current_line += 1;
379+
}
380+
byte_offset += ch.len_utf8();
381+
}
382+
byte_offset
383+
}
384+
347385
impl<'a> Parser<'a> {
348386
/// Create a parser for a [`Dialect`]
349387
///
@@ -510,6 +548,88 @@ impl<'a> Parser<'a> {
510548
Ok(stmts)
511549
}
512550

551+
/// Parse multiple statements and return them with their byte offsets in the original source.
552+
///
553+
/// Similar to [`Self::parse_statements`], but also returns [`crate::tokenizer::SourceOffset`]
554+
/// for each statement indicating its position in the original SQL string.
555+
pub fn parse_statements_with_offsets(
556+
&mut self,
557+
sql: &str,
558+
) -> Result<Vec<(Statement, crate::tokenizer::SourceOffset)>, ParserError> {
559+
let mut stmts = Vec::new();
560+
let mut expecting_statement_delimiter = false;
561+
loop {
562+
// ignore empty statements (between successive statement delimiters)
563+
while self.consume_token(&Token::SemiColon) {
564+
expecting_statement_delimiter = false;
565+
}
566+
567+
if !self.options.require_semicolon_stmt_delimiter {
568+
expecting_statement_delimiter = false;
569+
}
570+
571+
match self.peek_token().token {
572+
Token::EOF => break,
573+
574+
// end of statement
575+
Token::Word(word) => {
576+
if expecting_statement_delimiter && word.keyword == Keyword::END {
577+
break;
578+
}
579+
}
580+
_ => {}
581+
}
582+
583+
if expecting_statement_delimiter {
584+
return self.expected("end of statement", self.peek_token());
585+
}
586+
587+
// Find the first non-whitespace token to get the actual start position
588+
let mut start_index = self.index;
589+
while start_index < self.tokens.len() {
590+
if matches!(self.tokens[start_index].token, Token::Whitespace(_)) {
591+
start_index += 1;
592+
} else {
593+
break;
594+
}
595+
}
596+
597+
let statement = self.parse_statement()?;
598+
599+
// Find the last non-whitespace token that was consumed
600+
// We need to look backwards from the current position to skip any whitespace
601+
let mut end_index = self.index.saturating_sub(1);
602+
while end_index > start_index {
603+
if matches!(
604+
self.tokens.get(end_index).map(|t| &t.token),
605+
Some(Token::Whitespace(_))
606+
) {
607+
end_index = end_index.saturating_sub(1);
608+
} else {
609+
break;
610+
}
611+
}
612+
613+
// Calculate byte offsets from the token spans
614+
let start_offset = if start_index < self.tokens.len() {
615+
span_to_byte_offset(sql, self.tokens[start_index].span.start)
616+
} else {
617+
sql.len()
618+
};
619+
620+
let end_offset = if end_index < self.tokens.len() {
621+
span_to_byte_offset(sql, self.tokens[end_index].span.end)
622+
} else {
623+
sql.len()
624+
};
625+
626+
let source_offset = crate::tokenizer::SourceOffset::new(start_offset, end_offset);
627+
stmts.push((statement, source_offset));
628+
expecting_statement_delimiter = true;
629+
}
630+
Ok(stmts)
631+
}
632+
513633
/// Convenience method to parse a string with one or more SQL
514634
/// statements to produce an Abstract Syntax Tree (AST).
515635
///
@@ -529,6 +649,37 @@ impl<'a> Parser<'a> {
529649
Parser::new(dialect).try_with_sql(sql)?.parse_statements()
530650
}
531651

652+
/// Convenience method to parse a string with one or more SQL statements and return
653+
/// both the Abstract Syntax Tree (AST) and byte offsets into the original source string.
654+
///
655+
/// This is useful when you need to preserve the original source text for each statement,
656+
/// for example to maintain case-sensitive identifiers or type names that get normalized
657+
/// in the AST.
658+
///
659+
/// # Example
660+
/// ```
661+
/// # use sqlparser::{parser::Parser, dialect::GenericDialect};
662+
/// # fn main() -> Result<(), sqlparser::parser::ParserError> {
663+
/// let dialect = GenericDialect{};
664+
/// let sql = "SELECT * FROM foo; INSERT INTO bar VALUES (1);";
665+
/// let results = Parser::parse_sql_with_offsets(&dialect, sql)?;
666+
///
667+
/// assert_eq!(results.len(), 2);
668+
/// let (stmt, offset) = &results[0];
669+
/// let original_text = &sql[offset.start()..offset.end()];
670+
/// assert_eq!(original_text, "SELECT * FROM foo");
671+
/// # Ok(())
672+
/// # }
673+
/// ```
674+
pub fn parse_sql_with_offsets(
675+
dialect: &dyn Dialect,
676+
sql: &str,
677+
) -> Result<Vec<(Statement, crate::tokenizer::SourceOffset)>, ParserError> {
678+
Parser::new(dialect)
679+
.try_with_sql(sql)?
680+
.parse_statements_with_offsets(sql)
681+
}
682+
532683
/// Parse a single top-level statement (such as SELECT, INSERT, CREATE, etc.),
533684
/// stopping before the statement separator, if any.
534685
pub fn parse_statement(&mut self) -> Result<Statement, ParserError> {

src/tokenizer.rs

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ use alloc::{
2929
vec,
3030
vec::Vec,
3131
};
32-
use core::iter::Peekable;
3332
use core::num::NonZeroU8;
3433
use core::str::Chars;
3534
use core::{cmp, fmt};
35+
use core::{iter::Peekable, ops::Range};
3636

3737
#[cfg(feature = "serde")]
3838
use serde::{Deserialize, Serialize};
@@ -651,6 +651,48 @@ impl Span {
651651
}
652652
}
653653

654+
/// Represents byte offsets into the original source string
655+
///
656+
/// Unlike [`Span`] which tracks line and column numbers, `SourceOffset` tracks
657+
/// byte positions which can be used to directly slice the original source string.
658+
///
659+
/// # Examples
660+
/// ```
661+
/// # use sqlparser::tokenizer::SourceOffset;
662+
/// let sql = "SELECT * FROM users; INSERT INTO foo VALUES (1);";
663+
/// let offset = SourceOffset::new(0, 20);
664+
/// assert_eq!(&sql[offset.start()..offset.end()], "SELECT * FROM users;");
665+
/// ```
666+
#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
667+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
668+
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
669+
pub struct SourceOffset {
670+
start: usize,
671+
end: usize,
672+
}
673+
674+
impl SourceOffset {
675+
/// Create a new `SourceOffset` from start and end byte offsets
676+
pub fn new(start: usize, end: usize) -> Self {
677+
Self { start, end }
678+
}
679+
680+
/// Returns the starting byte offset
681+
pub fn start(&self) -> usize {
682+
self.start
683+
}
684+
685+
/// Returns the ending byte offset
686+
pub fn end(&self) -> usize {
687+
self.end
688+
}
689+
690+
/// Returns a range representing the byte offsets
691+
pub fn range(&self) -> Range<usize> {
692+
self.start..self.end
693+
}
694+
}
695+
654696
/// Backwards compatibility struct for [`TokenWithSpan`]
655697
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
656698
pub type TokenWithLocation = TokenWithSpan;

tests/sqlparser_common.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17632,3 +17632,45 @@ fn parse_generic_unary_ops() {
1763217632
);
1763317633
}
1763417634
}
17635+
17636+
#[test]
17637+
fn parse_sql_with_offsets() {
17638+
let sql = "SELECT * FROM users";
17639+
let results = Parser::parse_sql_with_offsets(&GenericDialect {}, sql).unwrap();
17640+
17641+
assert_eq!(results.len(), 1);
17642+
let (stmt, offset) = &results[0];
17643+
assert!(matches!(stmt, Statement::Query(_)));
17644+
assert_eq!(&sql[offset.start()..offset.end()], "SELECT * FROM users");
17645+
assert_eq!(&sql[offset.range()], "SELECT * FROM users");
17646+
17647+
// Test with multiple statements
17648+
let sql = "SELECT * FROM foo; INSERT INTO bar VALUES (1);";
17649+
let results = Parser::parse_sql_with_offsets(&GenericDialect {}, sql).unwrap();
17650+
17651+
assert_eq!(results.len(), 2);
17652+
17653+
let (stmt1, offset1) = &results[0];
17654+
assert!(matches!(stmt1, Statement::Query(_)));
17655+
let original1 = &sql[offset1.range()];
17656+
assert_eq!(original1, "SELECT * FROM foo");
17657+
17658+
let (stmt2, offset2) = &results[1];
17659+
assert!(matches!(stmt2, Statement::Insert(_)));
17660+
let original2 = &sql[offset2.range()];
17661+
assert_eq!(original2, "INSERT INTO bar VALUES (1)");
17662+
17663+
// Test with multiline SQL
17664+
let sql = "SELECT a,\n b,\n c\nFROM table1;\nINSERT INTO table2 VALUES (1);";
17665+
let results = Parser::parse_sql_with_offsets(&GenericDialect {}, sql).unwrap();
17666+
17667+
assert_eq!(results.len(), 2);
17668+
let (_, offset1) = &results[0];
17669+
let original1 = &sql[offset1.range()];
17670+
assert!(original1.contains("SELECT"));
17671+
assert!(original1.contains("FROM table1"));
17672+
17673+
let (_, offset2) = &results[1];
17674+
let original2 = &sql[offset2.range()];
17675+
assert_eq!(original2, "INSERT INTO table2 VALUES (1)");
17676+
}

0 commit comments

Comments
 (0)