164 changes: 164 additions & 0 deletions src/parser/mod.rs
@@ -344,6 +344,44 @@ pub struct Parser<'a> {
    recursion_counter: RecursionCounter,
}

/// Helper function to convert a Location (line, column) to a byte offset in the source string.
///
/// Line and column numbers are 1-indexed as per the Location type.
fn span_to_byte_offset(sql: &str, location: Location) -> usize {
    if location.line == 0 || location.column == 0 {
        // Empty location
        return 0;
    }

    let mut byte_offset = 0;
    let mut current_line = 1u64;

    for ch in sql.chars() {
        if current_line == location.line {
            // We're on the target line, now count columns
            let mut current_col = 1u64;
            let remaining = &sql[byte_offset..];
            for ch in remaining.chars() {
                if current_col >= location.column {
                    return byte_offset;
                }
                if ch == '\n' {
                    // Don't go past the end of the line
                    return byte_offset;
                }
                byte_offset += ch.len_utf8();
                current_col += 1;
            }
            return byte_offset;
        }
        if ch == '\n' {
            current_line += 1;
        }
        byte_offset += ch.len_utf8();
    }
    byte_offset
}

impl<'a> Parser<'a> {
    /// Create a parser for a [`Dialect`]
    ///
@@ -510,6 +548,101 @@ impl<'a> Parser<'a> {
        Ok(stmts)
    }

    /// Parse multiple statements and return them with their byte offsets in the original source.
    ///
    /// Similar to [`Self::parse_statements`], but also returns [`crate::tokenizer::SourceOffset`]
    /// for each statement indicating its position in the original SQL string.
    pub fn parse_statements_with_offsets(
        &mut self,
        sql: &str,
    ) -> Result<Vec<(Statement, crate::tokenizer::SourceOffset)>, ParserError> {
        let mut stmts = Vec::new();
        let mut expecting_statement_delimiter = false;
        loop {
            // ignore empty statements (between successive statement delimiters)
            while self.consume_token(&Token::SemiColon) {
                expecting_statement_delimiter = false;
            }

            if !self.options.require_semicolon_stmt_delimiter {
                expecting_statement_delimiter = false;
            }

            match self.peek_token().token {
                Token::EOF => break,

                // end of statement
                Token::Word(word) => {
                    if expecting_statement_delimiter && word.keyword == Keyword::END {
                        break;
                    }
                }
                _ => {}
            }

            if expecting_statement_delimiter {
                return self.expected("end of statement", self.peek_token());
            }

            // Find the first non-whitespace token to get the actual start position
            let mut start_index = self.index;
            while start_index < self.tokens.len() {
                if matches!(self.tokens[start_index].token, Token::Whitespace(_)) {
                    start_index += 1;
                } else {
                    break;
                }
            }

            let statement = self.parse_statement()?;

            // Find the last non-whitespace token that was consumed
            // We need to look backwards from the current position to skip any whitespace
            let mut end_index = self.index.saturating_sub(1);
            while end_index > start_index {
                if matches!(
                    self.tokens.get(end_index).map(|t| &t.token),
                    Some(Token::Whitespace(_))
                ) {
                    end_index = end_index.saturating_sub(1);
                } else {
                    break;
                }
            }

            // Check if the next non-whitespace token is a semicolon and include it in the range
            let mut check_index = self.index;
            while check_index < self.tokens.len() {
                match &self.tokens[check_index].token {
                    Token::Whitespace(_) => check_index += 1,
                    Token::SemiColon => {
                        end_index = check_index;
                        break;
                    }
                    _ => break,
                }
            }

            // Calculate byte offsets from the token spans
            let start_offset = if start_index < self.tokens.len() {
                span_to_byte_offset(sql, self.tokens[start_index].span.start)
            } else {
                sql.len()
            };

            let end_offset = if end_index < self.tokens.len() {
                span_to_byte_offset(sql, self.tokens[end_index].span.end)
            } else {
                sql.len()
            };

            let source_offset = crate::tokenizer::SourceOffset::new(start_offset, end_offset);
            stmts.push((statement, source_offset));
            expecting_statement_delimiter = true;
        }
        Ok(stmts)
    }
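
A usage sketch for this non-convenience path, assuming you construct the `Parser` yourself. Note that the same `sql` string must be passed to both `try_with_sql` and this method, since the latter maps token spans back to byte offsets within it:

use sqlparser::dialect::GenericDialect;
use sqlparser::parser::{Parser, ParserError};

fn main() -> Result<(), ParserError> {
    let sql = "SELECT 1;\nSELECT 2";
    let results = Parser::new(&GenericDialect {})
        .try_with_sql(sql)?
        .parse_statements_with_offsets(sql)?;
    for (stmt, offset) in &results {
        // Slice the original text for each parsed statement back out of the input.
        println!("{} came from {:?}", stmt, &sql[offset.range()]);
    }
    Ok(())
}

Having to pass `sql` twice is the awkwardness that the `parse_sql_with_offsets` convenience wrapper below removes.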

    /// Convenience method to parse a string with one or more SQL
    /// statements to produce an Abstract Syntax Tree (AST).
    ///
@@ -529,6 +662,37 @@ impl<'a> Parser<'a> {
        Parser::new(dialect).try_with_sql(sql)?.parse_statements()
    }

    /// Convenience method to parse a string with one or more SQL statements and return
    /// both the Abstract Syntax Tree (AST) and byte offsets into the original source string.
    ///
    /// This is useful when you need to preserve the original source text for each statement,
    /// for example to maintain case-sensitive identifiers or type names that get normalized
    /// in the AST.
    ///
    /// # Example
    /// ```
    /// # use sqlparser::{parser::Parser, dialect::GenericDialect};
    /// # fn main() -> Result<(), sqlparser::parser::ParserError> {
    /// let dialect = GenericDialect {};
    /// let sql = "SELECT * FROM foo; INSERT INTO bar VALUES (1);";
    /// let results = Parser::parse_sql_with_offsets(&dialect, sql)?;
    ///
    /// assert_eq!(results.len(), 2);
    /// let (stmt, offset) = &results[0];
    /// let original_text = &sql[offset.start()..offset.end()];
    /// assert_eq!(original_text, "SELECT * FROM foo;");
    /// # Ok(())
    /// # }
    /// ```
    pub fn parse_sql_with_offsets(
        dialect: &dyn Dialect,
        sql: &str,
    ) -> Result<Vec<(Statement, crate::tokenizer::SourceOffset)>, ParserError> {
        Parser::new(dialect)
            .try_with_sql(sql)?
            .parse_statements_with_offsets(sql)
    }

    /// Parse a single top-level statement (such as SELECT, INSERT, CREATE, etc.),
    /// stopping before the statement separator, if any.
    pub fn parse_statement(&mut self) -> Result<Statement, ParserError> {
45 changes: 43 additions & 2 deletions src/tokenizer.rs
@@ -29,10 +29,9 @@ use alloc::{
    vec,
    vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};
use core::{iter::Peekable, num::NonZeroU8, ops::Range};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
@@ -651,6 +650,48 @@ impl Span {
    }
}

/// Represents byte offsets into the original source string
///
/// Unlike [`Span`] which tracks line and column numbers, `SourceOffset` tracks
/// byte positions which can be used to directly slice the original source string.
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::SourceOffset;
/// let sql = "SELECT * FROM users; INSERT INTO foo VALUES (1);";
/// let offset = SourceOffset::new(0, 20);
/// assert_eq!(&sql[offset.start()..offset.end()], "SELECT * FROM users;");
/// ```
#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct SourceOffset {
    start: usize,
    end: usize,
}

impl SourceOffset {
    /// Create a new `SourceOffset` from start and end byte offsets
    pub fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }

    /// Returns the starting byte offset
    pub fn start(&self) -> usize {
        self.start
    }

    /// Returns the ending byte offset
    pub fn end(&self) -> usize {
        self.end
    }

    /// Returns a range representing the byte offsets
    pub fn range(&self) -> Range<usize> {
        self.start..self.end
    }
}
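
A quick sketch of why byte offsets, rather than character counts, are the right unit here: multi-byte UTF-8 content makes the two diverge, and byte offsets are what `str` slicing expects. The string below is an arbitrary illustration:

use sqlparser::tokenizer::SourceOffset;

fn main() {
    let sql = "SELECT 'café';";
    // 15 bytes ('é' is two bytes in UTF-8) but only 14 chars.
    let offset = SourceOffset::new(0, 15);
    assert_eq!(&sql[offset.range()], sql);
    // `range()` also composes with non-panicking access:
    assert_eq!(sql.get(offset.range()), Some(sql));
}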

/// Backwards compatibility struct for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;
Binary file added test_offsets
Binary file not shown.
42 changes: 42 additions & 0 deletions tests/sqlparser_common.rs
@@ -17632,3 +17632,45 @@ fn parse_generic_unary_ops() {
        );
    }
}

#[test]
fn parse_sql_with_offsets() {
    let sql = "SELECT * FROM users";
    let results = Parser::parse_sql_with_offsets(&GenericDialect {}, sql).unwrap();

    assert_eq!(results.len(), 1);
    let (stmt, offset) = &results[0];
    assert!(matches!(stmt, Statement::Query(_)));
    assert_eq!(&sql[offset.range()], "SELECT * FROM users");
    assert_eq!(&sql[offset.start()..offset.end()], "SELECT * FROM users");

    // Test with multiple statements
    let sql = "SELECT * FROM foo; INSERT INTO bar VALUES (1);";
    let results = Parser::parse_sql_with_offsets(&GenericDialect {}, sql).unwrap();

    assert_eq!(results.len(), 2);

    let (stmt1, offset1) = &results[0];
    assert!(matches!(stmt1, Statement::Query(_)));
    let original1 = &sql[offset1.range()];
    assert_eq!(original1, "SELECT * FROM foo;");

    let (stmt2, offset2) = &results[1];
    assert!(matches!(stmt2, Statement::Insert(_)));
    let original2 = &sql[offset2.range()];
    assert_eq!(original2, "INSERT INTO bar VALUES (1);");

    // Test with multiline SQL
    let sql = "SELECT a,\n b,\n c\nFROM table1;\nINSERT INTO table2 VALUES (1);";
    let results = Parser::parse_sql_with_offsets(&GenericDialect {}, sql).unwrap();

    assert_eq!(results.len(), 2);
    let (_, offset1) = &results[0];
    let original1 = &sql[offset1.range()];
    assert!(original1.contains("SELECT"));
    assert!(original1.contains("FROM table1"));

    let (_, offset2) = &results[1];
    let original2 = &sql[offset2.range()];
    assert_eq!(original2, "INSERT INTO table2 VALUES (1);");
}
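
One case the new test does not exercise is multi-byte input. A hypothetical extension (not part of the diff) could check that the returned offsets stay on UTF-8 character boundaries, since `span_to_byte_offset` advances by `len_utf8`:

#[test]
fn parse_sql_with_offsets_multibyte() {
    // 'é' is two bytes in UTF-8, shifting every later byte offset by one.
    let sql = "SELECT 'é'; SELECT 2;";
    let results = Parser::parse_sql_with_offsets(&GenericDialect {}, sql).unwrap();

    assert_eq!(results.len(), 2);
    assert_eq!(&sql[results[0].1.range()], "SELECT 'é';");
    assert_eq!(&sql[results[1].1.range()], "SELECT 2;");
}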