@@ -344,6 +344,44 @@ pub struct Parser<'a> {
     recursion_counter: RecursionCounter,
 }
 
+/// Helper function to convert a Location (line, column) to a byte offset in the source string.
+///
+/// Line and column numbers are 1-indexed as per the Location type.
+fn span_to_byte_offset(sql: &str, location: Location) -> usize {
+    if location.line == 0 || location.column == 0 {
+        // Empty location
+        return 0;
+    }
+
+    let mut byte_offset = 0;
+    let mut current_line = 1u64;
+
+    for ch in sql.chars() {
+        if current_line == location.line {
+            // We're on the target line, now count columns
+            let mut current_col = 1u64;
+            let remaining = &sql[byte_offset..];
+            for ch in remaining.chars() {
+                if current_col >= location.column {
+                    return byte_offset;
+                }
+                if ch == '\n' {
+                    // Don't go past the end of the line
+                    return byte_offset;
+                }
+                byte_offset += ch.len_utf8();
+                current_col += 1;
+            }
+            return byte_offset;
+        }
+        if ch == '\n' {
+            current_line += 1;
+        }
+        byte_offset += ch.len_utf8();
+    }
+    byte_offset
+}
+
 impl<'a> Parser<'a> {
     /// Create a parser for a [`Dialect`]
     ///
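For intuition, here is a minimal sketch of what the helper computes, assuming `Location`'s `line` and `column` fields are public 1-indexed `u64`s as in `sqlparser::tokenizer`. Since `span_to_byte_offset` is private to the parser module, this is illustrative rather than a public API:

```rust
use sqlparser::tokenizer::Location;

let sql = "SELECT 1;\nSELECT 2;";

// "SELECT 1;\n" occupies bytes 0..10, so line 2, column 1 starts at byte 10.
assert_eq!(span_to_byte_offset(sql, Location { line: 2, column: 1 }), 10);

// Multi-byte characters are counted via len_utf8(): each 'é' is 2 bytes,
// so column 3 on the first line of "éé;" begins at byte 4.
assert_eq!(span_to_byte_offset("éé;", Location { line: 1, column: 3 }), 4);
```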
@@ -510,6 +548,101 @@ impl<'a> Parser<'a> {
         Ok(stmts)
     }
 
+    /// Parse multiple statements and return them with their byte offsets in the original source.
+    ///
+    /// Similar to [`Self::parse_statements`], but also returns a
+    /// [`crate::tokenizer::SourceOffset`] for each statement indicating its
+    /// position in the original SQL string.
+    pub fn parse_statements_with_offsets(
+        &mut self,
+        sql: &str,
+    ) -> Result<Vec<(Statement, crate::tokenizer::SourceOffset)>, ParserError> {
+        let mut stmts = Vec::new();
+        let mut expecting_statement_delimiter = false;
+        loop {
+            // ignore empty statements (between successive statement delimiters)
+            while self.consume_token(&Token::SemiColon) {
+                expecting_statement_delimiter = false;
+            }
+
+            if !self.options.require_semicolon_stmt_delimiter {
+                expecting_statement_delimiter = false;
+            }
+
+            match self.peek_token().token {
+                Token::EOF => break,
+
+                // end of statement
+                Token::Word(word) => {
+                    if expecting_statement_delimiter && word.keyword == Keyword::END {
+                        break;
+                    }
+                }
+                _ => {}
+            }
+
+            if expecting_statement_delimiter {
+                return self.expected("end of statement", self.peek_token());
+            }
+
+            // Find the first non-whitespace token to get the actual start position
+            let mut start_index = self.index;
+            while start_index < self.tokens.len() {
+                if matches!(self.tokens[start_index].token, Token::Whitespace(_)) {
+                    start_index += 1;
+                } else {
+                    break;
+                }
+            }
+
+            let statement = self.parse_statement()?;
+
+            // Find the last non-whitespace token that was consumed.
+            // We need to look backwards from the current position to skip any whitespace.
+            let mut end_index = self.index.saturating_sub(1);
+            while end_index > start_index {
+                if matches!(
+                    self.tokens.get(end_index).map(|t| &t.token),
+                    Some(Token::Whitespace(_))
+                ) {
+                    end_index = end_index.saturating_sub(1);
+                } else {
+                    break;
+                }
+            }
+
+            // Check if the next non-whitespace token is a semicolon and include it in the range
+            let mut check_index = self.index;
+            while check_index < self.tokens.len() {
+                match &self.tokens[check_index].token {
+                    Token::Whitespace(_) => check_index += 1,
+                    Token::SemiColon => {
+                        end_index = check_index;
+                        break;
+                    }
+                    _ => break,
+                }
+            }
+
+            // Calculate byte offsets from the token spans
+            let start_offset = if start_index < self.tokens.len() {
+                span_to_byte_offset(sql, self.tokens[start_index].span.start)
+            } else {
+                sql.len()
+            };
+
+            let end_offset = if end_index < self.tokens.len() {
+                span_to_byte_offset(sql, self.tokens[end_index].span.end)
+            } else {
+                sql.len()
+            };
+
+            let source_offset = crate::tokenizer::SourceOffset::new(start_offset, end_offset);
+            stmts.push((statement, source_offset));
+            expecting_statement_delimiter = true;
+        }
+        Ok(stmts)
+    }
+
     /// Convenience method to parse a string with one or more SQL
     /// statements into an Abstract Syntax Tree (AST).
     ///
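A sketch of driving the new method directly, mirroring the doctest for `parse_sql_with_offsets` in the next hunk; it assumes only APIs visible in this diff (in particular the `SourceOffset::start()`/`end()` accessors the doctest uses):

```rust
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;

fn main() -> Result<(), sqlparser::parser::ParserError> {
    let dialect = GenericDialect {};
    let sql = "select * from FOO;  update FOO set x = 1;";

    // Each entry pairs a parsed Statement with the byte range of its original
    // text, so pre-normalization casing and spacing can be recovered.
    let results = Parser::new(&dialect)
        .try_with_sql(sql)?
        .parse_statements_with_offsets(sql)?;

    for (stmt, offset) in &results {
        let original = &sql[offset.start()..offset.end()];
        println!("{stmt}  <-  {original}");
    }
    Ok(())
}
```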
@@ -529,6 +662,37 @@ impl<'a> Parser<'a> {
         Parser::new(dialect).try_with_sql(sql)?.parse_statements()
     }
 
+    /// Convenience method to parse a string with one or more SQL statements and return
+    /// both the Abstract Syntax Tree (AST) and byte offsets into the original source string.
+    ///
+    /// This is useful when you need to preserve the original source text for each statement,
+    /// for example to maintain case-sensitive identifiers or type names that get normalized
+    /// in the AST.
+    ///
+    /// # Example
+    /// ```
+    /// # use sqlparser::{parser::Parser, dialect::GenericDialect};
+    /// # fn main() -> Result<(), sqlparser::parser::ParserError> {
+    /// let dialect = GenericDialect {};
+    /// let sql = "SELECT * FROM foo; INSERT INTO bar VALUES (1);";
+    /// let results = Parser::parse_sql_with_offsets(&dialect, sql)?;
+    ///
+    /// assert_eq!(results.len(), 2);
+    /// let (stmt, offset) = &results[0];
+    /// let original_text = &sql[offset.start()..offset.end()];
+    /// assert_eq!(original_text, "SELECT * FROM foo;");
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn parse_sql_with_offsets(
+        dialect: &dyn Dialect,
+        sql: &str,
+    ) -> Result<Vec<(Statement, crate::tokenizer::SourceOffset)>, ParserError> {
+        Parser::new(dialect)
+            .try_with_sql(sql)?
+            .parse_statements_with_offsets(sql)
+    }
+
     /// Parse a single top-level statement (such as SELECT, INSERT, CREATE, etc.),
     /// stopping before the statement separator, if any.
     pub fn parse_statement(&mut self) -> Result<Statement, ParserError> {
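One behavior worth noting, which follows from the index bookkeeping in `parse_statements_with_offsets`: leading whitespace between statements is excluded from each range, while a trailing semicolon is included. A small sketch under that reading:

```rust
use sqlparser::{dialect::GenericDialect, parser::Parser};

let sql = "SELECT * FROM foo;   INSERT INTO bar VALUES (1);";
let results = Parser::parse_sql_with_offsets(&GenericDialect {}, sql).unwrap();

// The second range starts at the INSERT keyword (whitespace skipped)
// and extends through its terminating semicolon.
let (_stmt, offset) = &results[1];
assert_eq!(&sql[offset.start()..offset.end()], "INSERT INTO bar VALUES (1);");
```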