@@ -344,6 +344,44 @@ pub struct Parser<'a> {
344344 recursion_counter: RecursionCounter,
345345}
346346
347+ /// Helper function to convert a Location (line, column) to a byte offset in the source string.
348+ ///
349+ /// Line and column numbers are 1-indexed as per the Location type.
350+ fn span_to_byte_offset(sql: &str, location: Location) -> usize {
351+ if location.line == 0 || location.column == 0 {
352+ // Empty location
353+ return 0;
354+ }
355+
356+ let mut byte_offset = 0;
357+ let mut current_line = 1u64;
358+
359+ for ch in sql.chars() {
360+ if current_line == location.line {
361+ // We're on the target line, now count columns
362+ let mut current_col = 1u64;
363+ let remaining = &sql[byte_offset..];
364+ for ch in remaining.chars() {
365+ if current_col >= location.column {
366+ return byte_offset;
367+ }
368+ if ch == '\n' {
369+ // Don't go past the end of the line
370+ return byte_offset;
371+ }
372+ byte_offset += ch.len_utf8();
373+ current_col += 1;
374+ }
375+ return byte_offset;
376+ }
377+ if ch == '\n' {
378+ current_line += 1;
379+ }
380+ byte_offset += ch.len_utf8();
381+ }
382+ byte_offset
383+ }
384+
347385impl<'a> Parser<'a> {
348386 /// Create a parser for a [`Dialect`]
349387 ///
@@ -510,6 +548,88 @@ impl<'a> Parser<'a> {
510548 Ok(stmts)
511549 }
512550
    /// Parse multiple statements and return them with their byte offsets in the original source.
    ///
    /// Similar to [`Self::parse_statements`], but also returns [`crate::tokenizer::SourceOffset`]
    /// for each statement indicating its position in the original SQL string.
    ///
    /// NOTE(review): `sql` is assumed to be the same string this parser was
    /// tokenized from (offsets are computed by mapping token line/column spans
    /// back into `sql`) — confirm at call sites.
    pub fn parse_statements_with_offsets(
        &mut self,
        sql: &str,
    ) -> Result<Vec<(Statement, crate::tokenizer::SourceOffset)>, ParserError> {
        let mut stmts = Vec::new();
        let mut expecting_statement_delimiter = false;
        loop {
            // ignore empty statements (between successive statement delimiters)
            while self.consume_token(&Token::SemiColon) {
                expecting_statement_delimiter = false;
            }

            // Some dialects allow statements to follow each other without an
            // explicit semicolon; in that case never demand a delimiter.
            if !self.options.require_semicolon_stmt_delimiter {
                expecting_statement_delimiter = false;
            }

            match self.peek_token().token {
                Token::EOF => break,

                // end of statement
                Token::Word(word) => {
                    // A trailing END (e.g. closing a block) also terminates
                    // the statement list when a delimiter was expected.
                    if expecting_statement_delimiter && word.keyword == Keyword::END {
                        break;
                    }
                }
                _ => {}
            }

            if expecting_statement_delimiter {
                return self.expected("end of statement", self.peek_token());
            }

            // Find the first non-whitespace token to get the actual start position
            // (self.tokens retains whitespace tokens, and self.index may sit on one).
            let mut start_index = self.index;
            while start_index < self.tokens.len() {
                if matches!(self.tokens[start_index].token, Token::Whitespace(_)) {
                    start_index += 1;
                } else {
                    break;
                }
            }

            let statement = self.parse_statement()?;

            // Find the last non-whitespace token that was consumed
            // We need to look backwards from the current position to skip any whitespace
            let mut end_index = self.index.saturating_sub(1);
            while end_index > start_index {
                if matches!(
                    self.tokens.get(end_index).map(|t| &t.token),
                    Some(Token::Whitespace(_))
                ) {
                    end_index = end_index.saturating_sub(1);
                } else {
                    break;
                }
            }

            // Calculate byte offsets from the token spans. If the index ran off
            // the end of the token list, fall back to the end of the input.
            let start_offset = if start_index < self.tokens.len() {
                span_to_byte_offset(sql, self.tokens[start_index].span.start)
            } else {
                sql.len()
            };

            // span.end points one column past the token's last character, so
            // the resulting offset is exclusive.
            let end_offset = if end_index < self.tokens.len() {
                span_to_byte_offset(sql, self.tokens[end_index].span.end)
            } else {
                sql.len()
            };

            let source_offset = crate::tokenizer::SourceOffset::new(start_offset, end_offset);
            stmts.push((statement, source_offset));
            expecting_statement_delimiter = true;
        }
        Ok(stmts)
    }
632+
513633 /// Convenience method to parse a string with one or more SQL
514634 /// statements to produce an Abstract Syntax Tree (AST).
515635 ///
@@ -529,6 +649,37 @@ impl<'a> Parser<'a> {
529649 Parser::new(dialect).try_with_sql(sql)?.parse_statements()
530650 }
531651
652+ /// Convenience method to parse a string with one or more SQL statements and return
653+ /// both the Abstract Syntax Tree (AST) and byte offsets into the original source string.
654+ ///
655+ /// This is useful when you need to preserve the original source text for each statement,
656+ /// for example to maintain case-sensitive identifiers or type names that get normalized
657+ /// in the AST.
658+ ///
659+ /// # Example
660+ /// ```
661+ /// # use sqlparser::{parser::Parser, dialect::GenericDialect};
662+ /// # fn main() -> Result<(), sqlparser::parser::ParserError> {
663+ /// let dialect = GenericDialect{};
664+ /// let sql = "SELECT * FROM foo; INSERT INTO bar VALUES (1);";
665+ /// let results = Parser::parse_sql_with_offsets(&dialect, sql)?;
666+ ///
667+ /// assert_eq!(results.len(), 2);
668+ /// let (stmt, offset) = &results[0];
669+ /// let original_text = &sql[offset.start()..offset.end()];
670+ /// assert_eq!(original_text, "SELECT * FROM foo");
671+ /// # Ok(())
672+ /// # }
673+ /// ```
674+ pub fn parse_sql_with_offsets(
675+ dialect: &dyn Dialect,
676+ sql: &str,
677+ ) -> Result<Vec<(Statement, crate::tokenizer::SourceOffset)>, ParserError> {
678+ Parser::new(dialect)
679+ .try_with_sql(sql)?
680+ .parse_statements_with_offsets(sql)
681+ }
682+
532683 /// Parse a single top-level statement (such as SELECT, INSERT, CREATE, etc.),
533684 /// stopping before the statement separator, if any.
534685 pub fn parse_statement(&mut self) -> Result<Statement, ParserError> {
0 commit comments