diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index e6dd1bc3d..c8409f870 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -35,6 +35,8 @@ unicode-perl = [] unicode-script = [] unicode-segment = [] +look-behinds = [] + [dependencies] arbitrary = { version = "1.3.0", features = ["derive"], optional = true } diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 7e2426dc7..c880910ab 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -144,6 +144,11 @@ pub enum ErrorKind { /// /// The span of this error corresponds to the unclosed parenthesis. GroupUnclosed, + /// An unclosed look-around, e.g., `(? write!(f, "invalid capture group character"), GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), GroupUnclosed => write!(f, "unclosed group"), + #[cfg(feature = "look-behinds")] + LookAroundUnclosed => write!(f, "unclosed look-around"), GroupUnopened => write!(f, "unopened group"), NestLimitExceeded(limit) => write!( f, @@ -301,11 +320,18 @@ impl core::fmt::Display for ErrorKind { UnsupportedBackreference => { write!(f, "backreferences are not supported") } + #[cfg(not(feature = "look-behinds"))] UnsupportedLookAround => write!( f, "look-around, including look-ahead and look-behind, \ is not supported" ), + #[cfg(feature = "look-behinds")] + UnsupportedLookAhead => write!(f, "look-aheads are not supported"), + #[cfg(feature = "look-behinds")] + UnsupportedCaptureInLookBehind => { + write!(f, "capture groups are not supported in look-behinds") + } } } } @@ -477,6 +503,9 @@ pub enum Ast { Dot(Box), /// A single zero-width assertion. Assertion(Box), + /// A single look-around regular expression. + #[cfg(feature = "look-behinds")] + LookAround(Box), /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. ClassUnicode(Box), /// A single perl character class, e.g., `\d` or `\W`. @@ -521,6 +550,12 @@ impl Ast { Ast::Assertion(Box::new(e)) } + /// Create a "look-around" AST item. + #[cfg(feature = "look-behinds")] + pub fn lookaround(e: LookAround) -> Ast { + Ast::LookAround(Box::new(e)) + } + /// Create a "Unicode class" AST item. pub fn class_unicode(e: ClassUnicode) -> Ast { Ast::ClassUnicode(Box::new(e)) @@ -564,6 +599,8 @@ impl Ast { Ast::Literal(ref x) => &x.span, Ast::Dot(ref span) => span, Ast::Assertion(ref x) => &x.span, + #[cfg(feature = "look-behinds")] + Ast::LookAround(ref x) => &x.span, Ast::ClassUnicode(ref x) => &x.span, Ast::ClassPerl(ref x) => &x.span, Ast::ClassBracketed(ref x) => &x.span, @@ -598,6 +635,8 @@ impl Ast { | Ast::Group(_) | Ast::Alternation(_) | Ast::Concat(_) => true, + #[cfg(feature = "look-behinds")] + Ast::LookAround(_) => true, } } } @@ -1342,6 +1381,30 @@ pub enum AssertionKind { WordBoundaryEndHalf, } +/// A single zero-width look-around. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +#[cfg(feature = "look-behinds")] +pub struct LookAround { + /// The span of this look-around. + pub span: Span, + /// The look-around kind, e.g. negative/positive look-behind. + pub kind: LookAroundKind, + /// The regular expression inside the look-around. + pub ast: Box, +} + +/// A look-around kind. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +#[cfg(feature = "look-behinds")] +pub enum LookAroundKind { + /// `(?<=...)` + PositiveLookBehind, + /// `(? return, Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, Ast::Group(ref x) if !x.ast.has_subexprs() => return, + #[cfg(feature = "look-behinds")] + Ast::LookAround(ref x) if !x.ast.has_subexprs() => return, Ast::Alternation(ref x) if x.asts.is_empty() => return, Ast::Concat(ref x) if x.asts.is_empty() => return, _ => {} @@ -1673,6 +1738,10 @@ impl Drop for Ast { Ast::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } + #[cfg(feature = "look-behinds")] + Ast::LookAround(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } Ast::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index bdaab7228..f4cdd1c08 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -159,6 +159,8 @@ impl ParserBuilder { stack_class: RefCell::new(vec![]), capture_names: RefCell::new(vec![]), scratch: RefCell::new(String::new()), + #[cfg(feature = "look-behinds")] + lookaround_depth: Cell::new(0), } } @@ -280,6 +282,10 @@ pub struct Parser { /// A scratch buffer used in various places. Mostly this is used to /// accumulate relevant characters from parts of a pattern. scratch: RefCell, + /// Whether the parser is currently in a look-around. This is used to + /// detect capture groups within look-arounds, which are not supported. + #[cfg(feature = "look-behinds")] + lookaround_depth: Cell, } /// ParserI is the internal parser implementation. @@ -299,9 +305,9 @@ struct ParserI<'s, P> { pattern: &'s str, } -/// GroupState represents a single stack frame while parsing nested groups -/// and alternations. Each frame records the state up to an opening parenthesis -/// or a alternating bracket `|`. +/// GroupState represents a single stack frame while parsing nested groups, +/// look-arounds and alternations. Each frame records the state up to an opening +/// parenthesis or a alternating bracket `|`. #[derive(Clone, Debug)] enum GroupState { /// This state is pushed whenever an opening group is found. @@ -313,6 +319,14 @@ enum GroupState { /// Whether this group has the `x` flag enabled or not. ignore_whitespace: bool, }, + /// This state is pushed whenever an opening look-around is found. + #[cfg(feature = "look-behinds")] + LookAround { + /// The concatenation immediately preceding the opening look-around. + concat: ast::Concat, + /// The look-around that has been opened. Its sub-AST is always empty. + lookaround: ast::LookAround, + }, /// This state is pushed whenever a new alternation branch is found. If /// an alternation branch is found and this state is at the top of the /// stack, then this state should be modified to include the new @@ -385,6 +399,10 @@ impl Parser { self.comments.borrow_mut().clear(); self.stack_group.borrow_mut().clear(); self.stack_class.borrow_mut().clear(); + #[cfg(feature = "look-behinds")] + { + self.lookaround_depth.set(0); + } } } @@ -470,6 +488,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.get() } + /// Return whether the parser is currently in a look-around. + #[cfg(feature = "look-behinds")] + fn in_lookaround(&self) -> bool { + self.parser().lookaround_depth.get() != 0 + } + /// Return the character at the current position of the parser. /// /// This panics if the current position does not point to a valid char. @@ -521,13 +545,14 @@ impl<'s, P: Borrow> ParserI<'s, P> { } } - /// Returns true if and only if the parser is positioned at a look-around + /// Returns true if and only if the parser is positioned at a look-ahead /// prefix. The conditions under which this returns true must always /// correspond to a regular expression that would otherwise be consider /// invalid. /// /// This should only be called immediately after parsing the opening of /// a group or a set of flags. + #[cfg(not(feature = "look-behinds"))] fn is_lookaround_prefix(&self) -> bool { self.bump_if("?=") || self.bump_if("?!") @@ -535,6 +560,13 @@ impl<'s, P: Borrow> ParserI<'s, P> { || self.bump_if("? bool { + self.bump_if("?=") || self.bump_if("?!") + } + /// Bump the parser, and if the `x` flag is enabled, bump through any /// subsequent spaces. Return true if and only if the parser is not at /// EOF. @@ -686,9 +718,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { })); } - /// Parse and push a group AST (and its parent concatenation) on to the - /// parser's internal stack. Return a fresh concatenation corresponding - /// to the group's sub-AST. + /// Parse and push a group or look-around AST (and its parent + /// concatenation) on to the parser's internal stack. Return a fresh + /// concatenation corresponding to the grouping's sub-AST. /// /// If a set of flags was found (with no group), then the concatenation /// is returned with that set of flags added. @@ -697,12 +729,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// parenthesis. It advances the parser to the character at the start /// of the sub-expression (or adjoining expression). /// - /// If there was a problem parsing the start of the group, then an error - /// is returned. + /// If there was a problem parsing the start of the grouping, then an + /// error is returned. #[inline(never)] - fn push_group(&self, mut concat: ast::Concat) -> Result { + fn push_grouping(&self, mut concat: ast::Concat) -> Result { assert_eq!(self.char(), '('); - match self.parse_group()? { + match self.parse_grouping()? { Either::Left(set) => { let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); if let Some(v) = ignore { @@ -712,7 +744,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { concat.asts.push(Ast::flags(set)); Ok(concat) } - Either::Right(group) => { + Either::Right(Either::Left(group)) => { let old_ignore_whitespace = self.ignore_whitespace(); let new_ignore_whitespace = group .flags() @@ -728,61 +760,148 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(new_ignore_whitespace); Ok(ast::Concat { span: self.span(), asts: vec![] }) } + #[cfg(feature = "look-behinds")] + Either::Right(Either::Right(lookaround)) => { + self.parser() + .stack_group + .borrow_mut() + .push(GroupState::LookAround { concat, lookaround }); + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() + 1); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } + #[cfg(not(feature = "look-behinds"))] + Either::Right(Either::Right(_)) => { + unimplemented!() + } } } - /// Pop a group AST from the parser's internal stack and set the group's - /// AST to the given concatenation. Return the concatenation containing - /// the group. + /// Pop a group or look-around AST from the parser's internal stack and + /// set the grouping's AST to the given concatenation. Return the + /// concatenation containing the grouping. /// /// This assumes that the parser is currently positioned on the closing /// parenthesis and advances the parser to the character following the `)`. /// - /// If no such group could be popped, then an unopened group error is + /// If no such grouping could be popped, then an unopened group error is /// returned. + /// + /// If a look-behind contains a capture group, then an error is returned. #[inline(never)] - fn pop_group(&self, mut group_concat: ast::Concat) -> Result { + fn pop_grouping( + &self, + mut grouping_concat: ast::Concat, + ) -> Result { use self::GroupState::*; assert_eq!(self.char(), ')'); let mut stack = self.parser().stack_group.borrow_mut(); - let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack - .pop() - { - Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, None) - } - Some(Alternation(alt)) => match stack.pop() { + let (mut prior_concat, mut grouping, ignore_whitespace, alt) = + match stack.pop() { Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, Some(alt)) + (concat, Either::Left(group), ignore_whitespace, None) } - None | Some(Alternation(_)) => { + #[cfg(feature = "look-behinds")] + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.ignore_whitespace(), + None, + ), + Some(Alternation(alt)) => match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => ( + concat, + Either::Left(group), + ignore_whitespace, + Some(alt), + ), + #[cfg(feature = "look-behinds")] + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.ignore_whitespace(), + Some(alt), + ), + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }, + None => { return Err(self.error( self.span_char(), ast::ErrorKind::GroupUnopened, )); } - }, - None => { - return Err(self - .error(self.span_char(), ast::ErrorKind::GroupUnopened)); - } - }; + }; + + #[cfg(not(feature = "look-behinds"))] + let _: Either = grouping; + self.parser().ignore_whitespace.set(ignore_whitespace); - group_concat.span.end = self.pos(); + grouping_concat.span.end = self.pos(); self.bump(); - group.span.end = self.pos(); + match &mut grouping { + Either::Left(group) => group.span.end = self.pos(), + #[cfg(feature = "look-behinds")] + Either::Right(lookaround) => lookaround.span.end = self.pos(), + #[cfg(not(feature = "look-behinds"))] + Either::Right(_) => {} + } match alt { Some(mut alt) => { - alt.span.end = group_concat.span.end; - alt.asts.push(group_concat.into_ast()); - group.ast = Box::new(alt.into_ast()); - } - None => { - group.ast = Box::new(group_concat.into_ast()); + alt.span.end = grouping_concat.span.end; + alt.asts.push(grouping_concat.into_ast()); + match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(alt.into_ast()) + } + #[cfg(feature = "look-behinds")] + Either::Right(lookaround) => { + lookaround.ast = Box::new(alt.into_ast()) + } + #[cfg(not(feature = "look-behinds"))] + Either::Right(_) => {} + } } + None => match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(grouping_concat.into_ast()) + } + #[cfg(feature = "look-behinds")] + Either::Right(lookaround) => { + lookaround.ast = Box::new(grouping_concat.into_ast()) + } + #[cfg(not(feature = "look-behinds"))] + Either::Right(_) => {} + }, } - prior_concat.asts.push(Ast::group(group)); + prior_concat.asts.push(match grouping { + Either::Left(group) => { + #[cfg(feature = "look-behinds")] + if group.is_capturing() && self.in_lookaround() { + return Err(self.error( + group.span, + ast::ErrorKind::UnsupportedCaptureInLookBehind, + )); + } + + Ast::group(group) + } + #[cfg(feature = "look-behinds")] + Either::Right(lookaround) => { + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() - 1); + Ast::lookaround(lookaround) + } + #[cfg(not(feature = "look-behinds"))] + Either::Right(_) => unimplemented!(), + }); Ok(prior_concat) } @@ -793,7 +912,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// This assumes that the parser has advanced to the end. #[inline(never)] - fn pop_group_end(&self, mut concat: ast::Concat) -> Result { + fn pop_grouping_end(&self, mut concat: ast::Concat) -> Result { concat.span.end = self.pos(); let mut stack = self.parser().stack_group.borrow_mut(); let ast = match stack.pop() { @@ -808,6 +927,13 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(group.span, ast::ErrorKind::GroupUnclosed) ); } + #[cfg(feature = "look-behinds")] + Some(GroupState::LookAround { lookaround, .. }) => { + return Err(self.error( + lookaround.span, + ast::ErrorKind::LookAroundUnclosed, + )); + } }; // If we try to pop again, there should be nothing. match stack.pop() { @@ -824,6 +950,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Group { group, .. }) => { Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) } + #[cfg(feature = "look-behinds")] + Some(GroupState::LookAround { lookaround, .. }) => Err(self + .error(lookaround.span, ast::ErrorKind::LookAroundUnclosed)), } } @@ -989,8 +1118,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { break; } match self.char() { - '(' => concat = self.push_group(concat)?, - ')' => concat = self.pop_group(concat)?, + '(' => concat = self.push_grouping(concat)?, + ')' => concat = self.pop_grouping(concat)?, '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; @@ -1020,7 +1149,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { _ => concat.asts.push(self.parse_primitive()?.into_ast()), } } - let ast = self.pop_group_end(concat)?; + let ast = self.pop_grouping_end(concat)?; NestLimiter::new(self).check(&ast)?; Ok(ast::WithComments { ast, @@ -1224,7 +1353,10 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// If a capture name is given and it is incorrectly specified, then a /// corresponding error is returned. #[inline(never)] - fn parse_group(&self) -> Result> { + #[cfg(not(feature = "look-behinds"))] + fn parse_grouping( + &self, + ) -> Result>> { assert_eq!(self.char(), '('); let open_span = self.span_char(); self.bump(); @@ -1243,11 +1375,11 @@ impl<'s, P: Borrow> ParserI<'s, P> { } { let capture_index = self.next_capture_index(open_span)?; let name = self.parse_capture_name(capture_index)?; - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, ast: Box::new(Ast::empty(self.span())), - })) + }))) } else if self.bump_if("?") { if self.is_eof() { return Err( @@ -1272,19 +1404,123 @@ impl<'s, P: Borrow> ParserI<'s, P> { })) } else { assert_eq!(char_end, ':'); - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), ast: Box::new(Ast::empty(self.span())), + }))) + } + } else { + let capture_index = self.next_capture_index(open_span)?; + Ok(Either::Right(Either::Left(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureIndex(capture_index), + ast: Box::new(Ast::empty(self.span())), + }))) + } + } + + /// Parse a group or look-around (which contain a sub-expression), or a + /// set of flags. + /// + /// If a group or look-around was found, then it is returned with an + /// empty AST. If a set of flags is found, then that set is returned. + /// + /// The parser should be positioned at the opening parenthesis. + /// + /// This advances the parser to the character before the start of the + /// sub-expression (in the case of a group or look-around) or to the + /// closing parenthesis immediately following the set of flags. + /// + /// # Errors + /// + /// If flags are given and incorrectly specified, then a corresponding + /// error is returned. + /// + /// If a capture name is given and it is incorrectly specified, then a + /// corresponding error is returned. + /// + /// If a look-ahead is given (which is currently unsupported), then an + /// error is returned. + #[inline(never)] + #[cfg(feature = "look-behinds")] + fn parse_grouping( + &self, + ) -> Result>> + { + assert_eq!(self.char(), '('); + let open_span = self.span_char(); + self.bump(); + self.bump_space(); + if self.is_lookahead_prefix() { + return Err(self.error( + Span::new(open_span.start, self.span().end), + ast::ErrorKind::UnsupportedLookAhead, + )); + } + let inner_span = self.span(); + + let mut lookaround_kind = ast::LookAroundKind::PositiveLookBehind; + if self.bump_if("?<=") || { + lookaround_kind = ast::LookAroundKind::NegativeLookBehind; + self.bump_if("?> ast::Visitor for NestLimiter<'p, 's, P> { Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, + #[cfg(feature = "look-behinds")] + Ast::LookAround(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, Ast::Concat(ref x) => &x.span, }; @@ -2354,6 +2592,11 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { self.decrement_depth(); Ok(()) } + #[cfg(feature = "look-behinds")] + Ast::LookAround(_) => { + self.decrement_depth(); + Ok(()) + } } } @@ -3736,33 +3979,212 @@ bar } #[test] - fn parse_unsupported_lookaround() { + fn parse_unsupported_lookahead() { assert_eq!( parser(r"(?=a)").parse().unwrap_err(), TestError { span: span(0..3), + #[cfg(not(feature = "look-behinds"))] kind: ast::ErrorKind::UnsupportedLookAround, + #[cfg(feature = "look-behinds")] + kind: ast::ErrorKind::UnsupportedLookAhead, } ); assert_eq!( parser(r"(?!a)").parse().unwrap_err(), TestError { span: span(0..3), + #[cfg(not(feature = "look-behinds"))] kind: ast::ErrorKind::UnsupportedLookAround, + #[cfg(feature = "look-behinds")] + kind: ast::ErrorKind::UnsupportedLookAhead, } ); + } + + #[test] + #[cfg(feature = "look-behinds")] + fn parse_lookbehinds() { + assert_eq!( + parser(r"(?<=)").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..5), + ast: Box::new(Ast::empty(span(4..4))), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?<=(?<=))(a)").parse(), + Ok(concat( + 0..13, + vec![ + Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::lookaround(ast::LookAround { + span: span(4..9), + ast: Box::new(Ast::empty(span(8..8))), + kind: ast::LookAroundKind::PositiveLookBehind + })), + kind: ast::LookAroundKind::PositiveLookBehind + }), + group(10..13, 1, lit('a', 11)), + ] + )) + ); + assert_eq!( + parser(r"(?<=a)").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..6), + ast: Box::new(lit('a', 4)), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?<=(?:a))").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::group(ast::Group { + span: span(4..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span(6..6), + items: vec![], + }), + ast: Box::new(lit('a', 7)), + })), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?a))").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, + } + ); + assert_eq!( + parser(r"(?a)|b)").parse().unwrap_err(), + TestError { + span: span(6..16), + kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, } ); } diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 556d91f4a..7200b27d9 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -80,6 +80,8 @@ impl Visitor for Writer { fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { match *ast { Ast::Group(ref x) => self.fmt_group_pre(x), + #[cfg(feature = "look-behinds")] + Ast::LookAround(ref x) => self.fmt_lookaround_pre(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } @@ -92,6 +94,8 @@ impl Visitor for Writer { Ast::Literal(ref x) => self.fmt_literal(x), Ast::Dot(_) => self.wtr.write_str("."), Ast::Assertion(ref x) => self.fmt_assertion(x), + #[cfg(feature = "look-behinds")] + Ast::LookAround(ref x) => self.fmt_lookaround_post(x), Ast::ClassPerl(ref x) => self.fmt_class_perl(x), Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), @@ -174,6 +178,20 @@ impl Writer { self.wtr.write_str(")") } + #[cfg(feature = "look-behinds")] + fn fmt_lookaround_pre(&mut self, ast: &ast::LookAround) -> fmt::Result { + use crate::ast::LookAroundKind::*; + match ast.kind { + PositiveLookBehind => self.wtr.write_str("(?<="), + NegativeLookBehind => self.wtr.write_str("(? fmt::Result { + self.wtr.write_str(")") + } + fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { use crate::ast::RepetitionKind::*; match ast.op.kind { @@ -511,6 +529,13 @@ mod tests { roundtrip("(a)"); } + #[cfg(feature = "look-behinds")] + #[test] + fn print_lookaround() { + roundtrip("(?<=a)"); + roundtrip("(? { /// A stack frame allocated just before descending into a group's child /// node. Group(&'a ast::Group), + /// A stack frame allocated just before descending into a look-around's + /// child node. + #[cfg(feature = "look-behinds")] + LookAround(&'a ast::LookAround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -270,6 +274,8 @@ impl<'a> HeapVisitor<'a> { } Ast::Repetition(ref x) => Some(Frame::Repetition(x)), Ast::Group(ref x) => Some(Frame::Group(x)), + #[cfg(feature = "look-behinds")] + Ast::LookAround(ref x) => Some(Frame::LookAround(x)), Ast::Concat(ref x) if x.asts.is_empty() => None, Ast::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] }) @@ -289,6 +295,8 @@ impl<'a> HeapVisitor<'a> { match induct { Frame::Repetition(_) => None, Frame::Group(_) => None, + #[cfg(feature = "look-behinds")] + Frame::LookAround(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -444,6 +452,8 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.ast, Frame::Group(group) => &group.ast, + #[cfg(feature = "look-behinds")] + Frame::LookAround(look) => &look.ast, Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 2a6350e64..023f40253 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -173,6 +173,8 @@ impl Extractor { match *hir.kind() { Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + #[cfg(feature = "look-behinds")] + LookAround(_) => Seq::singleton(self::Literal::exact(vec![])), Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); @@ -2453,6 +2455,22 @@ mod tests { assert_eq!(expected, e(r"^aZ*b")); } + #[test] + #[cfg(feature = "look-behinds")] + fn lookaround() { + assert_eq!(exact([E("ab")]), e(r"a(?<=qwa)b")); + assert_eq!(exact([E("ab")]), e(r"a(? Hir { + let props = Properties::lookaround(&lookaround); + Hir { kind: HirKind::LookAround(lookaround), props } + } + /// Creates a repetition HIR expression. #[inline] pub fn repetition(mut rep: Repetition) -> Hir { @@ -728,6 +736,9 @@ pub enum HirKind { Class(Class), /// A look-around assertion. A look-around match always has zero length. Look(Look), + /// A look-around subexpression. + #[cfg(feature = "look-behinds")] + LookAround(LookAround), /// A repetition operation applied to a sub-expression. Repetition(Repetition), /// A capturing group, which contains a sub-expression. @@ -761,6 +772,8 @@ impl HirKind { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => &[], + #[cfg(feature = "look-behinds")] + HirKind::LookAround(ref lookaround) => from_ref(lookaround.sub()), HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), HirKind::Concat(ref subs) => subs, @@ -1786,6 +1799,54 @@ impl Look { } } +/// Represents a general look-around assertion. +/// +/// Currently, only lookbehind assertions are supported. +/// Furthermore, capture groups inside assertions are not supported. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg(feature = "look-behinds")] +pub enum LookAround { + /// A positive lookbehind assertion. + PositiveLookBehind(Box), + /// A negative lookbehind assertion. + NegativeLookBehind(Box), +} + +#[cfg(feature = "look-behinds")] +impl LookAround { + /// Returns a reference to the inner expression that must match for this + /// look-around assertion to hold. + pub fn sub(&self) -> &Hir { + match self { + Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { + sub + } + } + } + + /// Returns a mutable reference to the inner expression. + pub fn sub_mut(&mut self) -> &mut Hir { + match self { + Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { + sub + } + } + } + + /// Returns a new look-around of the same kind, but with its + /// sub-expression replaced with the one given. + pub fn with(&self, sub: Hir) -> LookAround { + match self { + Self::PositiveLookBehind(_) => { + Self::PositiveLookBehind(Box::new(sub)) + } + Self::NegativeLookBehind(_) => { + Self::NegativeLookBehind(Box::new(sub)) + } + } + } +} + /// The high-level intermediate representation for a capturing group. /// /// A capturing group always has an index and a child expression. It may @@ -1920,6 +1981,10 @@ impl Drop for Hir { | HirKind::Class(_) | HirKind::Look(_) => return, HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + #[cfg(feature = "look-behinds")] + HirKind::LookAround(ref x) if x.sub().kind.subs().is_empty() => { + return + } HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { return } @@ -1935,6 +2000,10 @@ impl Drop for Hir { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {} + #[cfg(feature = "look-behinds")] + HirKind::LookAround(ref mut x) => { + stack.push(mem::replace(x.sub_mut(), Hir::empty())); + } HirKind::Capture(ref mut x) => { stack.push(mem::replace(&mut x.sub, Hir::empty())); } @@ -1979,6 +2048,8 @@ struct PropertiesI { look_set_suffix: LookSet, look_set_prefix_any: LookSet, look_set_suffix_any: LookSet, + #[cfg(feature = "look-behinds")] + contains_lookaround_expr: bool, utf8: bool, explicit_captures_len: usize, static_explicit_captures_len: Option, @@ -2072,6 +2143,16 @@ impl Properties { self.0.look_set_suffix_any } + /// Returns whether there are any look-around expressions in this HIR value. + /// + /// Only returns true for [`HirKind::LookAround`] and not for + /// [`HirKind::Look`], which can be queried by [`look_set`](Properties::look_set) instead. + #[inline] + #[cfg(feature = "look-behinds")] + pub fn contains_lookaround_expr(&self) -> bool { + self.0.contains_lookaround_expr + } + /// Return true if and only if the corresponding HIR will always match /// valid UTF-8. /// @@ -2341,6 +2422,8 @@ impl Properties { look_set_suffix: fix, look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + #[cfg(feature = "look-behinds")] + contains_lookaround_expr: false, utf8: true, explicit_captures_len: 0, static_explicit_captures_len, @@ -2356,6 +2439,12 @@ impl Properties { props.look_set_suffix.set_intersect(p.look_set_suffix()); props.look_set_prefix_any.set_union(p.look_set_prefix_any()); props.look_set_suffix_any.set_union(p.look_set_suffix_any()); + #[cfg(feature = "look-behinds")] + { + props.contains_lookaround_expr = props + .contains_lookaround_expr + || p.contains_lookaround_expr(); + } props.utf8 = props.utf8 && p.is_utf8(); props.explicit_captures_len = props .explicit_captures_len @@ -2403,6 +2492,8 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + #[cfg(feature = "look-behinds")] + contains_lookaround_expr: false, // It is debatable whether an empty regex always matches at valid // UTF-8 boundaries. Strictly speaking, at a byte oriented view, // it is clearly false. There are, for example, many empty strings @@ -2439,6 +2530,8 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + #[cfg(feature = "look-behinds")] + contains_lookaround_expr: false, utf8: core::str::from_utf8(&lit.0).is_ok(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2458,6 +2551,8 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + #[cfg(feature = "look-behinds")] + contains_lookaround_expr: false, utf8: class.is_utf8(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2477,6 +2572,10 @@ impl Properties { look_set_suffix: LookSet::singleton(look), look_set_prefix_any: LookSet::singleton(look), look_set_suffix_any: LookSet::singleton(look), + // Note, this field represents _general_ lookarounds (ones using + // LookAround) and not assertions (using Look). + #[cfg(feature = "look-behinds")] + contains_lookaround_expr: false, // This requires a little explanation. Basically, we don't consider // matching an empty string to be equivalent to matching invalid // UTF-8, even though technically matching every empty string will @@ -2499,6 +2598,25 @@ impl Properties { Properties(Box::new(inner)) } + /// Create a new set of HIR properties for a look-around. + #[cfg(feature = "look-behinds")] + fn lookaround(lookaround: &LookAround) -> Properties { + let sub_p = lookaround.sub().properties(); + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + literal: false, + alternation_literal: false, + contains_lookaround_expr: true, + // We do not want look-around subexpressions to influence matching + // of the main expression when they contain anchors, so we clear the set. + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + ..*sub_p.0.clone() + }; + Properties(Box::new(inner)) + } + /// Create a new set of HIR properties for a repetition. fn repetition(rep: &Repetition) -> Properties { let p = rep.sub.properties(); @@ -2520,6 +2638,8 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: p.look_set_prefix_any(), look_set_suffix_any: p.look_set_suffix_any(), + #[cfg(feature = "look-behinds")] + contains_lookaround_expr: p.contains_lookaround_expr(), utf8: p.is_utf8(), explicit_captures_len: p.explicit_captures_len(), static_explicit_captures_len: p.static_explicit_captures_len(), @@ -2581,6 +2701,8 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + #[cfg(feature = "look-behinds")] + contains_lookaround_expr: false, utf8: true, explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2592,6 +2714,12 @@ impl Properties { let p = x.properties(); props.look_set.set_union(p.look_set()); props.utf8 = props.utf8 && p.is_utf8(); + #[cfg(feature = "look-behinds")] + { + props.contains_lookaround_expr = props + .contains_lookaround_expr + || p.contains_lookaround_expr(); + } props.explicit_captures_len = props .explicit_captures_len .saturating_add(p.explicit_captures_len()); diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 89db08c25..2a3a9e1ca 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -227,6 +227,14 @@ impl Visitor for Writer { self.wtr.write_str(r"\b{end-half}")?; } }, + #[cfg(feature = "look-behinds")] + HirKind::LookAround(hir::LookAround::PositiveLookBehind(_)) => { + self.wtr.write_str(r"(?<=")?; + } + #[cfg(feature = "look-behinds")] + HirKind::LookAround(hir::LookAround::NegativeLookBehind(_)) => { + self.wtr.write_str(r"(? { self.wtr.write_str("(")?; if let Some(ref name) = *name { @@ -296,6 +304,10 @@ impl Visitor for Writer { | HirKind::Alternation(_) => { self.wtr.write_str(r")")?; } + #[cfg(feature = "look-behinds")] + HirKind::LookAround(_) => { + self.wtr.write_str(r")")?; + } } Ok(()) } @@ -477,6 +489,18 @@ mod tests { roundtrip("((((a))))", "((((a))))"); } + #[test] + #[cfg(feature = "look-behinds")] + fn print_look_around() { + roundtrip("(?<=)", "(?<=(?:))"); + roundtrip("(? {} + _ => { + panic!( + "tried to unwrap look-around from HirFrame, got: {:?}", + self + ) + } + } + } + /// Assert that the current stack frame is a group indicator and return /// its corresponding flags (the flags that were active at the time the /// group was entered). @@ -359,6 +380,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::AlternationBranch); } } + #[cfg(feature = "look-behinds")] + Ast::LookAround(_) => self.push(HirFrame::LookAround), _ => {} } Ok(()) @@ -442,6 +465,19 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } + #[cfg(feature = "look-behinds")] + Ast::LookAround(ref x) => { + let expr = Box::new(self.pop().unwrap().unwrap_expr()); + self.pop().unwrap().unwrap_lookaround(); + self.push(HirFrame::Expr(Hir::lookaround(match x.kind { + ast::LookAroundKind::PositiveLookBehind => { + hir::LookAround::PositiveLookBehind(expr) + } + ast::LookAroundKind::NegativeLookBehind => { + hir::LookAround::NegativeLookBehind(expr) + } + }))); + } Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { @@ -755,6 +791,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> { HirFrame::AlternationBranch => { unreachable!("expected expr or concat, got alt branch marker") } + #[cfg(feature = "look-behinds")] + HirFrame::LookAround => { + unreachable!("expected expr or concat, got look-around") + } } } @@ -786,6 +826,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> { HirFrame::AlternationBranch => { unreachable!("expected expr or alt, got alt branch marker") } + #[cfg(feature = "look-behinds")] + HirFrame::LookAround => { + unreachable!("expected expr or alt, got look-around") + } } } @@ -1597,6 +1641,16 @@ mod tests { Hir::look(look) } + #[cfg(feature = "look-behinds")] + fn hir_lookbehind(expr: Hir, positive: bool) -> Hir { + let lookaround = if positive { + hir::LookAround::PositiveLookBehind(Box::new(expr)) + } else { + hir::LookAround::NegativeLookBehind(Box::new(expr)) + }; + Hir::lookaround(lookaround) + } + #[test] fn empty() { assert_eq!(t(""), Hir::empty()); @@ -1820,6 +1874,45 @@ mod tests { assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); } + #[cfg(feature = "look-behinds")] + #[test] + fn lookarounds() { + assert_eq!(t("(?<=a)"), hir_lookbehind(hir_lit("a"), true)); + assert_eq!(t("(? { /// A stack frame allocated just before descending into a capture's child /// node. Capture(&'a hir::Capture), + /// A stack frame allocated just before descending into a look-around's + /// child node. + #[cfg(feature = "look-behinds")] + LookAround(&'a hir::LookAround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -162,6 +166,8 @@ impl<'a> HeapVisitor<'a> { match *hir.kind() { HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), HirKind::Capture(ref x) => Some(Frame::Capture(x)), + #[cfg(feature = "look-behinds")] + HirKind::LookAround(ref x) => Some(Frame::LookAround(x)), HirKind::Concat(ref x) if x.is_empty() => None, HirKind::Concat(ref x) => { Some(Frame::Concat { head: &x[0], tail: &x[1..] }) @@ -180,6 +186,8 @@ impl<'a> HeapVisitor<'a> { match induct { Frame::Repetition(_) => None, Frame::Capture(_) => None, + #[cfg(feature = "look-behinds")] + Frame::LookAround(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -208,6 +216,8 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.sub, Frame::Capture(capture) => &capture.sub, + #[cfg(feature = "look-behinds")] + Frame::LookAround(lookaround) => lookaround.sub(), Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index a4512e23d..82bbb8041 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -162,6 +162,8 @@ The following features are available: [`arbitrary`](https://crates.io/crates/arbitrary) crate. Namely, it implements the `Arbitrary` trait from that crate for the [`Ast`](crate::ast::Ast) type. This feature is disabled by default. +* **look-behinds** - + Enable support for look-behind expressions. This feature is disabled by default. */ #![no_std] diff --git a/regex-syntax/test b/regex-syntax/test index 8626c3bfc..05910225b 100755 --- a/regex-syntax/test +++ b/regex-syntax/test @@ -20,6 +20,7 @@ features=( unicode-perl unicode-script unicode-segment + look-behinds ) for f in "${features[@]}"; do echo "=== FEATURE: $f ==="