@@ -140,12 +140,10 @@ class RegExpRoot extends RegExpTerm {
140140 predicate isRelevant ( ) {
141141 // there is at least one repetition
142142 getRoot ( any ( InfiniteRepetitionQuantifier q ) ) = this and
143- // there are no lookbehinds
144- not exists ( RegExpLookbehind lbh | getRoot ( lbh ) = this ) and
145143 // is actually used as a RegExp
146- this . isUsedAsRegExp ( ) // and
147- // // pragmatic performance optimization: ignore minified files.
148- // not getRootTerm().getParent().(Expr).getTopLevel().isMinified( )
144+ isUsedAsRegExp ( ) and
145+ // not excluded for library specific reasons
146+ not isExcluded ( getRootTerm ( ) .getParent ( ) )
149147 }
150148}
151149
@@ -156,38 +154,68 @@ private class RegexpCharacterConstant extends RegExpConstant {
156154 RegexpCharacterConstant ( ) { this .isCharacter ( ) }
157155}
158156
157+ /**
158+ * A regexp term that is relevant for this ReDoS analysis, i.e. a term whose root (as given by `getRoot`) satisfies `isRelevant()`.
159+ */
160+ class RelevantRegExpTerm extends RegExpTerm {
161+ RelevantRegExpTerm ( ) { getRoot ( this ) .isRelevant ( ) }
162+ }
163+
159164/**
160165 * Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
166+ * The string representation is the term's raw value followed by `|` and the canonicalization flags of its root term (see `getCanonicalizationFlags`).
161167 *
162168 * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
163169 * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
164170 */
165- private predicate isCanonicalTerm ( RegExpTerm term , string str ) {
171+ private predicate isCanonicalTerm ( RelevantRegExpTerm term , string str ) {
166172 term =
167- rank [ 1 ] ( RegExpTerm t , Location loc , File file |
173+ min ( RelevantRegExpTerm t , Location loc , File file |
168174 loc = t .getLocation ( ) and
169175 file = t .getFile ( ) and
170- str = t .getRawValue ( )
176+ str = t .getRawValue ( ) + "|" + getCanonicalizationFlags ( t . getRootTerm ( ) )
171177 |
172178 t order by t .getFile ( ) .getRelativePath ( ) , loc .getStartLine ( ) , loc .getStartColumn ( )
173179 )
174180}
175181
182+ /**
183+ * Gets a string representation of the flags used with the regular expression whose root term is `root`; only defined when `root.isRootTerm()` holds.
184+ * Only the flags that are relevant for the canonicalization are included: the result is "i" if `RegExpFlags::isIgnoreCase(root)` holds, and the empty string otherwise.
185+ */
186+ string getCanonicalizationFlags ( RegExpTerm root ) {
187+ root .isRootTerm ( ) and
188+ ( if RegExpFlags:: isIgnoreCase ( root ) then result = "i" else result = "" )
189+ }
190+
176191/**
177192 * An abstract input symbol, representing a set of concrete characters.
178193 */
179194private newtype TInputSymbol =
180195 /** An input symbol corresponding to character `c`. */
181196 Char ( string c ) {
182- c = any ( RegexpCharacterConstant cc | getRoot ( cc ) .isRelevant ( ) ) .getValue ( ) .charAt ( _)
197+ c =
198+ any ( RegexpCharacterConstant cc |
199+ cc instanceof RelevantRegExpTerm and
200+ not RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) )
201+ ) .getValue ( ) .charAt ( _)
202+ or
203+ // normalize everything to lower case if the regexp is case insensitive
204+ c =
205+ any ( RegexpCharacterConstant cc , string char |
206+ cc instanceof RelevantRegExpTerm and
207+ RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) ) and
208+ char = cc .getValue ( ) .charAt ( _)
209+ |
210+ char .toLowerCase ( )
211+ )
183212 } or
184213 /**
185214 * An input symbol representing all characters matched by
186215 * a (non-universal) character class that has string representation `charClassString`.
187216 */
188217 CharClass ( string charClassString ) {
189- exists ( RegExpTerm term | term .getRawValue ( ) = charClassString | getRoot ( term ) .isRelevant ( ) ) and
190- exists ( RegExpTerm recc | isCanonicalTerm ( recc , charClassString ) |
218+ exists ( RelevantRegExpTerm recc | isCanonicalTerm ( recc , charClassString ) |
191219 recc instanceof RegExpCharacterClass and
192220 not recc .( RegExpCharacterClass ) .isUniversalClass ( )
193221 or
@@ -254,7 +282,7 @@ class InputSymbol extends TInputSymbol {
254282/**
255283 * An abstract input symbol that represents a character class.
256284 */
257- abstract private class CharacterClass extends InputSymbol {
285+ abstract class CharacterClass extends InputSymbol {
258286 /**
259287 * Gets a character that is relevant for intersection-tests involving this
260288 * character class.
@@ -277,7 +305,7 @@ abstract private class CharacterClass extends InputSymbol {
277305 /**
278306 * Gets a character matched by this character class.
279307 */
280- string choose ( ) { result = this . getARelevantChar ( ) and this . matches ( result ) }
308+ string choose ( ) { result = getARelevantChar ( ) and matches ( result ) }
281309}
282310
283311/**
@@ -289,6 +317,19 @@ private module CharacterClasses {
289317 */
290318 pragma [ noinline]
291319 predicate hasChildThatMatches ( RegExpCharacterClass cc , string char ) {
320+ if RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) )
321+ then
322+ // normalize everything to lower case if the regexp is case insensitive
323+ exists ( string c | hasChildThatMatchesIgnoringCasingFlags ( cc , c ) | char = c .toLowerCase ( ) )
324+ else hasChildThatMatchesIgnoringCasingFlags ( cc , char )
325+ }
326+
327+ /**
328+ * Holds if the character class `cc` has a child (constant or range) that matches `char`.
329+ * Ignores whether the character class is inside a regular expression that has the ignore case flag.
330+ */
331+ pragma [ noinline]
332+ predicate hasChildThatMatchesIgnoringCasingFlags ( RegExpCharacterClass cc , string char ) {
292333 exists ( getCanonicalCharClass ( cc ) ) and
293334 exists ( RegExpTerm child | child = cc .getAChild ( ) |
294335 char = child .( RegexpCharacterConstant ) .getValue ( )
@@ -433,7 +474,7 @@ private module CharacterClasses {
433474 char = "0123456789" .charAt ( _)
434475 or
435476 clazz = "s" and
436- char = [ " " , "\t" , "\r" , "\n" , 11 .toUnicode ( ) , 12 .toUnicode ( ) ] // 11.toUnicode() = \v, 12.toUnicode() = \f'
477+ char = [ " " , "\t" , "\r" , "\n" , 11 .toUnicode ( ) , 12 .toUnicode ( ) ] // 11.toUnicode() = \v, 12.toUnicode() = \f
437478 or
438479 clazz = "w" and
439480 char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_" .charAt ( _)
@@ -477,7 +518,7 @@ private module CharacterClasses {
477518 result = [ "0" , "9" ]
478519 or
479520 cc .getValue ( ) = "s" and
480- result = [ " " ]
521+ result = " "
481522 or
482523 cc .getValue ( ) = "w" and
483524 result = [ "a" , "Z" , "_" , "0" , "9" ]
@@ -490,7 +531,7 @@ private module CharacterClasses {
490531 result = "9"
491532 or
492533 cc .getValue ( ) = "s" and
493- result = [ " " ]
534+ result = " "
494535 or
495536 cc .getValue ( ) = "w" and
496537 result = "a"
@@ -604,7 +645,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
604645/**
605646 * Gets a state the NFA may be in after matching `t`.
606647 */
607- private State after ( RegExpTerm t ) {
648+ State after ( RegExpTerm t ) {
608649 exists ( RegExpAlt alt | t = alt .getAChild ( ) | result = after ( alt ) )
609650 or
610651 exists ( RegExpSequence seq , int i | t = seq .getChild ( i ) |
@@ -633,7 +674,14 @@ private State after(RegExpTerm t) {
633674predicate delta ( State q1 , EdgeLabel lbl , State q2 ) {
634675 exists ( RegexpCharacterConstant s , int i |
635676 q1 = Match ( s , i ) and
636- lbl = Char ( s .getValue ( ) .charAt ( i ) ) and
677+ (
678+ not RegExpFlags:: isIgnoreCase ( s .getRootTerm ( ) ) and
679+ lbl = Char ( s .getValue ( ) .charAt ( i ) )
680+ or
681+ // normalize everything to lower case if the regexp is case insensitive
682+ RegExpFlags:: isIgnoreCase ( s .getRootTerm ( ) ) and
683+ exists ( string c | c = s .getValue ( ) .charAt ( i ) | lbl = Char ( c .toLowerCase ( ) ) )
684+ ) and
637685 (
638686 q2 = Match ( s , i + 1 )
639687 or
@@ -643,20 +691,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
643691 )
644692 or
645693 exists ( RegExpDot dot | q1 = before ( dot ) and q2 = after ( dot ) |
646- if dot .getLiteral ( ) . isDotAll ( ) then lbl = Any ( ) else lbl = Dot ( )
694+ if RegExpFlags :: isDotAll ( dot .getRootTerm ( ) ) then lbl = Any ( ) else lbl = Dot ( )
647695 )
648696 or
649697 exists ( RegExpCharacterClass cc |
650698 cc .isUniversalClass ( ) and q1 = before ( cc ) and lbl = Any ( ) and q2 = after ( cc )
651699 or
652700 q1 = before ( cc ) and
653- lbl = CharClass ( cc .getRawValue ( ) ) and
701+ lbl = CharClass ( cc .getRawValue ( ) + "|" + getCanonicalizationFlags ( cc . getRootTerm ( ) ) ) and
654702 q2 = after ( cc )
655703 )
656704 or
657705 exists ( RegExpCharacterClassEscape cc |
658706 q1 = before ( cc ) and
659- lbl = CharClass ( cc .getRawValue ( ) ) and
707+ lbl = CharClass ( cc .getRawValue ( ) + "|" + getCanonicalizationFlags ( cc . getRootTerm ( ) ) ) and
660708 q2 = after ( cc )
661709 )
662710 or
@@ -729,16 +777,27 @@ RegExpRoot getRoot(RegExpTerm term) {
729777 result = getRoot ( term .getParent ( ) )
730778}
731779
732- private newtype TState =
733- Match ( RegExpTerm t , int i ) {
734- getRoot ( t ) .isRelevant ( ) and
735- (
736- i = 0
737- or
738- exists ( t .( RegexpCharacterConstant ) .getValue ( ) .charAt ( i ) )
739- )
780+ /**
781+ * A state in the NFA. States only exist for terms and roots of relevant regular expressions.
782+ */
783+ newtype TState =
784+ /**
785+ * A state representing that the NFA is about to match a term.
786+ * `i` is 0 for any relevant term; for a `RegexpCharacterConstant` it may also be any valid character index into the constant's value, which is how multi-char literals are indexed.
787+ */
788+ Match ( RelevantRegExpTerm t , int i ) {
789+ i = 0
790+ or
791+ exists ( t .( RegexpCharacterConstant ) .getValue ( ) .charAt ( i ) )
740792 } or
793+ /**
794+ * An accept state, where exactly the given input string is accepted.
795+ */
741796 Accept ( RegExpRoot l ) { l .isRelevant ( ) } or
797+ /**
798+ * An accept state, where the given input string, or any string that has this
799+ * string as a prefix, is accepted.
800+ */
742801 AcceptAnySuffix ( RegExpRoot l ) { l .isRelevant ( ) }
743802
744803/**
@@ -851,29 +910,26 @@ InputSymbol getAnInputSymbolMatching(string char) {
851910 result = Any ( )
852911}
853912
913+ /**
914+ * Holds if `state` is a start state: either the `mkMatch` state of a `RegExpRoot`, or a state the NFA may be in after matching a `RegExpCaret`.
915+ */
916+ predicate isStartState ( State state ) {
917+ state = mkMatch ( any ( RegExpRoot r ) )
918+ or
919+ exists ( RegExpCaret car | state = after ( car ) )
920+ }
921+
854922/**
855923 * Predicates for constructing a prefix string that leads to a given state.
856924 */
857925private module PrefixConstruction {
858- /**
859- * Holds if `state` starts the string matched by the regular expression.
860- */
861- private predicate isStartState ( State state ) {
862- state instanceof StateInPumpableRegexp and
863- (
864- state = Match ( any ( RegExpRoot r ) , _)
865- or
866- exists ( RegExpCaret car | state = after ( car ) )
867- )
868- }
869-
870926 /**
871927 * Holds if `state` is the textually last start state for the regular expression.
872928 */
873929 private predicate lastStartState ( State state ) {
874930 exists ( RegExpRoot root |
875931 state =
876- max ( State s , Location l |
932+ max ( StateInPumpableRegexp s , Location l |
877933 isStartState ( s ) and getRoot ( s .getRepr ( ) ) = root and l = s .getRepr ( ) .getLocation ( )
878934 |
879935 s
@@ -1173,7 +1229,6 @@ private predicate isReDoSAttackable(RegExpTerm term, string pump, State s) {
11731229 * `prefixMsg` contains a friendly message for a prefix that reaches `s` (or `prefixMsg` is the empty string if the prefix is empty or if no prefix could be found).
11741230 */
11751231predicate hasReDoSResult ( RegExpTerm t , string pump , State s , string prefixMsg ) {
1176- not t .getRegExp ( ) .hasFreeSpacingFlag ( ) and // exclude free-spacing mode regexes
11771232 isReDoSAttackable ( t , pump , s ) and
11781233 (
11791234 prefixMsg = "starting with '" + escape ( PrefixConstruction:: prefix ( s ) ) + "' and " and
0 commit comments