@@ -18,6 +18,20 @@ module Make<RegexTreeViewSig TreeImpl> {
1818 exists ( int code | code = ascii ( c ) | code + 1 = ascii ( result ) )
1919 }
2020
21+ /**
22+ * Gets the `i`th codepoint in `s`.
23+ */
24+ bindingset [ s]
25+ private string getCodepointAt ( string s , int i ) { result = s .regexpFind ( "(.|\\s)" , i , _) }
26+
27+ /**
28+ * Gets the length of `s` in codepoints.
29+ */
30+ bindingset [ str]
31+ private int getCodepointLength ( string str ) {
32+ result = max ( int m | exists ( str .regexpFind ( "(.|\\s)" , m - 1 , _) ) or m = 0 )
33+ }
34+
2135 /**
2236 * Gets an approximation for the ASCII code for `char`.
2337 * Only the easily printable chars are included (so no newline, tab, null, etc).
@@ -190,17 +204,17 @@ module Make<RegexTreeViewSig TreeImpl> {
190204 /** An input symbol corresponding to character `c`. */
191205 Char ( string c ) {
192206 c =
193- any ( RegexpCharacterConstant cc |
194- cc instanceof RelevantRegExpTerm and
195- not isIgnoreCase ( cc .getRootTerm ( ) )
196- ) .getValue ( ) . charAt ( _)
207+ getCodepointAt ( any ( RegexpCharacterConstant cc |
208+ cc instanceof RelevantRegExpTerm and
209+ not isIgnoreCase ( cc .getRootTerm ( ) )
210+ ) .getValue ( ) , _)
197211 or
198212 // normalize everything to lower case if the regexp is case insensitive
199213 c =
200214 any ( RegexpCharacterConstant cc , string char |
201215 cc instanceof RelevantRegExpTerm and
202216 isIgnoreCase ( cc .getRootTerm ( ) ) and
203- char = cc .getValue ( ) . charAt ( _)
217+ char = getCodepointAt ( cc .getValue ( ) , _)
204218 |
205219 char .toLowerCase ( )
206220 )
@@ -396,7 +410,7 @@ module Make<RegexTreeViewSig TreeImpl> {
396410 string getARelevantChar ( ) {
397411 exists ( ascii ( result ) )
398412 or
399- exists ( RegexpCharacterConstant c | result = c .getValue ( ) . charAt ( _) )
413+ exists ( RegexpCharacterConstant c | result = getCodepointAt ( c .getValue ( ) , _) )
400414 or
401415 classEscapeMatches ( _, result )
402416 }
@@ -702,16 +716,16 @@ module Make<RegexTreeViewSig TreeImpl> {
702716 q1 = Match ( s , i ) and
703717 (
704718 not isIgnoreCase ( s .getRootTerm ( ) ) and
705- lbl = Char ( s .getValue ( ) . charAt ( i ) )
719+ lbl = Char ( getCodepointAt ( s .getValue ( ) , i ) )
706720 or
707721 // normalize everything to lower case if the regexp is case insensitive
708722 isIgnoreCase ( s .getRootTerm ( ) ) and
709- exists ( string c | c = s .getValue ( ) . charAt ( i ) | lbl = Char ( c .toLowerCase ( ) ) )
723+ exists ( string c | c = getCodepointAt ( s .getValue ( ) , i ) | lbl = Char ( c .toLowerCase ( ) ) )
710724 ) and
711725 (
712726 q2 = Match ( s , i + 1 )
713727 or
714- s .getValue ( ) . length ( ) = i + 1 and
728+ getCodepointLength ( s .getValue ( ) ) = i + 1 and
715729 q2 = after ( s )
716730 )
717731 )
@@ -812,7 +826,7 @@ module Make<RegexTreeViewSig TreeImpl> {
812826 Match ( RelevantRegExpTerm t , int i ) {
813827 i = 0
814828 or
815- exists ( t .( RegexpCharacterConstant ) .getValue ( ) . charAt ( i ) )
829+ exists ( getCodepointAt ( t .( RegexpCharacterConstant ) .getValue ( ) , i ) )
816830 } or
817831 /**
818832 * An accept state, where exactly the given input string is accepted.
@@ -1105,7 +1119,9 @@ module Make<RegexTreeViewSig TreeImpl> {
11051119 */
11061120 predicate reachesOnlyRejectableSuffixes ( State fork , string w ) {
11071121 isReDoSCandidate ( fork , w ) and
1108- forex ( State next | next = process ( fork , w , w .length ( ) - 1 ) | isLikelyRejectable ( next ) ) and
1122+ forex ( State next | next = process ( fork , w , getCodepointLength ( w ) - 1 ) |
1123+ isLikelyRejectable ( next )
1124+ ) and
11091125 not getProcessPrevious ( fork , _, w ) = acceptsAnySuffix ( ) // we stop `process(..)` early if we can, check here if it happened.
11101126 }
11111127
@@ -1215,6 +1231,12 @@ module Make<RegexTreeViewSig TreeImpl> {
12151231 exists ( string char | char = [ "|" , "\n" , "Z" ] | not deltaClosedChar ( s , char , _) )
12161232 }
12171233
1234+ // `process` can't use pragma[inline] predicates. So a materialized version of `getCodepointAt` is needed.
1235+ private string getCodePointAtForProcess ( string str , int i ) {
1236+ result = getCodepointAt ( str , i ) and
1237+ exists ( getProcessPrevious ( _, _, str ) )
1238+ }
1239+
12181240 /**
12191241 * Gets a state that can be reached from pumpable `fork` consuming all
12201242 * chars in `w` any number of times followed by the first `i+1` characters of `w`.
@@ -1224,7 +1246,7 @@ module Make<RegexTreeViewSig TreeImpl> {
12241246 exists ( State prev | prev = getProcessPrevious ( fork , i , w ) |
12251247 not prev = acceptsAnySuffix ( ) and // we stop `process(..)` early if we can. If the successor accepts any suffix, then we know it can never be rejected.
12261248 exists ( string char , InputSymbol sym |
1227- char = w . charAt ( i ) and
1249+ char = getCodePointAtForProcess ( w , i ) and
12281250 deltaClosed ( prev , sym , result ) and
12291251 // noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`.
12301252 // Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found.
@@ -1246,7 +1268,7 @@ module Make<RegexTreeViewSig TreeImpl> {
12461268 or
12471269 // repeat until fixpoint
12481270 i = 0 and
1249- result = process ( fork , w , w . length ( ) - 1 )
1271+ result = process ( fork , w , getCodepointLength ( w ) - 1 )
12501272 )
12511273 }
12521274
@@ -1262,7 +1284,9 @@ module Make<RegexTreeViewSig TreeImpl> {
12621284 /**
12631285 * Gets a `char` that occurs in a `pump` string.
12641286 */
1265- private string getAProcessChar ( ) { result = any ( string s | isReDoSCandidate ( _, s ) ) .charAt ( _) }
1287+ private string getAProcessChar ( ) {
1288+ result = getCodepointAt ( any ( string s | isReDoSCandidate ( _, s ) ) , _)
1289+ }
12661290 }
12671291
12681292 /**
@@ -1317,7 +1341,8 @@ module Make<RegexTreeViewSig TreeImpl> {
13171341 */
13181342 bindingset [ s]
13191343 private string escapeUnicodeString ( string s ) {
1320- result = concat ( int i , string char | char = escapeUnicodeChar ( s .charAt ( i ) ) | char order by i )
1344+ result =
1345+ concat ( int i , string char | char = escapeUnicodeChar ( getCodepointAt ( s , i ) ) | char order by i )
13211346 }
13221347
13231348 /**
@@ -1328,7 +1353,10 @@ module Make<RegexTreeViewSig TreeImpl> {
13281353 private string escapeUnicodeChar ( string char ) {
13291354 if isPrintable ( char )
13301355 then result = char
1331- else result = "\\u" + to4digitHex ( any ( int i | i .toUnicode ( ) = char ) )
1356+ else
1357+ if exists ( to4digitHex ( any ( int i | i .toUnicode ( ) = char ) ) )
1358+ then result = "\\u" + to4digitHex ( any ( int i | i .toUnicode ( ) = char ) )
1359+ else result = "\\u{" + toHex ( any ( int i | i .toUnicode ( ) = char ) ) + "}"
13321360 }
13331361
13341362 /** Holds if `char` is easily printable char, or whitespace. */
0 commit comments