|
2 | 2 | * Classes and predicates for working with suspicious character ranges. |
3 | 3 | */ |
4 | 4 |
|
5 | | -// We don't need the NFA utils themselves, just the regexp tree, |
6 | | -// but the import below is a nice shared library that exposes the API we need. |
7 | | -import regexp.NfaUtils |
8 | | - |
9 | | -/** |
10 | | - * Gets a rank for `range`, computed over every character range for which `isRange` holds. |
11 | | - * Ranges are ordered by `high - low` descending (so ranges that match more characters come first), then by the start line and start column of their location; the location tie-breakers are meant to make the rank unique for ranges in the same file. |
12 | | - */ |
13 | | -int rankRange(RegExpCharacterRange range) { |
14 | | - range = |
15 | | - rank[result](RegExpCharacterRange r, Location l, int low, int high | |
16 | | - r.getLocation() = l and |
17 | | - isRange(r, low, high) |
18 | | - | |
19 | | - r order by (high - low) desc, l.getStartLine(), l.getStartColumn() |
20 | | - ) |
21 | | -} |
22 | | - |
23 | | -/** Holds if `range` spans from the Unicode code points `low` to `high` (both inclusive), i.e. `low.toUnicode()` and `high.toUnicode()` are the two bound strings reported by `range.isRange`. */ |
24 | | -predicate isRange(RegExpCharacterRange range, int low, int high) { |
25 | | - exists(string lowc, string highc | |
26 | | - range.isRange(lowc, highc) and |
27 | | - low.toUnicode() = lowc and |
28 | | - high.toUnicode() = highc |
29 | | - ) |
30 | | -} |
31 | | - |
32 | | -/** Holds if `char` is an ASCII alphanumeric character: one of `0`-`9`, `A`-`Z`, or `a`-`z`. */ |
33 | | -predicate isAlphanumeric(string char) { |
34 | | - // written as a finite enumeration of code points to avoid having a bindingset for the predicate |
35 | | - char = [[48 .. 57], [65 .. 90], [97 .. 122]].toUnicode() // 0-9, A-Z, a-z |
36 | | -} |
37 | | - |
38 | | -/** |
39 | | - * Holds if `a` and `b` are two different children of the same character class |
40 | | - * and there exists at least one code point matched by both ranges (`alow <= bhigh` and `blow <= ahigh`). |
41 | | - */ |
42 | | -predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) { |
43 | | - exists(RegExpCharacterClass clz | |
44 | | - a = clz.getAChild() and |
45 | | - b = clz.getAChild() and |
46 | | - a != b |
47 | | - | |
48 | | - exists(int alow, int ahigh, int blow, int bhigh | |
49 | | - isRange(a, alow, ahigh) and |
50 | | - isRange(b, blow, bhigh) and |
51 | | - alow <= bhigh and |
52 | | - blow <= ahigh |
53 | | - ) |
54 | | - ) |
55 | | -} |
56 | | - |
57 | | -/** |
58 | | - * Holds if `range` and `escape` are children of the same character class, `escape` has the value `w`, `d`, or `s`, and at least one char in `range` matches the corresponding `\w`, `\d`, or `\s` pattern. |
59 | | - */ |
60 | | -predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) { |
61 | | - exists(RegExpCharacterClass clz, string low, string high | |
62 | | - range = clz.getAChild() and |
63 | | - escape = clz.getAChild() and |
64 | | - range.isRange(low, high) |
65 | | - | |
66 | | - escape.getValue() = "w" and |
67 | | - getInRange(low, high).regexpMatch("\\w") |
68 | | - or |
69 | | - escape.getValue() = "d" and |
70 | | - getInRange(low, high).regexpMatch("\\d") |
71 | | - or |
72 | | - escape.getValue() = "s" and |
73 | | - getInRange(low, high).regexpMatch("\\s") |
74 | | - ) |
75 | | -} |
76 | | - |
77 | | -/** Gets the Unicode code point of `char`, i.e. the integer whose `toUnicode()` is `char`. */ |
78 | | -bindingset[char] |
79 | | -int toCodePoint(string char) { result.toUnicode() = char } |
80 | | - |
81 | | -/** A character range that appears to be overly wide: it spans at least 10 code points in a term used as a regexp, it either crosses the `Z`-`a` gap, crosses the `9`-`A` gap, or is an ASCII range with a non-alphanumeric bound, and it is not one of `allowedWideRanges()`. */ |
82 | | -class OverlyWideRange extends RegExpCharacterRange { |
83 | | - OverlyWideRange() { |
84 | | - exists(int low, int high, int numChars | |
85 | | - isRange(this, low, high) and |
86 | | - numChars = (1 + high - low) and |
87 | | - this.getRootTerm().isUsedAsRegExp() and |
88 | | - numChars >= 10 |
89 | | - | |
90 | | - // the range contains both `Z` and `a`, so it spans the Z-a gap (which includes backticks) |
91 | | - toCodePoint("Z") >= low and |
92 | | - toCodePoint("a") <= high |
93 | | - or |
94 | | - // the range contains both `9` and `A`, so it spans the 9-A gap (which includes e.g. ; and ?) |
95 | | - toCodePoint("9") >= low and |
96 | | - toCodePoint("A") <= high |
97 | | - or |
98 | | - // at least one of the two range boundaries is a non-alphanumeric char |
99 | | - exists(int bound | bound = [low, high] | not isAlphanumeric(bound.toUnicode())) and |
100 | | - // while both boundaries are still ASCII (code points below 128) |
101 | | - low < 128 and |
102 | | - high < 128 |
103 | | - ) and |
104 | | - // exclude ranges on the allowlist of known wide ranges |
105 | | - not this = allowedWideRanges() |
106 | | - } |
107 | | - |
108 | | - /** Gets a string representation of a character class that matches the same chars as this range, as computed by `RangePrinter::printEquivalentCharClass`. */ |
109 | | - string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) } |
110 | | -} |
111 | | - |
112 | | -/** Gets a range that should not be reported as an overly wide range, identified purely by its lower or upper bound. */ |
113 | | -RegExpCharacterRange allowedWideRanges() { |
114 | | - // `~` is the last printable ASCII character; it is used as the upper bound of various wide ranges. |
115 | | - result.isRange(_, "~") |
116 | | - or |
117 | | - // likewise for ranges starting at " " or "!": " " is the first printable character, and "!" is the first non-white-space printable character. |
118 | | - result.isRange([" ", "!"], _) |
119 | | - or |
120 | | - // the `[@-_]` range is treated as intentional |
121 | | - result.isRange("@", "_") |
122 | | - or |
123 | | - // starting from the zero byte (code point 0) is a good indication that it's purposely matching a large range. |
124 | | - result.isRange(0.toUnicode(), _) |
125 | | -} |
126 | | - |
127 | | -/** Gets a char whose code point lies between the code points of `low` and `high` (both inclusive). */ |
128 | | -bindingset[low, high] |
129 | | -private string getInRange(string low, string high) { |
130 | | - result = [toCodePoint(low) .. toCodePoint(high)].toUnicode() |
131 | | -} |
132 | | - |
133 | | -/** A module computing a printable character class that is equivalent to an overly wide range, split into parts at the boundaries of the `A`-`Z`, `a`-`z`, and `0`-`9` blocks. */ |
134 | | -module RangePrinter { |
135 | | - bindingset[char] |
136 | | - bindingset[result] |
137 | | - private string next(string char) { |
138 | | - exists(int prev, int next | |
139 | | - prev.toUnicode() = char and |
140 | | - next.toUnicode() = result and |
141 | | - next = prev + 1 |
142 | | - ) |
143 | | - } |
144 | | - |
145 | | - /** Gets a char at which the pretty-printed range may be cut into parts: the first or last char of the `A`-`Z`, `a`-`z`, or `0`-`9` block. */ |
146 | | - private string cutoffs() { result = ["A", "Z", "a", "z", "0", "9"] } |
147 | | - |
148 | | - /** Gets the char to use as the low end of the part that follows `cut`: `cut` itself if it starts an alphanumeric block, or the char after `cut` if it ends one. */ |
149 | | - private string lowCut(string cut) { |
150 | | - cut = ["A", "a", "0"] and |
151 | | - result = cut |
152 | | - or |
153 | | - cut = ["Z", "z", "9"] and |
154 | | - result = next(cut) |
155 | | - } |
156 | | - |
157 | | - /** Gets the char to use as the high end of the part that precedes `cut`: `cut` itself if it ends an alphanumeric block, or the char before `cut` if it starts one. */ |
158 | | - private string highCut(string cut) { |
159 | | - cut = ["Z", "z", "9"] and |
160 | | - result = cut |
161 | | - or |
162 | | - cut = ["A", "a", "0"] and |
163 | | - next(result) = cut |
164 | | - } |
165 | | - |
166 | | - /** Gets the cutoff char that ends the zero-based `part` of `range` when pretty-printing it: the `part + 1`'th char of `cutoffs()`, in code point order, that lies strictly between the bounds of `range`. */ |
167 | | - private string cutoff(OverlyWideRange range, int part) { |
168 | | - exists(int low, int high | isRange(range, low, high) | |
169 | | - result = |
170 | | - rank[part + 1](string cut | |
171 | | - cut = cutoffs() and low < toCodePoint(cut) and toCodePoint(cut) < high |
172 | | - | |
173 | | - cut order by toCodePoint(cut) |
174 | | - ) |
175 | | - ) |
176 | | - } |
177 | | - |
178 | | - /** Gets the number of parts we should print for a given `range`: one more than the number of cutoff chars inside it. */ |
179 | | - private int parts(OverlyWideRange range) { result = 1 + count(cutoff(range, _)) } |
180 | | - |
181 | | - /** Holds if the zero-based `part` of `range` should span from `low` to `high`. */ |
182 | | - private predicate part(OverlyWideRange range, int part, string low, string high) { |
183 | | - // first part: the whole range if there is only one part, otherwise from the low end of the range up to the first cutoff. |
184 | | - part = 0 and |
185 | | - ( |
186 | | - range.isRange(low, high) and |
187 | | - parts(range) = 1 |
188 | | - or |
189 | | - parts(range) >= 2 and |
190 | | - range.isRange(low, _) and |
191 | | - high = highCut(cutoff(range, part)) |
192 | | - ) |
193 | | - or |
194 | | - // middle parts: from the previous cutoff up to this part's cutoff. |
195 | | - part >= 1 and |
196 | | - part < parts(range) - 1 and |
197 | | - low = lowCut(cutoff(range, part - 1)) and |
198 | | - high = highCut(cutoff(range, part)) |
199 | | - or |
200 | | - // last part: from the final cutoff up to the high end of the range. |
201 | | - part = parts(range) - 1 and |
202 | | - low = lowCut(cutoff(range, part - 1)) and |
203 | | - range.isRange(_, high) |
204 | | - } |
205 | | - |
206 | | - /** Gets `char` escaped for use in a character class: prefixed with a backslash if it is `[`, `]`, a backslash, `-`, or `/`, and unchanged otherwise. */ |
207 | | - bindingset[char] |
208 | | - private string escape(string char) { |
209 | | - exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" | |
210 | | - if char.regexpMatch(reg) then result = "\\" + char else result = char |
211 | | - ) |
212 | | - } |
213 | | - |
214 | | - /** Gets the printed form of the given `part` of the equivalent range: `low-high` if both ends are alphanumeric, otherwise the concatenation of every (escaped) char in the part. */ |
215 | | - private string printEquivalentCharClass(OverlyWideRange range, int part) { |
216 | | - exists(string low, string high | part(range, part, low, high) | |
217 | | - if |
218 | | - isAlphanumeric(low) and |
219 | | - isAlphanumeric(high) |
220 | | - then result = low + "-" + high |
221 | | - else |
222 | | - result = |
223 | | - strictconcat(string char | char = getInRange(low, high) | escape(char) order by char) |
224 | | - ) |
225 | | - } |
226 | | - |
227 | | - /** Gets the entire pretty-printed equivalent character class for `range`: `[`, followed by each printed part in order, followed by `]`. */ |
228 | | - string printEquivalentCharClass(OverlyWideRange range) { |
229 | | - result = |
230 | | - strictconcat(string r, int part | |
231 | | - r = "[" and part = -1 and exists(range) |
232 | | - or |
233 | | - r = printEquivalentCharClass(range, part) |
234 | | - or |
235 | | - r = "]" and part = parts(range) |
236 | | - | |
237 | | - r order by part |
238 | | - ) |
239 | | - } |
240 | | -} |
241 | | - |
242 | | -/** Gets a char range that is suspicious because of `reason`, where `priority` identifies the kind of problem: 0 for an `OverlyWideRange` (the reason shows its equivalent class, truncated to 50 chars), 1 for overlapping another range in the same character class that has a greater `rankRange`, 2 for overlapping a character class escape, and 3 for an empty range (`low > high`). */ |
243 | | -RegExpCharacterRange getABadRange(string reason, int priority) { |
244 | | - result instanceof OverlyWideRange and |
245 | | - priority = 0 and |
246 | | - exists(string equiv | equiv = result.(OverlyWideRange).printEquivalent() | |
247 | | - if equiv.length() <= 50 |
248 | | - then reason = "is equivalent to " + equiv |
249 | | - else reason = "is equivalent to " + equiv.substring(0, 50) + "..." |
250 | | - ) |
251 | | - or |
252 | | - priority = 1 and |
253 | | - exists(RegExpCharacterRange other | |
254 | | - reason = "overlaps with " + other + " in the same character class" and |
255 | | - rankRange(result) < rankRange(other) and |
256 | | - overlap(result, other) |
257 | | - ) |
258 | | - or |
259 | | - priority = 2 and |
260 | | - exists(RegExpCharacterClassEscape escape | |
261 | | - reason = "overlaps with " + escape + " in the same character class" and |
262 | | - overlapsWithCharEscape(result, escape) |
263 | | - ) |
264 | | - or |
265 | | - reason = "is empty" and |
266 | | - priority = 3 and |
267 | | - exists(int low, int high | |
268 | | - isRange(result, low, high) and |
269 | | - low > high |
270 | | - ) |
271 | | -} |
272 | | - |
273 | | -/** Holds if `range` is a suspicious character range, where `reason` joins every message that `getABadRange` gives for `range` with ", and " in descending priority order. */ |
274 | | -predicate problem(RegExpCharacterRange range, string reason) { |
275 | | - reason = |
276 | | - strictconcat(string m, int priority | |
277 | | - range = getABadRange(m, priority) |
278 | | - | |
279 | | - m, ", and " order by priority desc |
280 | | - ) and |
281 | | - // specifying a range bound using an escape is usually OK, so such ranges are not reported. |
282 | | - not range.getAChild() instanceof RegExpEscape and |
283 | | - // Unicode escapes in strings are interpreted before the string turns into a regexp, |
284 | | - // so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants. |
285 | | - // We therefore only report ranges whose root term's parent is a `RegExpLiteral`. |
286 | | - range.getRootTerm().getParent() instanceof RegExpLiteral and |
287 | | - // the root term is used as a regexp (mostly for JS where regular expressions are parsed eagerly) |
288 | | - range.getRootTerm().isUsedAsRegExp() |
289 | | -} |
| 5 | +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView |
| 6 | +// OverlyLargeRangeQuery should be used directly from the shared pack, and not from this file. |
| 7 | +deprecated import codeql.regex.OverlyLargeRangeQuery::Make<TreeView> as Dep |
| 8 | +import Dep |
0 commit comments