|
3 | 3 | * that match URLs and hostname patterns. |
4 | 4 | */ |
5 | 5 |
|
6 | | -private import HostnameRegexpSpecific |
7 | | - |
8 | | -/** |
9 | | - * Holds if the given constant is unlikely to occur in the origin part of a URL. |
10 | | - */ |
11 | | -predicate isConstantInvalidInsideOrigin(RegExpConstant term) { |
12 | | - // Look for any of these cases: |
13 | | - // - A character that can't occur in the origin |
14 | | - // - Two dashes in a row |
15 | | - // - A colon that is not part of port or scheme separator |
16 | | - // - A slash that is not part of scheme separator |
17 | | - term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*") |
18 | | -} |
19 | | - |
20 | | -/** Holds if `term` is a dot constant of form `\.` or `[.]`. */ |
21 | | -predicate isDotConstant(RegExpTerm term) { |
22 | | - term.(RegExpCharEscape).getValue() = "." |
23 | | - or |
24 | | - exists(RegExpCharacterClass cls | |
25 | | - term = cls and |
26 | | - not cls.isInverted() and |
27 | | - cls.getNumChild() = 1 and |
28 | | - cls.getAChild().(RegExpConstant).getValue() = "." |
29 | | - ) |
30 | | -} |
31 | | - |
32 | | -/** Holds if `term` is a wildcard `.` or an actual `.` character. */ |
33 | | -predicate isDotLike(RegExpTerm term) { |
34 | | - term instanceof RegExpDot |
35 | | - or |
36 | | - isDotConstant(term) |
37 | | -} |
38 | | - |
39 | | -/** Holds if `term` will only ever be matched against the beginning of the input. */ |
40 | | -predicate matchesBeginningOfString(RegExpTerm term) { |
41 | | - term.isRootTerm() |
42 | | - or |
43 | | - exists(RegExpTerm parent | matchesBeginningOfString(parent) | |
44 | | - term = parent.(RegExpSequence).getChild(0) |
45 | | - or |
46 | | - parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and |
47 | | - term = parent.(RegExpSequence).getChild(1) |
48 | | - or |
49 | | - term = parent.(RegExpAlt).getAChild() |
50 | | - or |
51 | | - term = parent.(RegExpGroup).getAChild() |
52 | | - ) |
53 | | -} |
54 | | - |
55 | | -/** |
56 | | - * Holds if the given sequence `seq` contains a top-level domain preceded by a dot, such as `.com`,
57 | | - * excluding cases where this is at the very beginning of the regexp. |
58 | | - * |
59 | | - * `i` is bound to the index of the last child in the top-level domain part. |
60 | | - */ |
61 | | -predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) { |
62 | | - seq.getChild(i) |
63 | | - .(RegExpConstant) |
64 | | - .getValue() |
65 | | - .regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and |
66 | | - isDotLike(seq.getChild(i - 1)) and |
67 | | - not (i = 1 and matchesBeginningOfString(seq)) |
68 | | -} |
69 | | - |
70 | | -/** |
71 | | - * Holds if the given regular expression term contains a top-level domain preceded by a dot,
72 | | - * such as `.com`. |
73 | | - */ |
74 | | -predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) } |
75 | | - |
76 | | -/** |
77 | | - * Holds if `term` will always match a hostname, that is, all disjunctions contain |
78 | | - * a hostname pattern that isn't inside a quantifier. |
79 | | - */ |
80 | | -predicate alwaysMatchesHostname(RegExpTerm term) { |
81 | | - hasTopLevelDomainEnding(term, _) |
82 | | - or |
83 | | - // `localhost` is considered a hostname pattern, but has no TLD |
84 | | - term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b") |
85 | | - or |
86 | | - not term instanceof RegExpAlt and |
87 | | - not term instanceof RegExpQuantifier and |
88 | | - alwaysMatchesHostname(term.getAChild()) |
89 | | - or |
90 | | - alwaysMatchesHostnameAlt(term) |
91 | | -} |
92 | | - |
93 | | -/** Holds if every child of `alt` contains a hostname pattern. */ |
94 | | -predicate alwaysMatchesHostnameAlt(RegExpAlt alt) { |
95 | | - alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1) |
96 | | -} |
97 | | - |
98 | | -/** |
99 | | - * Holds if each child of `alt` at indices `0` through `i` contains a hostname pattern.
100 | | - * |
101 | | - * This is used instead of `forall` to avoid materializing the set of alternatives |
102 | | - * that don't contain hostnames, which is much larger.
103 | | - */ |
104 | | -predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) { |
105 | | - alwaysMatchesHostname(alt.getChild(0)) and i = 0 |
106 | | - or |
107 | | - alwaysMatchesHostnameAlt(alt, i - 1) and |
108 | | - alwaysMatchesHostname(alt.getChild(i)) |
109 | | -} |
110 | | - |
111 | | -/** |
112 | | - * Holds if `term` occurs inside a quantifier or alternative (and thus |
113 | | - * cannot be expected to correspond to a unique match), or as part of
114 | | - * a lookaround assertion (which is rarely used for capture groups).
115 | | - */ |
116 | | -predicate isInsideChoiceOrSubPattern(RegExpTerm term) { |
117 | | - exists(RegExpParent parent | parent = term.getParent() | |
118 | | - parent instanceof RegExpAlt |
119 | | - or |
120 | | - parent instanceof RegExpQuantifier |
121 | | - or |
122 | | - parent instanceof RegExpSubPattern |
123 | | - or |
124 | | - isInsideChoiceOrSubPattern(parent) |
125 | | - ) |
126 | | -} |
127 | | - |
128 | | -/** |
129 | | - * Holds if `group` is likely to be used as a capture group. |
130 | | - */ |
131 | | -predicate isLikelyCaptureGroup(RegExpGroup group) { |
132 | | - group.isCapture() and |
133 | | - not isInsideChoiceOrSubPattern(group) |
134 | | -} |
135 | | - |
136 | | -/** |
137 | | - * Holds if `seq` contains two consecutive dot-like terms: wildcard `.`, `\.`, or `[.]` in any combination.
138 | | - * |
139 | | - * At least one of these dots is not intended to be a subdomain separator, |
140 | | - * so we avoid flagging the pattern in this case. |
141 | | - */ |
142 | | -predicate hasConsecutiveDots(RegExpSequence seq) { |
143 | | - exists(int i | |
144 | | - isDotLike(seq.getChild(i)) and |
145 | | - isDotLike(seq.getChild(i + 1)) |
146 | | - ) |
147 | | -} |
148 | | - |
149 | | -predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) { |
150 | | - seq = regexp.getAChild*() and |
151 | | - exists(RegExpDot unescapedDot, int i, string hostname | |
152 | | - hasTopLevelDomainEnding(seq, i) and |
153 | | - not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and |
154 | | - not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and |
155 | | - unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and |
156 | | - unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD |
157 | | - not hasConsecutiveDots(unescapedDot.getParent()) and |
158 | | - hostname = |
159 | | - seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() + |
160 | | - seq.getChild(i).getRawValue() |
161 | | - | |
162 | | - if unescapedDot.getParent() instanceof RegExpQuantifier |
163 | | - then |
164 | | - // `.*\.example.com` can match `evil.com/?x=.example.com` |
165 | | - // |
166 | | - // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin. |
167 | | - // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`. |
168 | | - // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL, |
169 | | - // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor. |
170 | | - seq.getChild(0) instanceof RegExpCaret and |
171 | | - not seq.getAChild() instanceof RegExpDollar and |
172 | | - seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and |
173 | | - msg = |
174 | | - "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue() |
175 | | - + "' which may cause '" + hostname + |
176 | | - "' to be matched anywhere in the URL, outside the hostname." |
177 | | - else |
178 | | - msg = |
179 | | - "has an unescaped '.' before '" + hostname + |
180 | | - "', so it might match more hosts than expected." |
181 | | - ) |
182 | | -} |
183 | | - |
184 | | -predicate incompleteHostnameRegExp( |
185 | | - RegExpSequence hostSequence, string message, DataFlow::Node aux, string label |
186 | | -) { |
187 | | - exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind | |
188 | | - regexp = re.getRegExpTerm() and |
189 | | - isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and |
190 | | - ( |
191 | | - if re.getAParse() != re |
192 | | - then ( |
193 | | - kind = "string, which is used as a regular expression $@," and |
194 | | - aux = re.getAParse() |
195 | | - ) else ( |
196 | | - kind = "regular expression" and aux = re |
197 | | - ) |
198 | | - ) |
199 | | - | |
200 | | - message = "This " + kind + " " + msg and label = "here" |
201 | | - ) |
202 | | -} |
| 6 | +deprecated import semmle.javascript.security.regexp.HostnameRegexp as Dep |
| 7 | +import Dep |
0 commit comments