|
12 | 12 | */ |
13 | 13 |
|
14 | 14 | private import javascript |
15 | | -private import semmle.javascript.security.regexp.HostnameRegexp |
16 | | - |
17 | | -// TODO: Share the below code. |
18 | | -/** |
19 | | - * Holds if `term` is an anchor that is not the first or last node |
20 | | - * in its tree. |
21 | | - */ |
22 | | -predicate isInteriorAnchor(RegExpAnchor term) { |
23 | | - not isLeftArmTerm(term) and |
24 | | - not isRightArmTerm(term) |
25 | | -} |
26 | | - |
27 | | -/** |
28 | | - * Holds if `term` contains an anchor that is not the first or last node |
29 | | - * in its tree, such as `(foo|bar$|baz)`. |
30 | | - */ |
31 | | -predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) } |
32 | | - |
33 | | -/** |
34 | | - * Holds if `term` starts with a word boundary or lookbehind assertion, |
35 | | - * indicating that it's not intended to be anchored on that side. |
36 | | - */ |
37 | | -predicate containsLeadingPseudoAnchor(RegExpSequence term) { |
38 | | - exists(RegExpTerm child | child = term.getChild(0) | |
39 | | - child instanceof RegExpWordBoundary or |
40 | | - child instanceof RegExpNonWordBoundary or |
41 | | - child instanceof RegExpLookbehind |
42 | | - ) |
43 | | -} |
44 | | - |
45 | | -/** |
46 | | - * Holds if `term` ends with a word boundary or lookahead assertion, |
47 | | - * indicating that it's not intended to be anchored on that side. |
48 | | - */ |
49 | | -predicate containsTrailingPseudoAnchor(RegExpSequence term) { |
50 | | - exists(RegExpTerm child | child = term.getLastChild() | |
51 | | - child instanceof RegExpWordBoundary or |
52 | | - child instanceof RegExpNonWordBoundary or |
53 | | - child instanceof RegExpLookahead |
54 | | - ) |
55 | | -} |
56 | | - |
57 | | -/** |
58 | | - * Holds if `term` is an empty sequence, usually arising from |
59 | | - * literals with a trailing alternative such as `foo|`. |
60 | | - */ |
61 | | -predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 } |
62 | | - |
63 | | -/** |
64 | | - * Holds if `term` contains a letter constant. |
65 | | - * |
66 | | - * We use this as a heuristic to filter out uninteresting results. |
67 | | - */ |
68 | | -predicate containsLetters(RegExpTerm term) { |
69 | | - term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*") |
70 | | -} |
71 | | - |
72 | | -/** |
73 | | - * Holds if `term` consists only of an anchor and a parenthesized term, |
74 | | - * such as the left side of `^(foo|bar)|baz`. |
75 | | - * |
76 | | - * The precedence of the anchor is likely to be intentional in this case, |
77 | | - * as the group wouldn't be needed otherwise. |
78 | | - */ |
79 | | -predicate isAnchoredGroup(RegExpSequence term) { |
80 | | - term.getNumChild() = 2 and |
81 | | - term.getAChild() instanceof RegExpAnchor and |
82 | | - term.getAChild() instanceof RegExpGroup |
83 | | -} |
84 | | - |
85 | | -/** |
86 | | - * Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz` |
87 | | - * and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`. |
88 | | - */ |
89 | | -predicate hasExplicitAnchorPrecedence(RegExpAlt alt) { |
90 | | - isAnchoredGroup(alt.getAChild()) and |
91 | | - not alt.getAChild() instanceof RegExpGroup |
92 | | -} |
93 | | - |
94 | | -/** |
95 | | - * Holds if `src` is a pattern for a collection of alternatives where |
96 | | - * only the first or last alternative is anchored, indicating a |
97 | | - * precedence mistake explained by `msg`. |
98 | | - * |
99 | | - * The canonical example of such a mistake is: `^a|b|c`, which is |
100 | | - * parsed as `(^a)|(b)|(c)`. |
101 | | - */ |
102 | | -predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) { |
103 | | - exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction | |
104 | | - root = src.getRegExpTerm() and |
105 | | - not containsInteriorAnchor(root) and |
106 | | - not isEmpty(root.getAChild()) and |
107 | | - not hasExplicitAnchorPrecedence(root) and |
108 | | - containsLetters(anchoredTerm) and |
109 | | - ( |
110 | | - anchoredTerm = root.getChild(0) and |
111 | | - anchoredTerm.getChild(0) instanceof RegExpCaret and |
112 | | - not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and |
113 | | - containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and |
114 | | - direction = "beginning" |
115 | | - or |
116 | | - anchoredTerm = root.getLastChild() and |
117 | | - anchoredTerm.getLastChild() instanceof RegExpDollar and |
118 | | - not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and |
119 | | - containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and |
120 | | - direction = "end" |
121 | | - ) and |
122 | | - // is not used for replace |
123 | | - not exists(DataFlow::MethodCallNode replace | |
124 | | - replace.getMethodName() = "replace" and |
125 | | - src.getARegExpObject().flowsTo(replace.getArgument(0)) |
126 | | - ) and |
127 | | - msg = |
128 | | - "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() + |
129 | | - "' is anchored at the " + direction + |
130 | | - ", but the other parts of this regular expression are not" |
131 | | - ) |
132 | | -} |
133 | | - |
134 | | -/** |
135 | | - * Holds if `term` is a final term, that is, no term will match anything after this one. |
136 | | - */ |
137 | | -predicate isFinalRegExpTerm(RegExpTerm term) { |
138 | | - term.isRootTerm() |
139 | | - or |
140 | | - exists(RegExpSequence seq | |
141 | | - isFinalRegExpTerm(seq) and |
142 | | - term = seq.getLastChild() |
143 | | - ) |
144 | | - or |
145 | | - exists(RegExpTerm parent | |
146 | | - isFinalRegExpTerm(parent) and |
147 | | - term = parent.getAChild() and |
148 | | - not parent instanceof RegExpSequence and |
149 | | - not parent instanceof RegExpQuantifier |
150 | | - ) |
151 | | -} |
152 | | - |
153 | | -/** |
154 | | - * Holds if `src` contains a hostname pattern that is missing a `$` anchor. |
155 | | - */ |
156 | | -predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { |
157 | | - not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting |
158 | | - exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() | |
159 | | - not isConstantInvalidInsideOrigin(term.getAChild*()) and |
160 | | - tld = term.getAChild*() and |
161 | | - hasTopLevelDomainEnding(tld, i) and |
162 | | - isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD |
163 | | - tld.getChild(0) instanceof RegExpCaret and |
164 | | - msg = |
165 | | - "This hostname pattern may match any domain name, as it is missing a '$' or '/' at the end." |
166 | | - ) |
167 | | -} |
168 | | - |
169 | | -/** |
170 | | - * Holds if `src` is an unanchored pattern for a URL, indicating a |
171 | | - * mistake explained by `msg`. |
172 | | - */ |
173 | | -predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) { |
174 | | - exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() | |
175 | | - alwaysMatchesHostname(term) and |
176 | | - tld = term.getAChild*() and |
177 | | - hasTopLevelDomainEnding(tld) and |
178 | | - not isConstantInvalidInsideOrigin(term.getAChild*()) and |
179 | | - not term.getAChild*() instanceof RegExpAnchor and |
180 | | - // that is not used for capture or replace |
181 | | - not exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() | |
| 15 | +private import semmle.javascript.security.regexp.HostnameRegexp as HostnameRegexp |
| 16 | +private import codeql.regex.MissingRegExpAnchor as MissingRegExpAnchor |
| 17 | +private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl |
| 18 | + |
| 19 | +private module Impl implements |
| 20 | +MissingRegExpAnchor::MissingRegExpAnchorSig<TreeImpl, HostnameRegexp::Impl> { |
| 21 | + predicate isUsedAsReplace(RegExpPatternSource pattern) { |
| 22 | + // is used for capture or replace |
| 23 | + exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() | |
182 | 24 | name = "exec" and |
183 | | - mcn = src.getARegExpObject().getAMethodCall() and |
| 25 | + mcn = pattern.getARegExpObject().getAMethodCall() and |
184 | 26 | exists(mcn.getAPropertyRead()) |
185 | 27 | or |
186 | 28 | exists(DataFlow::Node arg | |
187 | 29 | arg = mcn.getArgument(0) and |
188 | 30 | ( |
189 | | - src.getARegExpObject().flowsTo(arg) or |
190 | | - src.getAParse() = arg |
| 31 | + pattern.getARegExpObject().flowsTo(arg) or |
| 32 | + pattern.getAParse() = arg |
191 | 33 | ) |
192 | 34 | | |
193 | 35 | name = "replace" |
194 | 36 | or |
195 | 37 | name = "match" and exists(mcn.getAPropertyRead()) |
196 | 38 | ) |
197 | | - ) and |
198 | | - msg = |
199 | | - "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it." |
200 | | - ) |
| 39 | + ) |
| 40 | + } |
| 41 | + |
| 42 | + string getEndAnchorText() { result = "$" } |
201 | 43 | } |
202 | 44 |
|
| 45 | +import MissingRegExpAnchor::Make<TreeImpl, HostnameRegexp::Impl, Impl> |
| 46 | + |
203 | 47 | from DataFlow::Node nd, string msg |
204 | 48 | where |
205 | 49 | isUnanchoredHostnameRegExp(nd, msg) |
206 | 50 | or |
207 | 51 | isSemiAnchoredHostnameRegExp(nd, msg) |
208 | 52 | or |
209 | 53 | hasMisleadingAnchorPrecedence(nd, msg) |
| 54 | +// isLineAnchoredHostnameRegExp is not used here, as it is not relevant to JS. |
210 | 55 | select nd, msg |
0 commit comments