77
88import regexp.RegExpTreeView // re-export
99private import regexp.internal.ParseRegExp
10- private import regexp.internal.RegExpConfiguration
11- private import codeql.ruby.ast.Literal as Ast
10+ private import regexp.internal.RegExpTracking as RegExpTracking
11+ private import codeql.ruby.AST as Ast
12+ private import codeql.ruby.CFG
1213private import codeql.ruby.DataFlow
1314private import codeql.ruby.ApiGraphs
15+ private import codeql.ruby.Concepts
1416
1517/**
1618 * Provides utility predicates related to regular expressions.
@@ -63,7 +65,11 @@ private class RegExpLiteralPatternSource extends RegExpPatternSource {
6365private class StringRegExpPatternSource extends RegExpPatternSource {
6466 private DataFlow:: Node parse ;
6567
66- StringRegExpPatternSource ( ) { this = regExpSource ( parse ) }
68+ StringRegExpPatternSource ( ) {
69+ this = regExpSource ( parse ) and
70+ // `regExpSource()` tracks both strings and regex literals, narrow it down to strings.
71+ this .asExpr ( ) .getConstantValue ( ) .isString ( _)
72+ }
6773
6874 override DataFlow:: Node getAParse ( ) { result = parse }
6975
@@ -104,6 +110,7 @@ module RegExpInterpretation {
104110
105111/**
106112 * A node interpreted as a regular expression.
113+ * Specifically, nodes where string values are interpreted as regular expressions.
107114 */
108115class StdLibRegExpInterpretation extends RegExpInterpretation:: Range {
109116 StdLibRegExpInterpretation ( ) {
@@ -115,16 +122,100 @@ class StdLibRegExpInterpretation extends RegExpInterpretation::Range {
115122 mce .getMethodName ( ) = [ "match" , "match?" ] and
116123 this = mce .getArgument ( 0 ) and
117124 // exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
118- not mce .getReceiver ( ) = trackRegexpType ( )
125+ not mce .getReceiver ( ) = RegExpTracking :: trackRegexpType ( )
119126 )
120127 }
121128}
122129
130+ /**
131+ * Holds if `exec` is a node where `regexp` is interpreted as a regular expression and
132+ * tested against the string value of `input`.
133+ * `name` describes the regexp execution, typically the name of the method being called.
134+ */
135+ private predicate regexExecution (
136+ DataFlow:: Node exec , DataFlow:: Node input , DataFlow:: Node regexp , string name
137+ ) {
138+ // `=~` or `!~`
139+ exists ( CfgNodes:: ExprNodes:: BinaryOperationCfgNode op |
140+ name = op .getOperator ( ) and
141+ exec .asExpr ( ) = op and
142+ (
143+ op .getExpr ( ) instanceof Ast:: RegExpMatchExpr or
144+ op .getExpr ( ) instanceof Ast:: NoRegExpMatchExpr
145+ ) and
146+ (
147+ input .asExpr ( ) = op .getLeftOperand ( ) and regexp .asExpr ( ) = op .getRightOperand ( )
148+ or
149+ input .asExpr ( ) = op .getRightOperand ( ) and regexp .asExpr ( ) = op .getLeftOperand ( )
150+ )
151+ )
152+ or
153+ // Any of the methods on `String` that take a regexp.
154+ exists ( DataFlow:: CallNode call | exec = call |
155+ name = "String#" + call .getMethodName ( ) and
156+ call .getMethodName ( ) =
157+ [
158+ "[]" , "gsub" , "gsub!" , "index" , "match" , "match?" , "partition" , "rindex" , "rpartition" ,
159+ "scan" , "slice!" , "split" , "sub" , "sub!"
160+ ] and
161+ input = call .getReceiver ( ) and
162+ regexp = call .getArgument ( 0 ) and
163+ // exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match, they are handled on the next case of this disjunction
164+ // also see `StdLibRegExpInterpretation`
165+ not (
166+ call .getMethodName ( ) = [ "match" , "match?" ] and
167+ call .getReceiver ( ) = RegExpTracking:: trackRegexpType ( )
168+ )
169+ )
170+ or
171+ // A call to `match` or `match?` where the regexp is the receiver.
172+ exists ( DataFlow:: CallNode call | exec = call |
173+ name = "Regexp#" + call .getMethodName ( ) and
174+ call .getMethodName ( ) = [ "match" , "match?" ] and
175+ regexp = call .getReceiver ( ) and
176+ input = call .getArgument ( 0 )
177+ )
178+ or
179+ // a `case` expression, with either `when` or `in` branches
180+ exists ( CfgNodes:: ExprNodes:: CaseExprCfgNode caseExpr |
181+ exec .asExpr ( ) = caseExpr and
182+ input .asExpr ( ) = caseExpr .getValue ( )
183+ |
184+ name = "case-when" and
185+ regexp .asExpr ( ) = caseExpr .getBranch ( _) .( CfgNodes:: ExprNodes:: WhenClauseCfgNode ) .getPattern ( _)
186+ or
187+ name = "case-in" and
188+ regexp .asExpr ( ) = caseExpr .getBranch ( _) .( CfgNodes:: ExprNodes:: InClauseCfgNode ) .getPattern ( )
189+ )
190+ }
191+
192+ /**
193+ * An execution of a regular expression by the standard library.
194+ */
195+ private class StdRegexpExecution extends RegexExecution:: Range {
196+ DataFlow:: Node regexp ;
197+ DataFlow:: Node input ;
198+ string name ;
199+
200+ StdRegexpExecution ( ) { regexExecution ( this , input , regexp , name ) }
201+
202+ override DataFlow:: Node getRegex ( ) { result = regexp }
203+
204+ override DataFlow:: Node getString ( ) { result = input }
205+
206+ override string getName ( ) { result = name }
207+ }
208+
123209/**
124210 * Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
125211 * as a part of a regular expression.
126212 */
127213cached
128- DataFlow:: Node regExpSource ( DataFlow:: Node re ) {
129- exists ( RegExpConfiguration c | c .hasFlow ( result , re ) )
214+ DataFlow:: Node regExpSource ( DataFlow:: Node re ) { result = RegExpTracking:: regExpSource ( re ) }
215+
216+ /** Gets a parsed regular expression term that is executed at `exec`. */
217+ RegExpTerm getTermForExecution ( RegexExecution exec ) {
218+ exists ( RegExpPatternSource source | source = regExpSource ( exec .getRegex ( ) ) |
219+ result = source .getRegExpTerm ( )
220+ )
130221}
0 commit comments