@@ -192,7 +192,6 @@ private module FunctionNames {
192192
193193/** Gets a name of a supported generic token-based feature. */
194194string getASupportedFeatureName ( ) {
195- // allowlist of vetted features that are permitted in production
196195 result = any ( EndpointFeature f ) .getName ( )
197196}
198197
@@ -226,18 +225,18 @@ private newtype TEndpointFeature =
226225 TStringConcatenatedWith ( )
227226
228227/**
229- * An implementation of an endpoint feature: produces feature names and values for use in ML.
228+ * An implementation of an endpoint feature: defines feature-name/value tuples for use in ML.
230229 */
231230abstract class EndpointFeature extends TEndpointFeature {
232231 /**
233232 * Gets the name of the feature. Used by the ML model.
234- * Changes to the name of a feature requires training the model again .
233+ * Names are coupled to models: changing the name of a feature requires retraining the model.
235234 */
236235 abstract string getName ( ) ;
237236
238237 /**
239238 * Gets the value of the feature. Used by the ML model.
240- * Changes to the value of a feature requires training the model again .
239+ * Models are trained based on feature values, so changing the value of a feature requires retraining the model.
241240 */
242241 abstract string getValue ( DataFlow:: Node endpoint ) ;
243242
@@ -275,7 +274,7 @@ class ReceiverName extends EndpointFeature, TReceiverName {
275274
276275/**
277276 * The feature for the natural language tokens from the function that encloses the endpoint in
278- * the order that they appear in the source code.
277+ * the order that they appear in the source code.
279278 */
280279class EnclosingFunctionBody extends EndpointFeature , TEnclosingFunctionBody {
281280 override string getName ( ) { result = "enclosingFunctionBody" }
@@ -322,6 +321,8 @@ class FileImports extends EndpointFeature, TFileImports {
322321 * }
323322 * ```
324323 * In the above example, the feature for the marked endpoint has value '(a, b)\n(c, d)'.
324+ * The line breaks act as a separator between the parameters of different functions but
325+ * will be treated by tokenization as if they were spaces.
325326 */
326327class ContextSurroundingFunctionParameters extends EndpointFeature ,
327328 TContextSurroundingFunctionParameters {
@@ -345,12 +346,14 @@ class ContextSurroundingFunctionParameters extends EndpointFeature,
345346}
346347
347348/**
348- * The feature that gives the name an endpoint is assigned to (if any).
349+ * The feature that gives the name of any properties an endpoint is assigned to (if any).
349350 *
350351 * ### Example
351352 * ```javascript
352353 * const div = document.createElement('div');
353354 * div.innerHTML = endpoint; // feature value is 'innerHTML'
355+ *
356+ * foo({x: endpoint}); // feature value is 'x'
354357 * ```
355358 */
356359class AssignedToPropName extends EndpointFeature , TAssignedToPropName {
@@ -364,12 +367,13 @@ class AssignedToPropName extends EndpointFeature, TAssignedToPropName {
364367}
365368
366369/**
367- * The feature that shows the text an endpoint is being concatenated with.class
370+ * The feature that shows the text an endpoint is being concatenated with.
368371 *
369372 * ### Example
370373 *
371374 * ```javascript
372- * const x = 'foo' + endpoint + 'bar'; // feature value is `'foo' -endpoint- 'bar'`
375+ * const x = 'foo' + endpoint + 'bar'; // feature value is `'foo' -endpoint- 'bar'`
376+ * ```
373377 */
374378class StringConcatenatedWith extends EndpointFeature , TStringConcatenatedWith {
375379 override string getName ( ) { result = "stringConcatenatedWith" }
@@ -456,8 +460,6 @@ class CalleeImports extends EndpointFeature, TCalleeImports {
456460 * ...
457461 * }
458462 * ```
459- *
460- * The feature value for the marked endpoint will be `f(a, b, c)\ng(x, y, z)\nh(u, v)`.
461463 */
462464class ContextFunctionInterfaces extends EndpointFeature , TContextFunctionInterfaces {
463465 override string getName ( ) { result = "contextFunctionInterfaces" }
@@ -471,6 +473,10 @@ class ContextFunctionInterfaces extends EndpointFeature, TContextFunctionInterfa
471473 * Syntactic utilities for feature value computation.
472474 */
473475private module SyntacticUtilities {
476+ /**
477+ * Renders an operand in a string concatenation by surrounding a constant in quotes, and
478+ * by using `getSimpleAccessPath` for everything else.
479+ */
474480 string renderStringConcatOperand ( DataFlow:: Node operand ) {
475481 if exists ( unique( string v | operand .mayHaveStringValue ( v ) ) )
476482 then result = "'" + any ( string v | operand .mayHaveStringValue ( v ) ) + "'"
@@ -555,7 +561,7 @@ private module SyntacticUtilities {
555561 * - direct arguments
556562 * - properties of (nested) objects that are arguments
557563 *
558- * Unknown cases and property names results in `?`.
564+ * Unknown cases and property names result in `?`.
559565 */
560566 string getSimpleParameterAccessPath ( DataFlow:: Node node ) {
561567 if exists ( DataFlow:: CallNode call | node = call .getArgument ( _) )
@@ -569,7 +575,7 @@ private module SyntacticUtilities {
569575 * Supports:
570576 * - properties of (nested) objects
571577 *
572- * Unknown cases and property names results in `?`.
578+ * Unknown cases and property names result in `?`.
573579 */
574580 string getSimplePropertyAccessPath ( DataFlow:: Node node ) {
575581 if exists ( ObjectExpr o | o .getAProperty ( ) .getInit ( ) .getUnderlyingValue ( ) = node .asExpr ( ) )
@@ -617,6 +623,17 @@ private module SyntacticUtilities {
617623 * - invocations
618624 *
619625 * Unknown cases and property names result in `?`.
626+ *
627+ * ### Examples
628+ *
629+ * - The node `x.foo` will have the simple access path `x.foo`.
630+ * - In the following file, the simple access path will be `import("./foo").bar.baz`:
631+ *
632+ * ```javascript
633+ * import * as lib from "./foo"
634+ * console.log(lib.bar.baz());
635+ * // ^^^^^^^^^^^ node
636+ * ```
620637 */
621638 string getSimpleAccessPath ( DataFlow:: Node node ) {
622639 exists ( Expr e | e = node .asExpr ( ) .getUnderlyingValue ( ) |
@@ -661,7 +678,16 @@ private module SyntacticUtilities {
661678 if exists ( i .getImportedPath ( ) .getValue ( ) )
662679 then
663680 exists ( string p | p = i .getImportedPath ( ) .getValue ( ) |
664- if p .matches ( ".%" ) then result = "\"p\"" else result = "!" // hide absolute imports from the ML training
681+ // Hide absolute imports from ML training data.
682+ // ============================================
683+ // There is the hypothesis that exposing absolute imports to the model
684+ // might lead to bad generalization. For example, the model might learn
685+ // to strongly associate a specific database client with sinks and no
686+ // longer be able to flag sinks when data flow is broken.
687+ // Placing this logic so deeply within the feature extraction code is
688+ // perhaps a bit of a hack and it is a use case to consider when refactoring
689+ // endpoint filters/data extraction.
690+ if p .matches ( ".%" ) then result = "\"p\"" else result = "!"
665691 )
666692 else result = getUnknownSymbol ( )
667693 }
@@ -688,8 +714,6 @@ private module SyntacticUtilities {
688714 *
689715 * "Containment" is syntactic, and currently means that the endpoint is an argument to the call, or that the endpoint is a (nested) property value of an argument.
690716 *
691- * This feature is intended as a superior version of the many `Callee*` features.
692- *
693717 * Examples:
694718 * ```
695719 * foo(endpoint); // -> foo
@@ -746,8 +770,6 @@ class InputAccessPathFromCallee extends EndpointFeature, TInputAccessPathFromCal
746770 *
747771 * "Containment" is syntactic, and currently means that the endpoint is an argument to the call, or that the endpoint is a (nested) property value of an argument.
748772 *
749- * This feature is intended as a superior version of the `ArgumentIndexFeature`.
750- *
751773 * Examples:
752774 * ```
753775 * foo(endpoint); // -> 0
0 commit comments