88
99private import python
1010private import semmle.python.dataflow.new.DataFlow
11+ private import semmle.python.dataflow.new.TaintTracking
1112private import semmle.python.Concepts
1213private import semmle.python.ApiGraphs
1314private import semmle.python.frameworks.data.ModelsAsData
@@ -68,16 +69,9 @@ module Lxml {
6869 */
6970 class XPathCall extends XML:: XPathExecution:: Range , DataFlow:: CallCfgNode {
7071 XPathCall ( ) {
71- exists ( API:: Node parseResult |
72- parseResult =
73- etreeRef ( ) .getMember ( [ "parse" , "fromstring" , "fromstringlist" , "HTML" , "XML" ] ) .getReturn ( )
74- or
75- // TODO: lxml.etree.parseid(<text>)[0] will contain the root element from parsing <text>
76- // but we don't really have a way to model that nicely.
77- parseResult = etreeRef ( ) .getMember ( "XMLParser" ) .getReturn ( ) .getMember ( "close" ) .getReturn ( )
78- |
79- this = parseResult .getMember ( "xpath" ) .getACall ( )
80- )
72+ // TODO: lxml.etree.parseid(<text>)[0] will contain the root element from parsing <text>
73+ // but we don't really have a way to model that nicely.
74+ this = [ Element:: instance ( ) , ElementTree:: instance ( ) ] .getMember ( "xpath" ) .getACall ( )
8175 }
8276
8377 override DataFlow:: Node getXPath ( ) { result in [ this .getArg ( 0 ) , this .getArgByName ( "_path" ) ] }
@@ -209,31 +203,37 @@ module Lxml {
209203 * A call to either of:
210204 * - `lxml.etree.fromstring`
211205 * - `lxml.etree.fromstringlist`
206+ * - `lxml.etree.HTML`
212207 * - `lxml.etree.XML`
213208 * - `lxml.etree.XMLID`
209+ * - `lxml.etree.XMLDTDID`
214210 * - `lxml.etree.parse`
215211 * - `lxml.etree.parseid`
216212 *
217213 * See
218214 * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstring
219215 * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstringlist
216+ * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.HTML
220217 * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XML
221218 * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XMLID
219+ * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.XMLDTDID
222220 * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parse
223221 * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parseid
224222 */
225223 private class LxmlParsing extends DataFlow:: CallCfgNode , XML:: XmlParsing:: Range {
226224 string functionName ;
227225
228226 LxmlParsing ( ) {
229- functionName in [ "fromstring" , "fromstringlist" , "XML" , "XMLID" , "parse" , "parseid" ] and
227+ functionName in [
228+ "fromstring" , "fromstringlist" , "HTML" , "XML" , "XMLID" , "XMLDTDID" , "parse" , "parseid"
229+ ] and
230230 this = etreeRef ( ) .getMember ( functionName ) .getACall ( )
231231 }
232232
233233 override DataFlow:: Node getAnInput ( ) {
234234 result in [
235235 this .getArg ( 0 ) ,
236- // fromstring / XML / XMLID
236+ // fromstring / HTML / XML / XMLID / XMLDTDID
237237 this .getArgByName ( "text" ) ,
238238 // fromstringlist
239239 this .getArgByName ( "strings" ) ,
@@ -248,7 +248,8 @@ module Lxml {
248248 this .getParserArg ( ) = XmlParser:: instanceVulnerableTo ( kind )
249249 or
250250 kind .isXxe ( ) and
251- not exists ( this .getParserArg ( ) )
251+ not exists ( this .getParserArg ( ) ) and
252+ not functionName = "HTML"
252253 }
253254
254255 override predicate mayExecuteInput ( ) { none ( ) }
@@ -318,4 +319,183 @@ module Lxml {
318319
319320 override DataFlow:: Node getAPathArgument ( ) { result = this .getAnInput ( ) }
320321 }
322+
323+ /** Provides models for the `lxml.etree.Element` class. */
324+ module Element {
325+ /** Gets a reference to the `Element` class. */
326+ API:: Node classRef ( ) { result = etreeRef ( ) .getMember ( [ "Element" , "_Element" ] ) }
327+
328+ /**
329+ * A source of `lxml.etree.Element` instances, extend this class to model new instances.
330+ *
331+ * This can include instantiations of the class, return values from function
332+ * calls, or a special parameter that will be set when functions are called by an external
333+ * library.
334+ *
335+ * Use the predicate `Element::instance()` to get references to instances of `lxml.etree.Element` instances.
336+ */
337+ abstract class InstanceSource instanceof API:: Node {
338+ /** Gets a textual representation of this element. */
339+ string toString ( ) { result = super .toString ( ) }
340+ }
341+
342+ /** Gets a reference to an `lxml.etree.Element` instance. */
343+ API:: Node instance ( ) { result instanceof InstanceSource }
344+
345+ /** An `Element` instantiated directly. */
346+ private class ElementInstance extends InstanceSource {
347+ ElementInstance ( ) { this = classRef ( ) .getAnInstance ( ) }
348+ }
349+
350+ /** The result of a parse operation that returns an `Element`. */
351+ private class ParseResult extends InstanceSource {
352+ ParseResult ( ) {
353+ // TODO: The XmlParser module does not currently use API graphs
354+ this =
355+ [
356+ etreeRef ( ) .getMember ( "XMLParser" ) .getAnInstance ( ) ,
357+ etreeRef ( ) .getMember ( "get_default_parser" ) .getReturn ( )
358+ ] .getMember ( "close" ) .getReturn ( )
359+ or
360+ // TODO: `XMLID`, `XMLDTDID`, `parseid` returns a tuple of which the first element is an `Element`.
361+ // `iterparse` returns an iterator of tuples, each of which has a second element that is an `Element`.
362+ this = etreeRef ( ) .getMember ( [ "XML" , "HTML" , "fromstring" , "fromstringlist" ] ) .getReturn ( )
363+ }
364+ }
365+
366+ /** A call to a method on an `Element` that returns another `Element`. */
367+ private class ElementMethod extends InstanceSource {
368+ ElementMethod ( ) {
369+ // an Element is an iterator of Elements
370+ this = instance ( ) .getASubscript ( )
371+ or
372+ // methods that return an Element
373+ this = instance ( ) .getMember ( [ "find" , "getnext" , "getprevious" , "getparent" ] ) .getReturn ( )
374+ or
375+ // methods that return an iterator of Elements
376+ this =
377+ instance ( )
378+ .getMember ( [
379+ "cssselect" , "findall" , "getchildren" , "getiterator" , "iter" , "iterancestors" ,
380+ "iterdecendants" , "iterchildren" , "itersiblings" , "iterfind" , "xpath"
381+ ] )
382+ .getReturn ( )
383+ .getASubscript ( )
384+ }
385+ }
386+
387+ /** A call to a method on an `ElementTree` that returns an `Element`. */
388+ private class ElementTreeMethod extends InstanceSource {
389+ ElementTreeMethod ( ) {
390+ this = ElementTree:: instance ( ) .getMember ( [ "getroot" , "find" ] ) .getReturn ( )
391+ or
392+ this =
393+ ElementTree:: instance ( )
394+ .getMember ( [ "findall" , "getiterator" , "iter" , "iterfind" , "xpath" ] )
395+ .getReturn ( )
396+ .getASubscript ( )
397+ }
398+ }
399+
400+ /**
401+ * An additional taint step from an `Element` instance.
402+ * See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase.
403+ */
404+ private class ElementTaintStep extends TaintTracking:: AdditionalTaintStep {
405+ override predicate step ( DataFlow:: Node nodeFrom , DataFlow:: Node nodeTo ) {
406+ exists ( DataFlow:: MethodCallNode call |
407+ nodeTo = call and instance ( ) .asSource ( ) .flowsTo ( nodeFrom )
408+ |
409+ call .calls ( nodeFrom ,
410+ // We consider a node to be tainted if there could be taint anywhere in the element tree;
411+ // so sibling nodes (e.g. `getnext`) are also tainted.
412+ // This ensures nodes like `elem[0].getnext()` are tracked.
413+ [
414+ "cssselect" , "find" , "findall" , "findtext" , "get" , "getchildren" , "getiterator" ,
415+ "getnext" , "getparent" , "getprevious" , "getroottree" , "items" , "iter" ,
416+ "iterancestors" , "iterchildren" , "iterdescendants" , "itersiblings" , "iterfind" ,
417+ "itertext" , "keys" , "values" , "xpath"
418+ ] )
419+ )
420+ or
421+ exists ( DataFlow:: AttrRead attr | nodeTo = attr and instance ( ) .asSource ( ) .flowsTo ( nodeFrom ) |
422+ attr .accesses ( nodeFrom , [ "attrib" , "base" , "nsmap" , "prefix" , "tag" , "tail" , "text" ] )
423+ )
424+ }
425+ }
426+ }
427+
428+ /** Provides models for the `lxml.etree.ElementTree` class. */
429+ module ElementTree {
430+ /** Gets a reference to the `ElementTree` class. */
431+ API:: Node classRef ( ) { result = etreeRef ( ) .getMember ( [ "ElementTree" , "_ElementTree" ] ) }
432+
433+ /**
434+ * A source of `lxml.etree.ElementTree` instances; extend this class to model new instances.
435+ *
436+ * This can include instantiations of the class, return values from function
437+ * calls, or a special parameter that will be set when functions are called by an external
438+ * library.
439+ *
440+ * Use the predicate `ElementTree::instance()` to get references to instances of `lxml.etree.ElementTree` instances.
441+ */
442+ abstract class InstanceSource instanceof API:: Node {
443+ /** Gets a textual representation of this element. */
444+ string toString ( ) { result = super .toString ( ) }
445+ }
446+
447+ /** Gets a reference to an `lxml.etree.ElementTree` instance. */
448+ API:: Node instance ( ) { result instanceof InstanceSource }
449+
450+ /** An `ElementTree` instantiated directly. */
451+ private class ElementTreeInstance extends InstanceSource {
452+ ElementTreeInstance ( ) { this = classRef ( ) .getAnInstance ( ) }
453+ }
454+
455+ /** The result of a parse operation that returns an `ElementTree`. */
456+ private class ParseResult extends InstanceSource {
457+ ParseResult ( ) { this = etreeRef ( ) .getMember ( "parse" ) .getReturn ( ) }
458+ }
459+
460+ /** A call to a method on an `Element` that returns another `Element`. */
461+ private class ElementMethod extends InstanceSource {
462+ ElementMethod ( ) { this = Element:: instance ( ) .getMember ( "getroottree" ) .getReturn ( ) }
463+ }
464+
465+ /** An additional taint step from an `ElementTree` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree._ElementTree */
466+ private class ElementTaintStep extends TaintTracking:: AdditionalTaintStep {
467+ override predicate step ( DataFlow:: Node nodeFrom , DataFlow:: Node nodeTo ) {
468+ exists ( DataFlow:: MethodCallNode call |
469+ nodeTo = call and instance ( ) .asSource ( ) .flowsTo ( nodeFrom )
470+ |
471+ call .calls ( nodeFrom ,
472+ [
473+ "find" , "findall" , "findtext" , "get" , "getiterator" , "getroot" , "iter" , "iterfind" ,
474+ "xpath"
475+ ] )
476+ )
477+ or
478+ exists ( DataFlow:: AttrRead attr | nodeTo = attr and instance ( ) .asSource ( ) .flowsTo ( nodeFrom ) |
479+ attr .accesses ( nodeFrom , "docinfo" )
480+ )
481+ }
482+ }
483+ }
484+
485+ /** A call to serialise xml to a string */
486+ private class XmlEncoding extends Encoding:: Range , DataFlow:: CallCfgNode {
487+ XmlEncoding ( ) {
488+ this = etreeRef ( ) .getMember ( [ "tostring" , "tostringlist" , "tounicode" ] ) .getACall ( )
489+ }
490+
491+ override DataFlow:: Node getAnInput ( ) {
492+ result = [ this .getArg ( 0 ) , this .getArgByName ( "element_or_tree" ) ]
493+ }
494+
495+ override DataFlow:: Node getOutput ( ) { result = this }
496+
497+ override string getFormat ( ) { result = "XML" }
498+ }
499+ // TODO: ElementTree.write can write to a file-like object; should that be a flow step?
500+ // It also can accept a filepath which could be a path injection sink.
321501}
0 commit comments