4545
4646
4747class LxmlMicrodataExtractor (object ):
48- # iterate in document order (used below for get_docid optimization )
48+ # iterate in document order (used below for fast get_docid )
4949 _xp_item = lxml .etree .XPath ('descendant-or-self::*[@itemscope]' )
5050 _xp_prop = lxml .etree .XPath ("""set:difference(.//*[@itemprop],
5151 .//*[@itemscope]//*[@itemprop])""" ,
5252 namespaces = {"set" : "http://exslt.org/sets" })
5353 _xp_clean_text = lxml .etree .XPath ('descendant-or-self::*[not(self::script or self::style)]/text()' )
54- # ancestor and preceding axes contain all elements before the context node
55- # so counting them gives the "document order" of the context node
56- _xp_item_docid = lxml .etree .XPath ("""count(preceding::*[@itemscope])
57- + count(ancestor::*[@itemscope])
58- + 1""" )
5954
6055 def __init__ (self , nested = True , strict = False , add_text_content = False , add_html_node = False ):
6156 self .nested = nested
6257 self .strict = strict
6358 self .add_text_content = add_text_content
6459 self .add_html_node = add_html_node
6560
66- def get_docid (self , node , itemids ):
67- try :
68- return itemids [node ] # same as self.get_docid(node, {})
69- except KeyError :
70- # Even after itemids are built,
71- # this might fail if extract_items is called on a part of the document,
72- # and then properties reference some node which is not in itemids,
73- # although this does not look likely in practice,
74- # so not a performance concern.
75- return int (self ._xp_item_docid (node ))
76-
7761 def extract (self , htmlstring , base_url = None , encoding = "UTF-8" ):
7862 tree = parse_html (htmlstring , encoding = encoding )
7963 return self .extract_items (tree , base_url )
@@ -88,19 +72,14 @@ def extract_items(self, document, base_url):
8872 for it in self ._xp_item (document ))
8973 if item ]
9074
75+ def get_docid (self , node , itemids ):
76+ return itemids [node ]
77+
9178 def _build_itemids (self , document ):
92- itemid = None
93- itemids = {}
94- for node in self ._xp_item (document ):
95- if itemid is None :
96- itemid = self .get_docid (node , {})
97- assert itemid is not None
98- else :
99- # this is the same as self.get_docid(node) but faster,
100- # calling get_docid on each iteration leads to quadratic complexity
101- itemid += 1
102- itemids [node ] = itemid
103- return itemids
79+ """ Build itemids for a fast get_docid implementation. Use document order.
80+ """
81+ root = document .getroottree ().getroot ()
82+ return {node : idx + 1 for idx , node in enumerate (self ._xp_item (root ))}
10483
10584 def _extract_item (self , node , items_seen , base_url , itemids ):
10685 itemid = self .get_docid (node , itemids )
0 commit comments