4545
4646
4747class LxmlMicrodataExtractor (object ):
48+ # iterate in document order (used below for get_docid optimization)
4849 _xp_item = lxml .etree .XPath ('descendant-or-self::*[@itemscope]' )
4950 _xp_prop = lxml .etree .XPath ("""set:difference(.//*[@itemprop],
5051 .//*[@itemscope]//*[@itemprop])""" ,
@@ -62,23 +63,47 @@ def __init__(self, nested=True, strict=False, add_text_content=False, add_html_n
6263 self .add_text_content = add_text_content
6364 self .add_html_node = add_html_node
6465
65- def get_docid (self , node ):
66- return int (self ._xp_item_docid (node ))
66+ def get_docid (self , node , itemids ):
67+ try :
68+ return itemids [node ] # same as self.get_docid(node, {})
69+ except KeyError :
70+ # Even after itemids are built,
71+ # this might fail if extract_items is called on a part of the document,
72+ # and then properties reference some node which is not in itemids,
73+ # although this does not look likely in practice,
74+ # so not a performance concern.
75+ return int (self ._xp_item_docid (node ))
6776
6877 def extract (self , htmlstring , base_url = None , encoding = "UTF-8" ):
6978 tree = parse_html (htmlstring , encoding = encoding )
7079 return self .extract_items (tree , base_url )
7180
7281 def extract_items (self , document , base_url ):
82+ itemids = self ._build_itemids (document )
7383 items_seen = set ()
7484 return [
7585 item for item in (
76- self ._extract_item (it , items_seen = items_seen , base_url = base_url )
86+ self ._extract_item (
87+ it , items_seen = items_seen , base_url = base_url , itemids = itemids )
7788 for it in self ._xp_item (document ))
7889 if item ]
7990
80- def _extract_item (self , node , items_seen , base_url ):
81- itemid = self .get_docid (node )
91+ def _build_itemids (self , document ):
92+ itemid = None
93+ itemids = {}
94+ for node in self ._xp_item (document ):
95+ if itemid is None :
96+ itemid = self .get_docid (node , {})
97+ assert itemid is not None
98+ else :
99+ # this is the same as self.get_docid(node) but faster,
100+ # calling get_docid on each iteration leads to quadratic complexity
101+ itemid += 1
102+ itemids [node ] = itemid
103+ return itemids
104+
105+ def _extract_item (self , node , items_seen , base_url , itemids ):
106+ itemid = self .get_docid (node , itemids )
82107
83108 if self .nested :
84109 if itemid in items_seen :
@@ -95,21 +120,22 @@ def _extract_item(self, node, items_seen, base_url):
95120 else :
96121 item ["type" ] = types
97122
98- itemid = node .get ('itemid' )
99- if itemid :
100- item ["id" ] = itemid .strip ()
123+ nodeid = node .get ('itemid' )
124+ if nodeid :
125+ item ["id" ] = nodeid .strip ()
101126
102127 properties = collections .defaultdict (list )
103128 for name , value in self ._extract_properties (
104- node , items_seen = items_seen , base_url = base_url ):
129+ node , items_seen = items_seen , base_url = base_url , itemids = itemids ):
105130 properties [name ].append (value )
106131
107132 # process item references
108133 refs = node .get ('itemref' , '' ).split ()
109134 if refs :
110135 for refid in refs :
111136 for name , value in self ._extract_property_refs (
112- node , refid , items_seen = items_seen , base_url = base_url ):
137+ node , refid , items_seen = items_seen , base_url = base_url ,
138+ itemids = itemids ):
113139 properties [name ].append (value )
114140
115141 props = []
@@ -123,7 +149,8 @@ def _extract_item(self, node, items_seen, base_url):
123149 else :
124150 # item without properties; let's use the node itself
125151 item ["value" ] = self ._extract_property_value (
126- node , force = True , items_seen = items_seen , base_url = base_url )
152+ node , force = True , items_seen = items_seen , base_url = base_url ,
153+ itemids = itemids )
127154
128155 # below are not in the specs, but can be handy
129156 if self .add_text_content :
@@ -135,19 +162,19 @@ def _extract_item(self, node, items_seen, base_url):
135162
136163 return item
137164
138- def _extract_properties (self , node , items_seen , base_url ):
165+ def _extract_properties (self , node , items_seen , base_url , itemids ):
139166 for prop in self ._xp_prop (node ):
140167 for p , v in self ._extract_property (
141- prop , items_seen = items_seen , base_url = base_url ):
168+ prop , items_seen = items_seen , base_url = base_url , itemids = itemids ):
142169 yield p , v
143170
144- def _extract_property_refs (self , node , refid , items_seen , base_url ):
171+ def _extract_property_refs (self , node , refid , items_seen , base_url , itemids ):
145172 ref_node = node .xpath ("id($refid)[1]" , refid = refid )
146173 if not ref_node :
147174 return
148175 ref_node = ref_node [0 ]
149176 extract_fn = partial (self ._extract_property , items_seen = items_seen ,
150- base_url = base_url )
177+ base_url = base_url , itemids = itemids )
151178 if 'itemprop' in ref_node .keys () and 'itemscope' in ref_node .keys ():
152179 # An full item will be extracted from the node, no need to look
153180 # for individual properties in childs
@@ -162,20 +189,20 @@ def _extract_property_refs(self, node, refid, items_seen, base_url):
162189 for p , v in extract_fn (prop ):
163190 yield p , v
164191
165- def _extract_property (self , node , items_seen , base_url ):
192+ def _extract_property (self , node , items_seen , base_url , itemids ):
166193 props = node .get ("itemprop" ).split ()
167194 value = self ._extract_property_value (
168- node , items_seen = items_seen , base_url = base_url )
195+ node , items_seen = items_seen , base_url = base_url , itemids = itemids )
169196 return [(p , value ) for p in props ]
170197
171- def _extract_property_value (self , node , items_seen , base_url , force = False ):
198+ def _extract_property_value (self , node , items_seen , base_url , itemids , force = False ):
172199 #http://www.w3.org/TR/microdata/#values
173200 if not force and node .get ("itemscope" ) is not None :
174201 if self .nested :
175202 return self ._extract_item (
176- node , items_seen = items_seen , base_url = base_url )
203+ node , items_seen = items_seen , base_url = base_url , itemids = itemids )
177204 else :
178- return {"iid_ref" : self .get_docid (node )}
205+ return {"iid_ref" : self .get_docid (node , itemids )}
179206
180207 elif node .tag == "meta" :
181208 return node .get ("content" , "" )
0 commit comments