4545
4646
4747class LxmlMicrodataExtractor (object ):
48+ # iterate in document order (used below for fast get_docid)
4849 _xp_item = lxml .etree .XPath ('descendant-or-self::*[@itemscope]' )
4950 _xp_prop = lxml .etree .XPath ("""set:difference(.//*[@itemprop],
5051 .//*[@itemscope]//*[@itemprop])""" ,
5152 namespaces = {"set" : "http://exslt.org/sets" })
5253 _xp_clean_text = lxml .etree .XPath ('descendant-or-self::*[not(self::script or self::style)]/text()' )
53- # ancestor and preceding axes contain all elements before the context node
54- # so counting them gives the "document order" of the context node
55- _xp_item_docid = lxml .etree .XPath ("""count(preceding::*[@itemscope])
56- + count(ancestor::*[@itemscope])
57- + 1""" )
5854
5955 def __init__ (self , nested = True , strict = False , add_text_content = False , add_html_node = False ):
6056 self .nested = nested
6157 self .strict = strict
6258 self .add_text_content = add_text_content
6359 self .add_html_node = add_html_node
6460
65- def get_docid (self , node ):
66- return int (self ._xp_item_docid (node ))
67-
def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
    """Parse *htmlstring* and return the microdata items it contains.

    The markup is parsed with ``parse_html`` using *encoding*; the
    resulting tree and *base_url* are handed to ``extract_items``, whose
    result is returned unchanged.
    """
    document = parse_html(htmlstring, encoding=encoding)
    return self.extract_items(document, base_url)
7164
7265 def extract_items (self , document , base_url ):
66+ itemids = self ._build_itemids (document )
7367 items_seen = set ()
7468 return [
7569 item for item in (
76- self ._extract_item (it , items_seen = items_seen , base_url = base_url )
70+ self ._extract_item (
71+ it , items_seen = items_seen , base_url = base_url , itemids = itemids )
7772 for it in self ._xp_item (document ))
7873 if item ]
7974
80- def _extract_item (self , node , items_seen , base_url ):
81- itemid = self .get_docid (node )
75+ def get_docid (self , node , itemids ):
76+ return itemids [node ]
77+
78+ def _build_itemids (self , document ):
79+ """ Build itemids for a fast get_docid implementation. Use document order.
80+ """
81+ root = document .getroottree ().getroot ()
82+ return {node : idx + 1 for idx , node in enumerate (self ._xp_item (root ))}
83+
84+ def _extract_item (self , node , items_seen , base_url , itemids ):
85+ itemid = self .get_docid (node , itemids )
8286
8387 if self .nested :
8488 if itemid in items_seen :
@@ -95,21 +99,22 @@ def _extract_item(self, node, items_seen, base_url):
9599 else :
96100 item ["type" ] = types
97101
98- itemid = node .get ('itemid' )
99- if itemid :
100- item ["id" ] = itemid .strip ()
102+ nodeid = node .get ('itemid' )
103+ if nodeid :
104+ item ["id" ] = nodeid .strip ()
101105
102106 properties = collections .defaultdict (list )
103107 for name , value in self ._extract_properties (
104- node , items_seen = items_seen , base_url = base_url ):
108+ node , items_seen = items_seen , base_url = base_url , itemids = itemids ):
105109 properties [name ].append (value )
106110
107111 # process item references
108112 refs = node .get ('itemref' , '' ).split ()
109113 if refs :
110114 for refid in refs :
111115 for name , value in self ._extract_property_refs (
112- node , refid , items_seen = items_seen , base_url = base_url ):
116+ node , refid , items_seen = items_seen , base_url = base_url ,
117+ itemids = itemids ):
113118 properties [name ].append (value )
114119
115120 props = []
@@ -123,7 +128,8 @@ def _extract_item(self, node, items_seen, base_url):
123128 else :
124129 # item without properties; let's use the node itself
125130 item ["value" ] = self ._extract_property_value (
126- node , force = True , items_seen = items_seen , base_url = base_url )
131+ node , force = True , items_seen = items_seen , base_url = base_url ,
132+ itemids = itemids )
127133
128134 # below are not in the specs, but can be handy
129135 if self .add_text_content :
@@ -135,19 +141,19 @@ def _extract_item(self, node, items_seen, base_url):
135141
136142 return item
137143
138- def _extract_properties (self , node , items_seen , base_url ):
144+ def _extract_properties (self , node , items_seen , base_url , itemids ):
139145 for prop in self ._xp_prop (node ):
140146 for p , v in self ._extract_property (
141- prop , items_seen = items_seen , base_url = base_url ):
147+ prop , items_seen = items_seen , base_url = base_url , itemids = itemids ):
142148 yield p , v
143149
144- def _extract_property_refs (self , node , refid , items_seen , base_url ):
150+ def _extract_property_refs (self , node , refid , items_seen , base_url , itemids ):
145151 ref_node = node .xpath ("id($refid)[1]" , refid = refid )
146152 if not ref_node :
147153 return
148154 ref_node = ref_node [0 ]
149155 extract_fn = partial (self ._extract_property , items_seen = items_seen ,
150- base_url = base_url )
156+ base_url = base_url , itemids = itemids )
151157 if 'itemprop' in ref_node .keys () and 'itemscope' in ref_node .keys ():
152158 # A full item will be extracted from the node, no need to look
153159 # for individual properties in child nodes
@@ -162,20 +168,20 @@ def _extract_property_refs(self, node, refid, items_seen, base_url):
162168 for p , v in extract_fn (prop ):
163169 yield p , v
164170
165- def _extract_property (self , node , items_seen , base_url ):
171+ def _extract_property (self , node , items_seen , base_url , itemids ):
166172 props = node .get ("itemprop" ).split ()
167173 value = self ._extract_property_value (
168- node , items_seen = items_seen , base_url = base_url )
174+ node , items_seen = items_seen , base_url = base_url , itemids = itemids )
169175 return [(p , value ) for p in props ]
170176
171- def _extract_property_value (self , node , items_seen , base_url , force = False ):
177+ def _extract_property_value (self , node , items_seen , base_url , itemids , force = False ):
172178 #http://www.w3.org/TR/microdata/#values
173179 if not force and node .get ("itemscope" ) is not None :
174180 if self .nested :
175181 return self ._extract_item (
176- node , items_seen = items_seen , base_url = base_url )
182+ node , items_seen = items_seen , base_url = base_url , itemids = itemids )
177183 else :
178- return {"iid_ref" : self .get_docid (node )}
184+ return {"iid_ref" : self .get_docid (node , itemids )}
179185
180186 elif node .tag == "meta" :
181187 return node .get ("content" , "" )
0 commit comments