Skip to content

Commit 5858bf6

Browse files
committed
Fast get_docid for microdata parser
Previously, get_docid had quadratic complexity: for each node, it counted its position in the tree by iterating over all preceding nodes. Here we fix this by computing get_docid the old way only for the first element and then incrementing the id by one for each subsequent element, which gives the same result but has linear complexity and is much faster (e.g. bringing the time down from 190+ s to 6 s on the example from GH-147). Other implementation options that were discarded: (1) using some other identifier for the node - this would break backwards compatibility, as this property was exposed and documented, at least in the code; (2) instead of calling get_docid for the first element, iterating over the whole document (from the root) to build the ids - this is less efficient when extraction is called on only a part of the document.
1 parent bd49a5f commit 5858bf6

File tree

1 file changed

+47
-20
lines changed

1 file changed

+47
-20
lines changed

extruct/w3cmicrodata.py

Lines changed: 47 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545

4646

4747
class LxmlMicrodataExtractor(object):
48+
# iterate in document order (used below for get_docid optimization)
4849
_xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
4950
_xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
5051
.//*[@itemscope]//*[@itemprop])""",
@@ -62,23 +63,47 @@ def __init__(self, nested=True, strict=False, add_text_content=False, add_html_n
6263
self.add_text_content = add_text_content
6364
self.add_html_node = add_html_node
6465

65-
def get_docid(self, node):
66-
return int(self._xp_item_docid(node))
66+
def get_docid(self, node, itemids):
67+
try:
68+
return itemids[node] # same as self.get_docid(node, {})
69+
except KeyError:
70+
# Even after itemids are built,
71+
# this might fail if extract_items is called on a part of the document,
72+
# and then properties reference some node which is not in itemids,
73+
# although this does not look likely in practice,
74+
# so not a performance concern.
75+
return int(self._xp_item_docid(node))
6776

6877
def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
6978
tree = parse_html(htmlstring, encoding=encoding)
7079
return self.extract_items(tree, base_url)
7180

7281
def extract_items(self, document, base_url):
82+
itemids = self._build_itemids(document)
7383
items_seen = set()
7484
return [
7585
item for item in (
76-
self._extract_item(it, items_seen=items_seen, base_url=base_url)
86+
self._extract_item(
87+
it, items_seen=items_seen, base_url=base_url, itemids=itemids)
7788
for it in self._xp_item(document))
7889
if item]
7990

80-
def _extract_item(self, node, items_seen, base_url):
81-
itemid = self.get_docid(node)
91+
def _build_itemids(self, document):
92+
itemid = None
93+
itemids = {}
94+
for node in self._xp_item(document):
95+
if itemid is None:
96+
itemid = self.get_docid(node, {})
97+
assert itemid is not None
98+
else:
99+
# this is the same as self.get_docid(node) but faster,
100+
# calling get_docid on each iteration leads to quadratic complexity
101+
itemid += 1
102+
itemids[node] = itemid
103+
return itemids
104+
105+
def _extract_item(self, node, items_seen, base_url, itemids):
106+
itemid = self.get_docid(node, itemids)
82107

83108
if self.nested:
84109
if itemid in items_seen:
@@ -95,21 +120,22 @@ def _extract_item(self, node, items_seen, base_url):
95120
else:
96121
item["type"] = types
97122

98-
itemid = node.get('itemid')
99-
if itemid:
100-
item["id"] = itemid.strip()
123+
nodeid = node.get('itemid')
124+
if nodeid:
125+
item["id"] = nodeid.strip()
101126

102127
properties = collections.defaultdict(list)
103128
for name, value in self._extract_properties(
104-
node, items_seen=items_seen, base_url=base_url):
129+
node, items_seen=items_seen, base_url=base_url, itemids=itemids):
105130
properties[name].append(value)
106131

107132
# process item references
108133
refs = node.get('itemref', '').split()
109134
if refs:
110135
for refid in refs:
111136
for name, value in self._extract_property_refs(
112-
node, refid, items_seen=items_seen, base_url=base_url):
137+
node, refid, items_seen=items_seen, base_url=base_url,
138+
itemids=itemids):
113139
properties[name].append(value)
114140

115141
props = []
@@ -123,7 +149,8 @@ def _extract_item(self, node, items_seen, base_url):
123149
else:
124150
# item without properties; let's use the node itself
125151
item["value"] = self._extract_property_value(
126-
node, force=True, items_seen=items_seen, base_url=base_url)
152+
node, force=True, items_seen=items_seen, base_url=base_url,
153+
itemids=itemids)
127154

128155
# below are not in the specs, but can be handy
129156
if self.add_text_content:
@@ -135,19 +162,19 @@ def _extract_item(self, node, items_seen, base_url):
135162

136163
return item
137164

138-
def _extract_properties(self, node, items_seen, base_url):
165+
def _extract_properties(self, node, items_seen, base_url, itemids):
139166
for prop in self._xp_prop(node):
140167
for p, v in self._extract_property(
141-
prop, items_seen=items_seen, base_url=base_url):
168+
prop, items_seen=items_seen, base_url=base_url, itemids=itemids):
142169
yield p, v
143170

144-
def _extract_property_refs(self, node, refid, items_seen, base_url):
171+
def _extract_property_refs(self, node, refid, items_seen, base_url, itemids):
145172
ref_node = node.xpath("id($refid)[1]", refid=refid)
146173
if not ref_node:
147174
return
148175
ref_node = ref_node[0]
149176
extract_fn = partial(self._extract_property, items_seen=items_seen,
150-
base_url=base_url)
177+
base_url=base_url, itemids=itemids)
151178
if 'itemprop' in ref_node.keys() and 'itemscope' in ref_node.keys():
152179
# A full item will be extracted from the node, no need to look
153180
# for individual properties in children
@@ -162,20 +189,20 @@ def _extract_property_refs(self, node, refid, items_seen, base_url):
162189
for p, v in extract_fn(prop):
163190
yield p, v
164191

165-
def _extract_property(self, node, items_seen, base_url):
192+
def _extract_property(self, node, items_seen, base_url, itemids):
166193
props = node.get("itemprop").split()
167194
value = self._extract_property_value(
168-
node, items_seen=items_seen, base_url=base_url)
195+
node, items_seen=items_seen, base_url=base_url, itemids=itemids)
169196
return [(p, value) for p in props]
170197

171-
def _extract_property_value(self, node, items_seen, base_url, force=False):
198+
def _extract_property_value(self, node, items_seen, base_url, itemids, force=False):
172199
#http://www.w3.org/TR/microdata/#values
173200
if not force and node.get("itemscope") is not None:
174201
if self.nested:
175202
return self._extract_item(
176-
node, items_seen=items_seen, base_url=base_url)
203+
node, items_seen=items_seen, base_url=base_url, itemids=itemids)
177204
else:
178-
return {"iid_ref": self.get_docid(node)}
205+
return {"iid_ref": self.get_docid(node, itemids)}
179206

180207
elif node.tag == "meta":
181208
return node.get("content", "")

0 commit comments

Comments
 (0)