Skip to content

Commit 6f2e2d2

Browse files
committed
Simplify node id generation for microdata parsing
Thanks for the suggestion, @ivanprado
1 parent 2e5daae commit 6f2e2d2

File tree

1 file changed

+8
-29
lines changed

1 file changed

+8
-29
lines changed

extruct/w3cmicrodata.py

Lines changed: 8 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -45,35 +45,19 @@
4545

4646

4747
class LxmlMicrodataExtractor(object):
48-
# iterate in document order (used below for get_docid optimization)
48+
# iterate in document order (used below for fast get_docid)
4949
_xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
5050
_xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
5151
.//*[@itemscope]//*[@itemprop])""",
5252
namespaces = {"set": "http://exslt.org/sets"})
5353
_xp_clean_text = lxml.etree.XPath('descendant-or-self::*[not(self::script or self::style)]/text()')
54-
# ancestor and preceding axes contain all elements before the context node
55-
# so counting them gives the "document order" of the context node
56-
_xp_item_docid = lxml.etree.XPath("""count(preceding::*[@itemscope])
57-
+ count(ancestor::*[@itemscope])
58-
+ 1""")
5954

6055
def __init__(self, nested=True, strict=False, add_text_content=False, add_html_node=False):
6156
self.nested = nested
6257
self.strict = strict
6358
self.add_text_content = add_text_content
6459
self.add_html_node = add_html_node
6560

66-
def get_docid(self, node, itemids):
67-
try:
68-
return itemids[node] # same as self.get_docid(node, {})
69-
except KeyError:
70-
# Even after itemids are built,
71-
# this might fail if extract_items is called on a part of the document,
72-
# and then properties reference some node which is not in itemids,
73-
# although this does not look likely in practice,
74-
# so not a performance concern.
75-
return int(self._xp_item_docid(node))
76-
7761
def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
7862
tree = parse_html(htmlstring, encoding=encoding)
7963
return self.extract_items(tree, base_url)
@@ -88,19 +72,14 @@ def extract_items(self, document, base_url):
8872
for it in self._xp_item(document))
8973
if item]
9074

75+
def get_docid(self, node, itemids):
76+
return itemids[node]
77+
9178
def _build_itemids(self, document):
92-
itemid = None
93-
itemids = {}
94-
for node in self._xp_item(document):
95-
if itemid is None:
96-
itemid = self.get_docid(node, {})
97-
assert itemid is not None
98-
else:
99-
# this is the same as self.get_docid(node) but faster,
100-
# calling get_docid on each iteration leads to quadratic complexity
101-
itemid += 1
102-
itemids[node] = itemid
103-
return itemids
79+
""" Build itemids for a fast get_docid implementation. Use document order.
80+
"""
81+
root = document.getroottree().getroot()
82+
return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))}
10483

10584
def _extract_item(self, node, items_seen, base_url, itemids):
10685
itemid = self.get_docid(node, itemids)

0 commit comments

Comments
 (0)