|
10 | 10 | """ |
11 | 11 |
|
12 | 12 | import collections |
| 13 | +from functools import partial |
| 14 | + |
13 | 15 | try: |
14 | 16 | from urlparse import urljoin |
15 | 17 | except ImportError: |
@@ -77,18 +79,18 @@ def _extract_item(self, node, items_seen, base_url): |
77 | 79 | item["id"] = itemid.strip() |
78 | 80 |
|
79 | 81 | properties = collections.defaultdict(list) |
80 | | - # start with item references |
| 82 | + for name, value in self._extract_properties( |
| 83 | + node, items_seen=items_seen, base_url=base_url): |
| 84 | + properties[name].append(value) |
| 85 | + |
| 86 | + # process item references |
81 | 87 | refs = node.get('itemref', '').split() |
82 | 88 | if refs: |
83 | 89 | for refid in refs: |
84 | 90 | for name, value in self._extract_property_refs( |
85 | 91 | node, refid, items_seen=items_seen, base_url=base_url): |
86 | 92 | properties[name].append(value) |
87 | 93 |
|
88 | | - for name, value in self._extract_properties( |
89 | | - node, items_seen=items_seen, base_url=base_url): |
90 | | - properties[name].append(value) |
91 | | - |
92 | 94 | props = [] |
93 | 95 | for (name, values) in properties.items(): |
94 | 96 | if not self.strict and len(values) == 1: |
@@ -119,10 +121,25 @@ def _extract_properties(self, node, items_seen, base_url): |
119 | 121 | yield p, v |
120 | 122 |
|
121 | 123 | def _extract_property_refs(self, node, refid, items_seen, base_url): |
122 | | - for prop in node.xpath("id($refid)/descendant-or-self::*[@itemprop]", refid=refid): |
123 | | - for p, v in self._extract_property( |
124 | | - prop, items_seen=items_seen, base_url=base_url): |
| 124 | + ref_node = node.xpath("id($refid)[1]", refid=refid) |
| 125 | + if not ref_node: |
| 126 | + return |
| 127 | + ref_node = ref_node[0] |
| 128 | + extract_fn = partial(self._extract_property, items_seen=items_seen, |
| 129 | + base_url=base_url) |
| 130 | + if 'itemprop' in ref_node.keys() and 'itemscope' in ref_node.keys(): |
| 131 | + # A full item will be extracted from the node, no need to look |
| 132 | + # for individual properties in children |
| 133 | + for p, v in extract_fn(ref_node): |
125 | 134 | yield p, v |
| 135 | + else: |
| 136 | + base_parent_scope = ref_node.xpath("ancestor-or-self::*[@itemscope][1]") |
| 137 | + for prop in ref_node.xpath("descendant-or-self::*[@itemprop]"): |
| 138 | + parent_scope = prop.xpath("ancestor::*[@itemscope][1]") |
| 139 | + # Skip properties defined in a different scope than the ref_node |
| 140 | + if parent_scope == base_parent_scope: |
| 141 | + for p, v in extract_fn(prop): |
| 142 | + yield p, v |
126 | 143 |
|
127 | 144 | def _extract_property(self, node, items_seen, base_url): |
128 | 145 | props = node.get("itemprop").split() |
|
0 commit comments