Skip to content

Commit b99b14d

Browse files
authored
Merge pull request #105 from ivanprado/master
Avoid including itemprop from child itemscopes when using itemref
2 parents 3ab5592 + e655b0a commit b99b14d

File tree

5 files changed

+163
-9
lines changed

5 files changed

+163
-9
lines changed

extruct/w3cmicrodata.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
"""
1111

1212
import collections
13+
from functools import partial
14+
1315
try:
1416
from urlparse import urljoin
1517
except ImportError:
@@ -77,18 +79,18 @@ def _extract_item(self, node, items_seen, base_url):
7779
item["id"] = itemid.strip()
7880

7981
properties = collections.defaultdict(list)
80-
# start with item references
82+
for name, value in self._extract_properties(
83+
node, items_seen=items_seen, base_url=base_url):
84+
properties[name].append(value)
85+
86+
# process item references
8187
refs = node.get('itemref', '').split()
8288
if refs:
8389
for refid in refs:
8490
for name, value in self._extract_property_refs(
8591
node, refid, items_seen=items_seen, base_url=base_url):
8692
properties[name].append(value)
8793

88-
for name, value in self._extract_properties(
89-
node, items_seen=items_seen, base_url=base_url):
90-
properties[name].append(value)
91-
9294
props = []
9395
for (name, values) in properties.items():
9496
if not self.strict and len(values) == 1:
@@ -119,10 +121,25 @@ def _extract_properties(self, node, items_seen, base_url):
119121
yield p, v
120122

121123
def _extract_property_refs(self, node, refid, items_seen, base_url):
122-
for prop in node.xpath("id($refid)/descendant-or-self::*[@itemprop]", refid=refid):
123-
for p, v in self._extract_property(
124-
prop, items_seen=items_seen, base_url=base_url):
124+
ref_node = node.xpath("id($refid)[1]", refid=refid)
125+
if not ref_node:
126+
return
127+
ref_node = ref_node[0]
128+
extract_fn = partial(self._extract_property, items_seen=items_seen,
129+
base_url=base_url)
130+
if 'itemprop' in ref_node.keys() and 'itemscope' in ref_node.keys():
131+
# A full item will be extracted from the node, no need to look
132+
# for individual properties in children
133+
for p, v in extract_fn(ref_node):
125134
yield p, v
135+
else:
136+
base_parent_scope = ref_node.xpath("ancestor-or-self::*[@itemscope][1]")
137+
for prop in ref_node.xpath("descendant-or-self::*[@itemprop]"):
138+
parent_scope = prop.xpath("ancestor::*[@itemscope][1]")
139+
# Skip properties defined in a different scope than the ref_node
140+
if parent_scope == base_parent_scope:
141+
for p, v in extract_fn(prop):
142+
yield p, v
126143

127144
def _extract_property(self, node, items_seen, base_url):
128145
props = node.get("itemprop").split()
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
<!DOCTYPE HTML>
2+
<html>
3+
<head>
4+
<title>Photo gallery</title>
5+
</head>
6+
<body>
7+
8+
<div id="product" itemscope itemtype="http://schema.org/Product" itemref="referenced-product more-properties related_products non-existing-ref">
9+
<span itemprop="brand">ACME</span>
10+
<span itemprop="name">Executive Anvil</span>
11+
<img itemprop="image" src=" anvil_executive.jpg" alt="Executive Anvil logo"/>
12+
<span itemprop="description">Sleeker than ACME's Classic Anvil, the
13+
Executive Anvil is perfect for the business traveler
14+
looking for something to drop from a height.
15+
</span>
16+
Product #: <span itemprop="mpn">925872</span>
17+
<span id="aggregateRating" itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
18+
<span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89
19+
</span> reviews
20+
</span>
21+
22+
<span id="offer" itemprop="offers" itemscope itemtype="http://schema.org/Offer">
23+
Regular price: $179.99
24+
<meta itemprop="priceCurrency" content="USD" />
25+
$<span itemprop="price">119.99 </span>
26+
(Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05">
27+
5 November!</time>)
28+
Available from: <span id="organization" itemprop="seller" itemscope itemtype="http://schema.org/Organization">
29+
<span itemprop="name">Executive Objects</span>
30+
</span>
31+
Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned,
32+
in excellent condition
33+
<link itemprop="availability" href=" http://schema.org/InStock"/>In stock! Order now!
34+
</span>
35+
</div>
36+
<div id="referenced-product" itemscope itemtype="http://schema.org/Product" itemprop="referenced_product">
37+
<span itemprop="name">REFERENCED PRODUCT</span>
38+
<img itemprop="image" src="img-ref.jpg">
39+
</div>
40+
<div id="more-properties" itemscope itemtype="http://schema.org/Product">
41+
<span itemprop="prop3">REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT</span>
42+
<img itemprop="image" src="img-2.jpg">
43+
</div>
44+
<div id="related_products">
45+
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
46+
<span itemprop="name">REL PROD 1</span>
47+
<img itemprop="image" src="rel-prod-1.jpg">
48+
</div>
49+
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
50+
<span itemprop="name">REL PROD 2</span>
51+
<img itemprop="image" src="rel-prod-2.jpg">
52+
</div>
53+
</div>
54+
</body>
55+
</html>
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
[
2+
{
3+
"type": "http://schema.org/Product",
4+
"properties": {
5+
"referenced_product": {
6+
"type": "http://schema.org/Product",
7+
"properties": {
8+
"name": "REFERENCED PRODUCT",
9+
"image": "img-ref.jpg"
10+
}
11+
},
12+
"prop3": "REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT",
13+
"image": [
14+
"anvil_executive.jpg",
15+
"img-2.jpg"
16+
],
17+
"related_products": [
18+
{
19+
"type": "http://schema.org/Product",
20+
"properties": {
21+
"name": "REL PROD 1",
22+
"image": "rel-prod-1.jpg"
23+
}
24+
},
25+
{
26+
"type": "http://schema.org/Product",
27+
"properties": {
28+
"name": "REL PROD 2",
29+
"image": "rel-prod-2.jpg"
30+
}
31+
}
32+
],
33+
"brand": "ACME",
34+
"name": "Executive Anvil",
35+
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
36+
"mpn": "925872",
37+
"aggregateRating": {
38+
"type": "http://schema.org/AggregateRating",
39+
"properties": {
40+
"ratingValue": "4.4",
41+
"reviewCount": "89"
42+
}
43+
},
44+
"offers": {
45+
"type": "http://schema.org/Offer",
46+
"properties": {
47+
"priceCurrency": "USD",
48+
"price": "119.99",
49+
"priceValidUntil": "2020-11-05",
50+
"seller": {
51+
"type": "http://schema.org/Organization",
52+
"properties": {
53+
"name": "Executive Objects"
54+
}
55+
},
56+
"itemCondition": "http://schema.org/UsedCondition",
57+
"availability": "http://schema.org/InStock"
58+
}
59+
}
60+
}
61+
},
62+
{
63+
"type": "http://schema.org/Product",
64+
"properties": {
65+
"prop3": "REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT",
66+
"image": "img-2.jpg"
67+
}
68+
}
69+
]

tests/samples/w3c/microdata.5.3.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22
{"properties": {"a": ["1", "2"], "b": ["test"]}},
33
{"properties": {"a": ["1", "2"], "b": ["test"]}},
44
{"properties": {"a": ["1", "2"], "b": ["test"]}},
5-
{"properties": {"a": ["1", "2"], "b": ["test"]}}
5+
{"properties": {"a": ["2", "1"], "b": ["test"]}}
66
]
77

tests/test_microdata.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,3 +171,16 @@ def test_join_custom_url(self):
171171
mde = MicrodataExtractor()
172172
data = mde.extract(body, base_url='http://some-example.com')
173173
self.assertEqual(data, expected)
174+
175+
176+
class TestItemref(unittest.TestCase):
177+
178+
maxDiff = None
179+
180+
def test_join_none(self):
181+
body = get_testdata('schema.org', 'product-ref.html')
182+
expected = json.loads(get_testdata('schema.org', 'product-ref.json').decode('UTF-8'))
183+
184+
mde = MicrodataExtractor()
185+
data = mde.extract(body)
186+
self.assertEqual(data, expected)

0 commit comments

Comments
 (0)