Skip to content

Commit 258664f

Browse files
committed
Avoid including itemprop from child itemscopes when using itemref
1 parent 3ab5592 commit 258664f

File tree

4 files changed

+152
-2
lines changed

4 files changed

+152
-2
lines changed

extruct/w3cmicrodata.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,21 @@ def _extract_properties(self, node, items_seen, base_url):
119119
yield p, v
120120

121121
def _extract_property_refs(self, node, refid, items_seen, base_url):
    """Yield ``(name, value)`` pairs contributed by one ``itemref`` target.

    The element whose id is ``refid`` is looked up from ``node`` with the
    XPath ``id()`` function; only the first match is used.  If nothing
    matches, the generator finishes without yielding anything.

    Properties are produced in this order:

    1. the referenced element itself, when it carries ``itemprop``;
    2. every descendant carrying ``itemprop`` whose nearest ``itemscope``
       ancestor is the same as the nearest ``itemscope`` found at or above
       the referenced element.

    Rule 2 keeps properties that belong to nested child items from being
    attributed to the referencing item.  Note that when the referenced
    element itself has ``itemscope``, its own direct properties satisfy
    rule 2 and are therefore yielded as well.

    ``items_seen`` and ``base_url`` are passed through unchanged to
    ``self._extract_property``.
    """
    matches = node.xpath("id($refid)[1]", refid=refid)
    if not matches:
        # Dangling reference: no element in the document has this id.
        return
    target = matches[0]

    # Closest itemscope at or above the referenced element; the result is
    # a list (possibly empty) and is compared as such below.
    target_scope = target.xpath("ancestor-or-self::*[@itemscope][1]")

    def extract(element):
        # Single place that forwards the shared extraction arguments.
        return self._extract_property(
            element, items_seen=items_seen, base_url=base_url)

    if 'itemprop' in target.keys():
        for name, value in extract(target):
            yield name, value

    for candidate in target.xpath("descendant::*[@itemprop]"):
        if candidate.xpath("ancestor::*[@itemscope][1]") != target_scope:
            # Owned by a nested item; that item reports it itself.
            continue
        for name, value in extract(candidate):
            yield name, value
126137

127138
def _extract_property(self, node, items_seen, base_url):
128139
props = node.get("itemprop").split()
product-ref.html (schema.org test sample)

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
<!DOCTYPE HTML>
2+
<html>
3+
<head>
4+
<title>Photo gallery</title>
5+
</head>
6+
<body>
7+
8+
<div id="product" itemscope itemtype="http://schema.org/Product" itemref="other-product-properties more-properties related_products">
9+
<span itemprop="brand">ACME</span>
10+
<span itemprop="name">Executive Anvil</span>
11+
<img itemprop="image" src=" anvil_executive.jpg" alt="Executive Anvil logo"/>
12+
<span itemprop="description">Sleeker than ACME's Classic Anvil, the
13+
Executive Anvil is perfect for the business traveler
14+
looking for something to drop from a height.
15+
</span>
16+
Product #: <span itemprop="mpn">925872</span>
17+
<span id="aggregateRating" itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
18+
<span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89
19+
</span> reviews
20+
</span>
21+
22+
<span id="offer" itemprop="offers" itemscope itemtype="http://schema.org/Offer">
23+
Regular price: $179.99
24+
<meta itemprop="priceCurrency" content="USD" />
25+
$<span itemprop="price">119.99 </span>
26+
(Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05">
27+
5 November!</time>)
28+
Available from: <span id="organization" itemprop="seller" itemscope itemtype="http://schema.org/Organization">
29+
<span itemprop="name">Executive Objects</span>
30+
</span>
31+
Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned,
32+
in excellent condition
33+
<link itemprop="availability" href=" http://schema.org/InStock"/>In stock! Order now!
34+
</span>
35+
</div>
36+
<div id="other-product-properties" itemscope itemtype="http://schema.org/Product" itemprop="referenced_product">
37+
<span itemprop="prop2">PROP 2</span>
38+
<img itemprop="image" src="img-2.jpg">
39+
</div>
40+
<div id="more-properties" itemscope itemtype="http://schema.org/Product">
41+
<span itemprop="prop3">PROP 3</span>
42+
<img itemprop="image" src="img-3.jpg">
43+
</div>
44+
<div id="related_products">
45+
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
46+
<span itemprop="name">REL PROD 1</span>
47+
<img itemprop="image" src="rel-prod-1.jpg">
48+
</div>
49+
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
50+
<span itemprop="name">REL PROD 2</span>
51+
<img itemprop="image" src="rel-prod-2.jpg">
52+
</div>
53+
</div>
54+
</body>
55+
</html>
product-ref.json (schema.org test sample)

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
[
2+
{
3+
"type": "http://schema.org/Product",
4+
"properties": {
5+
"referenced_product": {
6+
"type": "http://schema.org/Product",
7+
"properties": {
8+
"prop2": "PROP 2",
9+
"image": "img-2.jpg"
10+
}
11+
},
12+
"prop2": "PROP 2",
13+
"image": [
14+
"img-2.jpg",
15+
"img-3.jpg",
16+
"anvil_executive.jpg"
17+
],
18+
"prop3": "PROP 3",
19+
"related_products": [
20+
{
21+
"type": "http://schema.org/Product",
22+
"properties": {
23+
"name": "REL PROD 1",
24+
"image": "rel-prod-1.jpg"
25+
}
26+
},
27+
{
28+
"type": "http://schema.org/Product",
29+
"properties": {
30+
"name": "REL PROD 2",
31+
"image": "rel-prod-2.jpg"
32+
}
33+
}
34+
],
35+
"brand": "ACME",
36+
"name": "Executive Anvil",
37+
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
38+
"mpn": "925872",
39+
"aggregateRating": {
40+
"type": "http://schema.org/AggregateRating",
41+
"properties": {
42+
"ratingValue": "4.4",
43+
"reviewCount": "89"
44+
}
45+
},
46+
"offers": {
47+
"type": "http://schema.org/Offer",
48+
"properties": {
49+
"priceCurrency": "USD",
50+
"price": "119.99",
51+
"priceValidUntil": "2020-11-05",
52+
"seller": {
53+
"type": "http://schema.org/Organization",
54+
"properties": {
55+
"name": "Executive Objects"
56+
}
57+
},
58+
"itemCondition": "http://schema.org/UsedCondition",
59+
"availability": "http://schema.org/InStock"
60+
}
61+
}
62+
}
63+
},
64+
{
65+
"type": "http://schema.org/Product",
66+
"properties": {
67+
"prop3": "PROP 3",
68+
"image": "img-3.jpg"
69+
}
70+
}
71+
]

tests/test_microdata.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,3 +171,16 @@ def test_join_custom_url(self):
171171
mde = MicrodataExtractor()
172172
data = mde.extract(body, base_url='http://some-example.com')
173173
self.assertEqual(data, expected)
174+
175+
176+
class TestItemref(unittest.TestCase):
    """Microdata extraction for a document that uses ``itemref``.

    The ``product-ref.html`` sample is extracted with a default
    ``MicrodataExtractor`` and compared against the stored
    ``product-ref.json`` expectation.
    """

    # Show the complete diff when the large nested structures differ.
    maxDiff = None

    def test_join_none(self):
        html = get_testdata('schema.org', 'product-ref.html')
        expected_json = get_testdata('schema.org', 'product-ref.json')
        expected = json.loads(expected_json.decode('UTF-8'))

        extracted = MicrodataExtractor().extract(html)

        self.assertEqual(extracted, expected)

0 commit comments

Comments
 (0)