Skip to content

Commit a91b73f

Browse files
authored
Merge branch 'master' into master
2 parents 4a4bfad + a64ce58 commit a91b73f

File tree

7 files changed

+229
-10
lines changed

7 files changed

+229
-10
lines changed

extruct/_extruct.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def extract(htmlstring,
2121
uniform=False,
2222
return_html_node=False,
2323
schema_context='http://schema.org',
24+
with_og_array=False,
2425
**kwargs):
2526
"""htmlstring: string with valid html document;
2627
base_url: base url of the html document
@@ -134,7 +135,7 @@ def extract(htmlstring,
134135
for syntax, uniform, raw, schema_context in uniform_processors:
135136
try:
136137
if syntax == 'opengraph':
137-
output[syntax] = uniform(raw)
138+
output[syntax] = uniform(raw, with_og_array=with_og_array)
138139
else:
139140
output[syntax] = uniform(raw, schema_context)
140141
except Exception as e:

extruct/rdfa.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,13 @@
2626

2727
initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns.update({
2828
"twitter": "https://dev.twitter.com/cards#",
29-
"fb": "http://ogp.me/ns/fb#"
29+
"fb": "http://ogp.me/ns/fb#",
30+
"og": "http://ogp.me/ns#",
31+
"music": "http://ogp.me/ns/music#",
32+
"video": "http://ogp.me/ns/video#",
33+
"article": "http://ogp.me/ns/article#",
34+
"book": "http://ogp.me/ns/book#",
35+
"profile": "http://ogp.me/ns/profile#"
3036
})
3137

3238

@@ -143,7 +149,6 @@ def extract_items(self, document, base_url=None, expanded=True):
143149
vocab_cache_report=False,
144150
refresh_vocab_cache=False,
145151
check_lite=False)
146-
147152
g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
148153
jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8')
149154

extruct/uniform.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,29 @@
11
from six.moves.urllib.parse import urlparse, urljoin
22

33

4-
def _uopengraph(extracted):
4+
def _uopengraph(extracted, with_og_array=False):
55
out = []
66
for obj in extracted:
77
# In order of appearance in the page
8-
properties = list(reversed(obj['properties']))
9-
# Ensuring that never empty value is returned if there is a duplicated
10-
# property with non empty value
11-
non_empty_props = {k for k, v in properties if v and v.strip()}
12-
flattened = {k: v for k, v in properties
13-
if k not in non_empty_props or (v and v.strip())}
8+
properties = list(obj['properties'])
9+
flattened = {}
10+
11+
for k, v in properties:
12+
if k not in flattened.keys():
13+
flattened[k] = v
14+
elif v and v.strip():
15+
# Without with_og_array, only replace an earlier empty value, so the first non-empty value wins
16+
if not with_og_array:
17+
if not flattened[k] or not flattened[k].strip():
18+
flattened[k] = v
19+
else:
20+
if isinstance(flattened[k], list):
21+
flattened[k].append(v)
22+
elif flattened[k] and flattened[k].strip():
23+
flattened[k] = [flattened[k], v]
24+
else:
25+
flattened[k] = v
26+
1427
t = flattened.pop('og:type', None)
1528
if t:
1629
flattened['@type'] = t
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
2+
<html xmlns="https://www.w3.org/1999/xhtml" xmlns:og="https://ogp.me/ns#" xmlns:fb="https://www.facebook.com/2008/fbml">
3+
<head>
4+
<title>Himanshu's Open Graph Protocol</title>
5+
<meta http-equiv="Content-Type" content="text/html;charset=WINDOWS-1252" />
6+
<meta http-equiv="Content-Language" content="en-us" />
7+
<link rel="stylesheet" type="text/css" href="event-education.css" />
8+
<meta name="verify-v1" content="so4y/3aLT7/7bUUB9f6iVXN0tv8upRwaccek7JKB1gs=" >
9+
<meta property="og:image" content="https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg"/>
10+
<meta property="fb:admins" content="himanshu160"/>
11+
<meta property="og:site_name" content="Event Education"/>
12+
13+
<meta property="og:url" content="http://www.nytimes.com/2016/12/15/arts/music/from-steet-theater-to-wagner-on-the-opera-stage.html" />
14+
<meta property="og:type" content="article" />
15+
<meta property="og:title" content="From Street Theater to Wagner on the Opera Stage" />
16+
<meta property="og:description" content="which he set in Bangladesh instead of Norway. The production opens in Madrid on Saturday." />
17+
<meta property="article:published" itemprop="datePublished" content="2016-12-15T05:55:55-05:00" />
18+
<meta property="article:modified" itemprop="dateModified" content="2016-12-15T06:19:30-05:00" />
19+
<meta property="article:section" itemprop="articleSection" content="Music" />
20+
<meta property="article:section-taxonomy-id" itemprop="articleSection" content="C5BFA7D5-359C-427B-90E6-6B7245A6CDD8" />
21+
<meta property="article:section_url" content="http://www.nytimes.com/section/arts" />
22+
<meta property="article:top-level-section" content="arts" />
23+
<meta property="fb:app_id" content="9869919170" />
24+
<meta property="music:duration" content="60" />
25+
<meta property="video:tag" content="Exhilerating" />
26+
<meta property="book:release_date" content="2016-12-15T06:19:30-05:00" />
27+
<meta property="profile:first_name" content="John" />
28+
<meta property="profile:last_name" content="Lennon" />
29+
</head>
30+
<body>
31+
<div id="fb-root"></div>
32+
<script>(function(d, s, id) {
33+
var js, fjs = d.getElementsByTagName(s)[0];
34+
if (d.getElementById(id)) return;
35+
js = d.createElement(s); js.id = id;
36+
js.src = "//connect.facebook.net/en_US/all.js#xfbml=1&appId=501839739845103";
37+
fjs.parentNode.insertBefore(js, fjs);
38+
}(document, 'script', 'facebook-jssdk'));</script>
39+
</body>
40+
</html>
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
[
2+
{
3+
"https://ogp.me/ns#url": [
4+
{
5+
"@value": "http://www.nytimes.com/2016/12/15/arts/music/from-steet-theater-to-wagner-on-the-opera-stage.html"
6+
}
7+
],
8+
"http://ogp.me/ns/profile#first_name": [
9+
{
10+
"@value": "John"
11+
}
12+
],
13+
"https://ogp.me/ns#type": [
14+
{
15+
"@value": "article"
16+
}
17+
],
18+
"http://ogp.me/ns/article#section": [
19+
{
20+
"@value": "Music"
21+
}
22+
],
23+
"http://ogp.me/ns/music#duration": [
24+
{
25+
"@value": "60"
26+
}
27+
],
28+
"http://ogp.me/ns/article#modified": [
29+
{
30+
"@value": "2016-12-15T06:19:30-05:00"
31+
}
32+
],
33+
"http://ogp.me/ns/video#tag": [
34+
{
35+
"@value": "Exhilerating"
36+
}
37+
],
38+
"https://ogp.me/ns#site_name": [
39+
{
40+
"@value": "Event Education"
41+
}
42+
],
43+
"http://ogp.me/ns/profile#last_name": [
44+
{
45+
"@value": "Lennon"
46+
}
47+
],
48+
"https://www.facebook.com/2008/fbmladmins": [
49+
{
50+
"@value": "himanshu160"
51+
}
52+
],
53+
"http://ogp.me/ns/article#section_url": [
54+
{
55+
"@value": "http://www.nytimes.com/section/arts"
56+
}
57+
],
58+
"https://ogp.me/ns#title": [
59+
{
60+
"@value": "From Street Theater to Wagner on the Opera Stage"
61+
}
62+
],
63+
"https://www.facebook.com/2008/fbmlapp_id": [
64+
{
65+
"@value": "9869919170"
66+
}
67+
],
68+
"https://ogp.me/ns#image": [
69+
{
70+
"@value": "https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg"
71+
}
72+
],
73+
"http://ogp.me/ns/book#release_date": [
74+
{
75+
"@value": "2016-12-15T06:19:30-05:00"
76+
}
77+
],
78+
"http://ogp.me/ns/article#section-taxonomy-id": [
79+
{
80+
"@value": "C5BFA7D5-359C-427B-90E6-6B7245A6CDD8"
81+
}
82+
],
83+
"http://ogp.me/ns/article#published": [
84+
{
85+
"@value": "2016-12-15T05:55:55-05:00"
86+
}
87+
],
88+
"https://ogp.me/ns#description": [
89+
{
90+
"@value": "which he set in Bangladesh instead of Norway. The production opens in Madrid on Saturday."
91+
}
92+
],
93+
"@id": "http://www.example.com/index.html",
94+
"http://ogp.me/ns/article#top-level-section": [
95+
{
96+
"@value": "arts"
97+
}
98+
]
99+
}
100+
]

tests/test_rdfa.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,14 @@ def test_wikipedia_xhtml_rdfa_no_prefix(self):
100100
data = rdfae.extract(body, base_url='http://nielslubberman.nl/drupal/')
101101

102102
self.assertJsonLDEqual(data, expected)
103+
104+
def test_expanded_opengraph_support(self):
105+
body = get_testdata('misc','expanded_OG_support_test.html')
106+
expected = json.loads(
107+
get_testdata('misc','expanded_OG_support_test.json'
108+
).decode('UTF-8'))
109+
110+
rdfae = RDFaExtractor()
111+
data = rdfae.extract(body, base_url='http://www.example.com/index.html')
112+
113+
self.assertJsonLDEqual(data,expected)

tests/test_uniform.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,25 @@ def test_uopengraph(self):
2727
data = extruct.extract(body, syntaxes=['opengraph'], uniform=True)
2828
self.assertEqual(data['opengraph'], expected)
2929

30+
def test_uopengraph_with_og_array(self):
31+
expected = [{"@context": {
32+
"og": "http://ogp.me/ns#",
33+
"fb": "http://www.facebook.com/2008/fbml",
34+
"concerts": "http://ogp.me/ns/fb/songkick-concerts#"
35+
},
36+
"fb:app_id": "308540029359",
37+
"og:site_name": "Songkick",
38+
"@type": "songkick-concerts:artist",
39+
"og:title": "Elysian Fields",
40+
"og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.",
41+
"og:url": "http://www.songkick.com/artists/236156-elysian-fields",
42+
"og:image": [ "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg",
43+
"http://images.sk-static.com/SECONDARY_IMAGE.jpg"],
44+
}]
45+
body = get_testdata('songkick', 'elysianfields.html')
46+
data = extruct.extract(body, syntaxes=['opengraph'], uniform=True, with_og_array=True)
47+
self.assertEqual(data['opengraph'], expected)
48+
3049
def test_uopengraph_duplicated_priorities(self):
3150
# Ensures that first seen property is kept when flattening
3251
data = _uopengraph([{'properties':
@@ -58,6 +77,36 @@ def test_uopengraph_duplicated_priorities(self):
5877
assert data[0]['prop_non_empty2'] == 'value!'
5978
assert data[0]['prop_non_empty3'] == 'value!'
6079

80+
def test_uopengraph_duplicated_with_og_array(self):
81+
# Ensures that all values of a duplicated property are collected into a list, in order of appearance
82+
data = _uopengraph([{'properties':
83+
[('prop_{}'.format(k), 'value_{}'.format(v))
84+
for k in range(5)
85+
for v in range(5)],
86+
'namespace': 'namespace'}], with_og_array=True)
87+
for k in range(5):
88+
assert data[0]['prop_{}'.format(k)] == ['value_0', 'value_1', 'value_2', 'value_3', 'value_4']
89+
90+
# Ensures that empty values are never collected: a property with a single
91+
# non-empty value stays a plain string, and several non-empty values become a list
92+
data = _uopengraph([{'properties':
93+
[('prop_empty', ' '),
94+
95+
('prop_non_empty', ' '),
96+
('prop_non_empty', 'value!'),
97+
98+
('prop_non_empty2', 'value!'),
99+
('prop_non_empty2', ' '),
100+
101+
('prop_non_empty3', ' '),
102+
('prop_non_empty3', 'value!'),
103+
('prop_non_empty3', 'other value'),
104+
],
105+
'namespace': 'namespace'}], with_og_array=True)
106+
assert data[0]['prop_empty'] == ' '
107+
assert data[0]['prop_non_empty'] == 'value!'
108+
assert data[0]['prop_non_empty2'] == 'value!'
109+
assert data[0]['prop_non_empty3'] == ['value!', 'other value']
61110

62111
def test_umicroformat(self):
63112
expected = [ { '@context': 'http://microformats.org/wiki/',

0 commit comments

Comments
 (0)