44
55Based on pyrdfa3 and rdflib
66"""
7+ from collections import defaultdict
8+
79import json
810import logging
11+ import re
912
1013rdflib_logger = logging .getLogger ('rdflib' )
1114rdflib_logger .setLevel (logging .ERROR )
3538
3639class RDFaExtractor (object ):
3740
41+ def _replaceNS (self , prop , html_element , head_element ):
42+ """Expand namespace to match with returned json (e.g.: og -> 'http://ogp.me/ns#')"""
43+
44+ # context namespaces taken from pyrdfa3
45+ # https://github.com/RDFLib/PyRDFa/blob/master/pyRdfa/initialcontext.py
46+ context = {
47+ 'owl' : 'http://www.w3.org/2002/07/owl#' ,
48+ 'gr' : 'http://purl.org/goodrelations/v1#' ,
49+ 'ctag' : 'http://commontag.org/ns#' ,
50+ 'cc' : 'http://creativecommons.org/ns#' ,
51+ 'grddl' : 'http://www.w3.org/2003/g/data-view#' ,
52+ 'rif' : 'http://www.w3.org/2007/rif#' ,
53+ 'sioc' : 'http://rdfs.org/sioc/ns#' ,
54+ 'skos' : 'http://www.w3.org/2004/02/skos/core#' ,
55+ 'xml' : 'http://www.w3.org/XML/1998/namespace' ,
56+ 'rdfs' : 'http://www.w3.org/2000/01/rdf-schema#' ,
57+ 'rev' : 'http://purl.org/stuff/rev#' ,
58+ 'rdfa' : 'http://www.w3.org/ns/rdfa#' ,
59+ 'dc' : 'http://purl.org/dc/terms/' ,
60+ 'foaf' : 'http://xmlns.com/foaf/0.1/' ,
61+ 'void' : 'http://rdfs.org/ns/void#' ,
62+ 'ical' : 'http://www.w3.org/2002/12/cal/icaltzd#' ,
63+ 'vcard' : 'http://www.w3.org/2006/vcard/ns#' ,
64+ 'wdrs' : 'http://www.w3.org/2007/05/powder-s#' ,
65+ 'og' : 'http://ogp.me/ns#' ,
66+ 'wdr' : 'http://www.w3.org/2007/05/powder#' ,
67+ 'rdf' : 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' ,
68+ 'xhv' : 'http://www.w3.org/1999/xhtml/vocab#' ,
69+ 'xsd' : 'http://www.w3.org/2001/XMLSchema#' ,
70+ 'v' : 'http://rdf.data-vocabulary.org/#' ,
71+ 'skosxl' : 'http://www.w3.org/2008/05/skos-xl#' ,
72+ 'schema' : 'http://schema.org/' ,
73+ }
74+
75+ # if bad property
76+ if ':' not in prop :
77+ return prop
78+
79+ # if property has no prefix
80+ if 'http://' in prop :
81+ return prop
82+
83+ prefix = prop .split (':' )[0 ]
84+
85+ match = None
86+ if head_element .get ('prefix' ):
87+ match = re .search (prefix + ': [^\s]+' , head_element .get ('prefix' ))
88+
89+ # if namespace taken from prefix attribute in head tag
90+ if match :
91+ ns = match .group ().split (': ' )[1 ]
92+ return ns + prop .split (':' )[1 ]
93+
94+ # if namespace taken from xmlns attribute in html tag
95+ if ('xmlns:' + prefix ) in html_element .keys ():
96+ return html_element .get ('xmlns:' + prefix ) + prop .split (':' )[1 ]
97+
98+ # if namespace present in inital context
99+ if prefix in context :
100+ return context [prefix ] + prop .split (':' )[1 ]
101+
102+ return prop
103+
104+ def _sort (self , unordered , ordered ):
105+ """Sort the rdfa tags in jsonld string"""
106+ idx_for_value = dict (reversed ([(value , idx ) for idx , value in enumerate (ordered )]))
107+ unordered .sort (key = lambda props : idx_for_value .get (props .get ('@value' ), len (ordered )))
108+
109+
110+ def _fix_order (self , jsonld_string , document ):
111+ """
112+ Fix order of rdfa tags in jsonld string
113+ by checking the appearance order in the HTML
114+ """
115+ json_objects = json .loads (jsonld_string )
116+
117+ html , head = document .xpath ('/html' ), document .xpath ('//head' )
118+ if not html or not head :
119+ return json_objects
120+ html_element , head_element = html [0 ], head [0 ]
121+
122+ # Stores the values or each property in appearance order
123+ values_for_property = defaultdict (list )
124+
125+ for meta_tag in head_element .xpath ("meta[@property]" ):
126+ expanded_property = self ._replaceNS (meta_tag .attrib ['property' ],
127+ html_element ,
128+ head_element )
129+ values_for_property [expanded_property ].append (meta_tag .get ('content' ))
130+
131+ for json_object in json_objects :
132+ keys = json_object .keys ()
133+
134+ for key in keys :
135+ if type (json_object [key ]) is list and len (json_object [key ]) > 1 :
136+ self ._sort (json_object [key ], values_for_property [key ])
137+
138+ return json_objects
139+
38140 def extract (self , htmlstring , base_url = None , encoding = "UTF-8" ,
39141 expanded = True ):
40142 tree = parse_xmldom_html (htmlstring , encoding = encoding )
@@ -51,4 +153,10 @@ def extract_items(self, document, base_url=None, expanded=True):
51153 check_lite = False )
52154 g = PyRdfa (options , base = base_url ).graph_from_DOM (document , graph = Graph (), pgraph = Graph ())
53155 jsonld_string = g .serialize (format = 'json-ld' , auto_compact = not expanded ).decode ('utf-8' )
54- return json .loads (jsonld_string )
156+
157+ try :
158+ # hack to fix the ordering of multi-value properties (see issue 116)
159+ # it should be disabled once PyRDFA fixes itself
160+ return self ._fix_order (jsonld_string , document )
161+ except :
162+ return json .loads (jsonld_string )
0 commit comments