3131
3232
3333class RDFaExtractor (object ):
34-
35- # expands namespace to match with returned json (ex: og -> 'http://ogp.me/ns#')
36- def replaceNS ( self , prop , html_element , head_element ):
34+
35+ def _replaceNS ( self , prop , html_element , head_element ):
36+ """Expand namespace to match with returned json (e.g.: og -> 'http://ogp.me/ns#')"""
3737
3838 # context namespaces taken from pyrdfa3
3939 # https://github.com/RDFLib/PyRDFa/blob/master/pyRdfa/initialcontext.py
@@ -94,26 +94,29 @@ def replaceNS(self, prop, html_element, head_element):
9494 return context [prefix ] + prop .split (':' )[1 ]
9595
9696 return prop
97-
98- # sorts the rdfa tags in jsonld string
99- def sort ( self , unordered , ordered ):
97+
def _sort(self, unordered, ordered):
    """Reorder ``unordered`` in place to follow the value order of ``ordered``.

    Each item of ``unordered`` is looked up by its ``'@value'`` entry. A value
    occurring several times in ``ordered`` is ranked by its first occurrence;
    items whose value is absent from ``ordered`` are moved to the end. The
    sort is stable, so items with equal rank keep their relative order.
    Returns ``None``; the list is modified in place.
    """
    first_position = {}
    for position, value in enumerate(ordered):
        # setdefault keeps the earliest index recorded for a repeated value.
        first_position.setdefault(value, position)

    def rank(props):
        # Unknown (or missing) values rank after every known one.
        return first_position.get(props.get('@value'), len(ordered))

    unordered.sort(key=rank)
102102
103-
104- # fixes order of rdfa tags in jsonld string
105- # by comparing with order in document object
106- def fix_order (self , jsonld_string , document ):
103+
104+ def _fix_order (self , jsonld_string , document ):
105+ """
106+ Fix order of rdfa tags in jsonld string
107+ by checking the appearance order in the HTML
108+ """
107109 html_element = document .xpath ('/html' )[0 ]
108110 head_element = document .xpath ('//head' )[0 ]
109-
110- meta_tags = defaultdict (list )
111+
112+ # Stores the values for each property in appearance order
113+ values_for_property = defaultdict (list )
111114
112115 for meta_tag in head_element .xpath ("meta[@property]" ):
113- expanded_property = self .replaceNS (meta_tag .attrib ['property' ],
114- html_element ,
115- head_element )
116- meta_tags [expanded_property ].append (meta_tag .get ('content' ))
116+ expanded_property = self ._replaceNS (meta_tag .attrib ['property' ],
117+ html_element ,
118+ head_element )
119+ values_for_property [expanded_property ].append (meta_tag .get ('content' ))
117120
118121 json_objects = json .loads (jsonld_string )
119122
@@ -122,7 +125,7 @@ def fix_order(self, jsonld_string, document):
122125
123126 for key in keys :
124127 if type (json_object [key ]) is list and len (json_object [key ]) > 1 :
125- self .sort (json_object [key ], meta_tags [key ])
128+ self ._sort (json_object [key ], values_for_property [key ])
126129
127130 return json_objects
128131
@@ -145,8 +148,8 @@ def extract_items(self, document, base_url=None, expanded=True):
145148 jsonld_string = g .serialize (format = 'json-ld' , auto_compact = not expanded ).decode ('utf-8' )
146149
147150 try :
148- # hack to fix the ordering of duplicate properties (see issue 116)
151+ # hack to fix the ordering of multi-value properties (see issue 116)
149152 # it should be disabled once PyRDFA fixes itself
150- return self .fix_order (jsonld_string , document )
153+ return self ._fix_order (jsonld_string , document )
151154 except :
152155 return json .loads (jsonld_string )
0 commit comments