# Names of the structured-data syntaxes accepted by `extract`; this list is
# also the default value of its `syntaxes` argument. It must stay a `list`
# because `extract` rejects any `syntaxes` value that is not a list.
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
1414
1515
16- def extract (htmlstring , base_url = None , encoding = "UTF-8" ,
16+ def extract (htmlstring ,
17+ base_url = None ,
18+ encoding = "UTF-8" ,
1719 syntaxes = SYNTAXES ,
1820 errors = 'strict' ,
1921 uniform = False ,
@@ -38,48 +40,113 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
3840 Each node is of `lxml.etree.Element` type.
3941 schema_context: schema's context for current page"""
4042 if base_url is None and 'url' in kwargs :
41- warnings .warn ('"url" argument is deprecated, please use "base_url"' ,
42- DeprecationWarning , stacklevel = 2 )
43+ warnings .warn (
44+ '"url" argument is deprecated, please use "base_url"' ,
45+ DeprecationWarning ,
46+ stacklevel = 2 )
4347 base_url = kwargs .pop ('url' )
4448 if kwargs :
4549 raise TypeError ('Unexpected keyword arguments' )
46- if not (isinstance (syntaxes , list ) and all (v in SYNTAXES for v in syntaxes )):
50+ if not (isinstance (syntaxes , list ) and all (v in SYNTAXES
51+ for v in syntaxes )):
4752 raise ValueError ("syntaxes must be a list with any or all (default) of"
4853 "these values: {}" .format (SYNTAXES ))
4954 if errors not in ['log' , 'ignore' , 'strict' ]:
5055 raise ValueError ('Invalid error command, valid values are either "log"'
5156 ', "ignore" or "strict"' )
52- tree = parse_xmldom_html (htmlstring , encoding = encoding )
57+ try :
58+ tree = parse_xmldom_html (htmlstring , encoding = encoding )
59+ except Exception as e :
60+ if errors == 'ignore' :
61+ return {}
62+ if errors == 'log' :
63+ logger .exception (
64+ 'Failed to parse html, raises {}' .format (e ))
65+ return {}
66+ if errors == 'strict' :
67+ raise
5368 processors = []
5469 if 'microdata' in syntaxes :
55- processors .append (('microdata' , MicrodataExtractor (add_html_node = return_html_node ).extract_items , tree ))
70+ processors .append (
71+ ('microdata' ,
72+ MicrodataExtractor (add_html_node = return_html_node ).extract_items ,
73+ tree
74+ ))
5675 if 'json-ld' in syntaxes :
57- processors .append (('json-ld' , JsonLdExtractor ().extract_items , tree ))
76+ processors .append (
77+ ('json-ld' ,
78+ JsonLdExtractor ().extract_items ,
79+ tree ,
80+ ))
5881 if 'opengraph' in syntaxes :
59- processors .append (('opengraph' , OpenGraphExtractor ().extract_items , tree ))
82+ processors .append (
83+ ('opengraph' ,
84+ OpenGraphExtractor ().extract_items ,
85+ tree
86+ ))
6087 if 'microformat' in syntaxes :
61- processors .append (('microformat' , MicroformatExtractor ().extract_items , htmlstring ))
88+ processors .append (
89+ ('microformat' ,
90+ MicroformatExtractor ().extract_items ,
91+ htmlstring
92+ ))
6293 if 'rdfa' in syntaxes :
63- processors .append (('rdfa' , RDFaExtractor ().extract_items , tree ))
94+ processors .append (
95+ ('rdfa' , RDFaExtractor ().extract_items ,
96+ tree ,
97+ ))
6498 output = {}
65- for label , extract , document in processors :
99+ for syntax , extract , document in processors :
66100 try :
67- output [label ] = list (extract (document , base_url = base_url ))
68- except Exception :
101+ output [syntax ] = list (extract (document , base_url = base_url ))
102+ except Exception as e :
69103 if errors == 'log' :
70- logger .exception ('Failed to extract {}' .format (label ))
104+ logger .exception ('Failed to extract {}, raises {}'
105+ .format (syntax , e )
106+ )
71107 if errors == 'ignore' :
72108 pass
73109 if errors == 'strict' :
74110 raise
75-
76111 if uniform :
112+ uniform_processors = []
77113 if 'microdata' in syntaxes :
78- output ['microdata' ] = _umicrodata_microformat (output ['microdata' ],
79- schema_context = schema_context )
114+ uniform_processors .append (
115+ ('microdata' ,
116+ _umicrodata_microformat ,
117+ output ['microdata' ],
118+ schema_context ,
119+ ))
80120 if 'microformat' in syntaxes :
81- output ['microformat' ] = _umicrodata_microformat (output ['microformat' ],
82- schema_context = 'http://microformats.org/wiki/' )
121+ uniform_processors .append (
122+ ('microformat' ,
123+ _umicrodata_microformat ,
124+ output ['microformat' ],
125+ 'http://microformats.org/wiki/' ,
126+ ))
83127 if 'opengraph' in syntaxes :
84- output ['opengraph' ] = _uopengraph (output ['opengraph' ])
128+ uniform_processors .append (
129+ ('opengraph' ,
130+ _uopengraph ,
131+ output ['opengraph' ],
132+ None ,
133+ ))
134+ for syntax , uniform , raw , schema_context in uniform_processors :
135+ try :
136+ if syntax == 'opengraph' :
137+ output [syntax ] = uniform (raw )
138+ else :
139+ output [syntax ] = uniform (raw , schema_context )
140+ except Exception as e :
141+ if errors == 'ignore' :
142+ output [syntax ] = []
143+ if errors == 'log' :
144+ output [syntax ] = []
145+ logger .exception (
146+ 'Failed to uniform extracted for {}, raises {}'
147+ .format (syntax , e )
148+ )
149+ if errors == 'strict' :
150+ raise
151+
85152 return output
0 commit comments