
Commit 3ab5592

Merge pull request #100 from scrapinghub/ignore-parsing-and-unification-exceptions
ignore any exception if errors='ignore'
2 parents c3ef088 + 49a8570 commit 3ab5592

File tree: 2 files changed (+103, -20 lines)


extruct/_extruct.py

Lines changed: 87 additions & 20 deletions
@@ -13,7 +13,9 @@
 SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
 
 
-def extract(htmlstring, base_url=None, encoding="UTF-8",
+def extract(htmlstring,
+            base_url=None,
+            encoding="UTF-8",
             syntaxes=SYNTAXES,
             errors='strict',
             uniform=False,
@@ -38,48 +40,113 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
     Each node is of `lxml.etree.Element` type.
     schema_context: schema's context for current page"""
     if base_url is None and 'url' in kwargs:
-        warnings.warn('"url" argument is deprecated, please use "base_url"',
-                      DeprecationWarning, stacklevel=2)
+        warnings.warn(
+            '"url" argument is deprecated, please use "base_url"',
+            DeprecationWarning,
+            stacklevel=2)
         base_url = kwargs.pop('url')
     if kwargs:
         raise TypeError('Unexpected keyword arguments')
-    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
+    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
+                                               for v in syntaxes)):
         raise ValueError("syntaxes must be a list with any or all (default) of"
                          "these values: {}".format(SYNTAXES))
     if errors not in ['log', 'ignore', 'strict']:
         raise ValueError('Invalid error command, valid values are either "log"'
                          ', "ignore" or "strict"')
-    tree = parse_xmldom_html(htmlstring, encoding=encoding)
+    try:
+        tree = parse_xmldom_html(htmlstring, encoding=encoding)
+    except Exception as e:
+        if errors == 'ignore':
+            return {}
+        if errors == 'log':
+            logger.exception(
+                'Failed to parse html, raises {}'.format(e))
+            return {}
+        if errors == 'strict':
+            raise
     processors = []
     if 'microdata' in syntaxes:
-        processors.append(('microdata', MicrodataExtractor(add_html_node=return_html_node).extract_items, tree))
+        processors.append(
+            ('microdata',
+             MicrodataExtractor(add_html_node=return_html_node).extract_items,
+             tree
+             ))
     if 'json-ld' in syntaxes:
-        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
+        processors.append(
+            ('json-ld',
+             JsonLdExtractor().extract_items,
+             tree,
+             ))
     if 'opengraph' in syntaxes:
-        processors.append(('opengraph', OpenGraphExtractor().extract_items, tree))
+        processors.append(
+            ('opengraph',
+             OpenGraphExtractor().extract_items,
+             tree
+             ))
     if 'microformat' in syntaxes:
-        processors.append(('microformat', MicroformatExtractor().extract_items, htmlstring))
+        processors.append(
+            ('microformat',
+             MicroformatExtractor().extract_items,
+             htmlstring
+             ))
     if 'rdfa' in syntaxes:
-        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
+        processors.append(
+            ('rdfa', RDFaExtractor().extract_items,
+             tree,
+             ))
     output = {}
-    for label, extract, document in processors:
+    for syntax, extract, document in processors:
         try:
-            output[label] = list(extract(document, base_url=base_url))
-        except Exception:
+            output[syntax] = list(extract(document, base_url=base_url))
+        except Exception as e:
             if errors == 'log':
-                logger.exception('Failed to extract {}'.format(label))
+                logger.exception('Failed to extract {}, raises {}'
+                                 .format(syntax, e)
+                                 )
             if errors == 'ignore':
                 pass
             if errors == 'strict':
                 raise
-
     if uniform:
+        uniform_processors = []
        if 'microdata' in syntaxes:
-            output['microdata'] = _umicrodata_microformat(output['microdata'],
-                                                          schema_context=schema_context)
+            uniform_processors.append(
+                ('microdata',
+                 _umicrodata_microformat,
+                 output['microdata'],
+                 schema_context,
+                 ))
        if 'microformat' in syntaxes:
-            output['microformat'] = _umicrodata_microformat(output['microformat'],
-                                                            schema_context='http://microformats.org/wiki/')
+            uniform_processors.append(
+                ('microformat',
+                 _umicrodata_microformat,
+                 output['microformat'],
+                 'http://microformats.org/wiki/',
+                 ))
        if 'opengraph' in syntaxes:
-            output['opengraph'] = _uopengraph(output['opengraph'])
+            uniform_processors.append(
+                ('opengraph',
+                 _uopengraph,
+                 output['opengraph'],
+                 None,
+                 ))
+        for syntax, uniform, raw, schema_context in uniform_processors:
+            try:
+                if syntax == 'opengraph':
+                    output[syntax] = uniform(raw)
+                else:
+                    output[syntax] = uniform(raw, schema_context)
+            except Exception as e:
+                if errors == 'ignore':
+                    output[syntax] = []
+                if errors == 'log':
+                    output[syntax] = []
+                    logger.exception(
+                        'Failed to uniform extracted for {}, raises {}'
+                        .format(syntax, e)
+                        )
+                if errors == 'strict':
+                    raise
+
     return output
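
The behaviour introduced here can be exercised directly from user code. Below is a minimal sketch, assuming an extruct version that includes this change; the empty-string input mirrors the new test added in tests/test_extruct.py, where HTML parsing fails and the errors argument decides what happens next.

import logging

import extruct

logging.basicConfig(level=logging.ERROR)

broken_html = ''  # empty input makes the HTML parsing step raise

# errors='strict' (the default) re-raises the parsing exception
try:
    extruct.extract(broken_html)
except Exception as exc:
    print('strict mode raised:', type(exc).__name__)

# errors='ignore' swallows the exception and returns an empty dict
print(extruct.extract(broken_html, errors='ignore'))  # {}

# errors='log' also returns {}, but records the traceback via logging
print(extruct.extract(broken_html, errors='log'))     # {}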

tests/test_extruct.py

Lines changed: 16 additions & 0 deletions
@@ -49,3 +49,19 @@ def _microdata_custom_url(self, test_file):
             get_testdata('schema.org', test_file)
             .decode('UTF-8'))}
         return body, expected
+
+    def test_errors(self):
+        body = ''
+
+        # raise exceptions
+        with pytest.raises(Exception):
+            data = extruct.extract(body)
+
+        # ignore exceptions
+        expected = {}
+        data = extruct.extract(body, errors='ignore')
+        assert data == expected
+
+        # ignore exceptions
+        data = extruct.extract(body, errors='log')
+        assert data == expected
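
Beyond the assertions above, the errors='log' branch could also be checked by inspecting the captured log output. A hedged sketch using pytest's built-in caplog fixture follows; the test name and assertion text are illustrative only and not part of this commit:

import extruct


def test_errors_log_records_exception(caplog):
    # empty input cannot be parsed, so extraction falls back to {}
    data = extruct.extract('', errors='log')
    assert data == {}
    # the parsing failure is expected to be logged with its traceback
    assert 'Failed to parse html' in caplog.text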
