Skip to content

Commit 0bf1719

Browse files
committed
Merge pull request #21 from kalessin/validation
avoid false positives when extracted item is empty and descriptor has no required attribute
2 parents cd084a2 + 995b6dd commit 0bf1719

File tree

2 files changed

+15
-9
lines changed

2 files changed

+15
-9
lines changed

scrapely/descriptor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def _item_validates(self, item):
40   40           """simply checks that all mandatory attributes are present"""
41   41           variant_attrs = set(chain(*
42   42               [v.keys() for v in item.get('variants', [])]))
43      -        return all([(name in item or name in variant_attrs) \
     43 +        return item and all([(name in item or name in variant_attrs) \
44   44               for name in self._required_attributes])
45   45

46   46       def get_required_attributes(self):

scrapely/tests/test_extraction.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,7 +1092,7 @@
1092 1092           'name': [u'A product']}
1093 1093       ),
1094 1094       ('consistency check', [ANNOTATED_PAGE14], EXTRACT_PAGE14, DEFAULT_DESCRIPTOR,
1095      -        {},
     1095 +        None,
1096 1096       ),
1097 1097       ('consecutive nesting', [ANNOTATED_PAGE15], EXTRACT_PAGE15, DEFAULT_DESCRIPTOR,
10981098
1098 1098           {'description': [u'Description\n \n'],

@@ -1204,33 +1204,39 @@
1204 1204
1205 1205       ),
1206 1206       ('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3,
1207      -        {}
     1207 +        None
1208 1208       ),
1209 1209       ('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3,
1210 1210           {u'phone': [u'029847272']}
1211 1211       ),
1212 1212       ('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3,
1213      -        {}
     1213 +        None
1214 1214       ),
1215 1215       ('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
1216      -        {}
     1216 +        None
1217 1217       ),
1218      -
1219 1218   ]
1220 1219
1221 1220   class TestIbl(TestCase):
1222 1221
1223 1222       def _run_extraction(self, name, templates, page, descriptor, expected_output):
1224 1223           self.trace = None
1225 1224           template_pages = [HtmlPage(None, {}, t) for t in templates]
     1225 +        # extracts with trace enabled in order to generate traceback
1226 1226           extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
1227 1227           actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
1228      -        if not actual_output:
     1228 +        if actual_output is not None:
     1229 +            actual_output = actual_output[0]
     1230 +            self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
     1231 +        # extracts again with trace disabled in order to get the pure output
     1232 +        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
     1233 +        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
     1234 +        if actual_output is None:
1229 1235               if expected_output is None:
1230 1236                   return
1231 1237               assert False, "failed to extract data for test '%s'" % name
1232      -        actual_output = actual_output[0]
1233      -        self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace', [])
     1238 +        else:
     1239 +            actual_output = actual_output[0]
1234 1240           expected_names = set(expected_output.keys())
1235 1241           actual_names = set(actual_output.keys())

(NOTE: reconstruction — the original two-column old/new line-number gutter was fused into the text during extraction; indentation inside the diff body is inferred from the surrounding Python context.)
12361242

0 commit comments

Comments (0)