Skip to content

Commit 0bf1719

Browse files
committed
Merge pull request #21 from kalessin/validation
avoid false positives when extracted item is empty and descriptor has no required attribute
2 parents cd084a2 + 995b6dd commit 0bf1719

File tree

2 files changed

+15
-9
lines changed

2 files changed

+15
-9
lines changed

scrapely/descriptor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def _item_validates(self, item):
40   40           """simply checks that all mandatory attributes are present"""
41   41           variant_attrs = set(chain(*
42   42               [v.keys() for v in item.get('variants', [])]))
43      -        return all([(name in item or name in variant_attrs) \
     43 +        return item and all([(name in item or name in variant_attrs) \
44   44               for name in self._required_attributes])
45   45

46   46       def get_required_attributes(self):

scrapely/tests/test_extraction.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,7 +1092,7 @@
1092 1092           'name': [u'A product']}
1093 1093       ),
1094 1094       ('consistency check', [ANNOTATED_PAGE14], EXTRACT_PAGE14, DEFAULT_DESCRIPTOR,
1095      -        {},
     1095 +        None,
1096 1096       ),
1097 1097       ('consecutive nesting', [ANNOTATED_PAGE15], EXTRACT_PAGE15, DEFAULT_DESCRIPTOR,
10981098
1098 1098           {'description': [u'Description\n \n'],

@@ -1204,33 +1204,39 @@
1204 1204
1205 1205       ),
1206 1206       ('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3,
1207      -        {}
     1207 +        None
1208 1208       ),
1209 1209       ('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3,
1210 1210           {u'phone': [u'029847272']}
1211 1211       ),
1212 1212       ('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3,
1213      -        {}
     1213 +        None
1214 1214       ),
1215 1215       ('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
1216      -        {}
     1216 +        None
1217 1217       ),
1218      -
1219 1218   ]
1220 1219
1221 1220   class TestIbl(TestCase):
1222 1221
1223 1222       def _run_extraction(self, name, templates, page, descriptor, expected_output):
1224 1223           self.trace = None
1225 1224           template_pages = [HtmlPage(None, {}, t) for t in templates]
     1225 +        # extracts with trace enabled in order to generate traceback
1226 1226           extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
1227 1227           actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
1228      -        if not actual_output:
     1228 +        if actual_output is not None:
     1229 +            actual_output = actual_output[0]
     1230 +            self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
     1231 +        # extracts again with trace disabled in order to get the pure output
     1232 +        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
     1233 +        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
     1234 +        if actual_output is None:
1229 1235               if expected_output is None:
1230 1236                   return
1231 1237               assert False, "failed to extract data for test '%s'" % name
1232      -        actual_output = actual_output[0]
1233      -        self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace', [])
     1238 +        else:
     1239 +            actual_output = actual_output[0]
1234 1240           expected_names = set(expected_output.keys())
1235 1241           actual_names = set(actual_output.keys())

(NOTE: reconstruction — the original two-column old/new line-number gutter was fused into the text during extraction; indentation inside the diff body is inferred from the surrounding Python context.)
12361242

0 commit comments

Comments (0)