|
1092 | 1092 | 'name': [u'A product']} |
1093 | 1093 | ), |
1094 | 1094 | ('consistency check', [ANNOTATED_PAGE14], EXTRACT_PAGE14, DEFAULT_DESCRIPTOR, |
1095 | | - {}, |
| 1095 | + None, |
1096 | 1096 | ), |
1097 | 1097 | ('consecutive nesting', [ANNOTATED_PAGE15], EXTRACT_PAGE15, DEFAULT_DESCRIPTOR, |
1098 | 1098 | {'description': [u'Description\n \n'], |
|
1204 | 1204 |
|
1205 | 1205 | ), |
1206 | 1206 | ('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3, |
1207 | | - {} |
| 1207 | + None |
1208 | 1208 | ), |
1209 | 1209 | ('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3, |
1210 | 1210 | {u'phone': [u'029847272']} |
1211 | 1211 | ), |
1212 | 1212 | ('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3, |
1213 | | - {} |
| 1213 | + None |
1214 | 1214 | ), |
1215 | 1215 | ('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3, |
1216 | | - {} |
| 1216 | + None |
1217 | 1217 | ), |
1218 | | - |
1219 | 1218 | ] |
1220 | 1219 |
|
1221 | 1220 | class TestIbl(TestCase): |
1222 | 1221 |
|
1223 | 1222 | def _run_extraction(self, name, templates, page, descriptor, expected_output): |
1224 | 1223 | self.trace = None |
1225 | 1224 | template_pages = [HtmlPage(None, {}, t) for t in templates] |
| 1225 | + # extract with trace enabled in order to capture the extraction trace
1226 | 1226 | extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True) |
1227 | 1227 | actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) |
1228 | | - if not actual_output: |
| 1228 | + if actual_output is not None: |
| 1229 | + actual_output = actual_output[0] |
| 1230 | + self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace') |
| 1231 | + # extract again with trace disabled so the output compared below has no trace entry
| 1232 | + extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages]) |
| 1233 | + actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) |
| 1234 | + if actual_output is None: |
1229 | 1235 | if expected_output is None: |
1230 | 1236 | return |
1231 | 1237 | assert False, "failed to extract data for test '%s'" % name |
1232 | | - actual_output = actual_output[0] |
1233 | | - self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace', []) |
| 1238 | + else: |
| 1239 | + actual_output = actual_output[0] |
1234 | 1240 | expected_names = set(expected_output.keys()) |
1235 | 1241 | actual_names = set(actual_output.keys()) |
1236 | 1242 |
|
|
0 commit comments