|
4 | 4 | Page parsing effectiveness is measured through the evaluation system. These |
5 | 5 | tests should focus on checking that specific bits of functionality work correctly. |
6 | 6 | """ |
7 | | -from unittest import TestCase |
8 | | -import numpy |
| 7 | +from functools import partial |
9 | 8 |
|
10 | 9 | from scrapely.htmlpage import HtmlPage |
11 | 10 | from scrapely.descriptor import (FieldDescriptor as A, |
|
1289 | 1288 | ), |
1290 | 1289 | ] |
1291 | 1290 |
|
class TestIbl(TestCase):
    """Run every TEST_DATA case through InstanceBasedLearningExtractor."""

    def _run_extraction(self, name, templates, page, descriptor, expected_output):
        """Extract ``page`` using ``templates`` and check the result.

        The extraction is run twice: first with tracing enabled, only to
        fill ``self.trace`` for failure diagnostics, then with tracing
        disabled to obtain the output that is actually compared.

        ``expected_output`` maps attribute names to expected values, or is
        None when the extraction is expected to produce nothing. Raises
        AssertionError (mentioning ``name``) on any mismatch.
        """
        self.trace = None
        template_pages = [HtmlPage(None, {}, t) for t in templates]
        # extracts with trace enabled in order to generate traceback
        extractor = InstanceBasedLearningExtractor(
            [(t, descriptor) for t in template_pages], True)
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is not None:
            actual_output = actual_output[0]
            self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
        # extracts again with trace disabled in order to get the pure output
        extractor = InstanceBasedLearningExtractor(
            [(t, descriptor) for t in template_pages])
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is None:
            if expected_output is None:
                return
            assert False, "failed to extract data for test '%s'" % name
        else:
            actual_output = actual_output[0]
        expected_names = set(expected_output.keys())
        actual_names = set(actual_output.keys())

        # Build a real list: on Python 3 ``filter`` is lazy, so ``len()`` on
        # it would raise TypeError. Falsy names are dropped, as before.
        missing_in_output = [n for n in expected_names - actual_names if n]
        error = "attributes '%s' were expected but were not present in test '%s'" % \
            ("', '".join(missing_in_output), name)
        assert len(missing_in_output) == 0, error

        unexpected = actual_names - expected_names
        error = "unexpected attributes %s in test '%s'" % \
            (', '.join(unexpected), name)
        assert len(unexpected) == 0, error

        for k, v in expected_output.items():
            extracted = actual_output[k]
            assert v == extracted, "in test '%s' for attribute '%s', " \
                "expected value '%s' but got '%s'" % (name, k, v, extracted)

    def test_expected_outputs(self):
        """Check every TEST_DATA case, printing the trace of a failing one."""
        try:
            for data in TEST_DATA:
                self._run_extraction(*data)
        except AssertionError:
            if self.trace:
                # Single-argument print(...) prints the same text on
                # Python 2 and Python 3.
                print("Trace:")
                for line in self.trace:
                    print("\n---\n%s" % line)
            raise
def _run_extraction(name, templates, page, descriptor, expected_output):
    """Extract ``page`` using ``templates`` and check the extracted data.

    An InstanceBasedLearningExtractor is built from ``templates`` (raw
    template bodies, each paired with ``descriptor``) and applied to
    ``page``. Only the first extracted item is compared.

    ``name`` identifies the test case in failure messages.
    ``expected_output`` maps attribute names to expected values; a falsy
    value (None or empty) means no data is expected to be extracted.

    Raises AssertionError when the extraction fails unexpectedly, when
    attributes are missing or unexpected, or when a value differs.
    """
    template_pages = [HtmlPage(None, {}, t) for t in templates]

    extractor = InstanceBasedLearningExtractor(
        [(t, descriptor) for t in template_pages])
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    if actual_output is None:
        assert not expected_output, "failed to extract data for test '%s'" % name
        return

    # Fail with a readable message instead of an AttributeError on
    # ``None.keys()`` when data was extracted but none was expected.
    assert expected_output is not None, \
        "unexpected data extracted in test '%s'" % name

    actual_output = actual_output[0]
    expected_names = set(expected_output.keys())
    actual_names = set(actual_output.keys())

    # Build a real list: on Python 3 ``filter`` returns a lazy iterator that
    # is always truthy (and would already be drained by the join below), so
    # the assertion would fail even when nothing is missing. Falsy names are
    # dropped, as before.
    missing_in_output = [n for n in expected_names - actual_names if n]
    error = "attributes '%s' were expected but were not present in test '%s'" % \
        ("', '".join(missing_in_output), name)
    assert not missing_in_output, error

    unexpected = actual_names - expected_names
    error = "unexpected attributes %s in test '%s'" % \
        (', '.join(unexpected), name)
    assert not unexpected, error

    for k, v in expected_output.items():
        extracted = actual_output[k]
        assert v == extracted, "in test '%s' for attribute '%s', " \
            "expected value '%s' but got '%s'" % (name, k, v, extracted)
| 1318 | + |
def test_generator():
    """Yield one zero-argument check per TEST_DATA entry.

    Each entry is unpacked into the positional arguments of
    ``_run_extraction``, so every case is run and reported as a test of
    its own.
    """
    for data in TEST_DATA:
        check = partial(_run_extraction, *data)
        # nose labels a generated test with the callable's ``description``;
        # without it every case shows up under the generator's name only.
        # The first element of an entry is the case name (the ``name``
        # parameter of _run_extraction); entries are assumed indexable.
        check.description = "test_generator: %s" % (data[0],)
        yield check
0 commit comments