Skip to content

Commit 7b6951f

Browse files
committed
Merge pull request #42 from AlexRiina/master
Simplifying the test_extraction code and a few clean ups
2 parents bcd64ee + fa5b550 commit 7b6951f

File tree

3 files changed

+18
-54
lines changed

3 files changed

+18
-54
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
*.pyc

scrapely/tests/test_extraction.py

Lines changed: 8 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
tests should focus on specific bits of functionality work correctly.
66
"""
77
from unittest import TestCase
8-
import numpy
8+
from nose_parameterized import parameterized
99

1010
from scrapely.htmlpage import HtmlPage
1111
from scrapely.descriptor import (FieldDescriptor as A,
@@ -1289,51 +1289,14 @@
12891289
),
12901290
]
12911291

1292-
class TestIbl(TestCase):
12931292

1294-
def _run_extraction(self, name, templates, page, descriptor, expected_output):
1295-
self.trace = None
1293+
1294+
class TestExtraction(TestCase):
1295+
@parameterized.expand(TEST_DATA)
1296+
def test_extraction(self, name, templates, page, descriptor, expected_output):
12961297
template_pages = [HtmlPage(None, {}, t) for t in templates]
1297-
# extracts with trace enabled in order to generate traceback
1298-
extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
1299-
actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
1300-
if actual_output is not None:
1301-
actual_output = actual_output[0]
1302-
self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
1303-
# extracts again with trace disabled in order to get the pure output
1298+
13041299
extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
13051300
actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
1306-
if actual_output is None:
1307-
if expected_output is None:
1308-
return
1309-
assert False, "failed to extract data for test '%s'" % name
1310-
else:
1311-
actual_output = actual_output[0]
1312-
expected_names = set(expected_output.keys())
1313-
actual_names = set(actual_output.keys())
1314-
1315-
missing_in_output = filter(None, expected_names - actual_names)
1316-
error = "attributes '%s' were expected but were not present in test '%s'" % \
1317-
("', '".join(missing_in_output), name)
1318-
assert len(missing_in_output) == 0, error
1319-
1320-
unexpected = actual_names - expected_names
1321-
error = "unexpected attributes %s in test '%s'" % \
1322-
(', '.join(unexpected), name)
1323-
assert len(unexpected) == 0, error
1324-
1325-
for k, v in expected_output.items():
1326-
extracted = actual_output[k]
1327-
assert v == extracted, "in test '%s' for attribute '%s', " \
1328-
"expected value '%s' but got '%s'" % (name, k, v, extracted)
1329-
1330-
def test_expected_outputs(self):
1331-
try:
1332-
for data in TEST_DATA:
1333-
self._run_extraction(*data)
1334-
except AssertionError:
1335-
if self.trace:
1336-
print "Trace:"
1337-
for line in self.trace:
1338-
print "\n---\n%s" % line
1339-
raise
1301+
1302+
self.assertEqual(expected_output, actual_output and actual_output[0])

scrapely/tests/test_pageparsing.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -228,11 +228,11 @@ def test_instance_parsing(self):
228228
self.assertEqual(_tags(pp, closep), ['</p>', '</html>'])
229229

230230
def _validate_annotation(self, parser, lable_region, name, start_tag, end_tag):
231-
assert lable_region.surrounds_attribute == name
231+
self.assertEqual(lable_region.surrounds_attribute, name)
232232
start_token = parser.token_list[lable_region.start_index]
233-
assert parser.token_dict.token_string(start_token) == start_tag
233+
self.assertEqual(parser.token_dict.token_string(start_token), start_tag)
234234
end_token = parser.token_list[lable_region.end_index]
235-
assert parser.token_dict.token_string(end_token) == end_tag
235+
self.assertEqual(parser.token_dict.token_string(end_token), end_tag)
236236

237237
def test_template_parsing(self):
238238
lp = _parse_page(TemplatePageParser, LABELLED_PAGE1)
@@ -246,16 +246,16 @@ def test_template_parsing(self):
246246
def test_extraction_page_parsing(self):
247247
epp = _parse_page(ExtractionPageParser, SIMPLE_PAGE)
248248
ep = epp.to_extraction_page()
249-
assert len(ep.page_tokens) == 4
250-
assert ep.htmlpage.fragment_data(ep.htmlpage_tag(0)) == '<html>'
251-
assert ep.htmlpage.fragment_data(ep.htmlpage_tag(1)) == '<p some-attr="foo">'
249+
self.assertEqual(len(ep.page_tokens), 4)
250+
self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(0)), '<html>')
251+
self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(1)), '<p some-attr="foo">')
252252

253-
assert ep.htmlpage_region_inside(1, 2) == 'this is a test'
254-
assert ep.htmlpage_region_inside(1, 3) == 'this is a test</p> '
253+
self.assertEqual(ep.htmlpage_region_inside(1, 2), 'this is a test')
254+
self.assertEqual(ep.htmlpage_region_inside(1, 3), 'this is a test</p> ')
255255

256256
def test_invalid_html(self):
257257
p = _parse_page(InstanceLearningParser, BROKEN_PAGE)
258-
assert p
258+
self.assertTrue(p)
259259

260260
def test_ignore_region(self):
261261
"""Test ignored regions"""

0 commit comments

Comments
 (0)