|
1 | 1 | """ |
2 | 2 | htmlpage.py tests |
3 | 3 | """ |
| 4 | +import os |
4 | 5 | import copy |
| 6 | +import json |
5 | 7 | from unittest import TestCase |
6 | 8 |
|
7 | | -from scrapely.htmlpage import parse_html, HtmlTag, HtmlDataFragment, HtmlPage |
| 9 | +from scrapely.htmlpage import ( |
| 10 | + parse_html, HtmlTag, HtmlDataFragment, HtmlPage, url_to_page |
| 11 | +) |
8 | 12 | from .test_htmlpage_data import * |
9 | 13 | from . import iter_samples |
| 14 | +BASE_PATH = os.path.abspath(os.path.dirname(__file__)) |
| 15 | + |
10 | 16 |
|
11 | 17 | def _encode_element(el):
12 | 18 | """
13 | 19 | Convert a parsed element (HtmlTag or HtmlDataFragment) into a plain dict of its fields; raise TypeError for any other object.
14 | 20 | """
15 | 21 | if isinstance(el, HtmlTag):
16 | 22 | return {"tag": el.tag, "attributes": el.attributes,
17 | | - "start": el.start, "end": el.end, "tag_type": el.tag_type}
| 23 | + "start": el.start, "end": el.end, "tag_type": el.tag_type}
18 | 24 | if isinstance(el, HtmlDataFragment):
19 | 25 | return {"start": el.start, "end": el.end, "is_text_content": el.is_text_content}
20 | 26 | raise TypeError
21 | 27 |
|
| 28 | + |
22 | 29 | def _decode_element(dct):
23 | 30 | """
24 | 31 | Rebuild a parsed element from its dict form: a dict with a "tag" key becomes an HtmlTag, one with a "start" key becomes an HtmlDataFragment ("is_text_content" defaults to True), and any other dict is returned unchanged.
25 | 32 | """
26 | 33 | if "tag" in dct:
27 | | - return HtmlTag(dct["tag_type"], dct["tag"], \
28 | | - dct["attributes"], dct["start"], dct["end"])
| 34 | + return HtmlTag(dct["tag_type"], dct["tag"],
| 35 | + dct["attributes"], dct["start"], dct["end"])
29 | 36 | if "start" in dct:
30 | 37 | return HtmlDataFragment(dct["start"], dct["end"], dct.get("is_text_content", True))
31 | 38 | return dct
32 | 39 |
|
| 40 | + |
33 | 41 | class TestParseHtml(TestCase): |
34 | 42 | """Test for parse_html""" |
35 | 43 | def _test_sample(self, source, expected_parsed, samplecount=None): |
@@ -165,3 +173,12 @@ def test_copy(self): |
165 | 173 | self.assertEqual(regiondeepcopy.end_index, 15) |
166 | 174 | self.assertFalse(region is regiondeepcopy) |
167 | 175 | self.assertFalse(region.htmlpage is regiondeepcopy.htmlpage) |
| 176 | + |
| 177 | + def test_load_page_from_url(self): |
| 178 | + filepath = os.path.join(BASE_PATH, 'samples/samples_htmlpage_0') |
| 179 | + url = 'file://{}.{}'.format(filepath, 'html') |
| 180 | + page = url_to_page(url) |
| 181 | + parsed = json.load(open('{}.{}'.format(filepath, 'json'))) |
| 182 | + parsed = [_decode_element(d) for d in parsed] |
| 183 | + self.assertEqual(page.url, url) |
| 184 | + self._test_sample(page.body, parsed, 1) |
0 commit comments