Skip to content

Commit 5dd8a4f

Browse files
committed
Add python3 support for url_to_page
1 parent 4b0b5de commit 5dd8a4f

File tree

3 files changed

+27
-5
lines changed

3 files changed

+27
-5
lines changed

NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
Scrapely release notes
22
======================
33

4+
0.13.2 (2016-12-21)
5+
-------------------
6+
7+
- Add python3 support for `url_to_page` function
8+
49
0.13.1 (2016-12-21)
510
-------------------
611

scrapely/htmlpage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def url_to_page(url, encoding=None, default_encoding='utf-8'):
187187
if encoding is None:
188188
try:
189189
# Python 3.x
190-
content_type_header = fh.getheader("content-type")
190+
content_type_header = fh.headers.get("content-type")
191191
except AttributeError:
192192
# Python 2.x
193193
content_type_header = info.getheader("content-type")

tests/test_htmlpage.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,43 @@
11
"""
22
htmlpage.py tests
33
"""
4+
import os
45
import copy
6+
import json
57
from unittest import TestCase
68

7-
from scrapely.htmlpage import parse_html, HtmlTag, HtmlDataFragment, HtmlPage
9+
from scrapely.htmlpage import (
10+
parse_html, HtmlTag, HtmlDataFragment, HtmlPage, url_to_page
11+
)
812
from .test_htmlpage_data import *
913
from . import iter_samples
14+
BASE_PATH = os.path.abspath(os.path.dirname(__file__))
15+
1016

1117
def _encode_element(el):
1218
"""
1319
jsonize parse element
1420
"""
1521
if isinstance(el, HtmlTag):
1622
return {"tag": el.tag, "attributes": el.attributes,
17-
"start": el.start, "end": el.end, "tag_type": el.tag_type}
23+
"start": el.start, "end": el.end, "tag_type": el.tag_type}
1824
if isinstance(el, HtmlDataFragment):
1925
return {"start": el.start, "end": el.end, "is_text_content": el.is_text_content}
2026
raise TypeError
2127

28+
2229
def _decode_element(dct):
2330
"""
2431
dejsonize parse element
2532
"""
2633
if "tag" in dct:
27-
return HtmlTag(dct["tag_type"], dct["tag"], \
28-
dct["attributes"], dct["start"], dct["end"])
34+
return HtmlTag(dct["tag_type"], dct["tag"],
35+
dct["attributes"], dct["start"], dct["end"])
2936
if "start" in dct:
3037
return HtmlDataFragment(dct["start"], dct["end"], dct.get("is_text_content", True))
3138
return dct
3239

40+
3341
class TestParseHtml(TestCase):
3442
"""Test for parse_html"""
3543
def _test_sample(self, source, expected_parsed, samplecount=None):
@@ -165,3 +173,12 @@ def test_copy(self):
165173
self.assertEqual(regiondeepcopy.end_index, 15)
166174
self.assertFalse(region is regiondeepcopy)
167175
self.assertFalse(region.htmlpage is regiondeepcopy.htmlpage)
176+
177+
def test_load_page_from_url(self):
178+
filepath = os.path.join(BASE_PATH, 'samples/samples_htmlpage_0')
179+
url = 'file://{}.{}'.format(filepath, 'html')
180+
page = url_to_page(url)
181+
parsed = json.load(open('{}.{}'.format(filepath, 'json')))
182+
parsed = [_decode_element(d) for d in parsed]
183+
self.assertEqual(page.url, url)
184+
self._test_sample(page.body, parsed, 1)

0 commit comments

Comments
 (0)