Skip to content

Commit 4bbc3c7

Browse files
committed
Merge pull request #26 from kalessin/emptyregion
Avoid an exception when instantiating an HtmlPageParsedRegion if the parent HtmlPage has an empty body. Added a test.
2 parents: d72ada3 + c53e507; commit: 4bbc3c7

File tree

2 files changed: 9 additions (+9) and 4 deletions (-4).

scrapely/htmlpage.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -133,9 +133,11 @@ class HtmlPageParsedRegion(HtmlPageRegion):
133133
fragments contained within this region
134134
"""
135135
def __new__(cls, htmlpage, start_index, end_index):
136-
text_start = htmlpage.parsed_body[start_index].start
137-
text_end = htmlpage.parsed_body[end_index or -1].end
138-
text = htmlpage.body[text_start:text_end]
136+
text = htmlpage.body
137+
if text:
138+
text_start = htmlpage.parsed_body[start_index].start
139+
text_end = htmlpage.parsed_body[end_index or -1].end
140+
text = htmlpage.body[text_start:text_end]
139141
return HtmlPageRegion.__new__(cls, htmlpage, text)
140142

141143
def __init__(self, htmlpage, start_index, end_index):

scrapely/tests/test_htmlpage.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
from unittest import TestCase
66

77
from scrapely.tests import iter_samples
8-
from scrapely.htmlpage import parse_html, HtmlTag, HtmlDataFragment
8+
from scrapely.htmlpage import parse_html, HtmlTag, HtmlDataFragment, HtmlPage
99
from scrapely.tests.test_htmlpage_data import *
1010

1111
def _encode_element(el):
@@ -135,3 +135,6 @@ def test_malformed2(self):
135135
parsed = [_decode_element(d) for d in PARSED9]
136136
self._test_sample(PAGE9, parsed)
137137

138+
def test_empty_subregion(self):
139+
htmlpage = HtmlPage(body=u"")
140+
self.assertEqual(htmlpage.subregion(), u"")

0 commit comments

Comments
 (0)