Skip to content

Commit 9c656a6

Browse files
committed
Merge pull request #60 from tpeng/add-htmlpage-to-template
move htmlpage to Page
2 parents 0fdb412 + 31138a4 commit 9c656a6

File tree

2 files changed

+11
-11
lines changed

2 files changed

+11
-11
lines changed

scrapely/extraction/pageobjects.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,10 @@ class Page(object):
101101
dictionary of tokens and an array of raw token ids
102102
"""
103103

104-
__slots__ = ('token_dict', 'page_tokens')
104+
__slots__ = ('token_dict', 'page_tokens', 'htmlpage')
105105

106-
def __init__(self, token_dict, page_tokens):
106+
def __init__(self, htmlpage, token_dict, page_tokens):
107+
self.htmlpage = htmlpage
107108
self.token_dict = token_dict
108109
# use a numpy array because we can index/slice easily and efficiently
109110
if not isinstance(page_tokens, ndarray):
@@ -113,9 +114,9 @@ def __init__(self, token_dict, page_tokens):
113114
class TemplatePage(Page):
114115
__slots__ = ('annotations', 'id', 'ignored_regions', 'extra_required_attrs')
115116

116-
def __init__(self, token_dict, page_tokens, annotations, template_id=None, \
117-
ignored_regions=None, extra_required=None):
118-
Page.__init__(self, token_dict, page_tokens)
117+
def __init__(self, htmlpage, token_dict, page_tokens, annotations, \
118+
template_id=None, ignored_regions=None, extra_required=None):
119+
Page.__init__(self, htmlpage, token_dict, page_tokens)
119120
# ensure order is the same as start tag order in the original page
120121
annotations = sorted(annotations, key=lambda x: x.end_index, reverse=True)
121122
self.annotations = sorted(annotations, key=lambda x: x.start_index)
@@ -136,7 +137,7 @@ class ExtractionPage(Page):
136137
"""Parsed data belonging to a web page upon which we wish to perform
137138
extraction.
138139
"""
139-
__slots__ = ('htmlpage', 'token_page_indexes')
140+
__slots__ = ('token_page_indexes', )
140141

141142
def __init__(self, htmlpage, token_dict, page_tokens, token_page_indexes):
142143
"""Construct a new ExtractionPage
@@ -147,12 +148,11 @@ def __init__(self, htmlpage, token_dict, page_tokens, token_page_indexes):
147148
`page_tokens': array of page tokens for matching
148149
`token_page_indexes`: indexes of each token in the parsed htmlpage
149150
"""
150-
Page.__init__(self, token_dict, page_tokens)
151-
self.htmlpage = htmlpage
151+
Page.__init__(self, htmlpage, token_dict, page_tokens)
152152
self.token_page_indexes = token_page_indexes
153153

154154
def htmlpage_region(self, start_token_index, end_token_index):
155-
"""The region in the HtmlPage corresonding to the area defined by
155+
"""The region in the HtmlPage corresponding to the area defined by
156156
the start_token_index and the end_token_index
157157
158158
This includes the tokens at the specified indexes
@@ -162,7 +162,7 @@ def htmlpage_region(self, start_token_index, end_token_index):
162162
return self.htmlpage.subregion(start, end)
163163

164164
def htmlpage_region_inside(self, start_token_index, end_token_index):
165-
"""The region in the HtmlPage corresonding to the area between
165+
"""The region in the HtmlPage corresponding to the area between
166166
the start_token_index and the end_token_index.
167167
168168
This excludes the tokens at the specified indexes

scrapely/extraction/pageparsing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def _process_text(self, text):
314314

315315
def to_template(self):
316316
"""create a TemplatePage from the data fed to this parser"""
317-
return TemplatePage(self.token_dict, self.token_list, self.annotations,
317+
return TemplatePage(self.html_page, self.token_dict, self.token_list, self.annotations,
318318
self.html_page.page_id, self.ignored_regions, self.extra_required_attrs)
319319

320320
class ExtractionPageParser(InstanceLearningParser):

0 commit comments

Comments
 (0)