@@ -101,9 +101,10 @@ class Page(object):
101101 dictionary of tokens and an array of raw token ids
102102 """
103103
104- __slots__ = ('token_dict' , 'page_tokens' )
104+ __slots__ = ('token_dict' , 'page_tokens' , 'htmlpage' )
105105
106- def __init__ (self , token_dict , page_tokens ):
106+ def __init__ (self , htmlpage , token_dict , page_tokens ):
107+ self .htmlpage = htmlpage
107108 self .token_dict = token_dict
108109 # use a numpy array because we can index/slice easily and efficiently
109110 if not isinstance (page_tokens , ndarray ):
@@ -113,9 +114,9 @@ def __init__(self, token_dict, page_tokens):
113114class TemplatePage (Page ):
114115 __slots__ = ('annotations' , 'id' , 'ignored_regions' , 'extra_required_attrs' )
115116
116- def __init__ (self , token_dict , page_tokens , annotations , template_id = None , \
117- ignored_regions = None , extra_required = None ):
118- Page .__init__ (self , token_dict , page_tokens )
117+ def __init__ (self , htmlpage , token_dict , page_tokens , annotations , \
118+ template_id = None , ignored_regions = None , extra_required = None ):
119+ Page .__init__ (self , htmlpage , token_dict , page_tokens )
119120 # ensure order is the same as start tag order in the original page
120121 annotations = sorted (annotations , key = lambda x : x .end_index , reverse = True )
121122 self .annotations = sorted (annotations , key = lambda x : x .start_index )
@@ -136,7 +137,7 @@ class ExtractionPage(Page):
136137 """Parsed data belonging to a web page upon which we wish to perform
137138 extraction.
138139 """
139- __slots__ = ('htmlpage ' , 'token_page_indexes' )
140+ __slots__ = ('token_page_indexes ' , )
140141
141142 def __init__ (self , htmlpage , token_dict , page_tokens , token_page_indexes ):
142143 """Construct a new ExtractionPage
@@ -147,12 +148,11 @@ def __init__(self, htmlpage, token_dict, page_tokens, token_page_indexes):
147148 `page_tokens`: array of page tokens for matching
148149 `token_page_indexes`: indexes of each token in the parsed htmlpage
149150 """
150- Page .__init__ (self , token_dict , page_tokens )
151- self .htmlpage = htmlpage
151+ Page .__init__ (self , htmlpage , token_dict , page_tokens )
152152 self .token_page_indexes = token_page_indexes
153153
154154 def htmlpage_region (self , start_token_index , end_token_index ):
155- """The region in the HtmlPage corresonding to the area defined by
155+ """The region in the HtmlPage corresponding to the area defined by
156156 the start_token_index and the end_token_index
157157
158158 This includes the tokens at the specified indexes
@@ -162,7 +162,7 @@ def htmlpage_region(self, start_token_index, end_token_index):
162162 return self .htmlpage .subregion (start , end )
163163
164164 def htmlpage_region_inside (self , start_token_index , end_token_index ):
165- """The region in the HtmlPage corresonding to the area between
165+ """The region in the HtmlPage corresponding to the area between
166166 the start_token_index and the end_token_index.
167167
168168 This excludes the tokens at the specified indexes
0 commit comments