@@ -95,6 +95,15 @@ def fragment_data(self, data_fragment):
9595 """portion of the body corresponding to the HtmlDataFragment"""
9696 return self .body [data_fragment .start :data_fragment .end ]
9797
class TextPage(HtmlPage):
    """HtmlPage whose parsed body is exactly one text HtmlDataFragment.

    Gives plain text the same interface as an html page while avoiding
    unnecessary reparsing: assigning ``body`` stores the text and replaces
    ``parsed_body`` with a single fragment spanning the whole text.
    """

    def _set_body(self, text):
        # Store the raw text first, then describe it as a single fragment
        # covering [0, len) that is flagged as text content.
        self._body = text
        whole_text = HtmlDataFragment(0, len(self._body), True)
        self.parsed_body = [whole_text]

    body = property(
        fget=lambda page: page._body,
        fset=_set_body,
        doc="raw text for the page",
    )
106+
98107class HtmlPageRegion (unicode ):
99108 """A Region of an HtmlPage that has been extracted
100109 """
@@ -107,7 +116,11 @@ def __init__(self, htmlpage, data):
107116 htmlpage is the original page and data is the raw html
108117 """
109118 self .htmlpage = htmlpage
110-
119+
@property
def text_content(self):
    """Text content of this region.

    The region object itself is returned unchanged.
    """
    return self
123+
111124class HtmlPageParsedRegion (HtmlPageRegion ):
112125 """A region of an HtmlPage that has been extracted
113126
@@ -131,20 +144,31 @@ def parsed_fragments(self):
131144 end = self .end_index + 1 if self .end_index is not None else None
132145 return self .htmlpage .parsed_body [self .start_index :end ]
133146
@property
def text_content(self):
    """Text content of this parsed region.

    Collects the body slice of every parsed fragment that is not an
    HtmlTag and is flagged ``is_text_content``, joins the slices with a
    single space, wraps the result in a TextPage carrying the original
    page's url, headers and encoding, and returns that page's
    ``subregion()``.
    """
    text_chunks = []
    for fragment in self.parsed_fragments:
        # Tags are filtered out before is_text_content is consulted,
        # mirroring the short-circuit order of the original condition.
        if isinstance(fragment, HtmlTag):
            continue
        if not fragment.is_text_content:
            continue
        text_chunks.append(self.htmlpage.body[fragment.start:fragment.end])
    joined_text = u" ".join(text_chunks)
    text_page = TextPage(
        self.htmlpage.url,
        self.htmlpage.headers,
        joined_text,
        encoding=self.htmlpage.encoding,
    )
    return text_page.subregion()
155+
156+
class HtmlTagType(object):
    """Integer constants naming the kinds of html tag."""
    OPEN_TAG, CLOSE_TAG, UNPAIRED_TAG = 1, 2, 3
138161
class HtmlDataFragment(object):
    """A ``start``/``end`` span plus an ``is_text_content`` flag.

    ``is_text_content`` defaults to False; callers pass True to mark the
    span as text content.
    """
    __slots__ = ('start', 'end', 'is_text_content')

    def __init__(self, start, end, is_text_content=False):
        self.start, self.end = start, end
        self.is_text_content = is_text_content

    def __str__(self):
        template = "<HtmlDataFragment [%s:%s] is_text_content: %s >"
        return template % (self.start, self.end, self.is_text_content)

    def __repr__(self):
        # repr and str share a single textual form.
        return str(self)
@@ -191,7 +215,7 @@ def parse_html(text):
191215 end = match .end ()
192216
193217 if start > prev_end :
194- yield HtmlDataFragment (prev_end , start )
218+ yield HtmlDataFragment (prev_end , start , True )
195219
196220 if match .groups ()[0 ] is not None : # comment
197221 yield HtmlDataFragment (start , end )
@@ -203,7 +227,7 @@ def parse_html(text):
203227 prev_end = end
204228 textlen = len (text )
205229 if prev_end < textlen :
206- yield HtmlDataFragment (prev_end , textlen )
230+ yield HtmlDataFragment (prev_end , textlen , True )
207231
208232def _parse_script (match ):
209233 """parse a <script>...</script> region matched by _HTML_REGEXP"""
0 commit comments