@@ -75,6 +75,15 @@ def fragment_data(self, data_fragment):
7575 """portion of the body corresponding to the HtmlDataFragment"""
7676 return self .body [data_fragment .start :data_fragment .end ]
7777
class TextPage(HtmlPage):
    """An HtmlPage whose body is plain text held in one single
    HtmlDataFragment.

    It offers plain text through the same interface as an html page while
    avoiding unnecessary reparsing.
    """

    def _set_body(self, text):
        """Store the raw text and describe it as one text-content fragment."""
        self._body = text
        text_length = len(text)
        # One fragment spanning the whole text, flagged as text content.
        self.parsed_body = [HtmlDataFragment(0, text_length, True)]

    body = property(lambda page: page._body, _set_body,
                    doc="raw text for the page")
86+
7887class HtmlPageRegion (unicode ):
7988 """A Region of an HtmlPage that has been extracted
8089 """
@@ -87,7 +96,11 @@ def __init__(self, htmlpage, data):
8796 htmlpage is the original page and data is the raw html
8897 """
8998 self .htmlpage = htmlpage
90-
99+
@property
def text_content(self):
    """Text content of this region, which is the region object itself."""
    return self
103+
91104class HtmlPageParsedRegion (HtmlPageRegion ):
92105 """A region of an HtmlPage that has been extracted
93106
@@ -111,20 +124,31 @@ def parsed_fragments(self):
111124 end = self .end_index + 1 if self .end_index is not None else None
112125 return self .htmlpage .parsed_body [self .start_index :end ]
113126
@property
def text_content(self):
    """Text content of this parsed region.

    Collects the body slice of every parsed fragment that is not an
    HtmlTag and is flagged ``is_text_content``, joins the slices with a
    single space, wraps the result in a TextPage that reuses this page's
    url, headers and encoding, and returns that page's ``subregion()``.
    """
    page = self.htmlpage
    text_pieces = []
    for fragment in self.parsed_fragments:
        # Tags are excluded before the is_text_content flag is consulted.
        if isinstance(fragment, HtmlTag):
            continue
        if fragment.is_text_content:
            text_pieces.append(page.body[fragment.start:fragment.end])
    joined_text = u" ".join(text_pieces)
    text_page = TextPage(page.url, page.headers, joined_text,
                         encoding=page.encoding)
    return text_page.subregion()
135+
136+
class HtmlTagType(object):
    """Integer constants for tag kinds: open, close and unpaired
    (meaning taken from the constant names)."""
    OPEN_TAG, CLOSE_TAG, UNPAIRED_TAG = 1, 2, 3
118141
class HtmlDataFragment(object):
    """A span of a page body identified by its offsets.

    ``start`` and ``end`` are used as slice bounds into the page body.
    ``is_text_content`` flags whether the span counts as text content; it
    defaults to False so existing two-argument callers keep working.
    """
    # Fixed attribute set: instances carry no per-instance __dict__.
    __slots__ = ('start', 'end', 'is_text_content')

    def __init__(self, start, end, is_text_content=False):
        """Record the slice bounds and the text-content flag."""
        self.start = start
        self.end = end
        self.is_text_content = is_text_content

    def __str__(self):
        """Return a readable description with the bounds and the flag."""
        return "<HtmlDataFragment [%s:%s] is_text_content: %s >" % (
            self.start, self.end, self.is_text_content)

    def __repr__(self):
        """Return the same text as ``str(self)``."""
        return str(self)
@@ -171,7 +195,7 @@ def parse_html(text):
171195 end = match .end ()
172196
173197 if start > prev_end :
174- yield HtmlDataFragment (prev_end , start )
198+ yield HtmlDataFragment (prev_end , start , True )
175199
176200 if match .groups ()[0 ] is not None : # comment
177201 yield HtmlDataFragment (start , end )
@@ -183,7 +207,7 @@ def parse_html(text):
183207 prev_end = end
184208 textlen = len (text )
185209 if prev_end < textlen :
186- yield HtmlDataFragment (prev_end , textlen )
210+ yield HtmlDataFragment (prev_end , textlen , True )
187211
188212def _parse_script (match ):
189213 """parse a <script>...</script> region matched by _HTML_REGEXP"""
0 commit comments