@@ -66,7 +66,14 @@ def handle_tag(self, html_tag, index):
6666 pass
6767
6868_END_UNPAIREDTAG_TAGS = ["form" , "div" , "p" , "table" , "tr" , "td" ]
69-
# Implicit-close table consulted when a tag is OPENED: each key is the tag
# being opened, and its value lists the tag names whose still-open labelled
# annotation is ended by that opening tag.
_AUTO_CLOSE_TAGS_ON_OPEN = {
    # the given key closes the tags in its list
    "p": ["p"],
    "option": ["option"],
}
# Implicit-close table consulted when a tag is CLOSED: each key is the tag
# being closed, and its value lists the tag names whose still-open labelled
# annotation is ended by that closing tag.
_AUTO_CLOSE_TAGS_ON_CLOSE = {
    "select": ["option"],
}
7077class TemplatePageParser (InstanceLearningParser ):
7178 """Template parsing for instance based learning algorithm"""
7279
@@ -169,19 +176,15 @@ def _handle_open_tag(self, html_tag):
169176 self ._close_unpaired_tag ()
170177 else :
171178 self .unpairedtag_stack .append (html_tag .tag )
172-
173- # can't be a p inside another p. Also, an open p element closes
174- # a previous open p element.
175- if html_tag .tag == "p" and html_tag .tag in self .labelled_tag_stacks :
176- annotation = self .labelled_tag_stacks .pop (html_tag .tag )[0 ]
177- annotation .end_index = self .next_tag_index
178- self .annotations .append (annotation )
179-
179+
180+ tagname = replacement or self ._update_replacement_stack (html_tag )
181+ self ._handle_unclosed_tags (tagname , _AUTO_CLOSE_TAGS_ON_OPEN )
182+
180183 jannotation = self ._read_template_annotation (html_tag )
181184 if not jannotation :
182- if html_tag . tag in self .labelled_tag_stacks :
185+ if tagname in self .labelled_tag_stacks :
183186 # add this tag to the stack to match correct end tag
184- self .labelled_tag_stacks [html_tag . tag ].append (None )
187+ self .labelled_tag_stacks [tagname ].append (None )
185188 self .next_tag_index += 1
186189 return
187190
@@ -227,7 +230,7 @@ def _handle_open_tag(self, html_tag):
227230
228231 # look for a closing tag if the content is important
229232 if annotation .surrounds_attribute :
230- self .labelled_tag_stacks [html_tag . tag ].append (annotation )
233+ self .labelled_tag_stacks [tagname ].append (annotation )
231234 else :
232235 annotation .end_index = annotation .start_index + 1
233236 self .annotations .append (annotation )
@@ -239,6 +242,7 @@ def _handle_close_tag(self, html_tag):
239242 self .unpairedtag_stack .pop ()
240243 else :
241244 self ._close_unpaired_tag ()
245+
242246 ignored_tags = self .ignored_tag_stacks .get (html_tag .tag )
243247 if ignored_tags is not None :
244248 tag = ignored_tags .pop ()
@@ -250,15 +254,10 @@ def _handle_close_tag(self, html_tag):
250254 if len (ignored_tags ) == 0 :
251255 del self .ignored_tag_stacks [html_tag .tag ]
252256
253- if html_tag .tag in self .replacement_stacks :
254- replacement = self .replacement_stacks [html_tag .tag ].pop ()
255- if replacement :
256- self .token_list .pop ()
257- self ._add_token (replacement , html_tag .tag_type , html_tag .start , html_tag .end )
258- if len (self .replacement_stacks [html_tag .tag ]) == 0 :
259- del self .replacement_stacks [html_tag .tag ]
257+ tagname = self ._update_replacement_stack (html_tag )
258+ self ._handle_unclosed_tags (tagname , _AUTO_CLOSE_TAGS_ON_CLOSE )
260259
261- labelled_tags = self .labelled_tag_stacks .get (html_tag . tag )
260+ labelled_tags = self .labelled_tag_stacks .get (tagname )
262261 if labelled_tags is None :
263262 self .next_tag_index += 1
264263 return
@@ -274,12 +273,35 @@ def _handle_close_tag(self, html_tag):
274273 else :
275274 self .next_tag_index += 1
276275 if len (labelled_tags ) == 0 :
277- del self .labelled_tag_stacks [html_tag . tag ]
276+ del self .labelled_tag_stacks [tagname ]
278277 if annotation .variant_id and self .variant_stack :
279278 prev = self .variant_stack .pop ()
280279 if prev != annotation .variant_id :
281280 raise ValueError ("unbalanced variant annotation tags" )
282281
def _update_replacement_stack(self, html_tag):
    """Pop the pending replacement for ``html_tag`` and return the tag name to use.

    If ``self.replacement_stacks`` holds a stack for ``html_tag.tag``, its top
    entry is popped. When that entry is truthy, the most recently added token
    is discarded and re-added under the replacement name (same tag type and
    start/end offsets). The per-tag stack is removed once it becomes empty.

    Returns the popped replacement when it is truthy, otherwise
    ``html_tag.tag``. The result is therefore always usable as a key into
    per-tag bookkeeping dictionaries.
    """
    tag = html_tag.tag
    stack = self.replacement_stacks.get(tag)
    if stack is None:
        # This tag name is not being replaced anywhere: nothing to update.
        return tag

    replacement = stack.pop()
    if replacement:
        # Swap the token just emitted for this tag with the replacement name.
        self.token_list.pop()
        self._add_token(replacement, html_tag.tag_type,
                        html_tag.start, html_tag.end)
    if not stack:
        del self.replacement_stacks[tag]

    # A falsy stack entry means "this occurrence had no replacement"; fall
    # back to the real tag name rather than leaking the falsy placeholder.
    return replacement or tag
292+
def _handle_unclosed_tags(self, tagname, auto_close_tags):
    """End the labelled annotation of a tag that ``tagname`` implicitly closes.

    ``auto_close_tags`` maps a tag name to the list of tag names it closes
    (e.g. a ``p`` cannot sit inside another ``p``, so an opening ``p`` closes
    a previously opened one). The first listed tag that has an entry in
    ``self.labelled_tag_stacks`` has its whole stack removed; the first
    annotation of that stack gets ``end_index`` set to the current
    ``self.next_tag_index`` and is appended to ``self.annotations``. At most
    one stack is closed per call.

    Returns ``tagname`` unchanged.
    """
    open_stacks = self.labelled_tag_stacks
    for candidate in auto_close_tags.get(tagname, ()):
        if candidate not in open_stacks:
            continue
        pending = open_stacks.pop(candidate)[0]
        pending.end_index = self.next_tag_index
        self.annotations.append(pending)
        # Only the first matching tag is closed.
        break
    return tagname
304+
283305 def handle_data (self , html_data_fragment , index ):
284306 fragment_text = self .html_page .fragment_data (html_data_fragment )
285307 self ._process_text (fragment_text )
0 commit comments