@@ -182,10 +182,10 @@ def summary(self, html_partial=False):
182182 if ruthless :
183183 self .remove_unlikely_candidates ()
184184 self .transform_misused_divs_into_paragraphs ()
185+
185186 candidates = self .score_paragraphs ()
186187
187188 best_candidate = self .select_best_candidate (candidates )
188-
189189 if best_candidate :
190190 article = self .get_article (candidates , best_candidate ,
191191 html_partial = html_partial )
@@ -381,13 +381,13 @@ def class_weight(self, e):
381381 def score_node (self , elem ):
382382 content_score = self .class_weight (elem )
383383 name = elem .tag .lower ()
384- if name == "div" :
384+ if name in [ "div" , "article" ] :
385385 content_score += 5
386386 elif name in ["pre" , "td" , "blockquote" ]:
387387 content_score += 3
388- elif name in ["address" , "ol" , "ul" , "dl" , "dd" , "dt" , "li" , "form" ]:
388+ elif name in ["address" , "ol" , "ul" , "dl" , "dd" , "dt" , "li" , "form" , "aside" ]:
389389 content_score -= 3
390- elif name in ["h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "th" ]:
390+ elif name in ["h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "th" , "header" , "footer" , "nav" ]:
391391 content_score -= 5
392392 return {
393393 'content_score' : content_score ,
@@ -400,8 +400,10 @@ def remove_unlikely_candidates(self):
400400 if len (s ) < 2 :
401401 continue
402402 if REGEXES ['unlikelyCandidatesRe' ].search (s ) and (not REGEXES ['okMaybeItsACandidateRe' ].search (s )) and elem .tag not in ['html' , 'body' ]:
403+ #print("Removing", describe(elem))
403404 log .debug ("Removing unlikely candidate - %s" % describe (elem ))
404405 elem .drop_tree ()
406+ #print("After removal: {}".format(tostring(self.html)))
405407
406408 def transform_misused_divs_into_paragraphs (self ):
407409 for elem in self .tags (self .html , 'div' ):
@@ -463,7 +465,7 @@ def sanitize(self, node, candidates):
463465
464466 allowed = {}
465467 # Conditionally clean <table>s, <ul>s, <div>s, <aside>s, <header>s, <footer>s, and <section>s
466- for el in self .reverse_tags (node , "table" , "ul" , "div" ):
468+ for el in self .reverse_tags (node , "table" , "ul" , "div" , "aside" , "header" , "footer" , "section" ):
467469 if el in allowed :
468470 continue
469471 weight = self .class_weight (el )
@@ -577,7 +579,7 @@ def sanitize(self, node, candidates):
577579 if siblings and sum (siblings ) > 1000 :
578580 to_remove = False
579581 log .debug ("Allowing %s" % describe (el ))
580- for desnode in self .tags (el , "table" , "ul" , "div" ):
582+ for desnode in self .tags (el , "table" , "ul" , "div" , "section" ):
581583 allowed [desnode ] = True
582584
583585 if to_remove :
0 commit comments