|
1 | 1 | #!/usr/bin/env python |
| 2 | +from __future__ import print_function |
2 | 3 | import logging |
3 | 4 | import re |
4 | 5 | import sys |
|
20 | 21 | logging.basicConfig(level=logging.INFO) |
21 | 22 | log = logging.getLogger() |
22 | 23 |
|
| 24 | +if sys.version_info[0] == 2: |
| 25 | + str = unicode |
23 | 26 |
|
24 | 27 | REGEXES = { |
25 | 28 | 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I), |
@@ -81,11 +84,12 @@ def text_length(i): |
81 | 84 | def compile_pattern(elements): |
82 | 85 | if not elements: |
83 | 86 | return None |
84 | | - if isinstance(elements, regexp_type): |
| 87 | + elif isinstance(elements, regexp_type): |
85 | 88 | return elements |
86 | | - if isinstance(elements, basestring): |
| 89 | + else: |
| 90 | + # assume string or string like object |
87 | 91 | elements = elements.split(',') |
88 | | - return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) |
| 92 | + return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U) |
89 | 93 |
|
90 | 94 | class Document: |
91 | 95 | """Class to build a etree document out of html.""" |
@@ -195,9 +199,20 @@ def summary(self, html_partial=False): |
195 | 199 | continue |
196 | 200 | else: |
197 | 201 | return cleaned_article |
198 | | - except StandardError, e: |
| 202 | + except Exception as e: |
199 | 203 | log.exception('error getting summary: ') |
200 | | - raise Unparseable(str(e)), None, sys.exc_info()[2] |
| 204 | + if sys.version_info[0] == 2: |
| 205 | + # This is the only reason why we can't support Python 3.3: |
| 206 | + # 3.3s parser fails to accept the old syntax (although this |
| 207 | + # code never runs) which would require write this line as: |
| 208 | + # write this line as |
| 209 | + # Unparseable(str(e)) |
| 210 | + # but then we loose the traceback information. 3.4 on the |
| 211 | + # other hand accepts the old syntax and would only complain |
| 212 | + # at runtime. |
| 213 | + raise Unparseable(str(e)), None, sys.exc_info()[2] |
| 214 | + else: |
| 215 | + raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) |
201 | 216 |
|
202 | 217 | def get_article(self, candidates, best_candidate, html_partial=False): |
203 | 218 | # Now that we have the top candidate, look through its siblings for |
@@ -247,7 +262,7 @@ def get_article(self, candidates, best_candidate, html_partial=False): |
247 | 262 | return output |
248 | 263 |
|
249 | 264 | def select_best_candidate(self, candidates): |
250 | | - sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) |
| 265 | + sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True) |
251 | 266 | for candidate in sorted_candidates[:5]: |
252 | 267 | elem = candidate['elem'] |
253 | 268 | self.debug("Top 5 : %6.3f %s" % ( |
@@ -388,7 +403,7 @@ def transform_misused_divs_into_paragraphs(self): |
388 | 403 | # This results in incorrect results in case there is an <img> |
389 | 404 | # buried within an <a> for example |
390 | 405 | if not REGEXES['divToPElementsRe'].search( |
391 | | - unicode(''.join(map(tostring, list(elem))))): |
| 406 | + str(''.join(map(str, map(tostring, list(elem)))))): |
392 | 407 | #self.debug("Altering %s to p" % (describe(elem))) |
393 | 408 | elem.tag = "p" |
394 | 409 | #print "Fixed element "+describe(elem) |
@@ -609,18 +624,18 @@ def main(): |
609 | 624 |
|
610 | 625 | file = None |
611 | 626 | if options.url: |
612 | | - import urllib |
613 | | - file = urllib.urlopen(options.url) |
| 627 | + import urllib.request, urllib.parse, urllib.error |
| 628 | + file = urllib.request.urlopen(options.url) |
614 | 629 | else: |
615 | 630 | file = open(args[0], 'rt') |
616 | 631 | enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING |
617 | 632 | try: |
618 | | - print Document(file.read(), |
| 633 | + print(Document(file.read(), |
619 | 634 | debug=options.verbose, |
620 | 635 | url=options.url, |
621 | 636 | positive_keywords = options.positive_keywords, |
622 | 637 | negative_keywords = options.negative_keywords, |
623 | | - ).summary().encode(enc, 'replace') |
| 638 | + ).summary().encode(enc, 'replace')) |
624 | 639 | finally: |
625 | 640 | file.close() |
626 | 641 |
|
|
0 commit comments