@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 import logging
 import re
 import sys
@@ -19,6 +20,8 @@
 
 log = logging.getLogger()
 
+if sys.version_info[0] == 2:
+    str = unicode
 
 REGEXES = {
     'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
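For context on the two additions above: `print_function` makes `print(...)` parse the same way on both interpreters, and rebinding the module-level `str` to `unicode` under Python 2 makes every later `str(...)` call in this file return text rather than bytes. A minimal standalone sketch of the effect (not part of the commit):

    from __future__ import print_function
    import sys

    if sys.version_info[0] == 2:
        str = unicode  # noqa: F821 - the unicode builtin only exists on Python 2

    # str() now yields a text object on both major versions:
    # the type name printed below is 'unicode' on Python 2 and 'str' on Python 3.
    message = str(ValueError('boom'))
    print(type(message).__name__)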
@@ -80,11 +83,12 @@ def text_length(i):
 def compile_pattern(elements):
     if not elements:
         return None
-    if isinstance(elements, regexp_type):
+    elif isinstance(elements, regexp_type):
         return elements
-    if isinstance(elements, basestring):
+    else:
+        # assume string or string like object
         elements = elements.split(',')
-    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
+        return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
 
 class Document:
     """Class to build a etree document out of html."""
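A quick usage sketch of the reshaped `compile_pattern` helper above; the keyword values are made up, and `regexp_type` is assumed (as elsewhere in this module) to be the compiled-regex type, so an already compiled pattern passes through untouched:

    import re

    # A comma-separated keyword string becomes one lowercased alternation.
    pattern = compile_pattern('politics,sports')
    assert pattern.search('local sports scores') is not None

    # An already compiled regex is returned as-is.
    assert compile_pattern(re.compile('sidebar')).pattern == 'sidebar'

    # Empty input produces no pattern at all.
    assert compile_pattern(None) is None
    assert compile_pattern('') is None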
@@ -194,9 +198,13 @@ def summary(self, html_partial=False):
                         continue
                     else:
                         return cleaned_article
-        except StandardError, e:
+        except Exception as e:
             log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            if sys.version_info[0] == 2:
+                from .compat.two import raise_with_traceback
+            else:
+                from .compat.three import raise_with_traceback
+            raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
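The `.compat.two` and `.compat.three` modules referenced above are not shown in this diff, so the following is only a sketch of what a `raise_with_traceback` shim of this kind typically looks like, with the `(exc_type, traceback, *args)` signature inferred from the call site. The Python 2 variant relies on the three-expression raise statement, which is a syntax error on Python 3; that is exactly why the import is guarded by a version check rather than done unconditionally:

    # compat/three.py (sketch): attach the original traceback to the new exception.
    def raise_with_traceback(exc_type, traceback, *args, **kwargs):
        raise exc_type(*args, **kwargs).with_traceback(traceback)

    # compat/two.py (sketch): Python 2-only raise syntax; a Python 3 interpreter
    # cannot even byte-compile this file, so it must only ever be imported on 2.x.
    def raise_with_traceback(exc_type, traceback, *args, **kwargs):
        raise exc_type(*args, **kwargs), None, traceback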
@@ -389,7 +397,7 @@ def transform_misused_divs_into_paragraphs(self):
            # This results in incorrect results in case there is an <img>
            # buried within an <a> for example
            if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    str(''.join(map(str, map(tostring, list(elem)))))):
                #self.debug("Altering %s to p" % (describe(elem)))
                elem.tag = "p"
                #print "Fixed element "+describe(elem)
@@ -612,18 +620,18 @@ def main():
 
     file = None
     if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import urllib.request, urllib.parse, urllib.error
+        file = urllib.request.urlopen(options.url)
     else:
         file = open(args[0], 'rt')
     enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
     try:
-        print Document(file.read(),
+        print(Document(file.read(),
             debug=options.verbose,
             url=options.url,
             positive_keywords = options.positive_keywords,
             negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary().encode(enc, 'replace'))
     finally:
         file.close()
 
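For reference, the flow that `main()` drives after this change, written as a small standalone Python 3 snippet; the URL and keyword values are placeholders, and the `readability.readability` import path is assumed from the usual package layout rather than taken from this diff:

    import urllib.request

    from readability.readability import Document

    url = 'https://example.com/article.html'  # placeholder URL
    with urllib.request.urlopen(url) as response:
        html = response.read()

    doc = Document(
        html,
        url=url,
        positive_keywords='article,content',  # comma-separated, as compile_pattern expects
        negative_keywords='sidebar,footer',   # placeholder keyword lists
    )
    print(doc.summary())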