From 13cca1dd196cfb8a8fdf904f4b6c9a44bcc08449 Mon Sep 17 00:00:00 2001 From: Martin Thurau Date: Wed, 29 Apr 2015 14:55:03 +0200 Subject: [PATCH 1/6] Adds tox configuration. Adds tox.ini to support running the tests on multiple versions. Adds requirements.txt to support dependency installation via pip. --- .gitignore | 3 +++ requirements.txt | 1 + tox.ini | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+) create mode 100644 requirements.txt create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index 84fca1f2..16a2c86e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ dist /man nosetests.xml .coverage +.tox +.idea +.cache diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..d6e1198b --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +-e . diff --git a/tox.ini b/tox.ini new file mode 100644 index 00000000..e6fced90 --- /dev/null +++ b/tox.ini @@ -0,0 +1,20 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. + +[tox] +envlist = py26, py27 + +[testenv] +deps=pytest +# This creates the virtual envs with --site-packages so already packages +# that are already installed will be reused. This is especially useful on +# Windows. Since we use lxml instead of compiling it locally (which in turn +# requires a Compiler and the build dependencies), you can download +# it from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via +# $PYTHONDIR\Scripts\pip.exe install *.whl +#sitepackages=True +commands = + pip install -r requirements.txt + py.test From aa4132f57af0590738067da6d7a068317fce11e2 Mon Sep 17 00:00:00 2001 From: Martin Thurau Date: Wed, 29 Apr 2015 16:18:21 +0200 Subject: [PATCH 2/6] Adds Python 3.4 support. Code now supports Python 2.6, 2.7 and 3.4. 
Python 3.3 isn't supported because of some issues with the parser and the difference between old and new `raise` syntax. --- readability/htmls.py | 9 ++++++--- readability/readability.py | 37 ++++++++++++++++++++++++++----------- setup.py | 3 ++- tox.ini | 2 +- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/readability/htmls.py b/readability/htmls.py index 536b21b3..526fbce3 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -8,8 +8,11 @@ utf8_parser = lxml.html.HTMLParser(encoding='utf-8') +if sys.version_info[0] == 2: + str = unicode + def build_doc(page): - if isinstance(page, unicode): + if isinstance(page, str): enc = None page_unicode = page else: @@ -33,7 +36,7 @@ def normalize_entities(cur_title): u'\u00BB': '"', u'"': '"', } - for c, r in entities.iteritems(): + for c, r in list(entities.items()): if c in cur_title: cur_title = cur_title.replace(c, r) @@ -105,7 +108,7 @@ def shorten_title(doc): def get_body(doc): [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ] - raw_html = unicode(tostring(doc.body or doc)) + raw_html = str(tostring(doc.body or doc)) cleaned = clean_attributes(raw_html) try: #BeautifulSoup(cleaned) #FIXME do we really need to try loading it? 
diff --git a/readability/readability.py b/readability/readability.py index 255e877e..c6391d7d 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function import logging import re import sys @@ -20,6 +21,8 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger() +if sys.version_info[0] == 2: + str = unicode REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I), @@ -81,11 +84,12 @@ def text_length(i): def compile_pattern(elements): if not elements: return None - if isinstance(elements, regexp_type): + elif isinstance(elements, regexp_type): return elements - if isinstance(elements, basestring): + else: + # assume string or string like object elements = elements.split(',') - return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) + return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U) class Document: """Class to build a etree document out of html.""" @@ -195,9 +199,20 @@ def summary(self, html_partial=False): continue else: return cleaned_article - except StandardError, e: + except Exception as e: log.exception('error getting summary: ') - raise Unparseable(str(e)), None, sys.exc_info()[2] + if sys.version_info[0] == 2: + # This is the only reason why we can't support Python 3.3: + # 3.3s parser fails to accept the old syntax (although this + # code never runs) which would require write this line as: + # write this line as + # Unparseable(str(e)) + # but then we loose the traceback information. 3.4 on the + # other hand accepts the old syntax and would only complain + # at runtime. 
+ raise Unparseable(str(e)), None, sys.exc_info()[2] + else: + raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for @@ -247,7 +262,7 @@ def get_article(self, candidates, best_candidate, html_partial=False): return output def select_best_candidate(self, candidates): - sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) + sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True) for candidate in sorted_candidates[:5]: elem = candidate['elem'] self.debug("Top 5 : %6.3f %s" % ( @@ -388,7 +403,7 @@ def transform_misused_divs_into_paragraphs(self): # This results in incorrect results in case there is an # buried within an for example if not REGEXES['divToPElementsRe'].search( - unicode(''.join(map(tostring, list(elem))))): + str(''.join(map(str, map(tostring, list(elem)))))): #self.debug("Altering %s to p" % (describe(elem))) elem.tag = "p" #print "Fixed element "+describe(elem) @@ -609,18 +624,18 @@ def main(): file = None if options.url: - import urllib - file = urllib.urlopen(options.url) + import urllib.request, urllib.parse, urllib.error + file = urllib.request.urlopen(options.url) else: file = open(args[0], 'rt') enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING try: - print Document(file.read(), + print(Document(file.read(), debug=options.verbose, url=options.url, positive_keywords = options.positive_keywords, negative_keywords = options.negative_keywords, - ).summary().encode(enc, 'replace') + ).summary().encode(enc, 'replace')) finally: file.close() diff --git a/setup.py b/setup.py index 5d472d24..6f4cbbfa 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function from setuptools import setup, find_packages import sys 
@@ -8,7 +9,7 @@ mac_ver = platform.mac_ver()[0] mac_ver_no = int(mac_ver.split('.')[1]) if mac_ver_no < 9: - print "Using lxml<2.4" + print("Using lxml<2.4") lxml_requirement = "lxml<2.4" setup( diff --git a/tox.ini b/tox.ini index e6fced90..f7c6e93f 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py26, py27 +envlist = py26, py27, py34 [testenv] deps=pytest From 3ac56329e2a9918497b52bfc9c8c3dd3f6f060ad Mon Sep 17 00:00:00 2001 From: Martin Thurau Date: Wed, 29 Apr 2015 19:33:43 +0200 Subject: [PATCH 3/6] Corrects some things where 2to3 did too much. --- readability/htmls.py | 2 +- readability/readability.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/readability/htmls.py b/readability/htmls.py index 526fbce3..292b4bb3 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -36,7 +36,7 @@ def normalize_entities(cur_title): u'\u00BB': '"', u'"': '"', } - for c, r in list(entities.items()): + for c, r in entities.items(): if c in cur_title: cur_title = cur_title.replace(c, r) diff --git a/readability/readability.py b/readability/readability.py index c6391d7d..18ae4b29 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -89,7 +89,7 @@ def compile_pattern(elements): else: # assume string or string like object elements = elements.split(',') - return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U) + return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) class Document: """Class to build a etree document out of html.""" @@ -207,7 +207,7 @@ def summary(self, html_partial=False): # code never runs) which would require write this line as: # write this line as # Unparseable(str(e)) - # but then we loose the traceback information. 3.4 on the + # but then we lose the traceback information. 3.4 on the # other hand accepts the old syntax and would only complain # at runtime. 
raise Unparseable(str(e)), None, sys.exc_info()[2] @@ -262,7 +262,7 @@ def get_article(self, candidates, best_candidate, html_partial=False): return output def select_best_candidate(self, candidates): - sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True) + sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) for candidate in sorted_candidates[:5]: elem = candidate['elem'] self.debug("Top 5 : %6.3f %s" % ( From ce7ca2683548c9267c014d4597a7896e28689550 Mon Sep 17 00:00:00 2001 From: Martin Thurau Date: Wed, 29 Apr 2015 23:35:18 +0200 Subject: [PATCH 4/6] Adds compatibility `raise_with_traceback` method to support different `raise` syntax Unfortunately the Python 2 `raise` syntax is not supported in Python 3.3 and not all 3.4.x versions so we deal with that by using conditional imports and a compatibility layer. --- readability/compat/__init__.py | 6 ++++++ readability/compat/three.py | 6 ++++++ readability/compat/two.py | 6 ++++++ readability/readability.py | 13 +++---------- tox.ini | 4 ++-- 5 files changed, 23 insertions(+), 12 deletions(-) create mode 100644 readability/compat/__init__.py create mode 100644 readability/compat/three.py create mode 100644 readability/compat/two.py diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py new file mode 100644 index 00000000..ed4d3504 --- /dev/null +++ b/readability/compat/__init__.py @@ -0,0 +1,6 @@ +""" +This module contains compatibility helpers for Python 2/3 interoperability. + +It mainly exists because their are certain incompatibilities in the Python +syntax that can only be solved by conditionally importing different functions. 
+""" diff --git a/readability/compat/three.py b/readability/compat/three.py new file mode 100644 index 00000000..26351575 --- /dev/null +++ b/readability/compat/three.py @@ -0,0 +1,6 @@ +def raise_with_traceback(exc_type, traceback, *args, **kwargs): + """ + Raise a new exception of type `exc_type` with an existing `traceback`. All + additional (keyword-)arguments are forwarded to `exc_type` + """ + raise exc_type(*args, **kwargs).with_traceback(traceback) diff --git a/readability/compat/two.py b/readability/compat/two.py new file mode 100644 index 00000000..642ecb75 --- /dev/null +++ b/readability/compat/two.py @@ -0,0 +1,6 @@ +def raise_with_traceback(exc_type, traceback, *args, **kwargs): + """ + Raise a new exception of type `exc_type` with an existing `traceback`. All + additional (keyword-)arguments are forwarded to `exc_type` + """ + raise exc_type(*args, **kwargs), None, traceback diff --git a/readability/readability.py b/readability/readability.py index 18ae4b29..820bc627 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -202,17 +202,10 @@ def summary(self, html_partial=False): except Exception as e: log.exception('error getting summary: ') if sys.version_info[0] == 2: - # This is the only reason why we can't support Python 3.3: - # 3.3s parser fails to accept the old syntax (although this - # code never runs) which would require write this line as: - # write this line as - # Unparseable(str(e)) - # but then we lose the traceback information. 3.4 on the - # other hand accepts the old syntax and would only complain - # at runtime. 
- raise Unparseable(str(e)), None, sys.exc_info()[2] + from .compat.two import raise_with_traceback else: - raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) + from .compat.three import raise_with_traceback + raise_with_traceback(Unparseable, sys.exc_info()[2], str(e)) def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for diff --git a/tox.ini b/tox.ini index f7c6e93f..50b4a74d 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py26, py27, py34 +envlist = py26, py27, py33, py34 [testenv] deps=pytest @@ -14,7 +14,7 @@ deps=pytest # requires a Compiler and the build dependencies), you can download # it from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via # $PYTHONDIR\Scripts\pip.exe install *.whl -#sitepackages=True +sitepackages=True commands = pip install -r requirements.txt py.test From 046d2c10c3ff42253867ce919208c63e07e80d62 Mon Sep 17 00:00:00 2001 From: Martin Thurau Date: Wed, 29 Apr 2015 23:36:50 +0200 Subject: [PATCH 5/6] Fixes regex declaration in get_encoding. Since get_encoding() is only called when the input is *not* already unicode we need to declare the regexs as byte type so they continue to work in Python 3. 
--- readability/encoding.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/readability/encoding.py b/readability/encoding.py index fb4761df..1c1e5050 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -3,9 +3,9 @@ def get_encoding(page): # Regex for XML and HTML Meta charset declaration - charset_re = re.compile(r']', flags=re.I) - pragma_re = re.compile(r']', flags=re.I) - xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]') + charset_re = re.compile(br']', flags=re.I) + pragma_re = re.compile(br']', flags=re.I) + xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]') declared_encodings = (charset_re.findall(page) + pragma_re.findall(page) + @@ -21,7 +21,7 @@ def get_encoding(page): pass # Fallback to chardet if declared encodings fail - text = re.sub(']*>\s*', ' ', page) + text = re.sub(b']*>\s*', b' ', page) enc = 'utf-8' if not text.strip() or len(text) < 10: return enc # can't guess From 386e48d29b28e1c988cf676a33bbbfb5a41b038a Mon Sep 17 00:00:00 2001 From: Martin Thurau Date: Thu, 30 Apr 2015 11:47:32 +0200 Subject: [PATCH 6/6] Fixes checking of declared encodings in get_encoding. In Python 3 .decode() on bytes requires the name of the encoding to be a str type which means we have to convert the extracted encoding before we can use it. 
--- readability/encoding.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/readability/encoding.py b/readability/encoding.py index 1c1e5050..b91c3e28 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -1,5 +1,6 @@ import re import chardet +import sys def get_encoding(page): # Regex for XML and HTML Meta charset declaration @@ -12,13 +13,18 @@ def get_encoding(page): xml_re.findall(page)) # Try any declared encodings - if len(declared_encodings) > 0: - for declared_encoding in declared_encodings: - try: - page.decode(custom_decode(declared_encoding)) - return custom_decode(declared_encoding) - except UnicodeDecodeError: - pass + for declared_encoding in declared_encodings: + try: + if sys.version_info[0] == 3: + # declared_encoding will actually be bytes but .decode() only + # accepts `str` type. Decode blindly with ascii because no one should + # ever use non-ascii characters in the name of an encoding. + declared_encoding = declared_encoding.decode('ascii', 'replace') + + page.decode(custom_decode(declared_encoding)) + return custom_decode(declared_encoding) + except UnicodeDecodeError: + pass # Fallback to chardet if declared encodings fail text = re.sub(b']*>\s*', b' ', page)