From 13cca1dd196cfb8a8fdf904f4b6c9a44bcc08449 Mon Sep 17 00:00:00 2001
From: Martin Thurau <martin.thurau@gmail.com>
Date: Wed, 29 Apr 2015 14:55:03 +0200
Subject: [PATCH 1/6] Adds tox configuration.

Adds tox.ini to support running the tests on multiple versions. Adds
requirements.txt to support dependency installation via pip.
---
 .gitignore       |  3 +++
 requirements.txt |  1 +
 tox.ini          | 20 ++++++++++++++++++++
 3 files changed, 24 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 tox.ini

diff --git a/.gitignore b/.gitignore
index 84fca1f2..16a2c86e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ dist
 /man
 nosetests.xml
 .coverage
+.tox
+.idea
+.cache
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..d6e1198b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+-e .
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 00000000..e6fced90
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,20 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py26, py27
+
+[testenv]
+deps=pytest
+# This creates the virtual envs with --site-packages so already packages
+# that are already installed will be reused. This is especially useful on
+# Windows. Since we use lxml instead of compiling it locally (which in turn
+# requires a Compiler and the build dependencies), you can download
+# it from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
+# $PYTHONDIR\Scripts\pip.exe install *.whl
+#sitepackages=True
+commands =
+    pip install -r requirements.txt
+    py.test

From aa4132f57af0590738067da6d7a068317fce11e2 Mon Sep 17 00:00:00 2001
From: Martin Thurau <martin.thurau@gmail.com>
Date: Wed, 29 Apr 2015 16:18:21 +0200
Subject: [PATCH 2/6] Adds Python 3.4 support.

Code now supports Python 2.6, 2.7 and 3.4. Python 3.3 isn't supported
because of some issues with the parser and the difference between old and
new `raise` syntax.
---
 readability/htmls.py       |  9 ++++++---
 readability/readability.py | 37 ++++++++++++++++++++++++++-----------
 setup.py                   |  3 ++-
 tox.ini                    |  2 +-
 4 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/readability/htmls.py b/readability/htmls.py
index 536b21b3..526fbce3 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -8,8 +8,11 @@
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
+if sys.version_info[0] == 2:
+    str = unicode
+
 def build_doc(page):
-    if isinstance(page, unicode):
+    if isinstance(page, str):
         enc = None
         page_unicode = page
     else:
@@ -33,7 +36,7 @@ def normalize_entities(cur_title):
         u'\u00BB': '"',
         u'&quot;': '"',
     }
-    for c, r in entities.iteritems():
+    for c, r in list(entities.items()):
         if c in cur_title:
             cur_title = cur_title.replace(c, r)
 
@@ -105,7 +108,7 @@ def shorten_title(doc):
 
 def get_body(doc):
     [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = str(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
diff --git a/readability/readability.py b/readability/readability.py
index 255e877e..c6391d7d 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 import logging
 import re
 import sys
@@ -20,6 +21,8 @@
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger()
 
+if sys.version_info[0] == 2:
+    str = unicode
 
 REGEXES = {
     'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@@ -81,11 +84,12 @@ def text_length(i):
 def compile_pattern(elements):
     if not elements:
         return None
-    if isinstance(elements, regexp_type):
+    elif isinstance(elements, regexp_type):
         return elements
-    if isinstance(elements, basestring):
+    else:
+        # assume string or string like object
         elements = elements.split(',')
-    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
+        return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U)
 
 class Document:
     """Class to build a etree document out of html."""
@@ -195,9 +199,20 @@ def summary(self, html_partial=False):
                     continue
                 else:
                     return cleaned_article
-        except StandardError, e:
+        except Exception as e:
             log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            if sys.version_info[0] == 2:
+                # This is the only reason why we can't support Python 3.3:
+                # 3.3s parser fails to accept the old syntax (although this
+                # code never runs) which would require write this line as:
+                # write this line as
+                #    Unparseable(str(e))
+                # but then we loose the traceback information. 3.4 on the
+                # other hand accepts the old syntax and would only complain
+                # at runtime.
+                raise Unparseable(str(e)), None, sys.exc_info()[2]
+            else:
+                raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
@@ -247,7 +262,7 @@ def get_article(self, candidates, best_candidate, html_partial=False):
         return output
 
     def select_best_candidate(self, candidates):
-        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+        sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True)
         for candidate in sorted_candidates[:5]:
             elem = candidate['elem']
             self.debug("Top 5 : %6.3f %s" % (
@@ -388,7 +403,7 @@ def transform_misused_divs_into_paragraphs(self):
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    str(''.join(map(str, map(tostring, list(elem)))))):
                 #self.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
                 #print "Fixed element "+describe(elem)
@@ -609,18 +624,18 @@ def main():
 
     file = None
     if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import urllib.request, urllib.parse, urllib.error
+        file = urllib.request.urlopen(options.url)
     else:
         file = open(args[0], 'rt')
     enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
     try:
-        print Document(file.read(),
+        print(Document(file.read(),
             debug=options.verbose,
             url=options.url,
             positive_keywords = options.positive_keywords,
             negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary().encode(enc, 'replace'))
     finally:
         file.close()
 
diff --git a/setup.py b/setup.py
index 5d472d24..6f4cbbfa 100755
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 from setuptools import setup, find_packages
 import sys
 
@@ -8,7 +9,7 @@
     mac_ver = platform.mac_ver()[0]
     mac_ver_no = int(mac_ver.split('.')[1])
     if mac_ver_no < 9:
-        print "Using lxml<2.4"
+        print("Using lxml<2.4")
         lxml_requirement = "lxml<2.4"
 
 setup(
diff --git a/tox.ini b/tox.ini
index e6fced90..f7c6e93f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py26, py27
+envlist = py26, py27, py34
 
 [testenv]
 deps=pytest

From 3ac56329e2a9918497b52bfc9c8c3dd3f6f060ad Mon Sep 17 00:00:00 2001
From: Martin Thurau <martin.thurau@gmail.com>
Date: Wed, 29 Apr 2015 19:33:43 +0200
Subject: [PATCH 3/6] Corrects some things where 2to3 did too much.

---
 readability/htmls.py       | 2 +-
 readability/readability.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/readability/htmls.py b/readability/htmls.py
index 526fbce3..292b4bb3 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -36,7 +36,7 @@ def normalize_entities(cur_title):
         u'\u00BB': '"',
         u'&quot;': '"',
     }
-    for c, r in list(entities.items()):
+    for c, r in entities.items():
         if c in cur_title:
             cur_title = cur_title.replace(c, r)
 
diff --git a/readability/readability.py b/readability/readability.py
index c6391d7d..18ae4b29 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -89,7 +89,7 @@ def compile_pattern(elements):
     else:
         # assume string or string like object
         elements = elements.split(',')
-        return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U)
+        return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
 
 class Document:
     """Class to build a etree document out of html."""
@@ -207,7 +207,7 @@ def summary(self, html_partial=False):
                 # code never runs) which would require write this line as:
                 # write this line as
                 #    Unparseable(str(e))
-                # but then we loose the traceback information. 3.4 on the
+                # but then we lose the traceback information. 3.4 on the
                 # other hand accepts the old syntax and would only complain
                 # at runtime.
                 raise Unparseable(str(e)), None, sys.exc_info()[2]
@@ -262,7 +262,7 @@ def get_article(self, candidates, best_candidate, html_partial=False):
         return output
 
     def select_best_candidate(self, candidates):
-        sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True)
+        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
         for candidate in sorted_candidates[:5]:
             elem = candidate['elem']
             self.debug("Top 5 : %6.3f %s" % (

From ce7ca2683548c9267c014d4597a7896e28689550 Mon Sep 17 00:00:00 2001
From: Martin Thurau <martin.thurau@gmail.com>
Date: Wed, 29 Apr 2015 23:35:18 +0200
Subject: [PATCH 4/6] Adds compatibility `raise_with_traceback` method to
 support different `raise` syntax

Unfortunately the Python 2 `raise` syntax is not supported in Python 3.3 and not in all 3.4.x versions, so we deal with that by using conditional imports and a compatibility layer.
---
 readability/compat/__init__.py |  6 ++++++
 readability/compat/three.py    |  6 ++++++
 readability/compat/two.py      |  6 ++++++
 readability/readability.py     | 13 +++----------
 tox.ini                        |  4 ++--
 5 files changed, 23 insertions(+), 12 deletions(-)
 create mode 100644 readability/compat/__init__.py
 create mode 100644 readability/compat/three.py
 create mode 100644 readability/compat/two.py

diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
new file mode 100644
index 00000000..ed4d3504
--- /dev/null
+++ b/readability/compat/__init__.py
@@ -0,0 +1,6 @@
+"""
+This module contains compatibility helpers for Python 2/3 interoperability.
+
+It mainly exists because their are certain incompatibilities in the Python
+syntax that can only be solved by conditionally importing different functions.
+"""
diff --git a/readability/compat/three.py b/readability/compat/three.py
new file mode 100644
index 00000000..26351575
--- /dev/null
+++ b/readability/compat/three.py
@@ -0,0 +1,6 @@
+def raise_with_traceback(exc_type, traceback, *args, **kwargs):
+    """
+    Raise a new exception of type `exc_type` with an existing `traceback`. All
+    additional (keyword-)arguments are forwarded to `exc_type`
+    """
+    raise exc_type(*args, **kwargs).with_traceback(traceback)
diff --git a/readability/compat/two.py b/readability/compat/two.py
new file mode 100644
index 00000000..642ecb75
--- /dev/null
+++ b/readability/compat/two.py
@@ -0,0 +1,6 @@
+def raise_with_traceback(exc_type, traceback, *args, **kwargs):
+    """
+    Raise a new exception of type `exc_type` with an existing `traceback`. All
+    additional (keyword-)arguments are forwarded to `exc_type`
+    """
+    raise exc_type(*args, **kwargs), None, traceback
diff --git a/readability/readability.py b/readability/readability.py
index 18ae4b29..820bc627 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -202,17 +202,10 @@ def summary(self, html_partial=False):
         except Exception as e:
             log.exception('error getting summary: ')
             if sys.version_info[0] == 2:
-                # This is the only reason why we can't support Python 3.3:
-                # 3.3s parser fails to accept the old syntax (although this
-                # code never runs) which would require write this line as:
-                # write this line as
-                #    Unparseable(str(e))
-                # but then we lose the traceback information. 3.4 on the
-                # other hand accepts the old syntax and would only complain
-                # at runtime.
-                raise Unparseable(str(e)), None, sys.exc_info()[2]
+                from .compat.two import raise_with_traceback
             else:
-                raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
+                from .compat.three import raise_with_traceback
+            raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
diff --git a/tox.ini b/tox.ini
index f7c6e93f..50b4a74d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py26, py27, py34
+envlist = py26, py27, py33, py34
 
 [testenv]
 deps=pytest
@@ -14,7 +14,7 @@ deps=pytest
 # requires a Compiler and the build dependencies), you can download
 # it from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
 # $PYTHONDIR\Scripts\pip.exe install *.whl
-#sitepackages=True
+sitepackages=True
 commands =
     pip install -r requirements.txt
     py.test

From 046d2c10c3ff42253867ce919208c63e07e80d62 Mon Sep 17 00:00:00 2001
From: Martin Thurau <martin.thurau@gmail.com>
Date: Wed, 29 Apr 2015 23:36:50 +0200
Subject: [PATCH 5/6] Fixes regex declaration in get_encoding.

Since get_encoding() is only called when the input is *not* already unicode, we need to declare the regexes as bytes patterns so they continue to work in Python 3.
---
 readability/encoding.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/readability/encoding.py b/readability/encoding.py
index fb4761df..1c1e5050 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -3,9 +3,9 @@
 
 def get_encoding(page):
     # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+    charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
 
     declared_encodings = (charset_re.findall(page) +
             pragma_re.findall(page) +
@@ -21,7 +21,7 @@ def get_encoding(page):
                 pass
 
     # Fallback to chardet if declared encodings fail
-    text = re.sub('</?[^>]*>\s*', ' ', page)
+    text = re.sub(b'</?[^>]*>\s*', b' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
         return enc # can't guess

From 386e48d29b28e1c988cf676a33bbbfb5a41b038a Mon Sep 17 00:00:00 2001
From: Martin Thurau <martin.thurau@gmail.com>
Date: Thu, 30 Apr 2015 11:47:32 +0200
Subject: [PATCH 6/6] Fixes checking of declared encodings in get_encoding.

In Python 3, .decode() on bytes requires the name of the encoding to be a str, which means we have to convert the extracted encoding before we can use it.
---
 readability/encoding.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/readability/encoding.py b/readability/encoding.py
index 1c1e5050..b91c3e28 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,5 +1,6 @@
 import re
 import chardet
+import sys
 
 def get_encoding(page):
     # Regex for XML and HTML Meta charset declaration
@@ -12,13 +13,18 @@ def get_encoding(page):
             xml_re.findall(page))
 
     # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
+    for declared_encoding in declared_encodings:
+        try:
+            if sys.version_info[0] == 3:
+                # declared_encoding will actually be bytes but .decode() only
+                # accepts `str` type. Decode blindly with ascii because no one should
+                # ever use non-ascii characters in the name of an encoding.
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
+
+            page.decode(custom_decode(declared_encoding))
+            return custom_decode(declared_encoding)
+        except UnicodeDecodeError:
+            pass
 
     # Fallback to chardet if declared encodings fail
     text = re.sub(b'</?[^>]*>\s*', b' ', page)