11# coding=utf-8
2- from HTMLParser import HTMLParser
2+
33from collections import OrderedDict
44from contextlib import closing
5+ from builtins import next
6+ from builtins import str
57import re
6- from urlparse import urljoin
78
8- __version__ = (1 , 1 , 7 )
9- __author__ = 'Alexandr Shurigin (https://github.com/phpdude/)'
# --- Python 2/3 compatibility shims -----------------------------------------
# Resolve url-parsing helpers and an `unescape(text)` callable regardless of
# interpreter version, so the rest of the module can use one spelling.
try:
    # Python 3
    from urllib.parse import urlparse, urljoin
except ImportError:
    # Python 2
    from urlparse import urlparse, urljoin

try:
    # Python 3.4+: module-level function, the modern API.
    from html import unescape
except ImportError:
    try:
        # Python 3.0-3.3: HTMLParser lives in html.parser.
        from html.parser import HTMLParser
    except ImportError:
        # Python 2: legacy top-level module name.
        # Fix: was a bare `except:`, which would also swallow
        # SystemExit/KeyboardInterrupt and mask unrelated errors.
        from HTMLParser import HTMLParser
    parser = HTMLParser()
    # Bound method serves as the `unescape(text)` callable.
    unescape = parser.unescape
22+
1023
1124# HTML tags syntax http://www.w3.org/TR/html-markup/syntax.html
# Sub-patterns for a single HTML attribute, per the HTML markup syntax
# (http://www.w3.org/TR/html-markup/syntax.html): attribute name plus the
# three value quoting styles.
_ATTR_PARTS = {
    'attr': r"([^\s\x00\"'>/=]+)",
    'dqval': r"([^\"\x00]*)",
    'sqval': r"([^'\x00]*)",
    'uqval': r"([^\s\"'=><`]*)",
}

# One alternation branch per attribute form, tried in this order:
# double-quoted value, single-quoted value, unquoted value, bare
# (valueless) attribute. Each branch captures the name and, where
# present, the value.
TAG_ATTRIBUTES_REGEX = (
    r"(?:\s+%(attr)s\s*=\s*\"%(dqval)s\")|"
    r"(?:\s+%(attr)s\s*=\s*'%(sqval)s')|"
    r"(?:\s+%(attr)s\s*=\s*%(uqval)s)|"
    r"(?:\s+%(attr)s)"
) % _ATTR_PARTS
2235
2336
def get_tags(html_content, tag_name):
    """Yield one dict of attributes per ``<tag_name ...>`` occurrence.

    Scans ``html_content`` case-insensitively for opening (or
    self-closing) tags named ``tag_name`` and parses each tag's
    attribute list with TAG_ATTRIBUTES_REGEX. Attribute values are
    HTML-unescaped. This is a generator: tags are produced lazily in
    document order.
    """
    tag_pattern = re.compile(r'<%s(\s+[^>]*)/*>' % tag_name, re.IGNORECASE)
    attr_pattern = re.compile(r'(?:(%s))' % TAG_ATTRIBUTES_REGEX, re.UNICODE)

    # Capture-group layout per attribute match:
    #   (1, 2) name/value of a double-quoted attribute
    #   (3, 4) name/value of a single-quoted attribute
    #   (5, 6) name/value of an unquoted attribute
    #   (7, 7) bare attribute: its own name is reused as the value
    #          (NOTE(review): HTML5 treats bare attributes as empty-string
    #          valued; reusing the name looks deliberate — confirm.)
    _GROUP_PAIRS = ((1, 2), (3, 4), (5, 6), (7, 7))

    for attr_text in tag_pattern.findall(html_content):
        attributes = {}

        for groups in attr_pattern.findall(attr_text):
            # Only one alternation branch matched; take the first
            # non-empty name slot, mirroring an if/elif chain.
            for name_idx, value_idx in _GROUP_PAIRS:
                if groups[name_idx]:
                    attributes[groups[name_idx]] = unescape(groups[value_idx])
                    break

        yield attributes
4052
@@ -75,7 +87,8 @@ def follow_meta_redirects(url, redirects, **kwargs):
7587 urls_history [url ] = True
7688
7789 if redirects < 0 :
78- raise ValueError ("Cannot resolve real url with max_redirects=%s" % max_redirects )
90+ raise ValueError (
91+ "Cannot resolve real url with max_redirects=%s" % max_redirects )
7992
8093 redirects -= 1
8194
@@ -84,21 +97,22 @@ def follow_meta_redirects(url, redirects, **kwargs):
8497 for r in resp .history :
8598 urls_history [r .url ] = True
8699
87- head , real_url = resp .iter_content (chunk_size ). next ( ), resp .url
100+ head , real_url = next ( resp .iter_content (chunk_size )), resp .url
88101
89102 encoding = resp .encoding
90103 if encoding is None :
91104 # detect encoding
92105 encoding = chardet .detect (head )['encoding' ]
93106
94107 try :
95- head = unicode (head , encoding , errors = 'replace' )
108+ head = str (head , encoding , errors = 'replace' )
96109 except (LookupError , TypeError ):
97- head = unicode (head , errors = 'replace' )
110+ head = str (head , errors = 'replace' )
98111
99112 # Removing html blocks in <noscript></noscript>
100113 if remove_noscript :
101- head = re .sub ('<noscript[^>]*>.*</noscript[^>]*>' , '' , head , flags = re .DOTALL )
114+ head = re .sub (
115+ r'<noscript[^>]*>.*</noscript[^>]*>' , '' , head , flags = re .DOTALL )
102116
103117 redirect = None
104118 if 'refresh' in resp .headers :
@@ -109,22 +123,23 @@ def follow_meta_redirects(url, redirects, **kwargs):
109123 redirect = tag .get ('content' , None )
110124
111125 if redirect :
112- m = re .search ('url\s*=\s*([^\s;]+)' , redirect , re .I )
126+ m = re .search (r 'url\s*=\s*([^\s;]+)' , redirect , re .I )
113127 if m :
114128 m = m .group (1 )
115129
116130 # fixing case url='#url here#'
117131 if m .startswith (('"' , "'" )) and m .endswith (('"' , "'" )):
118132 m = m [1 :- 1 ]
119133
120- real_url = follow_meta_redirects (urljoin (resp .url , m ), redirects )
134+ real_url = follow_meta_redirects (
135+ urljoin (resp .url , m ), redirects )
121136
122137 urls_history [real_url ] = True
123138
124139 return real_url
125140
126141 real_url = follow_meta_redirects (start_url , max_redirects , ** kwargs )
127142 if history :
128- return real_url , urls_history .keys ()
143+ return real_url , list ( urls_history .keys () )
129144 else :
130145 return real_url
0 commit comments