11# coding=utf-8
2- from HTMLParser import HTMLParser
2+
33from collections import OrderedDict
44from contextlib import closing
5+ from builtins import next
6+ from builtins import str
57import re
6- from urlparse import urljoin
78
8- __version__ = (1 , 1 , 7 )
9- __author__ = 'Alexandr Shurigin (https://github.com/phpdude/)'
# --- Python 2/3 compatibility shims -----------------------------------------
# Resolve url-parsing helpers and an `unescape(text)` callable regardless of
# interpreter version, so the rest of the module can use one spelling.
try:
    # Python 3
    from urllib.parse import urlparse, urljoin
except ImportError:
    # Python 2
    from urlparse import urlparse, urljoin

try:
    # Python 3.4+: module-level function, the modern API.
    from html import unescape
except ImportError:
    try:
        # Python 3.0-3.3: HTMLParser lives in html.parser.
        from html.parser import HTMLParser
    except ImportError:
        # Python 2: legacy top-level module name.
        # Fix: was a bare `except:`, which would also swallow
        # SystemExit/KeyboardInterrupt and mask unrelated errors.
        from HTMLParser import HTMLParser
    parser = HTMLParser()
    # Bound method serves as the `unescape(text)` callable.
    unescape = parser.unescape
22+
1023
1124# HTML tags syntax http://www.w3.org/TR/html-markup/syntax.html
# Sub-patterns for a single HTML attribute, per the HTML markup syntax
# (http://www.w3.org/TR/html-markup/syntax.html): attribute name plus the
# three value quoting styles.
_ATTR_PARTS = {
    'attr': r"([^\s\x00\"'>/=]+)",
    'dqval': r"([^\"\x00]*)",
    'sqval': r"([^'\x00]*)",
    'uqval': r"([^\s\"'=><`]*)",
}

# One alternation branch per attribute form, tried in this order:
# double-quoted value, single-quoted value, unquoted value, bare
# (valueless) attribute. Each branch captures the name and, where
# present, the value.
TAG_ATTRIBUTES_REGEX = (
    r"(?:\s+%(attr)s\s*=\s*\"%(dqval)s\")|"
    r"(?:\s+%(attr)s\s*=\s*'%(sqval)s')|"
    r"(?:\s+%(attr)s\s*=\s*%(uqval)s)|"
    r"(?:\s+%(attr)s)"
) % _ATTR_PARTS
2235
2336
def get_tags(html_content, tag_name):
    """Yield one dict of attributes per ``<tag_name ...>`` occurrence.

    Scans ``html_content`` case-insensitively for opening (or
    self-closing) tags named ``tag_name`` and parses each tag's
    attribute list with TAG_ATTRIBUTES_REGEX. Attribute values are
    HTML-unescaped. This is a generator: tags are produced lazily in
    document order.
    """
    tag_pattern = re.compile(r'<%s(\s+[^>]*)/*>' % tag_name, re.IGNORECASE)
    attr_pattern = re.compile(r'(?:(%s))' % TAG_ATTRIBUTES_REGEX, re.UNICODE)

    # Capture-group layout per attribute match:
    #   (1, 2) name/value of a double-quoted attribute
    #   (3, 4) name/value of a single-quoted attribute
    #   (5, 6) name/value of an unquoted attribute
    #   (7, 7) bare attribute: its own name is reused as the value
    #          (NOTE(review): HTML5 treats bare attributes as empty-string
    #          valued; reusing the name looks deliberate — confirm.)
    _GROUP_PAIRS = ((1, 2), (3, 4), (5, 6), (7, 7))

    for attr_text in tag_pattern.findall(html_content):
        attributes = {}

        for groups in attr_pattern.findall(attr_text):
            # Only one alternation branch matched; take the first
            # non-empty name slot, mirroring an if/elif chain.
            for name_idx, value_idx in _GROUP_PAIRS:
                if groups[name_idx]:
                    attributes[groups[name_idx]] = unescape(groups[value_idx])
                    break

        yield attributes
4052
@@ -75,7 +87,8 @@ def follow_meta_redirects(url, redirects, **kwargs):
7587 urls_history [url ] = True
7688
7789 if redirects < 0 :
78- raise ValueError ("Cannot resolve real url with max_redirects=%s" % max_redirects )
90+ raise ValueError (
91+ "Cannot resolve real url with max_redirects=%s" % max_redirects )
7992
8093 redirects -= 1
8194
@@ -84,21 +97,22 @@ def follow_meta_redirects(url, redirects, **kwargs):
8497 for r in resp .history :
8598 urls_history [r .url ] = True
8699
87- head , real_url = resp .iter_content (chunk_size ). next ( ), resp .url
100+ head , real_url = next ( resp .iter_content (chunk_size )), resp .url
88101
89102 encoding = resp .encoding
90103 if encoding is None :
91104 # detect encoding
92105 encoding = chardet .detect (head )['encoding' ]
93106
94107 try :
95- head = unicode (head , encoding , errors = 'replace' )
108+ head = str (head , encoding , errors = 'replace' )
96109 except (LookupError , TypeError ):
97- head = unicode (head , errors = 'replace' )
110+ head = str (head , errors = 'replace' )
98111
99112 # Removing html blocks in <noscript></noscript>
100113 if remove_noscript :
101- head = re .sub ('<noscript[^>]*>.*</noscript[^>]*>' , '' , head , flags = re .DOTALL )
114+ head = re .sub (
115+ r'<noscript[^>]*>.*</noscript[^>]*>' , '' , head , flags = re .DOTALL )
102116
103117 redirect = None
104118 if 'refresh' in resp .headers :
@@ -109,22 +123,23 @@ def follow_meta_redirects(url, redirects, **kwargs):
109123 redirect = tag .get ('content' , None )
110124
111125 if redirect :
112- m = re .search ('url\s*=\s*([^\s;]+)' , redirect , re .I )
126+ m = re .search (r 'url\s*=\s*([^\s;]+)' , redirect , re .I )
113127 if m :
114128 m = m .group (1 )
115129
116130 # fixing case url='#url here#'
117131 if m .startswith (('"' , "'" )) and m .endswith (('"' , "'" )):
118132 m = m [1 :- 1 ]
119133
120- real_url = follow_meta_redirects (urljoin (resp .url , m ), redirects )
134+ real_url = follow_meta_redirects (
135+ urljoin (resp .url , m ), redirects )
121136
122137 urls_history [real_url ] = True
123138
124139 return real_url
125140
126141 real_url = follow_meta_redirects (start_url , max_redirects , ** kwargs )
127142 if history :
128- return real_url , urls_history .keys ()
143+ return real_url , list ( urls_history .keys () )
129144 else :
130145 return real_url
0 commit comments