Commit a1eb99e

Merge pull request #98 from scrapy/python3-support
Python3 support
2 parents: 4769783 + cf0ee1b

10 files changed, +257 -58 lines

.travis.yml

Lines changed: 3 additions & 3 deletions
@@ -3,13 +3,13 @@ python: 2.7
 
 env:
   - TOXENV=py27
-  - TOXENV=py33
   - TOXENV=py34
+  - TOXENV=pypy
 
 install:
   - pip install cython
-  - pip install -U tox codecov
-
+  - CYTHONIZE=1 python setup.py build
+  - pip install -U tox
 script: tox
 
 after_success:

MANIFEST.in

Lines changed: 3 additions & 1 deletion
@@ -1,2 +1,4 @@
 include scrapely/*.pyx
-include scrapely/extraction/*.pyx
+include scrapely/extraction/*.pyx
+include scrapely/*.c
+include scrapely/extraction/*.c

requirements.txt

Lines changed: 1 addition & 2 deletions
@@ -1,4 +1,3 @@
 numpy
 w3lib
-six
-cython
+six

scrapely/_htmlpage.pyx

Lines changed: 16 additions & 9 deletions
@@ -84,13 +84,18 @@ cdef class CommentParser:
             (self.open_state == 4 and c == u'-')):
             self.open_state += 1
         else:
+            # Handle <!> comment
+            if self.open_state == 3 and c == u'>':
+                self.inside_comment = False
+                self.reset()
+                self.start, self.end = i - 2, i
+                return True
             self.open_state = 1
-
         if self.open_state == 5:
             if self.open_count == 0:
                 self.start = i - 3
             self.open_state = 1
-            self.open_count += 1
+            self.open_count = 1
             self.inside_comment = True
 
         if self.close_count < self.open_count:
@@ -141,12 +146,12 @@ cdef class ScriptParser:
             self.state = 1
         if ((self.state == 1 and c == u'<') or
             (self.state == 2 and c == u'/') or
-            (self.state == 3 and c == u's' or c == u'S') or
-            (self.state == 4 and c == u'c' or c == u'C') or
-            (self.state == 5 and c == u'r' or c == u'R') or
-            (self.state == 6 and c == u'i' or c == u'I') or
-            (self.state == 7 and c == u'p' or c == u'P') or
-            (self.state == 8 and c == u't' or c == u'T') or
+            (self.state == 3 and c in u'sS') or
+            (self.state == 4 and c in u'cC') or
+            (self.state == 5 and c in u'rR') or
+            (self.state == 6 and c in u'iI') or
+            (self.state == 7 and c in u'pP') or
+            (self.state == 8 and c in u'tT') or
             (self.state == 9 and c == u'>')):
             self.state += 1
         else:
@@ -233,6 +238,8 @@ cpdef parse_html(s):
             parsed.append(
                 HtmlDataFragment(comment_parser.start, tag_end + 1, False))
             reset_tag = True
+            if (comment_parser.end - comment_parser.start) == 2:
+                open_tag = False
 
         if comment_parser.inside_comment:
             open_tag = False
@@ -288,7 +295,7 @@ cpdef parse_html(s):
             if tag_name != u'!doctype':
                 parsed.append(
                     HtmlTag(tag_type, tag_name,
-                        tag_attributes, tag_start, tag_end + 1))
+                            tag_attributes, tag_start, tag_end + 1))
             if tag_name == u'script':
                 script = True
         if open_tag:
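
A side note on the ScriptParser hunk above: in the old per-letter checks, `and` binds tighter than `or`, so a clause like `(self.state == 3 and c == u's' or c == u'S')` was true for an uppercase letter in any state; `c in u'sS'` checks the state first. A minimal sketch of the difference, with illustrative values that are not from the commit:

    state, c = 7, u'S'
    old = (state == 3 and c == u's' or c == u'S')  # True: the `or` arm ignores state
    new = (state == 3 and c in u'sS')              # False: state is checked first
    assert old and not new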

scrapely/extraction/regionextract.py

Lines changed: 2 additions & 2 deletions
@@ -640,12 +640,12 @@ def extract_text(self, text):
         pref_index = 0
         if self.minprefix > 0:
             rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix)
-            if plen < self.minprefix:
+            if plen is None or plen < self.minprefix:
                 return None
             pref_index = -rev_idx
         if self.minsuffix == 0:
             return text[pref_index:]
         sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix)
-        if slen < self.minsuffix:
+        if slen is None or slen < self.minsuffix:
             return None
         return text[pref_index:pref_index + sidx]
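
The added `is None` guards matter for Python 3: as the code implies, `longest_unique_subsequence` can return `None` for the match length when nothing is found, and while Python 2 permitted ordering comparisons against `None`, Python 3 raises TypeError. A minimal sketch of the portable check, assuming a `None` no-match value:

    def long_enough(plen, minprefix=2):
        # short-circuit before the ordering comparison ever runs
        return not (plen is None or plen < minprefix)

    assert long_enough(None) is False  # `None < 2` would raise TypeError on Python 3
    assert long_enough(1) is False
    assert long_enough(5) is True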

scrapely/extraction/similarity.py

Lines changed: 12 additions & 4 deletions
@@ -6,9 +6,17 @@
 from operator import itemgetter
 from heapq import nlargest
 
-# For typical use cases (small sequences and patterns) the naive approach actually
-# runs faster than KMP algorithm
-from . _similarity import naive_match_length
+try:
+    # For typical use cases (small sequences and patterns) the naive approach
+    # actually runs faster than KMP algorithm
+    from . _similarity import naive_match_length
+except ImportError:
+    def naive_match_length(to_search, subsequence, range_start, range_end):
+        startval = subsequence[0]
+        return ((i, common_prefix_length(to_search[i:], subsequence))
+                for i in xrange(range_start, range_end)
+                if startval == to_search[i])
+
 
 def common_prefix_length(a, b):
     """Calculate the length of the common prefix in both sequences passed.
@@ -46,7 +54,7 @@ def common_prefix(*sequences):
 
 
 def longest_unique_subsequence(to_search, subsequence, range_start=0,
-        range_end=None):
+                               range_end=None):
     """Find the longest unique subsequence of items in an array or string. This
     searches to_search looking for the longest overlapping
     match with subsequence. If the largest match is unique (there is no other
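
The pure-Python fallback yields a lazy sequence of `(position, match_length)` pairs, one for every index in the searched range where the subsequence's first item matches; note that as committed it iterates with `xrange`, which would raise NameError when consumed on Python 3. A concrete sketch of what it computes, using `range` and a local stand-in for `common_prefix_length`:

    def common_prefix_length(a, b):
        # length of the shared prefix of two sequences (mirrors the helper above)
        n = 0
        for x, y in zip(a, b):
            if x != y:
                break
            n += 1
        return n

    matches = [(i, common_prefix_length("abcabcab"[i:], "abc"))
               for i in range(0, 6) if "abcabcab"[i] == "a"]
    assert matches == [(0, 3), (3, 3)]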

scrapely/htmlpage.py

Lines changed: 153 additions & 8 deletions
@@ -11,14 +11,158 @@
 from six.moves.urllib.request import urlopen
 from copy import deepcopy
 from w3lib.encoding import html_to_unicode
-
-from . import _htmlpage
-
-
-parse_html = _htmlpage.parse_html
-HtmlDataFragment = _htmlpage.HtmlDataFragment
-HtmlTag = _htmlpage.HtmlTag
-HtmlTagType = _htmlpage.HtmlTagType
+try:
+    from . import _htmlpage
+    parse_html = _htmlpage.parse_html
+    HtmlDataFragment = _htmlpage.HtmlDataFragment
+    HtmlTag = _htmlpage.HtmlTag
+    HtmlTagType = _htmlpage.HtmlTagType
+except ImportError:
+    import re
+    from collections import OrderedDict
+
+    class HtmlTagType(object):
+        OPEN_TAG = 1
+        CLOSE_TAG = 2
+        UNPAIRED_TAG = 3
+
+    class HtmlDataFragment(object):
+        __slots__ = ('start', 'end', 'is_text_content')
+
+        def __init__(self, start, end, is_text_content=False):
+            self.start = start
+            self.end = end
+            self.is_text_content = is_text_content
+
+        def __str__(self):
+            return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (
+                self.start, self.end, self.is_text_content)
+
+        def __repr__(self):
+            return str(self)
+
+    class HtmlTag(HtmlDataFragment):
+        __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text')
+
+        def __init__(self, tag_type, tag, attr_text, start, end):
+            HtmlDataFragment.__init__(self, start, end)
+            self.tag_type = tag_type
+            self.tag = tag
+            if isinstance(attr_text, dict):
+                self._attributes = attr_text
+                self._attr_text = None
+            else: # defer loading attributes until necessary
+                self._attributes = OrderedDict()
+                self._attr_text = attr_text
+
+        @property
+        def attributes(self):
+            if not self._attributes and self._attr_text:
+                for attr_match in _ATTR_REGEXP.findall(self._attr_text):
+                    name = attr_match[0].lower()
+                    values = [v for v in attr_match[1:] if v]
+                    # According to HTML spec if attribute name is repeated only
+                    # the first one is taken into account
+                    if name not in self._attributes:
+                        self._attributes[name] = values[0] if values else None
+            return self._attributes
+
+        def __str__(self):
+            attributes = ', '.join(
+                sorted(["%s: %s" % (k, repr(v))
+                        for k, v in self.attributes.items()]))
+            return "<HtmlTag tag='%s' attributes={%s} type='%d' [%s:%s]>" % (
+                self.tag, attributes, self.tag_type, self.start, self.end)
+
+        def __repr__(self):
+            return str(self)
+
+    _ATTR = ("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|"
+             "([^>\s]+))?)?")
+    _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
+    _DOCTYPE = r"<!DOCTYPE.*?>"
+    _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
+    _COMMENT = "(<!--.*?--!?>|<\?.+?>|<!>)"
+
+    _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
+    _HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG),
+                              re.I | re.DOTALL)
+    _DOCTYPE_REGEXP = re.compile("(?:%s)" % _DOCTYPE)
+    _COMMENT_REGEXP = re.compile(_COMMENT, re.DOTALL)
+
+    def parse_html(text):
+        """Higher level html parser. Calls lower level parsers and joins sucesive
+        HtmlDataFragment elements in a single one.
+        """
+        # If have doctype remove it.
+        start_pos = 0
+        match = _DOCTYPE_REGEXP.match(text)
+        if match:
+            start_pos = match.end()
+        prev_end = start_pos
+        for match in _HTML_REGEXP.finditer(text, start_pos):
+            start = match.start()
+            end = match.end()
+
+            if start > prev_end:
+                yield HtmlDataFragment(prev_end, start, True)
+
+            if match.groups()[0] is not None: # comment
+                yield HtmlDataFragment(start, end)
+            elif match.groups()[1] is not None: # <script>...</script>
+                for e in _parse_script(match):
+                    yield e
+            else: # tag
+                yield _parse_tag(match)
+            prev_end = end
+        textlen = len(text)
+        if prev_end < textlen:
+            yield HtmlDataFragment(prev_end, textlen, True)
+
+    def _parse_script(match):
+        """parse a <script>...</script> region matched by _HTML_REGEXP"""
+        open_text, content, close_text = match.groups()[1:4]
+
+        open_tag = _parse_tag(_HTML_REGEXP.match(open_text))
+        open_tag.start = match.start()
+        open_tag.end = match.start() + len(open_text)
+
+        close_tag = _parse_tag(_HTML_REGEXP.match(close_text))
+        close_tag.start = match.end() - len(close_text)
+        close_tag.end = match.end()
+
+        yield open_tag
+        if open_tag.end < close_tag.start:
+            start_pos = 0
+            for m in _COMMENT_REGEXP.finditer(content):
+                if m.start() > start_pos:
+                    yield HtmlDataFragment(
+                        open_tag.end + start_pos, open_tag.end + m.start())
+                yield HtmlDataFragment(
+                    open_tag.end + m.start(), open_tag.end + m.end())
+                start_pos = m.end()
+            if open_tag.end + start_pos < close_tag.start:
+                yield HtmlDataFragment(
+                    open_tag.end + start_pos, close_tag.start)
+        yield close_tag
+
+    def _parse_tag(match):
+        """
+        parse a tag matched by _HTML_REGEXP
+        """
+        data = match.groups()
+        closing, tag, attr_text = data[4:7]
+        # if tag is None then the match is a comment
+        if tag is not None:
+            unpaired = data[-1]
+            if closing:
+                tag_type = HtmlTagType.CLOSE_TAG
+            elif unpaired:
+                tag_type = HtmlTagType.UNPAIRED_TAG
+            else:
+                tag_type = HtmlTagType.OPEN_TAG
+            return HtmlTag(tag_type, tag.lower(), attr_text, match.start(),
+                           match.end())
 
 
 def url_to_page(url, encoding=None, default_encoding='utf-8'):
@@ -164,6 +308,7 @@ def __new__(cls, htmlpage, start_index, end_index):
         text_start = htmlpage.parsed_body[start_index].start
         text_end = htmlpage.parsed_body[end_index or -1].end
         text = htmlpage.body[text_start:text_end]
+
         return HtmlPageRegion.__new__(cls, htmlpage, text)
 
     def __init__(self, htmlpage, start_index, end_index):
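
To get a feel for the pure-Python fallback parser, here is a hypothetical run over a tiny document; the expected output below follows the `__str__` formats defined above, with `[start:end]` as character offsets into the input:

    html = u"<p>text</p><!-- note -->"
    for fragment in parse_html(html):
        print(fragment)
    # <HtmlTag tag='p' attributes={} type='1' [0:3]>
    # <HtmlDataFragment [3:7] is_text_content: True>
    # <HtmlTag tag='p' attributes={} type='2' [7:11]>
    # <HtmlDataFragment [11:24] is_text_content: False>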

setup.py

Lines changed: 19 additions & 6 deletions
@@ -1,22 +1,32 @@
 #!/usr/bin/env python
+import os
+import platform
 from setuptools import setup, find_packages
 from setuptools.extension import Extension
-from Cython.Build import cythonize
 import numpy as np
 
+
+USE_CYTHON = 'CYTHONIZE' in os.environ
+IS_PYPY = platform.python_implementation() == 'PyPy'
+ext = '.pyx' if USE_CYTHON else '.c'
 extensions = [
     Extension("scrapely._htmlpage",
-              ["scrapely/_htmlpage.pyx"],
+              ["scrapely/_htmlpage%s" % ext],
               include_dirs=[np.get_include()]),
     Extension("scrapely.extraction._similarity",
-              ["scrapely/extraction/_similarity.pyx"],
+              ["scrapely/extraction/_similarity%s" % ext],
               include_dirs=[np.get_include()]),
 ]
+if USE_CYTHON and not IS_PYPY:
+    from Cython.Build import cythonize
+    extensions = cythonize(extensions)
+if IS_PYPY:
+    extensions = []
 
 
 setup(
     name='scrapely',
-    version='0.12.0',
+    version='0.13.0b1',
     license='BSD',
     description='A pure-python HTML screen-scraping library',
     author='Scrapy project',
@@ -38,6 +48,9 @@
         'Topic :: Internet :: WWW/HTTP',
         'Topic :: Text Processing :: Markup :: HTML',
     ],
-    install_requires=['numpy', 'w3lib', 'six', 'cython'],
-    ext_modules=cythonize(extensions),
+    install_requires=['numpy', 'w3lib', 'six'],
+    extras_require={
+        'speedup': ['cython']
+    },
+    ext_modules=extensions,
 )
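
The net effect is that Cython becomes optional: source distributions ship the pregenerated .c files (now included via MANIFEST.in), PyPy skips the C extensions and uses the pure-Python fallbacks, and re-cythonizing is opt-in. A sketch of the resulting switch; the install invocations in the comments are assumptions, not taken from the commit:

    # pip install scrapely               -> compiles the shipped .c files, no Cython needed
    # pip install scrapely[speedup]      -> additionally pulls in cython via extras_require
    # CYTHONIZE=1 python setup.py build  -> regenerates C from the .pyx sources
    import os
    import platform

    USE_CYTHON = 'CYTHONIZE' in os.environ
    IS_PYPY = platform.python_implementation() == 'PyPy'
    print('compiling from:', '.pyx' if USE_CYTHON else '.c')
    print('building C extensions:', not IS_PYPY)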
