Commit ac62a4e

Python 3 fixes

- Add compatibility function for some tests
- Add fallback if no C extensions installed
- Fix comment parsing in C extension
1 parent 2f537b5 commit ac62a4e

8 files changed: 198 additions, 31 deletions

scrapely/_htmlpage.pyx

Lines changed: 8 additions & 8 deletions
@@ -90,7 +90,7 @@ cdef class CommentParser:
             if self.open_count == 0:
                 self.start = i - 3
             self.open_state = 1
-            self.open_count += 1
+            self.open_count = 1
             self.inside_comment = True
 
         if self.close_count < self.open_count:
@@ -141,12 +141,12 @@ cdef class ScriptParser:
             self.state = 1
         if ((self.state == 1 and c == u'<') or
                 (self.state == 2 and c == u'/') or
-                (self.state == 3 and c == u's' or c == u'S') or
-                (self.state == 4 and c == u'c' or c == u'C') or
-                (self.state == 5 and c == u'r' or c == u'R') or
-                (self.state == 6 and c == u'i' or c == u'I') or
-                (self.state == 7 and c == u'p' or c == u'P') or
-                (self.state == 8 and c == u't' or c == u'T') or
+                (self.state == 3 and c in u'sS') or
+                (self.state == 4 and c in u'cC') or
+                (self.state == 5 and c in u'rR') or
+                (self.state == 6 and c in u'iI') or
+                (self.state == 7 and c in u'pP') or
+                (self.state == 8 and c in u'tT') or
                 (self.state == 9 and c == u'>')):
             self.state += 1
         else:
@@ -288,7 +288,7 @@ cpdef parse_html(s):
         if tag_name != u'!doctype':
             parsed.append(
                 HtmlTag(tag_type, tag_name,
-                    tag_attributes, tag_start, tag_end + 1))
+                        tag_attributes, tag_start, tag_end + 1))
         if tag_name == u'script':
             script = True
         if open_tag:
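
The CommentParser change above is the "fix comment parsing" part of the commit: HTML comments do not nest, so a second `<!--` inside an open comment must not require a second `-->` to close it. The following is a hypothetical, simplified Python model of the counting logic (not the Cython source), just to illustrate the two behaviours:

import re

def comment_spans(html, nested_counting=False):
    """Return (start, end) spans of comments under either counting rule."""
    spans, open_count, start = [], 0, None
    for m in re.finditer(r'<!--|-->', html):
        if m.group() == '<!--':
            if open_count == 0:
                start = m.start()
            # old behaviour: open_count += 1; fixed behaviour: open_count = 1
            open_count = open_count + 1 if nested_counting else 1
        elif open_count:
            open_count -= 1
            if open_count == 0:
                spans.append((start, m.end()))
    return spans

html = u'<!-- a <!-- b --> after'
print(comment_spans(html))                        # [(0, 17)]: closes at the first -->
print(comment_spans(html, nested_counting=True))  # []: the comment never closes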

scrapely/compat.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+try:
+    utext = unicode
+except NameError:
+    class utext(str):
+        def __repr__(self):
+            return 'u{}'.format(super(utext, self).__repr__())
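
This shim exists because text reprs differ across Python versions: Python 2 unicode reprs carry a `u` prefix, Python 3 str reprs do not, which breaks doctests written against the `u'...'` form. A quick sketch of the intended effect:

from scrapely.compat import utext

# On Python 2 utext is unicode; on Python 3 it is the str subclass above.
# Either way the repr carries the u prefix the existing doctests expect:
print(repr(utext('a name')))  # u'a name'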

scrapely/extraction/regionextract.py

Lines changed: 6 additions & 5 deletions
@@ -64,25 +64,26 @@ class BasicTypeExtractor(object):
     annotations.
 
     For example:
+    >>> from scrapely.compat import utext
    >>> from scrapely.extraction.pageparsing import parse_strings
     >>> template, page = parse_strings( \
             u'<h1 data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x</h1>', u'<h1> a name</h1>')
     >>> ex = BasicTypeExtractor(template.annotations[0])
-    >>> ex.extract(page, 0, 1, None)
+    >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)]
     [(u'name', u' a name')]
 
     It supports attribute descriptors
     >>> descriptor = FieldDescriptor('name', None, lambda x: x.strip())
     >>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor})
-    >>> ex.extract(page, 0, 1, None)
+    >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)]
     [(u'name', u'a name')]
 
     It supports ignoring regions
     >>> template, page = parse_strings(\
         u'<div data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x<b> xx</b></div>',\
         u'<div>a name<b> id-9</b></div>')
     >>> ex = BasicTypeExtractor(template.annotations[0])
-    >>> ex.extract(page, 0, 3, [PageRegion(1, 2)])
+    >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 3, [PageRegion(1, 2)])]
     [(u'name', u'a name')]
     """

@@ -640,12 +641,12 @@ def extract_text(self, text):
         pref_index = 0
         if self.minprefix > 0:
             rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix)
-            if plen < self.minprefix:
+            if plen is None or plen < self.minprefix:
                 return None
             pref_index = -rev_idx
         if self.minsuffix == 0:
             return text[pref_index:]
         sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix)
-        if slen < self.minsuffix:
+        if slen is None or slen < self.minsuffix:
             return None
         return text[pref_index:pref_index + sidx]
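
The two `is None` guards above are a genuine Python 3 fix, assuming a failed `longest_unique_subsequence` lookup yields `plen = None` (which is what the new guard implies): on Python 2, `None < 1` quietly evaluates to True (None sorts before every number), so a failed match fell through to `return None`; on Python 3 the same comparison raises TypeError. A small illustration:

plen, minprefix = None, 1  # assumed no-match result, per the guard above

try:
    plen < minprefix            # True on Python 2, TypeError on Python 3
except TypeError:
    pass

print(plen is None or plen < minprefix)  # True on both versions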

scrapely/extraction/similarity.py

Lines changed: 12 additions & 4 deletions
@@ -6,9 +6,17 @@
66
from operator import itemgetter
77
from heapq import nlargest
88

9-
# For typical use cases (small sequences and patterns) the naive approach actually
10-
# runs faster than KMP algorithm
11-
from . _similarity import naive_match_length
9+
try:
10+
# For typical use cases (small sequences and patterns) the naive approach
11+
# actually runs faster than KMP algorithm
12+
from . _similarity import naive_match_length
13+
except ImportError:
14+
def naive_match_length(to_search, subsequence, range_start, range_end):
15+
startval = subsequence[0]
16+
return ((i, common_prefix_length(to_search[i:], subsequence))
17+
for i in xrange(range_start, range_end)
18+
if startval == to_search[i])
19+
1220

1321
def common_prefix_length(a, b):
1422
"""Calculate the length of the common prefix in both sequences passed.
@@ -46,7 +54,7 @@ def common_prefix(*sequences):
 
 
 def longest_unique_subsequence(to_search, subsequence, range_start=0,
-                            range_end=None):
+                               range_end=None):
     """Find the longest unique subsequence of items in an array or string. This
     searches to_search looking for the longest overlapping
     match with subsequence. If the largest match is unique (there is no other
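
For reference, here is roughly how the pure-Python fallback behaves: it yields every candidate start position paired with the length of the overlap there, and `longest_unique_subsequence` then picks the best. (Note the committed fallback still calls `xrange`, which does not exist on Python 3, so a Python 3 run of this path would need `range`.) A self-contained sketch under that substitution:

def common_prefix_length(a, b):
    # length of the common prefix of the two sequences
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

def naive_match_length(to_search, subsequence, range_start, range_end):
    startval = subsequence[0]
    return ((i, common_prefix_length(to_search[i:], subsequence))
            for i in range(range_start, range_end)  # the commit uses xrange
            if startval == to_search[i])

print(list(naive_match_length(u'abcabcab', u'abc', 0, 6)))
# [(0, 3), (3, 3)]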

scrapely/extractors.py

Lines changed: 8 additions & 4 deletions
@@ -81,7 +81,8 @@ def text(region):
     removing excessive whitespace,
 
     For example:
-    >>> t = lambda s: text(htmlregion(s))
+    >>> from scrapely.compat import utext
+    >>> t = lambda s: utext(text(htmlregion(s)))
     >>> t(u'<h1>test</h1>')
     u'test'
 
@@ -122,7 +123,8 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
     opening and closing tag is removed.
 
     For example:
-    >>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep)
+    >>> from scrapely.compat import utext
+    >>> t = lambda s, keep=_TAGS_TO_KEEP: utext(safehtml(htmlregion(s), keep))
     >>> t(u'<strong>test <blink>test</blink></strong>')
     u'<strong>test test</strong>'
 
@@ -272,7 +274,8 @@ def extract_number(txt):
     >>> extract_number(' 45.3, 7')
 
     It will handle unescaped entities:
-    >>> extract_number(u'&#163;129&#46;99')
+    >>> from scrapely.compat import utext
+    >>> utext(extract_number(u'&#163;129&#46;99'))
     u'129.99'
     """
     txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)
@@ -285,6 +288,7 @@ def extract_price(txt):
     """
     Extracts numbers making some price format specific assumptions
 
+    >>> from scrapely.compat import utext
     >>> extract_price('asdf 234,234.45sdf ')
     '234234.45'
     >>> extract_price('234,23')
@@ -298,7 +302,7 @@ def extract_price(txt):
     >>> extract_price('adsfg')
     >>> extract_price('stained, linseed oil finish, clear glas doors')
     >>> extract_price('')
-    >>> extract_price(u'&#163;129&#46;99')
+    >>> utext(extract_price(u'&#163;129&#46;99'))
     u'129.99'
     """
     txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)

scrapely/htmlpage.py

Lines changed: 153 additions & 8 deletions
@@ -11,14 +11,158 @@
 from six.moves.urllib.request import urlopen
 from copy import deepcopy
 from w3lib.encoding import html_to_unicode
-
-from . import _htmlpage
-
-
-parse_html = _htmlpage.parse_html
-HtmlDataFragment = _htmlpage.HtmlDataFragment
-HtmlTag = _htmlpage.HtmlTag
-HtmlTagType = _htmlpage.HtmlTagType
+try:
+    from . import _htmlpage
+    parse_html = _htmlpage.parse_html
+    HtmlDataFragment = _htmlpage.HtmlDataFragment
+    HtmlTag = _htmlpage.HtmlTag
+    HtmlTagType = _htmlpage.HtmlTagType
+except ImportError:
+    import re
+    from collections import OrderedDict
+
+    class HtmlTagType(object):
+        OPEN_TAG = 1
+        CLOSE_TAG = 2
+        UNPAIRED_TAG = 3
+
+    class HtmlDataFragment(object):
+        __slots__ = ('start', 'end', 'is_text_content')
+
+        def __init__(self, start, end, is_text_content=False):
+            self.start = start
+            self.end = end
+            self.is_text_content = is_text_content
+
+        def __str__(self):
+            return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (
+                self.start, self.end, self.is_text_content)
+
+        def __repr__(self):
+            return str(self)
+
+    class HtmlTag(HtmlDataFragment):
+        __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text')
+
+        def __init__(self, tag_type, tag, attr_text, start, end):
+            HtmlDataFragment.__init__(self, start, end)
+            self.tag_type = tag_type
+            self.tag = tag
+            if isinstance(attr_text, dict):
+                self._attributes = attr_text
+                self._attr_text = None
+            else:  # defer loading attributes until necessary
+                self._attributes = OrderedDict()
+                self._attr_text = attr_text
+
+        @property
+        def attributes(self):
+            if not self._attributes and self._attr_text:
+                for attr_match in _ATTR_REGEXP.findall(self._attr_text):
+                    name = attr_match[0].lower()
+                    values = [v for v in attr_match[1:] if v]
+                    # According to HTML spec if attribute name is repeated
+                    # only the first one is taken into account
+                    if name not in self._attributes:
+                        self._attributes[name] = values[0] if values else None
+            return self._attributes
+
+        def __str__(self):
+            attributes = ', '.join(
+                sorted(["%s: %s" % (k, repr(v))
+                        for k, v in self.attributes.items()]))
+            return "<HtmlTag tag='%s' attributes={%s} type='%d' [%s:%s]>" % (
+                self.tag, attributes, self.tag_type, self.start, self.end)
+
+        def __repr__(self):
+            return str(self)
+
+    _ATTR = ("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|"
+             "([^>\s]+))?)?")
+    _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
+    _DOCTYPE = r"<!DOCTYPE.*?>"
+    _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
+    _COMMENT = "(<!--.*?-->|<\?.+?>)"
+
+    _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
+    _HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG),
+                              re.I | re.DOTALL)
+    _DOCTYPE_REGEXP = re.compile("(?:%s)" % _DOCTYPE)
+    _COMMENT_REGEXP = re.compile(_COMMENT, re.DOTALL)
+
+    def parse_html(text):
+        """Higher level html parser. Calls lower level parsers and joins
+        sucesive HtmlDataFragment elements in a single one.
+        """
+        # If have doctype remove it.
+        start_pos = 0
+        match = _DOCTYPE_REGEXP.match(text)
+        if match:
+            start_pos = match.end()
+        prev_end = start_pos
+        for match in _HTML_REGEXP.finditer(text, start_pos):
+            start = match.start()
+            end = match.end()
+
+            if start > prev_end:
+                yield HtmlDataFragment(prev_end, start, True)
+
+            if match.groups()[0] is not None:  # comment
+                yield HtmlDataFragment(start, end)
+            elif match.groups()[1] is not None:  # <script>...</script>
+                for e in _parse_script(match):
+                    yield e
+            else:  # tag
+                yield _parse_tag(match)
+            prev_end = end
+        textlen = len(text)
+        if prev_end < textlen:
+            yield HtmlDataFragment(prev_end, textlen, True)
+
+    def _parse_script(match):
+        """parse a <script>...</script> region matched by _HTML_REGEXP"""
+        open_text, content, close_text = match.groups()[1:4]
+
+        open_tag = _parse_tag(_HTML_REGEXP.match(open_text))
+        open_tag.start = match.start()
+        open_tag.end = match.start() + len(open_text)
+
+        close_tag = _parse_tag(_HTML_REGEXP.match(close_text))
+        close_tag.start = match.end() - len(close_text)
+        close_tag.end = match.end()
+
+        yield open_tag
+        if open_tag.end < close_tag.start:
+            start_pos = 0
+            for m in _COMMENT_REGEXP.finditer(content):
+                if m.start() > start_pos:
+                    yield HtmlDataFragment(
+                        open_tag.end + start_pos, open_tag.end + m.start())
+                yield HtmlDataFragment(
+                    open_tag.end + m.start(), open_tag.end + m.end())
+                start_pos = m.end()
+            if open_tag.end + start_pos < close_tag.start:
+                yield HtmlDataFragment(
+                    open_tag.end + start_pos, close_tag.start)
+        yield close_tag
+
+    def _parse_tag(match):
+        """
+        parse a tag matched by _HTML_REGEXP
+        """
+        data = match.groups()
+        closing, tag, attr_text = data[4:7]
+        # if tag is None then the match is a comment
+        if tag is not None:
+            unpaired = data[-1]
+            if closing:
+                tag_type = HtmlTagType.CLOSE_TAG
+            elif unpaired:
+                tag_type = HtmlTagType.UNPAIRED_TAG
+            else:
+                tag_type = HtmlTagType.OPEN_TAG
+            return HtmlTag(tag_type, tag.lower(), attr_text, match.start(),
+                           match.end())
 
 
 def url_to_page(url, encoding=None, default_encoding='utf-8'):
@@ -164,6 +308,7 @@ def __new__(cls, htmlpage, start_index, end_index):
         text_start = htmlpage.parsed_body[start_index].start
         text_end = htmlpage.parsed_body[end_index or -1].end
         text = htmlpage.body[text_start:text_end]
+
         return HtmlPageRegion.__new__(cls, htmlpage, text)
 
     def __init__(self, htmlpage, start_index, end_index):
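
With the C extension unavailable, the regex-based fallback above serves the same public API. A quick exercise of that path (output shown as a hedged assumption, traced from the fallback's code):

from scrapely.htmlpage import parse_html, HtmlTag

for fragment in parse_html(u'<p class="x">hi</p><br/>'):
    if isinstance(fragment, HtmlTag):
        print("tag=%s type=%d attrs=%r" % (
            fragment.tag, fragment.tag_type, dict(fragment.attributes)))
    else:
        print("text [%d:%d]" % (fragment.start, fragment.end))
# tag=p type=1 attrs={'class': 'x'}
# text [13:15]
# tag=p type=2 attrs={}
# tag=br type=3 attrs={}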

setup.py

Lines changed: 4 additions & 1 deletion
@@ -22,7 +22,7 @@
 
 setup(
     name='scrapely',
-    version='0.12.0',
+    version='0.13.0b1',
     license='BSD',
     description='A pure-python HTML screen-scraping library',
     author='Scrapy project',
@@ -45,5 +45,8 @@
         'Topic :: Text Processing :: Markup :: HTML',
     ],
     install_requires=['numpy', 'w3lib', 'six'],
+    extras_requires={
+        'speedup': ['cython']
+    },
     ext_modules=extensions,
 )
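
One hedged caveat on the new extra: setuptools' recognized keyword is `extras_require` (no trailing "s"), so the `extras_requires` key as committed is silently ignored and `pip install scrapely[speedup]` would not pull in cython. The presumably intended declaration would look like:

from setuptools import setup

setup(
    name='scrapely',
    version='0.13.0b1',
    install_requires=['numpy', 'w3lib', 'six'],
    extras_require={  # note: no trailing "s"
        'speedup': ['cython'],
    },
)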

tox.ini

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27,py33,py34
+envlist = py27,py34
 usedevelop = True
 
 [testenv]
