Skip to content

Commit cf0ee1b

Browse files
committed
Handle parsing of <!>. Use pypy as a test environment.
Test the pure-Python parsing implementation. Fall back to the pure-Python parser if Cython is not available.
1 parent ac62a4e commit cf0ee1b

File tree

9 files changed

+71
-46
lines changed

9 files changed

+71
-46
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ python: 2.7
33

44
env:
55
- TOXENV=py27
6-
- TOXENV=py33
76
- TOXENV=py34
7+
- TOXENV=pypy
88

99
install:
1010
- pip install cython

scrapely/_htmlpage.pyx

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,13 @@ cdef class CommentParser:
8484
(self.open_state == 4 and c == u'-')):
8585
self.open_state += 1
8686
else:
87+
# Handle <!> comment
88+
if self.open_state == 3 and c == u'>':
89+
self.inside_comment = False
90+
self.reset()
91+
self.start, self.end = i - 2, i
92+
return True
8793
self.open_state = 1
88-
8994
if self.open_state == 5:
9095
if self.open_count == 0:
9196
self.start = i - 3
@@ -233,6 +238,8 @@ cpdef parse_html(s):
233238
parsed.append(
234239
HtmlDataFragment(comment_parser.start, tag_end + 1, False))
235240
reset_tag = True
241+
if (comment_parser.end - comment_parser.start) == 2:
242+
open_tag = False
236243

237244
if comment_parser.inside_comment:
238245
open_tag = False

scrapely/compat.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

scrapely/extraction/regionextract.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,26 +64,25 @@ class BasicTypeExtractor(object):
6464
annotations.
6565
6666
For example:
67-
>>> from scrapely.compat import utext
6867
>>> from scrapely.extraction.pageparsing import parse_strings
6968
>>> template, page = parse_strings( \
7069
u'<h1 data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x</h1>', u'<h1> a name</h1>')
7170
>>> ex = BasicTypeExtractor(template.annotations[0])
72-
>>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)]
71+
>>> ex.extract(page, 0, 1, None)
7372
[(u'name', u' a name')]
7473
7574
It supports attribute descriptors
7675
>>> descriptor = FieldDescriptor('name', None, lambda x: x.strip())
7776
>>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor})
78-
>>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)]
77+
>>> ex.extract(page, 0, 1, None)
7978
[(u'name', u'a name')]
8079
8180
It supports ignoring regions
8281
>>> template, page = parse_strings(\
8382
u'<div data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x<b> xx</b></div>',\
8483
u'<div>a name<b> id-9</b></div>')
8584
>>> ex = BasicTypeExtractor(template.annotations[0])
86-
>>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 3, [PageRegion(1, 2)])]
85+
>>> ex.extract(page, 0, 3, [PageRegion(1, 2)])
8786
[(u'name', u'a name')]
8887
"""
8988

scrapely/extractors.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,7 @@ def text(region):
8181
removing excessive whitespace,
8282
8383
For example:
84-
>>> from scrapely.compat import utext
85-
>>> t = lambda s: utext(text(htmlregion(s)))
84+
>>> t = lambda s: text(htmlregion(s))
8685
>>> t(u'<h1>test</h1>')
8786
u'test'
8887
@@ -123,8 +122,7 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
123122
opening and closing tag is removed.
124123
125124
For example:
126-
>>> from scrapely.compat import utext
127-
>>> t = lambda s, keep=_TAGS_TO_KEEP: utext(safehtml(htmlregion(s), keep))
125+
>>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep)
128126
>>> t(u'<strong>test <blink>test</blink></strong>')
129127
u'<strong>test test</strong>'
130128
@@ -274,8 +272,7 @@ def extract_number(txt):
274272
>>> extract_number(' 45.3, 7')
275273
276274
It will handle unescaped entities:
277-
>>> from scrapely.compat import utext
278-
>>> utext(extract_number(u'&#163;129&#46;99'))
275+
>>> extract_number(u'&#163;129&#46;99')
279276
u'129.99'
280277
"""
281278
txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)
@@ -288,7 +285,6 @@ def extract_price(txt):
288285
"""
289286
Extracts numbers making some price format specific assumptions
290287
291-
>>> from scrapely.compat import utext
292288
>>> extract_price('asdf 234,234.45sdf ')
293289
'234234.45'
294290
>>> extract_price('234,23')
@@ -302,7 +298,7 @@ def extract_price(txt):
302298
>>> extract_price('adsfg')
303299
>>> extract_price('stained, linseed oil finish, clear glas doors')
304300
>>> extract_price('')
305-
>>> utext(extract_price(u'&#163;129&#46;99'))
301+
>>> extract_price(u'&#163;129&#46;99')
306302
u'129.99'
307303
"""
308304
txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)

scrapely/htmlpage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def __repr__(self):
8282
_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
8383
_DOCTYPE = r"<!DOCTYPE.*?>"
8484
_SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
85-
_COMMENT = "(<!--.*?-->|<\?.+?>)"
85+
_COMMENT = "(<!--.*?--!?>|<\?.+?>|<!>)"
8686

8787
_ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
8888
_HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG),

setup.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
#!/usr/bin/env python
22
import os
3+
import platform
34
from setuptools import setup, find_packages
45
from setuptools.extension import Extension
56
import numpy as np
67

78

89
USE_CYTHON = 'CYTHONIZE' in os.environ
10+
IS_PYPY = platform.python_implementation() == 'PyPy'
911
ext = '.pyx' if USE_CYTHON else '.c'
1012
extensions = [
1113
Extension("scrapely._htmlpage",
@@ -15,9 +17,11 @@
1517
["scrapely/extraction/_similarity%s" % ext],
1618
include_dirs=[np.get_include()]),
1719
]
18-
if USE_CYTHON:
20+
if USE_CYTHON and not IS_PYPY:
1921
from Cython.Build import cythonize
2022
extensions = cythonize(extensions)
23+
if IS_PYPY:
24+
extensions = []
2125

2226

2327
setup(
@@ -45,7 +49,7 @@
4549
'Topic :: Text Processing :: Markup :: HTML',
4650
],
4751
install_requires=['numpy', 'w3lib', 'six'],
48-
extras_requires={
52+
extras_require={
4953
'speedup': ['cython']
5054
},
5155
ext_modules=extensions,

tests/test_htmlpage_data.py

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
PAGE = u"""
2-
<style id="scrapy-style" type="text/css">@import url(http://localhost:8000/as/site_media/clean.css);
3-
</style>
2+
<style id="scrapy-style" type="text/css">@import url(http://localhost:8000/as/site_media/clean.css);
3+
</style>
44
<body>
55
<div class="scrapy-selected" id="header">
66
<img src="company_logo.jpg" style="margin-left: 68px; padding-top:5px;" alt="Logo" width="530" height="105">
@@ -152,27 +152,52 @@
152152
{'end': 150, 'start': 149},
153153
]
154154

155-
# for testing tags inside comments
156-
PAGE3 = u"""<html><body><h1>Helloooo!!</h1><p>Did i say hello??</p><!--<p>
157-
</p>--><script type="text/javascript">bla<!--comment-->blabla</script></body></html>"""
155+
# for testing tags in different forms
156+
PAGE3 = u"""<!DOCTYPE html>
157+
<html>
158+
<head>
159+
<!-- Standard comment style -->
160+
<title>Page name</title>
161+
<meta name="name" content="value"><!> <!-- <- Self Closing Comment --!>
162+
</head>
163+
164+
<!-- Comment used for ignoring a script
165+
<script type="text/javascript">
166+
var a = 1;
167+
</script>
168+
-->
169+
<body>
170+
</body>
171+
</html>
172+
"""
158173

159174
PARSED3 = [
160-
{'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1},
161-
{'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1},
162-
{'attributes': {}, 'end': 16, 'start': 12, 'tag': u'h1', 'tag_type': 1},
163-
{'end': 26, 'start': 16},
164-
{'attributes': {}, 'end': 31, 'start': 26, 'tag': u'h1', 'tag_type': 2},
165-
{'attributes': {}, 'end': 34, 'start': 31, 'tag': u'p', 'tag_type': 1},
166-
{'end': 51, 'start': 34},
167-
{'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2},
168-
{'end': 70, 'start': 55, 'is_text_content': False},
169-
{'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1},
170-
{'end': 104, 'start': 101, 'is_text_content': False},
171-
{'end': 118, 'start': 104, 'is_text_content': False},
172-
{'end': 124, 'start': 118, 'is_text_content': False},
173-
{'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2},
174-
{'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2},
175-
{'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2}
175+
{'end': 16, 'start': 15, 'is_text_content': True},
176+
{'end': 22, 'start': 16, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'html'},
177+
{'end': 27, 'start': 22, 'is_text_content': True},
178+
{'end': 33, 'start': 27, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'head'},
179+
{'end': 38, 'start': 33, 'is_text_content': True},
180+
{'end': 69, 'start': 38, 'is_text_content': False},
181+
{'end': 74, 'start': 69, 'is_text_content': True},
182+
{'end': 81, 'start': 74, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'title'},
183+
{'end': 90, 'start': 81, 'is_text_content': True},
184+
{'end': 98, 'start': 90, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'title'},
185+
{'end': 103, 'start': 98, 'is_text_content': True},
186+
{'end': 137, 'start': 103, 'attributes': {'content': 'value', 'name': 'name'}, 'tag_type': 1, 'is_text_content': False, 'tag': 'meta'},
187+
{'end': 140, 'start': 137, 'is_text_content': False},
188+
{'end': 141, 'start': 140, 'is_text_content': True},
189+
{'end': 174, 'start': 141, 'is_text_content': False},
190+
{'end': 179, 'start': 174, 'is_text_content': True},
191+
{'end': 186, 'start': 179, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'head'},
192+
{'end': 192, 'start': 186, 'is_text_content': True},
193+
{'end': 320, 'start': 192, 'is_text_content': False},
194+
{'end': 325, 'start': 320, 'is_text_content': True},
195+
{'end': 331, 'start': 325, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'body'},
196+
{'end': 336, 'start': 331, 'is_text_content': True},
197+
{'end': 343, 'start': 336, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'body'},
198+
{'end': 344, 'start': 343, 'is_text_content': True},
199+
{'end': 351, 'start': 344, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'html'},
200+
{'end': 352, 'start': 351, 'is_text_content': True}
176201
]
177202

178203
# for testing tags inside scripts
@@ -293,4 +318,3 @@
293318
{"attributes": {}, "end": 91, "start": 84, "tag": "body", "tag_type": 2},
294319
{"attributes": {}, "end": 98, "start": 91, "tag": "html", "tag_type": 2}
295320
]
296-

tox.ini

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# and then run "tox" from this directory.
55

66
[tox]
7-
envlist = py27,py34
7+
envlist = py27,py34,pypy,pypy3
88
usedevelop = True
99

1010
[testenv]
@@ -14,6 +14,7 @@ deps =
1414
nose-parameterized
1515
doctest-ignore-unicode
1616
coverage
17+
cython
1718
commands =
1819
pip install -e .
1920
nosetests \

0 commit comments

Comments
 (0)