Skip to content

Commit cf0ee1b

Browse files
committed
Handle parsing of <!>. Use pypy as a test environment.
Test the pure-Python parsing implementation. Fall back to the pure-Python parser if Cython is not available.
1 parent ac62a4e commit cf0ee1b

File tree

9 files changed

+71
-46
lines changed

9 files changed

+71
-46
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ python: 2.7
33

44
env:
55
- TOXENV=py27
6-
- TOXENV=py33
76
- TOXENV=py34
7+
- TOXENV=pypy
88

99
install:
1010
- pip install cython

scrapely/_htmlpage.pyx

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,13 @@ cdef class CommentParser:
8484
(self.open_state == 4 and c == u'-')):
8585
self.open_state += 1
8686
else:
87+
# Handle <!> comment
88+
if self.open_state == 3 and c == u'>':
89+
self.inside_comment = False
90+
self.reset()
91+
self.start, self.end = i - 2, i
92+
return True
8793
self.open_state = 1
88-
8994
if self.open_state == 5:
9095
if self.open_count == 0:
9196
self.start = i - 3
@@ -233,6 +238,8 @@ cpdef parse_html(s):
233238
parsed.append(
234239
HtmlDataFragment(comment_parser.start, tag_end + 1, False))
235240
reset_tag = True
241+
if (comment_parser.end - comment_parser.start) == 2:
242+
open_tag = False
236243

237244
if comment_parser.inside_comment:
238245
open_tag = False

scrapely/compat.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

scrapely/extraction/regionextract.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,26 +64,25 @@ class BasicTypeExtractor(object):
6464
annotations.
6565
6666
For example:
67-
>>> from scrapely.compat import utext
6867
>>> from scrapely.extraction.pageparsing import parse_strings
6968
>>> template, page = parse_strings( \
7069
u'<h1 data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x</h1>', u'<h1> a name</h1>')
7170
>>> ex = BasicTypeExtractor(template.annotations[0])
72-
>>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)]
71+
>>> ex.extract(page, 0, 1, None)
7372
[(u'name', u' a name')]
7473
7574
It supports attribute descriptors
7675
>>> descriptor = FieldDescriptor('name', None, lambda x: x.strip())
7776
>>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor})
78-
>>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)]
77+
>>> ex.extract(page, 0, 1, None)
7978
[(u'name', u'a name')]
8079
8180
It supports ignoring regions
8281
>>> template, page = parse_strings(\
8382
u'<div data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x<b> xx</b></div>',\
8483
u'<div>a name<b> id-9</b></div>')
8584
>>> ex = BasicTypeExtractor(template.annotations[0])
86-
>>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 3, [PageRegion(1, 2)])]
85+
>>> ex.extract(page, 0, 3, [PageRegion(1, 2)])
8786
[(u'name', u'a name')]
8887
"""
8988

scrapely/extractors.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,7 @@ def text(region):
8181
removing excessive whitespace,
8282
8383
For example:
84-
>>> from scrapely.compat import utext
85-
>>> t = lambda s: utext(text(htmlregion(s)))
84+
>>> t = lambda s: text(htmlregion(s))
8685
>>> t(u'<h1>test</h1>')
8786
u'test'
8887
@@ -123,8 +122,7 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
123122
opening and closing tag is removed.
124123
125124
For example:
126-
>>> from scrapely.compat import utext
127-
>>> t = lambda s, keep=_TAGS_TO_KEEP: utext(safehtml(htmlregion(s), keep))
125+
>>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep)
128126
>>> t(u'<strong>test <blink>test</blink></strong>')
129127
u'<strong>test test</strong>'
130128
@@ -274,8 +272,7 @@ def extract_number(txt):
274272
>>> extract_number(' 45.3, 7')
275273
276274
It will handle unescaped entities:
277-
>>> from scrapely.compat import utext
278-
>>> utext(extract_number(u'&#163;129&#46;99'))
275+
>>> extract_number(u'&#163;129&#46;99')
279276
u'129.99'
280277
"""
281278
txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)
@@ -288,7 +285,6 @@ def extract_price(txt):
288285
"""
289286
Extracts numbers making some price format specific assumptions
290287
291-
>>> from scrapely.compat import utext
292288
>>> extract_price('asdf 234,234.45sdf ')
293289
'234234.45'
294290
>>> extract_price('234,23')
@@ -302,7 +298,7 @@ def extract_price(txt):
302298
>>> extract_price('adsfg')
303299
>>> extract_price('stained, linseed oil finish, clear glas doors')
304300
>>> extract_price('')
305-
>>> utext(extract_price(u'&#163;129&#46;99'))
301+
>>> extract_price(u'&#163;129&#46;99')
306302
u'129.99'
307303
"""
308304
txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)

scrapely/htmlpage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def __repr__(self):
8282
_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
8383
_DOCTYPE = r"<!DOCTYPE.*?>"
8484
_SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
85-
_COMMENT = "(<!--.*?-->|<\?.+?>)"
85+
_COMMENT = "(<!--.*?--!?>|<\?.+?>|<!>)"
8686

8787
_ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
8888
_HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG),

setup.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
#!/usr/bin/env python
22
import os
3+
import platform
34
from setuptools import setup, find_packages
45
from setuptools.extension import Extension
56
import numpy as np
67

78

89
USE_CYTHON = 'CYTHONIZE' in os.environ
10+
IS_PYPY = platform.python_implementation() == 'PyPy'
911
ext = '.pyx' if USE_CYTHON else '.c'
1012
extensions = [
1113
Extension("scrapely._htmlpage",
@@ -15,9 +17,11 @@
1517
["scrapely/extraction/_similarity%s" % ext],
1618
include_dirs=[np.get_include()]),
1719
]
18-
if USE_CYTHON:
20+
if USE_CYTHON and not IS_PYPY:
1921
from Cython.Build import cythonize
2022
extensions = cythonize(extensions)
23+
if IS_PYPY:
24+
extensions = []
2125

2226

2327
setup(
@@ -45,7 +49,7 @@
4549
'Topic :: Text Processing :: Markup :: HTML',
4650
],
4751
install_requires=['numpy', 'w3lib', 'six'],
48-
extras_requires={
52+
extras_require={
4953
'speedup': ['cython']
5054
},
5155
ext_modules=extensions,

tests/test_htmlpage_data.py

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
PAGE = u"""
2-
<style id="scrapy-style" type="text/css">@import url(http://localhost:8000/as/site_media/clean.css);
3-
</style>
2+
<style id="scrapy-style" type="text/css">@import url(http://localhost:8000/as/site_media/clean.css);
3+
</style>
44
<body>
55
<div class="scrapy-selected" id="header">
66
<img src="company_logo.jpg" style="margin-left: 68px; padding-top:5px;" alt="Logo" width="530" height="105">
@@ -152,27 +152,52 @@
152152
{'end': 150, 'start': 149},
153153
]
154154

155-
# for testing tags inside comments
156-
PAGE3 = u"""<html><body><h1>Helloooo!!</h1><p>Did i say hello??</p><!--<p>
157-
</p>--><script type="text/javascript">bla<!--comment-->blabla</script></body></html>"""
155+
# for testing tags in different forms
156+
PAGE3 = u"""<!DOCTYPE html>
157+
<html>
158+
<head>
159+
<!-- Standard comment style -->
160+
<title>Page name</title>
161+
<meta name="name" content="value"><!> <!-- <- Self Closing Comment --!>
162+
</head>
163+
164+
<!-- Comment used for ignoring a script
165+
<script type="text/javascript">
166+
var a = 1;
167+
</script>
168+
-->
169+
<body>
170+
</body>
171+
</html>
172+
"""
158173

159174
PARSED3 = [
160-
{'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1},
161-
{'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1},
162-
{'attributes': {}, 'end': 16, 'start': 12, 'tag': u'h1', 'tag_type': 1},
163-
{'end': 26, 'start': 16},
164-
{'attributes': {}, 'end': 31, 'start': 26, 'tag': u'h1', 'tag_type': 2},
165-
{'attributes': {}, 'end': 34, 'start': 31, 'tag': u'p', 'tag_type': 1},
166-
{'end': 51, 'start': 34},
167-
{'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2},
168-
{'end': 70, 'start': 55, 'is_text_content': False},
169-
{'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1},
170-
{'end': 104, 'start': 101, 'is_text_content': False},
171-
{'end': 118, 'start': 104, 'is_text_content': False},
172-
{'end': 124, 'start': 118, 'is_text_content': False},
173-
{'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2},
174-
{'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2},
175-
{'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2}
175+
{'end': 16, 'start': 15, 'is_text_content': True},
176+
{'end': 22, 'start': 16, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'html'},
177+
{'end': 27, 'start': 22, 'is_text_content': True},
178+
{'end': 33, 'start': 27, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'head'},
179+
{'end': 38, 'start': 33, 'is_text_content': True},
180+
{'end': 69, 'start': 38, 'is_text_content': False},
181+
{'end': 74, 'start': 69, 'is_text_content': True},
182+
{'end': 81, 'start': 74, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'title'},
183+
{'end': 90, 'start': 81, 'is_text_content': True},
184+
{'end': 98, 'start': 90, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'title'},
185+
{'end': 103, 'start': 98, 'is_text_content': True},
186+
{'end': 137, 'start': 103, 'attributes': {'content': 'value', 'name': 'name'}, 'tag_type': 1, 'is_text_content': False, 'tag': 'meta'},
187+
{'end': 140, 'start': 137, 'is_text_content': False},
188+
{'end': 141, 'start': 140, 'is_text_content': True},
189+
{'end': 174, 'start': 141, 'is_text_content': False},
190+
{'end': 179, 'start': 174, 'is_text_content': True},
191+
{'end': 186, 'start': 179, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'head'},
192+
{'end': 192, 'start': 186, 'is_text_content': True},
193+
{'end': 320, 'start': 192, 'is_text_content': False},
194+
{'end': 325, 'start': 320, 'is_text_content': True},
195+
{'end': 331, 'start': 325, 'attributes': {}, 'tag_type': 1, 'is_text_content': False, 'tag': 'body'},
196+
{'end': 336, 'start': 331, 'is_text_content': True},
197+
{'end': 343, 'start': 336, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'body'},
198+
{'end': 344, 'start': 343, 'is_text_content': True},
199+
{'end': 351, 'start': 344, 'attributes': {}, 'tag_type': 2, 'is_text_content': False, 'tag': 'html'},
200+
{'end': 352, 'start': 351, 'is_text_content': True}
176201
]
177202

178203
# for testing tags inside scripts
@@ -293,4 +318,3 @@
293318
{"attributes": {}, "end": 91, "start": 84, "tag": "body", "tag_type": 2},
294319
{"attributes": {}, "end": 98, "start": 91, "tag": "html", "tag_type": 2}
295320
]
296-

tox.ini

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# and then run "tox" from this directory.
55

66
[tox]
7-
envlist = py27,py34
7+
envlist = py27,py34,pypy,pypy3
88
usedevelop = True
99

1010
[testenv]
@@ -14,6 +14,7 @@ deps =
1414
nose-parameterized
1515
doctest-ignore-unicode
1616
coverage
17+
cython
1718
commands =
1819
pip install -e .
1920
nosetests \

0 commit comments

Comments
 (0)