Commit a1eb99e

Merge pull request #98 from scrapy/python3-support
Python3 support
2 parents: 4769783 + cf0ee1b

10 files changed, +257 -58 lines

.travis.yml

Lines changed: 3 additions & 3 deletions
@@ -3,13 +3,13 @@ python: 2.7
 
 env:
   - TOXENV=py27
-  - TOXENV=py33
   - TOXENV=py34
+  - TOXENV=pypy
 
 install:
   - pip install cython
-  - pip install -U tox codecov
-
+  - CYTHONIZE=1 python setup.py build
+  - pip install -U tox
 script: tox
 
 after_success:

MANIFEST.in

Lines changed: 3 additions & 1 deletion
@@ -1,2 +1,4 @@
 include scrapely/*.pyx
-include scrapely/extraction/*.pyx
+include scrapely/extraction/*.pyx
+include scrapely/*.c
+include scrapely/extraction/*.c

requirements.txt

Lines changed: 1 addition & 2 deletions
@@ -1,4 +1,3 @@
 numpy
 w3lib
-six
-cython
+six

scrapely/_htmlpage.pyx

Lines changed: 16 additions & 9 deletions
@@ -84,13 +84,18 @@ cdef class CommentParser:
             (self.open_state == 4 and c == u'-')):
             self.open_state += 1
         else:
+            # Handle <!> comment
+            if self.open_state == 3 and c == u'>':
+                self.inside_comment = False
+                self.reset()
+                self.start, self.end = i - 2, i
+                return True
             self.open_state = 1
-
         if self.open_state == 5:
             if self.open_count == 0:
                 self.start = i - 3
             self.open_state = 1
-            self.open_count += 1
+            self.open_count = 1
             self.inside_comment = True
 
         if self.close_count < self.open_count:
@@ -141,12 +146,12 @@ cdef class ScriptParser:
             self.state = 1
         if ((self.state == 1 and c == u'<') or
             (self.state == 2 and c == u'/') or
-            (self.state == 3 and c == u's' or c == u'S') or
-            (self.state == 4 and c == u'c' or c == u'C') or
-            (self.state == 5 and c == u'r' or c == u'R') or
-            (self.state == 6 and c == u'i' or c == u'I') or
-            (self.state == 7 and c == u'p' or c == u'P') or
-            (self.state == 8 and c == u't' or c == u'T') or
+            (self.state == 3 and c in u'sS') or
+            (self.state == 4 and c in u'cC') or
+            (self.state == 5 and c in u'rR') or
+            (self.state == 6 and c in u'iI') or
+            (self.state == 7 and c in u'pP') or
+            (self.state == 8 and c in u'tT') or
             (self.state == 9 and c == u'>')):
             self.state += 1
         else:
@@ -233,6 +238,8 @@ cpdef parse_html(s):
             parsed.append(
                 HtmlDataFragment(comment_parser.start, tag_end + 1, False))
             reset_tag = True
+            if (comment_parser.end - comment_parser.start) == 2:
+                open_tag = False
 
         if comment_parser.inside_comment:
             open_tag = False
@@ -288,7 +295,7 @@ cpdef parse_html(s):
             if tag_name != u'!doctype':
                 parsed.append(
                     HtmlTag(tag_type, tag_name,
-                        tag_attributes, tag_start, tag_end + 1))
+                            tag_attributes, tag_start, tag_end + 1))
             if tag_name == u'script':
                 script = True
         if open_tag:
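
A side note on the ScriptParser hunk above: in the old per-letter checks, `and` binds tighter than `or`, so a clause like `(self.state == 3 and c == u's' or c == u'S')` was true for an uppercase letter in any state; `c in u'sS'` checks the state first. A minimal sketch of the difference, with illustrative values that are not from the commit:

    state, c = 7, u'S'
    old = (state == 3 and c == u's' or c == u'S')  # True: the `or` arm ignores state
    new = (state == 3 and c in u'sS')              # False: state is checked first
    assert old and not new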

scrapely/extraction/regionextract.py

Lines changed: 2 additions & 2 deletions
@@ -640,12 +640,12 @@ def extract_text(self, text):
         pref_index = 0
         if self.minprefix > 0:
             rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix)
-            if plen < self.minprefix:
+            if plen is None or plen < self.minprefix:
                 return None
             pref_index = -rev_idx
         if self.minsuffix == 0:
             return text[pref_index:]
         sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix)
-        if slen < self.minsuffix:
+        if slen is None or slen < self.minsuffix:
             return None
         return text[pref_index:pref_index + sidx]
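
The added `is None` guards matter for Python 3: as the code implies, `longest_unique_subsequence` can return `None` for the match length when nothing is found, and while Python 2 permitted ordering comparisons against `None`, Python 3 raises TypeError. A minimal sketch of the portable check, assuming a `None` no-match value:

    def long_enough(plen, minprefix=2):
        # short-circuit before the ordering comparison ever runs
        return not (plen is None or plen < minprefix)

    assert long_enough(None) is False  # `None < 2` would raise TypeError on Python 3
    assert long_enough(1) is False
    assert long_enough(5) is True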

scrapely/extraction/similarity.py

Lines changed: 12 additions & 4 deletions
@@ -6,9 +6,17 @@
 from operator import itemgetter
 from heapq import nlargest
 
-# For typical use cases (small sequences and patterns) the naive approach actually
-# runs faster than KMP algorithm
-from . _similarity import naive_match_length
+try:
+    # For typical use cases (small sequences and patterns) the naive approach
+    # actually runs faster than KMP algorithm
+    from . _similarity import naive_match_length
+except ImportError:
+    def naive_match_length(to_search, subsequence, range_start, range_end):
+        startval = subsequence[0]
+        return ((i, common_prefix_length(to_search[i:], subsequence))
+                for i in xrange(range_start, range_end)
+                if startval == to_search[i])
+
 
 def common_prefix_length(a, b):
     """Calculate the length of the common prefix in both sequences passed.
@@ -46,7 +54,7 @@ def common_prefix(*sequences):
 
 
 def longest_unique_subsequence(to_search, subsequence, range_start=0,
-        range_end=None):
+                               range_end=None):
     """Find the longest unique subsequence of items in an array or string. This
     searches to_search looking for the longest overlapping
     match with subsequence. If the largest match is unique (there is no other
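
The pure-Python fallback yields a lazy sequence of `(position, match_length)` pairs, one for every index in the searched range where the subsequence's first item matches; note that as committed it iterates with `xrange`, which would raise NameError when consumed on Python 3. A concrete sketch of what it computes, using `range` and a local stand-in for `common_prefix_length`:

    def common_prefix_length(a, b):
        # length of the shared prefix of two sequences (mirrors the helper above)
        n = 0
        for x, y in zip(a, b):
            if x != y:
                break
            n += 1
        return n

    matches = [(i, common_prefix_length("abcabcab"[i:], "abc"))
               for i in range(0, 6) if "abcabcab"[i] == "a"]
    assert matches == [(0, 3), (3, 3)]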

scrapely/htmlpage.py

Lines changed: 153 additions & 8 deletions
@@ -11,14 +11,158 @@
 from six.moves.urllib.request import urlopen
 from copy import deepcopy
 from w3lib.encoding import html_to_unicode
-
-from . import _htmlpage
-
-
-parse_html = _htmlpage.parse_html
-HtmlDataFragment = _htmlpage.HtmlDataFragment
-HtmlTag = _htmlpage.HtmlTag
-HtmlTagType = _htmlpage.HtmlTagType
+try:
+    from . import _htmlpage
+    parse_html = _htmlpage.parse_html
+    HtmlDataFragment = _htmlpage.HtmlDataFragment
+    HtmlTag = _htmlpage.HtmlTag
+    HtmlTagType = _htmlpage.HtmlTagType
+except ImportError:
+    import re
+    from collections import OrderedDict
+
+    class HtmlTagType(object):
+        OPEN_TAG = 1
+        CLOSE_TAG = 2
+        UNPAIRED_TAG = 3
+
+    class HtmlDataFragment(object):
+        __slots__ = ('start', 'end', 'is_text_content')
+
+        def __init__(self, start, end, is_text_content=False):
+            self.start = start
+            self.end = end
+            self.is_text_content = is_text_content
+
+        def __str__(self):
+            return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (
+                self.start, self.end, self.is_text_content)
+
+        def __repr__(self):
+            return str(self)
+
+    class HtmlTag(HtmlDataFragment):
+        __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text')
+
+        def __init__(self, tag_type, tag, attr_text, start, end):
+            HtmlDataFragment.__init__(self, start, end)
+            self.tag_type = tag_type
+            self.tag = tag
+            if isinstance(attr_text, dict):
+                self._attributes = attr_text
+                self._attr_text = None
+            else: # defer loading attributes until necessary
+                self._attributes = OrderedDict()
+                self._attr_text = attr_text
+
+        @property
+        def attributes(self):
+            if not self._attributes and self._attr_text:
+                for attr_match in _ATTR_REGEXP.findall(self._attr_text):
+                    name = attr_match[0].lower()
+                    values = [v for v in attr_match[1:] if v]
+                    # According to HTML spec if attribute name is repeated only
+                    # the first one is taken into account
+                    if name not in self._attributes:
+                        self._attributes[name] = values[0] if values else None
+            return self._attributes
+
+        def __str__(self):
+            attributes = ', '.join(
+                sorted(["%s: %s" % (k, repr(v))
+                        for k, v in self.attributes.items()]))
+            return "<HtmlTag tag='%s' attributes={%s} type='%d' [%s:%s]>" % (
+                self.tag, attributes, self.tag_type, self.start, self.end)
+
+        def __repr__(self):
+            return str(self)
+
+    _ATTR = ("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|"
+             "([^>\s]+))?)?")
+    _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
+    _DOCTYPE = r"<!DOCTYPE.*?>"
+    _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
+    _COMMENT = "(<!--.*?--!?>|<\?.+?>|<!>)"
+
+    _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
+    _HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG),
+                              re.I | re.DOTALL)
+    _DOCTYPE_REGEXP = re.compile("(?:%s)" % _DOCTYPE)
+    _COMMENT_REGEXP = re.compile(_COMMENT, re.DOTALL)
+
+    def parse_html(text):
+        """Higher level html parser. Calls lower level parsers and joins sucesive
+        HtmlDataFragment elements in a single one.
+        """
+        # If have doctype remove it.
+        start_pos = 0
+        match = _DOCTYPE_REGEXP.match(text)
+        if match:
+            start_pos = match.end()
+        prev_end = start_pos
+        for match in _HTML_REGEXP.finditer(text, start_pos):
+            start = match.start()
+            end = match.end()
+
+            if start > prev_end:
+                yield HtmlDataFragment(prev_end, start, True)
+
+            if match.groups()[0] is not None: # comment
+                yield HtmlDataFragment(start, end)
+            elif match.groups()[1] is not None: # <script>...</script>
+                for e in _parse_script(match):
+                    yield e
+            else: # tag
+                yield _parse_tag(match)
+            prev_end = end
+        textlen = len(text)
+        if prev_end < textlen:
+            yield HtmlDataFragment(prev_end, textlen, True)
+
+    def _parse_script(match):
+        """parse a <script>...</script> region matched by _HTML_REGEXP"""
+        open_text, content, close_text = match.groups()[1:4]
+
+        open_tag = _parse_tag(_HTML_REGEXP.match(open_text))
+        open_tag.start = match.start()
+        open_tag.end = match.start() + len(open_text)
+
+        close_tag = _parse_tag(_HTML_REGEXP.match(close_text))
+        close_tag.start = match.end() - len(close_text)
+        close_tag.end = match.end()
+
+        yield open_tag
+        if open_tag.end < close_tag.start:
+            start_pos = 0
+            for m in _COMMENT_REGEXP.finditer(content):
+                if m.start() > start_pos:
+                    yield HtmlDataFragment(
+                        open_tag.end + start_pos, open_tag.end + m.start())
+                yield HtmlDataFragment(
+                    open_tag.end + m.start(), open_tag.end + m.end())
+                start_pos = m.end()
+            if open_tag.end + start_pos < close_tag.start:
+                yield HtmlDataFragment(
+                    open_tag.end + start_pos, close_tag.start)
+        yield close_tag
+
+    def _parse_tag(match):
+        """
+        parse a tag matched by _HTML_REGEXP
+        """
+        data = match.groups()
+        closing, tag, attr_text = data[4:7]
+        # if tag is None then the match is a comment
+        if tag is not None:
+            unpaired = data[-1]
+            if closing:
+                tag_type = HtmlTagType.CLOSE_TAG
+            elif unpaired:
+                tag_type = HtmlTagType.UNPAIRED_TAG
+            else:
+                tag_type = HtmlTagType.OPEN_TAG
+            return HtmlTag(tag_type, tag.lower(), attr_text, match.start(),
+                           match.end())
 
 
 def url_to_page(url, encoding=None, default_encoding='utf-8'):
@@ -164,6 +308,7 @@ def __new__(cls, htmlpage, start_index, end_index):
         text_start = htmlpage.parsed_body[start_index].start
         text_end = htmlpage.parsed_body[end_index or -1].end
         text = htmlpage.body[text_start:text_end]
+
         return HtmlPageRegion.__new__(cls, htmlpage, text)
 
     def __init__(self, htmlpage, start_index, end_index):
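
To get a feel for the pure-Python fallback parser, here is a hypothetical run over a tiny document; the expected output below follows the `__str__` formats defined above, with `[start:end]` as character offsets into the input:

    html = u"<p>text</p><!-- note -->"
    for fragment in parse_html(html):
        print(fragment)
    # <HtmlTag tag='p' attributes={} type='1' [0:3]>
    # <HtmlDataFragment [3:7] is_text_content: True>
    # <HtmlTag tag='p' attributes={} type='2' [7:11]>
    # <HtmlDataFragment [11:24] is_text_content: False>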

setup.py

Lines changed: 19 additions & 6 deletions
@@ -1,22 +1,32 @@
 #!/usr/bin/env python
+import os
+import platform
 from setuptools import setup, find_packages
 from setuptools.extension import Extension
-from Cython.Build import cythonize
 import numpy as np
 
+
+USE_CYTHON = 'CYTHONIZE' in os.environ
+IS_PYPY = platform.python_implementation() == 'PyPy'
+ext = '.pyx' if USE_CYTHON else '.c'
 extensions = [
     Extension("scrapely._htmlpage",
-              ["scrapely/_htmlpage.pyx"],
+              ["scrapely/_htmlpage%s" % ext],
               include_dirs=[np.get_include()]),
     Extension("scrapely.extraction._similarity",
-              ["scrapely/extraction/_similarity.pyx"],
+              ["scrapely/extraction/_similarity%s" % ext],
               include_dirs=[np.get_include()]),
 ]
+if USE_CYTHON and not IS_PYPY:
+    from Cython.Build import cythonize
+    extensions = cythonize(extensions)
+if IS_PYPY:
+    extensions = []
 
 
 setup(
     name='scrapely',
-    version='0.12.0',
+    version='0.13.0b1',
     license='BSD',
     description='A pure-python HTML screen-scraping library',
     author='Scrapy project',
@@ -38,6 +48,9 @@
         'Topic :: Internet :: WWW/HTTP',
         'Topic :: Text Processing :: Markup :: HTML',
     ],
-    install_requires=['numpy', 'w3lib', 'six', 'cython'],
-    ext_modules=cythonize(extensions),
+    install_requires=['numpy', 'w3lib', 'six'],
+    extras_require={
+        'speedup': ['cython']
+    },
+    ext_modules=extensions,
 )
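
The net effect is that Cython becomes optional: source distributions ship the pregenerated .c files (now included via MANIFEST.in), PyPy skips the C extensions and uses the pure-Python fallbacks, and re-cythonizing is opt-in. A sketch of the resulting switch; the install invocations in the comments are assumptions, not taken from the commit:

    # pip install scrapely               -> compiles the shipped .c files, no Cython needed
    # pip install scrapely[speedup]      -> additionally pulls in cython via extras_require
    # CYTHONIZE=1 python setup.py build  -> regenerates C from the .pyx sources
    import os
    import platform

    USE_CYTHON = 'CYTHONIZE' in os.environ
    IS_PYPY = platform.python_implementation() == 'PyPy'
    print('compiling from:', '.pyx' if USE_CYTHON else '.c')
    print('building C extensions:', not IS_PYPY)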
