Commit ac62a4e

Python 3 fixes

- Add compatibility function for some tests
- Add fallback if no C extensions installed
- Fix comment parsing in C extension
1 parent 2f537b5 commit ac62a4e

8 files changed: 198 additions, 31 deletions

scrapely/_htmlpage.pyx

Lines changed: 8 additions & 8 deletions
@@ -90,7 +90,7 @@ cdef class CommentParser:
             if self.open_count == 0:
                 self.start = i - 3
             self.open_state = 1
-            self.open_count += 1
+            self.open_count = 1
             self.inside_comment = True
 
         if self.close_count < self.open_count:
@@ -141,12 +141,12 @@ cdef class ScriptParser:
             self.state = 1
         if ((self.state == 1 and c == u'<') or
                 (self.state == 2 and c == u'/') or
-                (self.state == 3 and c == u's' or c == u'S') or
-                (self.state == 4 and c == u'c' or c == u'C') or
-                (self.state == 5 and c == u'r' or c == u'R') or
-                (self.state == 6 and c == u'i' or c == u'I') or
-                (self.state == 7 and c == u'p' or c == u'P') or
-                (self.state == 8 and c == u't' or c == u'T') or
+                (self.state == 3 and c in u'sS') or
+                (self.state == 4 and c in u'cC') or
+                (self.state == 5 and c in u'rR') or
+                (self.state == 6 and c in u'iI') or
+                (self.state == 7 and c in u'pP') or
+                (self.state == 8 and c in u'tT') or
                 (self.state == 9 and c == u'>')):
             self.state += 1
         else:
@@ -288,7 +288,7 @@ cpdef parse_html(s):
         if tag_name != u'!doctype':
             parsed.append(
                 HtmlTag(tag_type, tag_name,
-                    tag_attributes, tag_start, tag_end + 1))
+                        tag_attributes, tag_start, tag_end + 1))
         if tag_name == u'script':
             script = True
         if open_tag:
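
The CommentParser change above is the "fix comment parsing" part of the commit: HTML comments do not nest, so a second `<!--` inside an open comment must not require a second `-->` to close it. The following is a hypothetical, simplified Python model of the counting logic (not the Cython source), just to illustrate the two behaviours:

import re

def comment_spans(html, nested_counting=False):
    """Return (start, end) spans of comments under either counting rule."""
    spans, open_count, start = [], 0, None
    for m in re.finditer(r'<!--|-->', html):
        if m.group() == '<!--':
            if open_count == 0:
                start = m.start()
            # old behaviour: open_count += 1; fixed behaviour: open_count = 1
            open_count = open_count + 1 if nested_counting else 1
        elif open_count:
            open_count -= 1
            if open_count == 0:
                spans.append((start, m.end()))
    return spans

html = u'<!-- a <!-- b --> after'
print(comment_spans(html))                        # [(0, 17)]: closes at the first -->
print(comment_spans(html, nested_counting=True))  # []: the comment never closes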

scrapely/compat.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+try:
+    utext = unicode
+except NameError:
+    class utext(str):
+        def __repr__(self):
+            return 'u{}'.format(super(utext, self).__repr__())
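
This shim exists because text reprs differ across Python versions: Python 2 unicode reprs carry a `u` prefix, Python 3 str reprs do not, which breaks doctests written against the `u'...'` form. A quick sketch of the intended effect:

from scrapely.compat import utext

# On Python 2 utext is unicode; on Python 3 it is the str subclass above.
# Either way the repr carries the u prefix the existing doctests expect:
print(repr(utext('a name')))  # u'a name'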

scrapely/extraction/regionextract.py

Lines changed: 6 additions & 5 deletions
@@ -64,25 +64,26 @@ class BasicTypeExtractor(object):
     annotations.
 
     For example:
+    >>> from scrapely.compat import utext
    >>> from scrapely.extraction.pageparsing import parse_strings
     >>> template, page = parse_strings( \
             u'<h1 data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x</h1>', u'<h1> a name</h1>')
     >>> ex = BasicTypeExtractor(template.annotations[0])
-    >>> ex.extract(page, 0, 1, None)
+    >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)]
     [(u'name', u' a name')]
 
     It supports attribute descriptors
     >>> descriptor = FieldDescriptor('name', None, lambda x: x.strip())
     >>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor})
-    >>> ex.extract(page, 0, 1, None)
+    >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 1, None)]
     [(u'name', u'a name')]
 
     It supports ignoring regions
     >>> template, page = parse_strings(\
         u'<div data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x<b> xx</b></div>',\
         u'<div>a name<b> id-9</b></div>')
     >>> ex = BasicTypeExtractor(template.annotations[0])
-    >>> ex.extract(page, 0, 3, [PageRegion(1, 2)])
+    >>> [tuple(map(utext, r)) for r in ex.extract(page, 0, 3, [PageRegion(1, 2)])]
     [(u'name', u'a name')]
     """

@@ -640,12 +641,12 @@ def extract_text(self, text):
         pref_index = 0
         if self.minprefix > 0:
             rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix)
-            if plen < self.minprefix:
+            if plen is None or plen < self.minprefix:
                 return None
             pref_index = -rev_idx
         if self.minsuffix == 0:
             return text[pref_index:]
         sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix)
-        if slen < self.minsuffix:
+        if slen is None or slen < self.minsuffix:
             return None
         return text[pref_index:pref_index + sidx]
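
The two `is None` guards above are a genuine Python 3 fix, assuming a failed `longest_unique_subsequence` lookup yields `plen = None` (which is what the new guard implies): on Python 2, `None < 1` quietly evaluates to True (None sorts before every number), so a failed match fell through to `return None`; on Python 3 the same comparison raises TypeError. A small illustration:

plen, minprefix = None, 1  # assumed no-match result, per the guard above

try:
    plen < minprefix            # True on Python 2, TypeError on Python 3
except TypeError:
    pass

print(plen is None or plen < minprefix)  # True on both versions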

scrapely/extraction/similarity.py

Lines changed: 12 additions & 4 deletions
@@ -6,9 +6,17 @@
66
from operator import itemgetter
77
from heapq import nlargest
88

9-
# For typical use cases (small sequences and patterns) the naive approach actually
10-
# runs faster than KMP algorithm
11-
from . _similarity import naive_match_length
9+
try:
10+
# For typical use cases (small sequences and patterns) the naive approach
11+
# actually runs faster than KMP algorithm
12+
from . _similarity import naive_match_length
13+
except ImportError:
14+
def naive_match_length(to_search, subsequence, range_start, range_end):
15+
startval = subsequence[0]
16+
return ((i, common_prefix_length(to_search[i:], subsequence))
17+
for i in xrange(range_start, range_end)
18+
if startval == to_search[i])
19+
1220

1321
def common_prefix_length(a, b):
1422
"""Calculate the length of the common prefix in both sequences passed.
@@ -46,7 +54,7 @@ def common_prefix(*sequences):
 
 
 def longest_unique_subsequence(to_search, subsequence, range_start=0,
-                            range_end=None):
+                               range_end=None):
     """Find the longest unique subsequence of items in an array or string. This
     searches to_search looking for the longest overlapping
     match with subsequence. If the largest match is unique (there is no other
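
For reference, here is roughly how the pure-Python fallback behaves: it yields every candidate start position paired with the length of the overlap there, and `longest_unique_subsequence` then picks the best. (Note the committed fallback still calls `xrange`, which does not exist on Python 3, so a Python 3 run of this path would need `range`.) A self-contained sketch under that substitution:

def common_prefix_length(a, b):
    # length of the common prefix of the two sequences
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

def naive_match_length(to_search, subsequence, range_start, range_end):
    startval = subsequence[0]
    return ((i, common_prefix_length(to_search[i:], subsequence))
            for i in range(range_start, range_end)  # the commit uses xrange
            if startval == to_search[i])

print(list(naive_match_length(u'abcabcab', u'abc', 0, 6)))
# [(0, 3), (3, 3)]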

scrapely/extractors.py

Lines changed: 8 additions & 4 deletions
@@ -81,7 +81,8 @@ def text(region):
     removing excessive whitespace,
 
     For example:
-    >>> t = lambda s: text(htmlregion(s))
+    >>> from scrapely.compat import utext
+    >>> t = lambda s: utext(text(htmlregion(s)))
     >>> t(u'<h1>test</h1>')
     u'test'
 
@@ -122,7 +123,8 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
     opening and closing tag is removed.
 
     For example:
-    >>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep)
+    >>> from scrapely.compat import utext
+    >>> t = lambda s, keep=_TAGS_TO_KEEP: utext(safehtml(htmlregion(s), keep))
     >>> t(u'<strong>test <blink>test</blink></strong>')
     u'<strong>test test</strong>'
 
@@ -272,7 +274,8 @@ def extract_number(txt):
     >>> extract_number(' 45.3, 7')
 
     It will handle unescaped entities:
-    >>> extract_number(u'&#163;129&#46;99')
+    >>> from scrapely.compat import utext
+    >>> utext(extract_number(u'&#163;129&#46;99'))
     u'129.99'
     """
     txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)
@@ -285,6 +288,7 @@ def extract_price(txt):
     """
     Extracts numbers making some price format specific assumptions
 
+    >>> from scrapely.compat import utext
     >>> extract_price('asdf 234,234.45sdf ')
     '234234.45'
     >>> extract_price('234,23')
@@ -298,7 +302,7 @@ def extract_price(txt):
     >>> extract_price('adsfg')
     >>> extract_price('stained, linseed oil finish, clear glas doors')
     >>> extract_price('')
-    >>> extract_price(u'&#163;129&#46;99')
+    >>> utext(extract_price(u'&#163;129&#46;99'))
     u'129.99'
     """
     txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)

scrapely/htmlpage.py

Lines changed: 153 additions & 8 deletions
@@ -11,14 +11,158 @@
 from six.moves.urllib.request import urlopen
 from copy import deepcopy
 from w3lib.encoding import html_to_unicode
-
-from . import _htmlpage
-
-
-parse_html = _htmlpage.parse_html
-HtmlDataFragment = _htmlpage.HtmlDataFragment
-HtmlTag = _htmlpage.HtmlTag
-HtmlTagType = _htmlpage.HtmlTagType
+try:
+    from . import _htmlpage
+    parse_html = _htmlpage.parse_html
+    HtmlDataFragment = _htmlpage.HtmlDataFragment
+    HtmlTag = _htmlpage.HtmlTag
+    HtmlTagType = _htmlpage.HtmlTagType
+except ImportError:
+    import re
+    from collections import OrderedDict
+
+    class HtmlTagType(object):
+        OPEN_TAG = 1
+        CLOSE_TAG = 2
+        UNPAIRED_TAG = 3
+
+    class HtmlDataFragment(object):
+        __slots__ = ('start', 'end', 'is_text_content')
+
+        def __init__(self, start, end, is_text_content=False):
+            self.start = start
+            self.end = end
+            self.is_text_content = is_text_content
+
+        def __str__(self):
+            return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (
+                self.start, self.end, self.is_text_content)
+
+        def __repr__(self):
+            return str(self)
+
+    class HtmlTag(HtmlDataFragment):
+        __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text')
+
+        def __init__(self, tag_type, tag, attr_text, start, end):
+            HtmlDataFragment.__init__(self, start, end)
+            self.tag_type = tag_type
+            self.tag = tag
+            if isinstance(attr_text, dict):
+                self._attributes = attr_text
+                self._attr_text = None
+            else:  # defer loading attributes until necessary
+                self._attributes = OrderedDict()
+                self._attr_text = attr_text
+
+        @property
+        def attributes(self):
+            if not self._attributes and self._attr_text:
+                for attr_match in _ATTR_REGEXP.findall(self._attr_text):
+                    name = attr_match[0].lower()
+                    values = [v for v in attr_match[1:] if v]
+                    # According to HTML spec if attribute name is repeated
+                    # only the first one is taken into account
+                    if name not in self._attributes:
+                        self._attributes[name] = values[0] if values else None
+            return self._attributes
+
+        def __str__(self):
+            attributes = ', '.join(
+                sorted(["%s: %s" % (k, repr(v))
+                        for k, v in self.attributes.items()]))
+            return "<HtmlTag tag='%s' attributes={%s} type='%d' [%s:%s]>" % (
+                self.tag, attributes, self.tag_type, self.start, self.end)
+
+        def __repr__(self):
+            return str(self)
+
+    _ATTR = ("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|"
+             "([^>\s]+))?)?")
+    _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
+    _DOCTYPE = r"<!DOCTYPE.*?>"
+    _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
+    _COMMENT = "(<!--.*?-->|<\?.+?>)"
+
+    _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
+    _HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG),
+                              re.I | re.DOTALL)
+    _DOCTYPE_REGEXP = re.compile("(?:%s)" % _DOCTYPE)
+    _COMMENT_REGEXP = re.compile(_COMMENT, re.DOTALL)
+
+    def parse_html(text):
+        """Higher level html parser. Calls lower level parsers and joins
+        sucesive HtmlDataFragment elements in a single one.
+        """
+        # If have doctype remove it.
+        start_pos = 0
+        match = _DOCTYPE_REGEXP.match(text)
+        if match:
+            start_pos = match.end()
+        prev_end = start_pos
+        for match in _HTML_REGEXP.finditer(text, start_pos):
+            start = match.start()
+            end = match.end()
+
+            if start > prev_end:
+                yield HtmlDataFragment(prev_end, start, True)
+
+            if match.groups()[0] is not None:  # comment
+                yield HtmlDataFragment(start, end)
+            elif match.groups()[1] is not None:  # <script>...</script>
+                for e in _parse_script(match):
+                    yield e
+            else:  # tag
+                yield _parse_tag(match)
+            prev_end = end
+        textlen = len(text)
+        if prev_end < textlen:
+            yield HtmlDataFragment(prev_end, textlen, True)
+
+    def _parse_script(match):
+        """parse a <script>...</script> region matched by _HTML_REGEXP"""
+        open_text, content, close_text = match.groups()[1:4]
+
+        open_tag = _parse_tag(_HTML_REGEXP.match(open_text))
+        open_tag.start = match.start()
+        open_tag.end = match.start() + len(open_text)
+
+        close_tag = _parse_tag(_HTML_REGEXP.match(close_text))
+        close_tag.start = match.end() - len(close_text)
+        close_tag.end = match.end()
+
+        yield open_tag
+        if open_tag.end < close_tag.start:
+            start_pos = 0
+            for m in _COMMENT_REGEXP.finditer(content):
+                if m.start() > start_pos:
+                    yield HtmlDataFragment(
+                        open_tag.end + start_pos, open_tag.end + m.start())
+                yield HtmlDataFragment(
+                    open_tag.end + m.start(), open_tag.end + m.end())
+                start_pos = m.end()
+            if open_tag.end + start_pos < close_tag.start:
+                yield HtmlDataFragment(
+                    open_tag.end + start_pos, close_tag.start)
+        yield close_tag
+
+    def _parse_tag(match):
+        """
+        parse a tag matched by _HTML_REGEXP
+        """
+        data = match.groups()
+        closing, tag, attr_text = data[4:7]
+        # if tag is None then the match is a comment
+        if tag is not None:
+            unpaired = data[-1]
+            if closing:
+                tag_type = HtmlTagType.CLOSE_TAG
+            elif unpaired:
+                tag_type = HtmlTagType.UNPAIRED_TAG
+            else:
+                tag_type = HtmlTagType.OPEN_TAG
+            return HtmlTag(tag_type, tag.lower(), attr_text, match.start(),
+                           match.end())
 
 
 def url_to_page(url, encoding=None, default_encoding='utf-8'):
@@ -164,6 +308,7 @@ def __new__(cls, htmlpage, start_index, end_index):
         text_start = htmlpage.parsed_body[start_index].start
         text_end = htmlpage.parsed_body[end_index or -1].end
         text = htmlpage.body[text_start:text_end]
+
         return HtmlPageRegion.__new__(cls, htmlpage, text)
 
     def __init__(self, htmlpage, start_index, end_index):
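
With the C extension unavailable, the regex-based fallback above serves the same public API. A quick exercise of that path (output shown as a hedged assumption, traced from the fallback's code):

from scrapely.htmlpage import parse_html, HtmlTag

for fragment in parse_html(u'<p class="x">hi</p><br/>'):
    if isinstance(fragment, HtmlTag):
        print("tag=%s type=%d attrs=%r" % (
            fragment.tag, fragment.tag_type, dict(fragment.attributes)))
    else:
        print("text [%d:%d]" % (fragment.start, fragment.end))
# tag=p type=1 attrs={'class': 'x'}
# text [13:15]
# tag=p type=2 attrs={}
# tag=br type=3 attrs={}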

setup.py

Lines changed: 4 additions & 1 deletion
@@ -22,7 +22,7 @@
 
 setup(
     name='scrapely',
-    version='0.12.0',
+    version='0.13.0b1',
     license='BSD',
     description='A pure-python HTML screen-scraping library',
     author='Scrapy project',
@@ -45,5 +45,8 @@
         'Topic :: Text Processing :: Markup :: HTML',
     ],
     install_requires=['numpy', 'w3lib', 'six'],
+    extras_requires={
+        'speedup': ['cython']
+    },
     ext_modules=extensions,
 )
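
One hedged caveat on the new extra: setuptools' recognized keyword is `extras_require` (no trailing "s"), so the `extras_requires` key as committed is silently ignored and `pip install scrapely[speedup]` would not pull in cython. The presumably intended declaration would look like:

from setuptools import setup

setup(
    name='scrapely',
    version='0.13.0b1',
    install_requires=['numpy', 'w3lib', 'six'],
    extras_require={  # note: no trailing "s"
        'speedup': ['cython'],
    },
)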

tox.ini

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27,py33,py34
+envlist = py27,py34
 usedevelop = True
 
 [testenv]
