Skip to content

Commit d0dcba3

Browse files
committed
add MDR extractor
MDR extractor is based on https://pypi.python.org/pypi/mdr/ which can detect the listing data automatically and extract listing data with scrapely annotation supervision.
1 parent 9644da3 commit d0dcba3

File tree

5 files changed

+20043
-3
lines changed

5 files changed

+20043
-3
lines changed

scrapely/extraction/regionextract.py

Lines changed: 147 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,18 @@
88
import copy
99
import pprint
1010
import cStringIO
11+
import json
1112
from itertools import groupby, izip, starmap
1213

1314
from numpy import array
1415

1516
from scrapely.descriptor import FieldDescriptor
16-
from scrapely.htmlpage import HtmlPageRegion
17+
from scrapely.htmlpage import HtmlPage, HtmlPageRegion
1718
from scrapely.extraction.similarity import (similar_region,
18-
longest_unique_subsequence, common_prefix)
19+
longest_unique_subsequence, common_prefix, common_prefix_length)
1920
from scrapely.extraction.pageobjects import (AnnotationTag,
2021
PageRegion, FragmentedHtmlPageRegion)
22+
from scrapely.extraction.pageparsing import parse_extraction_page
2123

2224
_EXTRACT_HTML = lambda x: x
2325
_DEFAULT_DESCRIPTOR = FieldDescriptor('none', None)
@@ -466,6 +468,149 @@ def apply(cls, template, extractors):
466468
def __repr__(self):
467469
return str(self)
468470

471+
class MdrExtractor(object):
    """Extractor that uses MDR_ to detect repeated listing data on a page
    and extract every repetition using the template's annotated extractors.

    .. _MDR: https://pypi.python.org/pypi/mdr/
    """
    def __init__(self, token_dict, xpath, record, extractors):
        # token_dict: used to re-tokenize each repeated element for extraction.
        # xpath: locates the listing container element on target pages.
        # record: seed record produced by MDR.extract() on the template.
        # extractors: basic extractors, keyed by the attribute name they fill.
        self.token_dict = token_dict
        self.xpath = xpath
        self.record = record
        self.extractors = dict(
            (extractor.annotation.surrounds_attribute, extractor)
            for extractor in extractors)

    def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
        """Extract listing data from ``page``.

        Returns a one-element list containing a dict mapping each annotated
        field name to the list of values found across the repeated records.
        """
        # imported lazily: mdr and lxml are optional dependencies
        from mdr import MDR
        from lxml.html import document_fromstring, tostring

        mdr = MDR()

        doc = document_fromstring(page.htmlpage.body)
        element = doc.xpath(self.xpath)
        items = {}

        if element:
            _, mapping = mdr.extract(element[0], record=self.record)
            for seed_elem, elements in mapping.iteritems():
                # the annotation may sit on the seed or on any aligned repetition
                annotation_elem = [elem for elem in ([seed_elem] + elements)
                                   if elem.attrib.get('data-scrapy-annotate')]
                if annotation_elem:
                    annotation = self._read_template_annotation(annotation_elem[0])
                    name = annotation.get('annotations', {}).get('content')
                    ex = self.extractors[name]
                    for elem in elements:
                        # re-parse each repeated element as a standalone page so
                        # the basic extractor can run over its token stream
                        elem_page = HtmlPage(None, {}, tostring(elem, encoding='unicode'))
                        parsed_elem_page = parse_extraction_page(self.token_dict, elem_page)
                        items.setdefault(name, []).extend(
                            [v for _, v in ex.extract(parsed_elem_page, 0,
                                len(parsed_elem_page.page_tokens) - 1)])
        return [items]

    @classmethod
    def apply(cls, template, extractors):
        """Build an ``MdrExtractor`` from ``template`` when it contains
        listing data.

        Returns ``(mdr_extractor_or_None, remaining_extractors)``.  Extractors
        for annotations flagged with ``listingData`` are moved out of
        ``extractors`` and into the returned ``MdrExtractor``.
        """
        try:
            from mdr import MDR
        except ImportError:
            import warnings
            warnings.warn("MDR is not available")
            return None, extractors

        mdr = MDR()
        body = template.htmlpage.body

        candidates, doc = mdr.list_candidates(body.encode('utf8'))

        # no repeated data detected
        if not candidates:
            return None, extractors

        candidate_xpaths = [doc.getpath(candidate) for candidate in candidates]

        listing_data_annotations = [a for a in template.annotations
                                    if a.metadata.get('listingData')]
        # no annotation has listingData property
        if not listing_data_annotations:
            return None, extractors

        # choose the candidate whose path best matches the common ancestor
        # of the listingData-annotated elements
        ancestor_xpath = cls._get_common_ancestor_xpath(
            doc, cls._get_listingdata_elements(doc))
        candidate_xpath = max(candidate_xpaths, key=lambda x: common_prefix_length(
            x.split('/'), ancestor_xpath.split('/')))

        candidate = doc.xpath(candidate_xpath)[0]

        # XXX: use xpath to find the element on target page, using
        # ``similar_region`` might be better
        if candidate.xpath('.//*[@data-scrapy-annotate]'):
            # remove the listing annotations from the template and the basic
            # extractors, since MdrExtractor is going to extract them
            listing_data_extractors = []
            for annotation in listing_data_annotations:
                template.annotations.remove(annotation)
                name = annotation.surrounds_attribute
                for extractor in list(extractors):
                    if name == extractor.annotation.surrounds_attribute:
                        listing_data_extractors.append(extractor)
                        extractors.remove(extractor)
            record, mapping = mdr.extract(candidate)
            cls._propagate_annotations(mapping)
            return (cls(template.token_dict,
                        cls._get_candidate_xpath(doc, candidate),
                        record, listing_data_extractors),
                    extractors)

        # fix: previously returned the bare ``extractors`` list here, which
        # broke callers unpacking the 2-tuple returned by every other branch
        return None, extractors

    @staticmethod
    def _get_candidate_xpath(doc, element):
        """Return a short, unique xpath for ``element``: prefer an id- or
        class-based expression, fall back to the absolute document path."""
        _id = element.attrib.get('id')
        _class = element.attrib.get('class')

        if _id:
            xpath = '//%s[@id="%s"]' % (element.tag, _id)
            if len(doc.xpath(xpath)) == 1:
                return xpath

        if _class:
            xpath = '//%s[@class="%s"]' % (element.tag, _class)
            if len(doc.xpath(xpath)) == 1:
                return xpath

        return doc.getpath(element)

    @staticmethod
    def _get_listingdata_elements(doc):
        """Return the elements whose ``data-scrapy-annotate`` metadata has
        the ``listingData`` property set."""
        elements = []
        for element in doc.xpath('//*[@data-scrapy-annotate]'):
            annotation = MdrExtractor._read_template_annotation(element)
            if annotation.get('listingData'):
                elements.append(element)
        return elements

    @staticmethod
    def _read_template_annotation(element):
        """Decode the JSON annotation stored in the element's
        ``data-scrapy-annotate`` attribute, or None if it has none."""
        template_attr = element.attrib.get('data-scrapy-annotate')
        if template_attr is None:
            return None
        # quotes are stored HTML-entity-escaped in the attribute value
        unescaped = template_attr.replace('&quot;', '"')
        return json.loads(unescaped)

    @staticmethod
    def _get_common_ancestor_xpath(doc, elements):
        """Return the xpath of the deepest common ancestor of ``elements``."""
        return "/".join(common_prefix(*[doc.getpath(elem).split('/')
                                        for elem in elements]))

    @staticmethod
    def _propagate_annotations(mapping):
        """Copy the ``data-scrapy-annotate`` attribute found on any element
        of each aligned group onto every element in that group."""
        for elem, targ_elements in mapping.iteritems():
            elements = [elem] + targ_elements
            annotation = None
            for _elem in elements:
                annotation = _elem.attrib.get('data-scrapy-annotate')
                if annotation:
                    break
            if annotation:
                for _elem in elements:
                    _elem.attrib['data-scrapy-annotate'] = annotation

    def __repr__(self):
        return "MDR(%r)" % self.extractors

    def __str__(self):
        return "MDR(%s)" % self.extractors
613+
469614
class TraceExtractor(object):
470615
"""Extractor that wraps other extractors and prints an execution
471616
trace of the extraction process to aid debugging

tests/__init__.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import sys
21
import json
32
from os import path
43
from itertools import count
@@ -23,3 +22,12 @@ def iter_samples(prefix, html_encoding='utf-8', **json_kwargs):
2322
html_str = open(html_page, 'rb').read()
2423
sample_data = json.load(open(fname + '.json'), **json_load_kwargs)
2524
yield html_str.decode(html_encoding), sample_data
25+
26+
def get_page(prefix, html_encoding='utf-8'):
    """Return the decoded sample html page for ``prefix``.

    Looks for ``samples/samples_<prefix>.html`` under the test directory;
    returns None when the file does not exist.
    """
    html_page = path.join(_PATH, "samples/samples_" + prefix) + ".html"
    if not path.exists(html_page):
        return None
    # use a context manager so the file handle is not leaked
    with open(html_page, 'rb') as f:
        return f.read().decode(html_encoding)

0 commit comments

Comments
 (0)