Skip to content

Commit d0dcba3

Browse files
committed
add MDR extractor
MDR extractor is based on https://pypi.python.org/pypi/mdr/ which can detect the listing data automatically and extract listing data with scrapely annotation supervision.
1 parent 9644da3 commit d0dcba3

File tree

5 files changed

+20043
-3
lines changed

5 files changed

+20043
-3
lines changed

scrapely/extraction/regionextract.py

Lines changed: 147 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,18 @@
88
import copy
99
import pprint
1010
import cStringIO
11+
import json
1112
from itertools import groupby, izip, starmap
1213

1314
from numpy import array
1415

1516
from scrapely.descriptor import FieldDescriptor
16-
from scrapely.htmlpage import HtmlPageRegion
17+
from scrapely.htmlpage import HtmlPage, HtmlPageRegion
1718
from scrapely.extraction.similarity import (similar_region,
18-
longest_unique_subsequence, common_prefix)
19+
longest_unique_subsequence, common_prefix, common_prefix_length)
1920
from scrapely.extraction.pageobjects import (AnnotationTag,
2021
PageRegion, FragmentedHtmlPageRegion)
22+
from scrapely.extraction.pageparsing import parse_extraction_page
2123

2224
_EXTRACT_HTML = lambda x: x
2325
_DEFAULT_DESCRIPTOR = FieldDescriptor('none', None)
@@ -466,6 +468,149 @@ def apply(cls, template, extractors):
466468
def __repr__(self):
467469
return str(self)
468470

471+
class MdrExtractor(object):
    """Extractor that uses MDR_ to detect repeated listing data on a page
    and extract every repetition using the template's annotated extractors.

    .. _MDR: https://pypi.python.org/pypi/mdr/
    """
    def __init__(self, token_dict, xpath, record, extractors):
        # token_dict: used to re-tokenize each repeated element for extraction.
        # xpath: locates the listing container element on target pages.
        # record: seed record produced by MDR.extract() on the template.
        # extractors: basic extractors, keyed by the attribute name they fill.
        self.token_dict = token_dict
        self.xpath = xpath
        self.record = record
        self.extractors = dict(
            (extractor.annotation.surrounds_attribute, extractor)
            for extractor in extractors)

    def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
        """Extract listing data from ``page``.

        Returns a one-element list containing a dict mapping each annotated
        field name to the list of values found across the repeated records.
        """
        # imported lazily: mdr and lxml are optional dependencies
        from mdr import MDR
        from lxml.html import document_fromstring, tostring

        mdr = MDR()

        doc = document_fromstring(page.htmlpage.body)
        element = doc.xpath(self.xpath)
        items = {}

        if element:
            _, mapping = mdr.extract(element[0], record=self.record)
            for seed_elem, elements in mapping.iteritems():
                # the annotation may sit on the seed or on any aligned repetition
                annotation_elem = [elem for elem in ([seed_elem] + elements)
                                   if elem.attrib.get('data-scrapy-annotate')]
                if annotation_elem:
                    annotation = self._read_template_annotation(annotation_elem[0])
                    name = annotation.get('annotations', {}).get('content')
                    ex = self.extractors[name]
                    for elem in elements:
                        # re-parse each repeated element as a standalone page so
                        # the basic extractor can run over its token stream
                        elem_page = HtmlPage(None, {}, tostring(elem, encoding='unicode'))
                        parsed_elem_page = parse_extraction_page(self.token_dict, elem_page)
                        items.setdefault(name, []).extend(
                            [v for _, v in ex.extract(parsed_elem_page, 0,
                                len(parsed_elem_page.page_tokens) - 1)])
        return [items]

    @classmethod
    def apply(cls, template, extractors):
        """Build an ``MdrExtractor`` from ``template`` when it contains
        listing data.

        Returns ``(mdr_extractor_or_None, remaining_extractors)``.  Extractors
        for annotations flagged with ``listingData`` are moved out of
        ``extractors`` and into the returned ``MdrExtractor``.
        """
        try:
            from mdr import MDR
        except ImportError:
            import warnings
            warnings.warn("MDR is not available")
            return None, extractors

        mdr = MDR()
        body = template.htmlpage.body

        candidates, doc = mdr.list_candidates(body.encode('utf8'))

        # no repeated data detected
        if not candidates:
            return None, extractors

        candidate_xpaths = [doc.getpath(candidate) for candidate in candidates]

        listing_data_annotations = [a for a in template.annotations
                                    if a.metadata.get('listingData')]
        # no annotation has listingData property
        if not listing_data_annotations:
            return None, extractors

        # choose the candidate whose path best matches the common ancestor
        # of the listingData-annotated elements
        ancestor_xpath = cls._get_common_ancestor_xpath(
            doc, cls._get_listingdata_elements(doc))
        candidate_xpath = max(candidate_xpaths, key=lambda x: common_prefix_length(
            x.split('/'), ancestor_xpath.split('/')))

        candidate = doc.xpath(candidate_xpath)[0]

        # XXX: use xpath to find the element on target page, using
        # ``similar_region`` might be better
        if candidate.xpath('.//*[@data-scrapy-annotate]'):
            # remove the listing annotations from the template and the basic
            # extractors, since MdrExtractor is going to extract them
            listing_data_extractors = []
            for annotation in listing_data_annotations:
                template.annotations.remove(annotation)
                name = annotation.surrounds_attribute
                for extractor in list(extractors):
                    if name == extractor.annotation.surrounds_attribute:
                        listing_data_extractors.append(extractor)
                        extractors.remove(extractor)
            record, mapping = mdr.extract(candidate)
            cls._propagate_annotations(mapping)
            return (cls(template.token_dict,
                        cls._get_candidate_xpath(doc, candidate),
                        record, listing_data_extractors),
                    extractors)

        # fix: previously returned the bare ``extractors`` list here, which
        # broke callers unpacking the 2-tuple returned by every other branch
        return None, extractors

    @staticmethod
    def _get_candidate_xpath(doc, element):
        """Return a short, unique xpath for ``element``: prefer an id- or
        class-based expression, fall back to the absolute document path."""
        _id = element.attrib.get('id')
        _class = element.attrib.get('class')

        if _id:
            xpath = '//%s[@id="%s"]' % (element.tag, _id)
            if len(doc.xpath(xpath)) == 1:
                return xpath

        if _class:
            xpath = '//%s[@class="%s"]' % (element.tag, _class)
            if len(doc.xpath(xpath)) == 1:
                return xpath

        return doc.getpath(element)

    @staticmethod
    def _get_listingdata_elements(doc):
        """Return the elements whose ``data-scrapy-annotate`` metadata has
        the ``listingData`` property set."""
        elements = []
        for element in doc.xpath('//*[@data-scrapy-annotate]'):
            annotation = MdrExtractor._read_template_annotation(element)
            if annotation.get('listingData'):
                elements.append(element)
        return elements

    @staticmethod
    def _read_template_annotation(element):
        """Decode the JSON annotation stored in the element's
        ``data-scrapy-annotate`` attribute, or None if it has none."""
        template_attr = element.attrib.get('data-scrapy-annotate')
        if template_attr is None:
            return None
        # quotes are stored HTML-entity-escaped in the attribute value
        unescaped = template_attr.replace('&quot;', '"')
        return json.loads(unescaped)

    @staticmethod
    def _get_common_ancestor_xpath(doc, elements):
        """Return the xpath of the deepest common ancestor of ``elements``."""
        return "/".join(common_prefix(*[doc.getpath(elem).split('/')
                                        for elem in elements]))

    @staticmethod
    def _propagate_annotations(mapping):
        """Copy the ``data-scrapy-annotate`` attribute found on any element
        of each aligned group onto every element in that group."""
        for elem, targ_elements in mapping.iteritems():
            elements = [elem] + targ_elements
            annotation = None
            for _elem in elements:
                annotation = _elem.attrib.get('data-scrapy-annotate')
                if annotation:
                    break
            if annotation:
                for _elem in elements:
                    _elem.attrib['data-scrapy-annotate'] = annotation

    def __repr__(self):
        return "MDR(%r)" % self.extractors

    def __str__(self):
        return "MDR(%s)" % self.extractors
613+
469614
class TraceExtractor(object):
470615
"""Extractor that wraps other extractors and prints an execution
471616
trace of the extraction process to aid debugging

tests/__init__.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import sys
21
import json
32
from os import path
43
from itertools import count
@@ -23,3 +22,12 @@ def iter_samples(prefix, html_encoding='utf-8', **json_kwargs):
2322
html_str = open(html_page, 'rb').read()
2423
sample_data = json.load(open(fname + '.json'), **json_load_kwargs)
2524
yield html_str.decode(html_encoding), sample_data
25+
26+
def get_page(prefix, html_encoding='utf-8'):
    """Return the decoded sample html page for ``prefix``.

    Looks for ``samples/samples_<prefix>.html`` under the test directory;
    returns None when the file does not exist.
    """
    html_page = path.join(_PATH, "samples/samples_" + prefix) + ".html"
    if not path.exists(html_page):
        return None
    # use a context manager so the file handle is not leaked
    with open(html_page, 'rb') as f:
        return f.read().decode(html_encoding)

0 commit comments

Comments
 (0)