|
8 | 8 | import copy |
9 | 9 | import pprint |
10 | 10 | import cStringIO |
| 11 | +import json |
11 | 12 | from itertools import groupby, izip, starmap |
12 | 13 |
|
13 | 14 | from numpy import array |
14 | 15 |
|
15 | 16 | from scrapely.descriptor import FieldDescriptor |
16 | | -from scrapely.htmlpage import HtmlPageRegion |
| 17 | +from scrapely.htmlpage import HtmlPage, HtmlPageRegion |
17 | 18 | from scrapely.extraction.similarity import (similar_region, |
18 | | - longest_unique_subsequence, common_prefix) |
| 19 | + longest_unique_subsequence, common_prefix, common_prefix_length) |
19 | 20 | from scrapely.extraction.pageobjects import (AnnotationTag, |
20 | 21 | PageRegion, FragmentedHtmlPageRegion) |
| 22 | +from scrapely.extraction.pageparsing import parse_extraction_page |
21 | 23 |
|
22 | 24 | _EXTRACT_HTML = lambda x: x |
23 | 25 | _DEFAULT_DESCRIPTOR = FieldDescriptor('none', None) |
@@ -466,6 +468,149 @@ def apply(cls, template, extractors): |
    def __repr__(self):
        # repr mirrors __str__ so debug output matches the printable form.
        return str(self)
468 | 470 |
|
class MdrExtractor(object):
    """Extractor that uses MDR_ to detect and extract listing (repeated) data.

    .. _MDR: https://pypi.python.org/pypi/mdr/

    """
    def __init__(self, token_dict, xpath, record, extractors):
        # token_dict: template token dictionary used to re-tokenize each record.
        # xpath: locates the listing container element on target pages.
        # record: seed record produced by MDR from the template page.
        # extractors: basic extractors for the annotated listing fields,
        #   indexed here by the field name they extract.
        self.token_dict = token_dict
        self.xpath = xpath
        self.record = record
        self.extractors = dict((extractor.annotation.surrounds_attribute, extractor)
                               for extractor in extractors)

    def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
        """Extract listing data from ``page``.

        Returns a one-element list containing a dict mapping each annotated
        field name to the values extracted from every repeated record found
        under ``self.xpath``.
        """
        from mdr import MDR
        from lxml.html import document_fromstring, tostring

        mdr = MDR()

        doc = document_fromstring(page.htmlpage.body)
        element = doc.xpath(self.xpath)
        items = {}

        if element:
            _, mapping = mdr.extract(element[0], record=self.record)
            for seed_elem, elements in mapping.items():
                annotation_elem = [elem for elem in ([seed_elem] + elements)
                                   if elem.attrib.get('data-scrapy-annotate')]
                if annotation_elem:
                    annotation = self._read_template_annotation(annotation_elem[0])
                    name = annotation.get('annotations', {}).get('content')
                    # field may be annotated without a matching basic extractor
                    ex = self.extractors.get(name)
                    if ex is None:
                        continue
                    for elem in elements:
                        # re-parse each record element as a standalone page so
                        # the basic extractor can run on its token stream
                        elem_page = HtmlPage(None, {}, tostring(elem, encoding='unicode'))
                        parsed_elem_page = parse_extraction_page(self.token_dict, elem_page)
                        items.setdefault(name, []).extend(
                            [v for _, v in ex.extract(parsed_elem_page, 0,
                                len(parsed_elem_page.page_tokens) - 1)])
        return [items]

    @classmethod
    def apply(cls, template, extractors):
        """Detect listing data on ``template``.

        Returns a ``(mdr_extractor, remaining_extractors)`` pair; the first
        item is ``None`` when MDR is unavailable or no listing data is found.
        """
        try:
            from mdr import MDR
        except ImportError:
            import warnings
            warnings.warn("MDR is not available")
            return None, extractors

        mdr = MDR()
        htmlpage = template.htmlpage.body

        candidates, doc = mdr.list_candidates(htmlpage.encode('utf8'))

        # no repeated data detected
        if not candidates:
            return None, extractors

        candidate_xpaths = [doc.getpath(candidate) for candidate in candidates]

        listing_data_annotations = [a for a in template.annotations
                                    if a.metadata.get('listingData')]
        # no annotation has the listingData property
        if not listing_data_annotations:
            return None, extractors

        # pick the MDR candidate whose xpath shares the longest prefix with
        # the common ancestor of all listingData-annotated elements
        ancestor_xpath = cls._get_common_ancestor_xpath(
            doc, cls._get_listingdata_elements(doc))
        candidate_xpath = max(candidate_xpaths,
            key=lambda x: common_prefix_length(x.split('/'), ancestor_xpath.split('/')))

        candidate = doc.xpath(candidate_xpath)[0]

        # XXX: use xpath to find the element on target page, using
        # ``similar_region`` might be better
        if candidate.xpath('.//*[@data-scrapy-annotate]'):
            # remove the listing annotations from the template and the basic
            # extractors, since MdrExtractor is going to extract them
            listing_data_extractors = []
            for annotation in listing_data_annotations:
                template.annotations.remove(annotation)
                name = annotation.surrounds_attribute
                for extractor in list(extractors):
                    if name == extractor.annotation.surrounds_attribute:
                        listing_data_extractors.append(extractor)
                        extractors.remove(extractor)
            record, mapping = mdr.extract(candidate)
            cls._propagate_annotations(mapping)
            return cls(template.token_dict, cls._get_candidate_xpath(doc, candidate),
                       record, listing_data_extractors), extractors

        # BUGFIX: was ``return extractors`` — every caller unpacks a 2-tuple,
        # matching the other return paths above.
        return None, extractors

    @staticmethod
    def _get_candidate_xpath(doc, element):
        """Return a robust xpath for ``element``: prefer a unique @id, then a
        unique @class, falling back to the absolute tree path."""
        _id = element.attrib.get('id')
        _class = element.attrib.get('class')

        if _id:
            xpath = '//%s[@id="%s"]' % (element.tag, _id)
            if len(doc.xpath(xpath)) == 1:
                return xpath

        if _class:
            xpath = '//%s[@class="%s"]' % (element.tag, _class)
            if len(doc.xpath(xpath)) == 1:
                return xpath

        return doc.getpath(element)

    @staticmethod
    def _get_listingdata_elements(doc):
        """Get the elements whose data-scrapy-annotate has listingData set."""
        elements = []
        for element in doc.xpath('//*[@data-scrapy-annotate]'):
            annotation = MdrExtractor._read_template_annotation(element)
            if annotation.get('listingData'):
                elements.append(element)
        return elements

    @staticmethod
    def _read_template_annotation(element):
        """Decode the JSON annotation stored in data-scrapy-annotate, or
        return ``None`` when the attribute is absent."""
        template_attr = element.attrib.get('data-scrapy-annotate')
        if template_attr is None:
            return None
        # BUGFIX: the attribute value stores quotes HTML-escaped; the
        # original ``replace('"', '"')`` was a no-op.
        unescaped = template_attr.replace('&quot;', '"')
        return json.loads(unescaped)

    @staticmethod
    def _get_common_ancestor_xpath(doc, elements):
        """Gets the xpath of the common ancestor of the given elements."""
        return "/".join(common_prefix(*[doc.getpath(elem).split('/') for elem in elements]))

    @staticmethod
    def _propagate_annotations(mapping):
        """Copy the first annotation found in each record group onto every
        element of that group, so all records carry the field markers."""
        for elem, targ_elements in mapping.items():
            elements = [elem] + targ_elements
            for _elem in elements:
                annotation = _elem.attrib.get('data-scrapy-annotate')
                if annotation:
                    break
            if annotation:
                for _elem in elements:
                    _elem.attrib['data-scrapy-annotate'] = annotation

    def __repr__(self):
        return "MDR(%r)" % self.extractors

    def __str__(self):
        return "MDR(%s)" % self.extractors
| 613 | + |
469 | 614 | class TraceExtractor(object): |
470 | 615 | """Extractor that wraps other extractors and prints an execution |
471 | 616 | trace of the extraction process to aid debugging |
|
0 commit comments