Skip to content

Commit fd47935

Browse files
committed
allow no_content_validate for MdrExtractor
Sometimes the extracted data is empty, which would make `validated` false — but we still want to add the extracted listing data, to indicate that some data is missing on the page. Also fix a problem where the annotation was added to records other than the seed record; fix it by propagating the annotations to aligned elements.
1 parent d0dcba3 commit fd47935

File tree

1 file changed

+33
-22
lines changed

1 file changed

+33
-22
lines changed

scrapely/extraction/regionextract.py

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import pprint
1010
import cStringIO
1111
import json
12+
import warnings
1213
from itertools import groupby, izip, starmap
1314

1415
from numpy import array
@@ -117,8 +118,11 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
117118
region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions))
118119
else:
119120
region = extraction_page.htmlpage_region_inside(start_index, end_index)
120-
validated = self.content_validate(region)
121-
return [(self.annotation.surrounds_attribute, validated)] if validated else []
121+
if kwargs.get('no_content_validate'):
122+
validated = True
123+
else:
124+
validated = self.content_validate(region)
125+
return [(self.annotation.surrounds_attribute, self.content_validate(region))] if validated else []
122126

123127
def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
124128
data = []
@@ -488,29 +492,36 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **k
488492

489493
doc = document_fromstring(page.htmlpage.body)
490494
element = doc.xpath(self.xpath)
495+
496+
if not element:
497+
warnings.warn("MDRExtractor can't find element with xpath: %s" % self.xpath)
498+
return [{}]
499+
491500
items = {}
492501

493-
if element:
494-
_, mapping = mdr.extract(element[0], record=self.record)
495-
for seed_elem, elements in mapping.iteritems():
496-
annotation_elem = [elem for elem in ([seed_elem] + elements) if elem.attrib.get('data-scrapy-annotate')]
497-
if annotation_elem:
498-
annotation = self._read_template_annotation(annotation_elem[0])
499-
name = annotation.get('annotations', {}).get('content')
500-
ex = self.extractors[name]
501-
for elem in elements:
502-
elem_page = HtmlPage(None, {}, tostring(elem, encoding='unicode'))
503-
parsed_elem_page = parse_extraction_page(self.token_dict, elem_page)
504-
items.setdefault(name, []).extend([v for _, v in ex.extract(parsed_elem_page, 0,
505-
len(parsed_elem_page.page_tokens) - 1)])
502+
_, mapping = mdr.extract(element[0], record=self.record)
503+
for seed_elem, elements in mapping.iteritems():
504+
annotation_elem = [elem for elem in ([seed_elem] + elements) if elem.attrib.get('data-scrapy-annotate')]
505+
if annotation_elem:
506+
annotation = self._read_template_annotation(annotation_elem[0])
507+
name = annotation.get('annotations', {}).get('content')
508+
ex = self.extractors[name]
509+
for elem in elements:
510+
elem_page = HtmlPage(None, {}, tostring(elem, encoding='unicode'))
511+
parsed_elem_page = parse_extraction_page(self.token_dict, elem_page)
512+
items.setdefault(name, []).extend([v for _, v in ex.extract(parsed_elem_page, 0,
513+
len(parsed_elem_page.page_tokens) - 1, no_content_validate=True)])
514+
515+
if items:
516+
lengths = [len(values) for values in items.values()]
517+
assert len(set(lengths)) == 1, 'extract items %r should be have same count' % items
506518
return [items]
507519

508520
@classmethod
509521
def apply(cls, template, extractors):
510522
try:
511523
from mdr import MDR
512524
except ImportError:
513-
import warnings
514525
warnings.warn("MDR is not available")
515526
return None, extractors
516527

@@ -519,14 +530,14 @@ def apply(cls, template, extractors):
519530

520531
candidates, doc = mdr.list_candidates(htmlpage.encode('utf8'))
521532

522-
# no repated data detected
533+
# early return if no repated data detected
523534
if not candidates:
524535
return None, extractors
525536

526537
candidate_xpaths = [doc.getpath(candidate) for candidate in candidates]
527538

528539
listing_data_annotations = [a for a in template.annotations if a.metadata.get('listingData')]
529-
# no annotation has listingData property
540+
# early return if no annotations has listingData property set
530541
if not listing_data_annotations:
531542
return None, extractors
532543

@@ -538,7 +549,7 @@ def apply(cls, template, extractors):
538549
# XXX: use xpath to find the element on target page, using ``similar_region`` might be better
539550
if candidate.xpath('.//*[@data-scrapy-annotate]'):
540551
# remove the listing annotation from the template and basic extractor,
541-
# since they're going to extract them with MdrExtractor
552+
# since they're going to extract by MdrExtractor
542553
listing_data_extractors = []
543554
for annotation in listing_data_annotations:
544555
template.annotations.remove(annotation)
@@ -551,7 +562,7 @@ def apply(cls, template, extractors):
551562
cls._propagate_annotations(mapping)
552563
return cls(template.token_dict, cls._get_candidate_xpath(doc, candidate), record, listing_data_extractors), extractors
553564

554-
return extractors
565+
return None, extractors
555566

556567
@staticmethod
557568
def _get_candidate_xpath(doc, element):
@@ -606,10 +617,10 @@ def _propagate_annotations(mapping):
606617
_elem.attrib['data-scrapy-annotate'] = annotation
607618

608619
def __repr__(self):
609-
return "MDR(%r)" % self.extractors
620+
return "MdrExtractor(%s %r)" % (self.xpath, self.extractors)
610621

611622
def __str__(self):
612-
return "MDR(%s)" % self.extractors
623+
return "MdrExtractor(%s %s)" % (self.xpath, self.extractors)
613624

614625
class TraceExtractor(object):
615626
"""Extractor that wraps other extractors and prints an execution

0 commit comments

Comments
 (0)