Skip to content

Commit 87c1677

Browse files
committed
fix MdrExtractor extraction when the annotated element has listing data
1 parent fd47935 commit 87c1677

File tree

4 files changed

+24164
-8
lines changed

4 files changed

+24164
-8
lines changed

scrapely/extraction/regionextract.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -534,7 +534,7 @@ def apply(cls, template, extractors):
534534
if not candidates:
535535
return None, extractors
536536

537-
candidate_xpaths = [doc.getpath(candidate) for candidate in candidates]
537+
candidate_xpaths = [doc.getpath(candidate) for candidate in candidates if not candidate.get('data-scrapy-annotate')]
538538

539539
listing_data_annotations = [a for a in template.annotations if a.metadata.get('listingData')]
540540
# early return if no annotations has listingData property set
@@ -547,7 +547,7 @@ def apply(cls, template, extractors):
547547
candidate = doc.xpath(candidate_xpath)[0]
548548

549549
# XXX: use xpath to find the element on target page, using ``similar_region`` might be better
550-
if candidate.xpath('.//*[@data-scrapy-annotate]'):
550+
if candidate.xpath('descendant-or-self::*[@data-scrapy-annotate]'):
551551
# remove the listing annotation from the template and basic extractor,
552552
# since they're going to extract by MdrExtractor
553553
listing_data_extractors = []

0 commit comments

Comments
 (0)