99import pprint
1010import cStringIO
1111import json
12+ import warnings
1213from itertools import groupby , izip , starmap
1314
1415from numpy import array
@@ -117,8 +118,11 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
117118 region = FragmentedHtmlPageRegion (extraction_page .htmlpage , list (regions ))
118119 else :
119120 region = extraction_page .htmlpage_region_inside (start_index , end_index )
120- validated = self .content_validate (region )
121- return [(self .annotation .surrounds_attribute , validated )] if validated else []
121+ if kwargs .get ('no_content_validate' ):
122+ validated = True
123+ else :
124+ validated = self .content_validate (region )
125+ return [(self .annotation .surrounds_attribute , self .content_validate (region ))] if validated else []
122126
123127 def _extract_attribute (self , extraction_page , start_index , end_index , ignored_regions = None , ** kwargs ):
124128 data = []
@@ -488,29 +492,36 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **k
488492
489493 doc = document_fromstring (page .htmlpage .body )
490494 element = doc .xpath (self .xpath )
495+
496+ if not element :
497+ warnings .warn ("MDRExtractor can't find element with xpath: %s" % self .xpath )
498+ return [{}]
499+
491500 items = {}
492501
493- if element :
494- _ , mapping = mdr .extract (element [0 ], record = self .record )
495- for seed_elem , elements in mapping .iteritems ():
496- annotation_elem = [elem for elem in ([seed_elem ] + elements ) if elem .attrib .get ('data-scrapy-annotate' )]
497- if annotation_elem :
498- annotation = self ._read_template_annotation (annotation_elem [0 ])
499- name = annotation .get ('annotations' , {}).get ('content' )
500- ex = self .extractors [name ]
501- for elem in elements :
502- elem_page = HtmlPage (None , {}, tostring (elem , encoding = 'unicode' ))
503- parsed_elem_page = parse_extraction_page (self .token_dict , elem_page )
504- items .setdefault (name , []).extend ([v for _ , v in ex .extract (parsed_elem_page , 0 ,
505- len (parsed_elem_page .page_tokens ) - 1 )])
502+ _ , mapping = mdr .extract (element [0 ], record = self .record )
503+ for seed_elem , elements in mapping .iteritems ():
504+ annotation_elem = [elem for elem in ([seed_elem ] + elements ) if elem .attrib .get ('data-scrapy-annotate' )]
505+ if annotation_elem :
506+ annotation = self ._read_template_annotation (annotation_elem [0 ])
507+ name = annotation .get ('annotations' , {}).get ('content' )
508+ ex = self .extractors [name ]
509+ for elem in elements :
510+ elem_page = HtmlPage (None , {}, tostring (elem , encoding = 'unicode' ))
511+ parsed_elem_page = parse_extraction_page (self .token_dict , elem_page )
512+ items .setdefault (name , []).extend ([v for _ , v in ex .extract (parsed_elem_page , 0 ,
513+ len (parsed_elem_page .page_tokens ) - 1 , no_content_validate = True )])
514+
515+ if items :
516+ lengths = [len (values ) for values in items .values ()]
517+ assert len (set (lengths )) == 1 , 'extract items %r should be have same count' % items
506518 return [items ]
507519
508520 @classmethod
509521 def apply (cls , template , extractors ):
510522 try :
511523 from mdr import MDR
512524 except ImportError :
513- import warnings
514525 warnings .warn ("MDR is not available" )
515526 return None , extractors
516527
@@ -519,14 +530,14 @@ def apply(cls, template, extractors):
519530
520531 candidates , doc = mdr .list_candidates (htmlpage .encode ('utf8' ))
521532
522- # no repeated data detected
533+ # early return if no repeated data detected
523534 if not candidates :
524535 return None , extractors
525536
526537 candidate_xpaths = [doc .getpath (candidate ) for candidate in candidates ]
527538
528539 listing_data_annotations = [a for a in template .annotations if a .metadata .get ('listingData' )]
529- # no annotation has listingData property
540+ # early return if no annotations have listingData property set
530541 if not listing_data_annotations :
531542 return None , extractors
532543
@@ -538,7 +549,7 @@ def apply(cls, template, extractors):
538549 # XXX: use xpath to find the element on target page, using ``similar_region`` might be better
539550 if candidate .xpath ('.//*[@data-scrapy-annotate]' ):
540551 # remove the listing annotation from the template and basic extractor,
541- # since they're going to extract them with MdrExtractor
552+ # since they're going to be extracted by MdrExtractor
542553 listing_data_extractors = []
543554 for annotation in listing_data_annotations :
544555 template .annotations .remove (annotation )
@@ -551,7 +562,7 @@ def apply(cls, template, extractors):
551562 cls ._propagate_annotations (mapping )
552563 return cls (template .token_dict , cls ._get_candidate_xpath (doc , candidate ), record , listing_data_extractors ), extractors
553564
554- return extractors
565+ return None , extractors
555566
556567 @staticmethod
557568 def _get_candidate_xpath (doc , element ):
@@ -606,10 +617,10 @@ def _propagate_annotations(mapping):
606617 _elem .attrib ['data-scrapy-annotate' ] = annotation
607618
608619 def __repr__ (self ):
609- return "MDR(% r)" % self .extractors
620+ return "MdrExtractor(%s % r)" % ( self .xpath , self . extractors )
610621
611622 def __str__ (self ):
612- return "MDR (%s)" % self .extractors
623+ return "MdrExtractor (%s %s )" % ( self .xpath , self . extractors )
613624
614625class TraceExtractor (object ):
615626 """Extractor that wraps other extractors and prints an execution
0 commit comments