@@ -102,11 +102,11 @@ def __init__(self, annotation, attribute_descriptors=None):
102102 self .extract = self ._extract_both if \
103103 annotation .surrounds_attribute else self ._extract_attribute
104104
105- def _extract_both (self , page , start_index , end_index , ignored_regions = None ):
105+ def _extract_both (self , page , start_index , end_index , ignored_regions = None , ** kwargs ):
106106 return self ._extract_content (page , start_index , end_index , ignored_regions ) + \
107107 self ._extract_attribute (page , start_index , end_index , ignored_regions )
108108
109- def _extract_content (self , extraction_page , start_index , end_index , ignored_regions = None ):
109+ def _extract_content (self , extraction_page , start_index , end_index , ignored_regions = None , ** kwargs ):
110110 # extract content between annotation indexes
111111 if not ignored_regions :
112112 region = extraction_page .htmlpage_region_inside (start_index , end_index )
@@ -126,7 +126,7 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
126126 validated = self .content_validate (region )
127127 return [(self .annotation .surrounds_attribute , validated )] if validated else []
128128
129- def _extract_attribute (self , extraction_page , start_index , end_index , ignored_regions = None ):
129+ def _extract_attribute (self , extraction_page , start_index , end_index , ignored_regions = None , ** kwargs ):
130130 data = []
131131 for (f , ta , ea ) in self .tag_data :
132132 tag_value = extraction_page .htmlpage_tag (start_index ).attributes .get (ta )
@@ -218,7 +218,7 @@ def extract(self, page, start_index, end_index, ignored_regions):
218218 if (page .page_tokens [peek :peek + suffixlen ] \
219219 == self .suffix ).all ():
220220 extracted += self .extractor .extract (page ,
221- prefix_end - 1 , peek , ignored_regions )
221+ prefix_end - 1 , peek , ignored_regions , suffix_max_length = suffixlen )
222222 index = max (peek , index + 1 )
223223 break
224224 else :
@@ -329,7 +329,7 @@ def __init__(self, extractors, template_tokens):
329329 end_index = max (e .annotation .end_index for e in extractors )
330330 self .annotation = AnnotationTag (start_index , end_index )
331331
332- def extract (self , page , start_index = 0 , end_index = None , ignored_regions = None ):
332+ def extract (self , page , start_index = 0 , end_index = None , ignored_regions = None , ** kwargs ):
333333 """extract data from an extraction page
334334
335335 The region in the page to be extracted from may be specified using
@@ -339,7 +339,7 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
339339 ignored_regions = []
340340 region_elements = sorted (self .extractors + ignored_regions , key = lambda x : _labelled (x ).start_index )
341341 _ , _ , attributes = self ._doextract (page , region_elements , start_index ,
342- end_index )
342+ end_index , ** kwargs )
343343 # collect variant data, maintaining the order of variants
344344 variant_ids = []; variants = {}; items = []
345345 for k , v in attributes :
@@ -357,7 +357,7 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
357357 items += variant_records
358358 return [_attrs2dict (items )]
359359
360- def _doextract (self , page , region_elements , start_index , end_index , nested_regions = None , ignored_regions = None ):
360+ def _doextract (self , page , region_elements , start_index , end_index , nested_regions = None , ignored_regions = None , ** kwargs ):
361361 """Carry out extraction of records using the given annotations
362362 in the page tokens bounded by start_index and end_index
363363 """
@@ -382,36 +382,36 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regio
382382 labelled = _labelled (first_region )
383383 score , pindex , sindex = \
384384 similar_region (page .page_tokens , self .template_tokens ,
385- labelled , start_index , end_region )
385+ labelled , start_index , end_region , ** kwargs )
386386 if score > 0 :
387387 if isinstance (labelled , AnnotationTag ):
388388 similar_ignored_regions = []
389389 start = pindex
390390 for i in ignored_regions :
391391 s , p , e = similar_region (page .page_tokens , self .template_tokens , \
392- i , start , sindex )
392+ i , start , sindex , ** kwargs )
393393 if s > 0 :
394394 similar_ignored_regions .append (PageRegion (p , e ))
395395 start = e or start
396- extracted_data = first_region .extract (page , pindex , sindex , similar_ignored_regions )
396+ extracted_data = first_region .extract (page , pindex , sindex , similar_ignored_regions , ** kwargs )
397397 if extracted_data :
398398 if first_region .annotation .variant_id :
399399 extracted_data = [(first_region .annotation .variant_id , extracted_data )]
400400
401401 if nested_regions :
402- _ , _ , nested_data = self ._doextract (page , nested_regions , pindex , sindex )
402+ _ , _ , nested_data = self ._doextract (page , nested_regions , pindex , sindex , ** kwargs )
403403 extracted_data += nested_data
404404 if following_regions :
405- _ , _ , following_data = self ._doextract (page , following_regions , sindex or start_index , end_index )
405+ _ , _ , following_data = self ._doextract (page , following_regions , sindex or start_index , end_index , ** kwargs )
406406 extracted_data += following_data
407407
408408 elif following_regions :
409- end_index , _ , following_data = self ._doextract (page , following_regions , start_index , end_index )
409+ end_index , _ , following_data = self ._doextract (page , following_regions , start_index , end_index , ** kwargs )
410410 if end_index is not None :
411- pindex , sindex , extracted_data = self ._doextract (page , [first_region ], start_index , end_index - 1 , nested_regions , ignored_regions )
411+ pindex , sindex , extracted_data = self ._doextract (page , [first_region ], start_index , end_index - 1 , nested_regions , ignored_regions , ** kwargs )
412412 extracted_data += following_data
413413 elif nested_regions :
414- _ , _ , nested_data = self ._doextract (page , nested_regions , start_index , end_index )
414+ _ , _ , nested_data = self ._doextract (page , nested_regions , start_index , end_index , ** kwargs )
415415 extracted_data += nested_data
416416 return pindex , sindex , extracted_data
417417
@@ -445,8 +445,8 @@ class AdjacentVariantExtractor(RecordExtractor):
445445 it will appear as one record so that it can be handled by the RepeatedDataExtractor.
446446 """
447447
448- def extract (self , page , start_index = 0 , end_index = None , ignored_regions = None ):
449- records = RecordExtractor .extract (self , page , start_index , end_index , ignored_regions )
448+ def extract (self , page , start_index = 0 , end_index = None , ignored_regions = None , ** kwargs ):
449+ records = RecordExtractor .extract (self , page , start_index , end_index , ignored_regions , ** kwargs )
450450 return [('variants' , r ['variants' ][0 ]) for r in records if r ]
451451
452452 @classmethod
@@ -513,8 +513,8 @@ def summarize_trace(self, page, start, end, ret):
513513 self .tprefix , self .annotation , self .tsuffix , [r for r in ret if 'trace' not in r ])
514514 return pre_summary , post_summary
515515
516- def extract (self , page , start , end , ignored_regions ):
517- ret = self .traced .extract (page , start , end , ignored_regions )
516+ def extract (self , page , start , end , ignored_regions , ** kwargs ):
517+ ret = self .traced .extract (page , start , end , ignored_regions , ** kwargs )
518518 if not ret :
519519 return []
520520
0 commit comments