Skip to content

Commit 7fd2454

Browse files
author
olveyra
committed
Avoid a similarity mismatch when extracting the last variant, in cases where the suffix of the first annotation differs from the suffix of the last variant (the repeated extractor uses the first annotation).
1 parent 2dc6daf commit 7fd2454

File tree

3 files changed

+147
-22
lines changed

3 files changed

+147
-22
lines changed

scrapely/extraction/regionextract.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,11 @@ def __init__(self, annotation, attribute_descriptors=None):
102102
self.extract = self._extract_both if \
103103
annotation.surrounds_attribute else self._extract_attribute
104104

105-
def _extract_both(self, page, start_index, end_index, ignored_regions=None):
105+
def _extract_both(self, page, start_index, end_index, ignored_regions=None, **kwargs):
106106
return self._extract_content(page, start_index, end_index, ignored_regions) + \
107107
self._extract_attribute(page, start_index, end_index, ignored_regions)
108108

109-
def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None):
109+
def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
110110
# extract content between annotation indexes
111111
if not ignored_regions:
112112
region = extraction_page.htmlpage_region_inside(start_index, end_index)
@@ -126,7 +126,7 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
126126
validated = self.content_validate(region)
127127
return [(self.annotation.surrounds_attribute, validated)] if validated else []
128128

129-
def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None):
129+
def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
130130
data = []
131131
for (f, ta, ea) in self.tag_data:
132132
tag_value = extraction_page.htmlpage_tag(start_index).attributes.get(ta)
@@ -218,7 +218,7 @@ def extract(self, page, start_index, end_index, ignored_regions):
218218
if (page.page_tokens[peek:peek + suffixlen] \
219219
== self.suffix).all():
220220
extracted += self.extractor.extract(page,
221-
prefix_end - 1, peek, ignored_regions)
221+
prefix_end - 1, peek, ignored_regions, suffix_max_length=suffixlen)
222222
index = max(peek, index + 1)
223223
break
224224
else:
@@ -329,7 +329,7 @@ def __init__(self, extractors, template_tokens):
329329
end_index = max(e.annotation.end_index for e in extractors)
330330
self.annotation = AnnotationTag(start_index, end_index)
331331

332-
def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
332+
def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
333333
"""extract data from an extraction page
334334
335335
The region in the page to be extracted from may be specified using
@@ -339,7 +339,7 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
339339
ignored_regions = []
340340
region_elements = sorted(self.extractors + ignored_regions, key=lambda x: _labelled(x).start_index)
341341
_, _, attributes = self._doextract(page, region_elements, start_index,
342-
end_index)
342+
end_index, **kwargs)
343343
# collect variant data, maintaining the order of variants
344344
variant_ids = []; variants = {}; items = []
345345
for k, v in attributes:
@@ -357,7 +357,7 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
357357
items += variant_records
358358
return [_attrs2dict(items)]
359359

360-
def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None):
360+
def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs):
361361
"""Carry out extraction of records using the given annotations
362362
in the page tokens bounded by start_index and end_index
363363
"""
@@ -382,36 +382,36 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regio
382382
labelled = _labelled(first_region)
383383
score, pindex, sindex = \
384384
similar_region(page.page_tokens, self.template_tokens,
385-
labelled, start_index, end_region)
385+
labelled, start_index, end_region, **kwargs)
386386
if score > 0:
387387
if isinstance(labelled, AnnotationTag):
388388
similar_ignored_regions = []
389389
start = pindex
390390
for i in ignored_regions:
391391
s, p, e = similar_region(page.page_tokens, self.template_tokens, \
392-
i, start, sindex)
392+
i, start, sindex, **kwargs)
393393
if s > 0:
394394
similar_ignored_regions.append(PageRegion(p, e))
395395
start = e or start
396-
extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions)
396+
extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions, **kwargs)
397397
if extracted_data:
398398
if first_region.annotation.variant_id:
399399
extracted_data = [(first_region.annotation.variant_id, extracted_data)]
400400

401401
if nested_regions:
402-
_, _, nested_data = self._doextract(page, nested_regions, pindex, sindex)
402+
_, _, nested_data = self._doextract(page, nested_regions, pindex, sindex, **kwargs)
403403
extracted_data += nested_data
404404
if following_regions:
405-
_, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_index)
405+
_, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_index, **kwargs)
406406
extracted_data += following_data
407407

408408
elif following_regions:
409-
end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index)
409+
end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index, **kwargs)
410410
if end_index is not None:
411-
pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions)
411+
pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs)
412412
extracted_data += following_data
413413
elif nested_regions:
414-
_, _, nested_data = self._doextract(page, nested_regions, start_index, end_index)
414+
_, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs)
415415
extracted_data += nested_data
416416
return pindex, sindex, extracted_data
417417

@@ -445,8 +445,8 @@ class AdjacentVariantExtractor(RecordExtractor):
445445
it will appear as one record so that it can be handled by the RepeatedDataExtractor.
446446
"""
447447

448-
def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
449-
records = RecordExtractor.extract(self, page, start_index, end_index, ignored_regions)
448+
def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
449+
records = RecordExtractor.extract(self, page, start_index, end_index, ignored_regions, **kwargs)
450450
return [('variants', r['variants'][0]) for r in records if r]
451451

452452
@classmethod
@@ -513,8 +513,8 @@ def summarize_trace(self, page, start, end, ret):
513513
self.tprefix, self.annotation, self.tsuffix, [r for r in ret if 'trace' not in r])
514514
return pre_summary, post_summary
515515

516-
def extract(self, page, start, end, ignored_regions):
517-
ret = self.traced.extract(page, start, end, ignored_regions)
516+
def extract(self, page, start, end, ignored_regions, **kwargs):
517+
ret = self.traced.extract(page, start, end, ignored_regions, **kwargs)
518518
if not ret:
519519
return []
520520

scrapely/extraction/similarity.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def longest_unique_subsequence(to_search, subsequence, range_start=0,
7979
return None, None
8080

8181
def similar_region(extracted_tokens, template_tokens, labelled_region,
82-
range_start=0, range_end=None):
82+
range_start=0, range_end=None, **kwargs):
8383
"""Given a labelled section in a template, identify a similar region
8484
in the extracted tokens.
8585
@@ -103,15 +103,17 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
103103
(rpi, pscore) = longest_unique_subsequence(reverse_tokens, reverse_prefix,
104104
data_length - range_end, data_length - range_start)
105105

106-
# None means nothing exracted. Index 0 means there cannot be a suffix.
106+
# None means nothing extracted. Index 0 means there cannot be a suffix.
107107
if not rpi:
108108
return 0, None, None
109109

110110
# convert to an index from the start instead of in reverse
111111
prefix_index = len(extracted_tokens) - rpi - 1
112-
112+
113113
if labelled_region.end_index is None:
114114
return pscore, prefix_index, None
115+
elif kwargs.get("suffix_max_length", None) == 0:
116+
return pscore, prefix_index, range_start + 1
115117

116118
suffix = template_tokens[labelled_region.end_index:]
117119

@@ -134,3 +136,4 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
134136
if match_index is None:
135137
return 0, None, None
136138
return (pscore + sscore, prefix_index, match_index)
139+

scrapely/tests/test_extraction.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,104 @@
794794
</span>
795795
"""
796796

797+
ANNOTATED_PAGE26 = u"""
798+
<span>
799+
<br>
800+
<input type="radio" name="size" checked value='44'>
801+
<ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
802+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"Large"</ins>
803+
<br>
804+
<input type="radio" name="size" checked value='45'>
805+
"X Large"
806+
<br>
807+
<input type="radio" name="size" checked value='46'>
808+
<ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
809+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"XX Large"</ins>
810+
</span>
811+
"""
812+
813+
EXTRACT_PAGE26 = u"""
814+
<span>
815+
<br>
816+
<input type="radio" name="size" checked value='44'>
817+
"Large"
818+
<br>
819+
<input type="radio" name="size" checked value='45'>
820+
"X Large"
821+
<br>
822+
<input type="radio" name="size" checked value='46'>
823+
"XX Large"
824+
</span>
825+
"""
826+
827+
ANNOTATED_PAGE27 = u"""
828+
<span>
829+
<br>
830+
<input type="radio" name="size" checked value='44' data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: false,
831+
&quot;annotations&quot;: {&quot;value&quot;: &quot;site_id&quot;}}">
832+
<ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
833+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"Large"</ins>
834+
<br>
835+
<input type="radio" name="size" checked value='45'>
836+
"X Large"
837+
<br>
838+
<input type="radio" name="size" checked value='46' data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: false,
839+
&quot;annotations&quot;: {&quot;value&quot;: &quot;site_id&quot;}}">
840+
<ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
841+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"XX Large"</ins>
842+
</span>
843+
"""
844+
845+
EXTRACT_PAGE27 = u"""
846+
<span>
847+
<br>
848+
<input type="radio" name="size" checked value='44'>
849+
"Large"
850+
<br>
851+
<input type="radio" name="size" checked value='45'>
852+
"X Large"
853+
<br>
854+
<input type="radio" name="size" checked value='46'>
855+
"XX Large"
856+
</span>
857+
"""
858+
859+
ANNOTATED_PAGE28 = u"""
860+
<span>
861+
<br>
862+
<input type="radio" name="size" checked value='44' data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: false,
863+
&quot;annotations&quot;: {&quot;value&quot;: &quot;site_id&quot;}}">
864+
<ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
865+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"Large"</ins>
866+
<br>
867+
<input type="radio" name="size" checked value='45'>
868+
"X Large"
869+
<br>
870+
<input type="radio" name="size" checked value='46' data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: false,
871+
&quot;annotations&quot;: {&quot;value&quot;: &quot;site_id&quot;}}">
872+
<ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
873+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"XX Large"</ins>
874+
</span>
875+
<div data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false,
876+
&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}"">Price: 45</div>
877+
"""
878+
879+
EXTRACT_PAGE28 = u"""
880+
<span>
881+
<br>
882+
<input type="radio" name="size" checked value='44'>
883+
"Large"
884+
<br>
885+
<input type="radio" name="size" checked value='45'>
886+
"X Large"
887+
<br>
888+
<input type="radio" name="size" checked value='46'>
889+
"XX Large"
890+
</span>
891+
<div>Price: 45</div>
892+
"""
893+
894+
797895
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
798896
'item test, removes tags from description attribute',
799897
[A('description', 'description field without tags', notags)])
@@ -1025,6 +1123,30 @@
10251123
('repeated partial annotation within same tag', [ANNOTATED_PAGE25], EXTRACT_PAGE25, DEFAULT_DESCRIPTOR,
10261124
{"name": ['"Large"', '"X Large"', '"XX Large"']}
10271125
),
1126+
('repeated partial annotation within same tag, variants version', [ANNOTATED_PAGE26], EXTRACT_PAGE26, DEFAULT_DESCRIPTOR,
1127+
{"variants": [
1128+
{"name": ['"Large"']},
1129+
{"name": ['"X Large"']},
1130+
{"name": ['"XX Large"']}
1131+
]}
1132+
),
1133+
('repeated partial annotation within same tag, variants version with more than one attribute',
1134+
[ANNOTATED_PAGE27], EXTRACT_PAGE27, DEFAULT_DESCRIPTOR,
1135+
{"variants": [
1136+
{"name": ['"Large"'], "site_id": ["44"]},
1137+
{"name": ['"X Large"'], "site_id": ["45"]},
1138+
{"name": ['"XX Large"'], "site_id": ["46"]}
1139+
]}
1140+
),
1141+
('repeated partial annotation within same tag, variants version with more than one attribute, more annotations around',
1142+
[ANNOTATED_PAGE28], EXTRACT_PAGE28, DEFAULT_DESCRIPTOR, {
1143+
"price": ["Price: 45"],
1144+
"variants": [
1145+
{"name": ['"Large"'], "site_id": ["44"]},
1146+
{"name": ['"X Large"'], "site_id": ["45"]},
1147+
{"name": ['"XX Large"'], "site_id": ["46"]}]
1148+
}
1149+
),
10281150
]
10291151

10301152
class TestIbl(TestCase):

0 commit comments

Comments
 (0)