Skip to content

Commit 3e6244a

Browse files
author
olveyra
committed
Fix the case where a repeated annotation appears inside variants. Add a test.
1 parent 7fd2454 commit 3e6244a

File tree

2 files changed

+35
-4
lines changed

2 files changed

+35
-4
lines changed

scrapely/extraction/regionextract.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def build_extraction_tree(template, type_descriptor, trace=True):
2727
extractors = BasicTypeExtractor.create(template.annotations, attribute_map)
2828
if trace:
2929
extractors = TraceExtractor.apply(template, extractors)
30-
for cls in (AdjacentVariantExtractor, RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor,
30+
for cls in (RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor, AdjacentVariantExtractor, RepeatedDataExtractor,
3131
RecordExtractor):
3232
extractors = cls.apply(template, extractors)
3333
if trace:
@@ -201,7 +201,7 @@ def __init__(self, prefix, suffix, extractors):
201201
self.annotation = copy.copy(self.extractor.annotation)
202202
self.annotation.end_index = extractors[-1].annotation.end_index
203203

204-
def extract(self, page, start_index, end_index, ignored_regions):
204+
def extract(self, page, start_index, end_index, ignored_regions, **kwargs):
205205
"""repeatedly find regions bounded by the repeated
206206
prefix and suffix and extract them
207207
"""
@@ -231,7 +231,7 @@ def extract(self, page, start_index, end_index, ignored_regions):
231231
def apply(template, extractors):
232232
tokens = template.page_tokens
233233
output_extractors = []
234-
group_key = lambda x: x.extracted_item()
234+
group_key = lambda x: (x.extracted_item(), x.annotation.variant_id)
235235
for extr_key, extraction_group in groupby(extractors, group_key):
236236
extraction_group = list(extraction_group)
237237
if extr_key is None or len(extraction_group) == 1:

scrapely/tests/test_extraction.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -873,7 +873,7 @@
873873
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"XX Large"</ins>
874874
</span>
875875
<div data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false,
876-
&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}"">Price: 45</div>
876+
&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">Price: 45</div>
877877
"""
878878

879879
EXTRACT_PAGE28 = u"""
@@ -891,6 +891,29 @@
891891
<div>Price: 45</div>
892892
"""
893893

894+
ANNOTATED_PAGE29 = u"""
895+
<table>
896+
<tr><td data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: false,
897+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Name 1</td><td data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: false,
898+
&quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}">Desc 1</td><td><span data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: false,
899+
&quot;annotations&quot;: {&quot;content&quot;: &quot;tag&quot;}}">Tag 1</span><span>Tag2</span><span data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: false,
900+
&quot;annotations&quot;: {&quot;content&quot;: &quot;tag&quot;}}">Tag 3</span></td></tr>
901+
<tr><td>Name 2</td><td>Desc 2</td><td><span>Tag 7</span><span>Tag 8</span></span>Tag 9</span></td></tr>
902+
<tr><td data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: false,
903+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Name 3</td><td data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: false,
904+
&quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}">Desc 3</td><td><span data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: false,
905+
&quot;annotations&quot;: {&quot;content&quot;: &quot;tag&quot;}}">Tag 4</span><span>Tag5</span><span data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: false,
906+
&quot;annotations&quot;: {&quot;content&quot;: &quot;tag&quot;}}">Tag 6</span></td></tr>
907+
</table>
908+
"""
909+
910+
EXTRACT_PAGE29 = u"""
911+
<table>
912+
<tr><td>Name 1</td><td>Desc 1</td><td><span>Tag 1</span><span>Tag 2</span><span>Tag 3</span></td></tr>
913+
<tr><td>Name 2</td><td>Desc 2</td><td><span>Tag 4</span><span>Tag 5</span><span>Tag 6</span></td></tr>
914+
<tr><td>Name 3</td><td>Desc 3</td><td><span>Tag 7</span><span>Tag 8</span><span>Tag 9</span></td></tr>
915+
</table>
916+
"""
894917

895918
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
896919
'item test, removes tags from description attribute',
@@ -1147,6 +1170,14 @@
11471170
{"name": ['"XX Large"'], "site_id": ["46"]}]
11481171
}
11491172
),
1173+
('repeated annotation inside variants', [ANNOTATED_PAGE29], EXTRACT_PAGE29, DEFAULT_DESCRIPTOR,
1174+
{'variants': [
1175+
{u'tag': [u'Tag 1', u'Tag 2', u'Tag 3'], u'description': [u'Desc 1'], u'name': [u'Name 1']},
1176+
{u'tag': [u'Tag 4', u'Tag 5', u'Tag 6'], u'description': [u'Desc 2'], u'name': [u'Name 2']},
1177+
{u'tag': [u'Tag 7', u'Tag 8', u'Tag 9'], u'description': [u'Desc 3'], u'name': [u'Name 3']}]
1178+
}
1179+
1180+
),
11501181
]
11511182

11521183
class TestIbl(TestCase):

0 commit comments

Comments
 (0)