Skip to content

Commit 7fd2454

Browse files
author
olveyra
committed
Avoid a similarity mismatch when extracting the last variant, in cases where the suffix of the first annotation differs from the suffix of the last variant (the repeated extractor uses the first annotation).
1 parent 2dc6daf commit 7fd2454

File tree

3 files changed

+147
-22
lines changed

3 files changed

+147
-22
lines changed

scrapely/extraction/regionextract.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,11 @@ def __init__(self, annotation, attribute_descriptors=None):
102102
self.extract = self._extract_both if \
103103
annotation.surrounds_attribute else self._extract_attribute
104104

105-
def _extract_both(self, page, start_index, end_index, ignored_regions=None):
105+
def _extract_both(self, page, start_index, end_index, ignored_regions=None, **kwargs):
106106
return self._extract_content(page, start_index, end_index, ignored_regions) + \
107107
self._extract_attribute(page, start_index, end_index, ignored_regions)
108108

109-
def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None):
109+
def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
110110
# extract content between annotation indexes
111111
if not ignored_regions:
112112
region = extraction_page.htmlpage_region_inside(start_index, end_index)
@@ -126,7 +126,7 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
126126
validated = self.content_validate(region)
127127
return [(self.annotation.surrounds_attribute, validated)] if validated else []
128128

129-
def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None):
129+
def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
130130
data = []
131131
for (f, ta, ea) in self.tag_data:
132132
tag_value = extraction_page.htmlpage_tag(start_index).attributes.get(ta)
@@ -218,7 +218,7 @@ def extract(self, page, start_index, end_index, ignored_regions):
218218
if (page.page_tokens[peek:peek + suffixlen] \
219219
== self.suffix).all():
220220
extracted += self.extractor.extract(page,
221-
prefix_end - 1, peek, ignored_regions)
221+
prefix_end - 1, peek, ignored_regions, suffix_max_length=suffixlen)
222222
index = max(peek, index + 1)
223223
break
224224
else:
@@ -329,7 +329,7 @@ def __init__(self, extractors, template_tokens):
329329
end_index = max(e.annotation.end_index for e in extractors)
330330
self.annotation = AnnotationTag(start_index, end_index)
331331

332-
def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
332+
def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
333333
"""extract data from an extraction page
334334
335335
The region in the page to be extracted from may be specified using
@@ -339,7 +339,7 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
339339
ignored_regions = []
340340
region_elements = sorted(self.extractors + ignored_regions, key=lambda x: _labelled(x).start_index)
341341
_, _, attributes = self._doextract(page, region_elements, start_index,
342-
end_index)
342+
end_index, **kwargs)
343343
# collect variant data, maintaining the order of variants
344344
variant_ids = []; variants = {}; items = []
345345
for k, v in attributes:
@@ -357,7 +357,7 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
357357
items += variant_records
358358
return [_attrs2dict(items)]
359359

360-
def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None):
360+
def _doextract(self, page, region_elements, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs):
361361
"""Carry out extraction of records using the given annotations
362362
in the page tokens bounded by start_index and end_index
363363
"""
@@ -382,36 +382,36 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regio
382382
labelled = _labelled(first_region)
383383
score, pindex, sindex = \
384384
similar_region(page.page_tokens, self.template_tokens,
385-
labelled, start_index, end_region)
385+
labelled, start_index, end_region, **kwargs)
386386
if score > 0:
387387
if isinstance(labelled, AnnotationTag):
388388
similar_ignored_regions = []
389389
start = pindex
390390
for i in ignored_regions:
391391
s, p, e = similar_region(page.page_tokens, self.template_tokens, \
392-
i, start, sindex)
392+
i, start, sindex, **kwargs)
393393
if s > 0:
394394
similar_ignored_regions.append(PageRegion(p, e))
395395
start = e or start
396-
extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions)
396+
extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions, **kwargs)
397397
if extracted_data:
398398
if first_region.annotation.variant_id:
399399
extracted_data = [(first_region.annotation.variant_id, extracted_data)]
400400

401401
if nested_regions:
402-
_, _, nested_data = self._doextract(page, nested_regions, pindex, sindex)
402+
_, _, nested_data = self._doextract(page, nested_regions, pindex, sindex, **kwargs)
403403
extracted_data += nested_data
404404
if following_regions:
405-
_, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_index)
405+
_, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_index, **kwargs)
406406
extracted_data += following_data
407407

408408
elif following_regions:
409-
end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index)
409+
end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index, **kwargs)
410410
if end_index is not None:
411-
pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions)
411+
pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs)
412412
extracted_data += following_data
413413
elif nested_regions:
414-
_, _, nested_data = self._doextract(page, nested_regions, start_index, end_index)
414+
_, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs)
415415
extracted_data += nested_data
416416
return pindex, sindex, extracted_data
417417

@@ -445,8 +445,8 @@ class AdjacentVariantExtractor(RecordExtractor):
445445
it will appear as one record so that it can be handled by the RepeatedDataExtractor.
446446
"""
447447

448-
def extract(self, page, start_index=0, end_index=None, ignored_regions=None):
449-
records = RecordExtractor.extract(self, page, start_index, end_index, ignored_regions)
448+
def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs):
449+
records = RecordExtractor.extract(self, page, start_index, end_index, ignored_regions, **kwargs)
450450
return [('variants', r['variants'][0]) for r in records if r]
451451

452452
@classmethod
@@ -513,8 +513,8 @@ def summarize_trace(self, page, start, end, ret):
513513
self.tprefix, self.annotation, self.tsuffix, [r for r in ret if 'trace' not in r])
514514
return pre_summary, post_summary
515515

516-
def extract(self, page, start, end, ignored_regions):
517-
ret = self.traced.extract(page, start, end, ignored_regions)
516+
def extract(self, page, start, end, ignored_regions, **kwargs):
517+
ret = self.traced.extract(page, start, end, ignored_regions, **kwargs)
518518
if not ret:
519519
return []
520520

scrapely/extraction/similarity.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def longest_unique_subsequence(to_search, subsequence, range_start=0,
7979
return None, None
8080

8181
def similar_region(extracted_tokens, template_tokens, labelled_region,
82-
range_start=0, range_end=None):
82+
range_start=0, range_end=None, **kwargs):
8383
"""Given a labelled section in a template, identify a similar region
8484
in the extracted tokens.
8585
@@ -103,15 +103,17 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
103103
(rpi, pscore) = longest_unique_subsequence(reverse_tokens, reverse_prefix,
104104
data_length - range_end, data_length - range_start)
105105

106-
# None means nothing exracted. Index 0 means there cannot be a suffix.
106+
# None means nothing extracted. Index 0 means there cannot be a suffix.
107107
if not rpi:
108108
return 0, None, None
109109

110110
# convert to an index from the start instead of in reverse
111111
prefix_index = len(extracted_tokens) - rpi - 1
112-
112+
113113
if labelled_region.end_index is None:
114114
return pscore, prefix_index, None
115+
elif kwargs.get("suffix_max_length", None) == 0:
116+
return pscore, prefix_index, range_start + 1
115117

116118
suffix = template_tokens[labelled_region.end_index:]
117119

@@ -134,3 +136,4 @@ def similar_region(extracted_tokens, template_tokens, labelled_region,
134136
if match_index is None:
135137
return 0, None, None
136138
return (pscore + sscore, prefix_index, match_index)
139+

scrapely/tests/test_extraction.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,104 @@
794794
</span>
795795
"""
796796

797+
ANNOTATED_PAGE26 = u"""
798+
<span>
799+
<br>
800+
<input type="radio" name="size" checked value='44'>
801+
<ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
802+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"Large"</ins>
803+
<br>
804+
<input type="radio" name="size" checked value='45'>
805+
"X Large"
806+
<br>
807+
<input type="radio" name="size" checked value='46'>
808+
<ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
809+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"XX Large"</ins>
810+
</span>
811+
"""
812+
813+
EXTRACT_PAGE26 = u"""
814+
<span>
815+
<br>
816+
<input type="radio" name="size" checked value='44'>
817+
"Large"
818+
<br>
819+
<input type="radio" name="size" checked value='45'>
820+
"X Large"
821+
<br>
822+
<input type="radio" name="size" checked value='46'>
823+
"XX Large"
824+
</span>
825+
"""
826+
827+
ANNOTATED_PAGE27 = u"""
828+
<span>
829+
<br>
830+
<input type="radio" name="size" checked value='44' data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: false,
831+
&quot;annotations&quot;: {&quot;value&quot;: &quot;site_id&quot;}}">
832+
<ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
833+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"Large"</ins>
834+
<br>
835+
<input type="radio" name="size" checked value='45'>
836+
"X Large"
837+
<br>
838+
<input type="radio" name="size" checked value='46' data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: false,
839+
&quot;annotations&quot;: {&quot;value&quot;: &quot;site_id&quot;}}">
840+
<ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
841+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"XX Large"</ins>
842+
</span>
843+
"""
844+
845+
EXTRACT_PAGE27 = u"""
846+
<span>
847+
<br>
848+
<input type="radio" name="size" checked value='44'>
849+
"Large"
850+
<br>
851+
<input type="radio" name="size" checked value='45'>
852+
"X Large"
853+
<br>
854+
<input type="radio" name="size" checked value='46'>
855+
"XX Large"
856+
</span>
857+
"""
858+
859+
ANNOTATED_PAGE28 = u"""
860+
<span>
861+
<br>
862+
<input type="radio" name="size" checked value='44' data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: false,
863+
&quot;annotations&quot;: {&quot;value&quot;: &quot;site_id&quot;}}">
864+
<ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
865+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"Large"</ins>
866+
<br>
867+
<input type="radio" name="size" checked value='45'>
868+
"X Large"
869+
<br>
870+
<input type="radio" name="size" checked value='46' data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: false,
871+
&quot;annotations&quot;: {&quot;value&quot;: &quot;site_id&quot;}}">
872+
<ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
873+
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">"XX Large"</ins>
874+
</span>
875+
<div data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false,
876+
&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}"">Price: 45</div>
877+
"""
878+
879+
EXTRACT_PAGE28 = u"""
880+
<span>
881+
<br>
882+
<input type="radio" name="size" checked value='44'>
883+
"Large"
884+
<br>
885+
<input type="radio" name="size" checked value='45'>
886+
"X Large"
887+
<br>
888+
<input type="radio" name="size" checked value='46'>
889+
"XX Large"
890+
</span>
891+
<div>Price: 45</div>
892+
"""
893+
894+
797895
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
798896
'item test, removes tags from description attribute',
799897
[A('description', 'description field without tags', notags)])
@@ -1025,6 +1123,30 @@
10251123
('repeated partial annotation within same tag', [ANNOTATED_PAGE25], EXTRACT_PAGE25, DEFAULT_DESCRIPTOR,
10261124
{"name": ['"Large"', '"X Large"', '"XX Large"']}
10271125
),
1126+
('repeated partial annotation within same tag, variants version', [ANNOTATED_PAGE26], EXTRACT_PAGE26, DEFAULT_DESCRIPTOR,
1127+
{"variants": [
1128+
{"name": ['"Large"']},
1129+
{"name": ['"X Large"']},
1130+
{"name": ['"XX Large"']}
1131+
]}
1132+
),
1133+
('repeated partial annotation within same tag, variants version with more than one attribute',
1134+
[ANNOTATED_PAGE27], EXTRACT_PAGE27, DEFAULT_DESCRIPTOR,
1135+
{"variants": [
1136+
{"name": ['"Large"'], "site_id": ["44"]},
1137+
{"name": ['"X Large"'], "site_id": ["45"]},
1138+
{"name": ['"XX Large"'], "site_id": ["46"]}
1139+
]}
1140+
),
1141+
('repeated partial annotation within same tag, variants version with more than one attribute, more annotations around',
1142+
[ANNOTATED_PAGE28], EXTRACT_PAGE28, DEFAULT_DESCRIPTOR, {
1143+
"price": ["Price: 45"],
1144+
"variants": [
1145+
{"name": ['"Large"'], "site_id": ["44"]},
1146+
{"name": ['"X Large"'], "site_id": ["45"]},
1147+
{"name": ['"XX Large"'], "site_id": ["46"]}]
1148+
}
1149+
),
10281150
]
10291151

10301152
class TestIbl(TestCase):

0 commit comments

Comments
 (0)