From dc17593ef6c3eebd9287c72b0b4edc9f052dd5d1 Mon Sep 17 00:00:00 2001 From: Mohamed Nabil Hafez Date: Tue, 15 Dec 2015 17:53:03 +0200 Subject: [PATCH 1/4] Issue #1 Make sure that the correctly_extracted list does not contain empty dicts --- scrapely/extraction/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scrapely/extraction/__init__.py b/scrapely/extraction/__init__.py index a4c10c4..b39ec9b 100644 --- a/scrapely/extraction/__init__.py +++ b/scrapely/extraction/__init__.py @@ -118,7 +118,9 @@ def extract(self, html, pref_template_id=None): for extraction_tree in extraction_trees: extracted = extraction_tree.extract(extraction_page) correctly_extracted = self.validated[extraction_tree.template.id](extracted) - if len(correctly_extracted) > 0: + # Make sure that correctly_extracted list does not contain empty dicts + correctly_extracted = [extracted_data for extracted_data in correctly_extracted if extracted_data] + if correctly_extracted: return correctly_extracted, extraction_tree.template return None, None From c6fa64c74473e71ba050e00f5f871b6b3b1f0cd6 Mon Sep 17 00:00:00 2001 From: Mohamed Nabil Hafez Date: Wed, 16 Dec 2015 19:26:19 +0200 Subject: [PATCH 2/4] Issue #2 Make sure that we match the template which extracts more data --- scrapely/extraction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapely/extraction/__init__.py b/scrapely/extraction/__init__.py index b39ec9b..697e275 100644 --- a/scrapely/extraction/__init__.py +++ b/scrapely/extraction/__init__.py @@ -120,7 +120,7 @@ def extract(self, html, pref_template_id=None): correctly_extracted = self.validated[extraction_tree.template.id](extracted) # Make sure that correctly_extracted list does not contain empty dicts correctly_extracted = [extracted_data for extracted_data in correctly_extracted if extracted_data] - if correctly_extracted: + if correctly_extracted and len(correctly_extracted[0].keys()) >= 2: return correctly_extracted, 
extraction_tree.template return None, None From 889c721124cae0b3bd94d6346e19fd4fdf8b45bd Mon Sep 17 00:00:00 2001 From: Mohamed Nabil Hafez Date: Wed, 16 Dec 2015 19:58:29 +0200 Subject: [PATCH 3/4] Issue #2 Refine the fix a little bit --- scrapely/extraction/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapely/extraction/__init__.py b/scrapely/extraction/__init__.py index 697e275..0f3339d 100644 --- a/scrapely/extraction/__init__.py +++ b/scrapely/extraction/__init__.py @@ -35,7 +35,7 @@ class InstanceBasedLearningExtractor(object): RepeatedDataExtractor, RecordExtractor, ] - + _ext_items_max_number = 2 def __init__(self, td_pairs, trace=False, apply_extrarequired=True): """Initialise this extractor @@ -120,7 +120,7 @@ def extract(self, html, pref_template_id=None): correctly_extracted = self.validated[extraction_tree.template.id](extracted) # Make sure that correctly_extracted list does not contain empty dicts correctly_extracted = [extracted_data for extracted_data in correctly_extracted if extracted_data] - if correctly_extracted and len(correctly_extracted[0].keys()) >= 2: + if correctly_extracted and len(correctly_extracted[0].keys()) >= self._ext_items_max_number: return correctly_extracted, extraction_tree.template return None, None From 59298a91829da3b239fe18865de5c92dd01425d2 Mon Sep 17 00:00:00 2001 From: Mohamed Nabil Hafez Date: Sun, 20 Dec 2015 18:48:56 +0200 Subject: [PATCH 4/4] Issue #2 Return the dict with the maximum number of extracted values --- scrapely/extraction/__init__.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/scrapely/extraction/__init__.py b/scrapely/extraction/__init__.py index 0f3339d..99c6d67 100644 --- a/scrapely/extraction/__init__.py +++ b/scrapely/extraction/__init__.py @@ -36,6 +36,7 @@ class InstanceBasedLearningExtractor(object): RecordExtractor, ] _ext_items_max_number = 2 + def __init__(self, td_pairs, trace=False, 
apply_extrarequired=True): """Initialise this extractor @@ -77,11 +78,11 @@ def __init__(self, td_pairs, trace=False, apply_extrarequired=True): modified_parsed_tdpairs.append((parsed, (t, descriptor))) # templates with more attributes are considered first sorted_tdpairs = sorted(modified_parsed_tdpairs, - key=lambda x: _annotation_count(x[0]), reverse=True) + key=lambda x: _annotation_count(x[0]), reverse=True) self.extraction_trees = [ self.build_extraction_tree(p, td[1], trace) for p, td in sorted_tdpairs - ] + ] self.validated = dict( (td[0].page_id, td[1].validated if td[1] else self._filter_not_none) for _, td in sorted_tdpairs @@ -108,25 +109,27 @@ def extract(self, html, pref_template_id=None): If pref_template_url is specified, the template with that url will be used first. """ + max_extracted_value = {} + correctly_extracted_template = '' extraction_page = parse_extraction_page(self.token_dict, html) if pref_template_id is not None: extraction_trees = sorted(self.extraction_trees, - key=lambda x: x.template.id != pref_template_id) + key=lambda x: x.template.id != pref_template_id) else: extraction_trees = self.extraction_trees for extraction_tree in extraction_trees: extracted = extraction_tree.extract(extraction_page) correctly_extracted = self.validated[extraction_tree.template.id](extracted) - # Make sure that correctly_extracted list does not contain empty dicts - correctly_extracted = [extracted_data for extracted_data in correctly_extracted if extracted_data] - if correctly_extracted and len(correctly_extracted[0].keys()) >= self._ext_items_max_number: - return correctly_extracted, extraction_tree.template - return None, None + if len(correctly_extracted[0]) > len(max_extracted_value): + max_extracted_value = correctly_extracted[0] + correctly_extracted_template = extraction_tree.template + + return [max_extracted_value], correctly_extracted_template def __str__(self): return "InstanceBasedLearningExtractor[\n%s\n]" % \ - (',\n'.join(map(str, 
self.extraction_trees))) + (',\n'.join(map(str, self.extraction_trees))) @staticmethod def _filter_not_none(items):