Skip to content

Commit 9644da3

Browse files
committed
Merge pull request #64 from tpeng/allow-multi-top-extractors
Allow using multiple extractors in TemplatePageExtractor
2 parents 9c656a6 + c951456 commit 9644da3

File tree

1 file changed

+14
-6
lines changed

1 file changed

+14
-6
lines changed

scrapely/extraction/regionextract.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -544,18 +544,26 @@ class TemplatePageExtractor(object):
544544
"""Top level extractor for a template page"""
545545

546546
def __init__(self, template, extractors):
547-
# fixme: handle multiple items per page
548-
self.extractor = extractors[0]
547+
self.extractors = extractors
549548
self.template = template
550549

551550
def extract(self, page, start_index=0, end_index=None):
552-
return self.extractor.extract(page, start_index, end_index, self.template.ignored_regions)
553-
551+
items = []
552+
for extractor in self.extractors:
553+
items.extend(extractor.extract(page, start_index, end_index, self.template.ignored_regions))
554+
return [self._merge_list_dicts(items)]
555+
556+
def _merge_list_dicts(self, dicts):
557+
res = {}
558+
for d in dicts:
559+
res.update(d)
560+
return res
561+
554562
def __repr__(self):
555-
return repr(self.extractor)
563+
return repr(self.extractors)
556564

557565
def __str__(self):
558-
return str(self.extractor)
566+
return str(self.extractors)
559567

560568
# Based on nltk's WordPunctTokenizer
561569
_tokenize = re.compile(r'\w+|[^\w\s]+', re.UNICODE | re.MULTILINE | re.DOTALL).findall

0 commit comments

Comments
 (0)