Skip to content

Commit 1d2e8cf

Browse files
committed
Merge pull request #22 from kalessin/master
Fix ignored-region extraction when the region is inside an annotation that uses text content extraction, and add a test
2 parents 0bf1719 + 0c3eca9 commit 1d2e8cf

File tree

2 files changed

+15
-7
lines changed

2 files changed

+15
-7
lines changed

scrapely/extraction/pageobjects.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from itertools import chain
88
from numpy import array, ndarray
99

10-
from scrapely.htmlpage import HtmlTagType, HtmlPageRegion
10+
from scrapely.htmlpage import HtmlTagType, HtmlPageRegion, HtmlPageParsedRegion
1111

1212
class TokenType(HtmlTagType):
1313
"""constants for token types"""
@@ -82,7 +82,7 @@ def __str__(self):
8282
def __repr__(self):
8383
return str(self)
8484

85-
class FragmentedHtmlPageRegion(HtmlPageRegion):
85+
class FragmentedHtmlPageRegion(HtmlPageParsedRegion, HtmlPageRegion):
8686
"""An HtmlPageRegion consisting of possibly non-contiguous sub-regions"""
8787
def __new__(cls, htmlpage, regions):
8888
text = u''.join(regions)
@@ -95,11 +95,7 @@ def __init__(self, htmlpage, regions):
9595
@property
9696
def parsed_fragments(self):
9797
return chain(*(r.parsed_fragments for r in self.regions))
98-
99-
@property
100-
def text_content(self):
101-
return chain(*(r.text_content for r in self.regions))
102-
98+
10399
class Page(object):
104100
"""Basic representation of a page. This consists of a reference to a
105101
dictionary of tokens and an array of raw token ids

scrapely/tests/test_extraction.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,10 @@
960960
'item test',
961961
[A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))])
962962

963+
SAMPLE_DESCRIPTOR4 = ItemDescriptor('test',
964+
'item test, removes tags from description attribute',
965+
[A('description', 'description field without tags', lambda x: x.text_content)])
966+
963967
# A list of (test name, [templates], page, extractors, expected_result)
964968
TEST_DATA = [
965969
# extract from a similar page
@@ -1052,6 +1056,14 @@
10521056
'price': [u'\n12.00\n(VAT exc.)'],
10531057
}
10541058
),
1059+
# ignored regions and text content extraction
1060+
(
1061+
'ignored_regions', [ANNOTATED_PAGE8], EXTRACT_PAGE8, SAMPLE_DESCRIPTOR4,
1062+
{
1063+
'description': [u'\n A very nice product for all intelligent people \n \n'],
1064+
'price': [u'\n12.00\n(VAT exc.)'],
1065+
}
1066+
),
10551067
# shifted ignored regions (detected by region similarity)
10561068
(
10571069
'shifted_ignored_regions', [ANNOTATED_PAGE9], EXTRACT_PAGE9, DEFAULT_DESCRIPTOR,

0 commit comments

Comments
 (0)