|
936 | 936 | <div><span><script>var myvar= 10;</script></span></div> |
937 | 937 | """ |
938 | 938 |
|
# Annotated template with five annotated regions: name, price, description,
# features and image_urls. The description and features spans have no
# counterpart in EXTRACT_PAGE31.
# The JSON inside each data-scrapy-annotate attribute is written with &quot;
# entities: raw double quotes would terminate the double-quoted HTML attribute
# right after "{" and the annotation could not be recovered.
ANNOTATED_PAGE31 = u"""
<html><body>
<div>
<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Product name</span>
<div><p data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">60.00</p>
<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}">description</span>
<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">features</span>
<img data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;src&quot;: &quot;image_urls&quot;}}" src="image.jpg" />
<table></table>
</div></div>
</body></html>
"""
| 951 | + |
| 952 | +EXTRACT_PAGE31 = u""" |
| 953 | +<html><body> |
| 954 | +<div> |
| 955 | +<span>Product name</span> |
| 956 | +<div><p>60.00</p> |
| 957 | +<img src="http://example.com/image.jpg" /> |
| 958 | +<table></table> |
| 959 | +</div></div> |
| 960 | +</body></html> |
| 961 | +""" |
939 | 962 |
|
940 | 963 | DEFAULT_DESCRIPTOR = ItemDescriptor('test', |
941 | 964 | 'item test, removes tags from description attribute', |
|
950 | 973 | ] |
951 | 974 | ) |
952 | 975 |
|
# Descriptor named 'test' ("product test") declaring four attributes:
# name, price, image_urls and description. The third argument passed to A
# for price, image_urls and description is presumably the extractor applied
# to that field -- confirm against the A / ItemDescriptor definitions.
SAMPLE_DESCRIPTOR1a = ItemDescriptor(
    'test',
    'product test',
    [
        A('name', "Product name"),
        A(
            'price',
            "Product price, including any discounts and tax or vat",
            contains_any_numbers,
        ),
        A('image_urls', "URLs for one or more images", image_url),
        A('description', "The full description of the product", html),
    ],
)
| 984 | + |
953 | 985 | SAMPLE_DESCRIPTOR2 = ItemDescriptor('test', 'item test', [ |
954 | 986 | A('description', 'description field without tags', notags), |
955 | 987 | A('price', "Product price, including any discounts and tax or vat", |
|
1227 | 1259 | ('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3, |
1228 | 1260 | None |
1229 | 1261 | ), |
| 1262 | + ('correctly extract regions that follows more than one consecutive misses', [ANNOTATED_PAGE31], EXTRACT_PAGE31, SAMPLE_DESCRIPTOR1a, |
| 1263 | + { |
| 1264 | + u'price': [u'60.00'], |
| 1265 | + u'name': [u'Product name'], |
| 1266 | + u'image_urls': [['http://example.com/image.jpg']] |
| 1267 | + } |
| 1268 | + ) |
1230 | 1269 | ] |
1231 | 1270 |
|
1232 | 1271 | class TestIbl(TestCase): |
|
0 commit comments