Commit 6d6d5ce

Merge pull request #107 from AmbientLighter/feature/fix-typo
Fix typo in docstrings, remove unused imports, fix PEP indents. Update Travis configuration.
2 parents ff8af77 + 93e3520 commit 6d6d5ce

11 files changed, +28 −24 lines changed

.travis.yml

Lines changed: 8 additions & 5 deletions
@@ -1,10 +1,13 @@
 language: python
-python: 2.7
 
-env:
-  - TOXENV=py27
-  - TOXENV=py34
-  - TOXENV=pypy
+matrix:
+  include:
+    - python: 2.7
+      env: TOXENV=py27
+    - python: 2.7
+      env: TOXENV=py34
+    - python: pypy
+      env: TOXENV=pypy
 
 install:
   - pip install cython

scrapely/extraction/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ def __init__(self, td_pairs, trace=False, apply_extrarequired=True):
 descriptor._required_attributes.append(attr)
 # not always is present a descriptor for a given attribute
 if attr in descriptor.attribute_map:
-    # not strictly necesary, but avoid possible inconsistences for user
+    # not strictly necessary, but avoid possible inconsistencies for user
     descriptor.attribute_map[attr].required = True
 modified_parsed_tdpairs.append((parsed, (t, descriptor)))
 # templates with more attributes are considered first

scrapely/extraction/pageobjects.py

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ class Page(object):
     def __init__(self, htmlpage, token_dict, page_tokens):
         self.htmlpage = htmlpage
         self.token_dict = token_dict
-        # use a numpy array becuase we can index/slice easily and efficiently
+        # use a numpy array because we can index/slice easily and efficiently
         if not isinstance(page_tokens, ndarray):
            page_tokens = array(page_tokens)
        self.page_tokens = page_tokens
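
The comment corrected above describes why Page stores its tokens in a numpy array. A minimal sketch of that index/slice idea, using made-up integer token ids rather than scrapely's real token encoding:

from numpy import array, ndarray

# hypothetical token sequence; real pages yield integer token ids
page_tokens = [3, 7, 7, 1, 9, 2]

# mirror the constructor: coerce to ndarray once, then slice cheaply
if not isinstance(page_tokens, ndarray):
    page_tokens = array(page_tokens)

region = page_tokens[1:4]     # contiguous slice of the page
matches = (page_tokens == 7)  # vectorised comparison over all tokens
print(region, matches.nonzero()[0])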

scrapely/extraction/pageparsing.py

Lines changed: 3 additions & 1 deletion
@@ -9,7 +9,7 @@
 
 from scrapely.htmlpage import HtmlTagType, HtmlTag, HtmlPage
 from scrapely.extraction.pageobjects import (AnnotationTag,
-    TemplatePage, ExtractionPage, AnnotationText, TokenDict, FragmentedHtmlPageRegion)
+    TemplatePage, ExtractionPage, AnnotationText, TokenDict)
 
 
 def parse_strings(template_html, extraction_html):

@@ -79,6 +79,8 @@ def handle_tag(self, html_tag, index):
 _AUTO_CLOSE_TAGS_ON_CLOSE = {
     "select": ["option"],
 }
+
+
 class TemplatePageParser(InstanceLearningParser):
     """Template parsing for instance based learning algorithm"""

scrapely/extractors.py

Lines changed: 1 addition & 1 deletion
@@ -401,7 +401,7 @@ def image_url(txt):
 >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
 ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
 >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
-['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
+['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg']
 >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
 ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
 >>> image_url('http://www.site.com/image.php')
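
The only change here is the expected doctest output: the square brackets in the relative URL are no longer percent-encoded. As an aside (this is generic standard-library quoting, not scrapely's actual code path), the difference between the two expected values is exactly the escaping of [ and ]:

from urllib.parse import quote  # Python 3 stdlib, shown only for illustration

name = "Unknoportrait[1].jpg"
print(quote(name))             # Unknoportrait%5B1%5D.jpg  -- the old expected output
print(quote(name, safe="[]"))  # Unknoportrait[1].jpg      -- the new expected output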

scrapely/htmlpage.py

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ def __repr__(self):
 _COMMENT_REGEXP = re.compile(_COMMENT, re.DOTALL)
 
 def parse_html(text):
-    """Higher level html parser. Calls lower level parsers and joins sucesive
+    """Higher level html parser. Calls lower level parsers and joins successive
     HtmlDataFragment elements in a single one.
     """
     # If have doctype remove it.
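
For orientation, parse_html takes raw html text and produces a stream of parsed fragments, merging successive data fragments into one as the corrected docstring says. A rough usage sketch (only iteration over the result is assumed; the printed type names are whatever this revision actually yields):

from scrapely.htmlpage import parse_html

# iterate over the fragments produced for a small snippet
for fragment in parse_html(u"<p>some <b>bold</b> text</p>"):
    print(type(fragment).__name__)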

tests/test_extraction.py

Lines changed: 3 additions & 6 deletions
@@ -5,13 +5,11 @@
 tests should focus on specific bits of functionality work correctly.
 """
 from unittest import TestCase
-from nose_parameterized import parameterized
+from parameterized import parameterized
 
 from scrapely.htmlpage import HtmlPage
-from scrapely.descriptor import (FieldDescriptor as A,
-    ItemDescriptor)
-from scrapely.extractors import (contains_any_numbers,
-    image_url, html, notags)
+from scrapely.descriptor import FieldDescriptor as A, ItemDescriptor
+from scrapely.extractors import contains_any_numbers, image_url, html, notags
 from scrapely.extraction import InstanceBasedLearningExtractor
 
 # simple page with all features

@@ -1290,7 +1288,6 @@
 ]
 
 
-
 class TestExtraction(TestCase):
     @parameterized.expand(TEST_DATA)
     def test_extraction(self, name, templates, page, descriptor, expected_output):
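
The first hunk above switches the import from the old nose_parameterized distribution name to its renamed successor, parameterized, which exposes the same expand decorator. A small self-contained sketch of the pattern (hypothetical test data, not scrapely's TEST_DATA):

from unittest import TestCase
from parameterized import parameterized

CASES = [
    ("lowercase", "abc", "ABC"),
    ("mixed", "aBc", "ABC"),
]

class UpperTest(TestCase):
    # each tuple becomes its own test case, labelled by its first element
    @parameterized.expand(CASES)
    def test_upper(self, name, text, expected):
        self.assertEqual(text.upper(), expected)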

tests/test_pageparsing.py

Lines changed: 7 additions & 7 deletions
@@ -1,10 +1,7 @@
 """
 Unit tests for pageparsing
 """
-import os
-from six import StringIO
 from unittest import TestCase
-import numpy
 
 from scrapely.htmlpage import HtmlPage
 from scrapely.extraction.pageparsing import (

@@ -204,16 +201,19 @@
 </head>
 """
 
+
 def _parse_page(parser_class, pagetext):
     htmlpage = HtmlPage(None, {}, pagetext)
     parser = parser_class(TokenDict())
     parser.feed(htmlpage)
     return parser
 
+
 def _tags(pp, predicate):
     return [pp.token_dict.token_string(s) for s in pp.token_list \
             if predicate(s)]
 
+
 class TestPageParsing(TestCase):
 
     def test_instance_parsing(self):

@@ -227,11 +227,11 @@ def test_instance_parsing(self):
         closep = lambda x: pp.token_dict.token_type(x) == TokenType.CLOSE_TAG
         self.assertEqual(_tags(pp, closep), ['</p>', '</html>'])
 
-    def _validate_annotation(self, parser, lable_region, name, start_tag, end_tag):
-        self.assertEqual(lable_region.surrounds_attribute, name)
-        start_token = parser.token_list[lable_region.start_index]
+    def _validate_annotation(self, parser, label_region, name, start_tag, end_tag):
+        self.assertEqual(label_region.surrounds_attribute, name)
+        start_token = parser.token_list[label_region.start_index]
         self.assertEqual(parser.token_dict.token_string(start_token), start_tag)
-        end_token = parser.token_list[lable_region.end_index]
+        end_token = parser.token_list[label_region.end_index]
         self.assertEqual(parser.token_dict.token_string(end_token), end_tag)
 
     def test_template_parsing(self):

tests/test_scraper.py

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 from scrapely.htmlpage import HtmlPage
 from . import iter_samples
 
+
 class ScraperTest(TestCase):
 
     def _assert_extracted(self, extracted, expected):

tests/test_template.py

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
     FragmentAlreadyAnnotated, best_match
 from scrapely.extraction import InstanceBasedLearningExtractor
 
+
 class TemplateMakerTest(TestCase):
 
     PAGE = HtmlPage("http://www.example.com", body=u"""
