diff --git a/scrapely/tests/test_template.py b/scrapely/tests/test_template.py index bce771d..2937562 100644 --- a/scrapely/tests/test_template.py +++ b/scrapely/tests/test_template.py @@ -1,3 +1,5 @@ +# encoding: utf8 + from unittest import TestCase from scrapely.htmlpage import HtmlPage @@ -5,7 +7,19 @@ FragmentAlreadyAnnotated, best_match from scrapely.extraction import InstanceBasedLearningExtractor -class TemplateMakerTest(TestCase): + +class BaseTestCase(TestCase): + PAGE = HtmlPage("http://www.example.com", body=u'') + + def _matches(self, text): + bm = best_match(text) + matches = [(bm(f, self.PAGE), f) for f in self.PAGE.parsed_body] + matches = [x for x in matches if x[0]] + matches.sort(reverse=True) + return [self.PAGE.fragment_data(x[1]) for x in matches] + + +class TemplateMakerTest(BaseTestCase): PAGE = HtmlPage("http://www.example.com", body=u""" @@ -72,9 +86,19 @@ def test_best_match(self): self.assertEquals(self._matches('text to annotate'), ['Some text to annotate here', 'Another text to annotate there']) - def _matches(self, text): - bm = best_match(text) - matches = [(bm(f, self.PAGE), f) for f in self.PAGE.parsed_body] - matches = [x for x in matches if x[0]] - matches.sort(reverse=True) - return [self.PAGE.fragment_data(x[1]) for x in matches] + +class TemplateMakerCJKTest(BaseTestCase): + + PAGE = HtmlPage("http://www.example.com", body=u""" + + +

标题

+

段落

+

另一个标题

+

另一个段落

+ + + """) + + def test_best_match(self): + self.assertEquals(self._matches(u'标题'), [u'标题', u'另一个标题']) diff --git a/scrapely/tests/test_tool.py b/scrapely/tests/test_tool.py new file mode 100644 index 0000000..0e3f305 --- /dev/null +++ b/scrapely/tests/test_tool.py @@ -0,0 +1,40 @@ +# encoding: utf8 + +from unittest import TestCase + +from scrapely.htmlpage import HtmlPage +from scrapely.template import TemplateMaker +from scrapely.tool import parse_criteria, apply_criteria, readable_repr + + +class ToolCJKTestCase(TestCase): + + PAGE = HtmlPage("http://www.example.com", body=u""" + + +

标题

+

段落

+

另一个标题

+

另一个段落

+ + + """) + + def test_apply_criteria_should_support_cjk_chars(self): + criteria = parse_criteria('标题') + tm = TemplateMaker(self.PAGE) + + selection = apply_criteria(criteria, tm) + + self.assertEqual(selection, [6, 14]) + self.assertEqual(tm.selected_data(6), u'

标题

') + self.assertEqual(tm.selected_data(14), u'

另一个标题

') + + +class ReadableReprTextCase(TestCase): + + def test_readable_repr(self): + cjk = u'cjk\t中日韩\n\\u535a' + readable = u"u'cjk\\t中日韩\\n\\\\u535a'" + + self.assertEqual(readable_repr(cjk), readable) diff --git a/scrapely/tool.py b/scrapely/tool.py index 172c695..c84c7df 100644 --- a/scrapely/tool.py +++ b/scrapely/tool.py @@ -6,6 +6,21 @@ from scrapely.template import TemplateMaker, best_match from scrapely.extraction import InstanceBasedLearningExtractor +REPR_UNICODE_CHAR = re.compile(r'(? ' @@ -17,6 +32,8 @@ def __init__(self, filename, **kw): def do_ta(self, line): """ta [--encoding ENCODING] - add template""" opts, (url,) = parse_at(line) + if assert_or_print(url, "missing url"): + return t = url_to_page(url, opts.encoding) templates = self._load_templates() templates.append(t) @@ -31,7 +48,11 @@ def do_tl(self, line): def do_td(self, template_id): """dt