diff --git a/scrapely/tests/test_template.py b/scrapely/tests/test_template.py
index bce771d..2937562 100644
--- a/scrapely/tests/test_template.py
+++ b/scrapely/tests/test_template.py
@@ -1,3 +1,5 @@
+# encoding: utf8
+
from unittest import TestCase
from scrapely.htmlpage import HtmlPage
@@ -5,7 +7,19 @@
FragmentAlreadyAnnotated, best_match
from scrapely.extraction import InstanceBasedLearningExtractor
-class TemplateMakerTest(TestCase):
+
+class BaseTestCase(TestCase):
+ PAGE = HtmlPage("http://www.example.com", body=u'')
+
+ def _matches(self, text):
+ bm = best_match(text)
+ matches = [(bm(f, self.PAGE), f) for f in self.PAGE.parsed_body]
+ matches = [x for x in matches if x[0]]
+ matches.sort(reverse=True)
+ return [self.PAGE.fragment_data(x[1]) for x in matches]
+
+
+class TemplateMakerTest(BaseTestCase):
PAGE = HtmlPage("http://www.example.com", body=u"""
@@ -72,9 +86,19 @@ def test_best_match(self):
self.assertEquals(self._matches('text to annotate'),
['Some text to annotate here', 'Another text to annotate there'])
- def _matches(self, text):
- bm = best_match(text)
- matches = [(bm(f, self.PAGE), f) for f in self.PAGE.parsed_body]
- matches = [x for x in matches if x[0]]
- matches.sort(reverse=True)
- return [self.PAGE.fragment_data(x[1]) for x in matches]
+
+class TemplateMakerCJKTest(BaseTestCase):
+
+ PAGE = HtmlPage("http://www.example.com", body=u"""
+
+
+ 标题
+ 段落
+ 另一个标题
+ 另一个段落
+
+
+ """)
+
+ def test_best_match(self):
+ self.assertEquals(self._matches(u'标题'), [u'标题', u'另一个标题'])
diff --git a/scrapely/tests/test_tool.py b/scrapely/tests/test_tool.py
new file mode 100644
index 0000000..0e3f305
--- /dev/null
+++ b/scrapely/tests/test_tool.py
@@ -0,0 +1,40 @@
+# encoding: utf8
+
+from unittest import TestCase
+
+from scrapely.htmlpage import HtmlPage
+from scrapely.template import TemplateMaker
+from scrapely.tool import parse_criteria, apply_criteria, readable_repr
+
+
+class ToolCJKTestCase(TestCase):
+
+ PAGE = HtmlPage("http://www.example.com", body=u"""
+
+
+ 标题
+ 段落
+ 另一个标题
+ 另一个段落
+
+
+ """)
+
+ def test_apply_criteria_should_support_cjk_chars(self):
+ criteria = parse_criteria('标题')
+ tm = TemplateMaker(self.PAGE)
+
+ selection = apply_criteria(criteria, tm)
+
+ self.assertEqual(selection, [6, 14])
+ self.assertEqual(tm.selected_data(6), u'标题
')
+ self.assertEqual(tm.selected_data(14), u'另一个标题
')
+
+
+class ReadableReprTextCase(TestCase):
+
+ def test_readable_repr(self):
+ cjk = u'cjk\t中日韩\n\\u535a'
+ readable = u"u'cjk\\t中日韩\\n\\\\u535a'"
+
+ self.assertEqual(readable_repr(cjk), readable)
diff --git a/scrapely/tool.py b/scrapely/tool.py
index 172c695..c84c7df 100644
--- a/scrapely/tool.py
+++ b/scrapely/tool.py
@@ -6,6 +6,21 @@
from scrapely.template import TemplateMaker, best_match
from scrapely.extraction import InstanceBasedLearningExtractor
+REPR_UNICODE_CHAR = re.compile(r'(? '
@@ -17,6 +32,8 @@ def __init__(self, filename, **kw):
def do_ta(self, line):
"""ta [--encoding ENCODING] - add template"""
opts, (url,) = parse_at(line)
+ if assert_or_print(url, "missing url"):
+ return
t = url_to_page(url, opts.encoding)
templates = self._load_templates()
templates.append(t)
@@ -31,7 +48,11 @@ def do_tl(self, line):
def do_td(self, template_id):
"""dt - delete template"""
+ if assert_or_print(template_id, "missing template id"):
+ return
templates = self._load_templates()
+ if assert_or_print(templates, "no templates available"):
+ return
try:
del templates[int(template_id)]
self._save_templates(templates)
@@ -41,13 +62,20 @@ def do_td(self, template_id):
def do_t(self, line):
"""t - test selection text"""
+ if assert_or_print(line, "missing template id or selection text"):
+ return
+ if assert_or_print(' ' in line, "missing template id or selection text"):
+ return
template_id, criteria = line.split(' ', 1)
t = self._load_template(template_id)
+ if assert_or_print(t, "template not found: %s" % template_id):
+ return
criteria = parse_criteria(criteria)
tm = TemplateMaker(t)
selection = apply_criteria(criteria, tm)
for n, i in enumerate(selection):
- print "[%d] %r" % (n, remove_annotation(tm.selected_data(i)))
+ print "[%d] %s" % (n,
+ readable_repr(remove_annotation(tm.selected_data(i))))
def do_a(self, line):
"""a [-n number] [-f field]- add or test annotation
@@ -55,8 +83,14 @@ def do_a(self, line):
Add a new annotation (if -f is passed) or test what would be annotated
otherwise
"""
+ if assert_or_print(line, "missing template id and selection text"):
+ return
+ if assert_or_print(' ' in line, "missing template id or selection text"):
+ return
template_id, criteria = line.split(' ', 1)
t = self._load_template(template_id)
+ if assert_or_print(t, "template not found: %s" % template_id):
+ return
criteria = parse_criteria(criteria)
tm = TemplateMaker(t)
selection = apply_criteria(criteria, tm)
@@ -65,27 +99,31 @@ def do_a(self, line):
index = selection[0]
tm.annotate_fragment(index, criteria.field)
self._save_template(template_id, tm.get_template())
- print "[new] (%s) %r" % (criteria.field,
- remove_annotation(tm.selected_data(index)))
+ print "[new] (%s) %s" % (criteria.field,
+ readable_repr(remove_annotation(tm.selected_data(index))))
else:
for n, i in enumerate(selection):
- print "[%d] %r" % (n, remove_annotation(tm.selected_data(i)))
+ print "[%d] %s" % (n, readable_repr(remove_annotation(tm.selected_data(i))))
def do_al(self, template_id):
"""al - list annotations"""
if assert_or_print(template_id, "missing template id"):
return
t = self._load_template(template_id)
+ if assert_or_print(t, "template not found: %s" % template_id):
+ return
tm = TemplateMaker(t)
for n, (a, i) in enumerate(tm.annotations()):
- print "[%s-%d] (%s) %r" % (template_id, n, a['annotations']['content'],
- remove_annotation(tm.selected_data(i)))
+ print "[%s-%d] (%s) %s" % (template_id, n, a['annotations']['content'],
+ readable_repr(remove_annotation(tm.selected_data(i))))
def do_s(self, url):
"""s - scrape url"""
templates = self._load_templates()
if assert_or_print(templates, "no templates available"):
return
+ if assert_or_print(url, "missing url"):
+ return
# fall back to the template encoding if none is specified
page = url_to_page(url, default_encoding=templates[0].encoding)
ex = InstanceBasedLearningExtractor((t, None) for t in templates)
@@ -143,7 +181,10 @@ def parse_criteria(criteria_str):
def apply_criteria(criteria, tm):
"""Apply the given criteria object to the given template"""
- func = best_match(criteria.text) if criteria.text else lambda x, y: False
+ text = criteria.text
+ if text and isinstance(text, str):
+ text = text.decode(tm.get_template().encoding or 'utf-8')
+ func = best_match(text) if text else lambda x, y: False
sel = tm.select(func)
if criteria.number is not None:
if criteria.number < len(sel):