From 0135002b201c12ade5676cbba90902525caff9f8 Mon Sep 17 00:00:00 2001 From: xyb Date: Tue, 10 Sep 2013 13:00:56 +0800 Subject: [PATCH 1/3] support CJK string annotation; print readably CJK string in scrapely.tool's output --- scrapely/template.py | 2 +- scrapely/tool.py | 48 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/scrapely/template.py b/scrapely/template.py index 33ae2e1..5c96b8f 100644 --- a/scrapely/template.py +++ b/scrapely/template.py @@ -92,7 +92,7 @@ def best_match(text): """Function to use in TemplateMaker.annotate()""" def func(fragment, page): fdata = page.fragment_data(fragment).strip() - if text in fdata: + if text.decode('utf8') in fdata: return float(len(text)) / len(fdata) - (1e-6 * fragment.start) else: return 0.0 diff --git a/scrapely/tool.py b/scrapely/tool.py index 172c695..5b87403 100644 --- a/scrapely/tool.py +++ b/scrapely/tool.py @@ -6,6 +6,19 @@ from scrapely.template import TemplateMaker, best_match from scrapely.extraction import InstanceBasedLearningExtractor +REPR_UNICODE_CHAR = re.compile(r'(? ' @@ -17,6 +30,8 @@ def __init__(self, filename, **kw): def do_ta(self, line): """ta [--encoding ENCODING] - add template""" opts, (url,) = parse_at(line) + if assert_or_print(url, "missing url"): + return t = url_to_page(url, opts.encoding) templates = self._load_templates() templates.append(t) @@ -31,7 +46,11 @@ def do_tl(self, line): def do_td(self, template_id): """dt