Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scrapely/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def best_match(text):
"""Function to use in TemplateMaker.annotate()"""
def func(fragment, page):
fdata = page.fragment_data(fragment).strip()
if text in fdata:
if text.decode('utf8') in fdata:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I haven't looked at your changes in detail, but why are you decoding from UTF-8 and not from some other encoding?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have the same question. Also, it's probably more appropriate to do this in scrapely.tool, when calling best_match, not here.

return float(len(text)) / len(fdata) - (1e-6 * fragment.start)
else:
return 0.0
Expand Down
48 changes: 42 additions & 6 deletions scrapely/tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@
from scrapely.template import TemplateMaker, best_match
from scrapely.extraction import InstanceBasedLearningExtractor

# Matches one ``\uXXXX`` escape as written by repr().
#   group 1: any run of escaped backslashes (``\\`` pairs) directly before it
#   group 2: the four hex digits of the code point
# Consuming whole pairs first means an odd backslash is a real escape
# introducer, while an even run (a literal backslash followed by the text
# "uXXXX") can never match.
REPR_UNICODE_CHAR = re.compile(r'(?<!\\)((?:\\\\)*)\\u([0-9a-f]{4})')

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # no ``unichr`` builtin: ``chr`` returns text already


def readable_repr(obj):
    """Return a printing-friendly unicode version of ``repr(obj)``.

    Every ``\\uXXXX`` escape in the repr string is replaced by the character
    it stands for. All other escapes (``\\n``, ``\\t``, ``\\\\``, ``\\xNN``,
    ...) are left exactly as ``repr`` produced them, so the result stays on
    a single line.

    :param obj: any object; only its ``repr()`` is used.
    :returns: the repr string with ``\\uXXXX`` escapes decoded.
    """
    def replace_unicode_char(match):
        # Keep the preceding escaped backslashes untouched and decode only
        # the code point that follows them.
        return match.group(1) + _unichr(int(match.group(2), 16))

    return REPR_UNICODE_CHAR.sub(replace_unicode_char, repr(obj))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it different from repr_bytesting.decode('unicode-escape') ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is repr_bytesting?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is repr_string

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kmike, it's different. decode('unicode-escape') restores the whole string; readable_repr restores only CJK characters (in fact, every character that repr writes as a four-hex-digit \uXXXX escape), and leaves '\n', '\t', '\\', etc. untouched.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @xyb,

Thanks for the fix and the explanation. This approach makes sense; it is basically undoing what Python 2.x repr does for unicode strings (if we don't want to print new lines, etc.)

A couple of notes:

  1. Your regex doesn't catch all symbols that can be safely decoded, e.g. ² (\xb2) or £ (\xa3) could be nice to see in the output;
  2. 'readable_repr' name is a bit confusing because in Python 2.x repr must be a bytestring, and readable_repr returns unicode. What do you think about calling it e.g. 'unicode_repr'?

The best fix for this issue would be to port scrapely to Python 3 - it doesn't escape non-ascii letters and symbols in repr of unicode strings, but w3lib must be ported before that :)



class IblTool(cmd.Cmd):

prompt = 'scrapely> '
Expand All @@ -17,6 +30,8 @@ def __init__(self, filename, **kw):
def do_ta(self, line):
"""ta <url> [--encoding ENCODING] - add template"""
opts, (url,) = parse_at(line)
if assert_or_print(url, "missing url"):
return
t = url_to_page(url, opts.encoding)
templates = self._load_templates()
templates.append(t)
Expand All @@ -31,7 +46,11 @@ def do_tl(self, line):

def do_td(self, template_id):
"""dt <template> - delete template"""
if assert_or_print(template_id, "missing template id"):
return
templates = self._load_templates()
if assert_or_print(teamplates, "no templates available"):
return
try:
del templates[int(template_id)]
self._save_templates(templates)
Expand All @@ -41,22 +60,35 @@ def do_td(self, template_id):

def do_t(self, line):
"""t <template> <text> - test selection text"""
if assert_or_print(line, "missing template id or selection text"):
return
if assert_or_print(' ' in line, "missing template id or selection text"):
return
template_id, criteria = line.split(' ', 1)
t = self._load_template(template_id)
if assert_or_print(t, "template not found: %s" % template_id):
return
criteria = parse_criteria(criteria)
tm = TemplateMaker(t)
selection = apply_criteria(criteria, tm)
for n, i in enumerate(selection):
print "[%d] %r" % (n, remove_annotation(tm.selected_data(i)))
print "[%d] %s" % (n,
readable_repr(remove_annotation(tm.selected_data(i))))

def do_a(self, line):
"""a <template> <data> [-n number] [-f field]- add or test annotation

Add a new annotation (if -f is passed) or test what would be annotated
otherwise
"""
if assert_or_print(line, "missing template id and selection text"):
return
if assert_or_print(' ' in line, "missing template id or selection text"):
return
template_id, criteria = line.split(' ', 1)
t = self._load_template(template_id)
if assert_or_print(t, "template not found: %s" % template_id):
return
criteria = parse_criteria(criteria)
tm = TemplateMaker(t)
selection = apply_criteria(criteria, tm)
Expand All @@ -65,27 +97,31 @@ def do_a(self, line):
index = selection[0]
tm.annotate_fragment(index, criteria.field)
self._save_template(template_id, tm.get_template())
print "[new] (%s) %r" % (criteria.field,
remove_annotation(tm.selected_data(index)))
print "[new] (%s) %s" % (criteria.field,
readable_repr(remove_annotation(tm.selected_data(index))))
else:
for n, i in enumerate(selection):
print "[%d] %r" % (n, remove_annotation(tm.selected_data(i)))
print "[%d] %s" % (n, readable_repr(remove_annotation(tm.selected_data(i))))

def do_al(self, template_id):
"""al <template> - list annotations"""
if assert_or_print(template_id, "missing template id"):
return
t = self._load_template(template_id)
if assert_or_print(t, "template not found: %s" % template_id):
return
tm = TemplateMaker(t)
for n, (a, i) in enumerate(tm.annotations()):
print "[%s-%d] (%s) %r" % (template_id, n, a['annotations']['content'],
remove_annotation(tm.selected_data(i)))
print "[%s-%d] (%s) %s" % (template_id, n, a['annotations']['content'],
readable_repr(remove_annotation(tm.selected_data(i))))

def do_s(self, url):
"""s <url> - scrape url"""
templates = self._load_templates()
if assert_or_print(templates, "no templates available"):
return
if assert_or_print(url, "missing url"):
return
# fall back to the template encoding if none is specified
page = url_to_page(url, default_encoding=templates[0].encoding)
ex = InstanceBasedLearningExtractor((t, None) for t in templates)
Expand Down