22import sys , os , re , cmd , shlex , json , optparse , json , urllib , pprint
33from cStringIO import StringIO
44
5- from scrapely .htmlpage import HtmlPage , page_to_dict
5+ from scrapely .htmlpage import HtmlPage , page_to_dict , url_to_page
66from scrapely .template import TemplateMaker , best_match
77from scrapely .extraction import InstanceBasedLearningExtractor
88
@@ -17,7 +17,7 @@ def __init__(self, filename, **kw):
1717 def do_ta (self , line ):
1818 """ta <url> [--encoding ENCODING] - add template"""
1919 opts , (url ,) = parse_at (line )
20- t = get_page (url , opts .encoding )
20+ t = url_to_page (url , opts .encoding )
2121 templates = self ._load_templates ()
2222 templates .append (t )
2323 self ._save_templates (templates )
@@ -82,11 +82,12 @@ def do_al(self, template_id):
8282 remove_annotation (tm .selected_data (i )))
8383
def do_s(self, url):
    """s <url> - scrape url"""
    # NOTE(review): the docstring above is kept verbatim -- the do_* naming
    # and the file's `cmd` import suggest it doubles as the command's help
    # text; confirm before rewording it.
    templates = self._load_templates()
    # assert_or_print is defined elsewhere in this file; a truthy result
    # means there is nothing to scrape with, so the command stops here.
    if assert_or_print(templates, "no templates available"):
        return
    # fall back to the template encoding if none is specified
    page = url_to_page(url, default_encoding=templates[0].encoding)
    # Every stored template is handed to the extractor paired with None.
    ex = InstanceBasedLearningExtractor((t, None) for t in templates)
    # Only the first element of the extraction result is printed.
    pprint.pprint(ex.extract(page)[0])
9293
@@ -126,13 +127,9 @@ def _save_templates(self, templates):
126127 templates = [page_to_dict (t ) for t in templates ]
127128 return json .dump ({'templates' : templates }, f )
128129
129- def get_page (url , encoding ):
130- body = urllib .urlopen (url ).read ().decode (encoding )
131- return HtmlPage (url , body = body , encoding = encoding )
132-
def parse_at(ta_line):
    """Parse the argument string of the ``ta`` command.

    ``ta_line`` is split with shell-like quoting rules (``shlex.split``)
    and handed to an ``optparse`` parser that knows a single option,
    ``-e``/``--encoding``.

    Returns the ``(options, positional_args)`` pair produced by
    ``OptionParser.parse_args``. ``options.encoding`` is ``None`` when the
    option is not given, so the caller can tell "not specified" apart from
    an explicit choice.
    """
    parser = optparse.OptionParser()
    # Deliberately no default: an absent --encoding must stay None.
    parser.add_option('-e', '--encoding', help='page encoding')
    return parser.parse_args(shlex.split(ta_line))
137134
138135def parse_criteria (criteria_str ):
0 commit comments