44except ImportError :
55 import simplejson as json
66
7- from scrapely .htmlpage import HtmlPage , page_to_dict
7+ from scrapely .htmlpage import HtmlPage , page_to_dict , url_to_page
88from scrapely .template import TemplateMaker , best_match
99from scrapely .extraction import InstanceBasedLearningExtractor
1010
1111class Scraper (object ):
1212
1313 def __init__ (self , templates = None ):
1414 """Initialize an empty scraper."""
15- self .templates = templates or []
15+ self ._templates = templates or []
16+ self ._ex = None
1617
1718 @classmethod
1819 def fromfile (cls , file ):
@@ -21,31 +22,38 @@ def fromfile(cls, file):
2122 """
2223 templates = [HtmlPage (** x ) for x in json .load (file )['templates' ]]
2324 return cls (templates )
24-
25+
2526 def tofile (self , file ):
2627 """Store the scraper into the given file-like object"""
27- tpls = [page_to_dict (x ) for x in self .templates ]
28+ tpls = [page_to_dict (x ) for x in self ._templates ]
2829 json .dump ({'templates' : tpls }, file )
30+
31+ def add_template (self , template ):
32+ self ._templates .append (template )
33+ self ._ex = None
2934
30- def train (self , url , data , encoding = 'utf-8' ):
35+ def train_from_htmlpage (self , htmlpage , data ):
3136 assert data , "Cannot train with empty data"
32- page = self ._get_page (url , encoding )
33- tm = TemplateMaker (page )
37+ tm = TemplateMaker (htmlpage )
3438 for field , values in data .items ():
3539 if not hasattr (values , '__iter__' ):
3640 values = [values ]
3741 for value in values :
3842 if isinstance (value , str ):
39- value = value .decode (encoding )
43+ value = value .decode (htmlpage . encoding or 'utf-8' )
4044 tm .annotate (field , best_match (value ))
41- self .templates .append (tm .get_template ())
45+ self .add_template (tm .get_template ())
46+
47+ def train (self , url , data , encoding = None ):
48+ page = url_to_page (url , encoding )
49+ self .train_from_htmlpage (page , data )
4250
43- def scrape (self , url , encoding = 'utf-8' ):
44- page = self ._get_page (url , encoding )
45- ex = InstanceBasedLearningExtractor ((t , None ) for t in self .templates )
46- return ex .extract (page )[0 ]
51+ def scrape (self , url , encoding = None ):
52+ page = url_to_page (url , encoding )
53+ return self .scrape_page (page )
4754
48- @staticmethod
49- def _get_page (url , encoding ):
50- body = urllib .urlopen (url ).read ().decode (encoding )
51- return HtmlPage (url , body = body , encoding = encoding )
55+ def scrape_page (self , page ):
56+ if self ._ex is None :
57+ self ._ex = InstanceBasedLearningExtractor ((t , None ) for t in
58+ self ._templates )
59+ return self ._ex .extract (page )[0 ]
0 commit comments