2222
2323
def parse(doc, treebuilder="etree", encoding=None,
          namespaceHTMLElements=True, scripting=False):
    """Parse a string or file-like object into a tree

    doc - the markup to parse; handed unchanged to HTMLParser.parse
    treebuilder - name given to treebuilders.getTreeBuilder to pick the
                  tree builder
    encoding - forwarded to HTMLParser.parse
    namespaceHTMLElements - forwarded to the HTMLParser constructor
    scripting - forwarded to HTMLParser.parse
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, encoding=encoding, scripting=scripting)
3030
3131
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
                  namespaceHTMLElements=True, scripting=False):
    """Parse a string or file-like object into a tree fragment

    doc - the markup to parse; handed unchanged to HTMLParser.parseFragment
    container - forwarded to HTMLParser.parseFragment
    treebuilder - name given to treebuilders.getTreeBuilder to pick the
                  tree builder
    encoding - forwarded to HTMLParser.parseFragment
    namespaceHTMLElements - forwarded to the HTMLParser constructor
    scripting - forwarded to HTMLParser.parseFragment
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parseFragment(doc, container=container,
                                encoding=encoding, scripting=scripting)
3737
3838
3939def method_decorator_metaclass (function ):
@@ -78,11 +78,12 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
7878 self .phases = dict ([(name , cls (self , self .tree )) for name , cls in
7979 getPhases (debug ).items ()])
8080
81- def _parse (self , stream , innerHTML = False , container = "div" ,
82- encoding = None , parseMeta = True , useChardet = True , ** kwargs ):
81+ def _parse (self , stream , innerHTML = False , container = "div" , encoding = None ,
82+ parseMeta = True , useChardet = True , scripting = False , ** kwargs ):
8383
8484 self .innerHTMLMode = innerHTML
8585 self .container = container
86+ self .scripting = scripting
8687 self .tokenizer = self .tokenizer_class (stream , encoding = encoding ,
8788 parseMeta = parseMeta ,
8889 useChardet = useChardet ,
@@ -221,7 +222,8 @@ def normalizedTokens(self):
221222 for token in self .tokenizer :
222223 yield self .normalizeToken (token )
223224
224- def parse (self , stream , encoding = None , parseMeta = True , useChardet = True ):
225+ def parse (self , stream , encoding = None , parseMeta = True ,
226+ useChardet = True , scripting = False ):
225227 """Parse a HTML document into a well-formed tree
226228
227229 stream - a filelike object or string containing the HTML to be parsed
@@ -230,13 +232,15 @@ def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
230232 the encoding. If specified, that encoding will be used,
231233 regardless of any BOM or later declaration (such as in a meta
232234 element)
235+
236+ scripting - treat noscript elements as if javascript was turned on
233237 """
234238 self ._parse (stream , innerHTML = False , encoding = encoding ,
235- parseMeta = parseMeta , useChardet = useChardet )
239+ parseMeta = parseMeta , useChardet = useChardet , scripting = scripting )
236240 return self .tree .getDocument ()
237241
238242 def parseFragment (self , stream , container = "div" , encoding = None ,
239- parseMeta = False , useChardet = True ):
243+ parseMeta = False , useChardet = True , scripting = False ):
240244 """Parse a HTML fragment into a well-formed tree fragment
241245
242246 container - name of the element we're setting the innerHTML property
@@ -248,8 +252,11 @@ def parseFragment(self, stream, container="div", encoding=None,
248252 the encoding. If specified, that encoding will be used,
249253 regardless of any BOM or later declaration (such as in a meta
250254 element)
255+
256+ scripting - treat noscript elements as if javascript was turned on
251257 """
252- self ._parse (stream , True , container = container , encoding = encoding )
258+ self ._parse (stream , True , container = container ,
259+ encoding = encoding , scripting = scripting )
253260 return self .tree .getFragment ()
254261
255262 def parseError (self , errorcode = "XXX-undefined-error" , datavars = {}):
@@ -707,7 +714,8 @@ def __init__(self, parser, tree):
707714 self .startTagHandler = utils .MethodDispatcher ([
708715 ("html" , self .startTagHtml ),
709716 ("title" , self .startTagTitle ),
710- (("noscript" , "noframes" , "style" ), self .startTagNoScriptNoFramesStyle ),
717+ (("noframes" , "style" ), self .startTagNoFramesStyle ),
718+ ("noscript" , self .startTagNoscript ),
711719 ("script" , self .startTagScript ),
712720 (("base" , "basefont" , "bgsound" , "command" , "link" ),
713721 self .startTagBaseLinkCommand ),
@@ -716,7 +724,7 @@ def __init__(self, parser, tree):
716724 ])
717725 self .startTagHandler .default = self .startTagOther
718726
719- self . endTagHandler = utils .MethodDispatcher ([
727+ self .endTagHandler = utils .MethodDispatcher ([
720728 ("head" , self .endTagHead ),
721729 (("br" , "html" , "body" ), self .endTagHtmlBodyBr )
722730 ])
@@ -766,10 +774,17 @@ def startTagMeta(self, token):
def startTagTitle(self, token):
    """Handle a "title" start tag by parsing its content as RCDATA."""
    content_type = "RCDATA"
    self.parser.parseRCDataRawtext(token, content_type)
768776
def startTagNoFramesStyle(self, token):
    """Handle a "noframes" or "style" start tag.

    The element's content is parsed as RAWTEXT.
    """
    parse_raw = self.parser.parseRCDataRawtext
    parse_raw(token, "RAWTEXT")
772780
def startTagNoscript(self, token):
    """Handle a "noscript" start tag seen in the head.

    With scripting off the element is inserted into the tree and the
    parser moves to the "inHeadNoscript" phase. With scripting on the
    element's content is parsed as RAWTEXT instead.
    """
    parser = self.parser
    if not parser.scripting:
        self.tree.insertElement(token)
        parser.phase = parser.phases["inHeadNoscript"]
        return
    parser.parseRCDataRawtext(token, "RAWTEXT")
787+
773788 def startTagScript (self , token ):
774789 self .tree .insertElement (token )
775790 self .parser .tokenizer .state = self .parser .tokenizer .scriptDataState
@@ -795,10 +810,51 @@ def endTagOther(self, token):
def anythingElse(self):
    """Close the head by running endTagHead with an implied "head" token."""
    implied_end = impliedTagToken("head")
    self.endTagHead(implied_end)
797812
798- # XXX If we implement a parser for which scripting is disabled we need to
799- # implement this phase.
800- #
801- # class InHeadNoScriptPhase(Phase):
class InHeadNoscriptPhase(Phase):
    """Phase used while inside a "noscript" element in the head.

    InHeadPhase.startTagNoscript switches to this phase after inserting a
    "noscript" element when the parser's scripting flag is off.

    NOTE(review): only start-tag and end-tag handling is defined here;
    character, comment and EOF tokens fall through to whatever Phase
    provides -- confirm that matches the intended behaviour.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("basefont", "bgsound", "link", "meta", "noframes", "style"),
             self.startTagBaseLinkCommand),
            (("head", "noscript"), self.startTagHeadNoscript),
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("noscript", self.endTagNoscript),
            ("br", self.endTagBr),
        ])
        self.endTagHandler.default = self.endTagOther

    def startTagHtml(self, token):
        """Delegate an "html" start tag to the "inBody" phase."""
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        """Delegate head-level start tags to the "inHead" phase.

        This handler is registered for "basefont", "bgsound", "link",
        "meta", "noframes" and "style". InHeadPhase routes those names to
        different handlers ("meta" to startTagMeta, "noframes"/"style" to
        its RAWTEXT handler), so the token goes through InHeadPhase's own
        start-tag dispatch rather than straight to its
        startTagBaseLinkCommand, which is only registered there for
        base/basefont/bgsound/command/link.
        """
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagHeadNoscript(self, token):
        """Report a nested "head" or "noscript" start tag and drop it."""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Any other start tag: leave the noscript element first."""
        return self.anythingElse(token)

    def endTagNoscript(self, token):
        """Pop the current node and return to the "inHead" phase."""
        node = self.parser.tree.openElements.pop()
        # Internal invariant: this phase is only active while "noscript"
        # is the current node.
        assert node.name == "noscript", "Expected noscript got %s" % node.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        """A "br" end tag is treated like any other unexpected token."""
        return self.anythingElse(token)

    def endTagOther(self, token):
        """Report any other end tag and drop it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self, token):
        """Report the token, close the noscript element, and return the token.

        Returning the token presumably makes the caller reprocess it in
        the "inHead" phase -- confirm against the main parse loop.
        """
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.endTagNoscript(impliedTagToken("noscript"))
        return token
857+
802858 class AfterHeadPhase (Phase ):
803859 def __init__ (self , parser , tree ):
804860 Phase .__init__ (self , parser , tree )
@@ -909,7 +965,8 @@ def __init__(self, parser, tree):
909965 ("isindex" , self .startTagIsIndex ),
910966 ("textarea" , self .startTagTextarea ),
911967 ("iframe" , self .startTagIFrame ),
912- (("noembed" , "noframes" , "noscript" ), self .startTagRawtext ),
968+ ("noscript" , self .startTagNoscript ),
969+ (("noembed" , "noframes" ), self .startTagRawtext ),
913970 ("select" , self .startTagSelect ),
914971 (("rp" , "rt" ), self .startTagRpRt ),
915972 (("option" , "optgroup" ), self .startTagOpt ),
@@ -1230,6 +1287,12 @@ def startTagIFrame(self, token):
12301287 self .parser .framesetOK = False
12311288 self .startTagRawtext (token )
12321289
def startTagNoscript(self, token):
    """Handle a "noscript" start tag seen in the body.

    With scripting on the tag goes to startTagRawtext; with scripting off
    it goes to startTagOther like any other start tag.
    """
    handler = self.startTagRawtext if self.parser.scripting else self.startTagOther
    handler(token)
1295+
12331296 def startTagRawtext (self , token ):
12341297 """iframe, noembed noframes, noscript(if scripting enabled)"""
12351298 self .parser .parseRCDataRawtext (token , "RAWTEXT" )
@@ -2686,7 +2749,7 @@ def processEndTag(self, token):
26862749 "beforeHtml" : BeforeHtmlPhase ,
26872750 "beforeHead" : BeforeHeadPhase ,
26882751 "inHead" : InHeadPhase ,
2689- # XXX "inHeadNoscript": InHeadNoScriptPhase ,
2752+ "inHeadNoscript" : InHeadNoscriptPhase ,
26902753 "afterHead" : AfterHeadPhase ,
26912754 "inBody" : InBodyPhase ,
26922755 "text" : TextPhase ,
0 commit comments