110110import re
111111import shlex
112112from collections import namedtuple
113- try :
114- from html .parser import HTMLParser
115- except ImportError :
116- from HTMLParser import HTMLParser
117- try :
118- from xml .etree import cElementTree as ET
119- except ImportError :
120- from xml .etree import ElementTree as ET
121-
122- try :
123- from html .entities import name2codepoint
124- except ImportError :
125- from htmlentitydefs import name2codepoint
126-
127- # "void elements" (no closing tag) from the HTML Standard section 12.1.2
128- VOID_ELEMENTS = {'area' , 'base' , 'br' , 'col' , 'embed' , 'hr' , 'img' , 'input' , 'keygen' ,
129- 'link' , 'menuitem' , 'meta' , 'param' , 'source' , 'track' , 'wbr' }
130-
131- # Python 2 -> 3 compatibility
132- try :
133- unichr
134- except NameError :
135- unichr = chr
136-
113+ from parsel import Selector
137114
138115channel = os .environ ["DOC_RUST_LANG_ORG_CHANNEL" ]
139-
140- class CustomHTMLParser (HTMLParser ):
141- """simplified HTML parser.
142-
143- this is possible because we are dealing with very regular HTML from
144- rustdoc; we only have to deal with i) void elements and ii) empty
145- attributes."""
146- def __init__ (self , target = None ):
147- HTMLParser .__init__ (self )
148- self .__builder = target or ET .TreeBuilder ()
149-
150- def handle_starttag (self , tag , attrs ):
151- attrs = {k : v or '' for k , v in attrs }
152- self .__builder .start (tag , attrs )
153- if tag in VOID_ELEMENTS :
154- self .__builder .end (tag )
155-
156- def handle_endtag (self , tag ):
157- self .__builder .end (tag )
158-
159- def handle_startendtag (self , tag , attrs ):
160- attrs = {k : v or '' for k , v in attrs }
161- self .__builder .start (tag , attrs )
162- self .__builder .end (tag )
163-
164- def handle_data (self , data ):
165- self .__builder .data (data )
166-
167- def handle_entityref (self , name ):
168- self .__builder .data (unichr (name2codepoint [name ]))
169-
170- def handle_charref (self , name ):
171- code = int (name [1 :], 16 ) if name .startswith (('x' , 'X' )) else int (name , 10 )
172- self .__builder .data (unichr (code ))
173-
174- def close (self ):
175- HTMLParser .close (self )
176- return self .__builder .close ()
177-
178-
179116Command = namedtuple ('Command' , 'negated cmd args lineno context' )
180117
181118
@@ -256,29 +193,11 @@ def get_commands(template):
256193 yield Command (negated = negated , cmd = cmd , args = args , lineno = lineno + 1 , context = line )
257194
258195
259- def _flatten (node , acc ):
260- if node .text :
261- acc .append (node .text )
262- for e in node :
263- _flatten (e , acc )
264- if e .tail :
265- acc .append (e .tail )
266-
267-
268- def flatten (node ):
269- acc = []
270- _flatten (node , acc )
271- return '' .join (acc )
272-
273-
274196def normalize_xpath (path ):
275197 path = path .replace ("{{channel}}" , channel )
276- if path .startswith ('//' ):
277- return '.' + path # avoid warnings
278- elif path .startswith ('.//' ):
279- return path
280- else :
198+ if not path .startswith ('//' ):
281199 raise InvalidCheck ('Non-absolute XPath is not supported due to implementation issues' )
200+ return path
282201
283202
284203class CachedFiles (object ):
@@ -323,7 +242,7 @@ def get_tree(self, path):
323242
324243 with io .open (abspath , encoding = 'utf-8' ) as f :
325244 try :
326- tree = ET . fromstringlist ( f . readlines (), CustomHTMLParser ())
245+ tree = Selector ( text = f . read ())
327246 except Exception as e :
328247 raise RuntimeError ('Cannot parse an HTML file {!r}: {}' .format (path , e ))
329248 self .trees [path ] = tree
@@ -351,7 +270,7 @@ def check_string(data, pat, regexp):
351270def check_tree_attr (tree , path , attr , pat , regexp ):
352271 path = normalize_xpath (path )
353272 ret = False
354- for e in tree .findall (path ):
273+ for e in tree .xpath (path ):
355274 if attr in e .attrib :
356275 value = e .attrib [attr ]
357276 else :
@@ -363,19 +282,19 @@ def check_tree_attr(tree, path, attr, pat, regexp):
363282 return ret
364283
365284
285+ def flatten (elem ):
286+ return '' .join (elem .css ('::text' ).getall ())
287+
288+
366289def check_tree_text (tree , path , pat , regexp ):
367290 path = normalize_xpath (path )
368291 ret = False
369292 try :
370- for e in tree .findall (path ):
371- try :
372- value = flatten (e )
373- except KeyError :
374- continue
375- else :
376- ret = check_string (value , pat , regexp )
377- if ret :
378- break
293+ for e in tree .xpath (path ):
294+ value = flatten (e )
295+ ret = check_string (value , pat , regexp )
296+ if ret :
297+ break
379298 except Exception :
380299 print ('Failed to get path "{}"' .format (path ))
381300 raise
@@ -384,7 +303,7 @@ def check_tree_text(tree, path, pat, regexp):
384303
385304def get_tree_count (tree , path ):
386305 path = normalize_xpath (path )
387- return len (tree .findall (path ))
306+ return len (tree .xpath (path ))
388307
389308
390309def stderr (* args ):
0 commit comments