1+ #!/usr/bin/env python
2+ # -*- coding: utf-8 -*-
3+
14r"""
25htmldocck.py is a custom checker script for Rustdoc HTML outputs.
36
98101
99102"""
100103
101- from __future__ import print_function
104+ from __future__ import absolute_import , print_function , unicode_literals
105+
106+ import codecs
107+ import io
102108import sys
103109import os .path
104110import re
110116 from HTMLParser import HTMLParser
111117from xml .etree import cElementTree as ET
112118
113- # ⇤/⇥ are not in HTML 4 but are in HTML 5
114119try :
115- from html .entities import entitydefs
120+ from html .entities import name2codepoint
116121except ImportError :
117- from htmlentitydefs import entitydefs
118- entitydefs ['larrb' ] = u'\u21e4 '
119- entitydefs ['rarrb' ] = u'\u21e5 '
120- entitydefs ['nbsp' ] = ' '
122+ from htmlentitydefs import name2codepoint
121123
122124# "void elements" (no closing tag) from the HTML Standard section 12.1.2
123125VOID_ELEMENTS = set (['area' , 'base' , 'br' , 'col' , 'embed' , 'hr' , 'img' , 'input' , 'keygen' ,
@@ -157,11 +159,11 @@ def handle_data(self, data):
157159 self .__builder .data (data )
158160
def handle_entityref(self, name):
    """Forward a named character reference to the tree builder as text.

    ``name`` is the entity name used as the key into ``name2codepoint``
    (for example ``amp`` or ``nbsp``). The matching code point is turned
    into a single character and handed to the builder's ``data`` method.

    Raises ``KeyError`` if ``name`` is not present in ``name2codepoint``.
    """
    # Go through the code point table rather than ``entitydefs``: on
    # Python 2 the latter holds Latin-1 byte strings or literal
    # ``&#NNNN;`` references, not the unicode character itself.
    codepoint = name2codepoint[name]
    self.__builder.data(unichr(codepoint))
161163
def handle_charref(self, name):
    """Forward a numeric character reference to the tree builder as text.

    ``name`` is the numeric part of the reference: either decimal digits
    (``8212``) or ``x``/``X`` followed by hexadecimal digits (``x2014``).

    Raises ``ValueError`` if ``name`` is not a valid number in the
    selected base.
    """
    if name.startswith(('x', 'X')):
        # Hexadecimal form: drop the leading marker before parsing.
        code = int(name[1:], 16)
    else:
        code = int(name, 10)
    # Emit the character itself, not its UTF-8 encoding, so the builder
    # receives the same kind of text as from the other handlers.
    self.__builder.data(unichr(code))
165167
166168 def close (self ):
167169 HTMLParser .close (self )
@@ -210,11 +212,11 @@ def concat_multi_lines(f):
210212 (?<=(?<!\S)@)(?P<negated>!?)
211213 (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
212214 (?P<args>.*)$
213- ''' , re .X )
215+ ''' , re .X | re . UNICODE )
214216
215217
216218def get_commands (template ):
217- with open (template , 'rU ' ) as f :
219+ with io . open (template , encoding = 'utf-8 ' ) as f :
218220 for lineno , line in concat_multi_lines (f ):
219221 m = LINE_PATTERN .search (line )
220222 if not m :
@@ -226,7 +228,10 @@ def get_commands(template):
226228 if args and not args [:1 ].isspace ():
227229 print_err (lineno , line , 'Invalid template syntax' )
228230 continue
229- args = shlex .split (args )
231+ try :
232+ args = shlex .split (args )
233+ except UnicodeEncodeError :
234+ args = [arg .decode ('utf-8' ) for arg in shlex .split (args .encode ('utf-8' ))]
230235 yield Command (negated = negated , cmd = cmd , args = args , lineno = lineno + 1 , context = line )
231236
232237
@@ -280,7 +285,7 @@ def get_file(self, path):
280285 if not (os .path .exists (abspath ) and os .path .isfile (abspath )):
281286 raise FailedCheck ('File does not exist {!r}' .format (path ))
282287
283- with open (abspath ) as f :
288+ with io . open (abspath , encoding = 'utf-8' ) as f :
284289 data = f .read ()
285290 self .files [path ] = data
286291 return data
@@ -294,9 +299,9 @@ def get_tree(self, path):
294299 if not (os .path .exists (abspath ) and os .path .isfile (abspath )):
295300 raise FailedCheck ('File does not exist {!r}' .format (path ))
296301
297- with open (abspath ) as f :
302+ with io . open (abspath , encoding = 'utf-8' ) as f :
298303 try :
299- tree = ET .parse ( f , CustomHTMLParser ())
304+ tree = ET .fromstringlist ( f . readlines () , CustomHTMLParser ())
300305 except Exception as e :
301306 raise RuntimeError ('Cannot parse an HTML file {!r}: {}' .format (path , e ))
302307 self .trees [path ] = tree
@@ -313,7 +318,7 @@ def check_string(data, pat, regexp):
313318 if not pat :
314319 return True # special case a presence testing
315320 elif regexp :
316- return re .search (pat , data ) is not None
321+ return re .search (pat , data , flags = re . UNICODE ) is not None
317322 else :
318323 data = ' ' .join (data .split ())
319324 pat = ' ' .join (pat .split ())
@@ -350,7 +355,7 @@ def check_tree_text(tree, path, pat, regexp):
350355 break
351356 except Exception as e :
352357 print ('Failed to get path "{}"' .format (path ))
353- raise e
358+ raise
354359 return ret
355360
356361
@@ -359,7 +364,12 @@ def get_tree_count(tree, path):
359364 return len (tree .findall (path ))
360365
def stderr(*args):
    """Print ``args`` to standard error, separated by spaces.

    On Python 2 ``sys.stderr`` is wrapped in a UTF-8 stream writer before
    printing; on Python 3 ``sys.stderr`` is used directly.
    """
    if sys.version_info.major < 3:
        # Encode explicitly so non-ASCII text is written as UTF-8 instead
        # of going through the stream's implicit encoding.
        stream = codecs.getwriter('utf-8')(sys.stderr)
    else:
        stream = sys.stderr

    print(*args, file=stream)
363373
364374def print_err (lineno , context , err , message = None ):
365375 global ERR_COUNT
0 commit comments