 # we don't use enum.Enum because of Python 2.7 compatibility
 class UnicodeFiles(object):
     # ReadMe does not contain any unicode data, we
-    # use it to extract versions.
+    # only use it to extract versions.
     README = "ReadMe.txt"
 
     DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
     DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
-    SPECIAL_CASING = "SpecialCasing.txt"
-    SCRIPTS = "Scripts.txt"
     PROPS = "PropList.txt"
+    SCRIPTS = "Scripts.txt"
+    SPECIAL_CASING = "SpecialCasing.txt"
     UNICODE_DATA = "UnicodeData.txt"
 
 
@@ -66,15 +66,15 @@ class UnicodeFiles(object):
 # Mapping taken from Table 12 from:
 # http://www.unicode.org/reports/tr44/#General_Category_Values
 EXPANDED_CATEGORIES = {
-    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
-    'Lm': ['L'], 'Lo': ['L'],
-    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
-    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
-    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
-    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
-    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
-    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
-    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
+    "Lu": ["LC", "L"], "Ll": ["LC", "L"], "Lt": ["LC", "L"],
+    "Lm": ["L"], "Lo": ["L"],
+    "Mn": ["M"], "Mc": ["M"], "Me": ["M"],
+    "Nd": ["N"], "Nl": ["N"], "No": ["N"],
+    "Pc": ["P"], "Pd": ["P"], "Ps": ["P"], "Pe": ["P"],
+    "Pi": ["P"], "Pf": ["P"], "Po": ["P"],
+    "Sm": ["S"], "Sc": ["S"], "Sk": ["S"], "So": ["S"],
+    "Zs": ["Z"], "Zl": ["Z"], "Zp": ["Z"],
+    "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"],
 }
 
 # these are the surrogate codepoints, which are not valid rust characters
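
The EXPANDED_CATEGORIES table above maps each specific General_Category value to the grouped categories it also belongs to, so a codepoint emitted under "Lu" can be filed under "LC" and "L" as well. A minimal illustration of that expansion (the group_categories helper is hypothetical, not part of this script):

    def group_categories(category):
        # A category names itself plus any grouped categories it rolls up into.
        return [category] + EXPANDED_CATEGORIES.get(category, [])

    assert group_categories("Lu") == ["Lu", "LC", "L"]
    assert group_categories("Zs") == ["Zs", "Z"]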
@@ -115,7 +115,7 @@ def fetch_files(version=None):
     readme_content = subprocess.check_output(("curl", readme_url))
 
     unicode_version = parse_unicode_version(
-        str(readme_content, "utf8")
+        readme_content.decode("utf8")
     )
 
     download_dir = os.path.join(FETCH_DIR, unicode_version.as_str)
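
The change from str(readme_content, "utf8") to readme_content.decode("utf8") keeps the script compatible with the Python 2.7 constraint noted at the top of the file: the two-argument str() form only decodes bytes on Python 3, whereas bytes.decode() behaves the same on both versions. A small sketch (the sample bytes are made up):

    data = b"Version 12.0.0 of the Unicode Standard."

    # Works on Python 2.7 and Python 3 alike: bytes objects have .decode().
    text = data.decode("utf8")

    # Python 3 only: str(data, "utf8") decodes there, but on Python 2.7 the
    # two-argument str() call raises TypeError.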
@@ -415,7 +415,7 @@ def compute_trie(rawdata, chunksize):
     child_data = []
     for i in range(len(rawdata) // chunksize):
         data = rawdata[i * chunksize: (i + 1) * chunksize]
-        child = '|'.join(map(str, data))
+        child = "|".join(map(str, data))
         if child not in childmap:
             childmap[child] = len(childmap)
             child_data.extend(data)
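
For context, the loop in compute_trie deduplicates chunks: each chunk is serialized into a string key with "|".join(...), a chunk already seen reuses its existing child index, and only novel chunks extend child_data, while root records one index per chunk. A standalone sketch of that idea on a tiny input (illustrative only, mirroring the function shown above):

    def dedup_chunks(rawdata, chunksize):
        # Identical chunks share one stored child; root holds one index per chunk.
        root, childmap, child_data = [], {}, []
        for i in range(len(rawdata) // chunksize):
            chunk = rawdata[i * chunksize: (i + 1) * chunksize]
            key = "|".join(map(str, chunk))
            if key not in childmap:
                childmap[key] = len(childmap)
                child_data.extend(chunk)
            root.append(childmap[key])
        return root, child_data

    # The two identical [1, 2] chunks collapse into a single stored child.
    root, children = dedup_chunks([1, 2, 1, 2, 3, 4], 2)
    assert root == [0, 0, 1] and children == [1, 2, 3, 4]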
@@ -444,34 +444,34 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
         pub_string = "pub "
     f.write("    %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
     f.write("        r1: [\n")
-    data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // chunk_size])
+    data = ",".join("0x%016x" % chunk for chunk in chunks[0:0x800 // chunk_size])
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
     # 0x800..0x10000 trie
     (r2, r3) = compute_trie(chunks[0x800 // chunk_size: 0x10000 // chunk_size], 64 // chunk_size)
     f.write("        r2: [\n")
-    data = ','.join(str(node) for node in r2)
+    data = ",".join(str(node) for node in r2)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
     f.write("        r3: &[\n")
-    data = ','.join('0x%016x' % chunk for chunk in r3)
+    data = ",".join("0x%016x" % chunk for chunk in r3)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
     # 0x10000..0x110000 trie
     (mid, r6) = compute_trie(chunks[0x10000 // chunk_size: 0x110000 // chunk_size], 64 // chunk_size)
     (r4, r5) = compute_trie(mid, 64)
     f.write("        r4: [\n")
-    data = ','.join(str(node) for node in r4)
+    data = ",".join(str(node) for node in r4)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
     f.write("        r5: &[\n")
-    data = ','.join(str(node) for node in r5)
+    data = ",".join(str(node) for node in r5)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
     f.write("        r6: &[\n")
-    data = ','.join('0x%016x' % chunk for chunk in r6)
+    data = ",".join("0x%016x" % chunk for chunk in r6)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
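
The arrays emitted here split the codepoint space into three bands: 0x0..0x800 is stored directly as 64-bit bitmap words in r1, 0x800..0x10000 goes through one level of indirection (r2 holds indices into the deduplicated words in r3), and 0x10000..0x110000 goes through two levels (r4 indexes r5, which indexes the words in r6). The Rust code that consumes the table is not part of this diff, so the following lookup is only a rough Python sketch of how such a layout could be queried, assuming 64 codepoints per bitmap word:

    def bool_trie_contains(cp, r1, r2, r3, r4, r5, r6):
        # Sketch of a lookup against the r1..r6 layout produced by emit_bool_trie.
        if cp < 0x800:
            word = r1[cp >> 6]
        elif cp < 0x10000:
            leaf = r2[(cp >> 6) - 0x20]    # 0x800 >> 6 == 0x20
            word = r3[leaf]
        else:
            child = r4[(cp >> 12) - 0x10]  # 0x10000 >> 12 == 0x10
            leaf = r5[child * 64 + ((cp >> 6) & 0x3f)]
            word = r6[leaf]
        return bool((word >> (cp & 0x3f)) & 1)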
@@ -497,12 +497,12 @@ def emit_small_bool_trie(f, name, t_data, is_pub=True):
     (r1, r2) = compute_trie(chunks, 1)
 
     f.write("        r1: &[\n")
-    data = ','.join(str(node) for node in r1)
+    data = ",".join(str(node) for node in r1)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
     f.write("        r2: &[\n")
-    data = ','.join('0x%016x' % node for node in r2)
+    data = ",".join("0x%016x" % node for node in r2)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
@@ -599,11 +599,9 @@ def main():
     print("Using Unicode version: {}".format(unicode_version.as_str))
 
     tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
-    if os.path.exists(tables_rs_path):
-        os.remove(tables_rs_path)
 
+    # will overwrite the file if it exists
     with open(tables_rs_path, "w") as rf:
-        # write the file's preamble
         rf.write(PREAMBLE)
 
         unicode_version_notice = textwrap.dedent("""