2727import os
2828import re
2929import sys
30+ import urllib .request
3031from collections import defaultdict
3132from itertools import batched
3233
UNICODE_VERSION = "15.1.0"
"""The version of the Unicode data files to download."""

NUM_CODEPOINTS = 0x110000
"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
3539
@@ -61,24 +65,28 @@ class OffsetType(enum.IntEnum):
6165
6266If this is edited, you must ensure that `emit_module` reflects your changes."""
6367
MODULE_PATH = "../src/tables.rs"
"""The path of the emitted Rust module (relative to the working directory)"""

# Plain aliases of `int`; the names suggest they annotate code point values and
# bit positions respectively (usage is outside this section -- confirm there).
Codepoint = int
BitPos = int
6973
7074
def fetch_open(filename: str, local_prefix: str = ""):
    """Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
    fetches it from `https://www.unicode.org/Public/`. Exits with code 1 on failure.

    Only the basename of `filename` is used for the on-disk copy, which is looked up
    (and, if missing, downloaded to) `local_prefix` joined with that basename. The full
    `filename` is used as the path below `{UNICODE_VERSION}/ucd/` on the server.

    The returned file object is opened in text mode as UTF-8; the caller is responsible
    for closing it (e.g. by using it in a `with` statement).
    """
    basename = os.path.basename(filename)
    localname = os.path.join(local_prefix, basename)
    try:
        if not os.path.exists(localname):
            try:
                urllib.request.urlretrieve(
                    f"https://www.unicode.org/Public/{UNICODE_VERSION}/ucd/{filename}",
                    localname,
                )
            except OSError:
                # The file did not exist before this call, so anything present now is
                # a partial download; remove it so a later run does not reuse it.
                if os.path.exists(localname):
                    os.remove(localname)
                raise
        return open(localname, encoding="utf-8")
    except OSError:
        # Covers both download failures (urllib's URLError/HTTPError derive from
        # OSError) and failures to open the local file.
        sys.stderr.write(f"cannot load {localname}")
        sys.exit(1)
8391
8492
@@ -637,7 +645,7 @@ def emit_module(
637645 module .write ("}\n " )
638646
639647
640- def main (module_filename : str ):
648+ def main (module_path : str ):
641649 """Obtain character data from the latest version of Unicode, transform it into a multi-level
642650 lookup table for character width, and write a Rust module utilizing that table to
642650 `module_path`.
@@ -677,6 +685,9 @@ def main(module_filename: str):
677685 emoji_variations = load_variation_sequences ()
678686 variation_table = make_variation_sequence_table (emoji_variations , width_map )
679687
688+ # Download normalization test file for use by tests
689+ fetch_open ("NormalizationTest.txt" , "../tests/" )
690+
680691 print ("------------------------" )
681692 total_size = 0
682693 for i , table in enumerate (tables ):
@@ -692,9 +703,9 @@ def main(module_filename: str):
692703 print ("------------------------" )
693704 print (f" Total size: { total_size } bytes" )
694705
695- emit_module (module_filename , version , tables , variation_table )
696- print (f'Wrote to "{ module_filename } "' )
706+ emit_module (module_path , version , tables , variation_table )
707+ print (f'Wrote to "{ module_path } "' )
697708
698709
# Script entry point: generate the tables and write the Rust module to MODULE_PATH.
if __name__ == "__main__":
    main(MODULE_PATH)
0 commit comments