1212
1313# This script uses the following Unicode security tables:
1414# - IdentifierStatus.txt
15+ # - IdentifierType.txt
16+ # - PropertyValueAliases.txt
17+ # - confusables.txt
1518# - ReadMe.txt
19+ # This script also uses the following Unicode UCD data:
20+ # - Scripts.txt
1621#
1722# Since this should not require frequent updates, we just store this
18- # out-of-line and check the unicode .rs file into git.
23+ # out-of-line and check the tables .rs file into git.
1924
2025import fileinput , re , os , sys , operator
2126
3843
3944UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION
4045
46+ # Download a Unicode security table file
4147def fetch (f ):
4248 if not os .path .exists (os .path .basename (f )):
4349 os .system ("curl -O http://www.unicode.org/Public/security/%s/%s"
@@ -47,6 +53,7 @@ def fetch(f):
4753 sys .stderr .write ("cannot load %s\n " % f )
4854 exit (1 )
4955
56+ # Download a UCD table file
5057def fetch_unidata (f ):
5158 if not os .path .exists (os .path .basename (f )):
5259 os .system ("curl -O http://www.unicode.org/Public/%s/ucd/%s"
@@ -56,6 +63,8 @@ def fetch_unidata(f):
5663 sys .stderr .write ("cannot load %s" % f )
5764 exit (1 )
5865
66+ # Loads code point data from IdentifierStatus.txt and
67+ # IdentifierType.txt
5968# Implementation from unicode-segmentation
6069def load_properties (f , interestingprops = None ):
6170 fetch (f )
@@ -90,6 +99,7 @@ def load_properties(f, interestingprops = None):
9099
91100 return props
92101
102+ # Loads script data from Scripts.txt
93103def load_script_properties (f , interestingprops ):
94104 fetch_unidata (f )
95105 props = {}
@@ -125,6 +135,7 @@ def load_script_properties(f, interestingprops):
125135
126136 return props
127137
138+ # Loads confusables data from confusables.txt
128139def load_confusables (f ):
129140 fetch (f )
130141 confusables = []
@@ -147,6 +158,7 @@ def load_confusables(f):
147158
148159 return confusables
149160
161+ # Loads Unicode script name correspondence from PropertyValueAliases.txt
150162def aliases ():
151163 # This function is taken from the `unicode-script` crate. If significant
152164 # changes are introduced, update accordingly.
@@ -171,6 +183,7 @@ def aliases():
171183
172184 return (longforms , shortforms )
173185
186+ # Loads Unicode script name list and correspondence mapping
174187def load_scripts (f ):
175188 # This function is taken from the `unicode-script` crate. If significant
176189 # changes are introduced, update accordingly.
@@ -192,6 +205,16 @@ def load_scripts(f):
def is_script_ignored_in_mixedscript(source):
    """Return True if `source` is a script that mixed-script detection ignores.

    `source` is a Unicode script short name. The ignored scripts are
    'Zinh', 'Zyyy' and 'Zzzz' (listed in PropertyValueAliases.txt as
    Inherited, Common and Unknown respectively); every other value
    yields False.
    """
    # A membership test keeps the list of ignored scripts in one place.
    return source in ('Zinh', 'Zyyy', 'Zzzz')
194207
208+ # When a codepoint's prototype consists of multiple codepoints,
209+ # the situation is more complex. Here we make up a few rules
210+ # to cover all the cases in confusables.txt.
211+ # The principle is that, when replacing the original codepoint with its prototype,
212+ # no "non-ignored script" either appears or disappears.
213+ #
214+ # We make up several rules to cover the cases that occur in confusables.txt.
215+ # Return True, True when we want to consider it confusable,
216+ # return True, False when we want to consider it non-confusable,
217+ # and return False, _ when new not-yet-processed cases are added in future Unicode versions.
195218def process_mixedscript_single_to_multi (item_i , script_i , proto_lst , scripts ):
196219 script_lst = script_list (proto_lst , scripts )
197220 script_lst .sort ()
@@ -239,6 +262,21 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
239262 return True
240263 return False
241264
265+ # This function loads and generates a table of all the confusable characters.
266+ # It returns a pair consisting of a `mixedscript_confusable` table and a
267+ # `mixedscript_confusable_unresolved` table.
268+ # The `mixedscript_confusable` table is a dict; its keys are Unicode script names, and each
269+ # entry has an inner dict as its value. The inner dict's keys are confusable code points
270+ # converted to strings with the `escape_char` function, and its values are pairs.
271+ # pair[0] keeps a copy of the confusable code point itself, but as an integer.
272+ # pair[1] keeps a list of all the code points that are mixed-script confusable with it,
273+ # which is only used for debugging purposes.
274+ # Note that the string 'multi' will occur in the list when pair[0] is considered
275+ # confusable with its multiple-code-point prototype.
276+ # Usually the `mixedscript_confusable_unresolved` table is empty, but it's possible
277+ # that a future Unicode version update may cause that table to become nonempty, in which
278+ # case more rules need to be added to the `process_mixedscript_single_to_multi` function
279+ # above to cover those new cases.
242280def load_potential_mixedscript_confusables (f , identifier_allowed , scripts ):
243281 # First, load all confusables data from confusables.txt
244282 confusables = load_confusables (f )
@@ -375,6 +413,7 @@ def codepoint_script(c, scripts):
375413 return script
376414 raise Exception ("Not in scripts: " + escape_char (c ))
377415
416+ # Emit some useful information for debugging when further updates happen.
378417def debug_emit_mixedscript_confusable (f , mixedscript_confusable , text , scripts ):
379418 f .write ("/* " + text + "\n " )
380419 for script , lst in mixedscript_confusable .items ():
0 commit comments