@@ -47,6 +47,15 @@ def fetch(f):
4747 sys .stderr .write ("cannot load %s\n " % f )
4848 exit (1 )
4949
def fetch_unidata(f):
    """Download a UCD data file into the current directory if it is missing.

    ``f`` is the file's path below the ``ucd/`` directory of the Unicode
    release selected by ``UNICODE_VERSION_NUMBER``; the local copy is looked
    up under its basename. If the file still does not exist after the
    download attempt, a message is written to stderr and the process exits
    with status 1.
    """
    # Function-scope import: nothing else in this block's view needs it.
    import subprocess

    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        url = ("http://www.unicode.org/Public/%s/ucd/%s"
               % (UNICODE_VERSION_NUMBER, f))
        try:
            # An argument list keeps the URL away from any shell. -f makes
            # curl fail on an HTTP error instead of saving the server's
            # error page under the data file's name, which would otherwise
            # satisfy the existence check below.
            subprocess.call(["curl", "-f", "-O", url])
        except OSError:
            # curl could not be started; the check below reports the failure.
            pass

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)
58+
5059# Implementation from unicode-segmentation
5160def load_properties (f , interestingprops = None ):
5261 fetch (f )
@@ -81,6 +90,41 @@ def load_properties(f, interestingprops = None):
8190
8291 return props
8392
def load_script_properties(f, interestingprops):
    """Parse a UCD property file into ``{property value: [(lo, hi), ...]}``.

    Args:
        f: name of the data file; fetched with ``fetch_unidata`` first and
            then read from the current directory under its basename.
        interestingprops: collection of property values to keep. Any falsy
            value (``None``, ``[]``) keeps every property value.

    Returns:
        A dict mapping each property value to a list of inclusive
        ``(lo, hi)`` integer code point ranges, in the order they appear in
        the file. A single code point is stored as ``(cp, cp)``.
    """
    fetch_unidata(f)
    props = {}
    # Note: this regex is different from those in unicode-segmentation,
    # because we need to handle spaces here. One pattern covers both line
    # shapes, "XXXX ; value #" and "XXXX..YYYY ; value #"; group 2 is None
    # for the single-code-point form.
    entry_re = re.compile(
        r"^ *([0-9A-F]+)(?:\.\.([0-9A-F]+))? *; *([^#]+) *#")

    # A context-managed open() closes the file and avoids fileinput's
    # module-global state.
    with open(os.path.basename(f)) as data:
        for line in data:
            m = entry_re.match(line)
            if not m:
                continue
            prop = m.group(3).strip()
            if interestingprops and prop not in interestingprops:
                continue
            d_lo = int(m.group(1), 16)
            d_hi = d_lo if m.group(2) is None else int(m.group(2), 16)
            props.setdefault(prop, []).append((d_lo, d_hi))

    return props
127+
84128def load_confusables (f ):
85129 fetch (f )
86130 confusables = []
@@ -97,12 +141,244 @@ def load_confusables(f):
97141 raise Exception ('More than one code point in first column' )
98142 d_input = int (d_inputs [0 ].strip (), 16 )
99143 for d_output in m .group (2 ).split ():
100- d_outputitem = int (d_output , 16 );
101- d_outputs .append (d_outputitem );
144+ d_outputitem = int (d_output , 16 )
145+ d_outputs .append (d_outputitem )
102146 confusables .append ((d_input , d_outputs ))
103147
104148 return confusables
105149
def aliases():
    """
    Fetch the shorthand aliases for each longhand Script name

    Reads the ``sc`` lines of PropertyValueAliases.txt and returns the pair
    ``(longforms, shortforms)``: ``longforms`` maps the first value field of
    each line to the second, and ``shortforms`` is the reverse mapping.

    Raises an Exception if either field occurs on more than one ``sc`` line.
    """
    filename = "PropertyValueAliases.txt"
    fetch_unidata(filename)
    longforms = {}
    shortforms = {}
    alias_re = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")
    with open(filename) as data:
        for line in data:
            m = alias_re.match(line)
            if not m:
                continue
            # \w+ cannot capture surrounding whitespace, so no strip needed.
            short_name, long_name = m.group(1), m.group(2)
            # Explicit checks rather than assert: this validates downloaded
            # data and must not disappear under `python -O`.
            if short_name in longforms:
                raise Exception("duplicate script alias: %s" % short_name)
            if long_name in shortforms:
                raise Exception("duplicate script name: %s" % long_name)
            longforms[short_name] = long_name
            shortforms[long_name] = short_name

    return (longforms, shortforms)
171+
def load_scripts(f):
    """Load the script table for the property file ``f``.

    Returns ``(longforms, script_table)`` where ``longforms`` is the first
    mapping returned by ``aliases()`` and ``script_table`` is a list of
    ``(lo, hi, short_script)`` tuples sorted by ``lo``. Every script found
    in the file is included.

    Raises KeyError if a script in the file has no entry in the alias table.
    """
    (longforms, shortforms) = aliases()
    scripts = load_script_properties(f, [])

    # The original also built a filtered, sorted list of short names here
    # that was never read or returned (and shadowed the module-level
    # script_list function); it has been dropped.
    script_table = []
    for script, ranges in scripts.items():
        short_name = shortforms[script]
        script_table.extend((lo, hi, short_name) for (lo, hi) in ranges)
    script_table.sort(key=lambda w: w[0])
    return (longforms, script_table)
186+
def is_script_ignored_in_mixedscript(source):
    """Return True when ``source`` is one of the codes 'Zinh', 'Zyyy' or 'Zzzz'."""
    return source in ('Zinh', 'Zyyy', 'Zzzz')
189+
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
    """Classify a source code point whose prototype is a code point sequence.

    ``script_i`` is the script of the source; ``proto_lst`` is the prototype
    sequence, whose distinct scripts are obtained via ``script_list``.
    ``item_i`` is accepted but not consulted.

    Returns ``(processed, should_add)``: ``processed`` is False when none of
    the rules below applies, ``should_add`` is True when the source should
    be recorded as a mixed-script confusable.
    """
    # here's a few rules to process current version of Unicode data (13.0 at this time)
    proto_scripts = sorted(script_list(proto_lst, scripts))
    count = len(proto_scripts)
    assert (count > 0)
    source_ignored = is_script_ignored_in_mixedscript(script_i)

    if count == 1:
        # Rule: A - A -> Processed, DontAdd
        if proto_scripts[0] == script_i:
            return True, False
        # Rules: A - B and (Zinh | Zyyy | Zzzz) - A -> Processed, Add.
        # With the scripts already known to differ, both reduce to "the
        # source's own script is not an ignored one".
        if not source_ignored:
            return True, True
        return False, False

    # Rule: A ... - A -> Processed, DontAdd
    if script_i in proto_scripts:
        return True, False

    # Rules for exactly two prototype scripts, none equal to the source's:
    # Processed, Add when the source's script is not ignored and at least
    # one of the two prototype scripts is (Zinh | Zyyy | Zzzz).
    if count == 2 and not source_ignored:
        if any(is_script_ignored_in_mixedscript(s) for s in proto_scripts):
            return True, True

    # NotProcessed, DontAdd
    return False, False
230+
def is_codepoint_identifier_allowed(c, identifier_allowed):
    """Return True if ``c`` falls inside any inclusive range of ``identifier_allowed``.

    Each element of ``identifier_allowed`` is indexed as ``[0]`` (low bound)
    and ``[1]`` (high bound).
    """
    return any(bounds[0] <= c <= bounds[1] for bounds in identifier_allowed)
236+
def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
    """Group confusable code points by script for mixed-script detection.

    Args:
        f: confusables data file, loaded through ``load_confusables``.
        identifier_allowed: inclusive ranges as consumed by
            ``is_codepoint_identifier_allowed``.
        scripts: script table as consumed by ``codepoint_script``.

    Returns:
        ``(mixedscript_confusable, mixedscript_confusable_unresolved)``.
        ``mixedscript_confusable`` maps script -> escaped code point ->
        ``(code point, [confusable items])``, where an item is a code point
        or the marker string ``'multi'``.
        ``mixedscript_confusable_unresolved`` maps escaped prototype sequence
        -> ``(prototype list, [source code points])`` for the cases that
        ``process_mixedscript_single_to_multi`` reports as not processed.
    """
    confusables = load_confusables(f)

    # Escaped form of every source whose prototype is a single code point.
    # Only membership is ever tested, so a set is enough.
    seekup = set()
    for d_source, d_proto_list in confusables:
        assert (len(d_proto_list) > 0)
        if len(d_proto_list) == 1:
            seekup.add(escape_char(d_source))

    # collect prototypes
    codepoint_map = {}
    multicodepoint_map = {}
    for d_source, d_proto_list in confusables:
        if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
            continue
        if len(d_proto_list) == 1:
            d_proto = escape_char(d_proto_list[0])
            if d_proto not in codepoint_map:
                codepoint_map[d_proto] = []
                # NOTE(review): the prototype itself is added once, when its
                # entry is created, provided it is not a source elsewhere and
                # is identifier-allowed. The nesting was reconstructed from a
                # whitespace-collapsed source -- confirm.
                if (d_proto not in seekup
                        and is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed)):
                    codepoint_map[d_proto].append(d_proto_list[0])
            codepoint_map[d_proto].append(d_source)
        else:
            d_protos = escape_char_list(d_proto_list)
            if d_protos not in multicodepoint_map:
                multicodepoint_map[d_protos] = (d_proto_list, [])
            multicodepoint_map[d_protos][1].append(d_source)

    mixedscript_confusable = {}

    def confusable_entry_item(script, item):
        # Return the confusable list for `item` under `script`, creating
        # the nested entries on first use.
        script_entry = mixedscript_confusable.setdefault(script, {})
        return script_entry.setdefault(escape_char(item), (item, []))[1]

    def add_cross_script_pairs(sources):
        # Record every pair of code points in `sources` whose scripts
        # differ, under each side whose script is not an ignored one.
        for i in range(len(sources) - 1):
            item_i = sources[i]
            for item_j in sources[i + 1:]:
                script_i = codepoint_script(item_i, scripts)
                script_j = codepoint_script(item_j, scripts)
                if script_i == script_j:
                    continue
                if not is_script_ignored_in_mixedscript(script_i):
                    confusable_entry_item(script_i, item_i).append(item_j)
                if not is_script_ignored_in_mixedscript(script_j):
                    confusable_entry_item(script_j, item_j).append(item_i)

    # between single charpoint that has single charpoint prototype
    for sources in codepoint_map.values():
        add_cross_script_pairs(sources)

    # between single charpoint that has multi charpoint prototype
    for _, sources in multicodepoint_map.values():
        add_cross_script_pairs(sources)

    mixedscript_confusable_unresolved = {}
    # single charpoint that has multi charpoint prototype and its prototype
    for proto_lst_text, (proto_lst, sources) in multicodepoint_map.items():
        # Skip prototypes that could not appear in an identifier at all.
        if not all(is_codepoint_identifier_allowed(c, identifier_allowed)
                   for c in proto_lst):
            continue
        for item_i in sources:
            script_i = codepoint_script(item_i, scripts)
            if is_script_ignored_in_mixedscript(script_i):
                continue
            processed, should_add = process_mixedscript_single_to_multi(
                item_i, script_i, proto_lst, scripts)
            if should_add:
                assert (processed)
                confusable_entry_item(script_i, item_i).append('multi')
            if processed:
                continue
            # The map key is already escape_char_list(proto_lst).
            mixedscript_confusable_unresolved.setdefault(
                proto_lst_text, (proto_lst, []))[1].append(item_i)
    return (mixedscript_confusable, mixedscript_confusable_unresolved)
335+
def codepoint_script(c, scripts):
    """Return the script of the first ``(lo, hi, script)`` entry whose inclusive range contains ``c``.

    Raises an Exception naming the escaped code point when no entry matches.
    """
    for lo, hi, name in scripts:
        if lo <= c <= hi:
            return name
    raise Exception("Not in scripts: " + escape_char(c))
341+
def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts):
    """Write the mixed-script confusable map to ``f`` inside a ``/* ... */`` block.

    For every script, each source code point is listed in ascending order
    together with its escaped target list and the scripts of those targets.
    """
    f.write("/* " + text + "\n ")
    for script, entries in mixedscript_confusable.items():
        f.write("/// Script - " + script + "\n ")
        for source in sorted(entry[0] for entry in entries.values()):
            source_text = escape_char(source)
            # Entries are keyed by the escaped form of their source.
            targets = entries[source_text][1]
            f.write("".join((
                source_text,
                " => ",
                escape_char_list(targets),
                " // ",
                escape_script_list(targets, scripts),
                "\n ",
            )))
    f.write("*/\n ")
354+
355+
def script_list(char_lst, scripts):
    """Return the distinct scripts of ``char_lst`` in first-seen order.

    The marker item ``'multi'`` is reported as the pseudo-script
    ``'Z~multi'``; every other item is looked up with ``codepoint_script``.
    """
    seen = []
    for c in char_lst:
        script = 'Z~multi' if c == 'multi' else codepoint_script(c, scripts)
        if script not in seen:
            seen.append(script)
    return seen
366+
def escape_script_list(char_lst, scripts):
    """Return ``str()`` of the sorted list of distinct scripts found in ``char_lst``."""
    return str(sorted(script_list(char_lst, scripts)))
371+
def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts):
    """Print the unresolved prototype cases and then raise.

    Does nothing when ``map`` is empty. Otherwise prints one line per
    prototype (to stdout; ``f`` is not written to) and raises an Exception
    asking for new rules to be added to this script.
    """
    if len(map) == 0:
        return
    print("// " + text + "\n ")
    for prototype_text in map:
        pair = map[prototype_text]
        prototype, source = pair[0], pair[1]
        print("".join((
            prototype_text,
            " => ",
            escape_char_list(source),
            " // ",
            escape_script_list(prototype, scripts),
            " => ",
            escape_script_list(source, scripts),
            "\n ",
        )))
    raise Exception("update the python script to add new rules for new data")
381+
106382def format_table_content (f , content , indent ):
107383 line = " " * indent
108384 first = True
@@ -119,18 +395,20 @@ def format_table_content(f, content, indent):
119395 f .write (line )
120396
def escape_char(c):
    """Return the Rust source text for ``c``.

    ``c`` is either an integer code point, rendered as a Rust char literal
    with a ``\\u{...}`` escape in lowercase hex, or the marker string
    ``'multi'``, rendered as a placeholder string literal.
    """
    if c == 'multi':
        return "\" <multiple code points>\" "
    # No space between the backslash and `u`: `'\ u{41}'` is not a valid
    # Rust character escape.
    return "'\\u{%x}'" % c
123401
def escape_char_list(l):
    """Return the items of ``l`` rendered with ``escape_char``, comma-separated inside ``[...]``."""
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135413
136414def emit_table (f , name , t_data , t_type = "&'static [(char, char)]" , is_pub = True ,
@@ -226,7 +504,7 @@ def emit_confusable_detection_module(f):
226504 confusable_table .sort (key = lambda w : w [0 ])
227505
228506 last_key = None
229- for (k , v ) in confusable_table :
507+ for (k , _ ) in confusable_table :
230508 if k == last_key :
231509 raise Exception ("duplicate keys in confusables table: %s" % k )
232510 last_key = k
@@ -235,6 +513,40 @@ def emit_confusable_detection_module(f):
235513 pfun = lambda x : "(%s, &%s)" % (escape_char (x [0 ]), escape_char_list (x [1 ])))
236514 f .write ("}\n \n " )
237515
def escape_script_constant(name, longforms):
    """Return ``Script::<long name>`` for ``name``, looked up in ``longforms`` and stripped of surrounding whitespace."""
    long_name = longforms[name]
    return "".join(("Script::", long_name.strip()))
518+
def emit_rustc_mixed_script_confusable_detection(f):
    """Write the ``rustc_mixed_script_confusable_detection`` Rust module to ``f``.

    Emits the module header and lookup function, then a ``CONFUSABLES``
    table of ``(char, Script)`` pairs sorted by code point, built from
    IdentifierStatus.txt, Scripts.txt and confusables.txt.
    """
    f.write("pub mod rustc_mixed_script_confusable_detection {")
    f.write("""
use unicode_script::Script;

#[inline]
pub fn is_rustc_mixed_script_confusable(c: char) -> Option<Script> {
match c as usize {
_ => super::util::bsearch_value_table(c, CONFUSABLES)
}
}

""")
    identifier_allowed = load_properties("IdentifierStatus.txt")['Allowed']
    longforms, scripts = load_scripts("Scripts.txt")
    (mixedscript_confusable, mixedscript_confusable_unresolved) = load_rustc_mixedscript_confusables(
        "confusables.txt", identifier_allowed, scripts)

    debug = False
    if debug:
        # NOTE(review): both debug calls are assumed to sit under this
        # flag; the source's indentation was collapsed -- confirm.
        debug_emit_mixedscript_confusable(
            f, mixedscript_confusable, "mixedscript_confusable", scripts)
        debug_emit_mixedscript_confusable_unresolved(
            f, mixedscript_confusable_unresolved, "mixedscript_confusable_unresolved", scripts)

    # One (code point, script) row per recorded source; sorted() is stable,
    # as list.sort() is.
    confusable_table = sorted(
        ((pair[0], script)
         for script, entries in mixedscript_confusable.items()
         for pair in entries.values()),
        key=lambda w: w[0])
    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, Script)]", is_pub=False,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_script_constant(x[1], longforms)))
    f.write("}\n \n ")
549+
238550
239551def emit_util_mod (f ):
240552 f .write ("""
@@ -301,3 +613,5 @@ def emit_util_mod(f):
301613 emit_identifier_module (rf )
302614 ### confusable_detection module
303615 emit_confusable_detection_module (rf )
616+ ### mixed_script_confusable_detection module
617+ emit_rustc_mixed_script_confusable_detection (rf )
0 commit comments