@@ -148,9 +148,11 @@ def load_confusables(f):
148148 return confusables
149149
150150def aliases ():
151- """
152- Fetch the shorthand aliases for each longhand Script name
153- """
151+ # This function is taken from the `unicode-script` crate. If significant
152+ # changes are introduced, update accordingly.
153+
154+ # Note that this file lives in the UCD directly, not in the security directory,
155+ # so we use the `fetch_unidata` function to download it.
154156 fetch_unidata ("PropertyValueAliases.txt" )
155157 longforms = {}
156158 shortforms = {}
@@ -170,6 +172,9 @@ def aliases():
170172 return (longforms , shortforms )
171173
172174def load_scripts (f ):
175+ # This function is taken from the `unicode-script` crate. If significant
176+ # changes are introduced, update accordingly.
177+
173178 (longforms , shortforms ) = aliases ()
174179 scripts = load_script_properties (f , [])
175180
@@ -235,31 +240,52 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
235240 return False
236241
237242def load_rustc_mixedscript_confusables (f , identifier_allowed , scripts ):
243+ # First, load all confusables data from confusables.txt
238244 confusables = load_confusables (f )
245+
246+ # The confusables.txt data is reductive, meaning that it is intended to be used for
247+ # on-the-fly substitutions. A code point that does not occur in the file can be
248+ # seen as a substitute for itself. So if confusables.txt says A -> C, B -> C,
249+ # and implicitly C -> C, it means A <-> B, A <-> C, B <-> C are confusable.
250+
251+ # Here we first build a dict that contains all As and Bs whose corresponding C is a single code point.
239252 seekup_map = {}
240253 for item in confusables :
241254 d_proto_list = item [1 ]
242255 d_source = item [0 ]
243256 assert (len (d_proto_list ) > 0 )
244257 if len (d_proto_list ) == 1 :
245258 seekup_map [escape_char (d_source )] = d_proto_list
246- # collect prototypes
259+
260+ # Here we divide all confusable lhs and rhs (prototype) operands of the substitutions into equivalence classes.
261+ # In principle we use the rhs operand as the representative element of its equivalence class.
262+ # However, some rhs operands are a single code point, while others are not.
263+ # Here we collect them separately into `codepoint_map` and `multicodepoint_map`.
247264 codepoint_map = {}
248265 multicodepoint_map = {}
249266 for item in confusables :
250267 d_source = item [0 ]
268+ # According to the RFC, we'll skip those code points that are restricted from identifier usage.
251269 if not is_codepoint_identifier_allowed (d_source , identifier_allowed ):
252270 continue
253271 d_proto_list = item [1 ]
254272 if len (d_proto_list ) == 1 :
255273 d_proto = escape_char (d_proto_list [0 ])
274+ # we use the escaped representation of rhs as key to the dict when creating new equivalence class.
256275 if d_proto not in codepoint_map :
257276 codepoint_map [d_proto ] = []
277+ # When we create a new equivalence class, we check whether the representative element should be collected.
278+ # i.e. if it is not subject to substitution, and not restricted from identifier usage,
279+ # we collect it into the equivalence class.
258280 if d_proto not in seekup_map and is_codepoint_identifier_allowed (d_proto_list [0 ], identifier_allowed ):
259281 codepoint_map [d_proto ].append (d_proto_list [0 ])
282+ # we collect the original code point to be substituted into this list.
260283 codepoint_map [d_proto ].append (d_source )
261284 else :
262285 d_protos = escape_char_list (d_proto_list )
286+ # difference in multi code point case: the rhs part is not directly usable, however we store it in
287+ # dict for further special examination between each lhs and this multi code point rhs.
288+ # and there's an extra level of tuple here.
263289 if d_protos not in multicodepoint_map :
264290 multicodepoint_map [d_protos ] = (d_proto_list , [])
265291 multicodepoint_map [d_protos ][1 ].append (d_source )
@@ -274,24 +300,33 @@ def confusable_entry_item(confusable, script, item_text, item):
274300 script_entry [item_text ] = (item , [])
275301 return script_entry [item_text ][1 ]
276302
277- # between single charpoint that has single charpoint prototype
303+ # First, let's examine the case of code points that have a single code point prototype.
278304 for _ , source in codepoint_map .items ():
279305 source_len = len (source )
306+ # Examine each pair in the equivalence class
280307 for i in range (0 , source_len - 1 ):
281308 for j in range (i + 1 , source_len ):
282309 item_i , item_j = source [i ], source [j ]
283310 script_i , script_j = codepoint_script (item_i , scripts ), codepoint_script (item_j , scripts )
311+ # If they're in the same script, just skip this pair.
284312 if script_i == script_j :
285313 continue
314+ # If `item_i` (the first) is in a non-ignored script, and `item_j` (the second) is in a different one (maybe ignored),
315+ # this means that the usage of `item_i` can be suspicious when it occurs in a document that is written in `script_j`.
316+ # We'll consider it a mixed_script_confusable code point.
286317 if not is_script_ignored_in_mixedscript (script_i ):
318+ # store it within the map, saving as much information as possible, for further investigation on the final results.
287319 confusable_entry_item (mixedscript_confusable , script_i , escape_char (item_i ), item_i ).append (item_j )
320+ # Do the same in reverse from `item_j` to `item_i`
288321 if not is_script_ignored_in_mixedscript (script_j ):
289322 confusable_entry_item (mixedscript_confusable , script_j , escape_char (item_j ), item_j ).append (item_i )
290323
291- # between single charpoint that has multi charpoint prototype
324+ # Then let's examine the case of code points that have a multiple code point prototype.
325+ # We'll check between the code points that share the same prototype.
292326 for _ , proto_lst_and_source in multicodepoint_map .items ():
293327 source = proto_lst_and_source [1 ]
294328 source_len = len (source )
329+ # This is basically the same as the single code point case.
295330 for i in range (0 , source_len - 1 ):
296331 for j in range (i + 1 , source_len ):
297332 item_i , item_j = source [i ], source [j ]
@@ -304,10 +339,11 @@ def confusable_entry_item(confusable, script, item_text, item):
304339 confusable_entry_item (mixedscript_confusable , script_j , escape_char (item_j ), item_j ).append (item_i )
305340
306341 mixedscript_confusable_unresolved = {}
307- # single charpoint that has multi charpoint prototype and its prototype
342+ # We'll also check each code point against its multiple code point prototype.
308343 for _ , proto_lst_and_source in multicodepoint_map .items ():
309344 proto_lst = proto_lst_and_source [0 ]
310345 proto_lst_can_be_part_of_identifier = True
346+ # If the prototype contains one or more restricted code points, then we skip it.
311347 for c in proto_lst :
312348 if not is_codepoint_identifier_allowed (c , identifier_allowed ):
313349 proto_lst_can_be_part_of_identifier = False
@@ -318,15 +354,25 @@ def confusable_entry_item(confusable, script, item_text, item):
318354 source_len = len (source )
319355 for i in range (0 , source_len ):
320356 item_i = source [i ]
357+ # So here we're just checking whether the single code point should be considered confusable.
321358 script_i = codepoint_script (item_i , scripts )
359+ # If it's in ignored script, we don't need to do anything here.
322360 if is_script_ignored_in_mixedscript (script_i ):
323361 continue
362+ # Here are some rules for examining whether the single code point should be considered confusable.
363+ # The principle is that, when substitution happens, no new non-ignored scripts are introduced, and its
364+ # own script is not lost.
324365 processed , should_add = process_mixedscript_single_to_multi (item_i , script_i , proto_lst , scripts )
325366 if should_add :
326367 assert (processed )
368+ # Mark the single code point as confusable.
327369 confusable_entry_item (mixedscript_confusable , script_i , escape_char (item_i ), item_i ).append ('multi' )
328370 if processed :
371+ # Finished dealing with this code point.
329372 continue
373+ # If it's not processed, we must be dealing with a newer version of the Unicode data that introduced some significant
374+ # changes. We don't throw an exception here; instead we collect it into a table for debugging purposes, and throw
375+ # an exception after we have returned and printed the table out.
330376 proto_lst_text = escape_char_list (proto_lst )
331377 if not proto_lst_text in mixedscript_confusable_unresolved :
332378 mixedscript_confusable_unresolved [proto_lst_text ] = (proto_lst , [])
0 commit comments