@@ -98,6 +98,10 @@ def _load_unicode_data(self):
9898 self .compat_decomp = {}
9999 self .canon_decomp = {}
100100 self .general_category_mark = []
101+ self .general_category_public_assigned = []
102+
103+ assigned_start = 0 ;
104+ prev_char_int = - 1 ;
101105
102106 for line in self ._fetch ("UnicodeData.txt" ).splitlines ():
103107 # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
@@ -120,6 +124,15 @@ def _load_unicode_data(self):
120124 if category == 'M' or 'M' in expanded_categories .get (category , []):
121125 self .general_category_mark .append (char_int )
122126
127+ assert category != 'Cn' , "Unexpected: Unassigned codepoint in UnicodeData.txt"
128+ if category not in ['Co' , 'Cs' ]:
129+ if char_int != prev_char_int + 1 :
130+ self .general_category_public_assigned .append ((assigned_start , prev_char_int ))
131+ assigned_start = char_int
132+ prev_char_int = char_int
133+
134+ self .general_category_public_assigned .append ((assigned_start , prev_char_int ))
135+
123136 def _load_cjk_compat_ideograph_variants (self ):
124137 for line in self ._fetch ("StandardizedVariants.txt" ).splitlines ():
125138 strip_comments = line .split ('#' , 1 )[0 ].strip ()
@@ -418,6 +431,30 @@ def gen_combining_mark(general_category_mark, out):
418431 gen_mph_data ('combining_mark' , general_category_mark , 'u32' ,
419432 lambda k : '0x{:04x}' .format (k ))
420433
def gen_public_assigned(general_category_public_assigned, out):
    """Write the Rust function ``is_public_assigned`` to *out*.

    The generated function is a single ``match`` on a ``char``: it returns
    ``true`` for every code point matched by one of the given ranges and
    ``false`` for everything else.

    Args:
        general_category_public_assigned: iterable of ``(first, last)``
            code point pairs. Each pair becomes one Rust pattern: a single
            char literal when ``first == last``, otherwise an inclusive
            ``first..=last`` range.
        out: text sink exposing ``write(str)``.
    """
    # This could be done as a hash but the table is somewhat small.
    out.write("#[inline]\n")
    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
    out.write("    match c {\n")

    # One pattern per range; `\u{...}` must be emitted without any embedded
    # whitespace, otherwise it is not a valid Rust character escape.
    patterns = []
    for first, last in general_category_public_assigned:
        if first == last:
            patterns.append("'\\u{%s}'" % hexify(first))
        else:
            patterns.append(
                "'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))
            )

    # With an empty table there is nothing to put before `=> true`; a match
    # arm without a pattern would not parse, so only the fallback arm is
    # written in that case.
    if patterns:
        out.write("        ")
        out.write("\n        | ".join(patterns))
        out.write("\n        => true,\n")

    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")
457+
421458def gen_stream_safe (leading , trailing , out ):
422459 # This could be done as a hash but the table is very small.
423460 out .write ("#[inline]\n " )
@@ -540,6 +577,9 @@ def minimal_perfect_hash(d):
540577 gen_combining_mark (data .general_category_mark , out )
541578 out .write ("\n " )
542579
580+ gen_public_assigned (data .general_category_public_assigned , out )
581+ out .write ("\n " )
582+
543583 gen_nfc_qc (data .norm_props , out )
544584 out .write ("\n " )
545585
0 commit comments