1919# Since this should not require frequent updates, we just store this
2020# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2121import collections
22+ import re
2223import urllib .request
2324from itertools import batched
2425
6768class UnicodeData (object ):
6869 def __init__ (self ):
6970 self ._load_unicode_data ()
71+ self ._load_default_ignorable_marks ()
72+
7073 self .norm_props = self ._load_norm_props ()
7174 self .norm_tests = self ._load_norm_tests ()
7275
@@ -101,6 +104,11 @@ def _load_unicode_data(self):
101104 self .general_category_mark = []
102105 self .general_category_public_assigned = []
103106
107+ # Characters that cannot be part of a combining character sequence:
108+ # control characters, format characters other than ZWJ and ZWNJ,
109+ # the line and paragraph separators, and noncharacters.
110+ self .not_in_ccs = []
111+
104112 assigned_start = 0 ;
105113 prev_char_int = - 1 ;
106114 prev_name = "" ;
@@ -126,6 +134,9 @@ def _load_unicode_data(self):
126134 if category == 'M' or 'M' in expanded_categories .get (category , []):
127135 self .general_category_mark .append (char_int )
128136
137+ if category in ['Cc' , 'Cf' , 'Zl' , 'Zp' ] and char_int not in [0x200C , 0x200D ]:
138+ self .not_in_ccs .append (char_int )
139+
129140 assert category != 'Cn' , "Unexpected: Unassigned codepoint in UnicodeData.txt"
130141 if category not in ['Co' , 'Cs' ]:
131142 if char_int != prev_char_int + 1 and not is_first_and_last (prev_name , name ):
@@ -136,6 +147,44 @@ def _load_unicode_data(self):
136147
137148 self .general_category_public_assigned .append ((assigned_start , prev_char_int ))
138149
150+ # Noncharacters cannot be part of a combining character sequence either
151+ for i in range (0xFDD0 , 0xFDF0 ):
152+ self .not_in_ccs .append (i )
153+ for prefix in range (0 , 0x11 ):
154+ shifted = prefix << 16
155+ self .not_in_ccs .append (shifted | 0xFFFE )
156+ self .not_in_ccs .append (shifted | 0xFFFF )
157+
158+ self .not_in_ccs .sort ()
159+
160+ def _load_default_ignorable_marks (self ):
161+ default_ignorable_cps = set ()
162+
163+ single = re .compile (r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+" )
164+ multiple = re .compile (
165+ r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
166+ )
167+
168+ for line in self ._fetch ("DerivedCoreProperties.txt" ).splitlines ():
169+ raw_data = None # (low, high)
170+ if match := single .match (line ):
171+ raw_data = (match .group (1 ), match .group (1 ))
172+ elif match := multiple .match (line ):
173+ raw_data = (match .group (1 ), match .group (2 ))
174+ else :
175+ continue
176+ low = int (raw_data [0 ], 16 )
177+ high = int (raw_data [1 ], 16 )
178+ for cp in range (low , high + 1 ):
179+ default_ignorable_cps .add (cp )
180+
181+ self .default_ignorable_marks = []
182+ for cp in self .general_category_mark :
183+ if cp in default_ignorable_cps :
184+ self .default_ignorable_marks .append (cp )
185+
186+ self .default_ignorable_marks .sort ()
187+
139188 def _load_cjk_compat_ideograph_variants (self ):
140189 for line in self ._fetch ("StandardizedVariants.txt" ).splitlines ():
141190 strip_comments = line .split ('#' , 1 )[0 ].strip ()
@@ -461,7 +510,7 @@ def gen_combining_mark(general_category_mark, out):
461510
462511def gen_public_assigned (general_category_public_assigned , out ):
463512 # This could be done as a hash but the table is somewhat small.
464- out .write ("#[inline]\n " )
513+ out .write ("\n #[inline]\n " )
465514 out .write ("pub fn is_public_assigned(c: char) -> bool {\n " )
466515 out .write (" match c {\n " )
467516
@@ -482,6 +531,66 @@ def gen_public_assigned(general_category_public_assigned, out):
482531 out .write (" }\n " )
483532 out .write ("}\n " )
484533
def _coalesce_codepoint_ranges(codepoints):
    """Collapse an ascending list of code points into inclusive (first, last) ranges."""
    ranges = []
    for cp in codepoints:
        if ranges and ranges[-1][1] == cp - 1:
            # Directly follows the previous range: extend it.
            ranges[-1] = (ranges[-1][0], cp)
        else:
            ranges.append((cp, cp))
    return ranges


def _gen_codepoint_predicate(fn_name, codepoints, out):
    """Write a Rust ``pub fn <fn_name>(c: char) -> bool`` that matches `codepoints`.

    `codepoints` is an ascending list of integer code points and `out` is a
    writable text stream. Consecutive code points are emitted as `..=` range
    patterns. When `codepoints` is empty only the `_ => false` arm is written,
    because a `=> true` arm with no pattern would not be valid Rust.
    """
    patterns = []
    for first, last in _coalesce_codepoint_ranges(codepoints):
        if first == last:
            patterns.append("'\\u{%s}'" % hexify(first))
        else:
            patterns.append("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))

    out.write("\n#[inline]\n")
    out.write("pub fn %s(c: char) -> bool {\n" % fn_name)
    out.write("    match c {\n")
    if patterns:
        out.write("        " + "\n        | ".join(patterns) + " => true,\n")
    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")


def gen_not_in_ccs(not_in_ccs, out):
    """Emit the Rust predicate `not_in_ccs` for the given sorted code points.

    `not_in_ccs` is the ascending list of code points that cannot be part of
    a combining character sequence; `out` is the stream the Rust source is
    written to.
    """
    _gen_codepoint_predicate("not_in_ccs", not_in_ccs, out)


def gen_default_ignorable_mark(default_ignorable_marks, out):
    """Emit the Rust predicate `is_default_ignorable_mark` for the given sorted code points.

    `default_ignorable_marks` is the ascending list of mark code points that
    are also default-ignorable; `out` is the stream the Rust source is
    written to.
    """
    _gen_codepoint_predicate("is_default_ignorable_mark", default_ignorable_marks, out)
593+
485594def gen_stream_safe (leading , trailing , out ):
486595 # This could be done as a hash but the table is very small.
487596 out .write ("#[inline]\n " )
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
602711
603712 gen_public_assigned (data .general_category_public_assigned , out )
604713
714+ gen_not_in_ccs (data .not_in_ccs , out )
715+
716+ gen_default_ignorable_mark (data .default_ignorable_marks , out )
717+
605718 gen_nfc_qc (data .norm_props , out )
606719
607720 gen_nfkc_qc (data .norm_props , out )
0 commit comments