1919# Since this should not require frequent updates, we just store this
2020# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2121import collections
22+ import re
2223import urllib .request
2324
2425UNICODE_VERSION = "15.1.0"
6667class UnicodeData (object ):
6768 def __init__ (self ):
6869 self ._load_unicode_data ()
70+ self ._load_default_ignorable_marks ()
71+
6972 self .norm_props = self ._load_norm_props ()
7073 self .norm_tests = self ._load_norm_tests ()
7174
@@ -100,6 +103,11 @@ def _load_unicode_data(self):
100103 self .general_category_mark = []
101104 self .general_category_public_assigned = []
102105
106+ # Characters that cannot be part of a combining character sequence:
107+ # control characters, format characters other than ZWJ and ZWNJ,
108+ # the line and paragraph separators, and noncharacters.
109+ self .not_in_ccs = []
110+
103111 assigned_start = 0 ;
104112 prev_char_int = - 1 ;
105113 prev_name = "" ;
@@ -125,6 +133,9 @@ def _load_unicode_data(self):
125133 if category == 'M' or 'M' in expanded_categories .get (category , []):
126134 self .general_category_mark .append (char_int )
127135
136+ if category in ['Cc' , 'Cf' , 'Zl' , 'Zp' ] and char_int not in [0x200C , 0x200D ]:
137+ self .not_in_ccs .append (char_int )
138+
128139 assert category != 'Cn' , "Unexpected: Unassigned codepoint in UnicodeData.txt"
129140 if category not in ['Co' , 'Cs' ]:
130141 if char_int != prev_char_int + 1 and not is_first_and_last (prev_name , name ):
@@ -135,6 +146,44 @@ def _load_unicode_data(self):
135146
136147 self .general_category_public_assigned .append ((assigned_start , prev_char_int ))
137148
149+ # Mark noncharacters as nongraphic
150+ for i in range (0xFDD0 , 0xFDF0 ):
151+ self .not_in_ccs .append (i )
152+ for prefix in range (0 , 0x11 ):
153+ shifted = prefix << 16
154+ self .not_in_ccs .append (shifted | 0xFFFE )
155+ self .not_in_ccs .append (shifted | 0xFFFF )
156+
157+ self .not_in_ccs .sort ()
158+
159+ def _load_default_ignorable_marks (self ):
160+ default_ignorable_cps = set ()
161+
162+ single = re .compile (r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+" )
163+ multiple = re .compile (
164+ r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
165+ )
166+
167+ for line in self ._fetch ("DerivedCoreProperties.txt" ).splitlines ():
168+ raw_data = None # (low, high)
169+ if match := single .match (line ):
170+ raw_data = (match .group (1 ), match .group (1 ))
171+ elif match := multiple .match (line ):
172+ raw_data = (match .group (1 ), match .group (2 ))
173+ else :
174+ continue
175+ low = int (raw_data [0 ], 16 )
176+ high = int (raw_data [1 ], 16 )
177+ for cp in range (low , high + 1 ):
178+ default_ignorable_cps .add (cp )
179+
180+ self .default_ignorable_marks = []
181+ for cp in self .general_category_mark :
182+ if cp in default_ignorable_cps :
183+ self .default_ignorable_marks .append (cp )
184+
185+ self .default_ignorable_marks .sort ()
186+
138187 def _load_cjk_compat_ideograph_variants (self ):
139188 for line in self ._fetch ("StandardizedVariants.txt" ).splitlines ():
140189 strip_comments = line .split ('#' , 1 )[0 ].strip ()
@@ -454,7 +503,7 @@ def gen_combining_mark(general_category_mark, out):
454503
455504def gen_public_assigned (general_category_public_assigned , out ):
456505 # This could be done as a hash but the table is somewhat small.
457- out .write ("#[inline]\n " )
506+ out .write ("\n #[inline]\n " )
458507 out .write ("pub fn is_public_assigned(c: char) -> bool {\n " )
459508 out .write (" match c {\n " )
460509
@@ -476,6 +525,66 @@ def gen_public_assigned(general_category_public_assigned, out):
476525 out .write ("}\n " )
477526 out .write ("\n " )
478527
def _gen_codepoint_predicate(fn_name, codepoints, out):
    """Write a Rust ``pub fn <fn_name>(c: char) -> bool`` to ``out``.

    The generated function is a single ``match`` that returns ``true`` for
    every code point in ``codepoints`` and ``false`` otherwise.

    ``codepoints`` is an iterable of integer code points. Consecutive values
    are merged into inclusive ranges only when they are adjacent in the input
    order, so callers should pass an ascending list. ``out`` is any object
    with a ``write(str)`` method.
    """
    # Collapse the list of code points into inclusive (first, last) ranges.
    ranges = []
    for cp in codepoints:
        if ranges and ranges[-1][1] == cp - 1:
            ranges[-1] = (ranges[-1][0], cp)
        else:
            ranges.append((cp, cp))

    out.write("\n#[inline]\n")
    out.write("pub fn %s(c: char) -> bool {\n" % fn_name)
    out.write("    match c {\n")

    # With no code points there is no pattern to put before `=> true`, and a
    # pattern-less arm is not valid Rust, so only the fallback arm is written.
    if ranges:
        arms = []
        for first, last in ranges:
            # The escape must be a backslash immediately followed by `u{`.
            if first == last:
                arms.append("'\\u{%s}'" % hexify(first))
            else:
                arms.append(
                    "'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))
                )
        out.write("        ")
        out.write("\n        | ".join(arms))
        out.write(" => true,\n")

    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")


def gen_not_in_ccs(not_in_ccs, out):
    """Emit the Rust predicate ``not_in_ccs`` for the given code points.

    ``not_in_ccs`` is the list of integer code points for which the generated
    function returns ``true``; ``out`` is the writable output stream.
    """
    _gen_codepoint_predicate("not_in_ccs", not_in_ccs, out)


def gen_default_ignorable_mark(default_ignorable_marks, out):
    """Emit the Rust predicate ``is_default_ignorable_mark``.

    ``default_ignorable_marks`` is the list of integer code points for which
    the generated function returns ``true``; ``out`` is the writable output
    stream.
    """
    _gen_codepoint_predicate(
        "is_default_ignorable_mark", default_ignorable_marks, out
    )
587+
479588def gen_stream_safe (leading , trailing , out ):
480589 # This could be done as a hash but the table is very small.
481590 out .write ("#[inline]\n " )
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
602711 gen_public_assigned (data .general_category_public_assigned , out )
603712 out .write ("\n " )
604713
714+ gen_not_in_ccs (data .not_in_ccs , out )
715+
716+ gen_default_ignorable_mark (data .default_ignorable_marks , out )
717+
605718 gen_nfc_qc (data .norm_props , out )
606719 out .write ("\n " )
607720
0 commit comments