@@ -76,6 +76,8 @@ def stats(name, table):
7676 stats ("Canonical fully decomp" , self .canon_fully_decomp )
7777 stats ("Compatible fully decomp" , self .compat_fully_decomp )
7878
79+ self .ss_leading , self .ss_trailing = self ._compute_stream_safe_tables ()
80+
7981 def _fetch (self , filename ):
8082 resp = requests .get (UCD_URL + filename )
8183 return resp .text
@@ -91,17 +93,18 @@ def _load_unicode_data(self):
9193 pieces = line .split (';' )
9294 assert len (pieces ) == 15
9395 char , category , cc , decomp = pieces [0 ], pieces [2 ], pieces [3 ], pieces [5 ]
96+ char_int = int (char , 16 )
9497
9598 if cc != '0' :
96- self .combining_classes [char ] = cc
99+ self .combining_classes [char_int ] = cc
97100
98101 if decomp .startswith ('<' ):
99- self .compat_decomp [char ] = decomp .split ()[1 :]
102+ self .compat_decomp [char_int ] = [ int ( c , 16 ) for c in decomp .split ()[1 :] ]
100103 elif decomp != '' :
101- self .canon_decomp [char ] = decomp .split ()
104+ self .canon_decomp [char_int ] = [ int ( c , 16 ) for c in decomp .split ()]
102105
103106 if category == 'M' or 'M' in expanded_categories .get (category , []):
104- self .general_category_mark .append (char )
107+ self .general_category_mark .append (char_int )
105108
106109 def _load_norm_props (self ):
107110 props = collections .defaultdict (list )
@@ -146,14 +149,13 @@ def _compute_canonical_comp(self):
146149 (int (low , 16 ), int (high or low , 16 ))
147150 for low , high , _ in self .norm_props ["Full_Composition_Exclusion" ]
148151 ]
149- for char , decomp in self .canon_decomp .items ():
150- char_int = int (char , 16 )
152+ for char_int , decomp in self .canon_decomp .items ():
151153 if any (lo <= char_int <= hi for lo , hi in comp_exclusions ):
152154 continue
153155
154156 assert len (decomp ) == 2
155157 assert (decomp [0 ], decomp [1 ]) not in canon_comp
156- canon_comp [(decomp [0 ], decomp [1 ])] = char
158+ canon_comp [(decomp [0 ], decomp [1 ])] = char_int
157159
158160 return canon_comp
159161
@@ -181,15 +183,6 @@ def _compute_fully_decomposed(self):
181183 S_BASE , L_COUNT , V_COUNT , T_COUNT = 0xAC00 , 19 , 21 , 28
182184 S_COUNT = L_COUNT * V_COUNT * T_COUNT
183185
184- canon_decomp = {
185- int (k , 16 ): [int (c , 16 ) for c in v ]
186- for k , v in self .canon_decomp .items ()
187- }
188- compat_decomp = {
189- int (k , 16 ): [int (c , 16 ) for c in v ]
190- for k , v in self .compat_decomp .items ()
191- }
192-
193186 def _decompose (char_int , compatible ):
194187 # 7-bit ASCII never decomposes
195188 if char_int <= 0x7f :
@@ -199,15 +192,15 @@ def _decompose(char_int, compatible):
199192 # Assert that we're handling Hangul separately.
200193 assert not (S_BASE <= char_int < S_BASE + S_COUNT )
201194
202- decomp = canon_decomp .get (char_int )
195+ decomp = self . canon_decomp .get (char_int )
203196 if decomp is not None :
204197 for decomposed_ch in decomp :
205198 for fully_decomposed_ch in _decompose (decomposed_ch , compatible ):
206199 yield fully_decomposed_ch
207200 return
208201
209- if compatible and char_int in compat_decomp :
210- for decomposed_ch in compat_decomp [char_int ]:
202+ if compatible and char_int in self . compat_decomp :
203+ for decomposed_ch in self . compat_decomp [char_int ]:
211204 for fully_decomposed_ch in _decompose (decomposed_ch , compatible ):
212205 yield fully_decomposed_ch
213206 return
@@ -216,12 +209,13 @@ def _decompose(char_int, compatible):
216209 return
217210
218211 end_codepoint = max (
219- max (canon_decomp .keys ()),
220- max (compat_decomp .keys ()),
212+ max (self . canon_decomp .keys ()),
213+ max (self . compat_decomp .keys ()),
221214 )
222215
223- canon_fully_decomposed = {}
224- compat_fully_decomposed = {}
216+ canon_fully_decomp = {}
217+ compat_fully_decomp = {}
218+
225219 for char_int in range (0 , end_codepoint + 1 ):
226220 # Always skip Hangul, since it's more efficient to represent its
227221 # decomposition programmatically.
@@ -230,31 +224,75 @@ def _decompose(char_int, compatible):
230224
231225 canon = list (_decompose (char_int , False ))
232226 if not (len (canon ) == 1 and canon [0 ] == char_int ):
233- canon_fully_decomposed [char_int ] = canon
227+ canon_fully_decomp [char_int ] = canon
234228
235229 compat = list (_decompose (char_int , True ))
236230 if not (len (compat ) == 1 and compat [0 ] == char_int ):
237- compat_fully_decomposed [char_int ] = compat
231+ compat_fully_decomp [char_int ] = compat
238232
239- # Since canon_decomp is a subset of compat_decomp , we don't need to
240- # store their overlap when they agree. When they don't agree, store the
241- # decomposition in the compatibility table since we'll check that first
242- # when normalizing to NFKD.
243- assert canon_fully_decomposed <= compat_fully_decomposed
233+ # Since canon_fully_decomp is a subset of compat_fully_decomp , we don't
234+ # need to store their overlap when they agree. When they don't agree,
235+ # store the decomposition in the compatibility table since we'll check
236+ # that first when normalizing to NFKD.
237+ assert canon_fully_decomp <= compat_fully_decomp
244238
245- for ch in set (canon_fully_decomposed ) & set (compat_fully_decomposed ):
246- if canon_fully_decomposed [ch ] == compat_fully_decomposed [ch ]:
247- del compat_fully_decomposed [ch ]
239+ for ch in set (canon_fully_decomp ) & set (compat_fully_decomp ):
240+ if canon_fully_decomp [ch ] == compat_fully_decomp [ch ]:
241+ del compat_fully_decomp [ch ]
242+
243+ return canon_fully_decomp , compat_fully_decomp
244+
245+ def _compute_stream_safe_tables (self ):
246+ """
247+ To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
248+ we need to be able to know the number of contiguous non-starters *after*
249+ applying compatibility decomposition to each character.
250+
251+ We can do this incrementally by computing the number of leading and
252+ trailing non-starters for each character's compatibility decomposition
253+ with the following rules:
254+
255+ 1) If a character is not affected by compatibility decomposition, look
256+ up its canonical combining class to find out if it's a non-starter.
257+ 2) All Hangul characters are starters, even under decomposition.
258+ 3) Otherwise, very few decomposing characters have a nonzero count
259+ of leading or trailing non-starters, so store these characters
260+ with their associated counts in a separate table.
261+ """
262+ leading_nonstarters = {}
263+ trailing_nonstarters = {}
248264
249- return canon_fully_decomposed , compat_fully_decomposed
265+ for c in set (self .canon_fully_decomp ) | set (self .compat_fully_decomp ):
266+ decomposed = self .compat_fully_decomp .get (c ) or self .canon_fully_decomp [c ]
267+
268+ num_leading = 0
269+ for d in decomposed :
270+ if d not in self .combining_classes :
271+ break
272+ num_leading += 1
273+
274+ num_trailing = 0
275+ for d in reversed (decomposed ):
276+ if d not in self .combining_classes :
277+ break
278+ num_trailing += 1
279+
280+ if num_leading > 0 :
281+ leading_nonstarters [c ] = num_leading
282+ if num_trailing > 0 :
283+ trailing_nonstarters [c ] = num_trailing
284+
285+ return leading_nonstarters , trailing_nonstarters
286+
287+ hexify = lambda c : hex (c )[2 :].upper ().rjust (4 , '0' )
250288
251289def gen_combining_class (combining_classes , out ):
252290 out .write ("#[inline]\n " )
253291 out .write ("pub fn canonical_combining_class(c: char) -> u8 {\n " )
254292 out .write (" match c {\n " )
255293
256- for char , combining_class in sorted (combining_classes .items (), key = lambda ( k , _ ): int ( k , 16 ) ):
257- out .write (" '\u{%s}' => %s,\n " % (char , combining_class ))
294+ for char , combining_class in sorted (combining_classes .items ()):
295+ out .write (" '\u{%s}' => %s,\n " % (hexify ( char ) , combining_class ))
258296
259297 out .write (" _ => 0,\n " )
260298 out .write (" }\n " )
@@ -265,8 +303,8 @@ def gen_composition_table(canon_comp, out):
265303 out .write ("pub fn composition_table(c1: char, c2: char) -> Option<char> {\n " )
266304 out .write (" match (c1, c2) {\n " )
267305
268- for (c1 , c2 ), c3 in sorted (canon_comp .items (), key = lambda (( c1 , c2 ), _ ): ( int ( c1 , 16 ), int ( c2 , 16 )) ):
269- out .write (" ('\u{%s}', '\u{%s}') => Some('\u{%s}'),\n " % (c1 , c2 , c3 ))
306+ for (c1 , c2 ), c3 in sorted (canon_comp .items ()):
307+ out .write (" ('\u{%s}', '\u{%s}') => Some('\u{%s}'),\n " % (hexify ( c1 ), hexify ( c2 ), hexify ( c3 ) ))
270308
271309 out .write (" _ => None,\n " )
272310 out .write (" }\n " )
@@ -279,8 +317,6 @@ def gen_decomposition_tables(canon_decomp, compat_decomp, out):
279317 out .write ("pub fn %s_fully_decomposed(c: char) -> Option<&'static [char]> {\n " % name )
280318 out .write (" match c {\n " )
281319
282- hexify = lambda c : hex (c )[2 :].upper ()
283-
284320 for char , chars in sorted (table .items ()):
285321 d = ", " .join ("'\u{%s}'" % hexify (c ) for c in chars )
286322 out .write (" '\u{%s}' => Some(&[%s]),\n " % (hexify (char ), d ))
@@ -323,12 +359,36 @@ def gen_combining_mark(general_category_mark, out):
323359 out .write (" match c {\n " )
324360
325361 for char in general_category_mark :
326- out .write (" '\u{%s}' => true,\n " % char )
362+ out .write (" '\u{%s}' => true,\n " % hexify ( char ) )
327363
328364 out .write (" _ => false,\n " )
329365 out .write (" }\n " )
330366 out .write ("}\n " )
331367
def gen_stream_safe(leading, trailing, out):
    """Emit the Rust lookup functions for the Stream-Safe Text Process (UAX15-D4).

    Writes `stream_safe_leading_nonstarters` and
    `stream_safe_trailing_nonstarters`, mapping each char to its count of
    leading/trailing non-starters after compatibility decomposition. Entries
    are emitted in sorted order so the generated file is deterministic,
    matching the other gen_* functions in this script.
    """
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    # Sorted for deterministic output across runs and Python versions.
    for char, num_leading in sorted(leading.items()):
        out.write("    '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("    _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    out.write("#[inline]\n")
    out.write("pub fn stream_safe_trailing_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_trailing in sorted(trailing.items()):
        out.write("    '\\u{%s}' => %d,\n" % (hexify(char), num_trailing))

    out.write("    _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
391+
332392def gen_tests (tests , out ):
333393 out .write ("""#[derive(Debug)]
334394pub struct NormalizationTest {
@@ -384,6 +444,9 @@ def gen_tests(tests, out):
384444 gen_nfd_qc (data .norm_props , out )
385445 out .write ("\n " )
386446
447+ gen_stream_safe (data .ss_leading , data .ss_trailing , out )
448+ out .write ("\n " )
449+
387450 with open ("normalization_tests.rs" , "w" ) as out :
388451 out .write (PREAMBLE )
389452 gen_tests (data .norm_tests , out )
0 commit comments