@@ -288,14 +288,18 @@ def _compute_stream_safe_tables(self):
288288
def gen_combining_class(combining_classes, out):
    """Emit the Rust `canonical_combining_class(c: char) -> u8` function to `out`.

    `combining_classes` maps a code point (int) to its canonical combining
    class (an int that fits in a u8).  Rather than a giant `match` arm per
    code point, the table is emitted as a minimal perfect hash: a salt
    array and a packed key/value array, consumed at runtime by the Rust
    `mph_lookup` helper (from `perfect_hash`), with 0 as the default for
    characters not in the table.
    """
    out.write("#[inline]\n")
    # Build the perfect hash up front; `salt` and `keys` are parallel
    # tables indexed by the first-level hash (see minimal_perfect_hash).
    (salt, keys) = minimal_perfect_hash(combining_classes)
    out.write("pub fn canonical_combining_class(c: char) -> u8 {\n")
    out.write("    mph_lookup(c.into(), &[\n")
    for s in salt:
        out.write("        0x{:x},\n".format(s))
    out.write("    ],\n")
    out.write("    &[\n")
    for k in keys:
        # Pack one table entry: low 8 bits hold the combining class,
        # the remaining bits hold the code point so the lookup can
        # verify it actually hit the right key.
        kv = int(combining_classes[k]) | (k << 8)
        out.write("        0x{:x},\n".format(kv))
    out.write("    ],\n")
    # u8_lookup_fk/u8_lookup_fv unpack key and value from an entry;
    # 0 is the default combining class for absent characters.
    out.write("    u8_lookup_fk, u8_lookup_fv, 0) as u8\n")
    out.write("}\n")
300304
301305def gen_composition_table (canon_comp , out ):
@@ -432,13 +436,61 @@ def gen_tests(tests, out):
432436
433437 out .write ("];\n " )
434438
def my_hash(x, salt, n):
    """Hash the integer key `x`, perturbed by `salt`, into the range [0, n).

    Multiplicative hashing on the theory that multiplies are cheap:
    2654435769 is Knuth's constant (2**32 / golden ratio), a second
    multiply XORs more of x's bits into the mix, and `* n >> 32`
    scales the 32-bit result down to a bucket index below n.
    """
    # (a & m) ^ (b & m) == (a ^ b) & m, so a single mask after the XOR
    # is equivalent to masking each product separately.
    mixed = (((x + salt) * 2654435769) ^ (x * 0x31415926)) & 0xffffffff
    return (mixed * n) >> 32
445+
# Compute minimal perfect hash function, d can be either a dict or list of keys.
def minimal_perfect_hash(d, singleton_buckets=False):
    """Build a two-level minimal perfect hash over the keys of `d`.

    Returns `(salts, keys)`, two lists of length n = len(d).  Lookup of
    key k: compute h = my_hash(k, 0, n), read salts[h]; a positive salt
    means the slot is my_hash(k, salt, n), while (with
    `singleton_buckets`) a negative salt -(i+1) means the slot is
    directly i.  `keys[slot]` then holds the key itself so a probe can
    be verified.  Exits the process if no salt under 32768 works.
    """
    n = len(d)
    # First level: bucket every key by its unsalted hash.
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    # Place the largest buckets first — they are the hardest to fit.
    # NOTE: the tuple sort tie-breaks on the bucket index (descending),
    # which keeps the generated tables deterministic; don't "simplify"
    # this to a key=len sort.
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        if bucket_size == 0:
            # Buckets are sorted by size, so the rest are empty too.
            break
        elif singleton_buckets and bucket_size == 1:
            # Single-key bucket: skip the salt search and drop the key
            # into the first free slot, encoding that slot index as a
            # negative salt (-(i + 1) so that slot 0 is distinguishable
            # from the "unset" salt value 0).
            for i in range(n):
                if not claimed[i]:
                    salts[h] = -(i + 1)
                    claimed[i] = True
                    keys[i] = buckets[h][0]
                    break
        else:
            # Search for the smallest salt that maps every key in this
            # bucket to a distinct, still-unclaimed slot.
            for salt in range(1, 32768):
                # NOTE(review): `hash` here shadows the builtin — harmless
                # in this comprehension, but worth renaming.
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        # Collision within the bucket itself; keep searching.
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
            # salts[h] still 0 means no salt in [1, 32768) worked.
            # NOTE(review): print + exit(1) aborts the generator; a raised
            # exception would be the more conventional failure mode.
            if salts[h] == 0:
                print("minimal perfect hashing failed")
                exit(1)
    return (salts, keys)
484+
435485if __name__ == '__main__' :
436486 data = UnicodeData ()
437487 with open ("tables.rs" , "w" , newline = "\n " ) as out :
438488 out .write (PREAMBLE )
439489 out .write ("use quick_check::IsNormalized;\n " )
440490 out .write ("use quick_check::IsNormalized::*;\n " )
441491 out .write ("\n " )
492+ out .write ("use perfect_hash::*;\n " )
493+ out .write ("\n " )
442494
443495 version = "(%s, %s, %s)" % tuple (UNICODE_VERSION .split ("." ))
444496 out .write ("#[allow(unused)]\n " )
0 commit comments