@@ -286,67 +286,46 @@ def _compute_stream_safe_tables(self):
 
 hexify = lambda c: '{:04X}'.format(c)
 
-def gen_combining_class(combining_classes, out):
-    (salt, keys) = minimal_perfect_hash(combining_classes)
-    out.write("pub fn canonical_combining_class(c: char) -> u8 {\n")
-    out.write("    mph_lookup(c.into(), &[\n")
+def gen_mph_data(name, d, kv_type, kv_callback):
+    (salt, keys) = minimal_perfect_hash(d)
+    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
     for s in salt:
-        out.write("        0x{:x},\n".format(s))
-    out.write("    ],\n")
-    out.write("    &[\n")
+        out.write("    0x{:x},\n".format(s))
+    out.write("];\n")
+    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
     for k in keys:
-        kv = int(combining_classes[k]) | (k << 8)
-        out.write("        0x{:x},\n".format(kv))
-    out.write("    ],\n")
-    out.write("    u8_lookup_fk, u8_lookup_fv, 0)\n")
-    out.write("}\n")
+        out.write("    {},\n".format(kv_callback(k)))
+    out.write("];\n\n")
+
+def gen_combining_class(combining_classes, out):
+    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
+        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))
 
 def gen_composition_table(canon_comp, out):
     table = {}
     for (c1, c2), c3 in canon_comp.items():
         if c1 < 0x10000 and c2 < 0x10000:
             table[(c1 << 16) | c2] = c3
     (salt, keys) = minimal_perfect_hash(table)
-    out.write("pub fn composition_table(c1: char, c2: char) -> Option<char> {\n")
-    out.write("    if c1 < '\\u{10000}' && c2 < '\\u{10000}' {\n")
-    out.write("        mph_lookup((c1 as u32) << 16 | (c2 as u32), &[\n")
-    for s in salt:
-        out.write("            0x{:x},\n".format(s))
-    out.write("        ],\n")
-    out.write("        &[\n")
-    for k in keys:
-        out.write("            (0x%s, '\\u{%s}'),\n" % (hexify(k), hexify(table[k])))
-    out.write("        ],\n")
-    out.write("        pair_lookup_fk, pair_lookup_fv_opt, None)\n")
-    out.write("    } else {\n")
-    out.write("        match (c1, c2) {\n")
+    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
+        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))
 
+    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
+    out.write("    match (c1, c2) {\n")
     for (c1, c2), c3 in sorted(canon_comp.items()):
         if c1 >= 0x10000 and c2 >= 0x10000:
-            out.write("            ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
+            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
 
-    out.write("            _ => None,\n")
-    out.write("        }\n")
+    out.write("        _ => None,\n")
     out.write("    }\n")
     out.write("}\n")
 
 def gen_decomposition_tables(canon_decomp, compat_decomp, out):
     tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
     for table, name in tables:
-        (salt, keys) = minimal_perfect_hash(table)
-        out.write("const {}_DECOMPOSED_KV: &[(u32, &'static [char])] = &[\n".format(name.upper()))
-        for char in keys:
-            d = ", ".join("'\\u{%s}'" % hexify(c) for c in table[char])
-            out.write("    (0x{:x}, &[{}]),\n".format(char, d))
-        out.write("];\n")
-        out.write("pub fn %s_fully_decomposed(c: char) -> Option<&'static [char]> {\n" % name)
-        out.write("    mph_lookup(c.into(), &[\n")
-        for s in salt:
-            out.write("        0x{:x},\n".format(s))
-        out.write("    ],\n")
-        out.write("    {}_DECOMPOSED_KV,\n".format(name.upper()))
-        out.write("    pair_lookup_fk, pair_lookup_fv_opt, None)\n")
-        out.write("}\n")
+        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
+            lambda k: "(0x{:x}, &[{}])".format(k,
+                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))
 
 def gen_qc_match(prop_table, out):
     out.write("    match c {\n")
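
For context, a sketch (not part of the patch) of the integer packings the kv_callback lambdas above rely on: the combining-class and trailing-nonstarter tables keep a u8 value in the low byte of each u32 entry with the code point shifted above it, and the composition table keys each BMP pair as (c1 << 16) | c2:

    # Illustrative only; mirrors the arithmetic in the generators above.
    def pack_u8_entry(codepoint, value):
        # u8 payload in the low byte, code point in the bits above it
        return (codepoint << 8) | value

    def pack_pair_key(c1, c2):
        # both code points must fit in 16 bits (BMP) so the pair fits one u32
        assert c1 < 0x10000 and c2 < 0x10000
        return (c1 << 16) | c2

    # U+0301 COMBINING ACUTE ACCENT has canonical combining class 230:
    assert pack_u8_entry(0x0301, 230) == 0x301E6
    # 'A' (U+0041) composes with U+0300, keyed as:
    assert pack_pair_key(0x0041, 0x0300) == 0x410300
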
@@ -388,18 +367,8 @@ def gen_nfkd_qc(prop_tables, out):
     out.write("}\n")
 
 def gen_combining_mark(general_category_mark, out):
-    (salt, keys) = minimal_perfect_hash(general_category_mark)
-    out.write("pub fn is_combining_mark(c: char) -> bool {\n")
-    out.write("    mph_lookup(c.into(), &[\n")
-    for s in salt:
-        out.write("        0x{:x},\n".format(s))
-    out.write("    ],\n")
-    out.write("    &[\n")
-    for k in keys:
-        out.write("        0x{:x},\n".format(k))
-    out.write("    ],\n")
-    out.write("    bool_lookup_fk, bool_lookup_fv, false)\n")
-    out.write("}\n")
+    gen_mph_data('combining_mark', general_category_mark, 'u32',
+        lambda k: '0x{:04x}'.format(k))
 
 def gen_stream_safe(leading, trailing, out):
     # This could be done as a hash but the table is very small.
@@ -415,19 +384,8 @@ def gen_stream_safe(leading, trailing, out):
     out.write("}\n")
     out.write("\n")
 
-    (salt, keys) = minimal_perfect_hash(trailing)
-    out.write("pub fn stream_safe_trailing_nonstarters(c: char) -> usize {\n")
-    out.write("    mph_lookup(c.into(), &[\n")
-    for s in salt:
-        out.write("        0x{:x},\n".format(s))
-    out.write("    ],\n")
-    out.write("    &[\n")
-    for k in keys:
-        kv = int(trailing[k]) | (k << 8)
-        out.write("        0x{:x},\n".format(kv))
-    out.write("    ],\n")
-    out.write("    u8_lookup_fk, u8_lookup_fv, 0) as usize\n")
-    out.write("}\n")
+    gen_mph_data('trailing_nonstarters', trailing, 'u32',
+        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))
 
 def gen_tests(tests, out):
     out.write("""#[derive(Debug)]
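
The removed Rust masked the count back out of the packed entry (u8_lookup_fv, then the `as usize` cast); with data-only tables, that decode presumably moves to the lookup site in the library. A hedged Python model of it, helper name hypothetical:

    def trailing_nonstarters_of(entry):
        # hypothetical decode: count in the low byte, code point above it
        return entry & 0xFF

    # U+0F73 TIBETAN VOWEL SIGN II decomposes to U+0F71 U+0F72, both
    # nonstarters, so its packed entry would carry a count of 2:
    assert trailing_nonstarters_of((0x0F73 << 8) | 2) == 2
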
@@ -463,7 +421,7 @@ def my_hash(x, salt, n):
     return (y * n) >> 32
 
 # Compute minimal perfect hash function, d can be either a dict or list of keys.
-def minimal_perfect_hash(d, singleton_buckets = False):
+def minimal_perfect_hash(d):
     n = len(d)
     buckets = dict((h, []) for h in range(n))
     for key in d:
@@ -475,18 +433,16 @@ def minimal_perfect_hash(d, singleton_buckets = False):
     salts = [0] * n
     keys = [0] * n
     for (bucket_size, h) in bsorted:
+        # Note: the traditional perfect hashing approach would also special-case
+        # bucket_size == 1 here and assign any empty slot, rather than iterating
+        # until rehash finds an empty slot. But we're not doing that so we can
+        # avoid the branch.
         if bucket_size == 0:
             break
-        elif singleton_buckets and bucket_size == 1:
-            for i in range(n):
-                if not claimed[i]:
-                    salts[h] = -(i + 1)
-                    claimed[i] = True
-                    keys[i] = buckets[h][0]
-                    break
         else:
             for salt in range(1, 32768):
                 rehashes = [my_hash(key, salt, n) for key in buckets[h]]
+                # Make sure there are no rehash collisions within this bucket.
                 if all(not claimed[hash] for hash in rehashes):
                     if len(set(rehashes)) < bucket_size:
                         continue
@@ -498,6 +454,17 @@ def minimal_perfect_hash(d, singleton_buckets = False):
                     break
         if salts[h] == 0:
             print("minimal perfect hashing failed")
+            # Note: if this happens (because of unfortunate data), then there are
+            # a few things that could be done. First, the hash function could be
+            # tweaked. Second, the bucket order could be scrambled (especially the
+            # singletons). Right now, the buckets are sorted, which has the advantage
+            # of being deterministic.
+            #
+            # As a more extreme approach, the singleton bucket optimization could be
+            # applied (give the direct address for singleton buckets, rather than
+            # relying on a rehash). That is definitely the more standard approach in
+            # the minimal perfect hashing literature, but in testing the branch was a
+            # significant slowdown.
             exit(1)
     return (salts, keys)
 
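
To make the two-probe scheme concrete, here is a sketch (not from the patch) of the lookup that the emitted SALT/KV tables support, mirroring what the library's mph_lookup presumably does on the Rust side. It assumes my_hash and minimal_perfect_hash as defined above, and that buckets are keyed by my_hash(key, 0, n) as in the construction:

    def mph_lookup(x, salts, keys):
        # first probe picks the bucket's salt; the salted rehash picks the slot
        n = len(keys)
        salt = salts[my_hash(x, 0, n)]
        idx = my_hash(x, salt, n)
        return idx if keys[idx] == x else None

    # By construction, every input key lands in a distinct slot holding itself.
    d = [0x0300, 0x0301, 0x0328, 0x1D165]
    (salts, keys) = minimal_perfect_hash(d)
    assert all(mph_lookup(k, salts, keys) is not None for k in d)

An absent key either hits an empty bucket (salt 0) or rehashes to a slot holding some other key, so the keys[idx] == x comparison is what rejects it.
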
@@ -508,8 +475,6 @@ def minimal_perfect_hash(d, singleton_buckets = False):
     out.write("use quick_check::IsNormalized;\n")
     out.write("use quick_check::IsNormalized::*;\n")
     out.write("\n")
-    out.write("use perfect_hash::*;\n")
-    out.write("\n")
 
     version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
     out.write("#[allow(unused)]\n")