@@ -309,11 +309,24 @@ intrinsics! {
309309 target_pointer_width = "64"
310310 ) ) ]
311311 pub extern "C" fn __clzsi2( x: usize ) -> usize {
312- // TODO: const this? Would require const-if
313- // Note(Lokathor): the `intrinsics!` macro can't process mut inputs
312+ // Note: This routine produces the correct value for `x == 0`. Zero is probably common
313+ // enough that it could warrant adding a zero check at the beginning, but this function has
314+ // a precondition that `x != 0`. Compilers will insert the check for zero in cases where it
315+ // is needed.
316+
317+ // The basic idea is to test if the higher bits of `x` are zero and bisect the number of
318+ // leading zeros (done here by subtraction instead of addition because it simplifies the
319+ // final bisection step).
320+ // It is possible for all branches of the bisection to use the same code path via
321+ // conditionally shifting the higher parts down to let the next bisection step work on the
322+ // higher or lower parts of `x`.
323+ // Using `(x >= power-of-two) as usize` to test if the higher bits are zero is branchless on
324+ // most architectures (many perform it with a single set-if-greater-than-or-equal instruction,
325+ // others with some kind of conditional move).
326+
314327 let mut x = x;
315- let mut y : usize ;
316- let mut n : usize = {
328+ // The number of potential leading zeros
329+ let mut z = {
317330 #[ cfg( target_pointer_width = "64" ) ]
318331 {
319332 64
@@ -327,42 +340,60 @@ intrinsics! {
327340 16
328341 }
329342 } ;
343+
344+ // A temporary that holds the shift amount for the current bisection step
345+ let mut t: usize ;
346+
330347 #[ cfg( target_pointer_width = "64" ) ]
331348 {
332- y = x >> 32 ;
333- if y != 0 {
334- n -= 32 ;
335- x = y;
336- }
349+ // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise `t` is
350+ // set to 0.
351+ t = ( ( x >= ( 1 << 32 ) ) as usize ) << 5 ;
352+ // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the next step
353+ // to process.
354+ x >>= t;
355+ // If `t` was set to `1 << 5`, then we subtract 32 from the number of potential leading
356+ // zeros
357+ z -= t;
337358 }
359+
338360 #[ cfg( any( target_pointer_width = "32" , target_pointer_width = "64" ) ) ]
339361 {
340- y = x >> 16 ;
341- if y != 0 {
342- n -= 16 ;
343- x = y;
344- }
345- }
346- y = x >> 8 ;
347- if y != 0 {
348- n -= 8 ;
349- x = y;
350- }
351- y = x >> 4 ;
352- if y != 0 {
353- n -= 4 ;
354- x = y;
355- }
356- y = x >> 2 ;
357- if y != 0 {
358- n -= 2 ;
359- x = y;
360- }
361- y = x >> 1 ;
362- if y != 0 {
363- n - 2
364- } else {
365- n - x
362+ t = ( ( x >= ( 1 << 16 ) ) as usize ) << 4 ;
363+ x >>= t;
364+ z -= t;
366365 }
366+
367+ t = ( ( x >= ( 1 << 8 ) ) as usize ) << 3 ;
368+ x >>= t;
369+ z -= t;
370+
371+ t = ( ( x >= ( 1 << 4 ) ) as usize ) << 2 ;
372+ x >>= t;
373+ z -= t;
374+
375+ t = ( ( x >= ( 1 << 2 ) ) as usize ) << 1 ;
376+ x >>= t;
377+ z -= t;
378+
379+ t = ( x >= ( 1 << 1 ) ) as usize ;
380+ x >>= t;
381+ z -= t;
382+
383+ // All bits except the LSB are guaranteed to be zero for this final bisection step, so `x` is
384+ // either 0 or 1. Subtracting it from `z` removes the last potential leading zero if `x == 1`.
385+ z - x
386+
387+ // We could potentially save a few cycles by using the LUT trick from
388+ // "https://embeddedgurus.com/state-space/2014/09/
389+ // fast-deterministic-and-portable-counting-leading-zeros/". However, 256 bytes for a LUT is
390+ // too large for embedded use cases. We could perform bisection down to
391+ // `((x >= (1 << 4)) as usize) << 2` and use this 16 byte LUT for the rest of the work:
392+ //const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4];
393+ //z -= LUT[x] as usize;
394+ //z
395+ // However, it ends up generating about the same number of instructions. When benchmarked on
396+ // x86_64, it is slightly faster to use the LUT, but this is probably because of out-of-order
397+ // execution effects. Changing to using a LUT and branching is risky for smaller cores.
367398 }
368399}
0 commit comments