Implement remaining __clz*i2 intrinsics

tea · tea · commit b4ab20944d8c · 2024-07-06T15:24:26.000+03:00
diff --git a/README.md b/README.md
@@ -157,6 +157,9 @@ rely on CI.
 - [x] bswapdi2.c
 - [x] bswapsi2.c
 - [x] bswapti2.c
+- [x] clzdi2.c
+- [x] clzsi2.c
+- [x] clzti2.c
 - [x] comparedf2.c
 - [x] comparesf2.c
 - [x] divdf3.c
@@ -325,9 +328,6 @@ These builtins are never called by LLVM.
 - ~~arm/switch32.S~~
 - ~~arm/switch8.S~~
 - ~~arm/switchu8.S~~
-- ~~clzdi2.c~~
-- ~~clzsi2.c~~
-- ~~clzti2.c~~
 - ~~cmpdi2.c~~
 - ~~cmpti2.c~~
 - ~~ctzdi2.c~~
diff --git a/build.rs b/build.rs
@@ -165,6 +165,8 @@ fn configure_check_cfg() {
         "__bswapdi2",
         "__bswapti2",
         "__clzsi2",
+        "__clzdi2",
+        "__clzti2",
         "__divdi3",
         "__divsi3",
         "__divmoddi4",
diff --git a/src/int/leading_zeros.rs b/src/int/leading_zeros.rs
@@ -1,149 +1,19 @@
-// Note: these functions happen to produce the correct `usize::leading_zeros(0)` value
-// without a explicit zero check. Zero is probably common enough that it could warrant
-// adding a zero check at the beginning, but `__clzsi2` has a precondition that `x != 0`.
-// Compilers will insert the check for zero in cases where it is needed.
-
-public_test_dep! {
-/// Returns the number of leading binary zeros in `x`.
-#[allow(dead_code)]
-pub(crate) fn usize_leading_zeros_default(x: usize) -> usize {
-    // The basic idea is to test if the higher bits of `x` are zero and bisect the number
-    // of leading zeros. It is possible for all branches of the bisection to use the same
-    // code path by conditionally shifting the higher parts down to let the next bisection
-    // step work on the higher or lower parts of `x`. Instead of starting with `z == 0`
-    // and adding to the number of zeros, it is slightly faster to start with
-    // `z == usize::MAX.count_ones()` and subtract from the potential number of zeros,
-    // because it simplifies the final bisection step.
-    let mut x = x;
-    // the number of potential leading zeros
-    let mut z = usize::MAX.count_ones() as usize;
-    // a temporary
-    let mut t: usize;
-    #[cfg(target_pointer_width = "64")]
-    {
-        t = x >> 32;
-        if t != 0 {
-            z -= 32;
-            x = t;
-        }
-    }
-    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
-    {
-        t = x >> 16;
-        if t != 0 {
-            z -= 16;
-            x = t;
-        }
-    }
-    t = x >> 8;
-    if t != 0 {
-        z -= 8;
-        x = t;
-    }
-    t = x >> 4;
-    if t != 0 {
-        z -= 4;
-        x = t;
-    }
-    t = x >> 2;
-    if t != 0 {
-        z -= 2;
-        x = t;
-    }
-    // the last two bisections are combined into one conditional
-    t = x >> 1;
-    if t != 0 {
-        z - 2
-    } else {
-        z - x
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    /// Returns the number of leading binary zeros in `x` (SI aka 32 bit version).
+    pub extern "C" fn __clzsi2(x: u32) -> usize {
+        x.leading_zeros() as usize
     }
 
-    // We could potentially save a few cycles by using the LUT trick from
-    // "https://embeddedgurus.com/state-space/2014/09/
-    // fast-deterministic-and-portable-counting-leading-zeros/".
-    // However, 256 bytes for a LUT is too large for embedded use cases. We could remove
-    // the last 3 bisections  and use this 16 byte LUT for the rest of the work:
-    //const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4];
-    //z -= LUT[x] as usize;
-    //z
-    // However, it ends up generating about the same number of instructions. When benchmarked
-    // on x86_64, it is slightly faster to use the LUT, but this is probably because of OOO
-    // execution effects. Changing to using a LUT and branching is risky for smaller cores.
-}
-}
-
-// The above method does not compile well on RISC-V (because of the lack of predicated
-// instructions), producing code with many branches or using an excessively long
-// branchless solution. This method takes advantage of the set-if-less-than instruction on
-// RISC-V that allows `(x >= power-of-two) as usize` to be branchless.
-
-public_test_dep! {
-/// Returns the number of leading binary zeros in `x`.
-#[allow(dead_code)]
-pub(crate) fn usize_leading_zeros_riscv(x: usize) -> usize {
-    let mut x = x;
-    // the number of potential leading zeros
-    let mut z = usize::MAX.count_ones() as usize;
-    // a temporary
-    let mut t: usize;
-
-    // RISC-V does not have a set-if-greater-than-or-equal instruction and
-    // `(x >= power-of-two) as usize` will get compiled into two instructions, but this is
-    // still the most optimal method. A conditional set can only be turned into a single
-    // immediate instruction if `x` is compared with an immediate `imm` (that can fit into
-    // 12 bits) like `x < imm` but not `imm < x` (because the immediate is always on the
-    // right). If we try to save an instruction by using `x < imm` for each bisection, we
-    // have to shift `x` left and compare with powers of two approaching `usize::MAX + 1`,
-    // but the immediate will never fit into 12 bits and never save an instruction.
-    #[cfg(target_pointer_width = "64")]
-    {
-        // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise
-        // `t` is set to 0.
-        t = ((x >= (1 << 32)) as usize) << 5;
-        // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the
-        // next step to process.
-        x >>= t;
-        // If `t` was set to `1 << 5`, then we subtract 32 from the number of potential
-        // leading zeros
-        z -= t;
-    }
-    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
-    {
-        t = ((x >= (1 << 16)) as usize) << 4;
-        x >>= t;
-        z -= t;
+    #[maybe_use_optimized_c_shim]
+    /// Returns the number of leading binary zeros in `x` (DI aka 64 bit version).
+    pub extern "C" fn __clzdi2(x: u64) -> usize {
+        x.leading_zeros() as usize
     }
-    t = ((x >= (1 << 8)) as usize) << 3;
-    x >>= t;
-    z -= t;
-    t = ((x >= (1 << 4)) as usize) << 2;
-    x >>= t;
-    z -= t;
-    t = ((x >= (1 << 2)) as usize) << 1;
-    x >>= t;
-    z -= t;
-    t = (x >= (1 << 1)) as usize;
-    x >>= t;
-    z -= t;
-    // All bits except the LSB are guaranteed to be zero for this final bisection step.
-    // If `x != 0` then `x == 1` and subtracts one potential zero from `z`.
-    z - x
-}
-}
 
-intrinsics! {
     #[maybe_use_optimized_c_shim]
-    #[cfg(any(
-        target_pointer_width = "16",
-        target_pointer_width = "32",
-        target_pointer_width = "64"
-    ))]
-    /// Returns the number of leading binary zeros in `x`.
-    pub extern "C" fn __clzsi2(x: usize) -> usize {
-        if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
-            usize_leading_zeros_riscv(x)
-        } else {
-            usize_leading_zeros_default(x)
-        }
+    /// Returns the number of leading binary zeros in `x` (TI aka 128 bit version).
+    pub extern "C" fn __clzti2(x: u128) -> usize {
+        x.leading_zeros() as usize
     }
 }
diff --git a/src/int/mod.rs b/src/int/mod.rs
@@ -12,7 +12,6 @@ pub mod shift;
 pub mod udiv;
 
 pub use big::{i256, u256};
-pub use leading_zeros::__clzsi2;
 
 public_test_dep! {
 /// Minimal integer implementations needed on all integer types, including wide integers.
diff --git a/testcrate/tests/misc.rs b/testcrate/tests/misc.rs
@@ -65,31 +65,46 @@ fn fuzz_values() {
 
 #[test]
 fn leading_zeros() {
-    use compiler_builtins::int::__clzsi2;
-    use compiler_builtins::int::leading_zeros::{
-        usize_leading_zeros_default, usize_leading_zeros_riscv,
-    };
-    fuzz(N, |x: usize| {
+    use compiler_builtins::int::leading_zeros::__clzsi2;
+    fuzz(N, |x: u32| {
+        if x == 0 {
+            return; // skip zero: the intrinsic's result is undefined for `x == 0`
+        }
         let lz = x.leading_zeros() as usize;
         let lz0 = __clzsi2(x);
-        let lz1 = usize_leading_zeros_default(x);
-        let lz2 = usize_leading_zeros_riscv(x);
         if lz0 != lz {
             panic!("__clzsi2({}): std: {}, builtins: {}", x, lz, lz0);
         }
-        if lz1 != lz {
-            panic!(
-                "usize_leading_zeros_default({}): std: {}, builtins: {}",
-                x, lz, lz1
-            );
-        }
-        if lz2 != lz {
-            panic!(
-                "usize_leading_zeros_riscv({}): std: {}, builtins: {}",
-                x, lz, lz2
-            );
-        }
-    })
+    });
+
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    {
+        use compiler_builtins::int::leading_zeros::__clzdi2;
+        fuzz(N, |x: u64| {
+            if x == 0 {
+                return; // skip zero: the intrinsic's result is undefined for `x == 0`
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzdi2(x);
+            if lz0 != lz {
+                panic!("__clzdi2({}): std: {}, builtins: {}", x, lz, lz0);
+            }
+        });
+    }
+    #[cfg(target_pointer_width = "64")]
+    {
+        use compiler_builtins::int::leading_zeros::__clzti2;
+        fuzz(N, |x: u128| {
+            if x == 0 {
+                return; // skip zero: the intrinsic's result is undefined for `x == 0`
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzti2(x);
+            if lz0 != lz {
+                panic!("__clzti2({}): std: {}, builtins: {}", x, lz, lz0);
+            }
+        });
+    }
 }
 
 #[test]