docs

tgross35 · tgross35 · commit c4a1e83bd4de · 2025-02-05T12:42:07.000Z
diff --git a/src/math/generic/scalbn.rs b/src/math/generic/scalbn.rs
@@ -1,19 +1,5 @@
-#![allow(unused)]
-#![allow(clippy::all)]
-
-use super::super::support::Hexf;
 use super::super::{CastFrom, CastInto, Float, IntTy, MinInt};
 
-#[cfg(not(optimizations_enabled))]
-extern crate std;
-#[cfg(not(optimizations_enabled))]
-use std::dbg;
-
-#[cfg(optimizations_enabled)]
-macro_rules! dbg {
-    ($($tt:tt)*) => {};
-}
-
 /// Scale the exponent.
 ///
 /// From N3220:
@@ -36,7 +22,6 @@ where
     F::Int: CastFrom<i32>,
     F::Int: CastFrom<u32>,
 {
-    dbg!();
     let zero = IntTy::<F>::ZERO;
 
     // Bits including the implicit bit
@@ -55,10 +40,6 @@ where
     // 2 ^ sig_total_bits, representation of what can be accounted for with subnormals
     let f_exp_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
 
-    dbg!(exp_max, exp_min, sig_total_bits, sig_total_bits + F::EXP_BIAS);
-    dbg!(Hexf(f_exp_max), Hexf(f_exp_min), Hexf(f_exp_subnorm));
-    dbg!(Hexf(x), n);
-
     // The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
     // where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
     // `n = -EMin + 2`. To get around this, reduce the magnitude of the final scale operation by
@@ -86,56 +67,62 @@ where
 
             let mul = f_exp_min * f_exp_subnorm;
             let add = -exp_min - sig_total_bits as i32;
-            dbg!(Hexf(mul), add);
+
+            // Worst case negative `n`: `x` is the maximum positive value, the result is `F::MIN`.
+            // This can be reached by three scaling multiplications (two here and one final).
+            debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min);
 
             x *= mul;
             n += add;
 
             if n < exp_min {
                 x *= mul;
                 n += add;
+
                 if n < exp_min {
                     x *= mul;
                     n += add;
+
                     if n < exp_min {
                         n = exp_min;
-                        dbg!(Hexf(x), n);
                     }
                 }
             }
         } else {
-            let add = (n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
-            let mul = F::from_parts(false, (F::EXP_BIAS as i32 + add) as u32, zero);
-            let add = -add;
-            dbg!(add, Hexf(mul));
+            // `f16` is unique compared to other float types in that the difference between the
+            // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
+            // small, only three. The above method depends on decrementing `n` by `add` two times;
+            // for other float types this works out because `add` is a substantial fraction of
+            // the exponent range. For `f16`, however, 3 is relatively small compared to the
+            // exponent range (which is 39), so that would require a lot of rounds.
+            //
+            // Work around this by using a different algorithm that scales by the max possible
+            // value that does not exceed the minimum normal exponent.
+
+            let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
+            let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
 
             x *= mul;
             n += add;
-            dbg!(Hexf(x), n);
 
             if n < exp_min {
                 let add = (n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
                 let mul = F::from_parts(false, (F::EXP_BIAS as i32 + add) as u32, zero);
                 let add = -add;
-                dbg!(add, Hexf(mul));
 
                 x *= mul;
                 n += add;
-                dbg!(Hexf(x), n);
 
                 if n < exp_min {
                     n = exp_min;
-                    dbg!(Hexf(x), n);
                 }
             }
             // f16
         }
     }
-    dbg!(Hexf(x), n);
+
     let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero);
-    let ret = x * scale;
-    dbg!(Hexf(scale), Hexf(ret));
-    ret
+    x * scale
 }
 
 #[cfg(test)]