@@ -31,16 +31,27 @@ where
3131 let exp_max: i32 = F :: EXP_BIAS as i32 ;
3232 let exp_min = -( exp_max - 1 ) ;
3333
34- // 2 ^ Emax, where Emax is the maximum biased exponent value (1023 for f64)
34+ // 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
3535 let f_exp_max = F :: from_parts ( false , F :: EXP_BIAS << 1 , zero) ;
3636
37- // 2 ^ Emin, where Emin is the minimum biased exponent value ( -1022 for f64)
37+ // 2 ^ Emin, minimum positive normal with null significand (0x1p -1022 for f64)
3838 let f_exp_min = F :: from_parts ( false , 1 , zero) ;
3939
40- // 2 ^ sig_total_bits, representation of what can be accounted for with subnormals
41- let f_exp_subnorm = F :: from_parts ( false , sig_total_bits + F :: EXP_BIAS , zero) ;
40+ // 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64)
41+ let f_pow_subnorm = F :: from_parts ( false , sig_total_bits + F :: EXP_BIAS , zero) ;
42+
43+ /*
44+ * The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
45+ * where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
46+ * `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
47+ * the final scale operation by prescaling by the max/min power representable by `F`.
48+ */
4249
4350 if n > exp_max {
51+ // Worse case positive `n`: `x` is the minimum subnormal value, the result is `F::MAX`.
52+ // This can be reached by three scaling multiplications (two here and one final).
53+ debug_assert ! ( -exp_min + F :: SIG_BITS as i32 + exp_max <= exp_max * 3 ) ;
54+
4455 x *= f_exp_max;
4556 n -= exp_max;
4657 if n > exp_max {
@@ -51,21 +62,61 @@ where
5162 }
5263 }
5364 } else if n < exp_min {
54- let mul = f_exp_min * f_exp_subnorm;
55- let add = ( exp_max - 1 ) - sig_total_bits as i32 ;
65+ // When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
66+ // go subnormal. This avoids double rounding.
67+ if F :: BITS > 16 {
68+ // `mul` s.t. `!(x * mul).is_subnormal() ∀ x`
69+ let mul = f_exp_min * f_pow_subnorm;
70+ let add = -exp_min - sig_total_bits as i32 ;
71+
72+ // Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`.
73+ // This must be reachable by three scaling multiplications (two here and one final).
74+ debug_assert ! ( -exp_min + F :: SIG_BITS as i32 + exp_max <= add * 2 + -exp_min) ;
5675
57- x *= mul;
58- n += add;
59- if n < exp_min {
6076 x *= mul;
6177 n += add;
78+
6279 if n < exp_min {
63- n = exp_min;
80+ x *= mul;
81+ n += add;
82+
83+ if n < exp_min {
84+ n = exp_min;
85+ }
86+ }
87+ } else {
88+ // `f16` is unique compared to other float types in that the difference between the
89+ // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
90+ // small, only three. The above method depend on decrementing `n` by `add` two times;
91+ // for other float types this works out because `add` is a substantial fraction of
92+ // the exponent range. For `f16`, however, 3 is relatively small compared to the
93+ // exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
94+ //
95+ // Work aroudn this by using a different algorithm that calculates the prescale
96+ // dynamically based on the maximum possible value. This adds more operations per round
97+ // since it needs to construct the scale, but works better in the general case.
98+ let add = -( n + sig_total_bits as i32 ) . clamp ( exp_min, sig_total_bits as i32 ) ;
99+ let mul = F :: from_parts ( false , ( F :: EXP_BIAS as i32 - add) as u32 , zero) ;
100+
101+ x *= mul;
102+ n += add;
103+
104+ if n < exp_min {
105+ let add = -( n + sig_total_bits as i32 ) . clamp ( exp_min, sig_total_bits as i32 ) ;
106+ let mul = F :: from_parts ( false , ( F :: EXP_BIAS as i32 - add) as u32 , zero) ;
107+
108+ x *= mul;
109+ n += add;
110+
111+ if n < exp_min {
112+ n = exp_min;
113+ }
64114 }
65115 }
66116 }
67117
68- x * F :: from_parts ( false , ( F :: EXP_BIAS as i32 + n) as u32 , zero)
118+ let scale = F :: from_parts ( false , ( F :: EXP_BIAS as i32 + n) as u32 , zero) ;
119+ x * scale
69120}
70121
71122#[ cfg( test) ]
@@ -111,6 +162,12 @@ mod tests {
111162 assert ! ( scalbn( -F :: NAN , -10 ) . is_nan( ) ) ;
112163 }
113164
165+ #[ test]
166+ #[ cfg( f16_enabled) ]
167+ fn spec_test_f16 ( ) {
168+ spec_test :: < f16 > ( ) ;
169+ }
170+
114171 #[ test]
115172 fn spec_test_f32 ( ) {
116173 spec_test :: < f32 > ( ) ;
@@ -120,4 +177,10 @@ mod tests {
120177 fn spec_test_f64 ( ) {
121178 spec_test :: < f64 > ( ) ;
122179 }
180+
181+ #[ test]
182+ #[ cfg( f128_enabled) ]
183+ fn spec_test_f128 ( ) {
184+ spec_test :: < f128 > ( ) ;
185+ }
123186}
0 commit comments