|
1 | | -#![allow(unused)] |
2 | | -#![allow(clippy::all)] |
3 | | - |
4 | | -use super::super::support::Hexf; |
5 | 1 | use super::super::{CastFrom, CastInto, Float, IntTy, MinInt}; |
6 | 2 |
|
7 | | -#[cfg(not(optimizations_enabled))] |
8 | | -extern crate std; |
9 | | -#[cfg(not(optimizations_enabled))] |
10 | | -use std::dbg; |
11 | | - |
12 | | -#[cfg(optimizations_enabled)] |
13 | | -macro_rules! dbg { |
14 | | - ($($tt:tt)*) => {}; |
15 | | -} |
16 | | - |
17 | 3 | /// Scale the exponent. |
18 | 4 | /// |
19 | 5 | /// From N3220: |
|
36 | 22 | F::Int: CastFrom<i32>, |
37 | 23 | F::Int: CastFrom<u32>, |
38 | 24 | { |
39 | | - dbg!(); |
40 | 25 | let zero = IntTy::<F>::ZERO; |
41 | 26 |
|
42 | 27 | // Bits including the implicit bit |
|
55 | 40 | // 2 ^ sig_total_bits, representation of what can be accounted for with subnormals |
56 | 41 | let f_exp_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero); |
57 | 42 |
|
58 | | - dbg!(exp_max, exp_min, sig_total_bits, sig_total_bits + F::EXP_BIAS); |
59 | | - dbg!(Hexf(f_exp_max), Hexf(f_exp_min), Hexf(f_exp_subnorm)); |
60 | | - dbg!(Hexf(x), n); |
61 | | - |
62 | 43 | // The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases |
63 | 44 | // where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with |
64 | 45 | // `n = -EMin + 2`. To get around this, reduce the magnitude of the final scale operation by |
@@ -86,56 +67,62 @@ where |
86 | 67 |
|
87 | 68 | let mul = f_exp_min * f_exp_subnorm; |
88 | 69 | let add = -exp_min - sig_total_bits as i32; |
89 | | - dbg!(Hexf(mul), add); |
| 70 | + |
| 71 | + // Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`. |
| 72 | + // This can be reached by three scaling multiplications (two here and one final). |
| 73 | + debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min); |
90 | 74 |
|
91 | 75 | x *= mul; |
92 | 76 | n += add; |
93 | 77 |
|
94 | 78 | if n < exp_min { |
95 | 79 | x *= mul; |
96 | 80 | n += add; |
| 81 | + |
97 | 82 | if n < exp_min { |
98 | 83 | x *= mul; |
99 | 84 | n += add; |
| 85 | + |
100 | 86 | if n < exp_min { |
101 | 87 | n = exp_min; |
102 | | - dbg!(Hexf(x), n); |
103 | 88 | } |
104 | 89 | } |
105 | 90 | } |
106 | 91 | } else { |
107 | | - let add = (n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); |
108 | | - let mul = F::from_parts(false, (F::EXP_BIAS as i32 + add) as u32, zero); |
109 | | - let add = -add; |
110 | | - dbg!(add, Hexf(mul)); |
| 92 | + // `f16` is unique compared to other float types in that the difference between the |
| 93 | + // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is |
| 94 | + // small, only three. The above method depend on decrementing `n` by `add` two times; |
| 95 | + // for other float types this works out because `add` is a substantial fraction of |
| 96 | + // the exponent range. For `f16`, however, 3 is relatively small compared to the |
| 97 | + // exponent range (which is 39), so that would require a lot of rounds. |
| 98 | + // |
| 99 | + // Work aroudn this by using a different algorithm that scales by the max possible |
| 100 | + // value that does not exceed the minimum normal exponent. |
| 101 | + |
| 102 | + let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); |
| 103 | + let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero); |
111 | 104 |
|
112 | 105 | x *= mul; |
113 | 106 | n += add; |
114 | | - dbg!(Hexf(x), n); |
115 | 107 |
|
116 | 108 | if n < exp_min { |
117 | 109 | let add = (n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); |
118 | 110 | let mul = F::from_parts(false, (F::EXP_BIAS as i32 + add) as u32, zero); |
119 | 111 | let add = -add; |
120 | | - dbg!(add, Hexf(mul)); |
121 | 112 |
|
122 | 113 | x *= mul; |
123 | 114 | n += add; |
124 | | - dbg!(Hexf(x), n); |
125 | 115 |
|
126 | 116 | if n < exp_min { |
127 | 117 | n = exp_min; |
128 | | - dbg!(Hexf(x), n); |
129 | 118 | } |
130 | 119 | } |
131 | 120 | // f16 |
132 | 121 | } |
133 | 122 | } |
134 | | - dbg!(Hexf(x), n); |
| 123 | + |
135 | 124 | let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero); |
136 | | - let ret = x * scale; |
137 | | - dbg!(Hexf(scale), Hexf(ret)); |
138 | | - ret |
| 125 | + x * scale |
139 | 126 | } |
140 | 127 |
|
141 | 128 | #[cfg(test)] |
|
0 commit comments