|
5 | 5 | */ |
6 | 6 |
|
7 | 7 | use super::Float; |
8 | | -use super::support::{FpResult, Round, cold_path}; |
| 8 | +use super::support::{CastFrom, FpResult, Int, MinInt, Round, cold_path}; |
9 | 9 |
|
10 | 10 | /// Compute the cube root of the argument. |
11 | 11 | #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] |
12 | 12 | pub fn cbrt(x: f64) -> f64 { |
13 | 13 | cbrt_round(x, Round::Nearest).val |
14 | 14 | } |
15 | 15 |
|
16 | | -pub fn cbrt_round(x: f64, round: Round) -> FpResult<f64> { |
17 | | - const ESCALE: [f64; 3] = [ |
18 | | - 1.0, |
19 | | - hf64!("0x1.428a2f98d728bp+0"), /* 2^(1/3) */ |
20 | | - hf64!("0x1.965fea53d6e3dp+0"), /* 2^(2/3) */ |
21 | | - ]; |
22 | | - |
23 | | - /* the polynomial c0+c1*x+c2*x^2+c3*x^3 approximates x^(1/3) on [1,2] |
24 | | - with maximal error < 9.2e-5 (attained at x=2) */ |
25 | | - const C: [f64; 4] = [ |
26 | | - hf64!("0x1.1b0babccfef9cp-1"), |
27 | | - hf64!("0x1.2c9a3e94d1da5p-1"), |
28 | | - hf64!("-0x1.4dc30b1a1ddbap-3"), |
29 | | - hf64!("0x1.7a8d3e4ec9b07p-6"), |
30 | | - ]; |
31 | | - |
32 | | - let u0: f64 = hf64!("0x1.5555555555555p-2"); |
33 | | - let u1: f64 = hf64!("0x1.c71c71c71c71cp-3"); |
34 | | - |
35 | | - let rsc = [1.0, -1.0, 0.5, -0.5, 0.25, -0.25]; |
36 | | - |
37 | | - let off = [hf64!("0x1p-53"), 0.0, 0.0, 0.0]; |
| 16 | +pub fn cbrt_round<F: Float + CbrtHelper>(x: F, round: Round) -> FpResult<F> |
| 17 | +where |
| 18 | + F::Int: CastFrom<u64>, |
| 19 | + F::Int: CastFrom<u32>, |
| 20 | + F::Int: From<u8>, |
| 21 | +{ |
| 22 | + let zero = F::Int::ZERO; |
| 23 | + let one = F::Int::ONE; |
| 24 | + let u0: F = F::U0; |
| 25 | + let u1: F = F::U1; |
| 26 | + let rsc = F::RSC; |
| 27 | + let off = F::OFF; |
38 | 28 |
|
39 | 29 | /* rm=0 for rounding to nearest, and other values for directed roundings */ |
40 | | - let hx: u64 = x.to_bits(); |
41 | | - let mut mant: u64 = hx & f64::SIG_MASK; |
42 | | - let sign: u64 = hx >> 63; |
| 30 | + let hx = x.to_bits(); |
| 31 | + let mut mant: F::Int = hx & F::SIG_MASK; |
| 32 | + let sign: F::Int = hx >> (F::BITS - 1); |
| 33 | + let neg = x.is_sign_negative(); |
43 | 34 |
|
44 | | - let mut e: u32 = (hx >> f64::SIG_BITS) as u32 & f64::EXP_SAT; |
| 35 | + let mut e: u32 = x.exp(); |
45 | 36 |
|
46 | | - if ((e + 1) & f64::EXP_SAT) < 2 { |
| 37 | + if ((e + 1) & F::EXP_SAT) < 2 { |
47 | 38 | cold_path(); |
48 | 39 |
|
49 | | - let ix: u64 = hx & !f64::SIGN_MASK; |
| 40 | + let ix = hx & !F::SIGN_MASK; |
50 | 41 |
|
51 | 42 | /* 0, inf, nan: we return x + x instead of simply x, |
52 | 43 | to that for x a signaling NaN, it correctly triggers |
53 | 44 | the invalid exception. */ |
54 | | - if e == f64::EXP_SAT || ix == 0 { |
| 45 | + if e == F::EXP_SAT || ix == zero { |
55 | 46 | return FpResult::ok(x + x); |
56 | 47 | } |
57 | 48 |
|
58 | 49 | let nz = ix.leading_zeros() - 11; /* subnormal */ |
59 | 50 | mant <<= nz; |
60 | | - mant &= f64::SIG_MASK; |
| 51 | + mant &= F::SIG_MASK; |
61 | 52 | e = e.wrapping_sub(nz - 1); |
62 | 53 | } |
63 | 54 |
|
64 | 55 | e = e.wrapping_add(3072); |
65 | | - let cvt1: u64 = mant | (0x3ffu64 << 52); |
66 | | - let mut cvt5: u64 = cvt1; |
| 56 | + let cvt1 = mant | (F::Int::cast_from(F::EXP_BIAS) << F::SIG_BITS); |
| 57 | + let mut cvt5 = cvt1; |
67 | 58 |
|
68 | 59 | let et: u32 = e / 3; |
69 | 60 | let it: u32 = e % 3; |
70 | 61 |
|
71 | 62 | /* 2^(3k+it) <= x < 2^(3k+it+1), with 0 <= it <= 3 */ |
72 | | - cvt5 += u64::from(it) << f64::SIG_BITS; |
73 | | - cvt5 |= sign << 63; |
74 | | - let zz: f64 = f64::from_bits(cvt5); |
| 63 | + cvt5 += F::Int::cast_from(it) << F::SIG_BITS; |
| 64 | + cvt5 |= sign << (F::BITS - 1); |
| 65 | + let zz: F = F::from_bits(cvt5); |
75 | 66 |
|
76 | 67 | /* cbrt(x) = cbrt(zz)*2^(et-1365) where 1 <= zz < 8 */ |
77 | | - let mut isc: u64 = ESCALE[it as usize].to_bits(); // todo: index |
78 | | - isc |= sign << 63; |
79 | | - let cvt2: u64 = isc; |
80 | | - let z: f64 = f64::from_bits(cvt1); |
| 68 | + let mut isc = F::ESCALE[it as usize].to_bits(); // todo: index |
| 69 | + isc |= sign << (F::BITS - 1); |
| 70 | + let cvt2 = isc; |
| 71 | + let z: F = F::from_bits(cvt1); |
81 | 72 |
|
82 | 73 | /* cbrt(zz) = cbrt(z)*isc, where isc encodes 1, 2^(1/3) or 2^(2/3), |
83 | 74 | and 1 <= z < 2 */ |
84 | | - let r: f64 = 1.0 / z; |
85 | | - let rr: f64 = r * rsc[((it as usize) << 1) | sign as usize]; |
86 | | - let z2: f64 = z * z; |
87 | | - let c0: f64 = C[0] + z * C[1]; |
88 | | - let c2: f64 = C[2] + z * C[3]; |
89 | | - let mut y: f64 = c0 + z2 * c2; |
90 | | - let mut y2: f64 = y * y; |
| 75 | + let r: F = F::ONE / z; |
| 76 | + let rr: F = r * rsc[((it as usize) << 1) | neg as usize]; |
| 77 | + let z2: F = z * z; |
| 78 | + let c0: F = F::C[0] + z * F::C[1]; |
| 79 | + let c2: F = F::C[2] + z * F::C[3]; |
| 80 | + let mut y: F = c0 + z2 * c2; |
| 81 | + let mut y2: F = y * y; |
91 | 82 |
|
92 | 83 | /* y is an approximation of z^(1/3) */ |
93 | | - let mut h: f64 = y2 * (y * r) - 1.0; |
| 84 | + let mut h: F = y2 * (y * r) - F::ONE; |
94 | 85 |
|
95 | 86 | /* h determines the error between y and z^(1/3) */ |
96 | 87 | y -= (h * y) * (u0 - u1 * h); |
97 | 88 |
|
98 | 89 | /* The correction y -= (h*y)*(u0 - u1*h) corresponds to a cubic variant |
99 | 90 | of Newton's method, with the function f(y) = 1-z/y^3. */ |
100 | | - y *= f64::from_bits(cvt2); |
| 91 | + y *= F::from_bits(cvt2); |
101 | 92 |
|
102 | 93 | /* Now y is an approximation of zz^(1/3), |
103 | 94 | * and rr an approximation of 1/zz. We now perform another iteration of |
104 | 95 | * Newton-Raphson, this time with a linear approximation only. */ |
105 | 96 | y2 = y * y; |
106 | | - let mut y2l: f64 = fmaf64(y, y, -y2); |
| 97 | + let mut y2l: F = y.fma(y, -y2); |
107 | 98 |
|
108 | 99 | /* y2 + y2l = y^2 exactly */ |
109 | | - let mut y3: f64 = y2 * y; |
110 | | - let mut y3l: f64 = fmaf64(y, y2, -y3) + y * y2l; |
| 100 | + let mut y3: F = y2 * y; |
| 101 | + let mut y3l: F = y.fma(y2, -y3) + y * y2l; |
111 | 102 |
|
112 | 103 | /* y3 + y3l approximates y^3 with about 106 bits of accuracy */ |
113 | 104 | h = ((y3 - zz) + y3l) * rr; |
114 | | - let mut dy: f64 = h * (y * u0); |
| 105 | + let mut dy: F = h * (y * u0); |
115 | 106 |
|
116 | 107 | /* the approximation of zz^(1/3) is y - dy */ |
117 | | - let mut y1: f64 = y - dy; |
| 108 | + let mut y1: F = y - dy; |
118 | 109 | dy = (y - y1) - dy; |
119 | 110 |
|
120 | 111 | /* the approximation of zz^(1/3) is now y1 + dy, where |dy| < 1/2 ulp(y) |
121 | 112 | * (for rounding to nearest) */ |
122 | | - let mut ady: f64 = dy.abs(); |
| 113 | + let mut ady: F = dy.abs(); |
123 | 114 |
|
124 | 115 | /* For directed roundings, ady0 is tiny when dy is tiny, or ady0 is near |
125 | 116 | * from ulp(1); |
126 | 117 | * for rounding to nearest, ady0 is tiny when dy is near from 1/2 ulp(1), |
127 | 118 | * or from 3/2 ulp(1). */ |
128 | | - let mut ady0: f64 = (ady - off[round as usize]).abs(); |
129 | | - let mut ady1: f64 = (ady - (hf64!("0x1p-52") + off[round as usize])).abs(); |
| 119 | + let mut ady0: F = (ady - off[round as usize]).abs(); |
| 120 | + let mut ady1: F = (ady - (F::TWO_POW_NEG_SIG_BITS + off[round as usize])).abs(); |
130 | 121 |
|
131 | | - if ady0 < hf64!("0x1p-75") || ady1 < hf64!("0x1p-75") { |
| 122 | + let magic = F::from_parts(false, (-75 + F::EXP_BIAS as i32) as u32, zero); |
| 123 | + |
| 124 | + if ady0 < magic || ady1 < magic { |
132 | 125 | cold_path(); |
133 | 126 |
|
134 | 127 | y2 = y1 * y1; |
135 | | - y2l = fmaf64(y1, y1, -y2); |
| 128 | + y2l = y1.fma(y1, -y2); |
136 | 129 | y3 = y2 * y1; |
137 | | - y3l = fmaf64(y1, y2, -y3) + y1 * y2l; |
| 130 | + y3l = y1.fma(y2, -y3) + y1 * y2l; |
138 | 131 | h = ((y3 - zz) + y3l) * rr; |
139 | 132 | dy = h * (y1 * u0); |
140 | 133 | y = y1 - dy; |
141 | 134 | dy = (y1 - y) - dy; |
142 | 135 | y1 = y; |
143 | 136 | ady = dy.abs(); |
144 | 137 | ady0 = (ady - off[round as usize]).abs(); |
145 | | - ady1 = (ady - (hf64!("0x1p-52") + off[round as usize])).abs(); |
| 138 | + ady1 = (ady - (F::TWO_POW_NEG_SIG_BITS + off[round as usize])).abs(); |
146 | 139 |
|
147 | | - if ady0 < hf64!("0x1p-98") || ady1 < hf64!("0x1p-98") { |
| 140 | + let magic2 = F::from_parts(false, (-98 + F::EXP_BIAS as i32) as u32, zero); |
| 141 | + if ady0 < magic2 || ady1 < magic2 { |
148 | 142 | cold_path(); |
149 | | - let azz: f64 = zz.abs(); |
| 143 | + let azz: F = zz.abs(); |
150 | 144 |
|
151 | 145 | // ~ 0x1.79d15d0e8d59b80000000000000ffc3dp+0 |
152 | | - if azz == hf64!("0x1.9b78223aa307cp+1") { |
153 | | - y1 = hf64!("0x1.79d15d0e8d59cp+0").copysign(zz); |
| 146 | + if azz == F::AZMAGIC1 { |
| 147 | + y1 = F::AZMAGIC2.copysign(zz); |
154 | 148 | } |
155 | 149 |
|
156 | 150 | // ~ 0x1.de87aa837820e80000000000001c0f08p+0 |
157 | | - if azz == hf64!("0x1.a202bfc89ddffp+2") { |
158 | | - y1 = hf64!("0x1.de87aa837820fp+0").copysign(zz); |
| 151 | + if azz == F::AZMAGIC3 { |
| 152 | + y1 = F::AZMAGIC4.copysign(zz); |
159 | 153 | } |
160 | 154 |
|
161 | 155 | if round != Round::Nearest { |
162 | | - let wlist = [ |
163 | | - (hf64!("0x1.3a9ccd7f022dbp+0"), hf64!("0x1.1236160ba9b93p+0")), // ~ 0x1.1236160ba9b930000000000001e7e8fap+0 |
164 | | - (hf64!("0x1.7845d2faac6fep+0"), hf64!("0x1.23115e657e49cp+0")), // ~ 0x1.23115e657e49c0000000000001d7a799p+0 |
165 | | - (hf64!("0x1.d1ef81cbbbe71p+0"), hf64!("0x1.388fb44cdcf5ap+0")), // ~ 0x1.388fb44cdcf5a0000000000002202c55p+0 |
166 | | - (hf64!("0x1.0a2014f62987cp+1"), hf64!("0x1.46bcbf47dc1e8p+0")), // ~ 0x1.46bcbf47dc1e8000000000000303aa2dp+0 |
167 | | - (hf64!("0x1.fe18a044a5501p+1"), hf64!("0x1.95decfec9c904p+0")), // ~ 0x1.95decfec9c9040000000000000159e8ep+0 |
168 | | - (hf64!("0x1.a6bb8c803147bp+2"), hf64!("0x1.e05335a6401dep+0")), // ~ 0x1.e05335a6401de00000000000027ca017p+0 |
169 | | - (hf64!("0x1.ac8538a031cbdp+2"), hf64!("0x1.e281d87098de8p+0")), // ~ 0x1.e281d87098de80000000000000ee9314p+0 |
170 | | - ]; |
171 | | - |
172 | | - for (a, b) in wlist { |
| 156 | + for (a, b) in F::WLIST { |
173 | 157 | if azz == a { |
174 | | - let tmp = if round as u64 + sign == 2 { hf64!("0x1p-52") } else { 0.0 }; |
| 158 | + let tmp = if F::Int::from(round as u8) + sign == F::Int::cast_from(2) { |
| 159 | + F::TWO_POW_NEG_SIG_BITS |
| 160 | + } else { |
| 161 | + F::ZERO |
| 162 | + }; |
175 | 163 | y1 = (b + tmp).copysign(zz); |
176 | 164 | } |
177 | 165 | } |
178 | 166 | } |
179 | 167 | } |
180 | 168 | } |
181 | 169 |
|
182 | | - let mut cvt3: u64 = y1.to_bits(); |
183 | | - cvt3 = cvt3.wrapping_add(((et.wrapping_sub(342).wrapping_sub(1023)) as u64) << 52); |
184 | | - let m0: u64 = cvt3 << 30; |
| 170 | + let mut cvt3 = y1.to_bits(); |
| 171 | + cvt3 = cvt3.wrapping_add((F::Int::cast_from(et.wrapping_sub(342).wrapping_sub(1023))) << 52); |
| 172 | + let m0 = cvt3 << 30; |
185 | 173 | let m1 = m0 >> 63; |
186 | 174 |
|
187 | | - if (m0 ^ m1) <= (1u64 << 30) { |
| 175 | + if (m0 ^ m1) <= (one << 30) { |
188 | 176 | cold_path(); |
189 | 177 |
|
190 | | - let mut cvt4: u64 = y1.to_bits(); |
191 | | - cvt4 = (cvt4 + (164 << 15)) & 0xffffffffffff0000u64; |
| 178 | + let mut cvt4 = y1.to_bits(); |
| 179 | + cvt4 = (cvt4 + (F::Int::cast_from(164) << 15)) & F::Int::cast_from(0xffffffffffff0000u64); |
192 | 180 |
|
193 | | - if ((f64::from_bits(cvt4) - y1) - dy).abs() < hf64!("0x1p-60") || (zz).abs() == 1.0 { |
194 | | - cvt3 = (cvt3 + (1u64 << 15)) & 0xffffffffffff0000u64; |
| 181 | + let magic3 = F::from_parts(false, (-60 + F::EXP_BIAS as i32) as u32, zero); |
| 182 | + if ((F::from_bits(cvt4) - y1) - dy).abs() < magic3 || (zz).abs() == F::ONE { |
| 183 | + cvt3 = (cvt3 + (one << 15)) & F::Int::cast_from(0xffffffffffff0000u64); |
195 | 184 | } |
196 | 185 | } |
197 | 186 |
|
198 | | - FpResult::ok(f64::from_bits(cvt3)) |
| 187 | + FpResult::ok(F::from_bits(cvt3)) |
199 | 188 | } |
200 | 189 |
|
201 | | -fn fmaf64(x: f64, y: f64, z: f64) -> f64 { |
202 | | - #[cfg(intrinsics_enabled)] |
203 | | - { |
204 | | - return unsafe { core::intrinsics::fmaf64(x, y, z) }; |
205 | | - } |
| 190 | +trait CbrtHelper: Float { |
| 191 | + /// 2^(n / 3) for n = [0, 1, 2] |
| 192 | + const ESCALE: [Self; 3]; |
| 193 | + /// The polynomial `c0+c1*x+c2*x^2+c3*x^3` approximates `x^(1/3)` on `[1,2]` |
| 194 | + /// with maximal error < 9.2e-5 (attained at x=2) |
| 195 | + const C: [Self; 4]; |
| 196 | + const U0: Self; |
| 197 | + const U1: Self; |
| 198 | + const RSC: [Self; 6]; |
| 199 | + const OFF: [Self; 4]; |
| 200 | + const WLIST: [(Self, Self); 7]; |
| 201 | + const AZMAGIC1: Self; |
| 202 | + const AZMAGIC2: Self; |
| 203 | + const AZMAGIC3: Self; |
| 204 | + const AZMAGIC4: Self; |
| 205 | + fn fma(self, y: Self, z: Self) -> Self; |
| 206 | +} |
206 | 207 |
|
207 | | - #[cfg(not(intrinsics_enabled))] |
208 | | - { |
209 | | - return super::fma(x, y, z); |
| 208 | +impl CbrtHelper for f64 { |
| 209 | + const ESCALE: [Self; 3] = [ |
| 210 | + 1.0, |
| 211 | + hf64!("0x1.428a2f98d728bp+0"), /* 2^(1/3) */ |
| 212 | + hf64!("0x1.965fea53d6e3dp+0"), /* 2^(2/3) */ |
| 213 | + ]; |
| 214 | + |
| 215 | + const C: [Self; 4] = [ |
| 216 | + hf64!("0x1.1b0babccfef9cp-1"), |
| 217 | + hf64!("0x1.2c9a3e94d1da5p-1"), |
| 218 | + hf64!("-0x1.4dc30b1a1ddbap-3"), |
| 219 | + hf64!("0x1.7a8d3e4ec9b07p-6"), |
| 220 | + ]; |
| 221 | + |
| 222 | + const U0: Self = hf64!("0x1.5555555555555p-2"); |
| 223 | + |
| 224 | + const U1: Self = hf64!("0x1.c71c71c71c71cp-3"); |
| 225 | + |
| 226 | + const RSC: [Self; 6] = [1.0, -1.0, 0.5, -0.5, 0.25, -0.25]; |
| 227 | + |
| 228 | + const OFF: [Self; 4] = [hf64!("0x1p-53"), 0.0, 0.0, 0.0]; |
| 229 | + |
| 230 | + const WLIST: [(Self, Self); 7] = [ |
| 231 | + (hf64!("0x1.3a9ccd7f022dbp+0"), hf64!("0x1.1236160ba9b93p+0")), // ~ 0x1.1236160ba9b930000000000001e7e8fap+0 |
| 232 | + (hf64!("0x1.7845d2faac6fep+0"), hf64!("0x1.23115e657e49cp+0")), // ~ 0x1.23115e657e49c0000000000001d7a799p+0 |
| 233 | + (hf64!("0x1.d1ef81cbbbe71p+0"), hf64!("0x1.388fb44cdcf5ap+0")), // ~ 0x1.388fb44cdcf5a0000000000002202c55p+0 |
| 234 | + (hf64!("0x1.0a2014f62987cp+1"), hf64!("0x1.46bcbf47dc1e8p+0")), // ~ 0x1.46bcbf47dc1e8000000000000303aa2dp+0 |
| 235 | + (hf64!("0x1.fe18a044a5501p+1"), hf64!("0x1.95decfec9c904p+0")), // ~ 0x1.95decfec9c9040000000000000159e8ep+0 |
| 236 | + (hf64!("0x1.a6bb8c803147bp+2"), hf64!("0x1.e05335a6401dep+0")), // ~ 0x1.e05335a6401de00000000000027ca017p+0 |
| 237 | + (hf64!("0x1.ac8538a031cbdp+2"), hf64!("0x1.e281d87098de8p+0")), // ~ 0x1.e281d87098de80000000000000ee9314p+0 |
| 238 | + ]; |
| 239 | + |
| 240 | + const AZMAGIC1: Self = hf64!("0x1.9b78223aa307cp+1"); |
| 241 | + const AZMAGIC2: Self = hf64!("0x1.79d15d0e8d59cp+0"); |
| 242 | + const AZMAGIC3: Self = hf64!("0x1.a202bfc89ddffp+2"); |
| 243 | + const AZMAGIC4: Self = hf64!("0x1.de87aa837820fp+0"); |
| 244 | + |
| 245 | + fn fma(self, y: Self, z: Self) -> Self { |
| 246 | + #[cfg(intrinsics_enabled)] |
| 247 | + { |
| 248 | + return unsafe { core::intrinsics::fmaf64(self, y, z) }; |
| 249 | + } |
| 250 | + |
| 251 | + #[cfg(not(intrinsics_enabled))] |
| 252 | + { |
| 253 | + return super::fma(self, y, z); |
| 254 | + } |
210 | 255 | } |
211 | 256 | } |
212 | 257 |
|
|
0 commit comments