@@ -189,163 +189,18 @@ struct erf_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScala
189189{
190190 static FloatingPoint __call (NBL_CONST_REF_ARG (FloatingPoint) _x)
191191 {
192- // glibc implementation
193- const float64_t tiny = NBL_FP64_LITERAL (1e-300 ),
194- one = NBL_FP64_LITERAL (1. 00000000000000000000e+00 ), /* 0x3FF00000, 0x00000000 */
195- erx = NBL_FP64_LITERAL (8. 45062911510467529297e-01 ); /* 0x3FEB0AC1, 0x60000000 */
196-
197- // Coefficients for approximation to erf in [0,0.84375]
198- const float64_t efx = NBL_FP64_LITERAL (1. 28379167095512586316e-01 ); /* 0x3FC06EBA, 0x8214DB69 */
199- const float64_t pp0 = NBL_FP64_LITERAL (1. 28379167095512558561e-01 ); /* 0x3FC06EBA, 0x8214DB68 */
200- const float64_t pp1 = NBL_FP64_LITERAL (-3. 25042107247001499370e-01 ); /* 0xBFD4CD7D, 0x691CB913 */
201- const float64_t pp2 = NBL_FP64_LITERAL (-2. 84817495755985104766e-02 ); /* 0xBF9D2A51, 0xDBD7194F */
202- const float64_t pp3 = NBL_FP64_LITERAL (-5. 77027029648944159157e-03 ); /* 0xBF77A291, 0x236668E4 */
203- const float64_t pp4 = NBL_FP64_LITERAL (-2. 37630166566501626084e-05 ); /* 0xBEF8EAD6, 0x120016AC */
204- const float64_t qq1 = NBL_FP64_LITERAL (3. 97917223959155352819e-01 ); /* 0x3FD97779, 0xCDDADC09 */
205- const float64_t qq2 = NBL_FP64_LITERAL (6. 50222499887672944485e-02 ); /* 0x3FB0A54C, 0x5536CEBA */
206- const float64_t qq3 = NBL_FP64_LITERAL (5. 08130628187576562776e-03 ); /* 0x3F74D022, 0xC4D36B0F */
207- const float64_t qq4 = NBL_FP64_LITERAL (1. 32494738004321644526e-04 ); /* 0x3F215DC9, 0x221C1A10 */
208- const float64_t qq5 = NBL_FP64_LITERAL (-3. 96022827877536812320e-06 ); /* 0xBED09C43, 0x42A26120 */
209-
210- //Coefficients for approximation to erf in [0.84375,1.25]
211- const float64_t pa0 = NBL_FP64_LITERAL (-2. 36211856075265944077e-03 ); /* 0xBF6359B8, 0xBEF77538 */
212- const float64_t pa1 = NBL_FP64_LITERAL (4. 14856118683748331666e-01 ); /* 0x3FDA8D00, 0xAD92B34D */
213- const float64_t pa2 = NBL_FP64_LITERAL (-3. 72207876035701323847e-01 ); /* 0xBFD7D240, 0xFBB8C3F1 */
214- const float64_t pa3 = NBL_FP64_LITERAL (3. 18346619901161753674e-01 ); /* 0x3FD45FCA, 0x805120E4 */
215- const float64_t pa4 = NBL_FP64_LITERAL (-1. 10894694282396677476e-01 ); /* 0xBFBC6398, 0x3D3E28EC */
216- const float64_t pa5 = NBL_FP64_LITERAL (3. 54783043256182359371e-02 ); /* 0x3FA22A36, 0x599795EB */
217- const float64_t pa6 = NBL_FP64_LITERAL (-2. 16637559486879084300e-03 ); /* 0xBF61BF38, 0x0A96073F */
218- const float64_t qa1 = NBL_FP64_LITERAL (1. 06420880400844228286e-01 ); /* 0x3FBB3E66, 0x18EEE323 */
219- const float64_t qa2 = NBL_FP64_LITERAL (5. 40397917702171048937e-01 ); /* 0x3FE14AF0, 0x92EB6F33 */
220- const float64_t qa3 = NBL_FP64_LITERAL (7. 18286544141962662868e-02 ); /* 0x3FB2635C, 0xD99FE9A7 */
221- const float64_t qa4 = NBL_FP64_LITERAL (1. 26171219808761642112e-01 ); /* 0x3FC02660, 0xE763351F */
222- const float64_t qa5 = NBL_FP64_LITERAL (1. 36370839120290507362e-02 ); /* 0x3F8BEDC2, 0x6B51DD1C */
223- const float64_t qa6 = NBL_FP64_LITERAL (1. 19844998467991074170e-02 ); /* 0x3F888B54, 0x5735151D */
224-
225- // Coefficients for approximation to erfc in [1.25,1/0.35]
226- const float64_t ra0 = NBL_FP64_LITERAL (-9. 86494403484714822705e-03 ); /* 0xBF843412, 0x600D6435 */
227- const float64_t ra1 = NBL_FP64_LITERAL (-6. 93858572707181764372e-01 ); /* 0xBFE63416, 0xE4BA7360 */
228- const float64_t ra2 = NBL_FP64_LITERAL (-1. 05586262253232909814e+01 ); /* 0xC0251E04, 0x41B0E726 */
229- const float64_t ra3 = NBL_FP64_LITERAL (-6. 23753324503260060396e+01 ); /* 0xC04F300A, 0xE4CBA38D */
230- const float64_t ra4 = NBL_FP64_LITERAL (-1. 62396669462573470355e+02 ); /* 0xC0644CB1, 0x84282266 */
231- const float64_t ra5 = NBL_FP64_LITERAL (-1. 84605092906711035994e+02 ); /* 0xC067135C, 0xEBCCABB2 */
232- const float64_t ra6 = NBL_FP64_LITERAL (-8. 12874355063065934246e+01 ); /* 0xC0545265, 0x57E4D2F2 */
233- const float64_t ra7 = NBL_FP64_LITERAL (-9. 81432934416914548592e+00 ); /* 0xC023A0EF, 0xC69AC25C */
234- const float64_t sa1 = NBL_FP64_LITERAL (1. 96512716674392571292e+01 ); /* 0x4033A6B9, 0xBD707687 */
235- const float64_t sa2 = NBL_FP64_LITERAL (1. 37657754143519042600e+02 ); /* 0x4061350C, 0x526AE721 */
236- const float64_t sa3 = NBL_FP64_LITERAL (4. 34565877475229228821e+02 ); /* 0x407B290D, 0xD58A1A71 */
237- const float64_t sa4 = NBL_FP64_LITERAL (6. 45387271733267880336e+02 ); /* 0x40842B19, 0x21EC2868 */
238- const float64_t sa5 = NBL_FP64_LITERAL (4. 29008140027567833386e+02 ); /* 0x407AD021, 0x57700314 */
239- const float64_t sa6 = NBL_FP64_LITERAL (1. 08635005541779435134e+02 ); /* 0x405B28A3, 0xEE48AE2C */
240- const float64_t sa7 = NBL_FP64_LITERAL (6. 57024977031928170135e+00 ); /* 0x401A47EF, 0x8E484A93 */
241- const float64_t sa8 = NBL_FP64_LITERAL (-6. 04244152148580987438e-02 ); /* 0xBFAEEFF2, 0xEE749A62 */
242-
243- // Coefficients for approximation to erfc in [1/.35,28]
244- const float64_t rb0 = NBL_FP64_LITERAL (-9. 86494292470009928597e-03 ); /* 0xBF843412, 0x39E86F4A */
245- const float64_t rb1 = NBL_FP64_LITERAL (-7. 99283237680523006574e-01 ); /* 0xBFE993BA, 0x70C285DE */
246- const float64_t rb2 = NBL_FP64_LITERAL (-1. 77579549177547519889e+01 ); /* 0xC031C209, 0x555F995A */
247- const float64_t rb3 = NBL_FP64_LITERAL (-1. 60636384855821916062e+02 ); /* 0xC064145D, 0x43C5ED98 */
248- const float64_t rb4 = NBL_FP64_LITERAL (-6. 37566443368389627722e+02 ); /* 0xC083EC88, 0x1375F228 */
249- const float64_t rb5 = NBL_FP64_LITERAL (-1. 02509513161107724954e+03 ); /* 0xC0900461, 0x6A2E5992 */
250- const float64_t rb6 = NBL_FP64_LITERAL (-4. 83519191608651397019e+02 ); /* 0xC07E384E, 0x9BDC383F */
251- const float64_t sb1 = NBL_FP64_LITERAL (3. 03380607434824582924e+01 ); /* 0x403E568B, 0x261D5190 */
252- const float64_t sb2 = NBL_FP64_LITERAL (3. 25792512996573918826e+02 ); /* 0x40745CAE, 0x221B9F0A */
253- const float64_t sb3 = NBL_FP64_LITERAL (1. 53672958608443695994e+03 ); /* 0x409802EB, 0x189D5118 */
254- const float64_t sb4 = NBL_FP64_LITERAL (3. 19985821950859553908e+03 ); /* 0x40A8FFB7, 0x688C246A */
255- const float64_t sb5 = NBL_FP64_LITERAL (2. 55305040643316442583e+03 ); /* 0x40A3F219, 0xCEDF3BE6 */
256- const float64_t sb6 = NBL_FP64_LITERAL (4. 74528541206955367215e+02 ); /* 0x407DA874, 0xE79FE763 */
257- const float64_t sb7 = NBL_FP64_LITERAL (-2. 24409524465858183362e+01 ); /* 0xC03670E2, 0x42712D62 */
258-
259- float64_t x = float64_t (_x);
260- int32_t hx, ix;
261- float64_t s, y, z, r;
262- hx = int32_t (bit_cast<uint64_t, float64_t>(x) >> 32 );
263- ix = hx & 0x7fffffff ;
264- if (ix >= 0x7ff00000 ) // erf(nan)=nan, erf(+-inf)=+-1
265- {
266- int32_t i = ((uint32_t)hx >> 31 ) << 1 ;
267- return (float64_t)(1.0 - i) + one / x;
268- }
269-
270- float64_t P, Q;
271- if (ix < 0x3feb0000 ) // |x| < 0.84375
272- {
273- if (ix < 0x3e300000 ) // |x| < 2**-28
274- {
275- if (ix < 0x00800000 )
276- {
277- // avoid underflow
278- return FloatingPoint (0.0625 * (16.0 * x + (16.0 * efx) * x));
279- }
280- return FloatingPoint (x + efx * x);
281- }
282- z = x * x;
283- r = pp0 + z * (pp1 + z * (pp2 + z * (pp3 + z * pp4)));
284- s = one + z * (qq1 + z * (qq2 + z * (qq3 + z * (qq4 + z * qq5))));
285- y = r / s;
286- return FloatingPoint (x + x * y);
287- }
288- if (ix < 0x3ff40000 ) // 0.84375 <= |x| < 1.25
289- {
290- s = abs_helper<float64_t>::__call (x) - one ;
291- P = pa0 + s * (pa1 + s * (pa2 + s * (pa3 + s * (pa4 + s * (pa5 + s * pa6)))));
292- Q = one + s * (qa1 + s * (qa2 + s * (qa3 + s * (qa4 + s * (qa5 + s * (qa5 + s * qa6))))));
293- if (hx >= 0 )
294- return FloatingPoint (erx + P / Q);
295- else
296- return FloatingPoint (-erx - P / Q);
297- }
298- if (ix >= 0x40180000 ) // inf > |x| >= 6
299- {
300- if (hx >= 0 )
301- return FloatingPoint (one - tiny);
302- else
303- return FloatingPoint (tiny - one );
304- }
305-
306- x = abs_helper<float64_t>::__call (x);
307- s = one / (x * x);
308- float64_t R, S;
309- if (ix < 0x4006DB6E ) // |x| < 1/0.35 ~2.85714
310- {
311- R = ra0 + s * (ra1 + s * (ra2 + s * (ra3 + s * (ra4 + s * (ra5 + s * (ra6 + s * ra7))))));
312- S = one + s * (sa1 + s * (sa2 + s * (sa3 + s * (sa4 + s * (sa5 + s * (sa6 + s * sa7))))));
313- }
314- else // |x| >= 1/0.35
315- {
316- R = rb0 + s * (rb1 + s * (rb2 + s * (rb3 + s * (rb4 + s * rb5))));
317- S = one + s * (sb1 + s * (sb2 + s * (sb3 + s * (sb4 + s * (sb5 + s * (sb6 + s * sb7))))));
318- }
319- z = x;
320- uint64_t z1 = bit_cast<uint64_t, float64_t>(x);
321- z1 &= 0xffffffff00000000 ;
322- z = bit_cast<float64_t, uint64_t>(z1);
323- r = exp_helper<float64_t>::__call (-z * z - 0.5625 ) * exp_helper<float64_t>::__call ((z - x) * (z + x) + R / S);
324- if (hx >= 0 )
325- return FloatingPoint (one - r / x);
326- else
327- return FloatingPoint (r / x - one );
328- }
329- };
330-
331- template<>
332- struct erf_helper<float32_t>
333- {
334- static float32_t __call (NBL_CONST_REF_ARG (float32_t) _x)
335- {
336- // A&S approximation to 1.5x10-7
337- const float32_t a1 = float32_t (NBL_FP64_LITERAL (0.254829592 ));
338- const float32_t a2 = float32_t (NBL_FP64_LITERAL (-0.284496736 ));
339- const float32_t a3 = float32_t (NBL_FP64_LITERAL (1.421413741 ));
340- const float32_t a4 = float32_t (NBL_FP64_LITERAL (-1.453152027 ));
341- const float32_t a5 = float32_t (NBL_FP64_LITERAL (1.061405429 ));
342- const float32_t p = float32_t (NBL_FP64_LITERAL (0.3275911 ));
343-
344- float32_t _sign = float32_t (sign (_x));
345- float32_t x = abs (_x);
346-
347- float32_t t = float32_t (NBL_FP64_LITERAL (1.0 )) / (float32_t (NBL_FP64_LITERAL (1.0 )) + p * x);
348- float32_t y = float32_t (NBL_FP64_LITERAL (1.0 )) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp (-x * x);
192+ const FloatingPoint a1 = FloatingPoint (NBL_FP64_LITERAL (0.254829592 ));
193+ const FloatingPoint a2 = FloatingPoint (NBL_FP64_LITERAL (-0.284496736 ));
194+ const FloatingPoint a3 = FloatingPoint (NBL_FP64_LITERAL (1.421413741 ));
195+ const FloatingPoint a4 = FloatingPoint (NBL_FP64_LITERAL (-1.453152027 ));
196+ const FloatingPoint a5 = FloatingPoint (NBL_FP64_LITERAL (1.061405429 ));
197+ const FloatingPoint p = FloatingPoint (NBL_FP64_LITERAL (0.3275911 ));
198+
199+ FloatingPoint _sign = FloatingPoint (sign (_x));
200+ FloatingPoint x = abs (_x);
201+
202+ FloatingPoint t = FloatingPoint (NBL_FP64_LITERAL (1.0 )) / (FloatingPoint (NBL_FP64_LITERAL (1.0 )) + p * x);
203+ FloatingPoint y = FloatingPoint (NBL_FP64_LITERAL (1.0 )) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp (-x * x);
349204
350205 return _sign * y;
351206 }
0 commit comments