@@ -20,11 +20,12 @@ Division routines must solve for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`
2020 `res = (m_a / m_b) * 2^{p_a - p_b}`
2121- Check for early exits (infinity, zero, etc).
2222- If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
23+ - Set the implicit bit so math is correct.
2324- Shift the significand (with implicit bit) fully left so that arithmetic can happen with
2425 greater precision.
25- - Calculate the reciprocal of `b`, `x`
26- - Multiply: `res = m_a * x_b * 2^{p_a - p_b}`
27- - Reapply rounding
26+ - Calculate the reciprocal of `b`, `x`.
27+ - Multiply: `res = m_a * x_b * 2^{p_a - p_b}`.
28+ - Reapply rounding.
2829
2930The reciprocal and multiplication steps must happen with more bits of precision than the
3031mantissa has; otherwise, precision would be lost rounding at each step. It is sufficient to use
@@ -49,7 +50,6 @@ x_{n+1} = x_n - f(x_n) / f'(x_n)
4950
5051Applying this to finding the reciprocal:
5152
52-
5353```text
54541 / x = b
5555
@@ -75,6 +75,14 @@ use crate::float::Float;
7575use crate :: int:: { CastFrom , CastInto , DInt , HInt , Int , MinInt } ;
7676use core:: mem:: size_of;
7777
78+ macro_rules! guess {
79+ ( $ty: ty) => {
80+ const { ( INITIAL_GUESS >> ( u128 :: BITS - <$ty>:: BITS ) ) as $ty }
81+ } ;
82+ }
83+
84+ const INITIAL_GUESS : u128 = 0x7504f333f9de6108b2fb1366eaa6a542 ;
85+
7886/// Type-specific configuration used for float division
7987trait FloatDivision : Float
8088where
@@ -171,7 +179,9 @@ impl FloatDivision for f32 {
171179 /// for float32 division. This is expected to be useful for some 16-bit
172180 /// targets. Not used by default as it requires performing more work during
173181 /// rounding and would hardly help on regular 32- or 64-bit targets.
174- const C_HW : HalfRep < Self > = 0x7504 ;
182+ const C_HW : HalfRep < Self > = guess ! ( HalfRep <Self >) ;
183+
184+ // 0x7504;
175185}
176186
177187impl FloatDivision for f64 {
@@ -185,13 +195,16 @@ impl FloatDivision for f64 {
185195impl FloatDivision for f128 {
186196 const HALF_ITERATIONS : usize = 4 ;
187197
188- const C_HW : HalfRep < Self > = 0x7504F333 << ( HalfRep :: < Self > :: BITS - 32 ) ;
198+ // const C_HW: HalfRep<Self> = 0x7504F333 << (HalfRep::<Self>::BITS - 32);
199+ const C_HW : HalfRep < Self > = 0x7504f333f9de6108 ;
189200}
190201
191202extern crate std;
192203#[ allow( unused) ]
193204use std:: { dbg, fmt, println} ;
194205
206+ // TODO: try adding const where possible
207+
195208fn div < F > ( a : F , b : F ) -> F
196209where
197210 F : FloatDivision ,
@@ -332,7 +345,8 @@ where
332345 b_significand,
333346 ) ;
334347
335- // Transform to a fixed-point representation
348+ // Transform to a fixed-point representation. We know this is in the range [1.0, 2.0) since
349+ // the implicit bit has been set explicitly.
336350 let b_uq1 = b_significand << ( F :: BITS - significand_bits - 1 ) ;
337351
338352 println ! ( "b_uq1: {:#034x}" , b_uq1) ;
@@ -384,94 +398,92 @@ where
384398 // b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
385399 let c_hw = F :: C_HW ;
386400
401+ debug_assert ! (
402+ b_uq1_hw & HalfRep :: <F >:: ONE << ( HalfRep :: <F >:: BITS - 1 ) > HalfRep :: <F >:: ZERO
403+ ) ;
387404 // b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572,
388405 // so x0 fits to UQ0.HW without wrapping.
389- let x_uq0_hw: HalfRep < F > = {
390- let mut x_uq0_hw: HalfRep < F > =
391- c_hw. wrapping_sub ( b_uq1_hw /* exact b_hw/2 as UQ0.HW */ ) ;
392-
393- // An e_0 error is comprised of errors due to
394- // * x0 being an inherently imprecise first approximation of 1/b_hw
395- // * C_hw being some (irrational) number **truncated** to W0 bits
396- // Please note that e_0 is calculated against the infinitely precise
397- // reciprocal of b_hw (that is, **truncated** version of b).
398- //
399- // e_0 <= 3/4 - 1/sqrt(2) + 2^-W0
400- //
401- // By construction, 1 <= b < 2
402- // f(x) = x * (2 - b*x) = 2*x - b*x^2
403- // f'(x) = 2 * (1 - b*x)
404- //
405- // On the [0, 1] interval, f(0) = 0,
406- // then it increses until f(1/b) = 1 / b, maximum on (0, 1),
407- // then it decreses to f(1) = 2 - b
408- //
409- // Let g(x) = x - f(x) = b*x^2 - x.
410- // On (0, 1/b), g(x) < 0 <=> f(x) > x
411- // On (1/b, 1], g(x) > 0 <=> f(x) < x
412- //
413- // For half-width iterations, b_hw is used instead of b.
414- for _ in 0 ..half_iterations {
415- // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
416- // of corr_UQ1_hw.
417- // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
418- // On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided
419- // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is
420- // expected to be strictly positive because b_UQ1_hw has its highest bit set
421- // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1).
422- let corr_uq1_hw: HalfRep < F > = zero
423- . wrapping_sub (
424- ( F :: Int :: from ( x_uq0_hw) . wrapping_mul ( F :: Int :: from ( b_uq1_hw) ) ) >> hw,
425- )
426- . cast ( ) ;
427-
428- // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally
429- // obtaining an UQ1.(HW-1) number and proving its highest bit could be
430- // considered to be 0 to be able to represent it in UQ0.HW.
431- // From the above analysis of f(x), if corr_UQ1_hw would be represented
432- // without any intermediate loss of precision (that is, in twice_rep_t)
433- // x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly
434- // less otherwise. On the other hand, to obtain [1.]000..., one have to pass
435- // 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due
436- // to 1.0 being not representable as UQ0.HW).
437- // The fact corr_UQ1_hw was virtually round up (due to result of
438- // multiplication being **first** truncated, then negated - to improve
439- // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw.
440- x_uq0_hw = ( F :: Int :: from ( x_uq0_hw) . wrapping_mul ( F :: Int :: from ( corr_uq1_hw) )
441- >> ( hw - 1 ) )
442- . cast ( ) ;
443-
444- // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t
445- // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after
446- // any number of iterations, so just subtract 2 from the reciprocal
447- // approximation after last iteration.
448- //
449- // In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW:
450- // corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1
451- // = 1 - e_n * b_hw + 2*eps1
452- // x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2
453- // = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2
454- // = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2
455- // e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2
456- // = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw
457- // \------ >0 -------/ \-- >0 ---/
458- // abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U)
459- }
406+ let mut x_uq0_hw: HalfRep < F > =
407+ c_hw. wrapping_sub ( b_uq1_hw /* exact b_hw/2 as UQ0.HW */ ) ;
408+
409+ // An e_0 error is comprised of errors due to
410+ // * x0 being an inherently imprecise first approximation of 1/b_hw
411+ // * C_hw being some (irrational) number **truncated** to W0 bits
412+ // Please note that e_0 is calculated against the infinitely precise
413+ // reciprocal of b_hw (that is, **truncated** version of b).
414+ //
415+ // e_0 <= 3/4 - 1/sqrt(2) + 2^-W0
416+ //
417+ // By construction, 1 <= b < 2
418+ // f(x) = x * (2 - b*x) = 2*x - b*x^2
419+ // f'(x) = 2 * (1 - b*x)
420+ //
421+ // On the [0, 1] interval, f(0) = 0,
422+ // then it increases until f(1/b) = 1 / b, maximum on (0, 1),
423+ // then it decreases to f(1) = 2 - b
424+ //
425+ // Let g(x) = x - f(x) = b*x^2 - x.
426+ // On (0, 1/b), g(x) < 0 <=> f(x) > x
427+ // On (1/b, 1], g(x) > 0 <=> f(x) < x
428+ //
429+ // For half-width iterations, b_hw is used instead of b.
430+ for _ in 0 ..half_iterations {
431+ // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
432+ // of corr_UQ1_hw.
433+ // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
434+ // On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided
435+ // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is
436+ // expected to be strictly positive because b_UQ1_hw has its highest bit set
437+ // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1).
438+ let corr_uq1_hw: HalfRep < F > = zero
439+ . wrapping_sub ( ( F :: Int :: from ( x_uq0_hw) . wrapping_mul ( F :: Int :: from ( b_uq1_hw) ) ) >> hw)
440+ . cast ( ) ;
460441
461- // For initial half-width iterations, U = 2^-HW
462- // Let abs(e_n) <= u_n * U,
463- // then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U)
464- // u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2)
442+ // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally
443+ // obtaining an UQ1.(HW-1) number and proving its highest bit could be
444+ // considered to be 0 to be able to represent it in UQ0.HW.
445+ // From the above analysis of f(x), if corr_UQ1_hw would be represented
446+ // without any intermediate loss of precision (that is, in twice_rep_t)
447+ // x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly
448+ // less otherwise. On the other hand, to obtain [1.]000..., one has to pass
449+ // 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due
450+ // to 1.0 being not representable as UQ0.HW).
451+ // The fact that corr_UQ1_hw was virtually rounded up (due to the result of
452+ // multiplication being **first** truncated, then negated - to improve
453+ // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw.
454+ x_uq0_hw =
455+ ( F :: Int :: from ( x_uq0_hw) . wrapping_mul ( F :: Int :: from ( corr_uq1_hw) ) >> ( hw - 1 ) ) . cast ( ) ;
456+
457+ // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t
458+ // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after
459+ // any number of iterations, so just subtract 2 from the reciprocal
460+ // approximation after last iteration.
465461 //
466- // Account for possible overflow (see above). For an overflow to occur for the
467- // first time, for "ideal" corr_UQ1_hw (that is, without intermediate
468- // truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum
469- // value representable in UQ0.HW or less by 1. This means that 1/b_hw have to
470- // be not below that value (see g(x) above), so it is safe to decrement just
471- // once after the final iteration. On the other hand, an effective value of
472- // divisor changes after this point (from b_hw to b), so adjust here.
473- x_uq0_hw. wrapping_sub ( HalfRep :: < F > :: ONE )
474- } ;
462+ // In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW:
463+ // corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1
464+ // = 1 - e_n * b_hw + 2*eps1
465+ // x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2
466+ // = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2
467+ // = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2
468+ // e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2
469+ // = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw
470+ // \------ >0 -------/ \-- >0 ---/
471+ // abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U)
472+ }
473+
474+ // For initial half-width iterations, U = 2^-HW
475+ // Let abs(e_n) <= u_n * U,
476+ // then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U)
477+ // u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2)
478+ //
479+ // Account for possible overflow (see above). For an overflow to occur for the
480+ // first time, for "ideal" corr_UQ1_hw (that is, without intermediate
481+ // truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum
482+ // value representable in UQ0.HW or less by 1. This means that 1/b_hw has to
483+ // be not below that value (see g(x) above), so it is safe to decrement just
484+ // once after the final iteration. On the other hand, an effective value of
485+ // divisor changes after this point (from b_hw to b), so adjust here.
486+ x_uq0_hw = x_uq0_hw. wrapping_sub ( HalfRep :: < F > :: ONE ) ;
475487
476488 // Error estimations for full-precision iterations are calculated just
477489 // as above, but with U := 2^-W and taking extra decrementing into account.
@@ -544,11 +556,6 @@ where
544556 as u32 )
545557 . cast ( ) ;
546558 }
547- } else {
548- assert ! (
549- F :: BITS != 32 ,
550- "native full iterations onlydoaijfoisd supports f32"
551- ) ;
552559 }
553560
554561 // Finally, account for possible overflow, as explained above.
0 commit comments