11/*
2+ Floating point division routines.
23
3- Solving for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`.
4+ This module documentation gives an overview of the method used. More documentation is inline.
5+
6+ Relevant notation:
7+
8+ - `m_a`: the mantissa of `a`, in base 2
9+ - `p_a`: the exponent of `a`, in base 2. I.e. `a = m_a * 2^p_a`
10+ - `uqN` (e.g. `uq1`): this refers to Q notation for fixed-point numbers. UQ1.31 is an unsigned
11+ fixed-point number with 1 integral bit, and 31 decimal bits. A `uqN` variable of type `uM`
12+ will have N bits of integer and M-N bits of fraction.
13+ - `hw`: half width, i.e. for `f64` this will be a `u32`.
14+
15+ # Method Overview
16+
17+ Division routines must solve for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`. Process:
418
519- Separate the exponent and significand
6- `res = (m_a / m_b) * 2^( p_a - p_b) `
7- - Check for early exits
20+ `res = (m_a / m_b) * 2^{ p_a - p_b} `
21+ - Check for early exits (infinity, zero, etc).
822- If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
9- - Shift the significand (with implicit bit) fully left so that arithmetic can happen with greater
10- precision.
11- - Calculate the reciprocal of `b`, `r`
12- - Multiply: `res = m_a * r_b * 2^(p_a - p_b)`
23+ - Shift the significand (with implicit bit) fully left so that arithmetic can happen with
24+ greater precision.
25+ - Calculate the reciprocal of `b`, `x`
26+ - Multiply: `res = m_a * x_b * 2^{p_a - p_b}`
27+ - Reapply rounding
28+
29+ The reciprocal and multiplication steps must happen with more bits of precision than the
30+ mantissa has; otherwise, precision would be lost rounding at each step. It is sufficient to use
31+ the float's same-sized integer.
32+
33+ # Reciprocal calculation
34+
35+ Calculating the reciprocal is the most complicated part of this process. It uses the
36+ [Newton-Raphson method], which picks an initial estimation (of the reciprocal) and performs
37+ a number of iterations to increase its precision.
38+
39+ In general, Newton's method takes the following form:
40+
41+ ```
42+ `x_n` is a guess or the result of a previous iteration. Increasing `n` converges to the
43+ desired result.
44+
45+ The result is a zero of `f(x)`.
46+
47+ x_{n+1} = x_n - f(x_n) / f'(x_n)
48+ ```
49+
50+ Applying this to finding the reciprocal:
51+
52+
53+ ```text
54+ 1 / x = b
55+
56+ Rearrange so we can solve by finding a zero
57+ 0 = (1 / x) - b = f(x)
58+
59+ f'(x) = -x^{-2}
60+
61+ x_{n+1} = 2*x_n - b*x_n^2
62+
63+
64+ ```
65+
66+ [Newton-Raphson method]: https://en.wikipedia.org/wiki/Newton%27s_method
1367
1468The most complicated part of this process is calculating the reciprocal.
1569
16- Note that variables named e.g. `uq0` refer to Q notation. E.g. Q1.31 refers to a fixed-point
17- number that has 1 bit of integer and 31 bits of decimal.
1870
1971*/
2072
73+ use super :: HalfRep ;
2174use crate :: float:: Float ;
2275use crate :: int:: { CastFrom , CastInto , DInt , HInt , Int , MinInt } ;
23-
24- use super :: HalfRep ;
76+ use core:: mem:: size_of;
2577
2678/// Type-specific configuration used for float division
2779trait FloatDivision : Float
3284 const HALF_ITERATIONS : usize ;
3385
3486 /// Iterations that are done at the full float's width. Must be at least one.
35- const FULL_ITERATIONS : usize ;
87+ const FULL_ITERATIONS : usize = 1 ;
3688
37- const USE_NATIVE_FULL_ITERATIONS : bool = false ;
89+ const USE_NATIVE_FULL_ITERATIONS : bool = size_of :: < Self > ( ) < size_of :: < * const ( ) > ( ) ;
3890
3991 /// C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW
4092 /// with W0 being either 16 or 32 and W0 <= HW.
@@ -90,10 +142,30 @@ where
90142 } ;
91143}
92144
145+ /// Calculate the number of iterations required to get needed precision of a float type.
146+ ///
147+ /// This returns `(h, f)` where `h` is the number of iterations to be donei using integers
148+ /// at half the float's width, and `f` is the number of iterations done using integers of the
149+ /// float's full width. Doing some iterations at half width is an optimization when the float
150+ /// is larger than a word.
151+ ///
152+ /// ASSUMPTION: the initial estimate should have at least 8 bits of precision. If this is not
153+ /// true, results will be inaccurate.
154+ const fn calc_iterations < F : Float > ( ) -> ( usize , usize ) {
155+ // Precision doubles with each iteration
156+ let total_iterations = F :: BITS . ilog2 ( ) as usize - 2 ;
157+
158+ if size_of :: < F > ( ) < size_of :: < * const ( ) > ( ) {
159+ // No need to use half iterations if math at the half
160+ ( 0 , total_iterations)
161+ } else {
162+ ( total_iterations - 1 , 1 )
163+ }
164+ }
165+
93166impl FloatDivision for f32 {
94167 const HALF_ITERATIONS : usize = 0 ;
95168 const FULL_ITERATIONS : usize = 3 ;
96- const USE_NATIVE_FULL_ITERATIONS : bool = true ;
97169
98170 /// Use 16-bit initial estimation in case we are using half-width iterations
99171 /// for float32 division. This is expected to be useful for some 16-bit
@@ -104,15 +176,14 @@ impl FloatDivision for f32 {
104176
105177impl FloatDivision for f64 {
106178 const HALF_ITERATIONS : usize = 3 ;
107- const FULL_ITERATIONS : usize = 1 ;
179+
108180 /// HW is at least 32. Shifting into the highest bits if needed.
109181 const C_HW : HalfRep < Self > = 0x7504F333 << ( HalfRep :: < Self > :: BITS - 32 ) ;
110182}
111183
112184#[ cfg( not( feature = "no-f16-f128" ) ) ]
113185impl FloatDivision for f128 {
114186 const HALF_ITERATIONS : usize = 4 ;
115- const FULL_ITERATIONS : usize = 1 ;
116187
117188 const C_HW : HalfRep < Self > = 0x7504F333 << ( HalfRep :: < Self > :: BITS - 32 ) ;
118189}
@@ -169,6 +240,7 @@ where
169240 let inf_rep = exponent_mask;
170241 let quiet_bit = implicit_bit >> 1 ;
171242 let qnan_rep = exponent_mask | quiet_bit;
243+ let ( half_iterations, full_iterations) = calc_iterations :: < F > ( ) ;
172244
173245 let a_rep = a. repr ( ) ;
174246 let b_rep = b. repr ( ) ;
@@ -300,7 +372,7 @@ where
300372 // NB: Using half-width iterations increases computation errors due to
301373 // rounding, so error estimations have to be computed taking the selected
302374 // mode into account!
303- let mut x_uq0 = if F :: HALF_ITERATIONS > 0 {
375+ let mut x_uq0 = if half_iterations > 0 {
304376 // Starting with (n-1) half-width iterations
305377 let b_uq1_hw: HalfRep < F > = b_uq1. hi ( ) ;
306378
@@ -339,7 +411,7 @@ where
339411 // On (1/b, 1], g(x) > 0 <=> f(x) < x
340412 //
341413 // For half-width iterations, b_hw is used instead of b.
342- for _ in 0 ..F :: HALF_ITERATIONS {
414+ for _ in 0 ..half_iterations {
343415 // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
344416 // of corr_UQ1_hw.
345417 // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
@@ -458,12 +530,12 @@ where
458530 x_uq0
459531 } ;
460532
461- if F :: USE_NATIVE_FULL_ITERATIONS {
533+ if full_iterations > 1 {
462534 // Need to use concrete types since `F::Int::D` might not support math. So, restrict to
463535 // one type.
464536 assert ! ( F :: BITS == 32 , "native full iterations only supports f32" ) ;
465537
466- for _ in 0 ..F :: FULL_ITERATIONS {
538+ for _ in 0 ..full_iterations {
467539 let corr_uq1: u32 = 0 . wrapping_sub (
468540 ( ( CastInto :: < u32 > :: cast ( x_uq0) as u64 ) * ( CastInto :: < u32 > :: cast ( b_uq1) as u64 ) )
469541 >> F :: BITS ,
@@ -472,6 +544,11 @@ where
472544 as u32 )
473545 . cast ( ) ;
474546 }
547+ } else {
548+ assert ! (
549+ F :: BITS != 32 ,
550+ "native full iterations onlydoaijfoisd supports f32"
551+ ) ;
475552 }
476553
477554 // Finally, account for possible overflow, as explained above.
0 commit comments