@@ -34,7 +34,7 @@ static int32_t s_pow(int32_t base, int32_t exponent)
3434#define MP_COMPUTE_ESS (T ) ((int)((int32_t)((uint32_t)1 << (T)) * k))
3535
3636static mp_err s_mp_to_radix_recursive (const mp_int * a , char * * str , size_t * part_maxlen , size_t * part_written ,
37- int radix , int32_t k , int32_t t , bool pad , mp_int * P , mp_int * R )
37+ int radix , int32_t k , int32_t t , bool pad , bool first , mp_int * P , mp_int * R )
3838{
3939
4040 mp_int r , q , a1 ;
@@ -47,42 +47,45 @@ static mp_err s_mp_to_radix_recursive(const mp_int *a, char **str, size_t *part_
4747
4848 } else {
4949 if ((err = mp_init_multi (& q , & r , & a1 , NULL )) != MP_OKAY ) goto LTM_ERR ;
50- /*
51- Barrett reduction. A step by step proof can be found at
52- https://www.nayuki.io/page/barrett-reduction-algorithm
50+ if (first ) {
51+ if ((err = mp_div (a , & P [t ], & q , & r )) != MP_OKAY ) goto LTM_ERR ;
52+ } else {
53+ /*
54+ Barrett reduction. A step by step proof can be found at
55+ https://www.nayuki.io/page/barrett-reduction-algorithm
5356
54- See also: Modern Computer Arithmetic, version 0.5.9, page 59
55- */
57+ See also: Modern Computer Arithmetic, version 0.5.9, page 59
58+ */
5659
57- Beta = MP_COMPUTE_ESS (t + 1 );
60+ Beta = MP_COMPUTE_ESS (t + 1 );
5861
59- /* Q = floor(A1 * I / 2^Beta) */
60- /* I = floor( (2^(2*Beta)) / B) Here we have R[t] = I, P[t] = B */
61- if ((err = mp_mul (a , & R [t ], & q )) != MP_OKAY ) goto LTM_ERR ;
62- if ((err = mp_div_2d (& q , Beta , & q , NULL )) != MP_OKAY ) goto LTM_ERR ;
62+ /* Q = floor(A1 * I / 2^Beta) */
63+ /* I = floor( (2^(2*Beta)) / B) Here we have R[t] = I, P[t] = B */
64+ if ((err = mp_mul (a , & R [t ], & q )) != MP_OKAY ) goto LTM_ERR ;
65+ if ((err = mp_div_2d (& q , Beta , & q , NULL )) != MP_OKAY ) goto LTM_ERR ;
6366
64- /* R = A - Q*B */
65- if ((err = mp_mul (& q , & P [t ], & r )) != MP_OKAY ) goto LTM_ERR ;
66- if ((err = mp_sub (a , & r , & r )) != MP_OKAY ) goto LTM_ERR ;
67+ /* R = A - Q*B */
68+ if ((err = mp_mul (& q , & P [t ], & r )) != MP_OKAY ) goto LTM_ERR ;
69+ if ((err = mp_sub (a , & r , & r )) != MP_OKAY ) goto LTM_ERR ;
6770
68- /* We can use this simple correction because of the way we computed the reciprocal */
69- if (r .sign == MP_NEG ) {
70- if ((err = mp_decr (& q )) != MP_OKAY ) goto LTM_ERR ;
71- if ((err = mp_add (& r , & P [t ], & r )) != MP_OKAY ) goto LTM_ERR ;
71+ /* We can use this simple correction because of the way we computed the reciprocal */
72+ if (r .sign == MP_NEG ) {
73+ if ((err = mp_decr (& q )) != MP_OKAY ) goto LTM_ERR ;
74+ if ((err = mp_add (& r , & P [t ], & r )) != MP_OKAY ) goto LTM_ERR ;
75+ }
7276 }
73-
7477 /* Go down the lists while climbing up the tree. */
7578 t -- ;
7679
7780 /* Follow branches */
7881 if (mp_iszero (& q ) && (!pad )) {
7982 if ((err = s_mp_to_radix_recursive (& r , str , part_maxlen , part_written , radix ,
80- k , t , false, P , R )) != MP_OKAY ) goto LTM_ERR ;
83+ k , t , false, false, P , R )) != MP_OKAY ) goto LTM_ERR ;
8184 } else {
8285 if ((err = s_mp_to_radix_recursive (& q , str , part_maxlen , part_written , radix ,
83- k , t , pad , P , R )) != MP_OKAY ) goto LTM_ERR ;
86+ k , t , pad , false, P , R )) != MP_OKAY ) goto LTM_ERR ;
8487 if ((err = s_mp_to_radix_recursive (& r , str , part_maxlen , part_written , radix ,
85- k , t , true, P , R )) != MP_OKAY ) goto LTM_ERR ;
88+ k , t , true, false, P , R )) != MP_OKAY ) goto LTM_ERR ;
8689 }
8790 mp_clear_multi (& q , & r , & a1 , NULL );
8891 }
@@ -97,30 +100,20 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
97100 mp_err err ;
98101 int32_t n = 0 , k , t = 0 , steps = 0 ;
99102 int ilog2a ;
100- #ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
101- int s , g = 3 ;
102- mp_int M2 , M4 ;
103- bool use_newton_raphson = true;
104- #endif
105103
106104 /* Use given buffer directly, no temporary buffers for the individual chunks */
107105 char * * sptr = & str ;
108106 /* Size of the chunk */
109107 size_t part_written = 0 ;
110108 size_t part_maxlen = maxlen ;
111109
110+ bool num_ovf = false;
111+
112112 /* List of reciprocals */
113113 mp_int * R = NULL ;
114114 /* List of moduli */
115115 mp_int * P = NULL ;
116116
117- #ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
118- /* Be nice and utter a warning. For now. */
119- if (radix != 10 ) {
120- fprintf (stderr ,"The Newton-Raphson method is for base 10 only!\n" );
121- }
122- #endif
123-
124117 /* Denominator for the reciprocal: b^y */
125118 n = s_pow ((int32_t )radix , (int32_t )s_mp_radix_exponent_y [radix ]);
126119
@@ -130,7 +123,7 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
130123 /* steps = floor(log_2(floor(log_2(a))))*/
131124 ilog2a = mp_count_bits (a ) - 1 ;
132125
133- /* Cutoff at about twice the size of P[0]. Interestingly far below Karatsuba cut-off. */
126+ /* Cutoff at about twice the size of P[0]. */
134127 if (ilog2a < (2 * k * MP_RADIX_BARRETT_START_MULTIPLICATOR )) {
135128 if ((err = s_mp_slower_to_radix (a , sptr , & part_maxlen , & part_written , radix , false)) != MP_OKAY ) goto LTM_ERR ;
136129 /* part_written does not count EOS */
@@ -175,9 +168,6 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
175168 if ((err = mp_div (& R [0 ], & P [0 ], & R [0 ], NULL )) != MP_OKAY ) goto LTM_ERR ;
176169 if ((err = mp_incr (& R [0 ])) != MP_OKAY ) goto LTM_ERR ;
177170
178- #ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
179- if ((err = mp_init_multi (& M2 , & M4 , NULL )) != MP_OKAY ) goto LTM_ERR ;
180- #endif
181171
182172 /* Compute the rest of the reciprocals if as needed */
183173 for (t = 1 ; t < steps ; t ++ ) {
@@ -195,15 +185,29 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
195185 /* TODO: This can only happen near MP_MAX_DIGIT_COUNT and we can use
196186 the reciprocal R[t-1] to do the division but R[t] != R[t-1]^2
197187 so we cannot just divide by R[t-1] twice.
188+
189+ But as it is the root of the tree it is used only once and caching
190+ makes no sense in the first place, we can divide a/P[last] directly
191+
192+ This is always the case for the first division and we can do it
193+ in general to save about half of the cache memory and a bit of
194+ computation time by avoiding the overhead of the Barrett division.
195+
196+ We can set a flag (MP_OVF is an error and it might be frowned upon
197+ using it as a flag) or R[last] to zero (minus one) or just start
198+ with a plain division every time as described above.
199+
200+ Problem with the "always dividing directly" is that it is not known
201+ for sure if P[t-1]^2 > a without actualy computing P[t-1]^2 but it
202+ is a rare event that the heuristic check below fails, so the cost is
203+ not as high as it seems.
198204 */
199- err = MP_OVF ;
200- goto LTM_ERR ;
205+ num_ovf = true;
201206 }
202207
203- /* P[t-1]^2 > a at most likely more than just a bit or too, so check if we
204- can bail out early without actually computing the square. The
205- constant "10" is comprised of unity plus some angst-allowance */
206- if ((2 * mp_count_bits (& P [t - 1 ]) - 10 ) > ilog2a ) {
208+ /* P[t-1]^2 > a is most likely more than just a bit or too, so check if we
209+ can bail out early without actually computing the square. */
210+ if ((2 * mp_count_bits (& P [t - 1 ]) - 4 ) > ilog2a ) {
207211 /* Correct index */
208212 t -- ;
209213 break ;
@@ -221,48 +225,24 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
221225 t -- ;
222226 break ;
223227 }
224- /* Compute numerator */
225- if ((err = mp_init (& R [t ])) != MP_OKAY ) goto LTM_ERR ;
226-
227- #ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
228- if (use_newton_raphson && (radix == 10 )) {
229- /* Use a round of Newton-Raphson to compute the next reciprocal */
230- /* s = 2^t*k */
231- s = MP_COMPUTE_ESS (t );
232- /* M = R[t-1] * 2^g */
233- if ((err = mp_mul_2d (& R [t - 1 ], 3 , & M2 )) != MP_OKAY ) goto LTM_ERR ;
234- if ((err = mp_sub_d (& M2 , (mp_digit )2 , & M2 )) != MP_OKAY ) goto LTM_ERR ;
235- /* Do the M-R round: M = floor( ( 2*M^2 - floor((n^2*M^4) / (2^(2*(s+g)))) ) / 2^(2*g)) + 1 */
236- /* M2 = (M * 2^g)^2*/
237- if ((err = mp_sqr (& M2 , & M2 )) != MP_OKAY ) goto LTM_ERR ;
238- /* M4 = M2^2 */
239- if ((err = mp_sqr (& M2 , & M4 )) != MP_OKAY ) goto LTM_ERR ;
240- /* Compute numerator: n^2*M^4 = (P[t]) * M4*/
241- if ((err = mp_mul (& P [t ], & M4 , & M4 )) != MP_OKAY ) goto LTM_ERR ;
242- /* Compute fraction by shifting right: M4>>2*(s+g) where 2*(s+g) < MAX_INT */
243- if ((err = mp_div_2d (& M4 , 2 * (s + g ), & M4 , NULL )) != MP_OKAY ) goto LTM_ERR ;
244- if ((err = mp_mul_2 (& M2 ,& M2 )) != MP_OKAY ) goto LTM_ERR ;
245- if ((err = mp_sub (& M2 ,& M4 ,& M4 )) != MP_OKAY ) goto LTM_ERR ;
246- /* R[t] = M / 2^(2*g) remove extra bits before storage */
247- if ((err = mp_div_2d (& M4 , 2 * g , & (R [t ]), & M4 )) != MP_OKAY ) goto LTM_ERR ;
248- if ((err = mp_incr (& R [t ])) != MP_OKAY ) goto LTM_ERR ;
249- } else {
250- #endif
228+
229+ /* We cannot evaluate the numerator if the computation would overflow */
230+ if (!num_ovf ) {
231+ /* Compute numerator */
232+ if ((err = mp_init (& R [t ])) != MP_OKAY ) goto LTM_ERR ;
251233 /* R[t] = R[t] << (2^t * k) The factor cannot overflow, we checked that above */
252- if ((err = mp_2expt (& (R [t ]), MP_COMPUTE_ESS (t + 1 ) )) != MP_OKAY ) goto LTM_ERR ;
234+ if ((err = mp_2expt (& (R [t ]), MP_COMPUTE_ESS (t + 1 ))) != MP_OKAY ) goto LTM_ERR ;
253235 /* Compute reciprocal */
254236 /* R[t] = floor(2^(2^t * k) / P[t] */
255237 if ((err = mp_div (& R [t ], & P [t ], & R [t ], NULL )) != MP_OKAY ) goto LTM_ERR ;
256238 /* Ceiling if P[t] is not a power of two but it is not a problem if P[t] is a power of two. */
257239 if ((err = mp_incr (& R [t ])) != MP_OKAY ) goto LTM_ERR ;
258- #ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
259240 }
260- #endif
261241 }
262242
263243 /* And finally: start the recursion. */
264244 if ((err = s_mp_to_radix_recursive (a , sptr , & part_maxlen , & part_written , radix ,
265- k , t , false, P , R )) != MP_OKAY ) goto LTM_ERR ;
245+ k , t , false, num_ovf , P , R )) != MP_OKAY ) goto LTM_ERR ;
266246 /* part_written does not account for EOS */
267247 * written = part_written + 1 ;
268248
@@ -274,9 +254,6 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
274254 } while (t -- );
275255 MP_FREE_BUF (P , (size_t ) steps * sizeof (mp_int ));
276256 MP_FREE_BUF (R , (size_t ) steps * sizeof (mp_int ));
277- #ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
278- mp_clear_multi (& M2 , & M4 , NULL );
279- #endif
280257 return err ;
281258}
282259
0 commit comments