Skip to content

Commit f917ce7

Browse files
committed
Removed N-R method; refined tests
1 parent fdf85b1 commit f917ce7

File tree

4 files changed

+128
-109
lines changed

4 files changed

+128
-109
lines changed

demo/test.c

Lines changed: 68 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,23 +1073,17 @@ static int test_mp_montgomery_reduce(void)
10731073

10741074
}
10751075

1076-
const uint8_t test_s_mp_radix_exponent_y[] = { 0, 0, /* 0 .. 1*/
1077-
20, 12, 10, 8, 7, 7, 6, 6, /* 2 .. 9 */
1078-
6, 5, 5, 5, 5, 5, 5, 4, /* 10 .. 17 */
1079-
4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 25 */
1080-
4, 4, 4, 4, 4, 4, 4, 3, /* 26 .. 33 */
1081-
3, 3, 3, 3, 3, 3, 3, 3, /* 34 .. 41 */
1082-
3, 3, 3, 3, 3, 3, 3, 3, /* 42 .. 49 */
1083-
3, 3, 3, 3, 3, 3, 3, 3, /* 51 .. 57 */
1084-
3, 3, 3, 3, 3, 3, 3 /* 58 .. 64 */
1085-
};
1086-
1076+
#include <time.h>
10871077
static int test_mp_read_radix(void)
10881078
{
10891079
char buf[4096];
1090-
size_t written;
1080+
size_t written, maxlen;
10911081
int bignum, i;
10921082

1083+
char *buffer, *bcpy;
1084+
1085+
clock_t start, stop, t_slow, t_fast;
1086+
10931087
mp_int a, b;
10941088
DOR(mp_init_multi(&a, &b, NULL));
10951089

@@ -1120,25 +1114,73 @@ static int test_mp_read_radix(void)
11201114
/* Test the fast method with a slightly larger number */
11211115

11221116
/* Must be bigger than the cut-off value, of course */
1123-
bignum = 2* (2 * test_s_mp_radix_exponent_y[2] * MP_RADIX_BARRETT_START_MULTIPLICATOR);
1124-
printf("Size of bignum_size = %d\n", bignum);
1125-
/* Check if "bignum" is small enough for the result to fit into "buf"
1126-
otherwise lead tester to this function */
1127-
if (bignum >= 4096) {
1128-
fprintf(stderr, "Buffer too small, please check function \"test_mp_read_radix\" in \"test.c\"");
1117+
bignum = (2 * 20 * MP_RADIX_BARRETT_START_MULTIPLICATOR) * 10;
1118+
buffer = (char *)malloc(bignum + 2);
1119+
if (buffer == NULL) {
11291120
goto LBL_ERR;
11301121
}
1131-
/* Produce a random number */
1132-
bignum /= MP_DIGIT_BIT;
1133-
DO(mp_rand(&b, bignum));
1134-
/* Check if it makes the round */
1135-
printf("Number of limbs in &b = %d, bit_count of &b = %d\n", bignum, mp_count_bits(&b));
1122+
DO(mp_rand(&a, bignum / MP_DIGIT_BIT));
1123+
printf("\nNumber of limbs in &b = %d, bit_count of &b = %d\n", bignum / MP_DIGIT_BIT, mp_count_bits(&a));
1124+
start = clock();
11361125
for (i = 2; i < 65; i++) {
1137-
DO(mp_to_radix(&b, buf, sizeof(buf), &written, i));
1138-
DO(mp_read_radix(&a, buf, i));
1126+
/* printf("FAST radix = %d\n",i); */
1127+
DO(mp_to_radix(&a, buffer, bignum + 1, &written, i));
1128+
DO(mp_read_radix(&b, buffer, i));
11391129
EXPECT(mp_cmp(&a, &b) == MP_EQ);
1140-
/* fprintf(stderr,"radix = %d\n",i); */
11411130
}
1131+
stop = clock();
1132+
t_fast = stop - start;
1133+
1134+
printf("Same number, slow radix conversions\n");
1135+
start = clock();
1136+
for (i = 2; i < 65; i++) {
1137+
/* printf("SLOW radix = %d\n",i); */
1138+
maxlen = bignum + 1;
1139+
bcpy = buffer;
1140+
DO(s_mp_slower_to_radix(&a, &bcpy, &maxlen, &written, i, false));
1141+
DO(s_mp_slower_read_radix(&b, bcpy, 0, strlen(bcpy), i));
1142+
EXPECT(mp_cmp(&a, &b) == MP_EQ);
1143+
}
1144+
stop = clock();
1145+
t_slow = stop - start;
1146+
1147+
/* It is "long int" in GLibC but can be bigger and/or even a floating point elsewhere */
1148+
printf("SLOW: %.10f, FAST: %.10f\n", (double)t_slow/(double)CLOCKS_PER_SEC, (double)t_fast/(double)CLOCKS_PER_SEC);
1149+
1150+
/* Check if the branching works. */
1151+
if (MP_HAS(S_MP_FASTER_READ_RADIX) && MP_HAS(S_MP_FASTER_TO_RADIX)) {
1152+
if (t_fast > t_slow) {
1153+
fprintf(stderr, "Timing suspicious in test_mp_read_radix. No fast multiplication? Cut-off too low?\n");
1154+
goto LBL_ERR;
1155+
}
1156+
}
1157+
1158+
1159+
free(buffer);
1160+
1161+
#if ((MP_DIGIT_BIT <= 16) && (defined MP_CHECK_RADIX_OVF))
1162+
/* Check a number of size (MP_MAX_DIGIT_COUNT * MP_DIGIT_BIT - 1) at fixed radix "10". */
1163+
/* Will not work if test is run on platforms with larger int's because
1164+
#define MP_MAX_DIGIT_COUNT ((INT_MAX - 2) / MP_DIGIT_BIT)
1165+
So we have to replace the value for INT_MAX with 2^15 - 1 = 32767 to test 16-bit int's. Not
1166+
very elegant but it works.
1167+
*/
1168+
bignum = ((32767 - 2) / MP_DIGIT_BIT);
1169+
bignum = ((bignum - 1) * MP_DIGIT_BIT) + (MP_DIGIT_BIT - 1);
1170+
/* Manual computation because the automatic methods might not have been included in the build */
1171+
buffer = (char *)malloc(((bignum + 2)/1000) * 333);
1172+
if (buffer == NULL) {
1173+
goto LBL_ERR;
1174+
}
1175+
DO(mp_2expt(&a, bignum));
1176+
DO(mp_decr(&a));
1177+
printf("Number of limbs in &b = %d, bit_count of &b = %d\n", bignum / MP_DIGIT_BIT, mp_count_bits(&a));
1178+
DO(mp_to_radix(&a, buffer, ((bignum + 2)/1000) * 333, &written, 10));
1179+
DO(mp_read_radix(&b, buffer, 10));
1180+
EXPECT(mp_cmp(&a, &b) == MP_EQ);
1181+
free(buffer);
1182+
#endif
1183+
11421184

11431185

11441186
while (0) {

s_mp_faster_to_radix.c

Lines changed: 55 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ static int32_t s_pow(int32_t base, int32_t exponent)
3434
#define MP_COMPUTE_ESS(T) ((int)((int32_t)((uint32_t)1 << (T)) * k))
3535

3636
static mp_err s_mp_to_radix_recursive(const mp_int *a, char **str, size_t *part_maxlen, size_t *part_written,
37-
int radix, int32_t k, int32_t t, bool pad, mp_int *P, mp_int *R)
37+
int radix, int32_t k, int32_t t, bool pad, bool first, mp_int *P, mp_int *R)
3838
{
3939

4040
mp_int r, q, a1;
@@ -47,42 +47,45 @@ static mp_err s_mp_to_radix_recursive(const mp_int *a, char **str, size_t *part_
4747

4848
} else {
4949
if ((err = mp_init_multi(&q, &r, &a1, NULL)) != MP_OKAY) goto LTM_ERR;
50-
/*
51-
Barrett reduction. A step by step proof can be found at
52-
https://www.nayuki.io/page/barrett-reduction-algorithm
50+
if (first) {
51+
if ((err = mp_div(a, &P[t], &q, &r)) != MP_OKAY) goto LTM_ERR;
52+
} else {
53+
/*
54+
Barrett reduction. A step by step proof can be found at
55+
https://www.nayuki.io/page/barrett-reduction-algorithm
5356
54-
See also: Modern Computer Arithmetic, version 0.5.9, page 59
55-
*/
57+
See also: Modern Computer Arithmetic, version 0.5.9, page 59
58+
*/
5659

57-
Beta = MP_COMPUTE_ESS(t+1);
60+
Beta = MP_COMPUTE_ESS(t+1);
5861

59-
/* Q = floor(A1 * I / 2^Beta) */
60-
/* I = floor( (2^(2*Beta)) / B) Here we have R[t] = I, P[t] = B */
61-
if ((err = mp_mul(a, &R[t], &q)) != MP_OKAY) goto LTM_ERR;
62-
if ((err = mp_div_2d(&q, Beta, &q, NULL)) != MP_OKAY) goto LTM_ERR;
62+
/* Q = floor(A1 * I / 2^Beta) */
63+
/* I = floor( (2^(2*Beta)) / B) Here we have R[t] = I, P[t] = B */
64+
if ((err = mp_mul(a, &R[t], &q)) != MP_OKAY) goto LTM_ERR;
65+
if ((err = mp_div_2d(&q, Beta, &q, NULL)) != MP_OKAY) goto LTM_ERR;
6366

64-
/* R = A - Q*B */
65-
if ((err = mp_mul(&q, &P[t], &r)) != MP_OKAY) goto LTM_ERR;
66-
if ((err = mp_sub(a, &r, &r)) != MP_OKAY) goto LTM_ERR;
67+
/* R = A - Q*B */
68+
if ((err = mp_mul(&q, &P[t], &r)) != MP_OKAY) goto LTM_ERR;
69+
if ((err = mp_sub(a, &r, &r)) != MP_OKAY) goto LTM_ERR;
6770

68-
/* We can use this simple correction because of the way we computed the reciprocal */
69-
if (r.sign == MP_NEG) {
70-
if ((err = mp_decr(&q)) != MP_OKAY) goto LTM_ERR;
71-
if ((err = mp_add(&r, &P[t], &r)) != MP_OKAY) goto LTM_ERR;
71+
/* We can use this simple correction because of the way we computed the reciprocal */
72+
if (r.sign == MP_NEG) {
73+
if ((err = mp_decr(&q)) != MP_OKAY) goto LTM_ERR;
74+
if ((err = mp_add(&r, &P[t], &r)) != MP_OKAY) goto LTM_ERR;
75+
}
7276
}
73-
7477
/* Go down the lists while climbing up the tree. */
7578
t--;
7679

7780
/* Follow branches */
7881
if (mp_iszero(&q) && (!pad)) {
7982
if ((err = s_mp_to_radix_recursive(&r, str, part_maxlen, part_written, radix,
80-
k, t, false, P, R)) != MP_OKAY) goto LTM_ERR;
83+
k, t, false, false, P, R)) != MP_OKAY) goto LTM_ERR;
8184
} else {
8285
if ((err = s_mp_to_radix_recursive(&q, str, part_maxlen, part_written, radix,
83-
k, t, pad, P, R)) != MP_OKAY) goto LTM_ERR;
86+
k, t, pad, false, P, R)) != MP_OKAY) goto LTM_ERR;
8487
if ((err = s_mp_to_radix_recursive(&r, str, part_maxlen, part_written, radix,
85-
k, t, true, P, R)) != MP_OKAY) goto LTM_ERR;
88+
k, t, true, false, P, R)) != MP_OKAY) goto LTM_ERR;
8689
}
8790
mp_clear_multi(&q, &r, &a1, NULL);
8891
}
@@ -97,30 +100,20 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
97100
mp_err err;
98101
int32_t n = 0, k, t = 0, steps = 0;
99102
int ilog2a;
100-
#ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
101-
int s, g = 3;
102-
mp_int M2, M4;
103-
bool use_newton_raphson = true;
104-
#endif
105103

106104
/* Use given buffer directly, no temporary buffers for the individual chunks */
107105
char **sptr = &str;
108106
/* Size of the chunk */
109107
size_t part_written = 0;
110108
size_t part_maxlen = maxlen;
111109

110+
bool num_ovf = false;
111+
112112
/* List of reciprocals */
113113
mp_int *R = NULL;
114114
/* List of moduli */
115115
mp_int *P = NULL;
116116

117-
#ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
118-
/* Be nice and utter a warning. For now. */
119-
if (radix != 10) {
120-
fprintf(stderr,"The Newton-Raphson method is for base 10 only!\n");
121-
}
122-
#endif
123-
124117
/* Denominator for the reciprocal: b^y */
125118
n = s_pow((int32_t)radix, (int32_t)s_mp_radix_exponent_y[radix]);
126119

@@ -130,7 +123,7 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
130123
/* steps = floor(log_2(floor(log_2(a))))*/
131124
ilog2a = mp_count_bits(a) - 1;
132125

133-
/* Cutoff at about twice the size of P[0]. Interestingly far below Karatsuba cut-off. */
126+
/* Cutoff at about twice the size of P[0]. */
134127
if (ilog2a < (2 * k * MP_RADIX_BARRETT_START_MULTIPLICATOR)) {
135128
if ((err = s_mp_slower_to_radix(a, sptr, &part_maxlen, &part_written, radix, false)) != MP_OKAY) goto LTM_ERR;
136129
/* part_written does not count EOS */
@@ -175,9 +168,6 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
175168
if ((err = mp_div(&R[0], &P[0], &R[0], NULL)) != MP_OKAY) goto LTM_ERR;
176169
if ((err = mp_incr(&R[0])) != MP_OKAY) goto LTM_ERR;
177170

178-
#ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
179-
if ((err = mp_init_multi(&M2, &M4, NULL)) != MP_OKAY) goto LTM_ERR;
180-
#endif
181171

182172
/* Compute the rest of the reciprocals if as needed */
183173
for (t = 1; t < steps; t++) {
@@ -195,15 +185,29 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
195185
/* TODO: This can only happen near MP_MAX_DIGIT_COUNT and we can use
196186
the reciprocal R[t-1] to do the division but R[t] != R[t-1]^2
197187
so we cannot just divide by R[t-1] twice.
188+
189+
But as it is the root of the tree it is used only once and caching
190+
makes no sense in the first place, we can divide a/P[last] directly
191+
192+
This is always the case for the first division and we can do it
193+
in general to save about half of the cache memory and a bit of
194+
computation time by avoiding the overhead of the Barrett division.
195+
196+
We can set a flag (MP_OVF is an error and it might be frowned upon
197+
using it as a flag) or R[last] to zero (minus one) or just start
198+
with a plain division every time as described above.
199+
200+
Problem with the "always dividing directly" is that it is not known
201+
for sure if P[t-1]^2 > a without actualy computing P[t-1]^2 but it
202+
is a rare event that the heuristic check below fails, so the cost is
203+
not as high as it seems.
198204
*/
199-
err = MP_OVF;
200-
goto LTM_ERR;
205+
num_ovf = true;
201206
}
202207

203-
/* P[t-1]^2 > a at most likely more than just a bit or too, so check if we
204-
can bail out early without actually computing the square. The
205-
constant "10" is comprised of unity plus some angst-allowance */
206-
if ((2 * mp_count_bits(&P[t-1]) - 10) > ilog2a) {
208+
/* P[t-1]^2 > a is most likely more than just a bit or too, so check if we
209+
can bail out early without actually computing the square. */
210+
if ((2 * mp_count_bits(&P[t-1]) - 4) > ilog2a) {
207211
/* Correct index */
208212
t--;
209213
break;
@@ -221,48 +225,24 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
221225
t--;
222226
break;
223227
}
224-
/* Compute numerator */
225-
if ((err = mp_init(&R[t])) != MP_OKAY) goto LTM_ERR;
226-
227-
#ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
228-
if (use_newton_raphson && (radix == 10)) {
229-
/* Use a round of Newton-Raphson to compute the next reciprocal */
230-
/* s = 2^t*k */
231-
s = MP_COMPUTE_ESS(t);
232-
/* M = R[t-1] * 2^g */
233-
if ((err = mp_mul_2d(&R[t-1], 3, &M2)) != MP_OKAY) goto LTM_ERR;
234-
if ((err = mp_sub_d(&M2, (mp_digit)2, &M2)) != MP_OKAY) goto LTM_ERR;
235-
/* Do the M-R round: M = floor( ( 2*M^2 - floor((n^2*M^4) / (2^(2*(s+g)))) ) / 2^(2*g)) + 1 */
236-
/* M2 = (M * 2^g)^2*/
237-
if ((err = mp_sqr(&M2, &M2)) != MP_OKAY) goto LTM_ERR;
238-
/* M4 = M2^2 */
239-
if ((err = mp_sqr(&M2, &M4)) != MP_OKAY) goto LTM_ERR;
240-
/* Compute numerator: n^2*M^4 = (P[t]) * M4*/
241-
if ((err = mp_mul(&P[t], &M4, &M4)) != MP_OKAY) goto LTM_ERR;
242-
/* Compute fraction by shifting right: M4>>2*(s+g) where 2*(s+g) < MAX_INT */
243-
if ((err = mp_div_2d(&M4, 2*(s+g), &M4, NULL)) != MP_OKAY) goto LTM_ERR;
244-
if ((err = mp_mul_2(&M2,&M2)) != MP_OKAY) goto LTM_ERR;
245-
if ((err = mp_sub(&M2,&M4,&M4)) != MP_OKAY) goto LTM_ERR;
246-
/* R[t] = M / 2^(2*g) remove extra bits before storage */
247-
if ((err = mp_div_2d(&M4, 2*g, &(R[t]), &M4)) != MP_OKAY) goto LTM_ERR;
248-
if ((err = mp_incr(&R[t])) != MP_OKAY) goto LTM_ERR;
249-
} else {
250-
#endif
228+
229+
/* We cannot evaluate the numerator if the computation would overflow */
230+
if (!num_ovf) {
231+
/* Compute numerator */
232+
if ((err = mp_init(&R[t])) != MP_OKAY) goto LTM_ERR;
251233
/* R[t] = R[t] << (2^t * k) The factor cannot overflow, we checked that above */
252-
if ((err = mp_2expt(&(R[t]), MP_COMPUTE_ESS(t + 1) )) != MP_OKAY) goto LTM_ERR;
234+
if ((err = mp_2expt(&(R[t]), MP_COMPUTE_ESS(t + 1))) != MP_OKAY) goto LTM_ERR;
253235
/* Compute reciprocal */
254236
/* R[t] = floor(2^(2^t * k) / P[t] */
255237
if ((err = mp_div(&R[t], &P[t], &R[t], NULL)) != MP_OKAY) goto LTM_ERR;
256238
/* Ceiling if P[t] is not a power of two but it is not a problem if P[t] is a power of two. */
257239
if ((err = mp_incr(&R[t])) != MP_OKAY) goto LTM_ERR;
258-
#ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
259240
}
260-
#endif
261241
}
262242

263243
/* And finally: start the recursion. */
264244
if ((err = s_mp_to_radix_recursive(a, sptr, &part_maxlen, &part_written, radix,
265-
k, t, false, P, R)) != MP_OKAY) goto LTM_ERR;
245+
k, t, false, num_ovf, P, R)) != MP_OKAY) goto LTM_ERR;
266246
/* part_written does not account for EOS */
267247
*written = part_written + 1;
268248

@@ -274,9 +254,6 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
274254
} while (t--);
275255
MP_FREE_BUF(P, (size_t) steps * sizeof(mp_int));
276256
MP_FREE_BUF(R, (size_t) steps * sizeof(mp_int));
277-
#ifdef MP_TO_RADIX_USE_NEWTON_RAPHSON
278-
mp_clear_multi(&M2, &M4, NULL);
279-
#endif
280257
return err;
281258
}
282259

s_mp_slower_to_radix.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ mp_err s_mp_slower_to_radix(const mp_int *a, char **str,
4545
ybar--;
4646
}
4747

48-
/* Fill in leading zeros if this chunk contains the most significant digits. */
48+
/* Fill in leading zeros if this chunk does not contain the most significant digits. */
4949
if (pad) {
5050
while ((ybar-- > 0) && (((*part_maxlen)--) > 0)) {
5151
*s++ = '0';
@@ -60,7 +60,7 @@ mp_err s_mp_slower_to_radix(const mp_int *a, char **str,
6060
s_reverse(s, digs);
6161
/* step forward */
6262
*str += digs;
63-
/* Add EOS at teh end of every chunk to allow this function to be used stand-alone */
63+
/* Add EOS at the end of every chunk to allow this function to be used stand-alone */
6464
**str = '\0';
6565

6666
/* TODO: this method to increase "written" is not threadsafe! */

tommath_private.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,11 +226,11 @@ extern MP_PRIVATE const mp_digit s_mp_prime_tab[];
226226
extern MP_PRIVATE const uint8_t s_mp_radix_exponent_y[];
227227

228228
/*
229-
There is not much to tune here, the steps are of the form 2^k and too large
230-
for tuning to make a alot of sense.
229+
This is the value without the Newton-Raphson optimization.
230+
Tuneable?
231231
*/
232232
#ifndef MP_RADIX_BARRETT_START_MULTIPLICATOR
233-
# define MP_RADIX_BARRETT_START_MULTIPLICATOR 8
233+
# define MP_RADIX_BARRETT_START_MULTIPLICATOR 50
234234
#endif
235235

236236

0 commit comments

Comments
 (0)