From 9fbe4854fe9b937f151fc1dac656a54ba8a64222 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 15 Jul 2020 14:34:11 +0700 Subject: [PATCH 01/34] "safegcd" field and scalar inversion - see "Fast constant-time gcd computation and modular inversion" by Daniel J. Bernstein and Bo-Yin Yang https://gcd.cr.yp.to() --- src/field_10x26.h | 2 + src/field_5x52_impl.h | 363 ++++++++++++++++++++++++++++++++ src/field_impl.h | 2 + src/scalar_4x64_impl.h | 457 +++++++++++++++++++++++++++++++++++++++++ src/scalar_8x32.h | 2 + src/scalar_impl.h | 4 +- src/util.h | 1 + 7 files changed, 830 insertions(+), 1 deletion(-) diff --git a/src/field_10x26.h b/src/field_10x26.h index 5ff03c8abc..312a94c3ae 100644 --- a/src/field_10x26.h +++ b/src/field_10x26.h @@ -47,4 +47,6 @@ typedef struct { #define SECP256K1_FE_STORAGE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }} #define SECP256K1_FE_STORAGE_CONST_GET(d) d.n[7], d.n[6], d.n[5], d.n[4],d.n[3], d.n[2], d.n[1], d.n[0] +#define SECP256K1_FE_INV_DEFAULT + #endif /* SECP256K1_FIELD_REPR_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 71a38f915b..a08032afb7 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -498,4 +498,367 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( + 0x0E90A100UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, + 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL +); + +static void secp256k1_fe_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { + + int64_t z0, z1, z2, z3; + int128_t tt; + + tt = (int128_t)a0 * b0 + + (int128_t)c0 * d0; + z0 = (int64_t)tt; tt -= z0; tt >>= 64; + + tt += (int128_t)a0 * b1 + + (int128_t)a1 * b0 + + (int128_t)c0 * d1 + + (int128_t)c1 * d0; + z1 = (int64_t)tt; tt -= z1; tt >>= 64; + + tt += (int128_t)a1 * b1 + + (int128_t)c1 * d1; + z2 = (int64_t)tt; tt -= z2; tt >>= 64; + + z3 = (int64_t)tt; + + t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; +} + +static void secp256k1_fe_combine_1s(int64_t *t) { + + int64_t a = t[0], b = t[1], c = t[2], d = t[3], + e = t[4], f = t[5], g = t[6], h = t[7]; + int128_t I, J, K, L; + + I = (int128_t)e * a + (int128_t)f * c; + J = (int128_t)e * b + (int128_t)f * d; + K = (int128_t)g * a + (int128_t)h * c; + L = (int128_t)g * b + (int128_t)h * d; + + a = (int64_t)I; I -= a; I >>= 64; b = (int64_t)I; + c = (int64_t)J; J -= c; J >>= 64; d = (int64_t)J; + e = (int64_t)K; K -= e; K >>= 64; f = (int64_t)K; + g = (int64_t)L; L -= g; L >>= 64; h = (int64_t)L; + + t[0] = a; t[1] = b; t[2] = c; t[3] = d; + t[4] = e; t[5] = f; t[6] = g; t[7] = h; +} + +static void secp256k1_fe_combine_2s(int64_t *t) { + + int64_t a0 = t[ 0], a1 = t[ 1]; + int64_t b0 = t[ 2], b1 = t[ 3]; + int64_t c0 = t[ 4], c1 = t[ 5]; + int64_t d0 = t[ 6], d1 = t[ 7]; + int64_t e0 = t[ 8], e1 = t[ 9]; + int64_t f0 = t[10], f1 = t[11]; + int64_t g0 = t[12], g1 = t[13]; + int64_t h0 = t[14], h1 = t[15]; + + secp256k1_fe_mul_add(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_fe_mul_add(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_fe_mul_add(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_fe_mul_add(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); +} + +static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { + + uint64_t u0, u1, u2, u3, u4; + uint64_t r0, r1, r2, r3, r4; + + /* TODO Need proper carry chain */ + + u0 = (uint64_t)t[0]; + u1 = 
(uint64_t)t[1] - (u0 >> 63); + u2 = (uint64_t)t[2] - (u1 >> 63); + u3 = (uint64_t)t[3] - (u2 >> 63); + u4 = - (u3 >> 63); + + r0 = 0xFFFFEFFFFFC2FULL * 2; + r1 = 0xFFFFFFFFFFFFFULL * 2; + r2 = 0xFFFFFFFFFFFFFULL * 2; + r3 = 0xFFFFFFFFFFFFFULL * 2; + r4 = 0x0FFFFFFFFFFFFULL * 2; + + r0 += u0 & 0xFFFFFFFFFFFFFULL; + r1 += u0 >> 52 | ((u1 << 12) & 0xFFFFFFFFFFFFFULL); + r2 += u1 >> 40 | ((u2 << 24) & 0xFFFFFFFFFFFFFULL); + r3 += u2 >> 28 | ((u3 << 36) & 0xFFFFFFFFFFFFFULL); + r4 += u3 >> 16 | (u4 << 48); + + r->n[0] = r0; + r->n[1] = r1; + r->n[2] = r2; + r->n[3] = r3; + r->n[4] = r4; + +#ifdef VERIFY + /* TODO Probably 2 is enough? */ + r->magnitude = 3; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { + + const uint64_t M62 = UINT64_MAX >> 2; + const uint64_t *n = &a->n[0]; + uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4]; + +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif + + r[0] = (a0 | a1 << 52) & M62; + r[1] = (a1 >> 10 | a2 << 42) & M62; + r[2] = (a2 >> 20 | a3 << 32) & M62; + r[3] = (a3 >> 30 | a4 << 22) & M62; + r[4] = a4 >> 40; +} + +static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f, uint64_t g, int64_t *t) { + + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t c1, c2, x, y, z; + int i; + + for (i = 0; i < 62; ++i) { + + c1 = -(g & (eta >> 15)); + + x = (f ^ g) & c1; + f ^= x; g ^= x; g ^= c1; g -= c1; + + y = (u ^ q) & c1; + u ^= y; q ^= y; q ^= c1; q -= c1; + + z = (v ^ r) & c1; + v ^= z; r ^= z; r ^= c1; r -= c1; + + eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + + c2 = -(g & 1); + + g += (f & c2); g >>= 1; + q += (u & c2); u <<= 1; + r += (v & c2); v <<= 1; + } + + t[0] = (int64_t)u; + t[1] = (int64_t)v; + t[2] = (int64_t)q; + t[3] = (int64_t)r; + + return eta; +} + +static void secp256k1_fe_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) { + + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + int128_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + VERIFY_CHECK(((int64_t)cf & M62) == 0); + VERIFY_CHECK(((int64_t)cg & M62) == 0); + + cf >>= 62; + cg >>= 62; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + f[i - 1] = (int64_t)cf & M62; cf >>= 62; + g[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + + f[i - 1] = (int64_t)cf; + g[i - 1] = (int64_t)cg; +} + +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { + +#if 1 + + /* TODO Check for a == 0? */ + + int64_t t[12 * 4]; + int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + int64_t g[5]; + secp256k1_fe b0, d0, a1, b1, c1, d1; + int i, len, sign; + int16_t eta; + + /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input + * by 2^768, and then the output by 2^24. */ + /* Instead of dividing the output by 2^744, we scale the input. */ + secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + secp256k1_fe_normalize(&b0); + secp256k1_fe_encode_62(&g[0], &b0); + + eta = -1; + + for (i = 0; i < 12; ++i) { + eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], &t[i * 4]); + len = i <= 6 ? 5 : i >= 10 ? 
1 : 11 - i; + secp256k1_fe_update_fg(len, f, g, &t[i * 4]); + } + + /* At this point, f must equal +/- 1 (the GCD). */ + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 16; + secp256k1_fe_combine_1s(&t[tOff + 0]); + secp256k1_fe_combine_1s(&t[tOff + 8]); + secp256k1_fe_combine_2s(&t[tOff + 0]); + } + + /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ + secp256k1_fe_decode_matrix(&b0, &t[4]); + /* secp256k1_fe_decode_matrix(&c0, &t[8]); */ + secp256k1_fe_decode_matrix(&d0, &t[12]); + + secp256k1_fe_decode_matrix(&a1, &t[16]); + secp256k1_fe_decode_matrix(&b1, &t[20]); + secp256k1_fe_decode_matrix(&c1, &t[24]); + secp256k1_fe_decode_matrix(&d1, &t[28]); + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + secp256k1_fe_mul(&c1, &c1, &b0); + secp256k1_fe_mul(&d1, &d1, &d0); + + b0 = a1; secp256k1_fe_add(&b0, &b1); + d0 = c1; secp256k1_fe_add(&d0, &d1); + + secp256k1_fe_decode_matrix(&a1, &t[32]); + secp256k1_fe_decode_matrix(&b1, &t[36]); + /* secp256k1_fe_decode_matrix(&c1, &t[40]); */ + /* secp256k1_fe_decode_matrix(&d1, &t[44]); */ + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + /* secp256k1_fe_mul(&c1, &c1, &b0); */ + /* secp256k1_fe_mul(&d1, &d1, &d0); */ + + b0 = a1; secp256k1_fe_add(&b0, &b1); + /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + + secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_cmov(&b0, &b1, sign); + secp256k1_fe_normalize_weak(&b0); + + *r = b0; + +#else + + secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; + int j; + + /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in + * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: + * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + */ + + secp256k1_fe_sqr(&x2, a); + secp256k1_fe_mul(&x2, &x2, a); + + secp256k1_fe_sqr(&x3, &x2); + secp256k1_fe_mul(&x3, &x3, a); + + x6 = x3; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x6, &x6); + } + secp256k1_fe_mul(&x6, &x6, &x3); + + x9 = x6; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x9, &x9); + } + secp256k1_fe_mul(&x9, &x9, &x3); + + x11 = x9; + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&x11, &x11); + } + secp256k1_fe_mul(&x11, &x11, &x2); + + x22 = x11; + for (j=0; j<11; j++) { + secp256k1_fe_sqr(&x22, &x22); + } + secp256k1_fe_mul(&x22, &x22, &x11); + + x44 = x22; + for (j=0; j<22; j++) { + secp256k1_fe_sqr(&x44, &x44); + } + secp256k1_fe_mul(&x44, &x44, &x22); + + x88 = x44; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x88, &x88); + } + secp256k1_fe_mul(&x88, &x88, &x44); + + x176 = x88; + for (j=0; j<88; j++) { + secp256k1_fe_sqr(&x176, &x176); + } + secp256k1_fe_mul(&x176, &x176, &x88); + + x220 = x176; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x220, &x220); + } + secp256k1_fe_mul(&x220, &x220, &x44); + + x223 = x220; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x223, &x223); + } + secp256k1_fe_mul(&x223, &x223, &x3); + + /* The final result is then assembled using a sliding window over the blocks. 
*/ + + t1 = x223; + for (j=0; j<23; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x22); + for (j=0; j<5; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, a); + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x2); + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(r, a, &t1); +#endif +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_impl.h b/src/field_impl.h index 485921a60e..c2b1cd2df2 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -136,6 +136,7 @@ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a) { return secp256k1_fe_equal(&t1, a); } +#if defined(SECP256K1_FE_INV_DEFAULT) static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; int j; @@ -225,6 +226,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { } secp256k1_fe_mul(r, a, &t1); } +#endif static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #if defined(USE_FIELD_INV_BUILTIN) diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8f539c4bc6..1fcf1ba37c 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -957,4 +957,461 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1); } +static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALAR_CONST( + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFDUL, + 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL +); + +static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CONST( + 0x4E165355UL, 0x5D800C18UL, 0xEF116DB1UL, 0xB31347F1UL, + 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL +); + +static void secp256k1_scalar_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { + + int64_t z0, z1, z2, z3; + int128_t tt; + + tt = (int128_t)a0 * b0 + + (int128_t)c0 * d0; + z0 = (int64_t)tt; tt -= z0; tt >>= 64; + + tt += (int128_t)a0 * b1 + + (int128_t)a1 * b0 + + (int128_t)c0 * d1 + + (int128_t)c1 * d0; + z1 = (int64_t)tt; tt -= z1; tt >>= 64; + + tt += (int128_t)a1 * b1 + + (int128_t)c1 * d1; + z2 = (int64_t)tt; tt -= z2; tt >>= 64; + + z3 = (int64_t)tt; + + t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; +} + +static void secp256k1_scalar_combine_1s(int64_t *t) { + + int64_t a = t[0], b = t[1], c = t[2], d = t[3], + e = t[4], f = t[5], g = t[6], h = t[7]; + int128_t I, J, K, L; + + I = (int128_t)e * a + (int128_t)f * c; + J = (int128_t)e * b + (int128_t)f * d; + K = (int128_t)g * a + (int128_t)h * c; + L = (int128_t)g * b + (int128_t)h * d; + + a = (int64_t)I; I -= a; I >>= 64; b = (int64_t)I; + c = (int64_t)J; J -= c; J >>= 64; d = (int64_t)J; + e = (int64_t)K; K -= e; K >>= 64; f = (int64_t)K; + g = (int64_t)L; L -= g; L >>= 64; h = (int64_t)L; + + t[0] = a; t[1] = b; t[2] = c; t[3] = d; + t[4] = e; t[5] = f; t[6] = g; t[7] = h; +} + +static void secp256k1_scalar_combine_2s(int64_t *t) { + + int64_t a0 = t[ 0], a1 = t[ 1]; + int64_t b0 = t[ 2], b1 = t[ 3]; + int64_t c0 = t[ 4], c1 = t[ 5]; + int64_t d0 = t[ 6], d1 = t[ 7]; + int64_t e0 = t[ 8], e1 = t[ 9]; + int64_t f0 = t[10], f1 = t[11]; + int64_t g0 = t[12], g1 = t[13]; + int64_t h0 = t[14], h1 = t[15]; + + secp256k1_scalar_mul_add(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_scalar_mul_add(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + 
secp256k1_scalar_mul_add(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_scalar_mul_add(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); +} + +static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { + +#if 1 + + uint64_t r0, r1, r2, r3; + int flag; + secp256k1_scalar u; + + /* TODO Need proper carry chain */ + r0 = (uint64_t)t[0]; + r1 = (uint64_t)t[1] - (r0 >> 63); + r2 = (uint64_t)t[2] - (r1 >> 63); + r3 = (uint64_t)t[3] - (r2 >> 63); + + flag = (int)(r3 >> 63); + + r->d[0] = r0; + r->d[1] = r1; + r->d[2] = r2; + r->d[3] = r3; + + secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); + secp256k1_scalar_cmov(r, &u, flag); + +#else + + uint64_t u0, u1, u2, u3, u4; + uint64_t r0, r1, r2, r3, r4; + + u0 = (uint64_t)t[0]; + u1 = (uint64_t)t[1] - (u0 >> 63); + u2 = (uint64_t)t[2] - (u1 >> 63); + u3 = (uint64_t)t[3] - (u2 >> 63); + u4 = - (u3 >> 63); + + r0 = 0xFFFFEFFFFFC2FULL * 2; + r1 = 0xFFFFFFFFFFFFFULL * 2; + r2 = 0xFFFFFFFFFFFFFULL * 2; + r3 = 0xFFFFFFFFFFFFFULL * 2; + r4 = 0x0FFFFFFFFFFFFULL * 2; + + r0 += u0 & 0xFFFFFFFFFFFFFULL; + r1 += u0 >> 52 | ((u1 << 12) & 0xFFFFFFFFFFFFFULL); + r2 += u1 >> 40 | ((u2 << 24) & 0xFFFFFFFFFFFFFULL); + r3 += u2 >> 28 | ((u3 << 36) & 0xFFFFFFFFFFFFFULL); + r4 += u3 >> 16 | (u4 << 48); + + r->n[0] = r0; + r->n[1] = r1; + r->n[2] = r2; + r->n[3] = r3; + r->n[4] = r4; + +#endif +} + +static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { + + const uint64_t M62 = UINT64_MAX >> 2; + const uint64_t *d = &a->d[0]; + uint64_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3]; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif + + r[0] = a0 & M62; + r[1] = (a0 >> 62 | a1 << 2) & M62; + r[2] = (a1 >> 60 | a2 << 4) & M62; + r[3] = (a2 >> 58 | a3 << 6) & M62; + r[4] = a3 >> 56; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif +} + +static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f, uint64_t g, int64_t *t) { + + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t c1, c2, x, y, z; + int i; + + for (i = 0; i < 62; ++i) { + + c1 = -(g & (eta >> 15)); + + x = (f ^ g) & c1; + f ^= x; g ^= x; g ^= c1; g -= c1; + + y = (u ^ q) & c1; + u ^= y; q ^= y; q ^= c1; q -= c1; + + z = (v ^ r) & c1; + v ^= z; r ^= z; r ^= c1; r -= c1; + + eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + + c2 = -(g & 1); + + g += (f & c2); g >>= 1; + q += (u & c2); u <<= 1; + r += (v & c2); v <<= 1; + } + + t[0] = (int64_t)u; + t[1] = (int64_t)v; + t[2] = (int64_t)q; + t[3] = (int64_t)r; + + return eta; +} + +static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) { + + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + int128_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + VERIFY_CHECK(((int64_t)cf & M62) == 0); + VERIFY_CHECK(((int64_t)cg & M62) == 0); + + cf >>= 62; + cg >>= 62; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + f[i - 1] = (int64_t)cf & M62; cf >>= 62; + g[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + + f[i - 1] = (int64_t)cf; + g[i - 1] = (int64_t)cg; +} + +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + +#if defined(EXHAUSTIVE_TEST_ORDER) + int i; + *r = 0; + for (i = 0; i < 
EXHAUSTIVE_TEST_ORDER; i++) + if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) + *r = i; + /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus + * have a composite group order; fix it in exhaustive_tests.c). */ + VERIFY_CHECK(*r != 0); +} +#elif 1 + + /* TODO Check for x == 0? */ + + int64_t t[12 * 4]; + int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + int64_t g[5]; + secp256k1_scalar b0, d0, a1, b1, c1, d1; + int i, len, sign; + int16_t eta; + + /* Instead of dividing the output by 2^744, we scale the input. */ + secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + secp256k1_scalar_encode_62(&g[0], &b0); + + eta = -1; + + for (i = 0; i < 12; ++i) { + eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], &t[i * 4]); + len = i <= 6 ? 5 : i >= 10 ? 1 : 11 - i; + secp256k1_scalar_update_fg(len, f, g, &t[i * 4]); + } + + /* At this point, f must equal +/- 1 (the GCD). */ + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 16; + secp256k1_scalar_combine_1s(&t[tOff + 0]); + secp256k1_scalar_combine_1s(&t[tOff + 8]); + secp256k1_scalar_combine_2s(&t[tOff + 0]); + } + + /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ + secp256k1_scalar_decode_matrix(&b0, &t[4]); + /* secp256k1_scalar_decode_matrix(&c0, &t[8]); */ + secp256k1_scalar_decode_matrix(&d0, &t[12]); + + secp256k1_scalar_decode_matrix(&a1, &t[16]); + secp256k1_scalar_decode_matrix(&b1, &t[20]); + secp256k1_scalar_decode_matrix(&c1, &t[24]); + secp256k1_scalar_decode_matrix(&d1, &t[28]); + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + secp256k1_scalar_mul(&c1, &c1, &b0); + secp256k1_scalar_mul(&d1, &d1, &d0); + + secp256k1_scalar_add(&b0, &a1, &b1); + secp256k1_scalar_add(&d0, &c1, &d1); + + secp256k1_scalar_decode_matrix(&a1, &t[32]); + secp256k1_scalar_decode_matrix(&b1, &t[36]); + /* secp256k1_scalar_decode_matrix(&c1, &t[40]); */ + /* secp256k1_scalar_decode_matrix(&d1, &t[44]); */ + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + /* secp256k1_scalar_mul(&c1, &c1, &b0); */ + /* secp256k1_scalar_mul(&d1, &d1, &d0); */ + + secp256k1_scalar_add(&b0, &a1, &b1); + /* secp256k1_scalar_add(&d0, &c1, &d1); */ + + secp256k1_scalar_cond_negate(&b0, sign); + + *r = b0; +} +#else + secp256k1_scalar *t; + int i; + /* First compute xN as x ^ (2^N - 1) for some values of N, + * and uM as x ^ M for some values of M. 
*/ + secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; + secp256k1_scalar u2, u5, u9, u11, u13; + + secp256k1_scalar_sqr(&u2, x); + secp256k1_scalar_mul(&x2, &u2, x); + secp256k1_scalar_mul(&u5, &u2, &x2); + secp256k1_scalar_mul(&x3, &u5, &u2); + secp256k1_scalar_mul(&u9, &x3, &u2); + secp256k1_scalar_mul(&u11, &u9, &u2); + secp256k1_scalar_mul(&u13, &u11, &u2); + + secp256k1_scalar_sqr(&x6, &u13); + secp256k1_scalar_sqr(&x6, &x6); + secp256k1_scalar_mul(&x6, &x6, &u11); + + secp256k1_scalar_sqr(&x8, &x6); + secp256k1_scalar_sqr(&x8, &x8); + secp256k1_scalar_mul(&x8, &x8, &x2); + + secp256k1_scalar_sqr(&x14, &x8); + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(&x14, &x14); + } + secp256k1_scalar_mul(&x14, &x14, &x6); + + secp256k1_scalar_sqr(&x28, &x14); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x28, &x28); + } + secp256k1_scalar_mul(&x28, &x28, &x14); + + secp256k1_scalar_sqr(&x56, &x28); + for (i = 0; i < 27; i++) { + secp256k1_scalar_sqr(&x56, &x56); + } + secp256k1_scalar_mul(&x56, &x56, &x28); + + secp256k1_scalar_sqr(&x112, &x56); + for (i = 0; i < 55; i++) { + secp256k1_scalar_sqr(&x112, &x112); + } + secp256k1_scalar_mul(&x112, &x112, &x56); + + secp256k1_scalar_sqr(&x126, &x112); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x126, &x126); + } + secp256k1_scalar_mul(&x126, &x126, &x14); + + /* Then accumulate the final result (t starts at x126). */ + t = &x126; + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 10; i++) { /* 0000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 9; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(t, t); + } + 
secp256k1_scalar_mul(t, t, &x2); /* 11 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 10; i++) { /* 000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, x); /* 1 */ + for (i = 0; i < 8; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +} +#endif + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_8x32.h b/src/scalar_8x32.h index 2c9a348e24..10c55f1f8b 100644 --- a/src/scalar_8x32.h +++ b/src/scalar_8x32.h @@ -16,4 +16,6 @@ typedef struct { #define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{(d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7)}} +#define SECP256K1_SCALAR_INV_DEFAULT + #endif /* SECP256K1_SCALAR_REPR_H */ diff --git a/src/scalar_impl.h b/src/scalar_impl.h index 70cd73db06..2318fcb0fd 100644 --- a/src/scalar_impl.h +++ b/src/scalar_impl.h @@ -61,6 +61,7 @@ static int secp256k1_scalar_set_b32_seckey(secp256k1_scalar *r, const unsigned c return (!overflow) & (!secp256k1_scalar_is_zero(r)); } +#if defined(SECP256K1_SCALAR_INV_DEFAULT) static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(EXHAUSTIVE_TEST_ORDER) int i; @@ -225,11 +226,12 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar } secp256k1_scalar_mul(r, t, &x6); /* 111111 */ } +#endif +#endif SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { return !(a->d[0] & 1); } -#endif static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(USE_SCALAR_INV_BUILTIN) diff --git a/src/util.h b/src/util.h index 8289e23e0c..b4f7b77344 100644 --- a/src/util.h +++ b/src/util.h @@ -176,6 +176,7 @@ static SECP256K1_INLINE void *manual_alloc(void** prealloc_ptr, size_t alloc_siz # else # define SECP256K1_GNUC_EXT # endif +SECP256K1_GNUC_EXT typedef __int128 int128_t; SECP256K1_GNUC_EXT typedef unsigned __int128 uint128_t; #endif From 4fab082c9a9c7cea2ce2b770f1ba9b74a9bffdda Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 15 Jul 2020 15:34:53 +0700 Subject: [PATCH 02/34] Fix secp256k1_scalar_is_even/scalar_low issue --- src/scalar_4x64_impl.h | 9 ++++++++- src/scalar_impl.h | 4 ++-- src/scalar_low.h | 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 1fcf1ba37c..8ed839e9d8 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1179,7 +1179,6 @@ static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t } static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { - #if defined(EXHAUSTIVE_TEST_ORDER) int i; *r = 0; @@ -1259,6 +1258,10 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *r = b0; } + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} #else secp256k1_scalar *t; int i; @@ -1412,6 +1415,10 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar } secp256k1_scalar_mul(r, t, &x6); /* 111111 */ } + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} #endif #endif 
/* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_impl.h b/src/scalar_impl.h index 2318fcb0fd..a63b735491 100644 --- a/src/scalar_impl.h +++ b/src/scalar_impl.h @@ -226,12 +226,12 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar } secp256k1_scalar_mul(r, t, &x6); /* 111111 */ } -#endif -#endif SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { return !(a->d[0] & 1); } +#endif +#endif static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(USE_SCALAR_INV_BUILTIN) diff --git a/src/scalar_low.h b/src/scalar_low.h index 2794a7f171..c31ca35376 100644 --- a/src/scalar_low.h +++ b/src/scalar_low.h @@ -14,4 +14,6 @@ typedef uint32_t secp256k1_scalar; #define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) (d0) +#define SECP256K1_SCALAR_INV_DEFAULT + #endif /* SECP256K1_SCALAR_REPR_H */ From 0b90a57f7e7e6a4e5035ae8f86b0639ef5362109 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Thu, 16 Jul 2020 12:53:23 +0700 Subject: [PATCH 03/34] TODOs and comments --- src/field_5x52_impl.h | 26 ++++++++++++++------ src/scalar_4x64_impl.h | 56 +++++++++++++----------------------------- 2 files changed, 35 insertions(+), 47 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index a08032afb7..0e6f21d299 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -505,6 +505,11 @@ static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( static void secp256k1_fe_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { + /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. + * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and + * writes the 252-bit signed result to [t[0],t[1],t[2],t[3]]. + */ + int64_t z0, z1, z2, z3; int128_t tt; @@ -568,15 +573,21 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { uint64_t u0, u1, u2, u3, u4; uint64_t r0, r1, r2, r3, r4; + int128_t cc; - /* TODO Need proper carry chain */ + cc = t[0]; + u0 = (uint64_t)cc; cc >>= 64; + cc += t[1]; + u1 = (uint64_t)cc; cc >>= 64; + cc += t[2]; + u2 = (uint64_t)cc; cc >>= 64; + cc += t[3]; + u3 = (uint64_t)cc; cc >>= 64; + u4 = (uint64_t)cc; - u0 = (uint64_t)t[0]; - u1 = (uint64_t)t[1] - (u0 >> 63); - u2 = (uint64_t)t[2] - (u1 >> 63); - u3 = (uint64_t)t[3] - (u2 >> 63); - u4 = - (u3 >> 63); + VERIFY_CHECK(u4 == 0 || u4 == UINT64_MAX); + /* Add twice the field prime in case u4 is non-zero (which represents -2^256). */ r0 = 0xFFFFEFFFFFC2FULL * 2; r1 = 0xFFFFFFFFFFFFFULL * 2; r2 = 0xFFFFFFFFFFFFFULL * 2; @@ -596,8 +607,7 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { r->n[4] = r4; #ifdef VERIFY - /* TODO Probably 2 is enough? */ - r->magnitude = 3; + r->magnitude = 2; r->normalized = 0; secp256k1_fe_verify(r); #endif diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8ed839e9d8..8f5eea587d 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -969,6 +969,11 @@ static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CO static void secp256k1_scalar_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { + /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. + * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and + * writes the 252-bit signed result to [t[0],t[1],t[2],t[3]]. 
+ */ + int64_t z0, z1, z2, z3; int128_t tt; @@ -1030,19 +1035,23 @@ static void secp256k1_scalar_combine_2s(int64_t *t) { static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { -#if 1 - uint64_t r0, r1, r2, r3; int flag; secp256k1_scalar u; + int128_t cc; - /* TODO Need proper carry chain */ - r0 = (uint64_t)t[0]; - r1 = (uint64_t)t[1] - (r0 >> 63); - r2 = (uint64_t)t[2] - (r1 >> 63); - r3 = (uint64_t)t[3] - (r2 >> 63); + cc = t[0]; + r0 = (uint64_t)cc; cc >>= 64; + cc += t[1]; + r1 = (uint64_t)cc; cc >>= 64; + cc += t[2]; + r2 = (uint64_t)cc; cc >>= 64; + cc += t[3]; + r3 = (uint64_t)cc; cc >>= 64; - flag = (int)(r3 >> 63); + VERIFY_CHECK(cc == 0 || cc == -1); + + flag = (int)cc & 1; r->d[0] = r0; r->d[1] = r1; @@ -1051,37 +1060,6 @@ static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); secp256k1_scalar_cmov(r, &u, flag); - -#else - - uint64_t u0, u1, u2, u3, u4; - uint64_t r0, r1, r2, r3, r4; - - u0 = (uint64_t)t[0]; - u1 = (uint64_t)t[1] - (u0 >> 63); - u2 = (uint64_t)t[2] - (u1 >> 63); - u3 = (uint64_t)t[3] - (u2 >> 63); - u4 = - (u3 >> 63); - - r0 = 0xFFFFEFFFFFC2FULL * 2; - r1 = 0xFFFFFFFFFFFFFULL * 2; - r2 = 0xFFFFFFFFFFFFFULL * 2; - r3 = 0xFFFFFFFFFFFFFULL * 2; - r4 = 0x0FFFFFFFFFFFFULL * 2; - - r0 += u0 & 0xFFFFFFFFFFFFFULL; - r1 += u0 >> 52 | ((u1 << 12) & 0xFFFFFFFFFFFFFULL); - r2 += u1 >> 40 | ((u2 << 24) & 0xFFFFFFFFFFFFFULL); - r3 += u2 >> 28 | ((u3 << 36) & 0xFFFFFFFFFFFFFULL); - r4 += u3 >> 16 | (u4 << 48); - - r->n[0] = r0; - r->n[1] = r1; - r->n[2] = r2; - r->n[3] = r3; - r->n[4] = r4; - -#endif } static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { From 0c3869a46d3eccf0324f4c40c076215093de0ab5 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 18 Jul 2020 11:39:17 +0700 Subject: [PATCH 04/34] VERIFY_CHECK _divsteps_62 loop invariant --- src/field_5x52_impl.h | 7 +++++-- src/scalar_4x64_impl.h | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 0e6f21d299..758e6f000d 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -630,14 +630,17 @@ static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { r[4] = a4 >> 40; } -static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f, uint64_t g, int64_t *t) { +static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; - uint64_t c1, c2, x, y, z; + uint64_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 62; ++i) { + VERIFY_CHECK((u * f0 + v * g0) == -f << i); + VERIFY_CHECK((q * f0 + r * g0) == -g << i); + c1 = -(g & (eta >> 15)); x = (f ^ g) & c1; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8f5eea587d..08ccd68175 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1083,14 +1083,17 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { #endif } -static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f, uint64_t g, int64_t *t) { +static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; - uint64_t c1, c2, x, y, z; + uint64_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 62; ++i) { + VERIFY_CHECK((u * f0 + v * g0) == -f << i); + VERIFY_CHECK((q * f0 + r * g0) == -g << i); + c1 = -(g & (eta >> 15)); x = (f ^ g) & c1; 
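For reference, the invariant asserted by the new VERIFY_CHECKs above can be exercised outside the library. The following is a minimal standalone sketch (not part of the patch): it repeats the 62-iteration constant-time divstep loop on arbitrary sample inputs (f0 must be odd, eta starts at -1) and checks the matrix relation with assert() standing in for VERIFY_CHECK. The helper name divsteps_62_check and the sample values are illustrative only.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Standalone sketch: run the 62-step constant-time divstep loop and check
     * the loop invariant added in this patch. All arithmetic wraps mod 2^64. */
    static void divsteps_62_check(uint16_t eta, uint64_t f0, uint64_t g0) {
        uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1;
        uint64_t c1, c2, f = f0, g = g0, x, y, z;
        int i;

        for (i = 0; i < 62; ++i) {
            /* The transition matrix [[u,v],[q,r]] applied to the original
             * (f0, g0) reproduces the current (f, g), scaled by -2^i. */
            assert(u * f0 + v * g0 == -f << i);
            assert(q * f0 + r * g0 == -g << i);

            /* Conditionally swap/negate when eta is negative and g is odd. */
            c1 = -(g & (eta >> 15));
            x = (f ^ g) & c1;
            f ^= x; g ^= x; g ^= c1; g -= c1;
            y = (u ^ q) & c1;
            u ^= y; q ^= y; q ^= c1; q -= c1;
            z = (v ^ r) & c1;
            v ^= z; r ^= z; r ^= c1; r -= c1;
            eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1;

            /* Conditionally add f to g when g is odd, then halve g. */
            c2 = -(g & 1);
            g += (f & c2); g >>= 1;
            q += (u & c2); u <<= 1;
            r += (v & c2); v <<= 1;
        }
    }

    int main(void) {
        divsteps_62_check((uint16_t)-1, 0x3FD25E8CD0364141ULL, 0x1234567890ABCDEFULL);
        printf("divstep invariant held for 62 iterations\n");
        return 0;
    }

Compiled on its own with a C99 compiler, this performs the same branchless updates as secp256k1_scalar_divsteps_62 / secp256k1_fe_divsteps_62 and aborts if the invariant is violated.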
From 11b525c71c599c3dccd48b7e345a88ea052391eb Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 21 Jul 2020 17:00:04 +0700 Subject: [PATCH 05/34] More checks and comments --- src/field_5x52_impl.h | 30 +++++++++++++++++++++++++++--- src/scalar_4x64_impl.h | 26 +++++++++++++++++++++++--- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 758e6f000d..1f2b24ddcd 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -638,6 +638,7 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 for (i = 0; i < 62; ++i) { + VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); @@ -710,7 +711,9 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #if 1 - /* TODO Check for a == 0? */ + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ int64_t t[12 * 4]; int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, @@ -719,14 +722,22 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe b0, d0, a1, b1, c1, d1; int i, len, sign; int16_t eta; +#ifdef VERIFY + int zero_in; +#endif /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input * by 2^768, and then the output by 2^24. */ - /* Instead of dividing the output by 2^744, we scale the input. */ + /* Instead of dividing the output by 2^744, scale the input. */ secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); secp256k1_fe_normalize(&b0); secp256k1_fe_encode_62(&g[0], &b0); +#ifdef VERIFY + zero_in = secp256k1_fe_is_zero(&b0); +#endif + + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -1; for (i = 0; i < 12; ++i) { @@ -735,7 +746,16 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_update_fg(len, f, g, &t[i * 4]); } - /* At this point, f must equal +/- 1 (the GCD). */ + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divstep_62 are combined to get the + * Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divstep_62 introduce an extra factor of 2^62 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). + */ + + VERIFY_CHECK(g[0] == 0); + sign = (f[0] >> 1) & 1; for (i = 0; i < 3; ++i) { @@ -780,6 +800,10 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); +#endif + *r = b0; #else diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 08ccd68175..b153f73050 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1091,6 +1091,7 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, for (i = 0; i < 62; ++i) { + VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); @@ -1172,7 +1173,9 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar } #elif 1 - /* TODO Check for x == 0? 
*/ + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ int64_t t[12 * 4]; int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, @@ -1181,11 +1184,15 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, len, sign; int16_t eta; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif - /* Instead of dividing the output by 2^744, we scale the input. */ + /* Instead of dividing the output by 2^744, scale the input. */ secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); secp256k1_scalar_encode_62(&g[0], &b0); + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -1; for (i = 0; i < 12; ++i) { @@ -1194,7 +1201,16 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar_update_fg(len, f, g, &t[i * 4]); } - /* At this point, f must equal +/- 1 (the GCD). */ + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divstep_62 are combined to get the + * Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divstep_62 introduce an extra factor of 2^62 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). + */ + + VERIFY_CHECK(g[0] == 0); + sign = (f[0] >> 1) & 1; for (i = 0; i < 3; ++i) { @@ -1237,6 +1253,10 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar_cond_negate(&b0, sign); +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); +#endif + *r = b0; } From 3ae7179ad78398c23f27b41dd94baa7c0662c964 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 21 Jul 2020 20:44:13 +0700 Subject: [PATCH 06/34] Update f,g at full length until proper analysis --- src/field_5x52_impl.h | 15 ++++++--------- src/scalar_4x64_impl.h | 15 ++++++--------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 1f2b24ddcd..6f632e2079 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -670,15 +670,13 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 return eta; } -static void secp256k1_fe_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; int128_t cf = 0, cg = 0; int i; - VERIFY_CHECK(len > 0); - fi = f[0]; gi = g[0]; @@ -691,7 +689,7 @@ static void secp256k1_fe_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) cf >>= 62; cg >>= 62; - for (i = 1; i < len; ++i) { + for (i = 1; i < 5; ++i) { fi = f[i]; gi = g[i]; @@ -703,8 +701,8 @@ static void secp256k1_fe_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) g[i - 1] = (int64_t)cg & M62; cg >>= 62; } - f[i - 1] = (int64_t)cf; - g[i - 1] = (int64_t)cg; + f[4] = (int64_t)cf; + g[4] = (int64_t)cg; } static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { @@ -720,7 +718,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_fe b0, d0, a1, b1, c1, d1; - int i, len, sign; + int i, sign; 
int16_t eta; #ifdef VERIFY int zero_in; @@ -742,8 +740,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], &t[i * 4]); - len = i <= 6 ? 5 : i >= 10 ? 1 : 11 - i; - secp256k1_fe_update_fg(len, f, g, &t[i * 4]); + secp256k1_fe_update_fg(f, g, &t[i * 4]); } /* At this point sufficient iterations have been performed that g must have reached 0 diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index b153f73050..91e3a73848 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1123,15 +1123,13 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, return eta; } -static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_scalar_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; int128_t cf = 0, cg = 0; int i; - VERIFY_CHECK(len > 0); - fi = f[0]; gi = g[0]; @@ -1144,7 +1142,7 @@ static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t cf >>= 62; cg >>= 62; - for (i = 1; i < len; ++i) { + for (i = 1; i < 5; ++i) { fi = f[i]; gi = g[i]; @@ -1156,8 +1154,8 @@ static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t g[i - 1] = (int64_t)cg & M62; cg >>= 62; } - f[i - 1] = (int64_t)cf; - g[i - 1] = (int64_t)cg; + f[4] = (int64_t)cf; + g[4] = (int64_t)cg; } static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { @@ -1182,7 +1180,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_scalar b0, d0, a1, b1, c1, d1; - int i, len, sign; + int i, sign; int16_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); @@ -1197,8 +1195,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar for (i = 0; i < 12; ++i) { eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], &t[i * 4]); - len = i <= 6 ? 5 : i >= 10 ? 
1 : 11 - i; - secp256k1_scalar_update_fg(len, f, g, &t[i * 4]); + secp256k1_scalar_update_fg(f, g, &t[i * 4]); } /* At this point sufficient iterations have been performed that g must have reached 0 From 2f643ad31d2a0c9ec58f6be302a302ee3f2a98f4 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 22 Jul 2020 00:13:20 +0700 Subject: [PATCH 07/34] Initial 32bit safegcd - definitely needs bounds analysis --- src/field_10x26.h | 2 - src/field_10x26_impl.h | 543 +++++++++++++++++++++++++++++++++++++ src/field_5x52_impl.h | 18 +- src/scalar_4x64_impl.h | 20 +- src/scalar_8x32.h | 2 - src/scalar_8x32_impl.h | 596 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1158 insertions(+), 23 deletions(-) diff --git a/src/field_10x26.h b/src/field_10x26.h index 312a94c3ae..5ff03c8abc 100644 --- a/src/field_10x26.h +++ b/src/field_10x26.h @@ -47,6 +47,4 @@ typedef struct { #define SECP256K1_FE_STORAGE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }} #define SECP256K1_FE_STORAGE_CONST_GET(d) d.n[7], d.n[6], d.n[5], d.n[4],d.n[3], d.n[2], d.n[1], d.n[0] -#define SECP256K1_FE_INV_DEFAULT - #endif /* SECP256K1_FIELD_REPR_H */ diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 651500ee8e..52995de68c 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1164,4 +1164,547 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( + 0x0E90A100UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, + 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL +); + +static void secp256k1_fe_mul_add_2(int32_t a0, int32_t a1, int32_t b0, int32_t b1, int32_t c0, int32_t c1, int32_t d0, int32_t d1, int32_t *t) { + + /* Each [a0,a1], etc. pair is a ??-bit signed value e.g. a0 + a1 * 2^32. + * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and + * writes the ???-bit signed result to [t[0],t[1],t[2],t[3]]. 
+ */ + + int32_t z0, z1, z2, z3; + int64_t tt; + + tt = (int64_t)a0 * b0 + + (int64_t)c0 * d0; + z0 = (int32_t)tt; tt -= z0; tt >>= 32; + + tt += (int64_t)a0 * b1 + + (int64_t)a1 * b0 + + (int64_t)c0 * d1 + + (int64_t)c1 * d0; + z1 = (int32_t)tt; tt -= z1; tt >>= 32; + + tt += (int64_t)a1 * b1 + + (int64_t)c1 * d1; + z2 = (int32_t)tt; tt -= z2; tt >>= 32; + + z3 = (int32_t)tt; + + t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; +} + +static void secp256k1_fe_mul_add_4(int32_t* tIn, int xPos, int yPos, int uPos, int vPos, int32_t *tOut, int zzPos) { + int32_t y0 = tIn[yPos + 0]; + int32_t y1 = tIn[yPos + 1]; + int32_t y2 = tIn[yPos + 2]; + int32_t y3 = tIn[yPos + 3]; + int32_t v0 = tIn[vPos + 0]; + int32_t v1 = tIn[vPos + 1]; + int32_t v2 = tIn[vPos + 2]; + int32_t v3 = tIn[vPos + 3]; + int32_t xVal, uVal; + int32_t z0, z1, z2, z3, z4, z5, z6, z7; + int64_t c; + + xVal = tIn[xPos]; + uVal = tIn[uPos]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0; + z0 = (int32_t)c; c -= z0; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1; + z1 = (int32_t)c; c -= z1; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3; + z3 = (int32_t)c; c -= z3; c >>= 32; + z4 = (int32_t)c; + + xVal = tIn[xPos + 1]; + uVal = tIn[uPos + 1]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z1; + z1 = (int32_t)c; c -= z1; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + z5 = (int32_t)c; + + xVal = tIn[xPos + 2]; + uVal = tIn[uPos + 2]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z5; + z5 = (int32_t)c; c -= z5; c >>= 32; + z6 = (int32_t)c; + + xVal = tIn[xPos + 3]; + uVal = tIn[uPos + 3]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z5; + z5 = (int32_t)c; c -= z5; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z6; + z6 = (int32_t)c; c -= z6; c >>= 32; + z7 = (int32_t)c; + + tOut[zzPos + 0] = z0; + tOut[zzPos + 1] = z1; + tOut[zzPos + 2] = z2; + tOut[zzPos + 3] = z3; + tOut[zzPos + 4] = z4; + tOut[zzPos + 5] = z5; + tOut[zzPos + 6] = z6; + tOut[zzPos + 7] = z7; +} + +static void secp256k1_fe_combine_1s(int32_t *t) { + + int32_t a = t[0], b = t[1], c = t[2], d = t[3], + e = t[4], f = t[5], g = t[6], h = t[7]; + int64_t I, J, K, L; + + I = (int64_t)e * a + (int64_t)f * c; + J = (int64_t)e * b + (int64_t)f * d; + K = (int64_t)g * a + (int64_t)h * c; + L = (int64_t)g * b + (int64_t)h * d; + + a = (int32_t)I; I -= a; I >>= 32; b = (int32_t)I; + c = (int32_t)J; J -= c; J >>= 32; d = (int32_t)J; + e = (int32_t)K; K -= e; K >>= 32; f = (int32_t)K; + g = (int32_t)L; L -= g; L >>= 32; h = (int32_t)L; + + t[0] = a; t[1] = b; t[2] = c; t[3] = d; + t[4] = e; t[5] = f; t[6] = g; t[7] = h; +} + +static void secp256k1_fe_combine_2s(int32_t *t) { + + int32_t a0 = t[ 0], a1 = t[ 1]; + int32_t b0 = t[ 2], b1 = t[ 3]; + 
int32_t c0 = t[ 4], c1 = t[ 5]; + int32_t d0 = t[ 6], d1 = t[ 7]; + int32_t e0 = t[ 8], e1 = t[ 9]; + int32_t f0 = t[10], f1 = t[11]; + int32_t g0 = t[12], g1 = t[13]; + int32_t h0 = t[14], h1 = t[15]; + + secp256k1_fe_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_fe_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_fe_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_fe_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); +} + +static void secp256k1_fe_combine_4s(int32_t *t) +{ + int32_t tmp[32]; + + int aPos = 0; + int bPos = 4; + int cPos = 8; + int dPos = 12; + int ePos = 16; + int fPos = 20; + int gPos = 24; + int hPos = 28; + + int IPos = 0; + int JPos = 8; + int KPos = 16; + int LPos = 24; + + secp256k1_fe_mul_add_4(t, ePos, aPos, fPos, cPos, tmp, IPos); + secp256k1_fe_mul_add_4(t, ePos, bPos, fPos, dPos, tmp, JPos); + secp256k1_fe_mul_add_4(t, gPos, aPos, hPos, cPos, tmp, KPos); + secp256k1_fe_mul_add_4(t, gPos, bPos, hPos, dPos, tmp, LPos); + + memcpy(t, tmp, 32 * sizeof(int32_t)); +} + +static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int32_t *t) { + + uint32_t u0, u1, u2, u3, u4, u5, u6, u7, u8; + uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; + int64_t cc; + + cc = t[0]; + u0 = (uint32_t)cc; cc >>= 32; + cc += t[1]; + u1 = (uint32_t)cc; cc >>= 32; + cc += t[2]; + u2 = (uint32_t)cc; cc >>= 32; + cc += t[3]; + u3 = (uint32_t)cc; cc >>= 32; + cc += t[4]; + u4 = (uint32_t)cc; cc >>= 32; + cc += t[5]; + u5 = (uint32_t)cc; cc >>= 32; + cc += t[6]; + u6 = (uint32_t)cc; cc >>= 32; + cc += t[7]; + u7 = (uint32_t)cc; cc >>= 32; + u8 = (uint32_t)cc; + + VERIFY_CHECK(u8 == 0 || u8 == UINT32_MAX); + + /* Add twice the field prime in case u8 is non-zero (which represents -2^256). */ + r0 = 0x3FFFC2FUL * 2; + r1 = 0x3FFFFBFUL * 2; + r2 = 0x3FFFFFFUL * 2; + r3 = 0x3FFFFFFUL * 2; + r4 = 0x3FFFFFFUL * 2; + r5 = 0x3FFFFFFUL * 2; + r6 = 0x3FFFFFFUL * 2; + r7 = 0x3FFFFFFUL * 2; + r8 = 0x3FFFFFFUL * 2; + r9 = 0x03FFFFFUL * 2; + + r0 += ( u0 ) & 0x3FFFFFFUL; + r1 += (u0 >> 26 | u1 << 6) & 0x3FFFFFFUL; + r2 += (u1 >> 20 | u2 << 12) & 0x3FFFFFFUL; + r3 += (u2 >> 14 | u3 << 18) & 0x3FFFFFFUL; + r4 += (u3 >> 8 | u4 << 24) & 0x3FFFFFFUL; + r5 += (u4 >> 2 ) & 0x3FFFFFFUL; + r6 += (u4 >> 28 | u5 << 4) & 0x3FFFFFFUL; + r7 += (u5 >> 22 | u6 << 10) & 0x3FFFFFFUL; + r8 += (u6 >> 16 | u7 << 16) & 0x3FFFFFFUL; + r9 += (u7 >> 10 | u8 << 22); + + r->n[0] = r0; + r->n[1] = r1; + r->n[2] = r2; + r->n[3] = r3; + r->n[4] = r4; + r->n[5] = r5; + r->n[6] = r6; + r->n[7] = r7; + r->n[8] = r8; + r->n[9] = r9; + +#ifdef VERIFY + r->magnitude = 2; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_encode_31(int32_t *r, const secp256k1_fe *a) { + + const uint32_t M31 = UINT32_MAX >> 1; + const uint32_t *n = &a->n[0]; + uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4], + a5 = n[5], a6 = n[6], a7 = n[7], a8 = n[8], a9 = n[9]; + +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif + + r[0] = (a0 | a1 << 26) & M31; + r[1] = (a1 >> 5 | a2 << 21) & M31; + r[2] = (a2 >> 10 | a3 << 16) & M31; + r[3] = (a3 >> 15 | a4 << 11) & M31; + r[4] = (a4 >> 20 | a5 << 6) & M31; + r[5] = (a5 >> 25 | a6 << 1 + | a7 << 27) & M31; + r[6] = (a7 >> 4 | a8 << 22) & M31; + r[7] = (a8 >> 9 | a9 << 17) & M31; + r[8] = a9 >> 14; +} + +static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { + + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t c1, c2, f = f0, g = g0, x, y, z; + int i; + + for (i = 0; i < 31; 
++i) { + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << i); + VERIFY_CHECK((q * f0 + r * g0) == -g << i); + + c1 = -(g & (eta >> 15)); + + x = (f ^ g) & c1; + f ^= x; g ^= x; g ^= c1; g -= c1; + + y = (u ^ q) & c1; + u ^= y; q ^= y; q ^= c1; q -= c1; + + z = (v ^ r) & c1; + v ^= z; r ^= z; r ^= c1; r -= c1; + + eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + + c2 = -(g & 1); + + g += (f & c2); g >>= 1; + q += (u & c2); u <<= 1; + r += (v & c2); v <<= 1; + } + + t[0] = (int32_t)u; + t[1] = (int32_t)v; + t[2] = (int32_t)q; + t[3] = (int32_t)r; + + return eta; +} + +static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { + + const int32_t M31 = (int32_t)(UINT32_MAX >> 1); + int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + int64_t cf = 0, cg = 0; + int i; + + fi = f[0]; + gi = g[0]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + VERIFY_CHECK(((int32_t)cf & M31) == 0); + VERIFY_CHECK(((int32_t)cg & M31) == 0); + + cf >>= 31; + cg >>= 31; + + for (i = 1; i < 9; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + f[i - 1] = (int32_t)cf & M31; cf >>= 31; + g[i - 1] = (int32_t)cg & M31; cg >>= 31; + } + + f[8] = (int32_t)cf; + g[8] = (int32_t)cg; +} + +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { + +#if 1 + + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ + + int32_t t[24 * 4]; + int32_t f[9] = { 0x7FFFFC2FL, 0x7FFFFFFDL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, + 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t g[9]; + secp256k1_fe b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in; +#endif + + /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input + * by 2^768, and then the output by 2^24. */ + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + secp256k1_fe_normalize(&b0); + secp256k1_fe_encode_31(&g[0], &b0); + +#ifdef VERIFY + zero_in = secp256k1_fe_is_zero(&b0); +#endif + + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; + + for (i = 0; i < 24; ++i) { + eta = secp256k1_fe_divsteps_31(eta, f[0], g[0], &t[i * 4]); + secp256k1_fe_update_fg(f, g, &t[i * 4]); + } + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divstep_31 are combined to get the + * Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divstep_31 introduce an extra factor of 2^31 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
+ */ + + VERIFY_CHECK(g[0] == 0); + + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 32; + secp256k1_fe_combine_1s(&t[tOff + 0]); + secp256k1_fe_combine_1s(&t[tOff + 8]); + secp256k1_fe_combine_1s(&t[tOff + 16]); + secp256k1_fe_combine_1s(&t[tOff + 24]); + secp256k1_fe_combine_2s(&t[tOff + 0]); + secp256k1_fe_combine_2s(&t[tOff + 16]); + secp256k1_fe_combine_4s(&t[tOff + 0]); + } + + /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ + secp256k1_fe_decode_matrix(&b0, &t[8]); + /* secp256k1_fe_decode_matrix(&c0, &t[16]); */ + secp256k1_fe_decode_matrix(&d0, &t[24]); + + secp256k1_fe_decode_matrix(&a1, &t[32]); + secp256k1_fe_decode_matrix(&b1, &t[40]); + secp256k1_fe_decode_matrix(&c1, &t[48]); + secp256k1_fe_decode_matrix(&d1, &t[56]); + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + secp256k1_fe_mul(&c1, &c1, &b0); + secp256k1_fe_mul(&d1, &d1, &d0); + + b0 = a1; secp256k1_fe_add(&b0, &b1); + d0 = c1; secp256k1_fe_add(&d0, &d1); + + secp256k1_fe_decode_matrix(&a1, &t[64]); + secp256k1_fe_decode_matrix(&b1, &t[72]); + /* secp256k1_fe_decode_matrix(&c1, &t[80]); */ + /* secp256k1_fe_decode_matrix(&d1, &t[88]); */ + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + /* secp256k1_fe_mul(&c1, &c1, &b0); */ + /* secp256k1_fe_mul(&d1, &d1, &d0); */ + + b0 = a1; secp256k1_fe_add(&b0, &b1); + /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + + secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_cmov(&b0, &b1, sign); + secp256k1_fe_normalize_weak(&b0); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); +#endif + + *r = b0; + +#else + + secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; + int j; + + /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in + * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: + * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + */ + + secp256k1_fe_sqr(&x2, a); + secp256k1_fe_mul(&x2, &x2, a); + + secp256k1_fe_sqr(&x3, &x2); + secp256k1_fe_mul(&x3, &x3, a); + + x6 = x3; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x6, &x6); + } + secp256k1_fe_mul(&x6, &x6, &x3); + + x9 = x6; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x9, &x9); + } + secp256k1_fe_mul(&x9, &x9, &x3); + + x11 = x9; + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&x11, &x11); + } + secp256k1_fe_mul(&x11, &x11, &x2); + + x22 = x11; + for (j=0; j<11; j++) { + secp256k1_fe_sqr(&x22, &x22); + } + secp256k1_fe_mul(&x22, &x22, &x11); + + x44 = x22; + for (j=0; j<22; j++) { + secp256k1_fe_sqr(&x44, &x44); + } + secp256k1_fe_mul(&x44, &x44, &x22); + + x88 = x44; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x88, &x88); + } + secp256k1_fe_mul(&x88, &x88, &x44); + + x176 = x88; + for (j=0; j<88; j++) { + secp256k1_fe_sqr(&x176, &x176); + } + secp256k1_fe_mul(&x176, &x176, &x88); + + x220 = x176; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x220, &x220); + } + secp256k1_fe_mul(&x220, &x220, &x44); + + x223 = x220; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x223, &x223); + } + secp256k1_fe_mul(&x223, &x223, &x3); + + /* The final result is then assembled using a sliding window over the blocks. 
*/ + + t1 = x223; + for (j=0; j<23; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x22); + for (j=0; j<5; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, a); + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x2); + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(r, a, &t1); +#endif +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 6f632e2079..03d10b1dce 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -503,7 +503,7 @@ static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL ); -static void secp256k1_fe_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { +static void secp256k1_fe_mul_add_2(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and @@ -563,10 +563,10 @@ static void secp256k1_fe_combine_2s(int64_t *t) { int64_t g0 = t[12], g1 = t[13]; int64_t h0 = t[14], h1 = t[15]; - secp256k1_fe_mul_add(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_fe_mul_add(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_fe_mul_add(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_fe_mul_add(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); + secp256k1_fe_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_fe_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_fe_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_fe_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); } static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { @@ -595,10 +595,10 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { r4 = 0x0FFFFFFFFFFFFULL * 2; r0 += u0 & 0xFFFFFFFFFFFFFULL; - r1 += u0 >> 52 | ((u1 << 12) & 0xFFFFFFFFFFFFFULL); - r2 += u1 >> 40 | ((u2 << 24) & 0xFFFFFFFFFFFFFULL); - r3 += u2 >> 28 | ((u3 << 36) & 0xFFFFFFFFFFFFFULL); - r4 += u3 >> 16 | (u4 << 48); + r1 += (u0 >> 52 | u1 << 12) & 0xFFFFFFFFFFFFFULL; + r2 += (u1 >> 40 | u2 << 24) & 0xFFFFFFFFFFFFFULL; + r3 += (u2 >> 28 | u3 << 36) & 0xFFFFFFFFFFFFFULL; + r4 += (u3 >> 16 | u4 << 48); r->n[0] = r0; r->n[1] = r1; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 91e3a73848..6dea776161 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -967,7 +967,7 @@ static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CO 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL ); -static void secp256k1_scalar_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { +static void secp256k1_scalar_mul_add_2(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. 
* This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and @@ -1027,10 +1027,10 @@ static void secp256k1_scalar_combine_2s(int64_t *t) { int64_t g0 = t[12], g1 = t[13]; int64_t h0 = t[14], h1 = t[15]; - secp256k1_scalar_mul_add(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_scalar_mul_add(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_scalar_mul_add(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_scalar_mul_add(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); + secp256k1_scalar_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_scalar_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_scalar_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_scalar_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); } static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { @@ -1041,13 +1041,13 @@ static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { int128_t cc; cc = t[0]; - r0 = (uint64_t)cc; cc >>= 64; + r0 = (uint64_t)cc; cc >>= 64; cc += t[1]; - r1 = (uint64_t)cc; cc >>= 64; + r1 = (uint64_t)cc; cc >>= 64; cc += t[2]; - r2 = (uint64_t)cc; cc >>= 64; + r2 = (uint64_t)cc; cc >>= 64; cc += t[3]; - r3 = (uint64_t)cc; cc >>= 64; + r3 = (uint64_t)cc; cc >>= 64; VERIFY_CHECK(cc == 0 || cc == -1); @@ -1188,7 +1188,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar /* Instead of dividing the output by 2^744, scale the input. */ secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); - secp256k1_scalar_encode_62(&g[0], &b0); + secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -1; diff --git a/src/scalar_8x32.h b/src/scalar_8x32.h index 10c55f1f8b..2c9a348e24 100644 --- a/src/scalar_8x32.h +++ b/src/scalar_8x32.h @@ -16,6 +16,4 @@ typedef struct { #define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{(d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7)}} -#define SECP256K1_SCALAR_INV_DEFAULT - #endif /* SECP256K1_SCALAR_REPR_H */ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 3c372f34fe..cd5a985213 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -733,4 +733,600 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[7] = (r->d[7] & mask0) | (a->d[7] & mask1); } +static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALAR_CONST( + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFDUL, + 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL +); + +static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CONST( + 0x4E165355UL, 0x5D800C18UL, 0xEF116DB1UL, 0xB31347F1UL, + 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL +); + +static void secp256k1_scalar_mul_add_2(int32_t a0, int32_t a1, int32_t b0, int32_t b1, int32_t c0, int32_t c1, int32_t d0, int32_t d1, int32_t *t) { + + /* Each [a0,a1], etc. pair is a ??-bit signed value e.g. a0 + a1 * 2^32. + * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and + * writes the ???-bit signed result to [t[0],t[1],t[2],t[3]]. 
+ */ + + int32_t z0, z1, z2, z3; + int64_t tt; + + tt = (int64_t)a0 * b0 + + (int64_t)c0 * d0; + z0 = (int32_t)tt; tt -= z0; tt >>= 32; + + tt += (int64_t)a0 * b1 + + (int64_t)a1 * b0 + + (int64_t)c0 * d1 + + (int64_t)c1 * d0; + z1 = (int32_t)tt; tt -= z1; tt >>= 32; + + tt += (int64_t)a1 * b1 + + (int64_t)c1 * d1; + z2 = (int32_t)tt; tt -= z2; tt >>= 32; + + z3 = (int32_t)tt; + + t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; +} + +static void secp256k1_scalar_mul_add_4(int32_t* tIn, int xPos, int yPos, int uPos, int vPos, int32_t *tOut, int zzPos) { + int32_t y0 = tIn[yPos + 0]; + int32_t y1 = tIn[yPos + 1]; + int32_t y2 = tIn[yPos + 2]; + int32_t y3 = tIn[yPos + 3]; + int32_t v0 = tIn[vPos + 0]; + int32_t v1 = tIn[vPos + 1]; + int32_t v2 = tIn[vPos + 2]; + int32_t v3 = tIn[vPos + 3]; + int32_t xVal, uVal; + int32_t z0, z1, z2, z3, z4, z5, z6, z7; + int64_t c; + + xVal = tIn[xPos]; + uVal = tIn[uPos]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0; + z0 = (int32_t)c; c -= z0; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1; + z1 = (int32_t)c; c -= z1; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3; + z3 = (int32_t)c; c -= z3; c >>= 32; + z4 = (int32_t)c; + + xVal = tIn[xPos + 1]; + uVal = tIn[uPos + 1]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z1; + z1 = (int32_t)c; c -= z1; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + z5 = (int32_t)c; + + xVal = tIn[xPos + 2]; + uVal = tIn[uPos + 2]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z5; + z5 = (int32_t)c; c -= z5; c >>= 32; + z6 = (int32_t)c; + + xVal = tIn[xPos + 3]; + uVal = tIn[uPos + 3]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z5; + z5 = (int32_t)c; c -= z5; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z6; + z6 = (int32_t)c; c -= z6; c >>= 32; + z7 = (int32_t)c; + + tOut[zzPos + 0] = z0; + tOut[zzPos + 1] = z1; + tOut[zzPos + 2] = z2; + tOut[zzPos + 3] = z3; + tOut[zzPos + 4] = z4; + tOut[zzPos + 5] = z5; + tOut[zzPos + 6] = z6; + tOut[zzPos + 7] = z7; +} + +static void secp256k1_scalar_combine_1s(int32_t *t) { + + int32_t a = t[0], b = t[1], c = t[2], d = t[3], + e = t[4], f = t[5], g = t[6], h = t[7]; + int64_t I, J, K, L; + + I = (int64_t)e * a + (int64_t)f * c; + J = (int64_t)e * b + (int64_t)f * d; + K = (int64_t)g * a + (int64_t)h * c; + L = (int64_t)g * b + (int64_t)h * d; + + a = (int32_t)I; I -= a; I >>= 32; b = (int32_t)I; + c = (int32_t)J; J -= c; J >>= 32; d = (int32_t)J; + e = (int32_t)K; K -= e; K >>= 32; f = (int32_t)K; + g = (int32_t)L; L -= g; L >>= 32; h = (int32_t)L; + + t[0] = a; t[1] = b; t[2] = c; t[3] = d; + t[4] = e; t[5] = f; t[6] = g; t[7] = h; +} + +static void secp256k1_scalar_combine_2s(int32_t *t) { + + int32_t a0 = t[ 0], a1 = t[ 1]; + int32_t b0 = t[ 2], b1 = t[ 
3]; + int32_t c0 = t[ 4], c1 = t[ 5]; + int32_t d0 = t[ 6], d1 = t[ 7]; + int32_t e0 = t[ 8], e1 = t[ 9]; + int32_t f0 = t[10], f1 = t[11]; + int32_t g0 = t[12], g1 = t[13]; + int32_t h0 = t[14], h1 = t[15]; + + secp256k1_scalar_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_scalar_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_scalar_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_scalar_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); +} + +static void secp256k1_scalar_combine_4s(int32_t *t) +{ + int32_t tmp[32]; + + int aPos = 0; + int bPos = 4; + int cPos = 8; + int dPos = 12; + int ePos = 16; + int fPos = 20; + int gPos = 24; + int hPos = 28; + + int IPos = 0; + int JPos = 8; + int KPos = 16; + int LPos = 24; + + secp256k1_scalar_mul_add_4(t, ePos, aPos, fPos, cPos, tmp, IPos); + secp256k1_scalar_mul_add_4(t, ePos, bPos, fPos, dPos, tmp, JPos); + secp256k1_scalar_mul_add_4(t, gPos, aPos, hPos, cPos, tmp, KPos); + secp256k1_scalar_mul_add_4(t, gPos, bPos, hPos, dPos, tmp, LPos); + + memcpy(t, tmp, 32 * sizeof(int32_t)); +} + +static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int32_t *t) { + + uint32_t r0, r1, r2, r3, r4, r5, r6, r7; + int flag; + secp256k1_scalar u; + int64_t cc; + + cc = t[0]; + r0 = (uint32_t)cc; cc >>= 32; + cc += t[1]; + r1 = (uint32_t)cc; cc >>= 32; + cc += t[2]; + r2 = (uint32_t)cc; cc >>= 32; + cc += t[3]; + r3 = (uint32_t)cc; cc >>= 32; + cc += t[4]; + r4 = (uint32_t)cc; cc >>= 32; + cc += t[5]; + r5 = (uint32_t)cc; cc >>= 32; + cc += t[6]; + r6 = (uint32_t)cc; cc >>= 32; + cc += t[7]; + r7 = (uint32_t)cc; cc >>= 32; + + VERIFY_CHECK(cc == 0 || cc == -1); + + flag = (int)cc & 1; + + r->d[0] = r0; + r->d[1] = r1; + r->d[2] = r2; + r->d[3] = r3; + r->d[4] = r4; + r->d[5] = r5; + r->d[6] = r6; + r->d[7] = r7; + + secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); + secp256k1_scalar_cmov(r, &u, flag); +} + +static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { + + const uint32_t M31 = UINT32_MAX >> 1; + const uint32_t *d = &a->d[0]; + uint32_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3], + a4 = d[4], a5 = d[5], a6 = d[6], a7 = d[7]; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif + + r[0] = a0 & M31; + r[1] = (a0 >> 31 | a1 << 1) & M31; + r[2] = (a1 >> 30 | a2 << 2) & M31; + r[3] = (a2 >> 29 | a3 << 3) & M31; + r[4] = (a3 >> 28 | a4 << 4) & M31; + r[5] = (a4 >> 27 | a5 << 5) & M31; + r[6] = (a5 >> 26 | a6 << 6) & M31; + r[7] = (a6 >> 25 | a7 << 7) & M31; + r[8] = a7 >> 24; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif +} + +static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { + + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t c1, c2, f = f0, g = g0, x, y, z; + int i; + + for (i = 0; i < 31; ++i) { + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << i); + VERIFY_CHECK((q * f0 + r * g0) == -g << i); + + c1 = -(g & (eta >> 15)); + + x = (f ^ g) & c1; + f ^= x; g ^= x; g ^= c1; g -= c1; + + y = (u ^ q) & c1; + u ^= y; q ^= y; q ^= c1; q -= c1; + + z = (v ^ r) & c1; + v ^= z; r ^= z; r ^= c1; r -= c1; + + eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + + c2 = -(g & 1); + + g += (f & c2); g >>= 1; + q += (u & c2); u <<= 1; + r += (v & c2); v <<= 1; + } + + t[0] = (int32_t)u; + t[1] = (int32_t)v; + t[2] = (int32_t)q; + t[3] = (int32_t)r; + + return eta; +} + +static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, 
int32_t *t) { + + const int32_t M31 = (int32_t)(UINT32_MAX >> 1); + int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + int64_t cf = 0, cg = 0; + int i; + + fi = f[0]; + gi = g[0]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + VERIFY_CHECK(((int32_t)cf & M31) == 0); + VERIFY_CHECK(((int32_t)cg & M31) == 0); + + cf >>= 31; + cg >>= 31; + + for (i = 1; i < 9; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + f[i - 1] = (int32_t)cf & M31; cf >>= 31; + g[i - 1] = (int32_t)cg & M31; cg >>= 31; + } + + f[8] = (int32_t)cf; + g[8] = (int32_t)cg; +} + +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { +#if defined(EXHAUSTIVE_TEST_ORDER) + int i; + *r = 0; + for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++) + if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) + *r = i; + /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus + * have a composite group order; fix it in exhaustive_tests.c). */ + VERIFY_CHECK(*r != 0); +} +#elif 1 + + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ + + int32_t t[24 * 4]; + int32_t f[9] = { 0x50364141L, 0x7FA4BD19L, 0x3D2280EEL, 0x5576E735L, 0x7FFFFFEBL, + 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t g[9]; + secp256k1_scalar b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + secp256k1_scalar_encode_31(g, &b0); + + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; + + for (i = 0; i < 24; ++i) { + eta = secp256k1_scalar_divsteps_31(eta, f[0], g[0], &t[i * 4]); + secp256k1_scalar_update_fg(f, g, &t[i * 4]); + } + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divstep_31 are combined to get the + * Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divstep_31 introduce an extra factor of 2^31 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
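+ * (24 iterations of 31 divsteps each give 744 divsteps in total, enough to satisfy
+ * the paper's worst-case bound for 256-bit inputs.)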
+ */ + + VERIFY_CHECK(g[0] == 0); + + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 32; + secp256k1_scalar_combine_1s(&t[tOff + 0]); + secp256k1_scalar_combine_1s(&t[tOff + 8]); + secp256k1_scalar_combine_1s(&t[tOff + 16]); + secp256k1_scalar_combine_1s(&t[tOff + 24]); + secp256k1_scalar_combine_2s(&t[tOff + 0]); + secp256k1_scalar_combine_2s(&t[tOff + 16]); + secp256k1_scalar_combine_4s(&t[tOff + 0]); + } + + /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ + secp256k1_scalar_decode_matrix(&b0, &t[8]); + /* secp256k1_scalar_decode_matrix(&c0, &t[16]); */ + secp256k1_scalar_decode_matrix(&d0, &t[24]); + + secp256k1_scalar_decode_matrix(&a1, &t[32]); + secp256k1_scalar_decode_matrix(&b1, &t[40]); + secp256k1_scalar_decode_matrix(&c1, &t[48]); + secp256k1_scalar_decode_matrix(&d1, &t[56]); + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + secp256k1_scalar_mul(&c1, &c1, &b0); + secp256k1_scalar_mul(&d1, &d1, &d0); + + secp256k1_scalar_add(&b0, &a1, &b1); + secp256k1_scalar_add(&d0, &c1, &d1); + + secp256k1_scalar_decode_matrix(&a1, &t[64]); + secp256k1_scalar_decode_matrix(&b1, &t[72]); + /* secp256k1_scalar_decode_matrix(&c1, &t[80]); */ + /* secp256k1_scalar_decode_matrix(&d1, &t[88]); */ + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + /* secp256k1_scalar_mul(&c1, &c1, &b0); */ + /* secp256k1_scalar_mul(&d1, &d1, &d0); */ + + secp256k1_scalar_add(&b0, &a1, &b1); + /* secp256k1_scalar_add(&d0, &c1, &d1); */ + + secp256k1_scalar_cond_negate(&b0, sign); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); +#endif + + *r = b0; +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} +#else + secp256k1_scalar *t; + int i; + /* First compute xN as x ^ (2^N - 1) for some values of N, + * and uM as x ^ M for some values of M. */ + secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; + secp256k1_scalar u2, u5, u9, u11, u13; + + secp256k1_scalar_sqr(&u2, x); + secp256k1_scalar_mul(&x2, &u2, x); + secp256k1_scalar_mul(&u5, &u2, &x2); + secp256k1_scalar_mul(&x3, &u5, &u2); + secp256k1_scalar_mul(&u9, &x3, &u2); + secp256k1_scalar_mul(&u11, &u9, &u2); + secp256k1_scalar_mul(&u13, &u11, &u2); + + secp256k1_scalar_sqr(&x6, &u13); + secp256k1_scalar_sqr(&x6, &x6); + secp256k1_scalar_mul(&x6, &x6, &u11); + + secp256k1_scalar_sqr(&x8, &x6); + secp256k1_scalar_sqr(&x8, &x8); + secp256k1_scalar_mul(&x8, &x8, &x2); + + secp256k1_scalar_sqr(&x14, &x8); + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(&x14, &x14); + } + secp256k1_scalar_mul(&x14, &x14, &x6); + + secp256k1_scalar_sqr(&x28, &x14); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x28, &x28); + } + secp256k1_scalar_mul(&x28, &x28, &x14); + + secp256k1_scalar_sqr(&x56, &x28); + for (i = 0; i < 27; i++) { + secp256k1_scalar_sqr(&x56, &x56); + } + secp256k1_scalar_mul(&x56, &x56, &x28); + + secp256k1_scalar_sqr(&x112, &x56); + for (i = 0; i < 55; i++) { + secp256k1_scalar_sqr(&x112, &x112); + } + secp256k1_scalar_mul(&x112, &x112, &x56); + + secp256k1_scalar_sqr(&x126, &x112); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x126, &x126); + } + secp256k1_scalar_mul(&x126, &x126, &x14); + + /* Then accumulate the final result (t starts at x126). 
*/ + t = &x126; + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 10; i++) { /* 0000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 9; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x2); /* 11 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 10; i++) { /* 000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, x); /* 1 */ + for (i = 0; i < 8; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} +#endif + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ From b29e51e1eef281c60b6d5449d249ae5aa688e23b Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Mon, 27 Jul 2020 14:57:40 +0700 Subject: [PATCH 08/34] Minor cleanup --- src/field_10x26_impl.h | 8 ++++---- src/field_5x52_impl.h | 8 ++++---- src/scalar_4x64_impl.h | 6 +++--- src/scalar_8x32_impl.h | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 52995de68c..9173d5ae36 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1537,7 +1537,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* 
Instead of dividing the output by 2^744, scale the input. */ secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_31(&g[0], &b0); + secp256k1_fe_encode_31(g, &b0); #ifdef VERIFY zero_in = secp256k1_fe_is_zero(&b0); @@ -1553,9 +1553,9 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divstep_31 are combined to get the - * Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divstep_31 introduce an extra factor of 2^31 each, so there is a total extra + * values i.e. +/- 1. The matrix outputs from each _divsteps_31 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_31 introduce an extra factor of 2^31 each, so there is a total extra * factor of 2^744 to account for (by scaling the input and/or output accordingly). */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 03d10b1dce..f73f356257 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -729,7 +729,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Instead of dividing the output by 2^744, scale the input. */ secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_62(&g[0], &b0); + secp256k1_fe_encode_62(g, &b0); #ifdef VERIFY zero_in = secp256k1_fe_is_zero(&b0); @@ -745,9 +745,9 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divstep_62 are combined to get the - * Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divstep_62 introduce an extra factor of 2^62 each, so there is a total extra + * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra * factor of 2^744 to account for (by scaling the input and/or output accordingly). */ diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 6dea776161..43fc415eeb 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1200,9 +1200,9 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divstep_62 are combined to get the - * Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divstep_62 introduce an extra factor of 2^62 each, so there is a total extra + * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
*/ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index cd5a985213..d0593939ed 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -1106,9 +1106,9 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divstep_31 are combined to get the - * Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divstep_31 introduce an extra factor of 2^31 each, so there is a total extra + * values i.e. +/- 1. The matrix outputs from each _divsteps_31 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_31 introduce an extra factor of 2^31 each, so there is a total extra * factor of 2^744 to account for (by scaling the input and/or output accordingly). */ From 3519dccfe4aedac46d73d11736151ee3f7966deb Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 28 Jul 2020 01:22:58 +0700 Subject: [PATCH 09/34] Initial _inv_var implementations --- src/field_10x26_impl.h | 208 ++++++++++++++++++----------- src/field_5x52_impl.h | 204 ++++++++++++++++++----------- src/field_impl.h | 2 + src/scalar_4x64_impl.h | 283 +++++++++++++++++++--------------------- src/scalar_8x32_impl.h | 287 ++++++++++++++++++++--------------------- src/scalar_impl.h | 2 + src/scalar_low.h | 1 + 7 files changed, 539 insertions(+), 448 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 9173d5ae36..c502c772e6 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1478,6 +1478,58 @@ static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int3 return eta; } +static int secp256k1_fe_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { + + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t f = f0, g = g0, m, w, x, y, z; + int i = 31, limit, zeros; + + for (;;) { + + /* Use a sentinel bit to count zeros only up to i. */ + zeros = __builtin_ctzl(g | (UINT32_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); + + if ((int16_t)eta < 0) { + eta = -eta; + x = f; f = g; g = -x; + y = u; u = q; q = -y; + z = v; v = r; r = -z; + } + + /* Handle up to 3 divsteps at once, subject to eta and i. */ + limit = (eta + 1) > i ? i : (eta + 1); + m = (UINT32_MAX >> (32 - limit)) & 7U; + + /* Note that f * f == 1 mod 8, for any f. */ + w = (-f * g) & m; + g += f * w; + q += u * w; + r += v * w; + } + + t[0] = (int32_t)u; + t[1] = (int32_t)v; + t[2] = (int32_t)q; + t[3] = (int32_t)r; + + return eta; +} + static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { const int32_t M31 = (int32_t)(UINT32_MAX >> 1); @@ -1515,8 +1567,6 @@ static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { -#if 1 - /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ @@ -1614,97 +1664,107 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #endif *r = b0; +} -#else - - secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; - int j; +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { - /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in - * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: - * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - secp256k1_fe_sqr(&x2, a); - secp256k1_fe_mul(&x2, &x2, a); + int32_t t[24 * 4]; + int32_t f[9] = { 0x7FFFFC2FL, 0x7FFFFFFDL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, + 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t g[9]; + secp256k1_fe b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in; +#endif - secp256k1_fe_sqr(&x3, &x2); - secp256k1_fe_mul(&x3, &x3, a); + /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input + * by 2^768, and then the output by 2^24. */ + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + secp256k1_fe_normalize(&b0); + secp256k1_fe_encode_31(g, &b0); - x6 = x3; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x6, &x6); - } - secp256k1_fe_mul(&x6, &x6, &x3); +#ifdef VERIFY + zero_in = secp256k1_fe_is_zero(&b0); +#endif - x9 = x6; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x9, &x9); - } - secp256k1_fe_mul(&x9, &x9, &x3); + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; - x11 = x9; - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&x11, &x11); + for (i = 0; i < 24; ++i) { + eta = secp256k1_fe_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); + secp256k1_fe_update_fg(f, g, &t[i * 4]); } - secp256k1_fe_mul(&x11, &x11, &x2); - x22 = x11; - for (j=0; j<11; j++) { - secp256k1_fe_sqr(&x22, &x22); - } - secp256k1_fe_mul(&x22, &x22, &x11); + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divsteps_31_var are combined to + * get the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_31_var introduce an extra factor of 2^31 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
+ */ - x44 = x22; - for (j=0; j<22; j++) { - secp256k1_fe_sqr(&x44, &x44); - } - secp256k1_fe_mul(&x44, &x44, &x22); + VERIFY_CHECK(g[0] == 0); - x88 = x44; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x88, &x88); - } - secp256k1_fe_mul(&x88, &x88, &x44); + sign = (f[0] >> 1) & 1; - x176 = x88; - for (j=0; j<88; j++) { - secp256k1_fe_sqr(&x176, &x176); + for (i = 0; i < 3; ++i) { + int tOff = i * 32; + secp256k1_fe_combine_1s(&t[tOff + 0]); + secp256k1_fe_combine_1s(&t[tOff + 8]); + secp256k1_fe_combine_1s(&t[tOff + 16]); + secp256k1_fe_combine_1s(&t[tOff + 24]); + secp256k1_fe_combine_2s(&t[tOff + 0]); + secp256k1_fe_combine_2s(&t[tOff + 16]); + secp256k1_fe_combine_4s(&t[tOff + 0]); } - secp256k1_fe_mul(&x176, &x176, &x88); - x220 = x176; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x220, &x220); - } - secp256k1_fe_mul(&x220, &x220, &x44); + /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ + secp256k1_fe_decode_matrix(&b0, &t[8]); + /* secp256k1_fe_decode_matrix(&c0, &t[16]); */ + secp256k1_fe_decode_matrix(&d0, &t[24]); - x223 = x220; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x223, &x223); - } - secp256k1_fe_mul(&x223, &x223, &x3); + secp256k1_fe_decode_matrix(&a1, &t[32]); + secp256k1_fe_decode_matrix(&b1, &t[40]); + secp256k1_fe_decode_matrix(&c1, &t[48]); + secp256k1_fe_decode_matrix(&d1, &t[56]); + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + secp256k1_fe_mul(&c1, &c1, &b0); + secp256k1_fe_mul(&d1, &d1, &d0); - /* The final result is then assembled using a sliding window over the blocks. */ + b0 = a1; secp256k1_fe_add(&b0, &b1); + d0 = c1; secp256k1_fe_add(&d0, &d1); - t1 = x223; - for (j=0; j<23; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x22); - for (j=0; j<5; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, a); - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x2); - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(r, a, &t1); + secp256k1_fe_decode_matrix(&a1, &t[64]); + secp256k1_fe_decode_matrix(&b1, &t[72]); + /* secp256k1_fe_decode_matrix(&c1, &t[80]); */ + /* secp256k1_fe_decode_matrix(&d1, &t[88]); */ + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + /* secp256k1_fe_mul(&c1, &c1, &b0); */ + /* secp256k1_fe_mul(&d1, &d1, &d0); */ + + b0 = a1; secp256k1_fe_add(&b0, &b1); + /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + + secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_cmov(&b0, &b1, sign); + secp256k1_fe_normalize_weak(&b0); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); #endif + + *r = b0; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index f73f356257..0cc7e80056 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -670,6 +670,58 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 return eta; } +static int secp256k1_fe_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { + + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t f = f0, g = g0, m, w, x, y, z; + int i = 62, limit, zeros; + + for (;;) { + + /* Use a sentinel bit to count zeros only up to i. 
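+ * (ORing in UINT64_MAX << i plants a set bit at position i, so the count returned
+ * by __builtin_ctzll never exceeds the number of divsteps still remaining.)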
*/ + zeros = __builtin_ctzll(g | (UINT64_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); + + if ((int16_t)eta < 0) { + eta = -eta; + x = f; f = g; g = -x; + y = u; u = q; q = -y; + z = v; v = r; r = -z; + } + + /* Handle up to 3 divsteps at once, subject to eta and i. */ + limit = (eta + 1) > i ? i : (eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 7U; + + /* Note that f * f == 1 mod 8, for any f. */ + w = (-f * g) & m; + g += f * w; + q += u * w; + r += v * w; + } + + t[0] = (int64_t)u; + t[1] = (int64_t)v; + t[2] = (int64_t)q; + t[3] = (int64_t)r; + + return eta; +} + static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); @@ -707,8 +759,6 @@ static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { -#if 1 - /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ @@ -802,97 +852,103 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #endif *r = b0; +} -#else - - secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; - int j; +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { - /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in - * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: - * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - secp256k1_fe_sqr(&x2, a); - secp256k1_fe_mul(&x2, &x2, a); + int64_t t[12 * 4]; + int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + int64_t g[5]; + secp256k1_fe b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in; +#endif - secp256k1_fe_sqr(&x3, &x2); - secp256k1_fe_mul(&x3, &x3, a); + /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input + * by 2^768, and then the output by 2^24. */ + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + secp256k1_fe_normalize(&b0); + secp256k1_fe_encode_62(g, &b0); - x6 = x3; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x6, &x6); - } - secp256k1_fe_mul(&x6, &x6, &x3); +#ifdef VERIFY + zero_in = secp256k1_fe_is_zero(&b0); +#endif - x9 = x6; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x9, &x9); - } - secp256k1_fe_mul(&x9, &x9, &x3); + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; - x11 = x9; - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&x11, &x11); + for (i = 0; i < 12; ++i) { + eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); + secp256k1_fe_update_fg(f, g, &t[i * 4]); } - secp256k1_fe_mul(&x11, &x11, &x2); - x22 = x11; - for (j=0; j<11; j++) { - secp256k1_fe_sqr(&x22, &x22); - } - secp256k1_fe_mul(&x22, &x22, &x11); + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. 
The matrix outputs from each _divsteps_62 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). + */ - x44 = x22; - for (j=0; j<22; j++) { - secp256k1_fe_sqr(&x44, &x44); - } - secp256k1_fe_mul(&x44, &x44, &x22); + VERIFY_CHECK(g[0] == 0); - x88 = x44; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x88, &x88); - } - secp256k1_fe_mul(&x88, &x88, &x44); + sign = (f[0] >> 1) & 1; - x176 = x88; - for (j=0; j<88; j++) { - secp256k1_fe_sqr(&x176, &x176); + for (i = 0; i < 3; ++i) { + int tOff = i * 16; + secp256k1_fe_combine_1s(&t[tOff + 0]); + secp256k1_fe_combine_1s(&t[tOff + 8]); + secp256k1_fe_combine_2s(&t[tOff + 0]); } - secp256k1_fe_mul(&x176, &x176, &x88); - x220 = x176; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x220, &x220); - } - secp256k1_fe_mul(&x220, &x220, &x44); + /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ + secp256k1_fe_decode_matrix(&b0, &t[4]); + /* secp256k1_fe_decode_matrix(&c0, &t[8]); */ + secp256k1_fe_decode_matrix(&d0, &t[12]); - x223 = x220; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x223, &x223); - } - secp256k1_fe_mul(&x223, &x223, &x3); + secp256k1_fe_decode_matrix(&a1, &t[16]); + secp256k1_fe_decode_matrix(&b1, &t[20]); + secp256k1_fe_decode_matrix(&c1, &t[24]); + secp256k1_fe_decode_matrix(&d1, &t[28]); + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + secp256k1_fe_mul(&c1, &c1, &b0); + secp256k1_fe_mul(&d1, &d1, &d0); - /* The final result is then assembled using a sliding window over the blocks. */ + b0 = a1; secp256k1_fe_add(&b0, &b1); + d0 = c1; secp256k1_fe_add(&d0, &d1); - t1 = x223; - for (j=0; j<23; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x22); - for (j=0; j<5; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, a); - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x2); - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(r, a, &t1); + secp256k1_fe_decode_matrix(&a1, &t[32]); + secp256k1_fe_decode_matrix(&b1, &t[36]); + /* secp256k1_fe_decode_matrix(&c1, &t[40]); */ + /* secp256k1_fe_decode_matrix(&d1, &t[44]); */ + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + /* secp256k1_fe_mul(&c1, &c1, &b0); */ + /* secp256k1_fe_mul(&d1, &d1, &d0); */ + + b0 = a1; secp256k1_fe_add(&b0, &b1); + /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + + secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_cmov(&b0, &b1, sign); + secp256k1_fe_normalize_weak(&b0); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); #endif + + *r = b0; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_impl.h b/src/field_impl.h index c2b1cd2df2..ef15a0fc85 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -228,6 +228,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { } #endif +#if defined(SECP256K1_FE_INV_VAR_DEFAULT) static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #if defined(USE_FIELD_INV_BUILTIN) secp256k1_fe_inv(r, a); @@ -264,6 +265,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #error "Please select field inverse implementation" #endif } +#endif static void secp256k1_fe_inv_all_var(secp256k1_fe *r, const secp256k1_fe *a, size_t len) { secp256k1_fe u; diff --git a/src/scalar_4x64_impl.h 
b/src/scalar_4x64_impl.h index 43fc415eeb..f51580a19d 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1123,6 +1123,58 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, return eta; } +static int secp256k1_scalar_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { + + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t f = f0, g = g0, m, w, x, y, z; + int i = 62, limit, zeros; + + for (;;) { + + /* Use a sentinel bit to count zeros only up to i. */ + zeros = __builtin_ctzll(g | (UINT64_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); + + if ((int16_t)eta < 0) { + eta = -eta; + x = f; f = g; g = -x; + y = u; u = q; q = -y; + z = v; v = r; r = -z; + } + + /* Handle up to 3 divsteps at once, subject to eta and i. */ + limit = (eta + 1) > i ? i : (eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 7U; + + /* Note that f * f == 1 mod 8, for any f. */ + w = (-f * g) & m; + g += f * w; + q += u * w; + r += v * w; + } + + t[0] = (int64_t)u; + t[1] = (int64_t)v; + t[2] = (int64_t)q; + t[3] = (int64_t)r; + + return eta; +} + static void secp256k1_scalar_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); @@ -1169,7 +1221,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar * have a composite group order; fix it in exhaustive_tests.c). */ VERIFY_CHECK(*r != 0); } -#elif 1 +#else /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. @@ -1260,163 +1312,94 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { return !(a->d[0] & 1); } -#else - secp256k1_scalar *t; - int i; - /* First compute xN as x ^ (2^N - 1) for some values of N, - * and uM as x ^ M for some values of M. */ - secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; - secp256k1_scalar u2, u5, u9, u11, u13; - - secp256k1_scalar_sqr(&u2, x); - secp256k1_scalar_mul(&x2, &u2, x); - secp256k1_scalar_mul(&u5, &u2, &x2); - secp256k1_scalar_mul(&x3, &u5, &u2); - secp256k1_scalar_mul(&u9, &x3, &u2); - secp256k1_scalar_mul(&u11, &u9, &u2); - secp256k1_scalar_mul(&u13, &u11, &u2); - - secp256k1_scalar_sqr(&x6, &u13); - secp256k1_scalar_sqr(&x6, &x6); - secp256k1_scalar_mul(&x6, &x6, &u11); - - secp256k1_scalar_sqr(&x8, &x6); - secp256k1_scalar_sqr(&x8, &x8); - secp256k1_scalar_mul(&x8, &x8, &x2); - - secp256k1_scalar_sqr(&x14, &x8); - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(&x14, &x14); - } - secp256k1_scalar_mul(&x14, &x14, &x6); +#endif - secp256k1_scalar_sqr(&x28, &x14); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x28, &x28); - } - secp256k1_scalar_mul(&x28, &x28, &x14); +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { - secp256k1_scalar_sqr(&x56, &x28); - for (i = 0; i < 27; i++) { - secp256k1_scalar_sqr(&x56, &x56); - } - secp256k1_scalar_mul(&x56, &x56, &x28); + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
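+ * This variable-time variant matches secp256k1_scalar_inverse except that it uses
+ * _divsteps_62_var, whose running time depends on the value being inverted.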
+ */ - secp256k1_scalar_sqr(&x112, &x56); - for (i = 0; i < 55; i++) { - secp256k1_scalar_sqr(&x112, &x112); - } - secp256k1_scalar_mul(&x112, &x112, &x56); + int64_t t[12 * 4]; + int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + int64_t g[5]; + secp256k1_scalar b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif - secp256k1_scalar_sqr(&x126, &x112); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x126, &x126); - } - secp256k1_scalar_mul(&x126, &x126, &x14); + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + secp256k1_scalar_encode_62(g, &b0); - /* Then accumulate the final result (t starts at x126). */ - t = &x126; - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 10; i++) { /* 0000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 9; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x2); /* 11 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 10; i++) { /* 000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00000 */ - secp256k1_scalar_sqr(t, t); + /* The paper uses 'delta'; eta == -delta (a performance tweak). 
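+ * (Tracking eta rather than delta turns the paper's "delta > 0 and g odd" test into
+ * a simple sign check on eta, which is cheap to convert into a bit mask.)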
*/ + eta = -1; + + for (i = 0; i < 12; ++i) { + eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); + secp256k1_scalar_update_fg(f, g, &t[i * 4]); } - secp256k1_scalar_mul(t, t, x); /* 1 */ - for (i = 0; i < 8; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). + */ + + VERIFY_CHECK(g[0] == 0); + + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 16; + secp256k1_scalar_combine_1s(&t[tOff + 0]); + secp256k1_scalar_combine_1s(&t[tOff + 8]); + secp256k1_scalar_combine_2s(&t[tOff + 0]); } - secp256k1_scalar_mul(r, t, &x6); /* 111111 */ -} -SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { - return !(a->d[0] & 1); -} + /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ + secp256k1_scalar_decode_matrix(&b0, &t[4]); + /* secp256k1_scalar_decode_matrix(&c0, &t[8]); */ + secp256k1_scalar_decode_matrix(&d0, &t[12]); + + secp256k1_scalar_decode_matrix(&a1, &t[16]); + secp256k1_scalar_decode_matrix(&b1, &t[20]); + secp256k1_scalar_decode_matrix(&c1, &t[24]); + secp256k1_scalar_decode_matrix(&d1, &t[28]); + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + secp256k1_scalar_mul(&c1, &c1, &b0); + secp256k1_scalar_mul(&d1, &d1, &d0); + + secp256k1_scalar_add(&b0, &a1, &b1); + secp256k1_scalar_add(&d0, &c1, &d1); + + secp256k1_scalar_decode_matrix(&a1, &t[32]); + secp256k1_scalar_decode_matrix(&b1, &t[36]); + /* secp256k1_scalar_decode_matrix(&c1, &t[40]); */ + /* secp256k1_scalar_decode_matrix(&d1, &t[44]); */ + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + /* secp256k1_scalar_mul(&c1, &c1, &b0); */ + /* secp256k1_scalar_mul(&d1, &d1, &d0); */ + + secp256k1_scalar_add(&b0, &a1, &b1); + /* secp256k1_scalar_add(&d0, &c1, &d1); */ + + secp256k1_scalar_cond_negate(&b0, sign); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); #endif + *r = b0; +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index d0593939ed..02d03f4504 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -1029,6 +1029,58 @@ static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, return eta; } +static int secp256k1_scalar_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { + + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t f = f0, g = g0, m, w, x, y, z; + int i = 31, limit, zeros; + + for (;;) { + + /* Use a sentinel bit to count zeros only up to i. */ + zeros = __builtin_ctzl(g | (UINT32_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); + + if ((int16_t)eta < 0) { + eta = -eta; + x = f; f = g; g = -x; + y = u; u = q; q = -y; + z = v; v = r; r = -z; + } + + /* Handle up to 3 divsteps at once, subject to eta and i. 
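+ * (m keeps at most the 3 lowest bits allowed by 'limit'; w below is chosen so that
+ * g + f*w ends in that many zero bits, which the next pass of the loop shifts out.)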
*/ + limit = (eta + 1) > i ? i : (eta + 1); + m = (UINT32_MAX >> (32 - limit)) & 7U; + + /* Note that f * f == 1 mod 8, for any f. */ + w = (-f * g) & m; + g += f * w; + q += u * w; + r += v * w; + } + + t[0] = (int32_t)u; + t[1] = (int32_t)v; + t[2] = (int32_t)q; + t[3] = (int32_t)r; + + return eta; +} + static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, int32_t *t) { const int32_t M31 = (int32_t)(UINT32_MAX >> 1); @@ -1075,7 +1127,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar * have a composite group order; fix it in exhaustive_tests.c). */ VERIFY_CHECK(*r != 0); } -#elif 1 +#else /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. @@ -1170,163 +1222,98 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { return !(a->d[0] & 1); } -#else - secp256k1_scalar *t; - int i; - /* First compute xN as x ^ (2^N - 1) for some values of N, - * and uM as x ^ M for some values of M. */ - secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; - secp256k1_scalar u2, u5, u9, u11, u13; - - secp256k1_scalar_sqr(&u2, x); - secp256k1_scalar_mul(&x2, &u2, x); - secp256k1_scalar_mul(&u5, &u2, &x2); - secp256k1_scalar_mul(&x3, &u5, &u2); - secp256k1_scalar_mul(&u9, &x3, &u2); - secp256k1_scalar_mul(&u11, &u9, &u2); - secp256k1_scalar_mul(&u13, &u11, &u2); - - secp256k1_scalar_sqr(&x6, &u13); - secp256k1_scalar_sqr(&x6, &x6); - secp256k1_scalar_mul(&x6, &x6, &u11); - - secp256k1_scalar_sqr(&x8, &x6); - secp256k1_scalar_sqr(&x8, &x8); - secp256k1_scalar_mul(&x8, &x8, &x2); - - secp256k1_scalar_sqr(&x14, &x8); - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(&x14, &x14); - } - secp256k1_scalar_mul(&x14, &x14, &x6); +#endif - secp256k1_scalar_sqr(&x28, &x14); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x28, &x28); - } - secp256k1_scalar_mul(&x28, &x28, &x14); +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { - secp256k1_scalar_sqr(&x56, &x28); - for (i = 0; i < 27; i++) { - secp256k1_scalar_sqr(&x56, &x56); - } - secp256k1_scalar_mul(&x56, &x56, &x28); + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ - secp256k1_scalar_sqr(&x112, &x56); - for (i = 0; i < 55; i++) { - secp256k1_scalar_sqr(&x112, &x112); - } - secp256k1_scalar_mul(&x112, &x112, &x56); + int32_t t[24 * 4]; + int32_t f[9] = { 0x50364141L, 0x7FA4BD19L, 0x3D2280EEL, 0x5576E735L, 0x7FFFFFEBL, + 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t g[9]; + secp256k1_scalar b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif - secp256k1_scalar_sqr(&x126, &x112); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x126, &x126); - } - secp256k1_scalar_mul(&x126, &x126, &x14); + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + secp256k1_scalar_encode_31(g, &b0); - /* Then accumulate the final result (t starts at x126). 
*/ - t = &x126; - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 10; i++) { /* 0000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 9; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x2); /* 11 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 10; i++) { /* 000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00000 */ - secp256k1_scalar_sqr(t, t); + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; + + for (i = 0; i < 24; ++i) { + eta = secp256k1_scalar_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); + secp256k1_scalar_update_fg(f, g, &t[i * 4]); } - secp256k1_scalar_mul(t, t, x); /* 1 */ - for (i = 0; i < 8; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divsteps_31_var are combined to + * get the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_31_var introduce an extra factor of 2^31 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
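+ * (For a nonzero input f ends up as +/-1 in signed-31-bit-limb form: +1 has
+ * f[0] == 1 while -1 has f[0] == 0x7FFFFFFF, so bit 1 of f[0] records the sign
+ * used for the final conditional negation.)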
+ */ + + VERIFY_CHECK(g[0] == 0); + + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 32; + secp256k1_scalar_combine_1s(&t[tOff + 0]); + secp256k1_scalar_combine_1s(&t[tOff + 8]); + secp256k1_scalar_combine_1s(&t[tOff + 16]); + secp256k1_scalar_combine_1s(&t[tOff + 24]); + secp256k1_scalar_combine_2s(&t[tOff + 0]); + secp256k1_scalar_combine_2s(&t[tOff + 16]); + secp256k1_scalar_combine_4s(&t[tOff + 0]); } - secp256k1_scalar_mul(r, t, &x6); /* 111111 */ -} -SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { - return !(a->d[0] & 1); -} + /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ + secp256k1_scalar_decode_matrix(&b0, &t[8]); + /* secp256k1_scalar_decode_matrix(&c0, &t[16]); */ + secp256k1_scalar_decode_matrix(&d0, &t[24]); + + secp256k1_scalar_decode_matrix(&a1, &t[32]); + secp256k1_scalar_decode_matrix(&b1, &t[40]); + secp256k1_scalar_decode_matrix(&c1, &t[48]); + secp256k1_scalar_decode_matrix(&d1, &t[56]); + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + secp256k1_scalar_mul(&c1, &c1, &b0); + secp256k1_scalar_mul(&d1, &d1, &d0); + + secp256k1_scalar_add(&b0, &a1, &b1); + secp256k1_scalar_add(&d0, &c1, &d1); + + secp256k1_scalar_decode_matrix(&a1, &t[64]); + secp256k1_scalar_decode_matrix(&b1, &t[72]); + /* secp256k1_scalar_decode_matrix(&c1, &t[80]); */ + /* secp256k1_scalar_decode_matrix(&d1, &t[88]); */ + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + /* secp256k1_scalar_mul(&c1, &c1, &b0); */ + /* secp256k1_scalar_mul(&d1, &d1, &d0); */ + + secp256k1_scalar_add(&b0, &a1, &b1); + /* secp256k1_scalar_add(&d0, &c1, &d1); */ + + secp256k1_scalar_cond_negate(&b0, sign); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); #endif + *r = b0; +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_impl.h b/src/scalar_impl.h index a63b735491..69f31f6c51 100644 --- a/src/scalar_impl.h +++ b/src/scalar_impl.h @@ -233,6 +233,7 @@ SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) #endif #endif +#if defined(SECP256K1_SCALAR_INV_VAR_DEFAULT) static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(USE_SCALAR_INV_BUILTIN) secp256k1_scalar_inverse(r, x); @@ -253,6 +254,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc #error "Please select scalar inverse implementation" #endif } +#endif #ifdef USE_ENDOMORPHISM #if defined(EXHAUSTIVE_TEST_ORDER) diff --git a/src/scalar_low.h b/src/scalar_low.h index c31ca35376..53ea913203 100644 --- a/src/scalar_low.h +++ b/src/scalar_low.h @@ -15,5 +15,6 @@ typedef uint32_t secp256k1_scalar; #define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) (d0) #define SECP256K1_SCALAR_INV_DEFAULT +#define SECP256K1_SCALAR_INV_VAR_DEFAULT #endif /* SECP256K1_SCALAR_REPR_H */ From bd184711c8897e36ca7a24ee5f502dcc8c6ccabb Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Fri, 31 Jul 2020 22:04:11 +0700 Subject: [PATCH 10/34] Simplify type of 'eta' --- src/field_10x26_impl.h | 20 ++++++++++---------- src/field_5x52_impl.h | 20 ++++++++++---------- src/scalar_4x64_impl.h | 20 ++++++++++---------- src/scalar_8x32_impl.h | 20 ++++++++++---------- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index c502c772e6..8d954a5360 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1438,7 +1438,7 @@ static void 
secp256k1_fe_encode_31(int32_t *r, const secp256k1_fe *a) { r[8] = a9 >> 14; } -static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_fe_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t c1, c2, f = f0, g = g0, x, y, z; @@ -1450,7 +1450,7 @@ static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int3 VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); - c1 = -(g & (eta >> 15)); + c1 = -(g & (eta >> 31)); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1461,7 +1461,7 @@ static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int3 z = (v ^ r) & c1; v ^= z; r ^= z; r ^= c1; r -= c1; - eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + eta = (eta ^ c1) - c1 - 1; c2 = -(g & 1); @@ -1478,7 +1478,7 @@ static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int3 return eta; } -static int secp256k1_fe_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_fe_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; @@ -1504,7 +1504,7 @@ static int secp256k1_fe_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); - if ((int16_t)eta < 0) { + if ((int32_t)eta < 0) { eta = -eta; x = f; f = g; g = -x; y = u; u = q; q = -y; @@ -1512,7 +1512,7 @@ static int secp256k1_fe_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, } /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = (eta + 1) > i ? i : (eta + 1); + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT32_MAX >> (32 - limit)) & 7U; /* Note that f * f == 1 mod 8, for any f. */ @@ -1577,7 +1577,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { int32_t g[9]; secp256k1_fe b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint32_t eta; #ifdef VERIFY int zero_in; #endif @@ -1594,7 +1594,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). */ - eta = -1; + eta = -(uint32_t)1; for (i = 0; i < 24; ++i) { eta = secp256k1_fe_divsteps_31(eta, f[0], g[0], &t[i * 4]); @@ -1678,7 +1678,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { int32_t g[9]; secp256k1_fe b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint32_t eta; #ifdef VERIFY int zero_in; #endif @@ -1695,7 +1695,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). 
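For reference, the step being iterated is the divstep from the Bernstein-Yang paper; a minimal, non-constant-time sketch (illustrative names only, assuming arithmetic right shift of signed values) is:

    /* One divstep on (delta, f, g) with f odd, as defined in the paper:
     *   (delta, f, g) -> (1 - delta, g, (g - f)/2)         if delta > 0 and g is odd
     *   (delta, f, g) -> (1 + delta, f, (g + (g&1)*f)/2)   otherwise
     * The library stores eta == -delta, so the "delta > 0" test becomes a
     * sign-bit test (eta >> 31 here, eta >> 63 in the 64-bit code). */
    static void divstep_ref(int32_t *delta, int32_t *f, int32_t *g) {
        if (*delta > 0 && (*g & 1)) {
            int32_t tmp = *f;
            *delta = 1 - *delta;
            *f = *g;
            *g = (*g - tmp) >> 1;
        } else {
            *delta = 1 + *delta;
            *g = (*g + (*g & 1) * *f) >> 1;
        }
    }

Each _divsteps_31 call performs 31 of these branchlessly and records their net effect on (f, g) as the matrix written to t.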
*/ - eta = -1; + eta = -(uint32_t)1; for (i = 0; i < 24; ++i) { eta = secp256k1_fe_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 0cc7e80056..8a56aeac38 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -630,7 +630,7 @@ static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { r[4] = a4 >> 40; } -static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t c1, c2, f = f0, g = g0, x, y, z; @@ -642,7 +642,7 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); - c1 = -(g & (eta >> 15)); + c1 = -(g & (eta >> 63)); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -653,7 +653,7 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 z = (v ^ r) & c1; v ^= z; r ^= z; r ^= c1; r -= c1; - eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + eta = (eta ^ c1) - c1 - 1; c2 = -(g & 1); @@ -670,7 +670,7 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 return eta; } -static int secp256k1_fe_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t f = f0, g = g0, m, w, x, y, z; @@ -696,7 +696,7 @@ static int secp256k1_fe_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); - if ((int16_t)eta < 0) { + if ((int64_t)eta < 0) { eta = -eta; x = f; f = g; g = -x; y = u; u = q; q = -y; @@ -704,7 +704,7 @@ static int secp256k1_fe_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, } /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = (eta + 1) > i ? i : (eta + 1); + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT64_MAX >> (64 - limit)) & 7U; /* Note that f * f == 1 mod 8, for any f. */ @@ -769,7 +769,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { int64_t g[5]; secp256k1_fe b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint64_t eta; #ifdef VERIFY int zero_in; #endif @@ -786,7 +786,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). */ - eta = -1; + eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], &t[i * 4]); @@ -866,7 +866,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { int64_t g[5]; secp256k1_fe b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint64_t eta; #ifdef VERIFY int zero_in; #endif @@ -883,7 +883,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). 
*/ - eta = -1; + eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index f51580a19d..8ce50a9ad3 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1083,7 +1083,7 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { #endif } -static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t c1, c2, f = f0, g = g0, x, y, z; @@ -1095,7 +1095,7 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); - c1 = -(g & (eta >> 15)); + c1 = -(g & (eta >> 63)); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1106,7 +1106,7 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, z = (v ^ r) & c1; v ^= z; r ^= z; r ^= c1; r -= c1; - eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + eta = (eta ^ c1) - c1 - 1; c2 = -(g & 1); @@ -1123,7 +1123,7 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, return eta; } -static int secp256k1_scalar_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t f = f0, g = g0, m, w, x, y, z; @@ -1149,7 +1149,7 @@ static int secp256k1_scalar_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); - if ((int16_t)eta < 0) { + if ((int64_t)eta < 0) { eta = -eta; x = f; f = g; g = -x; y = u; u = q; q = -y; @@ -1157,7 +1157,7 @@ static int secp256k1_scalar_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t } /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = (eta + 1) > i ? i : (eta + 1); + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT64_MAX >> (64 - limit)) & 7U; /* Note that f * f == 1 mod 8, for any f. */ @@ -1233,7 +1233,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar int64_t g[5]; secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1243,7 +1243,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). */ - eta = -1; + eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], &t[i * 4]); @@ -1326,7 +1326,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int64_t g[5]; secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1336,7 +1336,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
*/ - eta = -1; + eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 02d03f4504..54fd10f385 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -989,7 +989,7 @@ static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { #endif } -static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_scalar_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t c1, c2, f = f0, g = g0, x, y, z; @@ -1001,7 +1001,7 @@ static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); - c1 = -(g & (eta >> 15)); + c1 = -(g & (eta >> 31)); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1012,7 +1012,7 @@ static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, z = (v ^ r) & c1; v ^= z; r ^= z; r ^= c1; r -= c1; - eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + eta = (eta ^ c1) - c1 - 1; c2 = -(g & 1); @@ -1029,7 +1029,7 @@ static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, return eta; } -static int secp256k1_scalar_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_scalar_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; @@ -1055,7 +1055,7 @@ static int secp256k1_scalar_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); - if ((int16_t)eta < 0) { + if ((int32_t)eta < 0) { eta = -eta; x = f; f = g; g = -x; y = u; u = q; q = -y; @@ -1063,7 +1063,7 @@ static int secp256k1_scalar_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t } /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = (eta + 1) > i ? i : (eta + 1); + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT32_MAX >> (32 - limit)) & 7U; /* Note that f * f == 1 mod 8, for any f. */ @@ -1139,7 +1139,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar int32_t g[9]; secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1149,7 +1149,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar_encode_31(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). */ - eta = -1; + eta = -(uint32_t)1; for (i = 0; i < 24; ++i) { eta = secp256k1_scalar_divsteps_31(eta, f[0], g[0], &t[i * 4]); @@ -1236,7 +1236,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int32_t g[9]; secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1246,7 +1246,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc secp256k1_scalar_encode_31(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
*/ - eta = -1; + eta = -(uint32_t)1; for (i = 0; i < 24; ++i) { eta = secp256k1_scalar_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); From b8c7390bc3a68cfda16e78a0914f47eb1a0bfe4f Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 8 Aug 2020 16:58:43 +0700 Subject: [PATCH 11/34] =?UTF-8?q?field=5F5x52:=20update=20B=C3=A9zout=20co?= =?UTF-8?q?efficients=20on-the-fly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/field_5x52_impl.h | 316 +++++++++++++++--------------------------- 1 file changed, 113 insertions(+), 203 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 8a56aeac38..c6e6dd0b0c 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -498,107 +498,29 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } -static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( - 0x0E90A100UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, - 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL -); +static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { -static void secp256k1_fe_mul_add_2(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { - - /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. - * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and - * writes the 252-bit signed result to [t[0],t[1],t[2],t[3]]. - */ - - int64_t z0, z1, z2, z3; - int128_t tt; - - tt = (int128_t)a0 * b0 - + (int128_t)c0 * d0; - z0 = (int64_t)tt; tt -= z0; tt >>= 64; - - tt += (int128_t)a0 * b1 - + (int128_t)a1 * b0 - + (int128_t)c0 * d1 - + (int128_t)c1 * d0; - z1 = (int64_t)tt; tt -= z1; tt >>= 64; - - tt += (int128_t)a1 * b1 - + (int128_t)c1 * d1; - z2 = (int64_t)tt; tt -= z2; tt >>= 64; - - z3 = (int64_t)tt; - - t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; -} - -static void secp256k1_fe_combine_1s(int64_t *t) { - - int64_t a = t[0], b = t[1], c = t[2], d = t[3], - e = t[4], f = t[5], g = t[6], h = t[7]; - int128_t I, J, K, L; - - I = (int128_t)e * a + (int128_t)f * c; - J = (int128_t)e * b + (int128_t)f * d; - K = (int128_t)g * a + (int128_t)h * c; - L = (int128_t)g * b + (int128_t)h * d; - - a = (int64_t)I; I -= a; I >>= 64; b = (int64_t)I; - c = (int64_t)J; J -= c; J >>= 64; d = (int64_t)J; - e = (int64_t)K; K -= e; K >>= 64; f = (int64_t)K; - g = (int64_t)L; L -= g; L >>= 64; h = (int64_t)L; + const uint64_t M52 = UINT64_MAX >> 12; + uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + uint64_t r0, r1, r2, r3, r4; - t[0] = a; t[1] = b; t[2] = c; t[3] = d; - t[4] = e; t[5] = f; t[6] = g; t[7] = h; -} + VERIFY_CHECK(a0 >> 62 == 0); + VERIFY_CHECK(a1 >> 62 == 0); + VERIFY_CHECK(a2 >> 62 == 0); + VERIFY_CHECK(a3 >> 62 == 0); -static void secp256k1_fe_combine_2s(int64_t *t) { - - int64_t a0 = t[ 0], a1 = t[ 1]; - int64_t b0 = t[ 2], b1 = t[ 3]; - int64_t c0 = t[ 4], c1 = t[ 5]; - int64_t d0 = t[ 6], d1 = t[ 7]; - int64_t e0 = t[ 8], e1 = t[ 9]; - int64_t f0 = t[10], f1 = t[11]; - int64_t g0 = t[12], g1 = t[13]; - int64_t h0 = t[14], h1 = t[15]; - - secp256k1_fe_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_fe_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_fe_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_fe_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); -} + /* Add a multiple of the field prime in case u4 is "negative". 
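An aid to reading the packing below (illustrative breakdown only): d is held as five signed 62-bit limbs, so its value is a0 + a1*2^62 + a2*2^124 + a3*2^186 + a4*2^248, and the assignments that follow re-slice that integer, after adding 8 times the prime to keep it non-negative, into the usual 52-bit field limbs:

    n[0] <- bits   0..51  :  a0
    n[1] <- bits  52..103 :  a0 >> 52 | a1 << 10
    n[2] <- bits 104..155 :  a1 >> 42 | a2 << 20
    n[3] <- bits 156..207 :  a2 >> 32 | a3 << 30
    n[4] <- bits 208..    :  a3 >> 22 | a4 << 40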
*/ + r0 = 0xFFFFEFFFFFC2FULL * 8; + r1 = 0xFFFFFFFFFFFFFULL * 8; + r2 = 0xFFFFFFFFFFFFFULL * 8; + r3 = 0xFFFFFFFFFFFFFULL * 8; + r4 = 0x0FFFFFFFFFFFFULL * 8; -static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { - - uint64_t u0, u1, u2, u3, u4; - uint64_t r0, r1, r2, r3, r4; - int128_t cc; - - cc = t[0]; - u0 = (uint64_t)cc; cc >>= 64; - cc += t[1]; - u1 = (uint64_t)cc; cc >>= 64; - cc += t[2]; - u2 = (uint64_t)cc; cc >>= 64; - cc += t[3]; - u3 = (uint64_t)cc; cc >>= 64; - u4 = (uint64_t)cc; - - VERIFY_CHECK(u4 == 0 || u4 == UINT64_MAX); - - /* Add twice the field prime in case u4 is non-zero (which represents -2^256). */ - r0 = 0xFFFFEFFFFFC2FULL * 2; - r1 = 0xFFFFFFFFFFFFFULL * 2; - r2 = 0xFFFFFFFFFFFFFULL * 2; - r3 = 0xFFFFFFFFFFFFFULL * 2; - r4 = 0x0FFFFFFFFFFFFULL * 2; - - r0 += u0 & 0xFFFFFFFFFFFFFULL; - r1 += (u0 >> 52 | u1 << 12) & 0xFFFFFFFFFFFFFULL; - r2 += (u1 >> 40 | u2 << 24) & 0xFFFFFFFFFFFFFULL; - r3 += (u2 >> 28 | u3 << 36) & 0xFFFFFFFFFFFFFULL; - r4 += (u3 >> 16 | u4 << 48); + r0 += a0 & M52; + r1 += (a0 >> 52 | a1 << 10) & M52; + r2 += (a1 >> 42 | a2 << 20) & M52; + r3 += (a2 >> 32 | a3 << 30) & M52; + r4 += (a3 >> 22 | a4 << 40); r->n[0] = r0; r->n[1] = r1; @@ -607,7 +529,7 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { r->n[4] = r4; #ifdef VERIFY - r->magnitude = 2; + r->magnitude = 7; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -722,6 +644,67 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t return eta; } +static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { + + /* I64 == -P^-1 mod 2^64 */ + const int64_t I64 = 0xD838091DD2253531LL; + const int64_t C64 = 0x1000003D1LL; + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + int128_t cd = 0, ce = 0; + int i; + + di = d[0]; + ei = e[0]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ + md = ((int128_t)I64 * (int64_t)cd) & M62; + me = ((int128_t)I64 * (int64_t)ce) & M62; + + /* P == 2^256 - C64; subtract C64 products here. */ + cd -= (int128_t)C64 * md; + ce -= (int128_t)C64 * me; + + VERIFY_CHECK(((int64_t)cd & M62) == 0); + VERIFY_CHECK(((int64_t)ce & M62) == 0); + + cd >>= 62; + ce >>= 62; + + for (i = 1; i < 4; ++i) { + + di = d[i]; + ei = e[i]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + d[i - 1] = (int64_t)cd & M62; cd >>= 62; + e[i - 1] = (int64_t)ce & M62; ce >>= 62; + } + + { + di = d[4]; + ei = e[4]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + /* In the final iteration, add the 2^256 products. */ + cd += (int128_t)md << 8; + ce += (int128_t)me << 8; + + d[3] = (int64_t)cd & M62; cd >>= 62; + e[3] = (int64_t)ce & M62; ce >>= 62; + } + + d[4] = (int64_t)cd; + e[4] = (int64_t)ce; +} + static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); @@ -763,21 +746,20 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
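A sketch of why d ends up holding the inverse (not text from the patch; it assumes the reading that _update_de applies to (d, e), modulo P, the same transition matrix that _update_fg applies to (f, g)): initially (f, g) = (P, a) and (d, e) = (0, 1), so

    f == d * a (mod P)   and   g == e * a (mod P).

Each iteration replaces both pairs by the same linear combination and divides by 2^62; for (d, e) that division is made exact by adding the md*P and me*P multiples, and since 2^62 is invertible modulo P both congruences are preserved. When g reaches 0 and f reaches +/- 1, the first congruence reads +/- 1 == d * a (mod P), i.e. d == +/- a^-1 (mod P). This is also why the 2^744 pre-scaling of earlier revisions is no longer needed: the division by 2^62 is carried out exactly at every step instead of being deferred to the end.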
*/ - int64_t t[12 * 4]; + int64_t t[4]; + int64_t d[5] = { 0, 0, 0, 0, 0 }; + int64_t e[5] = { 1, 0, 0, 0, 0 }; int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_fe b0, d0, a1, b1, c1, d1; + secp256k1_fe b0, b1; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in; #endif - /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input - * by 2^768, and then the output by 2^24. */ - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + b0 = *a; secp256k1_fe_normalize(&b0); secp256k1_fe_encode_62(g, &b0); @@ -789,61 +771,23 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { - eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], &t[i * 4]); - secp256k1_fe_update_fg(f, g, &t[i * 4]); + eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], t); + secp256k1_fe_update_de(d, e, t); + secp256k1_fe_update_fg(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK(g[0] == 0); + VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 16; - secp256k1_fe_combine_1s(&t[tOff + 0]); - secp256k1_fe_combine_1s(&t[tOff + 8]); - secp256k1_fe_combine_2s(&t[tOff + 0]); - } - - /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ - secp256k1_fe_decode_matrix(&b0, &t[4]); - /* secp256k1_fe_decode_matrix(&c0, &t[8]); */ - secp256k1_fe_decode_matrix(&d0, &t[12]); - - secp256k1_fe_decode_matrix(&a1, &t[16]); - secp256k1_fe_decode_matrix(&b1, &t[20]); - secp256k1_fe_decode_matrix(&c1, &t[24]); - secp256k1_fe_decode_matrix(&d1, &t[28]); - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - secp256k1_fe_mul(&c1, &c1, &b0); - secp256k1_fe_mul(&d1, &d1, &d0); - - b0 = a1; secp256k1_fe_add(&b0, &b1); - d0 = c1; secp256k1_fe_add(&d0, &d1); - - secp256k1_fe_decode_matrix(&a1, &t[32]); - secp256k1_fe_decode_matrix(&b1, &t[36]); - /* secp256k1_fe_decode_matrix(&c1, &t[40]); */ - /* secp256k1_fe_decode_matrix(&d1, &t[44]); */ + secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - /* secp256k1_fe_mul(&c1, &c1, &b0); */ - /* secp256k1_fe_mul(&d1, &d1, &d0); */ - - b0 = a1; secp256k1_fe_add(&b0, &b1); - /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ - - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 7); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -860,21 +804,20 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
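Two things distinguish the variable-time path that follows (an aside, not from the patch): the loop exits as soon as every limb of g is zero, and _divsteps_62_var batches runs of divsteps in which g is even. The batching rests on the fact quoted in its comment that f * f == 1 (mod 8) for any odd f: with m covering limit <= 3 low bits, w = (-f * g) & m gives f * w == -g (mod 2^limit), so g += f * w clears those low bits of g and the following shift retires up to three divsteps at once.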
*/ - int64_t t[12 * 4]; + int64_t t[4]; + int64_t d[5] = { 0, 0, 0, 0, 0 }; + int64_t e[5] = { 1, 0, 0, 0, 0 }; int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_fe b0, d0, a1, b1, c1, d1; + secp256k1_fe b0, b1; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in; #endif - /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input - * by 2^768, and then the output by 2^24. */ - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + b0 = *a; secp256k1_fe_normalize(&b0); secp256k1_fe_encode_62(g, &b0); @@ -886,61 +829,28 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { - eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); - secp256k1_fe_update_fg(f, g, &t[i * 4]); + eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); + secp256k1_fe_update_de(d, e, t); + secp256k1_fe_update_fg(f, g, t); + + if (g[0] == 0) { + if ((g[1] | g[2] | g[3] | g[4]) == 0) { + break; + } + } } - /* At this point sufficient iterations have been performed that g must have reached 0 - * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + VERIFY_CHECK(i < 12); - VERIFY_CHECK(g[0] == 0); + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. 
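To make the sign test below concrete (illustrative values; they assume a non-zero input so that f ends at +/- 1): in the signed 62-bit-limb representation produced by _update_fg_62 the bottom limbs are masked to 62 bits, so

    f == +1  ->  f[0] == 0x0000000000000001,  (f[0] >> 1) & 1 == 0
    f == -1  ->  f[0] == 0x3FFFFFFFFFFFFFFF,  (f[0] >> 1) & 1 == 1

and bit 1 of f[0] is exactly the sign that the conditional negation of the decoded d must undo.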
+ */ sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 16; - secp256k1_fe_combine_1s(&t[tOff + 0]); - secp256k1_fe_combine_1s(&t[tOff + 8]); - secp256k1_fe_combine_2s(&t[tOff + 0]); - } - - /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ - secp256k1_fe_decode_matrix(&b0, &t[4]); - /* secp256k1_fe_decode_matrix(&c0, &t[8]); */ - secp256k1_fe_decode_matrix(&d0, &t[12]); - - secp256k1_fe_decode_matrix(&a1, &t[16]); - secp256k1_fe_decode_matrix(&b1, &t[20]); - secp256k1_fe_decode_matrix(&c1, &t[24]); - secp256k1_fe_decode_matrix(&d1, &t[28]); - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - secp256k1_fe_mul(&c1, &c1, &b0); - secp256k1_fe_mul(&d1, &d1, &d0); - - b0 = a1; secp256k1_fe_add(&b0, &b1); - d0 = c1; secp256k1_fe_add(&d0, &d1); - - secp256k1_fe_decode_matrix(&a1, &t[32]); - secp256k1_fe_decode_matrix(&b1, &t[36]); - /* secp256k1_fe_decode_matrix(&c1, &t[40]); */ - /* secp256k1_fe_decode_matrix(&d1, &t[44]); */ - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - /* secp256k1_fe_mul(&c1, &c1, &b0); */ - /* secp256k1_fe_mul(&d1, &d1, &d0); */ - - b0 = a1; secp256k1_fe_add(&b0, &b1); - /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 7); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); From 64a4912c436bc44432443938af8c6dbfaf02f911 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 8 Aug 2020 19:38:06 +0700 Subject: [PATCH 12/34] =?UTF-8?q?field=5F10x26:=20update=20B=C3=A9zout=20c?= =?UTF-8?q?oefficients=20on-the-fly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/field_10x26_impl.h | 552 +++++++++++++---------------------------- src/field_5x52_impl.h | 48 ++-- 2 files changed, 191 insertions(+), 409 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 8d954a5360..30c5653c78 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1164,238 +1164,44 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } -static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( - 0x0E90A100UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, - 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL -); +static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { -static void secp256k1_fe_mul_add_2(int32_t a0, int32_t a1, int32_t b0, int32_t b1, int32_t c0, int32_t c1, int32_t d0, int32_t d1, int32_t *t) { - - /* Each [a0,a1], etc. pair is a ??-bit signed value e.g. a0 + a1 * 2^32. - * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and - * writes the ???-bit signed result to [t[0],t[1],t[2],t[3]]. 
- */ - - int32_t z0, z1, z2, z3; - int64_t tt; - - tt = (int64_t)a0 * b0 - + (int64_t)c0 * d0; - z0 = (int32_t)tt; tt -= z0; tt >>= 32; - - tt += (int64_t)a0 * b1 - + (int64_t)a1 * b0 - + (int64_t)c0 * d1 - + (int64_t)c1 * d0; - z1 = (int32_t)tt; tt -= z1; tt >>= 32; - - tt += (int64_t)a1 * b1 - + (int64_t)c1 * d1; - z2 = (int32_t)tt; tt -= z2; tt >>= 32; - - z3 = (int32_t)tt; - - t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; -} - -static void secp256k1_fe_mul_add_4(int32_t* tIn, int xPos, int yPos, int uPos, int vPos, int32_t *tOut, int zzPos) { - int32_t y0 = tIn[yPos + 0]; - int32_t y1 = tIn[yPos + 1]; - int32_t y2 = tIn[yPos + 2]; - int32_t y3 = tIn[yPos + 3]; - int32_t v0 = tIn[vPos + 0]; - int32_t v1 = tIn[vPos + 1]; - int32_t v2 = tIn[vPos + 2]; - int32_t v3 = tIn[vPos + 3]; - int32_t xVal, uVal; - int32_t z0, z1, z2, z3, z4, z5, z6, z7; - int64_t c; - - xVal = tIn[xPos]; - uVal = tIn[uPos]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0; - z0 = (int32_t)c; c -= z0; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1; - z1 = (int32_t)c; c -= z1; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3; - z3 = (int32_t)c; c -= z3; c >>= 32; - z4 = (int32_t)c; - - xVal = tIn[xPos + 1]; - uVal = tIn[uPos + 1]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z1; - z1 = (int32_t)c; c -= z1; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - z5 = (int32_t)c; - - xVal = tIn[xPos + 2]; - uVal = tIn[uPos + 2]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z5; - z5 = (int32_t)c; c -= z5; c >>= 32; - z6 = (int32_t)c; - - xVal = tIn[xPos + 3]; - uVal = tIn[uPos + 3]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z5; - z5 = (int32_t)c; c -= z5; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z6; - z6 = (int32_t)c; c -= z6; c >>= 32; - z7 = (int32_t)c; - - tOut[zzPos + 0] = z0; - tOut[zzPos + 1] = z1; - tOut[zzPos + 2] = z2; - tOut[zzPos + 3] = z3; - tOut[zzPos + 4] = z4; - tOut[zzPos + 5] = z5; - tOut[zzPos + 6] = z6; - tOut[zzPos + 7] = z7; -} - -static void secp256k1_fe_combine_1s(int32_t *t) { - - int32_t a = t[0], b = t[1], c = t[2], d = t[3], - e = t[4], f = t[5], g = t[6], h = t[7]; - int64_t I, J, K, L; - - I = (int64_t)e * a + (int64_t)f * c; - J = (int64_t)e * b + (int64_t)f * d; - K = (int64_t)g * a + (int64_t)h * c; - L = (int64_t)g * b + (int64_t)h * d; - - a = (int32_t)I; I -= a; I >>= 32; b = (int32_t)I; - c = (int32_t)J; J -= c; J >>= 32; d = (int32_t)J; - e = (int32_t)K; K -= e; K >>= 32; f = (int32_t)K; - g = (int32_t)L; L -= g; L >>= 32; h = (int32_t)L; - - t[0] = a; t[1] = b; t[2] = c; t[3] = d; - t[4] = e; t[5] = f; t[6] = g; t[7] = h; -} - -static void secp256k1_fe_combine_2s(int32_t *t) { - - int32_t a0 = t[ 0], a1 = t[ 1]; - int32_t b0 = t[ 2], b1 = t[ 3]; - 
int32_t c0 = t[ 4], c1 = t[ 5]; - int32_t d0 = t[ 6], d1 = t[ 7]; - int32_t e0 = t[ 8], e1 = t[ 9]; - int32_t f0 = t[10], f1 = t[11]; - int32_t g0 = t[12], g1 = t[13]; - int32_t h0 = t[14], h1 = t[15]; - - secp256k1_fe_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_fe_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_fe_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_fe_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); -} - -static void secp256k1_fe_combine_4s(int32_t *t) -{ - int32_t tmp[32]; - - int aPos = 0; - int bPos = 4; - int cPos = 8; - int dPos = 12; - int ePos = 16; - int fPos = 20; - int gPos = 24; - int hPos = 28; - - int IPos = 0; - int JPos = 8; - int KPos = 16; - int LPos = 24; - - secp256k1_fe_mul_add_4(t, ePos, aPos, fPos, cPos, tmp, IPos); - secp256k1_fe_mul_add_4(t, ePos, bPos, fPos, dPos, tmp, JPos); - secp256k1_fe_mul_add_4(t, gPos, aPos, hPos, cPos, tmp, KPos); - secp256k1_fe_mul_add_4(t, gPos, bPos, hPos, dPos, tmp, LPos); - - memcpy(t, tmp, 32 * sizeof(int32_t)); -} - -static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int32_t *t) { - - uint32_t u0, u1, u2, u3, u4, u5, u6, u7, u8; + const uint32_t M26 = UINT32_MAX >> 6; + uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], + a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; - int64_t cc; - - cc = t[0]; - u0 = (uint32_t)cc; cc >>= 32; - cc += t[1]; - u1 = (uint32_t)cc; cc >>= 32; - cc += t[2]; - u2 = (uint32_t)cc; cc >>= 32; - cc += t[3]; - u3 = (uint32_t)cc; cc >>= 32; - cc += t[4]; - u4 = (uint32_t)cc; cc >>= 32; - cc += t[5]; - u5 = (uint32_t)cc; cc >>= 32; - cc += t[6]; - u6 = (uint32_t)cc; cc >>= 32; - cc += t[7]; - u7 = (uint32_t)cc; cc >>= 32; - u8 = (uint32_t)cc; - - VERIFY_CHECK(u8 == 0 || u8 == UINT32_MAX); - - /* Add twice the field prime in case u8 is non-zero (which represents -2^256). */ - r0 = 0x3FFFC2FUL * 2; - r1 = 0x3FFFFBFUL * 2; - r2 = 0x3FFFFFFUL * 2; - r3 = 0x3FFFFFFUL * 2; - r4 = 0x3FFFFFFUL * 2; - r5 = 0x3FFFFFFUL * 2; - r6 = 0x3FFFFFFUL * 2; - r7 = 0x3FFFFFFUL * 2; - r8 = 0x3FFFFFFUL * 2; - r9 = 0x03FFFFFUL * 2; - - r0 += ( u0 ) & 0x3FFFFFFUL; - r1 += (u0 >> 26 | u1 << 6) & 0x3FFFFFFUL; - r2 += (u1 >> 20 | u2 << 12) & 0x3FFFFFFUL; - r3 += (u2 >> 14 | u3 << 18) & 0x3FFFFFFUL; - r4 += (u3 >> 8 | u4 << 24) & 0x3FFFFFFUL; - r5 += (u4 >> 2 ) & 0x3FFFFFFUL; - r6 += (u4 >> 28 | u5 << 4) & 0x3FFFFFFUL; - r7 += (u5 >> 22 | u6 << 10) & 0x3FFFFFFUL; - r8 += (u6 >> 16 | u7 << 16) & 0x3FFFFFFUL; - r9 += (u7 >> 10 | u8 << 22); + + VERIFY_CHECK(a0 >> 30 == 0); + VERIFY_CHECK(a1 >> 30 == 0); + VERIFY_CHECK(a2 >> 30 == 0); + VERIFY_CHECK(a3 >> 30 == 0); + VERIFY_CHECK(a4 >> 30 == 0); + VERIFY_CHECK(a5 >> 30 == 0); + VERIFY_CHECK(a6 >> 30 == 0); + VERIFY_CHECK(a7 >> 30 == 0); + + /* Add a multiple of the field prime in case u4 is "negative". 
*/ + r0 = 0x3FFFC2FUL * 8; + r1 = 0x3FFFFBFUL * 8; + r2 = 0x3FFFFFFUL * 8; + r3 = 0x3FFFFFFUL * 8; + r4 = 0x3FFFFFFUL * 8; + r5 = 0x3FFFFFFUL * 8; + r6 = 0x3FFFFFFUL * 8; + r7 = 0x3FFFFFFUL * 8; + r8 = 0x3FFFFFFUL * 8; + r9 = 0x03FFFFFUL * 8; + + r0 += a0 & M26; + r1 += (a0 >> 26 | a1 << 4) & M26; + r2 += (a1 >> 22 | a2 << 8) & M26; + r3 += (a2 >> 18 | a3 << 12) & M26; + r4 += (a3 >> 14 | a4 << 16) & M26; + r5 += (a4 >> 10 | a5 << 20) & M26; + r6 += (a5 >> 6 | a6 << 24) & M26; + r7 += (a6 >> 2 ) & M26; + r8 += (a6 >> 28 | a7 << 2) & M26; + r9 += (a7 >> 24 | a8 << 6); r->n[0] = r0; r->n[1] = r1; @@ -1409,15 +1215,15 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int32_t *t) { r->n[9] = r9; #ifdef VERIFY - r->magnitude = 2; + r->magnitude = 7; r->normalized = 0; secp256k1_fe_verify(r); #endif } -static void secp256k1_fe_encode_31(int32_t *r, const secp256k1_fe *a) { +static void secp256k1_fe_encode_30(int32_t *r, const secp256k1_fe *a) { - const uint32_t M31 = UINT32_MAX >> 1; + const uint32_t M30 = UINT32_MAX >> 2; const uint32_t *n = &a->n[0]; uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4], a5 = n[5], a6 = n[6], a7 = n[7], a8 = n[8], a9 = n[9]; @@ -1426,25 +1232,25 @@ static void secp256k1_fe_encode_31(int32_t *r, const secp256k1_fe *a) { VERIFY_CHECK(a->normalized); #endif - r[0] = (a0 | a1 << 26) & M31; - r[1] = (a1 >> 5 | a2 << 21) & M31; - r[2] = (a2 >> 10 | a3 << 16) & M31; - r[3] = (a3 >> 15 | a4 << 11) & M31; - r[4] = (a4 >> 20 | a5 << 6) & M31; - r[5] = (a5 >> 25 | a6 << 1 - | a7 << 27) & M31; - r[6] = (a7 >> 4 | a8 << 22) & M31; - r[7] = (a8 >> 9 | a9 << 17) & M31; - r[8] = a9 >> 14; + r[0] = (a0 | a1 << 26) & M30; + r[1] = (a1 >> 4 | a2 << 22) & M30; + r[2] = (a2 >> 8 | a3 << 18) & M30; + r[3] = (a3 >> 12 | a4 << 14) & M30; + r[4] = (a4 >> 16 | a5 << 10) & M30; + r[5] = (a5 >> 20 | a6 << 6) & M30; + r[6] = (a6 >> 24 | a7 << 2 + | a8 << 28) & M30; + r[7] = (a8 >> 2 | a9 << 24) & M30; + r[8] = a9 >> 6; } -static uint32_t secp256k1_fe_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t c1, c2, f = f0, g = g0, x, y, z; int i; - for (i = 0; i < 31; ++i) { + for (i = 0; i < 30; ++i) { VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == -f << i); @@ -1478,11 +1284,11 @@ static uint32_t secp256k1_fe_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, return eta; } -static uint32_t secp256k1_fe_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; - int i = 31, limit, zeros; + int i = 30, limit, zeros; for (;;) { @@ -1501,8 +1307,8 @@ static uint32_t secp256k1_fe_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); + VERIFY_CHECK((u * f0 + v * g0) == -f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (30 - i)); if ((int32_t)eta < 0) { eta = -eta; @@ -1530,9 +1336,74 @@ static uint32_t secp256k1_fe_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t return eta; } -static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { +static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, 
int32_t *t) { + + /* I30 == -P^-1 mod 2^30 */ + const int32_t I30 = 0x12253531L; + const int32_t C30 = 0x3D1L; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + int64_t cd = 0, ce = 0; + int i; + + di = d[0]; + ei = e[0]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + /* Calculate the multiples of P to add, to zero the 30 bottom bits. */ + md = ((int64_t)I30 * (int32_t)cd) & M30; + me = ((int64_t)I30 * (int32_t)ce) & M30; + + /* P == 2^256 - 2^32 - C30; subtract products of C30 here. */ + cd -= (int64_t)C30 * md; + ce -= (int64_t)C30 * me; + + VERIFY_CHECK(((int32_t)cd & M30) == 0); + VERIFY_CHECK(((int32_t)ce & M30) == 0); + + cd >>= 30; + ce >>= 30; + + /* Subtract products of 2^32. */ + cd -= (int64_t)md << 2; + ce -= (int64_t)me << 2; + + for (i = 1; i < 8; ++i) { + + di = d[i]; + ei = e[i]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + d[i - 1] = (int32_t)cd & M30; cd >>= 30; + e[i - 1] = (int32_t)ce & M30; ce >>= 30; + } + + /* Add products of 2^256. */ + cd += (int64_t)md << 16; + ce += (int64_t)me << 16; + + { + di = d[8]; + ei = e[8]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + d[7] = (int32_t)cd & M30; cd >>= 30; + e[7] = (int32_t)ce & M30; ce >>= 30; + } + + d[8] = (int32_t)cd; + e[8] = (int32_t)ce; +} + +static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { - const int32_t M31 = (int32_t)(UINT32_MAX >> 1); + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; int64_t cf = 0, cg = 0; int i; @@ -1543,11 +1414,11 @@ static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - VERIFY_CHECK(((int32_t)cf & M31) == 0); - VERIFY_CHECK(((int32_t)cg & M31) == 0); + VERIFY_CHECK(((int32_t)cf & M30) == 0); + VERIFY_CHECK(((int32_t)cg & M30) == 0); - cf >>= 31; - cg >>= 31; + cf >>= 30; + cg >>= 30; for (i = 1; i < 9; ++i) { @@ -1557,8 +1428,8 @@ static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - f[i - 1] = (int32_t)cf & M31; cf >>= 31; - g[i - 1] = (int32_t)cg & M31; cg >>= 31; + f[i - 1] = (int32_t)cf & M30; cf >>= 30; + g[i - 1] = (int32_t)cg & M30; cg >>= 30; } f[8] = (int32_t)cf; @@ -1568,26 +1439,24 @@ static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int32_t t[24 * 4]; - int32_t f[9] = { 0x7FFFFC2FL, 0x7FFFFFFDL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, - 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t t[4]; + int32_t d[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t e[9] = { 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t f[9] = { 0x3FFFFC2F, 0x3FFFFFFB, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, + 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; - secp256k1_fe b0, d0, a1, b1, c1, d1; + secp256k1_fe b0, b1; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in; #endif - /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input - * by 2^768, and then the output by 2^24. 
*/ - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + b0 = *a; secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_31(g, &b0); + secp256k1_fe_encode_30(g, &b0); #ifdef VERIFY zero_in = secp256k1_fe_is_zero(&b0); @@ -1596,66 +1465,23 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -(uint32_t)1; - for (i = 0; i < 24; ++i) { - eta = secp256k1_fe_divsteps_31(eta, f[0], g[0], &t[i * 4]); - secp256k1_fe_update_fg(f, g, &t[i * 4]); + for (i = 0; i < 25; ++i) { + eta = secp256k1_fe_divsteps_30(eta, f[0], g[0], t); + secp256k1_fe_update_de_30(d, e, t); + secp256k1_fe_update_fg_30(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_31 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_31 introduce an extra factor of 2^31 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK(g[0] == 0); + VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0); sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 32; - secp256k1_fe_combine_1s(&t[tOff + 0]); - secp256k1_fe_combine_1s(&t[tOff + 8]); - secp256k1_fe_combine_1s(&t[tOff + 16]); - secp256k1_fe_combine_1s(&t[tOff + 24]); - secp256k1_fe_combine_2s(&t[tOff + 0]); - secp256k1_fe_combine_2s(&t[tOff + 16]); - secp256k1_fe_combine_4s(&t[tOff + 0]); - } - - /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ - secp256k1_fe_decode_matrix(&b0, &t[8]); - /* secp256k1_fe_decode_matrix(&c0, &t[16]); */ - secp256k1_fe_decode_matrix(&d0, &t[24]); - - secp256k1_fe_decode_matrix(&a1, &t[32]); - secp256k1_fe_decode_matrix(&b1, &t[40]); - secp256k1_fe_decode_matrix(&c1, &t[48]); - secp256k1_fe_decode_matrix(&d1, &t[56]); - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - secp256k1_fe_mul(&c1, &c1, &b0); - secp256k1_fe_mul(&d1, &d1, &d0); - - b0 = a1; secp256k1_fe_add(&b0, &b1); - d0 = c1; secp256k1_fe_add(&d0, &d1); - - secp256k1_fe_decode_matrix(&a1, &t[64]); - secp256k1_fe_decode_matrix(&b1, &t[72]); - /* secp256k1_fe_decode_matrix(&c1, &t[80]); */ - /* secp256k1_fe_decode_matrix(&d1, &t[88]); */ + secp256k1_fe_decode_30(&b0, d); - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - /* secp256k1_fe_mul(&c1, &c1, &b0); */ - /* secp256k1_fe_mul(&d1, &d1, &d0); */ - - b0 = a1; secp256k1_fe_add(&b0, &b1); - /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ - - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 7); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -1669,26 +1495,24 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
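A note on the loop bounds here and in the constant-time version above (the 741 figure is the paper's bound for 256-bit inputs): Bernstein and Yang show that at most 741 divsteps are needed, so any fixed schedule covering at least that many is sufficient: 24 x 31 = 744 in the earlier 31-bit-limb revisions, 12 x 62 = 744 in the 64-bit code, and 25 x 30 = 750 after the switch to 30-bit limbs here. The variable-time path shares the same worst case but normally leaves the loop much earlier via the g == 0 check.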
*/ - int32_t t[24 * 4]; - int32_t f[9] = { 0x7FFFFC2FL, 0x7FFFFFFDL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, - 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t t[4]; + int32_t d[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t e[9] = { 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t f[9] = { 0x3FFFFC2F, 0x3FFFFFFB, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, + 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; - secp256k1_fe b0, d0, a1, b1, c1, d1; + secp256k1_fe b0, b1; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in; #endif - /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input - * by 2^768, and then the output by 2^24. */ - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + b0 = *a; secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_31(g, &b0); + secp256k1_fe_encode_30(g, &b0); #ifdef VERIFY zero_in = secp256k1_fe_is_zero(&b0); @@ -1697,66 +1521,28 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -(uint32_t)1; - for (i = 0; i < 24; ++i) { - eta = secp256k1_fe_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); - secp256k1_fe_update_fg(f, g, &t[i * 4]); - } + for (i = 0; i < 25; ++i) { + eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); + secp256k1_fe_update_de_30(d, e, t); + secp256k1_fe_update_fg_30(f, g, t); - /* At this point sufficient iterations have been performed that g must have reached 0 - * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_31_var are combined to - * get the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_31_var introduce an extra factor of 2^31 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ - - VERIFY_CHECK(g[0] == 0); - - sign = (f[0] >> 1) & 1; - - for (i = 0; i < 3; ++i) { - int tOff = i * 32; - secp256k1_fe_combine_1s(&t[tOff + 0]); - secp256k1_fe_combine_1s(&t[tOff + 8]); - secp256k1_fe_combine_1s(&t[tOff + 16]); - secp256k1_fe_combine_1s(&t[tOff + 24]); - secp256k1_fe_combine_2s(&t[tOff + 0]); - secp256k1_fe_combine_2s(&t[tOff + 16]); - secp256k1_fe_combine_4s(&t[tOff + 0]); + if (g[0] == 0) { + if ((g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0) { + break; + } + } } - /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ - secp256k1_fe_decode_matrix(&b0, &t[8]); - /* secp256k1_fe_decode_matrix(&c0, &t[16]); */ - secp256k1_fe_decode_matrix(&d0, &t[24]); - - secp256k1_fe_decode_matrix(&a1, &t[32]); - secp256k1_fe_decode_matrix(&b1, &t[40]); - secp256k1_fe_decode_matrix(&c1, &t[48]); - secp256k1_fe_decode_matrix(&d1, &t[56]); - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - secp256k1_fe_mul(&c1, &c1, &b0); - secp256k1_fe_mul(&d1, &d1, &d0); + VERIFY_CHECK(i < 25); - b0 = a1; secp256k1_fe_add(&b0, &b1); - d0 = c1; secp256k1_fe_add(&d0, &d1); + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. 
*/ - secp256k1_fe_decode_matrix(&a1, &t[64]); - secp256k1_fe_decode_matrix(&b1, &t[72]); - /* secp256k1_fe_decode_matrix(&c1, &t[80]); */ - /* secp256k1_fe_decode_matrix(&d1, &t[88]); */ - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - /* secp256k1_fe_mul(&c1, &c1, &b0); */ - /* secp256k1_fe_mul(&d1, &d1, &d0); */ + sign = (f[0] >> 1) & 1; - b0 = a1; secp256k1_fe_add(&b0, &b1); - /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + secp256k1_fe_decode_30(&b0, d); - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 7); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index c6e6dd0b0c..105eaa9500 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -644,11 +644,11 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t return eta; } -static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { +static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { - /* I64 == -P^-1 mod 2^64 */ - const int64_t I64 = 0xD838091DD2253531LL; - const int64_t C64 = 0x1000003D1LL; + /* I62 == -P^-1 mod 2^62 */ + const int64_t I62 = 0x1838091DD2253531LL; + const int64_t C62 = 0x1000003D1LL; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int128_t cd = 0, ce = 0; @@ -661,12 +661,12 @@ static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { ce -= (int128_t)q * di + (int128_t)r * ei; /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ - md = ((int128_t)I64 * (int64_t)cd) & M62; - me = ((int128_t)I64 * (int64_t)ce) & M62; + md = ((int128_t)I62 * (int64_t)cd) & M62; + me = ((int128_t)I62 * (int64_t)ce) & M62; - /* P == 2^256 - C64; subtract C64 products here. */ - cd -= (int128_t)C64 * md; - ce -= (int128_t)C64 * me; + /* P == 2^256 - C62; subtract products of C62 here. */ + cd -= (int128_t)C62 * md; + ce -= (int128_t)C62 * me; VERIFY_CHECK(((int64_t)cd & M62) == 0); VERIFY_CHECK(((int64_t)ce & M62) == 0); @@ -686,6 +686,10 @@ static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { e[i - 1] = (int64_t)ce & M62; ce >>= 62; } + /* Add products of 2^256. */ + cd += (int128_t)md << 8; + ce += (int128_t)me << 8; + { di = d[4]; ei = e[4]; @@ -693,10 +697,6 @@ static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { cd -= (int128_t)u * di + (int128_t)v * ei; ce -= (int128_t)q * di + (int128_t)r * ei; - /* In the final iteration, add the 2^256 products. */ - cd += (int128_t)md << 8; - ce += (int128_t)me << 8; - d[3] = (int64_t)cd & M62; cd >>= 62; e[3] = (int64_t)ce & M62; ce >>= 62; } @@ -705,7 +705,7 @@ static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { e[4] = (int64_t)ce; } -static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; @@ -743,8 +743,7 @@ static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ int64_t t[4]; int64_t d[5] = { 0, 0, 0, 0, 0 }; @@ -772,14 +771,13 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], t); - secp256k1_fe_update_de(d, e, t); - secp256k1_fe_update_fg(f, g, t); + secp256k1_fe_update_de_62(d, e, t); + secp256k1_fe_update_fg_62(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1, and d now contains +/- the modular inverse. - */ + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); @@ -801,8 +799,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ int64_t t[4]; int64_t d[5] = { 0, 0, 0, 0, 0 }; @@ -830,8 +827,8 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); - secp256k1_fe_update_de(d, e, t); - secp256k1_fe_update_fg(f, g, t); + secp256k1_fe_update_de_62(d, e, t); + secp256k1_fe_update_fg_62(f, g, t); if (g[0] == 0) { if ((g[1] | g[2] | g[3] | g[4]) == 0) { @@ -843,8 +840,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { VERIFY_CHECK(i < 12); /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of - * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. - */ + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ sign = (f[0] >> 1) & 1; From e5f2d29cbb7a98fe295f957240138378558dae32 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 8 Aug 2020 22:08:18 +0700 Subject: [PATCH 13/34] =?UTF-8?q?scalar=5F4x64:=20update=20B=C3=A9zout=20c?= =?UTF-8?q?oefficients=20on-the-fly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/field_10x26_impl.h | 4 +- src/field_5x52_impl.h | 4 +- src/scalar_4x64_impl.h | 300 ++++++++++++++--------------------------- src/scalar_8x32_impl.h | 4 - 4 files changed, 107 insertions(+), 205 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 30c5653c78..2a1eaed394 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1353,8 +1353,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { ce -= (int64_t)q * di + (int64_t)r * ei; /* Calculate the multiples of P to add, to zero the 30 bottom bits. */ - md = ((int64_t)I30 * (int32_t)cd) & M30; - me = ((int64_t)I30 * (int32_t)ce) & M30; + md = (I30 * (int32_t)cd) & M30; + me = (I30 * (int32_t)ce) & M30; /* P == 2^256 - 2^32 - C30; subtract products of C30 here. */ cd -= (int64_t)C30 * md; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 105eaa9500..44188699b2 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -661,8 +661,8 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { ce -= (int128_t)q * di + (int128_t)r * ei; /* Calculate the multiples of P to add, to zero the 62 bottom bits. 
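Why adding these multiples clears the low bits (a self-contained toy check follows; the modulus, variable names and main() are illustrative only, not library code): with I62 == -P^-1 mod 2^62 and md == (I62 * cd) mod 2^62, we have cd + md*P == cd - cd*P^-1*P == 0 (mod 2^62), so the subsequent shift by 62 is exact; the field code realises md*P as the two terms -C62*md and +md*2^256 because P == 2^256 - C62.

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint64_t M62 = UINT64_MAX >> 2;
        uint64_t p = 0x3FD25E8CD0364141ULL;   /* any odd modulus works for the identity */
        uint64_t cd = 0x0123456789ABCDEFULL;  /* arbitrary value to be made divisible by 2^62 */
        uint64_t inv = 1, i62, md;
        int i;
        /* Hensel/Newton lifting: after 6 doublings of precision, inv == p^-1 mod 2^64. */
        for (i = 0; i < 6; ++i) inv *= 2 - p * inv;
        i62 = (0 - inv) & M62;                /* -p^-1 mod 2^62 */
        md = (i62 * cd) & M62;
        printf("%d\n", (int)(((cd + md * p) & M62) == 0)); /* prints 1 */
        return 0;
    }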
*/ - md = ((int128_t)I62 * (int64_t)cd) & M62; - me = ((int128_t)I62 * (int64_t)ce) & M62; + md = (I62 * (int64_t)cd) & M62; + me = (I62 * (int64_t)ce) & M62; /* P == 2^256 - C62; subtract products of C62 here. */ cd -= (int128_t)C62 * md; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8ce50a9ad3..7b526cb90f 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -962,104 +962,40 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL ); -static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CONST( - 0x4E165355UL, 0x5D800C18UL, 0xEF116DB1UL, 0xB31347F1UL, - 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL -); - -static void secp256k1_scalar_mul_add_2(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { - - /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. - * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and - * writes the 252-bit signed result to [t[0],t[1],t[2],t[3]]. - */ - - int64_t z0, z1, z2, z3; - int128_t tt; - - tt = (int128_t)a0 * b0 - + (int128_t)c0 * d0; - z0 = (int64_t)tt; tt -= z0; tt >>= 64; - - tt += (int128_t)a0 * b1 - + (int128_t)a1 * b0 - + (int128_t)c0 * d1 - + (int128_t)c1 * d0; - z1 = (int64_t)tt; tt -= z1; tt >>= 64; - - tt += (int128_t)a1 * b1 - + (int128_t)c1 * d1; - z2 = (int64_t)tt; tt -= z2; tt >>= 64; - - z3 = (int64_t)tt; - - t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; -} - -static void secp256k1_scalar_combine_1s(int64_t *t) { - - int64_t a = t[0], b = t[1], c = t[2], d = t[3], - e = t[4], f = t[5], g = t[6], h = t[7]; - int128_t I, J, K, L; - - I = (int128_t)e * a + (int128_t)f * c; - J = (int128_t)e * b + (int128_t)f * d; - K = (int128_t)g * a + (int128_t)h * c; - L = (int128_t)g * b + (int128_t)h * d; - - a = (int64_t)I; I -= a; I >>= 64; b = (int64_t)I; - c = (int64_t)J; J -= c; J >>= 64; d = (int64_t)J; - e = (int64_t)K; K -= e; K >>= 64; f = (int64_t)K; - g = (int64_t)L; L -= g; L >>= 64; h = (int64_t)L; - - t[0] = a; t[1] = b; t[2] = c; t[3] = d; - t[4] = e; t[5] = f; t[6] = g; t[7] = h; -} - -static void secp256k1_scalar_combine_2s(int64_t *t) { - - int64_t a0 = t[ 0], a1 = t[ 1]; - int64_t b0 = t[ 2], b1 = t[ 3]; - int64_t c0 = t[ 4], c1 = t[ 5]; - int64_t d0 = t[ 6], d1 = t[ 7]; - int64_t e0 = t[ 8], e1 = t[ 9]; - int64_t f0 = t[10], f1 = t[11]; - int64_t g0 = t[12], g1 = t[13]; - int64_t h0 = t[14], h1 = t[15]; - - secp256k1_scalar_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_scalar_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_scalar_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_scalar_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); -} - -static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { +static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { + uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3; - int flag; + int64_t t; secp256k1_scalar u; - int128_t cc; - - cc = t[0]; - r0 = (uint64_t)cc; cc >>= 64; - cc += t[1]; - r1 = (uint64_t)cc; cc >>= 64; - cc += t[2]; - r2 = (uint64_t)cc; cc >>= 64; - cc += t[3]; - r3 = (uint64_t)cc; cc >>= 64; - VERIFY_CHECK(cc == 0 || cc == -1); + VERIFY_CHECK(a0 >> 62 == 0); + VERIFY_CHECK(a1 >> 62 == 0); + VERIFY_CHECK(a2 >> 62 == 0); + VERIFY_CHECK(a3 >> 62 == 0); - flag = (int)cc & 1; + r0 = a0 | a1 << 62; + r1 = a1 >> 2 | a2 << 60; + 
r2 = a2 >> 4 | a3 << 58; + r3 = a3 >> 6 | a4 << 56; r->d[0] = r0; r->d[1] = r1; r->d[2] = r2; r->d[3] = r3; + secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); + + t = (int64_t)a4 >> 8; + + VERIFY_CHECK(t == 1 || t == 0 || t == -1); + secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); - secp256k1_scalar_cmov(r, &u, flag); + secp256k1_scalar_cmov(r, &u, a4 >> 63); + + t += a4 >> 63; + + secp256k1_scalar_reduce(r, t); } static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { @@ -1077,10 +1013,6 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { r[2] = (a1 >> 60 | a2 << 4) & M62; r[3] = (a2 >> 58 | a3 << 6) & M62; r[4] = a3 >> 56; - -#ifdef VERIFY - VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); -#endif } static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { @@ -1175,7 +1107,56 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint return eta; } -static void secp256k1_scalar_update_fg(int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, int64_t *t) { + + /* I62 == -P^-1 mod 2^62 */ + const int64_t I62 = 0x0B0DFF665588B13FLL; + const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + int128_t cd = 0, ce = 0; + int i; + + di = d[0]; + ei = e[0]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ + md = (I62 * (int64_t)cd) & M62; + me = (I62 * (int64_t)ce) & M62; + + cd += (int128_t)P[0] * md; + ce += (int128_t)P[0] * me; + + VERIFY_CHECK(((int64_t)cd & M62) == 0); + VERIFY_CHECK(((int64_t)ce & M62) == 0); + + cd >>= 62; + ce >>= 62; + + for (i = 1; i < 5; ++i) { + + di = d[i]; + ei = e[i]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + cd += (int128_t)P[i] * md; + ce += (int128_t)P[i] * me; + + d[i - 1] = (int64_t)cd & M62; cd >>= 62; + e[i - 1] = (int64_t)ce & M62; ce >>= 62; + } + + d[4] = (int64_t)cd; + e[4] = (int64_t)ce; +} + +static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; @@ -1224,82 +1205,42 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar #else /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int64_t t[12 * 4]; + int64_t t[4]; + int64_t d[5] = { 0, 0, 0, 0, 0 }; + int64_t e[5] = { 1, 0, 0, 0, 0 }; int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b0, d0, a1, b1, c1, d1; + secp256k1_scalar b0; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + b0 = *x; secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
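 *
 * For reference, the paper's divstep is
 *   divstep(delta, f, g) = (1 - delta, g, (g - f) / 2)            if delta > 0 and g odd,
 *                          (1 + delta, f, (g + (g & 1) * f) / 2)  otherwise,
 * and 741 divsteps suffice for 256-bit inputs, so the 12 * 62 = 744 performed below
 * are enough.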
*/ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { - eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], &t[i * 4]); - secp256k1_scalar_update_fg(f, g, &t[i * 4]); + eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], t); + secp256k1_scalar_update_de_62(d, e, t); + secp256k1_scalar_update_fg_62(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK(g[0] == 0); + VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 16; - secp256k1_scalar_combine_1s(&t[tOff + 0]); - secp256k1_scalar_combine_1s(&t[tOff + 8]); - secp256k1_scalar_combine_2s(&t[tOff + 0]); - } - - /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ - secp256k1_scalar_decode_matrix(&b0, &t[4]); - /* secp256k1_scalar_decode_matrix(&c0, &t[8]); */ - secp256k1_scalar_decode_matrix(&d0, &t[12]); - - secp256k1_scalar_decode_matrix(&a1, &t[16]); - secp256k1_scalar_decode_matrix(&b1, &t[20]); - secp256k1_scalar_decode_matrix(&c1, &t[24]); - secp256k1_scalar_decode_matrix(&d1, &t[28]); - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - secp256k1_scalar_mul(&c1, &c1, &b0); - secp256k1_scalar_mul(&d1, &d1, &d0); - - secp256k1_scalar_add(&b0, &a1, &b1); - secp256k1_scalar_add(&d0, &c1, &d1); - - secp256k1_scalar_decode_matrix(&a1, &t[32]); - secp256k1_scalar_decode_matrix(&b1, &t[36]); - /* secp256k1_scalar_decode_matrix(&c1, &t[40]); */ - /* secp256k1_scalar_decode_matrix(&d1, &t[44]); */ - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - /* secp256k1_scalar_mul(&c1, &c1, &b0); */ - /* secp256k1_scalar_mul(&d1, &d1, &d0); */ - - secp256k1_scalar_add(&b0, &a1, &b1); - /* secp256k1_scalar_add(&d0, &c1, &d1); */ - + secp256k1_scalar_decode_62(&b0, d); secp256k1_scalar_cond_negate(&b0, sign); #ifdef VERIFY @@ -1317,82 +1258,47 @@ SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int64_t t[12 * 4]; + int64_t t[4]; + int64_t d[5] = { 0, 0, 0, 0, 0 }; + int64_t e[5] = { 1, 0, 0, 0, 0 }; int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b0, d0, a1, b1, c1, d1; + secp256k1_scalar b0; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + b0 = *x; secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
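 *
 * Unlike the constant-time routine above, this variant checks g and leaves the loop
 * as soon as it reaches zero; variable timing is acceptable here.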
*/ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { - eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); - secp256k1_scalar_update_fg(f, g, &t[i * 4]); + eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], t); + secp256k1_scalar_update_de_62(d, e, t); + secp256k1_scalar_update_fg_62(f, g, t); + + if (g[0] == 0) { + if ((g[1] | g[2] | g[3] | g[4]) == 0) { + break; + } + } } - /* At this point sufficient iterations have been performed that g must have reached 0 - * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + VERIFY_CHECK(i < 12); - VERIFY_CHECK(g[0] == 0); + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 16; - secp256k1_scalar_combine_1s(&t[tOff + 0]); - secp256k1_scalar_combine_1s(&t[tOff + 8]); - secp256k1_scalar_combine_2s(&t[tOff + 0]); - } - - /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ - secp256k1_scalar_decode_matrix(&b0, &t[4]); - /* secp256k1_scalar_decode_matrix(&c0, &t[8]); */ - secp256k1_scalar_decode_matrix(&d0, &t[12]); - - secp256k1_scalar_decode_matrix(&a1, &t[16]); - secp256k1_scalar_decode_matrix(&b1, &t[20]); - secp256k1_scalar_decode_matrix(&c1, &t[24]); - secp256k1_scalar_decode_matrix(&d1, &t[28]); - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - secp256k1_scalar_mul(&c1, &c1, &b0); - secp256k1_scalar_mul(&d1, &d1, &d0); - - secp256k1_scalar_add(&b0, &a1, &b1); - secp256k1_scalar_add(&d0, &c1, &d1); - - secp256k1_scalar_decode_matrix(&a1, &t[32]); - secp256k1_scalar_decode_matrix(&b1, &t[36]); - /* secp256k1_scalar_decode_matrix(&c1, &t[40]); */ - /* secp256k1_scalar_decode_matrix(&d1, &t[44]); */ - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - /* secp256k1_scalar_mul(&c1, &c1, &b0); */ - /* secp256k1_scalar_mul(&d1, &d1, &d0); */ - - secp256k1_scalar_add(&b0, &a1, &b1); - /* secp256k1_scalar_add(&d0, &c1, &d1); */ - + secp256k1_scalar_decode_62(&b0, d); secp256k1_scalar_cond_negate(&b0, sign); #ifdef VERIFY diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 54fd10f385..f47b19b287 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -983,10 +983,6 @@ static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { r[6] = (a5 >> 26 | a6 << 6) & M31; r[7] = (a6 >> 25 | a7 << 7) & M31; r[8] = a7 >> 24; - -#ifdef VERIFY - VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); -#endif } static uint32_t secp256k1_scalar_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { From 34bec400bffc1c12a8cd8cc9163bc3b1a9947f64 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 8 Aug 2020 23:40:40 +0700 Subject: [PATCH 14/34] =?UTF-8?q?scalar=5F8x32:=20update=20B=C3=A9zout=20c?= =?UTF-8?q?oefficients=20on-the-fly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/scalar_8x32_impl.h | 502 ++++++++++++----------------------------- 1 file changed, 146 insertions(+), 356 deletions(-) diff --git a/src/scalar_8x32_impl.h 
b/src/scalar_8x32_impl.h index f47b19b287..6cc23f90aa 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -738,217 +738,31 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL ); -static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CONST( - 0x4E165355UL, 0x5D800C18UL, 0xEF116DB1UL, 0xB31347F1UL, - 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL -); - -static void secp256k1_scalar_mul_add_2(int32_t a0, int32_t a1, int32_t b0, int32_t b1, int32_t c0, int32_t c1, int32_t d0, int32_t d1, int32_t *t) { - - /* Each [a0,a1], etc. pair is a ??-bit signed value e.g. a0 + a1 * 2^32. - * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and - * writes the ???-bit signed result to [t[0],t[1],t[2],t[3]]. - */ - - int32_t z0, z1, z2, z3; - int64_t tt; - - tt = (int64_t)a0 * b0 - + (int64_t)c0 * d0; - z0 = (int32_t)tt; tt -= z0; tt >>= 32; - - tt += (int64_t)a0 * b1 - + (int64_t)a1 * b0 - + (int64_t)c0 * d1 - + (int64_t)c1 * d0; - z1 = (int32_t)tt; tt -= z1; tt >>= 32; - - tt += (int64_t)a1 * b1 - + (int64_t)c1 * d1; - z2 = (int32_t)tt; tt -= z2; tt >>= 32; - - z3 = (int32_t)tt; - - t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; -} - -static void secp256k1_scalar_mul_add_4(int32_t* tIn, int xPos, int yPos, int uPos, int vPos, int32_t *tOut, int zzPos) { - int32_t y0 = tIn[yPos + 0]; - int32_t y1 = tIn[yPos + 1]; - int32_t y2 = tIn[yPos + 2]; - int32_t y3 = tIn[yPos + 3]; - int32_t v0 = tIn[vPos + 0]; - int32_t v1 = tIn[vPos + 1]; - int32_t v2 = tIn[vPos + 2]; - int32_t v3 = tIn[vPos + 3]; - int32_t xVal, uVal; - int32_t z0, z1, z2, z3, z4, z5, z6, z7; - int64_t c; - - xVal = tIn[xPos]; - uVal = tIn[uPos]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0; - z0 = (int32_t)c; c -= z0; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1; - z1 = (int32_t)c; c -= z1; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3; - z3 = (int32_t)c; c -= z3; c >>= 32; - z4 = (int32_t)c; - - xVal = tIn[xPos + 1]; - uVal = tIn[uPos + 1]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z1; - z1 = (int32_t)c; c -= z1; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - z5 = (int32_t)c; - - xVal = tIn[xPos + 2]; - uVal = tIn[uPos + 2]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z5; - z5 = (int32_t)c; c -= z5; c >>= 32; - z6 = (int32_t)c; - - xVal = tIn[xPos + 3]; - uVal = tIn[uPos + 3]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z5; - z5 = (int32_t)c; c -= z5; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z6; - z6 = (int32_t)c; c -= z6; c >>= 32; - z7 = (int32_t)c; - - tOut[zzPos + 0] = z0; - tOut[zzPos + 1] = z1; - tOut[zzPos + 2] = z2; - tOut[zzPos + 3] = z3; 
- tOut[zzPos + 4] = z4; - tOut[zzPos + 5] = z5; - tOut[zzPos + 6] = z6; - tOut[zzPos + 7] = z7; -} - -static void secp256k1_scalar_combine_1s(int32_t *t) { - - int32_t a = t[0], b = t[1], c = t[2], d = t[3], - e = t[4], f = t[5], g = t[6], h = t[7]; - int64_t I, J, K, L; - - I = (int64_t)e * a + (int64_t)f * c; - J = (int64_t)e * b + (int64_t)f * d; - K = (int64_t)g * a + (int64_t)h * c; - L = (int64_t)g * b + (int64_t)h * d; - - a = (int32_t)I; I -= a; I >>= 32; b = (int32_t)I; - c = (int32_t)J; J -= c; J >>= 32; d = (int32_t)J; - e = (int32_t)K; K -= e; K >>= 32; f = (int32_t)K; - g = (int32_t)L; L -= g; L >>= 32; h = (int32_t)L; - - t[0] = a; t[1] = b; t[2] = c; t[3] = d; - t[4] = e; t[5] = f; t[6] = g; t[7] = h; -} - -static void secp256k1_scalar_combine_2s(int32_t *t) { - - int32_t a0 = t[ 0], a1 = t[ 1]; - int32_t b0 = t[ 2], b1 = t[ 3]; - int32_t c0 = t[ 4], c1 = t[ 5]; - int32_t d0 = t[ 6], d1 = t[ 7]; - int32_t e0 = t[ 8], e1 = t[ 9]; - int32_t f0 = t[10], f1 = t[11]; - int32_t g0 = t[12], g1 = t[13]; - int32_t h0 = t[14], h1 = t[15]; - - secp256k1_scalar_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_scalar_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_scalar_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_scalar_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); -} - -static void secp256k1_scalar_combine_4s(int32_t *t) -{ - int32_t tmp[32]; - - int aPos = 0; - int bPos = 4; - int cPos = 8; - int dPos = 12; - int ePos = 16; - int fPos = 20; - int gPos = 24; - int hPos = 28; - - int IPos = 0; - int JPos = 8; - int KPos = 16; - int LPos = 24; - - secp256k1_scalar_mul_add_4(t, ePos, aPos, fPos, cPos, tmp, IPos); - secp256k1_scalar_mul_add_4(t, ePos, bPos, fPos, dPos, tmp, JPos); - secp256k1_scalar_mul_add_4(t, gPos, aPos, hPos, cPos, tmp, KPos); - secp256k1_scalar_mul_add_4(t, gPos, bPos, hPos, dPos, tmp, LPos); - - memcpy(t, tmp, 32 * sizeof(int32_t)); -} - -static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int32_t *t) { +static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { + uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], + a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7; - int flag; + int32_t t; secp256k1_scalar u; - int64_t cc; - - cc = t[0]; - r0 = (uint32_t)cc; cc >>= 32; - cc += t[1]; - r1 = (uint32_t)cc; cc >>= 32; - cc += t[2]; - r2 = (uint32_t)cc; cc >>= 32; - cc += t[3]; - r3 = (uint32_t)cc; cc >>= 32; - cc += t[4]; - r4 = (uint32_t)cc; cc >>= 32; - cc += t[5]; - r5 = (uint32_t)cc; cc >>= 32; - cc += t[6]; - r6 = (uint32_t)cc; cc >>= 32; - cc += t[7]; - r7 = (uint32_t)cc; cc >>= 32; - - VERIFY_CHECK(cc == 0 || cc == -1); - - flag = (int)cc & 1; + + VERIFY_CHECK(a0 >> 30 == 0); + VERIFY_CHECK(a1 >> 30 == 0); + VERIFY_CHECK(a2 >> 30 == 0); + VERIFY_CHECK(a3 >> 30 == 0); + VERIFY_CHECK(a4 >> 30 == 0); + VERIFY_CHECK(a5 >> 30 == 0); + VERIFY_CHECK(a6 >> 30 == 0); + VERIFY_CHECK(a7 >> 30 == 0); + + r0 = a0 | a1 << 30; + r1 = a1 >> 2 | a2 << 28; + r2 = a2 >> 4 | a3 << 26; + r3 = a3 >> 6 | a4 << 24; + r4 = a4 >> 8 | a5 << 22; + r5 = a5 >> 10 | a6 << 20; + r6 = a6 >> 12 | a7 << 18; + r7 = a7 >> 14 | a8 << 16; r->d[0] = r0; r->d[1] = r1; @@ -959,13 +773,23 @@ static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int32_t *t) { r->d[6] = r6; r->d[7] = r7; + secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); + + t = (int32_t)a8 >> 16; + + VERIFY_CHECK(t == 1 || t == 0 || t == -1); + secp256k1_scalar_add(&u, r, 
&SECP256K1_SCALAR_NEG_TWO_POW_256); - secp256k1_scalar_cmov(r, &u, flag); + secp256k1_scalar_cmov(r, &u, a8 >> 31); + + t += a8 >> 31; + + secp256k1_scalar_reduce(r, t); } -static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { +static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { - const uint32_t M31 = UINT32_MAX >> 1; + const uint32_t M30 = UINT32_MAX >> 2; const uint32_t *d = &a->d[0]; uint32_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3], a4 = d[4], a5 = d[5], a6 = d[6], a7 = d[7]; @@ -974,24 +798,24 @@ static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); #endif - r[0] = a0 & M31; - r[1] = (a0 >> 31 | a1 << 1) & M31; - r[2] = (a1 >> 30 | a2 << 2) & M31; - r[3] = (a2 >> 29 | a3 << 3) & M31; - r[4] = (a3 >> 28 | a4 << 4) & M31; - r[5] = (a4 >> 27 | a5 << 5) & M31; - r[6] = (a5 >> 26 | a6 << 6) & M31; - r[7] = (a6 >> 25 | a7 << 7) & M31; - r[8] = a7 >> 24; + r[0] = a0 & M30; + r[1] = (a0 >> 30 | a1 << 2) & M30; + r[2] = (a1 >> 28 | a2 << 4) & M30; + r[3] = (a2 >> 26 | a3 << 6) & M30; + r[4] = (a3 >> 24 | a4 << 8) & M30; + r[5] = (a4 >> 22 | a5 << 10) & M30; + r[6] = (a5 >> 20 | a6 << 12) & M30; + r[7] = (a6 >> 18 | a7 << 14) & M30; + r[8] = a7 >> 16; } -static uint32_t secp256k1_scalar_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t c1, c2, f = f0, g = g0, x, y, z; int i; - for (i = 0; i < 31; ++i) { + for (i = 0; i < 30; ++i) { VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == -f << i); @@ -1025,11 +849,11 @@ static uint32_t secp256k1_scalar_divsteps_31(uint32_t eta, uint32_t f0, uint32_t return eta; } -static uint32_t secp256k1_scalar_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; - int i = 31, limit, zeros; + int i = 30, limit, zeros; for (;;) { @@ -1048,8 +872,8 @@ static uint32_t secp256k1_scalar_divsteps_31_var(uint32_t eta, uint32_t f0, uint VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); + VERIFY_CHECK((u * f0 + v * g0) == -f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (30 - i)); if ((int32_t)eta < 0) { eta = -eta; @@ -1077,9 +901,58 @@ static uint32_t secp256k1_scalar_divsteps_31_var(uint32_t eta, uint32_t f0, uint return eta; } -static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, int32_t *t) { +static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { + + /* I30 == -P^-1 mod 2^30 */ + const int32_t I30 = 0x1588B13FL; + const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, + 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + int64_t cd = 0, ce = 0; + int i; + + di = d[0]; + ei = e[0]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + /* Calculate the multiples of P to add, to zero the 30 bottom bits. 
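 * As in the field code, these multiples make cd and ce divisible by 2^30; in effect
 * each call divides d and e by 2^30 modulo the group order, cancelling the 2^30
 * factor that the matching _divsteps_30 call introduces, which is why the input no
 * longer needs to be pre-scaled.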
*/ + md = (I30 * (int32_t)cd) & M30; + me = (I30 * (int32_t)ce) & M30; + + cd += (int64_t)P[0] * md; + ce += (int64_t)P[0] * me; + + VERIFY_CHECK(((int32_t)cd & M30) == 0); + VERIFY_CHECK(((int32_t)ce & M30) == 0); + + cd >>= 30; + ce >>= 30; + + for (i = 1; i < 9; ++i) { + + di = d[i]; + ei = e[i]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + cd += (int64_t)P[i] * md; + ce += (int64_t)P[i] * me; + + d[i - 1] = (int32_t)cd & M30; cd >>= 30; + e[i - 1] = (int32_t)ce & M30; ce >>= 30; + } + + d[8] = (int32_t)cd; + e[8] = (int32_t)ce; +} + +static void secp256k1_scalar_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { - const int32_t M31 = (int32_t)(UINT32_MAX >> 1); + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; int64_t cf = 0, cg = 0; int i; @@ -1090,11 +963,11 @@ static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - VERIFY_CHECK(((int32_t)cf & M31) == 0); - VERIFY_CHECK(((int32_t)cg & M31) == 0); + VERIFY_CHECK(((int32_t)cf & M30) == 0); + VERIFY_CHECK(((int32_t)cg & M30) == 0); - cf >>= 31; - cg >>= 31; + cf >>= 30; + cg >>= 30; for (i = 1; i < 9; ++i) { @@ -1104,8 +977,8 @@ static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - f[i - 1] = (int32_t)cf & M31; cf >>= 31; - g[i - 1] = (int32_t)cg & M31; cg >>= 31; + f[i - 1] = (int32_t)cf & M30; cf >>= 30; + g[i - 1] = (int32_t)cg & M30; cg >>= 30; } f[8] = (int32_t)cf; @@ -1126,86 +999,42 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar #else /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int32_t t[24 * 4]; - int32_t f[9] = { 0x50364141L, 0x7FA4BD19L, 0x3D2280EEL, 0x5576E735L, 0x7FFFFFEBL, - 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t t[4]; + int32_t d[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t e[9] = { 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, + 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b0, d0, a1, b1, c1, d1; + secp256k1_scalar b0; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); - secp256k1_scalar_encode_31(g, &b0); + b0 = *x; + secp256k1_scalar_encode_30(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -(uint32_t)1; - for (i = 0; i < 24; ++i) { - eta = secp256k1_scalar_divsteps_31(eta, f[0], g[0], &t[i * 4]); - secp256k1_scalar_update_fg(f, g, &t[i * 4]); + for (i = 0; i < 25; ++i) { + eta = secp256k1_scalar_divsteps_30(eta, f[0], g[0], t); + secp256k1_scalar_update_de_30(d, e, t); + secp256k1_scalar_update_fg_30(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_31 are combined to get - * the Bézout coefficients, and thus the modular inverse. 
The matrix outputs of - * _divsteps_31 introduce an extra factor of 2^31 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK(g[0] == 0); + VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0); sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 32; - secp256k1_scalar_combine_1s(&t[tOff + 0]); - secp256k1_scalar_combine_1s(&t[tOff + 8]); - secp256k1_scalar_combine_1s(&t[tOff + 16]); - secp256k1_scalar_combine_1s(&t[tOff + 24]); - secp256k1_scalar_combine_2s(&t[tOff + 0]); - secp256k1_scalar_combine_2s(&t[tOff + 16]); - secp256k1_scalar_combine_4s(&t[tOff + 0]); - } - - /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ - secp256k1_scalar_decode_matrix(&b0, &t[8]); - /* secp256k1_scalar_decode_matrix(&c0, &t[16]); */ - secp256k1_scalar_decode_matrix(&d0, &t[24]); - - secp256k1_scalar_decode_matrix(&a1, &t[32]); - secp256k1_scalar_decode_matrix(&b1, &t[40]); - secp256k1_scalar_decode_matrix(&c1, &t[48]); - secp256k1_scalar_decode_matrix(&d1, &t[56]); - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - secp256k1_scalar_mul(&c1, &c1, &b0); - secp256k1_scalar_mul(&d1, &d1, &d0); - - secp256k1_scalar_add(&b0, &a1, &b1); - secp256k1_scalar_add(&d0, &c1, &d1); - - secp256k1_scalar_decode_matrix(&a1, &t[64]); - secp256k1_scalar_decode_matrix(&b1, &t[72]); - /* secp256k1_scalar_decode_matrix(&c1, &t[80]); */ - /* secp256k1_scalar_decode_matrix(&d1, &t[88]); */ - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - /* secp256k1_scalar_mul(&c1, &c1, &b0); */ - /* secp256k1_scalar_mul(&d1, &d1, &d0); */ - - secp256k1_scalar_add(&b0, &a1, &b1); - /* secp256k1_scalar_add(&d0, &c1, &d1); */ - + secp256k1_scalar_decode_30(&b0, d); secp256k1_scalar_cond_negate(&b0, sign); #ifdef VERIFY @@ -1223,86 +1052,47 @@ SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int32_t t[24 * 4]; - int32_t f[9] = { 0x50364141L, 0x7FA4BD19L, 0x3D2280EEL, 0x5576E735L, 0x7FFFFFEBL, - 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t t[4]; + int32_t d[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t e[9] = { 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, + 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b0, d0, a1, b1, c1, d1; + secp256k1_scalar b0; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); - secp256k1_scalar_encode_31(g, &b0); + b0 = *x; + secp256k1_scalar_encode_30(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
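 *
 * With 30 divsteps per call, at most 25 calls (750 divsteps) are needed to cover the
 * 741-divstep bound for 256-bit inputs.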
*/ eta = -(uint32_t)1; - for (i = 0; i < 24; ++i) { - eta = secp256k1_scalar_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); - secp256k1_scalar_update_fg(f, g, &t[i * 4]); - } - - /* At this point sufficient iterations have been performed that g must have reached 0 - * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_31_var are combined to - * get the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_31_var introduce an extra factor of 2^31 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ - - VERIFY_CHECK(g[0] == 0); + for (i = 0; i < 25; ++i) { + eta = secp256k1_scalar_divsteps_30_var(eta, f[0], g[0], t); + secp256k1_scalar_update_de_30(d, e, t); + secp256k1_scalar_update_fg_30(f, g, t); - sign = (f[0] >> 1) & 1; - - for (i = 0; i < 3; ++i) { - int tOff = i * 32; - secp256k1_scalar_combine_1s(&t[tOff + 0]); - secp256k1_scalar_combine_1s(&t[tOff + 8]); - secp256k1_scalar_combine_1s(&t[tOff + 16]); - secp256k1_scalar_combine_1s(&t[tOff + 24]); - secp256k1_scalar_combine_2s(&t[tOff + 0]); - secp256k1_scalar_combine_2s(&t[tOff + 16]); - secp256k1_scalar_combine_4s(&t[tOff + 0]); + if (g[0] == 0) { + if ((g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0) { + break; + } + } } - /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ - secp256k1_scalar_decode_matrix(&b0, &t[8]); - /* secp256k1_scalar_decode_matrix(&c0, &t[16]); */ - secp256k1_scalar_decode_matrix(&d0, &t[24]); + VERIFY_CHECK(i < 25); - secp256k1_scalar_decode_matrix(&a1, &t[32]); - secp256k1_scalar_decode_matrix(&b1, &t[40]); - secp256k1_scalar_decode_matrix(&c1, &t[48]); - secp256k1_scalar_decode_matrix(&d1, &t[56]); + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - secp256k1_scalar_mul(&c1, &c1, &b0); - secp256k1_scalar_mul(&d1, &d1, &d0); - - secp256k1_scalar_add(&b0, &a1, &b1); - secp256k1_scalar_add(&d0, &c1, &d1); - - secp256k1_scalar_decode_matrix(&a1, &t[64]); - secp256k1_scalar_decode_matrix(&b1, &t[72]); - /* secp256k1_scalar_decode_matrix(&c1, &t[80]); */ - /* secp256k1_scalar_decode_matrix(&d1, &t[88]); */ - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - /* secp256k1_scalar_mul(&c1, &c1, &b0); */ - /* secp256k1_scalar_mul(&d1, &d1, &d0); */ - - secp256k1_scalar_add(&b0, &a1, &b1); - /* secp256k1_scalar_add(&d0, &c1, &d1); */ + sign = (f[0] >> 1) & 1; + secp256k1_scalar_decode_30(&b0, d); secp256k1_scalar_cond_negate(&b0, sign); #ifdef VERIFY From bfd7a0fbd6ef2d60d12ce63e2ce2921aa8424c1b Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 16:44:33 +0700 Subject: [PATCH 15/34] Alternate var-time divsteps code --- src/field_10x26_impl.h | 6 ++++++ src/field_5x52_impl.h | 6 ++++++ src/scalar_4x64_impl.h | 6 ++++++ src/scalar_8x32_impl.h | 6 ++++++ 4 files changed, 24 insertions(+) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 2a1eaed394..d174bd4cd4 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1317,6 +1317,7 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t z = v; v = r; r = -z; } +#if 1 /* Handle up to 3 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? 
i : ((int)eta + 1); m = (UINT32_MAX >> (32 - limit)) & 7U; @@ -1326,6 +1327,11 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g += f * w; q += u * w; r += v * w; +#else + g += f; + q += u; + r += v; +#endif } t[0] = (int32_t)u; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 44188699b2..cc1b95a208 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -625,6 +625,7 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t z = v; v = r; r = -z; } +#if 1 /* Handle up to 3 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT64_MAX >> (64 - limit)) & 7U; @@ -634,6 +635,11 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g += f * w; q += u * w; r += v * w; +#else + g += f; + q += u; + r += v; +#endif } t[0] = (int64_t)u; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 7b526cb90f..15f4460dfc 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1088,6 +1088,7 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint z = v; v = r; r = -z; } +#if 1 /* Handle up to 3 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT64_MAX >> (64 - limit)) & 7U; @@ -1097,6 +1098,11 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint g += f * w; q += u * w; r += v * w; +#else + g += f; + q += u; + r += v; +#endif } t[0] = (int64_t)u; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 6cc23f90aa..5d21cb1233 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -882,6 +882,7 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint z = v; v = r; r = -z; } +#if 1 /* Handle up to 3 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT32_MAX >> (32 - limit)) & 7U; @@ -891,6 +892,11 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint g += f * w; q += u * w; r += v * w; +#else + g += f; + q += u; + r += v; +#endif } t[0] = (int32_t)u; From f873c3b503bf8e7c360bf8b440488e07aeb3ec2a Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 17:40:44 +0700 Subject: [PATCH 16/34] Add comments regarding small inputs --- src/field_10x26_impl.h | 11 +++++++++-- src/field_5x52_impl.h | 11 +++++++++-- src/scalar_4x64_impl.h | 11 +++++++++-- src/scalar_8x32_impl.h | 11 +++++++++-- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index d174bd4cd4..fadc3dda75 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1468,7 +1468,11 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { zero_in = secp256k1_fe_is_zero(&b0); #endif - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { @@ -1524,7 +1528,10 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { zero_in = secp256k1_fe_is_zero(&b0); #endif - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). 
+ * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index cc1b95a208..c7a6af7c86 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -772,7 +772,11 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { zero_in = secp256k1_fe_is_zero(&b0); #endif - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { @@ -828,7 +832,10 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { zero_in = secp256k1_fe_is_zero(&b0); #endif - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 15f4460dfc..8272670426 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1229,7 +1229,11 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar b0 = *x; secp256k1_scalar_encode_62(g, &b0); - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { @@ -1282,7 +1286,10 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc b0 = *x; secp256k1_scalar_encode_62(g, &b0); - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 5d21cb1233..1db6d52f15 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -1023,7 +1023,11 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar b0 = *x; secp256k1_scalar_encode_30(g, &b0); - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { @@ -1076,7 +1080,10 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc b0 = *x; secp256k1_scalar_encode_30(g, &b0); - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). 
+ * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { From 17982d820ee780c5ca488f62661b83376dd5afb7 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 18:37:17 +0700 Subject: [PATCH 17/34] Avoid left shift of signed values --- src/field_10x26_impl.h | 8 ++++---- src/field_5x52_impl.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index fadc3dda75..6f7e9fa1a9 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1373,8 +1373,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { ce >>= 30; /* Subtract products of 2^32. */ - cd -= (int64_t)md << 2; - ce -= (int64_t)me << 2; + cd -= (int64_t)4 * md; + ce -= (int64_t)4 * me; for (i = 1; i < 8; ++i) { @@ -1389,8 +1389,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { } /* Add products of 2^256. */ - cd += (int64_t)md << 16; - ce += (int64_t)me << 16; + cd += (int64_t)65536 * md; + ce += (int64_t)65536 * me; { di = d[8]; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index c7a6af7c86..03e1f855dd 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -693,8 +693,8 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { } /* Add products of 2^256. */ - cd += (int128_t)md << 8; - ce += (int128_t)me << 8; + cd += (int128_t)256 * md; + ce += (int128_t)256 * me; { di = d[4]; From 06d568a7e6596aae9a6837bb2eee3108347cbc3a Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 20:40:46 +0700 Subject: [PATCH 18/34] Add alternative to __builtin_ctz intrinsics - lookup tables based on de Bruijn sequences --- src/field_10x26_impl.h | 14 +++++++++++++- src/field_5x52_impl.h | 17 ++++++++++++++++- src/scalar_4x64_impl.h | 17 ++++++++++++++++- src/scalar_8x32_impl.h | 14 +++++++++++++- 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 6f7e9fa1a9..6ffcfc8c7a 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1286,6 +1286,12 @@ static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +#if 1 + static const uint8_t debruijn[32] = { + 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26, + 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; +#endif + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -1293,7 +1299,13 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t for (;;) { /* Use a sentinel bit to count zeros only up to i. 
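 * ORing in UINT32_MAX << i forces a set bit at position i, so the count of trailing
 * zeros can never exceed i. In the table-based branch below, x & -x isolates the
 * lowest set bit; multiplying by the de Bruijn constant and keeping the top 5 bits
 * yields a distinct index for each bit position, e.g. x & -x == 8 gives
 * (8 * 0x04D7651F) >> 27 == 4 and debruijn[4] == 3.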
*/ - zeros = __builtin_ctzl(g | (UINT32_MAX << i)); + x = g | (UINT32_MAX << i); + +#if 0 + zeros = __builtin_ctzl(x); +#else + zeros = debruijn[((x & -x) * 0x04D7651F) >> 27]; +#endif g >>= zeros; u <<= zeros; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 03e1f855dd..f8914d9564 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -594,14 +594,29 @@ static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +#if 1 + static const uint8_t debruijn[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 + }; +#endif + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t f = f0, g = g0, m, w, x, y, z; int i = 62, limit, zeros; for (;;) { + x = g | (UINT64_MAX << i); + /* Use a sentinel bit to count zeros only up to i. */ - zeros = __builtin_ctzll(g | (UINT64_MAX << i)); +#if 0 + zeros = __builtin_ctzll(x); +#else + zeros = debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58]; +#endif g >>= zeros; u <<= zeros; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8272670426..b8be7ab166 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1057,14 +1057,29 @@ static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +#if 1 + static const uint8_t debruijn[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 + }; +#endif + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t f = f0, g = g0, m, w, x, y, z; int i = 62, limit, zeros; for (;;) { + x = g | (UINT64_MAX << i); + /* Use a sentinel bit to count zeros only up to i. */ - zeros = __builtin_ctzll(g | (UINT64_MAX << i)); +#if 0 + zeros = __builtin_ctzll(x); +#else + zeros = debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58]; +#endif g >>= zeros; u <<= zeros; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 1db6d52f15..a9265ac4e1 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -851,6 +851,12 @@ static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +#if 1 + static const uint8_t debruijn[32] = { + 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26, + 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; +#endif + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -858,7 +864,13 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint for (;;) { /* Use a sentinel bit to count zeros only up to i. 
*/ - zeros = __builtin_ctzl(g | (UINT32_MAX << i)); + x = g | (UINT32_MAX << i); + +#if 0 + zeros = __builtin_ctzl(x); +#else + zeros = debruijn[((x & -x) * 0x04D7651F) >> 27]; +#endif g >>= zeros; u <<= zeros; From 16509ca068410cb9c3bd45e7a17b5de996baea1a Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 21:54:40 +0700 Subject: [PATCH 19/34] Write primes in signed-digit form --- src/scalar_4x64_impl.h | 3 +-- src/scalar_8x32_impl.h | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index b8be7ab166..a0e42d41e1 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1132,8 +1132,7 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, int64_t *t) { /* I62 == -P^-1 mod 2^62 */ const int64_t I62 = 0x0B0DFF665588B13FLL; - const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, - 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int128_t cd = 0, ce = 0; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index a9265ac4e1..4b42294799 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -923,8 +923,9 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { /* I30 == -P^-1 mod 2^30 */ const int32_t I30 = 0x1588B13FL; - const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, - 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; + const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, + 0, 0, 0, 65536 }; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int64_t cd = 0, ce = 0; From 40c815ebe16cdd02d0a0f79122a94ab10969702c Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 22:28:17 +0700 Subject: [PATCH 20/34] Unify _update_de_ methods --- src/field_10x26_impl.h | 31 +++++++------------------------ src/field_5x52_impl.h | 26 +++++++------------------- src/scalar_8x32_impl.h | 1 - 3 files changed, 14 insertions(+), 44 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 6ffcfc8c7a..48551a89f1 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1358,7 +1358,7 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { /* I30 == -P^-1 mod 2^30 */ const int32_t I30 = 0x12253531L; - const int32_t C30 = 0x3D1L; + const int32_t P[9] = { -0x3D1L, -4L, 0, 0, 0, 0, 0, 0, 65536 }; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int64_t cd = 0, ce = 0; @@ -1374,9 +1374,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { md = (I30 * (int32_t)cd) & M30; me = (I30 * (int32_t)ce) & M30; - /* P == 2^256 - 2^32 - C30; subtract products of C30 here. */ - cd -= (int64_t)C30 * md; - ce -= (int64_t)C30 * me; + cd += (int64_t)P[0] * md; + ce += (int64_t)P[0] * me; VERIFY_CHECK(((int32_t)cd & M30) == 0); VERIFY_CHECK(((int32_t)ce & M30) == 0); @@ -1384,11 +1383,7 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { cd >>= 30; ce >>= 30; - /* Subtract products of 2^32. 
*/ - cd -= (int64_t)4 * md; - ce -= (int64_t)4 * me; - - for (i = 1; i < 8; ++i) { + for (i = 1; i < 9; ++i) { di = d[i]; ei = e[i]; @@ -1396,25 +1391,13 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { cd -= (int64_t)u * di + (int64_t)v * ei; ce -= (int64_t)q * di + (int64_t)r * ei; + cd += (int64_t)P[i] * md; + ce += (int64_t)P[i] * me; + d[i - 1] = (int32_t)cd & M30; cd >>= 30; e[i - 1] = (int32_t)ce & M30; ce >>= 30; } - /* Add products of 2^256. */ - cd += (int64_t)65536 * md; - ce += (int64_t)65536 * me; - - { - di = d[8]; - ei = e[8]; - - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; - - d[7] = (int32_t)cd & M30; cd >>= 30; - e[7] = (int32_t)ce & M30; ce >>= 30; - } - d[8] = (int32_t)cd; e[8] = (int32_t)ce; } diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index f8914d9564..4289208a75 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -669,7 +669,7 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { /* I62 == -P^-1 mod 2^62 */ const int64_t I62 = 0x1838091DD2253531LL; - const int64_t C62 = 0x1000003D1LL; + int64_t P[5] = { -0x1000003D1LL, 0, 0, 0, 256 }; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int128_t cd = 0, ce = 0; @@ -686,8 +686,8 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { me = (I62 * (int64_t)ce) & M62; /* P == 2^256 - C62; subtract products of C62 here. */ - cd -= (int128_t)C62 * md; - ce -= (int128_t)C62 * me; + cd += (int128_t)P[0] * md; + ce += (int128_t)P[0] * me; VERIFY_CHECK(((int64_t)cd & M62) == 0); VERIFY_CHECK(((int64_t)ce & M62) == 0); @@ -695,7 +695,7 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { cd >>= 62; ce >>= 62; - for (i = 1; i < 4; ++i) { + for (i = 1; i < 5; ++i) { di = d[i]; ei = e[i]; @@ -703,25 +703,13 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { cd -= (int128_t)u * di + (int128_t)v * ei; ce -= (int128_t)q * di + (int128_t)r * ei; + cd += (int128_t)P[i] * md; + ce += (int128_t)P[i] * me; + d[i - 1] = (int64_t)cd & M62; cd >>= 62; e[i - 1] = (int64_t)ce & M62; ce >>= 62; } - /* Add products of 2^256. 
*/ - cd += (int128_t)256 * md; - ce += (int128_t)256 * me; - - { - di = d[4]; - ei = e[4]; - - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; - - d[3] = (int64_t)cd & M62; cd >>= 62; - e[3] = (int64_t)ce & M62; ce >>= 62; - } - d[4] = (int64_t)cd; e[4] = (int64_t)ce; } diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 4b42294799..9b0cfcea4f 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -925,7 +925,6 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { const int32_t I30 = 0x1588B13FL; const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, 0, 0, 0, 65536 }; - const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int64_t cd = 0, ce = 0; From dc58f4f094120aa044453f4457aa0439e6b5717c Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 11 Aug 2020 02:31:00 +0700 Subject: [PATCH 21/34] Redo update_de methods --- src/field_10x26_impl.h | 59 +++++++++++++--------- src/field_5x52_impl.h | 109 +++++++++++++++++++++++------------------ src/scalar_4x64_impl.h | 109 ++++++++++++++++++++++++----------------- src/scalar_8x32_impl.h | 17 +++---- 4 files changed, 169 insertions(+), 125 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 48551a89f1..b47f79dc94 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1167,8 +1167,8 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { const uint32_t M26 = UINT32_MAX >> 6; - uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], - a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; + const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], + a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; VERIFY_CHECK(a0 >> 30 == 0); @@ -1225,8 +1225,8 @@ static void secp256k1_fe_encode_30(int32_t *r, const secp256k1_fe *a) { const uint32_t M30 = UINT32_MAX >> 2; const uint32_t *n = &a->n[0]; - uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4], - a5 = n[5], a6 = n[6], a7 = n[7], a8 = n[8], a9 = n[9]; + const uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4], + a5 = n[5], a6 = n[6], a7 = n[7], a8 = n[8], a9 = n[9]; #ifdef VERIFY VERIFY_CHECK(a->normalized); @@ -1354,13 +1354,15 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t return eta; } -static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { +static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) { + /* P == 2^256 - 2^32 - C30 */ + const int64_t C30 = 0x3D1L; /* I30 == -P^-1 mod 2^30 */ const int32_t I30 = 0x12253531L; - const int32_t P[9] = { -0x3D1L, -4L, 0, 0, 0, 0, 0, 0, 65536 }; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); - int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t di, ei, md, me; int64_t cd = 0, ce = 0; int i; @@ -1374,16 +1376,16 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { md = (I30 * (int32_t)cd) & M30; me = (I30 * (int32_t)ce) & M30; - cd += (int64_t)P[0] * md; - ce += (int64_t)P[0] * me; + cd -= (int64_t)C30 * md; + ce -= (int64_t)C30 * me; - VERIFY_CHECK(((int32_t)cd & M30) == 0); - VERIFY_CHECK(((int32_t)ce & M30) == 0); + VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30; + 
VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30; - cd >>= 30; - ce >>= 30; + cd -= (int64_t)4 * md; + ce -= (int64_t)4 * me; - for (i = 1; i < 9; ++i) { + for (i = 1; i < 8; ++i) { di = d[i]; ei = e[i]; @@ -1391,21 +1393,33 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { cd -= (int64_t)u * di + (int64_t)v * ei; ce -= (int64_t)q * di + (int64_t)r * ei; - cd += (int64_t)P[i] * md; - ce += (int64_t)P[i] * me; - d[i - 1] = (int32_t)cd & M30; cd >>= 30; e[i - 1] = (int32_t)ce & M30; ce >>= 30; } + { + di = d[8]; + ei = e[8]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + cd += (int64_t)65536 * md; + ce += (int64_t)65536 * me; + + d[7] = (int32_t)cd & M30; cd >>= 30; + e[7] = (int32_t)ce & M30; ce >>= 30; + } + d[8] = (int32_t)cd; e[8] = (int32_t)ce; } -static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { +static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, const int32_t *t) { const int32_t M30 = (int32_t)(UINT32_MAX >> 2); - int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t fi, gi; int64_t cf = 0, cg = 0; int i; @@ -1415,11 +1429,8 @@ static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - VERIFY_CHECK(((int32_t)cf & M30) == 0); - VERIFY_CHECK(((int32_t)cg & M30) == 0); - - cf >>= 30; - cg >>= 30; + VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; + VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; for (i = 1; i < 9; ++i) { diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 4289208a75..b15fcdcfd5 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -501,7 +501,7 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { const uint64_t M52 = UINT64_MAX >> 12; - uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3, r4; VERIFY_CHECK(a0 >> 62 == 0); @@ -539,7 +539,7 @@ static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { const uint64_t M62 = UINT64_MAX >> 2; const uint64_t *n = &a->n[0]; - uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4]; + const uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4]; #ifdef VERIFY VERIFY_CHECK(a->normalized); @@ -665,85 +665,100 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t return eta; } -static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { +static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) { + /* P == 2^256 - C62 */ + const int64_t C62 = 0x1000003D1LL; /* I62 == -P^-1 mod 2^62 */ const int64_t I62 = 0x1838091DD2253531LL; - int64_t P[5] = { -0x1000003D1LL, 0, 0, 0, 256 }; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); - int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4]; + const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; + int64_t md, me; int128_t cd = 0, ce = 0; - int i; - - di = d[0]; - ei = e[0]; - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; + cd -= (int128_t)u * d0 + (int128_t)v * e0; + ce -= (int128_t)q * d0 + 
(int128_t)r * e0; /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ md = (I62 * (int64_t)cd) & M62; me = (I62 * (int64_t)ce) & M62; - /* P == 2^256 - C62; subtract products of C62 here. */ - cd += (int128_t)P[0] * md; - ce += (int128_t)P[0] * me; + cd -= (int128_t)C62 * md; + ce -= (int128_t)C62 * me; - VERIFY_CHECK(((int64_t)cd & M62) == 0); - VERIFY_CHECK(((int64_t)ce & M62) == 0); + VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; + VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; - cd >>= 62; - ce >>= 62; + cd -= (int128_t)u * d1 + (int128_t)v * e1; + ce -= (int128_t)q * d1 + (int128_t)r * e1; - for (i = 1; i < 5; ++i) { + d[0] = (int64_t)cd & M62; cd >>= 62; + e[0] = (int64_t)ce & M62; ce >>= 62; - di = d[i]; - ei = e[i]; + cd -= (int128_t)u * d2 + (int128_t)v * e2; + ce -= (int128_t)q * d2 + (int128_t)r * e2; - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; + d[1] = (int64_t)cd & M62; cd >>= 62; + e[1] = (int64_t)ce & M62; ce >>= 62; - cd += (int128_t)P[i] * md; - ce += (int128_t)P[i] * me; + cd -= (int128_t)u * d3 + (int128_t)v * e3; + ce -= (int128_t)q * d3 + (int128_t)r * e3; - d[i - 1] = (int64_t)cd & M62; cd >>= 62; - e[i - 1] = (int64_t)ce & M62; ce >>= 62; - } + d[2] = (int64_t)cd & M62; cd >>= 62; + e[2] = (int64_t)ce & M62; ce >>= 62; + + cd -= (int128_t)u * d4 + (int128_t)v * e4; + ce -= (int128_t)q * d4 + (int128_t)r * e4; + + cd += (int128_t)256 * md; + ce += (int128_t)256 * me; + + d[3] = (int64_t)cd & M62; cd >>= 62; + e[3] = (int64_t)ce & M62; ce >>= 62; d[4] = (int64_t)cd; e[4] = (int64_t)ce; } -static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, const int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); - int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + const int64_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4]; + const int64_t g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4]; + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int128_t cf = 0, cg = 0; - int i; - fi = f[0]; - gi = g[0]; + cf -= (int128_t)u * f0 + (int128_t)v * g0; + cg -= (int128_t)q * f0 + (int128_t)r * g0; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; - VERIFY_CHECK(((int64_t)cf & M62) == 0); - VERIFY_CHECK(((int64_t)cg & M62) == 0); + cf -= (int128_t)u * f1 + (int128_t)v * g1; + cg -= (int128_t)q * f1 + (int128_t)r * g1; - cf >>= 62; - cg >>= 62; + f[0] = (int64_t)cf & M62; cf >>= 62; + g[0] = (int64_t)cg & M62; cg >>= 62; - for (i = 1; i < 5; ++i) { + cf -= (int128_t)u * f2 + (int128_t)v * g2; + cg -= (int128_t)q * f2 + (int128_t)r * g2; - fi = f[i]; - gi = g[i]; + f[1] = (int64_t)cf & M62; cf >>= 62; + g[1] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf -= (int128_t)u * f3 + (int128_t)v * g3; + cg -= (int128_t)q * f3 + (int128_t)r * g3; - f[i - 1] = (int64_t)cf & M62; cf >>= 62; - g[i - 1] = (int64_t)cg & M62; cg >>= 62; - } + f[2] = (int64_t)cf & M62; cf >>= 62; + g[2] = (int64_t)cg & M62; cg >>= 62; + + cf -= (int128_t)u * f4 + (int128_t)v * g4; + cg -= (int128_t)q * f4 + (int128_t)r * g4; + + f[3] = (int64_t)cf & M62; cf >>= 62; + g[3] = (int64_t)cg & M62; cg >>= 62; f[4] = (int64_t)cf; g[4] = (int64_t)cg; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 
a0e42d41e1..bbde844eae 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -964,7 +964,7 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { - uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3; int64_t t; secp256k1_scalar u; @@ -1002,7 +1002,7 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { const uint64_t M62 = UINT64_MAX >> 2; const uint64_t *d = &a->d[0]; - uint64_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3]; + const uint64_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3]; #ifdef VERIFY VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); @@ -1128,21 +1128,20 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint return eta; } -static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, int64_t *t) { +static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t *t) { /* I62 == -P^-1 mod 2^62 */ const int64_t I62 = 0x0B0DFF665588B13FLL; - const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); - int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; + const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4]; + const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; + int64_t md, me; int128_t cd = 0, ce = 0; - int i; - - di = d[0]; - ei = e[0]; - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; + cd -= (int128_t)u * d0 + (int128_t)v * e0; + ce -= (int128_t)q * d0 + (int128_t)r * e0; /* Calculate the multiples of P to add, to zero the 62 bottom bits. 
*/ md = (I62 * (int64_t)cd) & M62; @@ -1151,61 +1150,83 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, int64_t *t) { cd += (int128_t)P[0] * md; ce += (int128_t)P[0] * me; - VERIFY_CHECK(((int64_t)cd & M62) == 0); - VERIFY_CHECK(((int64_t)ce & M62) == 0); + VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; + VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; - cd >>= 62; - ce >>= 62; + cd -= (int128_t)u * d1 + (int128_t)v * e1; + ce -= (int128_t)q * d1 + (int128_t)r * e1; - for (i = 1; i < 5; ++i) { + cd += (int128_t)P[1] * md; + ce += (int128_t)P[1] * me; - di = d[i]; - ei = e[i]; + d[0] = (int64_t)cd & M62; cd >>= 62; + e[0] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; + cd -= (int128_t)u * d2 + (int128_t)v * e2; + ce -= (int128_t)q * d2 + (int128_t)r * e2; - cd += (int128_t)P[i] * md; - ce += (int128_t)P[i] * me; + cd += (int128_t)P[2] * md; + ce += (int128_t)P[2] * me; - d[i - 1] = (int64_t)cd & M62; cd >>= 62; - e[i - 1] = (int64_t)ce & M62; ce >>= 62; - } + d[1] = (int64_t)cd & M62; cd >>= 62; + e[1] = (int64_t)ce & M62; ce >>= 62; + + cd -= (int128_t)u * d3 + (int128_t)v * e3; + ce -= (int128_t)q * d3 + (int128_t)r * e3; + + d[2] = (int64_t)cd & M62; cd >>= 62; + e[2] = (int64_t)ce & M62; ce >>= 62; + + cd -= (int128_t)u * d4 + (int128_t)v * e4; + ce -= (int128_t)q * d4 + (int128_t)r * e4; + + cd += (int128_t)P[4] * md; + ce += (int128_t)P[4] * me; + + d[3] = (int64_t)cd & M62; cd >>= 62; + e[3] = (int64_t)ce & M62; ce >>= 62; d[4] = (int64_t)cd; e[4] = (int64_t)ce; } -static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, const int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); - int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + const int64_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4]; + const int64_t g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4]; + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int128_t cf = 0, cg = 0; - int i; - fi = f[0]; - gi = g[0]; + cf -= (int128_t)u * f0 + (int128_t)v * g0; + cg -= (int128_t)q * f0 + (int128_t)r * g0; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; - VERIFY_CHECK(((int64_t)cf & M62) == 0); - VERIFY_CHECK(((int64_t)cg & M62) == 0); + cf -= (int128_t)u * f1 + (int128_t)v * g1; + cg -= (int128_t)q * f1 + (int128_t)r * g1; - cf >>= 62; - cg >>= 62; + f[0] = (int64_t)cf & M62; cf >>= 62; + g[0] = (int64_t)cg & M62; cg >>= 62; - for (i = 1; i < 5; ++i) { + cf -= (int128_t)u * f2 + (int128_t)v * g2; + cg -= (int128_t)q * f2 + (int128_t)r * g2; - fi = f[i]; - gi = g[i]; + f[1] = (int64_t)cf & M62; cf >>= 62; + g[1] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf -= (int128_t)u * f3 + (int128_t)v * g3; + cg -= (int128_t)q * f3 + (int128_t)r * g3; - f[i - 1] = (int64_t)cf & M62; cf >>= 62; - g[i - 1] = (int64_t)cg & M62; cg >>= 62; - } + f[2] = (int64_t)cf & M62; cf >>= 62; + g[2] = (int64_t)cg & M62; cg >>= 62; + + cf -= (int128_t)u * f4 + (int128_t)v * g4; + cg -= (int128_t)q * f4 + (int128_t)r * g4; + + f[3] = (int64_t)cf & M62; cf >>= 62; + g[3] = (int64_t)cg & M62; cg >>= 62; f[4] = (int64_t)cf; g[4] = (int64_t)cg; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 
9b0cfcea4f..6b422a6334 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -740,8 +740,8 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { - uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], - a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; + const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], + a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7; int32_t t; secp256k1_scalar u; @@ -791,8 +791,8 @@ static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { const uint32_t M30 = UINT32_MAX >> 2; const uint32_t *d = &a->d[0]; - uint32_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3], - a4 = d[4], a5 = d[5], a6 = d[6], a7 = d[7]; + const uint32_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3], + a4 = d[4], a5 = d[5], a6 = d[6], a7 = d[7]; #ifdef VERIFY VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); @@ -919,7 +919,7 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint return eta; } -static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { +static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t *t) { /* I30 == -P^-1 mod 2^30 */ const int32_t I30 = 0x1588B13FL; @@ -943,11 +943,8 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { cd += (int64_t)P[0] * md; ce += (int64_t)P[0] * me; - VERIFY_CHECK(((int32_t)cd & M30) == 0); - VERIFY_CHECK(((int32_t)ce & M30) == 0); - - cd >>= 30; - ce >>= 30; + VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30; + VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30; for (i = 1; i < 9; ++i) { From 132c76dc3a7c2f6553c89e684977b6e706a2846a Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 12 Aug 2020 02:27:26 +0700 Subject: [PATCH 22/34] Faster 64bit _inv_var, why not? --- src/field_5x52_impl.h | 25 +++++++++++++------------ src/scalar_4x64_impl.h | 25 +++++++++++++------------ 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index b15fcdcfd5..1513768d07 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -638,23 +638,24 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t x = f; f = g; g = -x; y = u; u = q; q = -y; z = v; v = r; r = -z; - } -#if 1 - /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = ((int)eta + 1) > i ? i : ((int)eta + 1); - m = (UINT64_MAX >> (64 - limit)) & 7U; + /* Handle up to 6 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 63U; + + w = (f * g * (f * f - 2)) & m; + } else { + /* Handle up to 4 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 15U; + + w = f + (((f + 1) & 4) << 1); + w = (-w * g) & m; + } - /* Note that f * f == 1 mod 8, for any f. 
*/ - w = (-f * g) & m; g += f * w; q += u * w; r += v * w; -#else - g += f; - q += u; - r += v; -#endif } t[0] = (int64_t)u; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index bbde844eae..4679a45a88 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1101,23 +1101,24 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint x = f; f = g; g = -x; y = u; u = q; q = -y; z = v; v = r; r = -z; - } -#if 1 - /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = ((int)eta + 1) > i ? i : ((int)eta + 1); - m = (UINT64_MAX >> (64 - limit)) & 7U; + /* Handle up to 6 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 63U; + + w = (f * g * (f * f - 2)) & m; + } else { + /* Handle up to 4 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 15U; + + w = f + (((f + 1) & 4) << 1); + w = (-w * g) & m; + } - /* Note that f * f == 1 mod 8, for any f. */ - w = (-f * g) & m; g += f * w; q += u * w; r += v * w; -#else - g += f; - q += u; - r += v; -#endif } t[0] = (int64_t)u; From 2f6dfa21464b63f9097ab07199aee3dea674d214 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 12 Aug 2020 17:32:22 +0700 Subject: [PATCH 23/34] Get better control over the range of d, e --- src/field_10x26_impl.h | 39 +++++++++++++++++++++------------------ src/field_5x52_impl.h | 27 +++++++++++++++------------ src/scalar_4x64_impl.h | 18 ++++++------------ src/scalar_8x32_impl.h | 18 ++++++------------ 4 files changed, 48 insertions(+), 54 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index b47f79dc94..9bf05b85a2 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1171,6 +1171,7 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; + /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 30 == 0); VERIFY_CHECK(a1 >> 30 == 0); VERIFY_CHECK(a2 >> 30 == 0); @@ -1179,18 +1180,19 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); - - /* Add a multiple of the field prime in case u4 is "negative". */ - r0 = 0x3FFFC2FUL * 8; - r1 = 0x3FFFFBFUL * 8; - r2 = 0x3FFFFFFUL * 8; - r3 = 0x3FFFFFFUL * 8; - r4 = 0x3FFFFFFUL * 8; - r5 = 0x3FFFFFFUL * 8; - r6 = 0x3FFFFFFUL * 8; - r7 = 0x3FFFFFFUL * 8; - r8 = 0x3FFFFFFUL * 8; - r9 = 0x03FFFFFUL * 8; + VERIFY_CHECK((int32_t)a8 >> 16 == 0 || (int32_t)a8 >> 16 == -(int32_t)1); + + /* Add a multiple of the field prime in case a8 is "negative". */ + r0 = 0x3FFFC2FUL * 2; + r1 = 0x3FFFFBFUL * 2; + r2 = 0x3FFFFFFUL * 2; + r3 = 0x3FFFFFFUL * 2; + r4 = 0x3FFFFFFUL * 2; + r5 = 0x3FFFFFFUL * 2; + r6 = 0x3FFFFFFUL * 2; + r7 = 0x3FFFFFFUL * 2; + r8 = 0x3FFFFFFUL * 2; + r9 = 0x03FFFFFUL * 2; r0 += a0 & M26; r1 += (a0 >> 26 | a1 << 4) & M26; @@ -1215,7 +1217,7 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { r->n[9] = r9; #ifdef VERIFY - r->magnitude = 7; + r->magnitude = 2; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -1372,9 +1374,10 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) cd -= (int64_t)u * di + (int64_t)v * ei; ce -= (int64_t)q * di + (int64_t)r * ei; - /* Calculate the multiples of P to add, to zero the 30 bottom bits. 
*/ - md = (I30 * (int32_t)cd) & M30; - me = (I30 * (int32_t)ce) & M30; + /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me + * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */ + md = (I30 * 4 * (int32_t)cd) >> 2; + me = (I30 * 4 * (int32_t)ce) >> 2; cd -= (int64_t)C30 * md; ce -= (int64_t)C30 * me; @@ -1497,7 +1500,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_30(&b0, d); - secp256k1_fe_negate(&b1, &b0, 7); + secp256k1_fe_negate(&b1, &b0, 2); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -1561,7 +1564,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_30(&b0, d); - secp256k1_fe_negate(&b1, &b0, 7); + secp256k1_fe_negate(&b1, &b0, 2); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 1513768d07..219c281405 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -504,17 +504,19 @@ static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3, r4; + /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); + VERIFY_CHECK((int64_t)a4 >> 8 == 0 || (int64_t)a4 >> 8 == -(int64_t)1); - /* Add a multiple of the field prime in case u4 is "negative". */ - r0 = 0xFFFFEFFFFFC2FULL * 8; - r1 = 0xFFFFFFFFFFFFFULL * 8; - r2 = 0xFFFFFFFFFFFFFULL * 8; - r3 = 0xFFFFFFFFFFFFFULL * 8; - r4 = 0x0FFFFFFFFFFFFULL * 8; + /* Add a multiple of the field prime in case a4 is "negative". */ + r0 = 0xFFFFEFFFFFC2FULL * 2; + r1 = 0xFFFFFFFFFFFFFULL * 2; + r2 = 0xFFFFFFFFFFFFFULL * 2; + r3 = 0xFFFFFFFFFFFFFULL * 2; + r4 = 0x0FFFFFFFFFFFFULL * 2; r0 += a0 & M52; r1 += (a0 >> 52 | a1 << 10) & M52; @@ -529,7 +531,7 @@ static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { r->n[4] = r4; #ifdef VERIFY - r->magnitude = 7; + r->magnitude = 2; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -682,9 +684,10 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) cd -= (int128_t)u * d0 + (int128_t)v * e0; ce -= (int128_t)q * d0 + (int128_t)r * e0; - /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ - md = (I62 * (int64_t)cd) & M62; - me = (I62 * (int64_t)ce) & M62; + /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me + * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). 
*/ + md = (I62 * 4 * (int64_t)cd) >> 2; + me = (I62 * 4 * (int64_t)ce) >> 2; cd -= (int128_t)C62 * md; ce -= (int128_t)C62 * me; @@ -814,7 +817,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_negate(&b1, &b0, 7); + secp256k1_fe_negate(&b1, &b0, 2); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -878,7 +881,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_negate(&b1, &b0, 7); + secp256k1_fe_negate(&b1, &b0, 2); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 4679a45a88..ee5e128187 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -966,13 +966,14 @@ static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3; - int64_t t; secp256k1_scalar u; + /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); + VERIFY_CHECK((int64_t)a4 >> 8 == 0 || (int64_t)a4 >> 8 == -(int64_t)1); r0 = a0 | a1 << 62; r1 = a1 >> 2 | a2 << 60; @@ -986,16 +987,8 @@ static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); - t = (int64_t)a4 >> 8; - - VERIFY_CHECK(t == 1 || t == 0 || t == -1); - secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); secp256k1_scalar_cmov(r, &u, a4 >> 63); - - t += a4 >> 63; - - secp256k1_scalar_reduce(r, t); } static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { @@ -1144,9 +1137,10 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t cd -= (int128_t)u * d0 + (int128_t)v * e0; ce -= (int128_t)q * d0 + (int128_t)r * e0; - /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ - md = (I62 * (int64_t)cd) & M62; - me = (I62 * (int64_t)ce) & M62; + /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me + * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ + md = (I62 * 4 * (int64_t)cd) >> 2; + me = (I62 * 4 * (int64_t)ce) >> 2; cd += (int128_t)P[0] * md; ce += (int128_t)P[0] * me; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 6b422a6334..8ba26ed95f 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -743,9 +743,9 @@ static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7; - int32_t t; secp256k1_scalar u; + /* a must be in the range [-2^256, 2^256). 
*/ VERIFY_CHECK(a0 >> 30 == 0); VERIFY_CHECK(a1 >> 30 == 0); VERIFY_CHECK(a2 >> 30 == 0); @@ -754,6 +754,7 @@ static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); + VERIFY_CHECK((int32_t)a8 >> 16 == 0 || (int32_t)a8 >> 16 == -(int32_t)1); r0 = a0 | a1 << 30; r1 = a1 >> 2 | a2 << 28; @@ -775,16 +776,8 @@ static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); - t = (int32_t)a8 >> 16; - - VERIFY_CHECK(t == 1 || t == 0 || t == -1); - secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); secp256k1_scalar_cmov(r, &u, a8 >> 31); - - t += a8 >> 31; - - secp256k1_scalar_reduce(r, t); } static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { @@ -936,9 +929,10 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t cd -= (int64_t)u * di + (int64_t)v * ei; ce -= (int64_t)q * di + (int64_t)r * ei; - /* Calculate the multiples of P to add, to zero the 30 bottom bits. */ - md = (I30 * (int32_t)cd) & M30; - me = (I30 * (int32_t)ce) & M30; + /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me + * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */ + md = (I30 * 4 * (int32_t)cd) >> 2; + me = (I30 * 4 * (int32_t)ce) >> 2; cd += (int64_t)P[0] * md; ce += (int64_t)P[0] * me; From 90743d29ab93c0d74d60f2c947b45686284da2b8 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Thu, 13 Aug 2020 17:16:23 +0700 Subject: [PATCH 24/34] Verify the expected zeros are produced --- src/field_10x26_impl.h | 4 ++++ src/field_5x52_impl.h | 2 ++ src/scalar_4x64_impl.h | 2 ++ src/scalar_8x32_impl.h | 4 ++++ 4 files changed, 12 insertions(+) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 9bf05b85a2..e1a4e43011 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1341,10 +1341,14 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g += f * w; q += u * w; r += v * w; + + VERIFY_CHECK((g & m) == 0); #else g += f; q += u; r += v; + + VERIFY_CHECK((g & 1) == 0); #endif } diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 219c281405..082886a150 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -658,6 +658,8 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g += f * w; q += u * w; r += v * w; + + VERIFY_CHECK((g & m) == 0); } t[0] = (int64_t)u; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index ee5e128187..a76a510d31 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1112,6 +1112,8 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint g += f * w; q += u * w; r += v * w; + + VERIFY_CHECK((g & m) == 0); } t[0] = (int64_t)u; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 8ba26ed95f..2b34d13aed 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -897,10 +897,14 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint g += f * w; q += u * w; r += v * w; + + VERIFY_CHECK((g & m) == 0); #else g += f; q += u; r += v; + + VERIFY_CHECK((g & 1) == 0); #endif } From 5de2c833907f4ddb94d9d13dc37214734eebfda4 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Thu, 13 Aug 2020 20:08:16 +0700 Subject: [PATCH 25/34] _inv_var conditional negations - tighten result magnitude for 
_fe_decode methods --- src/field_10x26_impl.h | 55 ++++++++++++++++++++++-------------------- src/field_5x52_impl.h | 45 ++++++++++++++++++---------------- src/scalar_4x64_impl.h | 17 +++++++------ src/scalar_8x32_impl.h | 17 +++++++------ 4 files changed, 73 insertions(+), 61 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index e1a4e43011..593266a967 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1169,7 +1169,9 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { const uint32_t M26 = UINT32_MAX >> 6; const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; - uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; + uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, t; + + t = (int32_t)a8 >> 16; /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 30 == 0); @@ -1180,19 +1182,19 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); - VERIFY_CHECK((int32_t)a8 >> 16 == 0 || (int32_t)a8 >> 16 == -(int32_t)1); - - /* Add a multiple of the field prime in case a8 is "negative". */ - r0 = 0x3FFFC2FUL * 2; - r1 = 0x3FFFFBFUL * 2; - r2 = 0x3FFFFFFUL * 2; - r3 = 0x3FFFFFFUL * 2; - r4 = 0x3FFFFFFUL * 2; - r5 = 0x3FFFFFFUL * 2; - r6 = 0x3FFFFFFUL * 2; - r7 = 0x3FFFFFFUL * 2; - r8 = 0x3FFFFFFUL * 2; - r9 = 0x03FFFFFUL * 2; + VERIFY_CHECK(t == 0 || t == -(uint32_t)1); + + /* Add 2P if a8 is "negative". */ + r0 = 0x3FFF85EUL & t; + r1 = 0x3FFFF7FUL & t; + r2 = 0x3FFFFFFUL & t; + r3 = 0x3FFFFFFUL & t; + r4 = 0x3FFFFFFUL & t; + r5 = 0x3FFFFFFUL & t; + r6 = 0x3FFFFFFUL & t; + r7 = 0x3FFFFFFUL & t; + r8 = 0x3FFFFFFUL & t; + r9 = 0x07FFFFFUL & t; r0 += a0 & M26; r1 += (a0 >> 26 | a1 << 4) & M26; @@ -1217,7 +1219,7 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { r->n[9] = r9; #ifdef VERIFY - r->magnitude = 2; + r->magnitude = 1; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -1526,19 +1528,19 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { int32_t f[9] = { 0x3FFFFC2F, 0x3FFFFFFB, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; - secp256k1_fe b0, b1; + secp256k1_fe b; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in; #endif - b0 = *a; - secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_30(g, &b0); + b = *a; + secp256k1_fe_normalize(&b); + secp256k1_fe_encode_30(g, &b); #ifdef VERIFY - zero_in = secp256k1_fe_is_zero(&b0); + zero_in = secp256k1_fe_is_zero(&b); #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). 
@@ -1566,17 +1568,18 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { sign = (f[0] >> 1) & 1; - secp256k1_fe_decode_30(&b0, d); + secp256k1_fe_decode_30(&b, d); - secp256k1_fe_negate(&b1, &b0, 2); - secp256k1_fe_cmov(&b0, &b1, sign); - secp256k1_fe_normalize_weak(&b0); + if (sign) { + secp256k1_fe_negate(&b, &b, 1); + secp256k1_fe_normalize_weak(&b); + } #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b) == !zero_in); #endif - *r = b0; + *r = b; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 082886a150..2d89e4fcdf 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -502,21 +502,23 @@ static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { const uint64_t M52 = UINT64_MAX >> 12; const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; - uint64_t r0, r1, r2, r3, r4; + uint64_t r0, r1, r2, r3, r4, t; + + t = (int64_t)a4 >> 8; /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); - VERIFY_CHECK((int64_t)a4 >> 8 == 0 || (int64_t)a4 >> 8 == -(int64_t)1); + VERIFY_CHECK(t == 0 || t == -(uint64_t)1); - /* Add a multiple of the field prime in case a4 is "negative". */ - r0 = 0xFFFFEFFFFFC2FULL * 2; - r1 = 0xFFFFFFFFFFFFFULL * 2; - r2 = 0xFFFFFFFFFFFFFULL * 2; - r3 = 0xFFFFFFFFFFFFFULL * 2; - r4 = 0x0FFFFFFFFFFFFULL * 2; + /* Add 2P if a4 is "negative". */ + r0 = 0xFFFFDFFFFF85EULL & t; + r1 = 0xFFFFFFFFFFFFFULL & t; + r2 = 0xFFFFFFFFFFFFFULL & t; + r3 = 0xFFFFFFFFFFFFFULL & t; + r4 = 0x1FFFFFFFFFFFFULL & t; r0 += a0 & M52; r1 += (a0 >> 52 | a1 << 10) & M52; @@ -531,7 +533,7 @@ static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { r->n[4] = r4; #ifdef VERIFY - r->magnitude = 2; + r->magnitude = 1; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -819,7 +821,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 1); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -841,19 +843,19 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_fe b0, b1; + secp256k1_fe b; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in; #endif - b0 = *a; - secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_62(g, &b0); + b = *a; + secp256k1_fe_normalize(&b); + secp256k1_fe_encode_62(g, &b); #ifdef VERIFY - zero_in = secp256k1_fe_is_zero(&b0); + zero_in = secp256k1_fe_is_zero(&b); #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). 
@@ -881,17 +883,18 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { sign = (f[0] >> 1) & 1; - secp256k1_fe_decode_62(&b0, d); + secp256k1_fe_decode_62(&b, d); - secp256k1_fe_negate(&b1, &b0, 2); - secp256k1_fe_cmov(&b0, &b1, sign); - secp256k1_fe_normalize_weak(&b0); + if (sign) { + secp256k1_fe_negate(&b, &b, 1); + secp256k1_fe_normalize_weak(&b); + } #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b) == !zero_in); #endif - *r = b0; + *r = b; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index a76a510d31..ee44b36bfd 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1308,15 +1308,15 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b0; + secp256k1_scalar b; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b0 = *x; - secp256k1_scalar_encode_62(g, &b0); + b = *x; + secp256k1_scalar_encode_62(g, &b); /* The paper uses 'delta'; eta == -delta (a performance tweak). * @@ -1343,14 +1343,17 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc sign = (f[0] >> 1) & 1; - secp256k1_scalar_decode_62(&b0, d); - secp256k1_scalar_cond_negate(&b0, sign); + secp256k1_scalar_decode_62(&b, d); + + if (sign) { + secp256k1_scalar_negate(&b, &b); + } #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b) == !zero_in); #endif - *r = b0; + *r = b; } #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 2b34d13aed..d1009cc870 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -1077,15 +1077,15 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b0; + secp256k1_scalar b; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b0 = *x; - secp256k1_scalar_encode_30(g, &b0); + b = *x; + secp256k1_scalar_encode_30(g, &b); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
* @@ -1112,14 +1112,17 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc sign = (f[0] >> 1) & 1; - secp256k1_scalar_decode_30(&b0, d); - secp256k1_scalar_cond_negate(&b0, sign); + secp256k1_scalar_decode_30(&b, d); + + if (sign) { + secp256k1_scalar_negate(&b, &b); + } #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b) == !zero_in); #endif - *r = b0; + *r = b; } #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ From 308fd32e004991991a52d91a9c847ddca98b69ef Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 15 Aug 2020 14:07:05 +0700 Subject: [PATCH 26/34] Experiment with f,g shortening in inv_var --- src/field_10x26_impl.h | 71 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 593266a967..baf1ab94be 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1457,6 +1457,41 @@ static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, const int32_t *t) g[8] = (int32_t)cg; } +static void secp256k1_fe_update_fg_30_var(int len, int32_t *f, int32_t *g, const int32_t *t) { + + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t fi, gi; + int64_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; + VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + f[i - 1] = (int32_t)cf & M30; cf >>= 30; + g[i - 1] = (int32_t)cg & M30; cg >>= 30; + } + + f[len - 1] = (int32_t)cf; + g[len - 1] = (int32_t)cg; +} + static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and @@ -1519,6 +1554,8 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { +#define IS_THIS_FASTER 1 + /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ @@ -1531,6 +1568,10 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe b; int i, sign; uint32_t eta; +#if IS_THIS_FASTER + int j, len = 9; + int32_t cond, fn, gn; +#endif #ifdef VERIFY int zero_in; #endif @@ -1550,6 +1591,35 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { +#if IS_THIS_FASTER + eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); + secp256k1_fe_update_de_30(d, e, t); + secp256k1_fe_update_fg_30_var(len, f, g, t); + + if (g[0] == 0) { + cond = 0; + for (j = 1; j < len; ++j) { + cond |= g[j]; + } + if (cond == 0) { + break; + } + } + + fn = f[len - 1]; + gn = g[len - 1]; + + cond = ((int32_t)len - 2) >> 31; + cond |= fn ^ (fn >> 31); + cond |= gn ^ (gn >> 31); + + if (cond == 0) + { + f[len - 2] |= (uint32_t)fn << 30; + g[len - 2] |= (uint32_t)gn << 30; + --len; + } +#else eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); secp256k1_fe_update_de_30(d, e, t); secp256k1_fe_update_fg_30(f, g, t); @@ -1559,6 +1629,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { break; } } +#endif } VERIFY_CHECK(i < 25); From ff0cf1124c630be4f4cd980ce333bf463ad2c289 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 15 Aug 2020 14:28:42 +0700 Subject: [PATCH 27/34] f,g shortening for 64bit field --- src/field_5x52_impl.h | 71 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 2d89e4fcdf..1d80926e8e 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -772,6 +772,41 @@ static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, const int64_t *t) g[4] = (int64_t)cg; } +static void secp256k1_fe_update_fg_62_var(int len, int64_t *f, int64_t *g, const int64_t *t) { + + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; + int64_t fi, gi; + int128_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + f[i - 1] = (int64_t)cf & M62; cf >>= 62; + g[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + + f[len - 1] = (int64_t)cf; + g[len - 1] = (int64_t)cg; +} + static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and @@ -834,6 +869,8 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { +#define IS_THIS_FASTER 1 + /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ @@ -846,6 +883,10 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe b; int i, sign; uint64_t eta; +#if IS_THIS_FASTER + int j, len = 5; + int64_t cond, fn, gn; +#endif #ifdef VERIFY int zero_in; #endif @@ -865,6 +906,35 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { +#if IS_THIS_FASTER + eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); + secp256k1_fe_update_de_62(d, e, t); + secp256k1_fe_update_fg_62_var(len, f, g, t); + + if (g[0] == 0) { + cond = 0; + for (j = 1; j < len; ++j) { + cond |= g[j]; + } + if (cond == 0) { + break; + } + } + + fn = f[len - 1]; + gn = g[len - 1]; + + cond = ((int64_t)len - 2) >> 63; + cond |= fn ^ (fn >> 63); + cond |= gn ^ (gn >> 63); + + if (cond == 0) + { + f[len - 2] |= fn << 62; + g[len - 2] |= gn << 62; + --len; + } +#else eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); secp256k1_fe_update_de_62(d, e, t); secp256k1_fe_update_fg_62(f, g, t); @@ -874,6 +944,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { break; } } +#endif } VERIFY_CHECK(i < 12); From b51a1b55d9f90f792766cd1d2c76d4e4442a49ae Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 16 Aug 2020 12:53:59 +0700 Subject: [PATCH 28/34] THIS_IS_FASTER --- src/field_10x26_impl.h | 23 +++------------- src/field_5x52_impl.h | 25 +++--------------- src/scalar_4x64_impl.h | 60 +++++++++++++++++++++++++++++++++++++++--- src/scalar_8x32_impl.h | 60 +++++++++++++++++++++++++++++++++++++++--- 4 files changed, 121 insertions(+), 47 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index baf1ab94be..cc29cdbaa2 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1554,8 +1554,6 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { -#define IS_THIS_FASTER 1 - /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ @@ -1566,12 +1564,9 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; secp256k1_fe b; - int i, sign; + int i, j, len = 9, sign; uint32_t eta; -#if IS_THIS_FASTER - int j, len = 9; int32_t cond, fn, gn; -#endif #ifdef VERIFY int zero_in; #endif @@ -1591,7 +1586,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { -#if IS_THIS_FASTER + eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); secp256k1_fe_update_de_30(d, e, t); secp256k1_fe_update_fg_30_var(len, f, g, t); @@ -1613,23 +1608,11 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { cond |= fn ^ (fn >> 31); cond |= gn ^ (gn >> 31); - if (cond == 0) - { + if (cond == 0) { f[len - 2] |= (uint32_t)fn << 30; g[len - 2] |= (uint32_t)gn << 30; --len; } -#else - eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); - secp256k1_fe_update_de_30(d, e, t); - secp256k1_fe_update_fg_30(f, g, t); - - if (g[0] == 0) { - if ((g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0) { - break; - } - } -#endif } VERIFY_CHECK(i < 25); diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 1d80926e8e..da1391aea4 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -869,8 +869,6 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { -#define IS_THIS_FASTER 1 - /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ @@ -881,12 +879,9 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_fe b; - int i, sign; + int i, j, len = 5, sign; uint64_t eta; -#if IS_THIS_FASTER - int j, len = 5; int64_t cond, fn, gn; -#endif #ifdef VERIFY int zero_in; #endif @@ -906,7 +901,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { -#if IS_THIS_FASTER + eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); secp256k1_fe_update_de_62(d, e, t); secp256k1_fe_update_fg_62_var(len, f, g, t); @@ -915,7 +910,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { cond = 0; for (j = 1; j < len; ++j) { cond |= g[j]; - } + } if (cond == 0) { break; } @@ -928,23 +923,11 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { cond |= fn ^ (fn >> 63); cond |= gn ^ (gn >> 63); - if (cond == 0) - { + if (cond == 0) { f[len - 2] |= fn << 62; g[len - 2] |= gn << 62; --len; } -#else - eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); - secp256k1_fe_update_de_62(d, e, t); - secp256k1_fe_update_fg_62(f, g, t); - - if (g[0] == 0) { - if ((g[1] | g[2] | g[3] | g[4]) == 0) { - break; - } - } -#endif } VERIFY_CHECK(i < 12); diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index ee44b36bfd..80c90cd347 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1229,6 +1229,41 @@ static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, const int64_t g[4] = (int64_t)cg; } +static void secp256k1_scalar_update_fg_62_var(int len, int64_t *f, int64_t *g, const int64_t *t) { + + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; + int64_t fi, gi; + int128_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + 
gi = g[0]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + f[i - 1] = (int64_t)cf & M62; cf >>= 62; + g[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + + f[len - 1] = (int64_t)cf; + g[len - 1] = (int64_t)cg; +} + static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(EXHAUSTIVE_TEST_ORDER) int i; @@ -1309,8 +1344,9 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_scalar b; - int i, sign; + int i, j, len = 5, sign; uint64_t eta; + int64_t cond, fn, gn; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1325,15 +1361,33 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { + eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], t); secp256k1_scalar_update_de_62(d, e, t); - secp256k1_scalar_update_fg_62(f, g, t); + secp256k1_scalar_update_fg_62_var(len, f, g, t); if (g[0] == 0) { - if ((g[1] | g[2] | g[3] | g[4]) == 0) { + cond = 0; + for (j = 1; j < len; ++j) { + cond |= g[j]; + } + if (cond == 0) { break; } } + + fn = f[len - 1]; + gn = g[len - 1]; + + cond = ((int64_t)len - 2) >> 63; + cond |= fn ^ (fn >> 63); + cond |= gn ^ (gn >> 63); + + if (cond == 0) { + f[len - 2] |= fn << 62; + g[len - 2] |= gn << 62; + --len; + } } VERIFY_CHECK(i < 12); diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index d1009cc870..40536c827b 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -998,6 +998,41 @@ static void secp256k1_scalar_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { g[8] = (int32_t)cg; } +static void secp256k1_scalar_update_fg_30_var(int len, int32_t *f, int32_t *g, const int32_t *t) { + + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t fi, gi; + int64_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; + VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + f[i - 1] = (int32_t)cf & M30; cf >>= 30; + g[i - 1] = (int32_t)cg & M30; cg >>= 30; + } + + f[len - 1] = (int32_t)cf; + g[len - 1] = (int32_t)cg; +} + static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(EXHAUSTIVE_TEST_ORDER) int i; @@ -1078,8 +1113,9 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; secp256k1_scalar b; - int i, sign; + int i, j, len = 9, sign; uint32_t eta; + int32_t cond, fn, gn; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1094,15 +1130,33 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { + eta = secp256k1_scalar_divsteps_30_var(eta, f[0], g[0], t); secp256k1_scalar_update_de_30(d, e, t); - secp256k1_scalar_update_fg_30(f, 
g, t); + secp256k1_scalar_update_fg_30_var(len, f, g, t); if (g[0] == 0) { - if ((g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0) { + cond = 0; + for (j = 1; j < len; ++j) { + cond |= g[j]; + } + if (cond == 0) { break; } } + + fn = f[len - 1]; + gn = g[len - 1]; + + cond = ((int32_t)len - 2) >> 31; + cond |= fn ^ (fn >> 31); + cond |= gn ^ (gn >> 31); + + if (cond == 0) { + f[len - 2] |= (uint32_t)fn << 30; + g[len - 2] |= (uint32_t)gn << 30; + --len; + } } VERIFY_CHECK(i < 25); From 1baff2caec31ff709b9404edd8a7b7ea380d29d4 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 16 Aug 2020 19:07:25 +0700 Subject: [PATCH 29/34] Accentuate the positive (Eliminate the negative) --- src/field_10x26_impl.h | 46 ++++++++++++++--------------- src/field_5x52_impl.h | 66 +++++++++++++++++++++--------------------- src/scalar_4x64_impl.h | 66 +++++++++++++++++++++--------------------- src/scalar_8x32_impl.h | 42 +++++++++++++-------------- 4 files changed, 110 insertions(+), 110 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index cc29cdbaa2..3fd44bc5da 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1250,15 +1250,15 @@ static void secp256k1_fe_encode_30(int32_t *r, const secp256k1_fe *a) { static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { - uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 30; ++i) { VERIFY_CHECK((f & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << i); - VERIFY_CHECK((q * f0 + r * g0) == -g << i); + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); c1 = -(g & (eta >> 31)); @@ -1296,7 +1296,7 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; #endif - uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -1323,8 +1323,8 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (30 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (30 - i)); + VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); if ((int32_t)eta < 0) { eta = -eta; @@ -1371,14 +1371,14 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; int32_t di, ei, md, me; - int64_t cd = 0, ce = 0; + int64_t cd, ce; int i; di = d[0]; ei = e[0]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd = (int64_t)u * di + (int64_t)v * ei; + ce = (int64_t)q * di + (int64_t)r * ei; /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). 
*/ @@ -1399,8 +1399,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) di = d[i]; ei = e[i]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd += (int64_t)u * di + (int64_t)v * ei; + ce += (int64_t)q * di + (int64_t)r * ei; d[i - 1] = (int32_t)cd & M30; cd >>= 30; e[i - 1] = (int32_t)ce & M30; ce >>= 30; @@ -1410,8 +1410,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) di = d[8]; ei = e[8]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd += (int64_t)u * di + (int64_t)v * ei; + ce += (int64_t)q * di + (int64_t)r * ei; cd += (int64_t)65536 * md; ce += (int64_t)65536 * me; @@ -1429,14 +1429,14 @@ static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, const int32_t *t) const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; int32_t fi, gi; - int64_t cf = 0, cg = 0; + int64_t cf, cg; int i; fi = f[0]; gi = g[0]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; @@ -1446,8 +1446,8 @@ static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, const int32_t *t) fi = f[i]; gi = g[i]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; f[i - 1] = (int32_t)cf & M30; cf >>= 30; g[i - 1] = (int32_t)cg & M30; cg >>= 30; @@ -1462,7 +1462,7 @@ static void secp256k1_fe_update_fg_30_var(int len, int32_t *f, int32_t *g, const const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; int32_t fi, gi; - int64_t cf = 0, cg = 0; + int64_t cf, cg; int i; VERIFY_CHECK(len > 0); @@ -1470,8 +1470,8 @@ static void secp256k1_fe_update_fg_30_var(int len, int32_t *f, int32_t *g, const fi = f[0]; gi = g[0]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; @@ -1481,8 +1481,8 @@ static void secp256k1_fe_update_fg_30_var(int len, int32_t *f, int32_t *g, const fi = f[i]; gi = g[i]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; f[i - 1] = (int32_t)cf & M30; cf >>= 30; g[i - 1] = (int32_t)cg & M30; cg >>= 30; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index da1391aea4..15c4fc2a8b 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -558,15 +558,15 @@ static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { - uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 62; ++i) { VERIFY_CHECK((f & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << i); - VERIFY_CHECK((q * f0 + r * g0) == -g << i); + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); c1 = -(g & (eta >> 63)); @@ -607,7 +607,7 @@ static uint64_t 
secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t }; #endif - uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t f = f0, g = g0, m, w, x, y, z; int i = 62, limit, zeros; @@ -634,8 +634,8 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); + VERIFY_CHECK((u * f0 + v * g0) == f << (62 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (62 - i)); if ((int64_t)eta < 0) { eta = -eta; @@ -683,10 +683,10 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int64_t md, me; - int128_t cd = 0, ce = 0; + int128_t cd, ce; - cd -= (int128_t)u * d0 + (int128_t)v * e0; - ce -= (int128_t)q * d0 + (int128_t)r * e0; + cd = (int128_t)u * d0 + (int128_t)v * e0; + ce = (int128_t)q * d0 + (int128_t)r * e0; /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ @@ -699,26 +699,26 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; - cd -= (int128_t)u * d1 + (int128_t)v * e1; - ce -= (int128_t)q * d1 + (int128_t)r * e1; + cd += (int128_t)u * d1 + (int128_t)v * e1; + ce += (int128_t)q * d1 + (int128_t)r * e1; d[0] = (int64_t)cd & M62; cd >>= 62; e[0] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d2 + (int128_t)v * e2; - ce -= (int128_t)q * d2 + (int128_t)r * e2; + cd += (int128_t)u * d2 + (int128_t)v * e2; + ce += (int128_t)q * d2 + (int128_t)r * e2; d[1] = (int64_t)cd & M62; cd >>= 62; e[1] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d3 + (int128_t)v * e3; - ce -= (int128_t)q * d3 + (int128_t)r * e3; + cd += (int128_t)u * d3 + (int128_t)v * e3; + ce += (int128_t)q * d3 + (int128_t)r * e3; d[2] = (int64_t)cd & M62; cd >>= 62; e[2] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d4 + (int128_t)v * e4; - ce -= (int128_t)q * d4 + (int128_t)r * e4; + cd += (int128_t)u * d4 + (int128_t)v * e4; + ce += (int128_t)q * d4 + (int128_t)r * e4; cd += (int128_t)256 * md; ce += (int128_t)256 * me; @@ -736,34 +736,34 @@ static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, const int64_t *t) const int64_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4]; const int64_t g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; - int128_t cf = 0, cg = 0; + int128_t cf, cg; - cf -= (int128_t)u * f0 + (int128_t)v * g0; - cg -= (int128_t)q * f0 + (int128_t)r * g0; + cf = (int128_t)u * f0 + (int128_t)v * g0; + cg = (int128_t)q * f0 + (int128_t)r * g0; VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; - cf -= (int128_t)u * f1 + (int128_t)v * g1; - cg -= (int128_t)q * f1 + (int128_t)r * g1; + cf += (int128_t)u * f1 + (int128_t)v * g1; + cg += (int128_t)q * f1 + (int128_t)r * g1; f[0] = (int64_t)cf & M62; cf >>= 62; g[0] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f2 + (int128_t)v * g2; - cg -= (int128_t)q * f2 + (int128_t)r * g2; + cf += (int128_t)u * f2 + (int128_t)v * g2; + cg += (int128_t)q * f2 + (int128_t)r * g2; f[1] = (int64_t)cf & M62; cf 
>>= 62; g[1] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f3 + (int128_t)v * g3; - cg -= (int128_t)q * f3 + (int128_t)r * g3; + cf += (int128_t)u * f3 + (int128_t)v * g3; + cg += (int128_t)q * f3 + (int128_t)r * g3; f[2] = (int64_t)cf & M62; cf >>= 62; g[2] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f4 + (int128_t)v * g4; - cg -= (int128_t)q * f4 + (int128_t)r * g4; + cf += (int128_t)u * f4 + (int128_t)v * g4; + cg += (int128_t)q * f4 + (int128_t)r * g4; f[3] = (int64_t)cf & M62; cf >>= 62; g[3] = (int64_t)cg & M62; cg >>= 62; @@ -777,7 +777,7 @@ static void secp256k1_fe_update_fg_62_var(int len, int64_t *f, int64_t *g, const const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int64_t fi, gi; - int128_t cf = 0, cg = 0; + int128_t cf, cg; int i; VERIFY_CHECK(len > 0); @@ -785,8 +785,8 @@ static void secp256k1_fe_update_fg_62_var(int len, int64_t *f, int64_t *g, const fi = f[0]; gi = g[0]; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf = (int128_t)u * fi + (int128_t)v * gi; + cg = (int128_t)q * fi + (int128_t)r * gi; VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; @@ -796,8 +796,8 @@ static void secp256k1_fe_update_fg_62_var(int len, int64_t *f, int64_t *g, const fi = f[i]; gi = g[i]; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf += (int128_t)u * fi + (int128_t)v * gi; + cg += (int128_t)q * fi + (int128_t)r * gi; f[i - 1] = (int64_t)cf & M62; cf >>= 62; g[i - 1] = (int64_t)cg & M62; cg >>= 62; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 80c90cd347..1563dad1f9 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1010,15 +1010,15 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { - uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 62; ++i) { VERIFY_CHECK((f & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << i); - VERIFY_CHECK((q * f0 + r * g0) == -g << i); + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); c1 = -(g & (eta >> 63)); @@ -1059,7 +1059,7 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint }; #endif - uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t f = f0, g = g0, m, w, x, y, z; int i = 62, limit, zeros; @@ -1086,8 +1086,8 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); + VERIFY_CHECK((u * f0 + v * g0) == f << (62 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (62 - i)); if ((int64_t)eta < 0) { eta = -eta; @@ -1134,10 +1134,10 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int64_t md, me; - int128_t cd = 0, ce = 0; + int128_t cd, ce; - cd -= (int128_t)u * d0 + (int128_t)v * e0; - ce -= (int128_t)q * d0 + (int128_t)r * e0; + cd = (int128_t)u * d0 + (int128_t)v * e0; + ce = (int128_t)q * d0 + (int128_t)r * 
e0; /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ @@ -1150,8 +1150,8 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; - cd -= (int128_t)u * d1 + (int128_t)v * e1; - ce -= (int128_t)q * d1 + (int128_t)r * e1; + cd += (int128_t)u * d1 + (int128_t)v * e1; + ce += (int128_t)q * d1 + (int128_t)r * e1; cd += (int128_t)P[1] * md; ce += (int128_t)P[1] * me; @@ -1159,8 +1159,8 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t d[0] = (int64_t)cd & M62; cd >>= 62; e[0] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d2 + (int128_t)v * e2; - ce -= (int128_t)q * d2 + (int128_t)r * e2; + cd += (int128_t)u * d2 + (int128_t)v * e2; + ce += (int128_t)q * d2 + (int128_t)r * e2; cd += (int128_t)P[2] * md; ce += (int128_t)P[2] * me; @@ -1168,14 +1168,14 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t d[1] = (int64_t)cd & M62; cd >>= 62; e[1] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d3 + (int128_t)v * e3; - ce -= (int128_t)q * d3 + (int128_t)r * e3; + cd += (int128_t)u * d3 + (int128_t)v * e3; + ce += (int128_t)q * d3 + (int128_t)r * e3; d[2] = (int64_t)cd & M62; cd >>= 62; e[2] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d4 + (int128_t)v * e4; - ce -= (int128_t)q * d4 + (int128_t)r * e4; + cd += (int128_t)u * d4 + (int128_t)v * e4; + ce += (int128_t)q * d4 + (int128_t)r * e4; cd += (int128_t)P[4] * md; ce += (int128_t)P[4] * me; @@ -1193,34 +1193,34 @@ static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, const int64_t const int64_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4]; const int64_t g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; - int128_t cf = 0, cg = 0; + int128_t cf, cg; - cf -= (int128_t)u * f0 + (int128_t)v * g0; - cg -= (int128_t)q * f0 + (int128_t)r * g0; + cf = (int128_t)u * f0 + (int128_t)v * g0; + cg = (int128_t)q * f0 + (int128_t)r * g0; VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; - cf -= (int128_t)u * f1 + (int128_t)v * g1; - cg -= (int128_t)q * f1 + (int128_t)r * g1; + cf += (int128_t)u * f1 + (int128_t)v * g1; + cg += (int128_t)q * f1 + (int128_t)r * g1; f[0] = (int64_t)cf & M62; cf >>= 62; g[0] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f2 + (int128_t)v * g2; - cg -= (int128_t)q * f2 + (int128_t)r * g2; + cf += (int128_t)u * f2 + (int128_t)v * g2; + cg += (int128_t)q * f2 + (int128_t)r * g2; f[1] = (int64_t)cf & M62; cf >>= 62; g[1] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f3 + (int128_t)v * g3; - cg -= (int128_t)q * f3 + (int128_t)r * g3; + cf += (int128_t)u * f3 + (int128_t)v * g3; + cg += (int128_t)q * f3 + (int128_t)r * g3; f[2] = (int64_t)cf & M62; cf >>= 62; g[2] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f4 + (int128_t)v * g4; - cg -= (int128_t)q * f4 + (int128_t)r * g4; + cf += (int128_t)u * f4 + (int128_t)v * g4; + cg += (int128_t)q * f4 + (int128_t)r * g4; f[3] = (int64_t)cf & M62; cf >>= 62; g[3] = (int64_t)cg & M62; cg >>= 62; @@ -1234,7 +1234,7 @@ static void secp256k1_scalar_update_fg_62_var(int len, int64_t *f, int64_t *g, c const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int64_t fi, gi; - int128_t cf 
= 0, cg = 0; + int128_t cf, cg; int i; VERIFY_CHECK(len > 0); @@ -1242,8 +1242,8 @@ static void secp256k1_scalar_update_fg_62_var(int len, int64_t *f, int64_t *g, c fi = f[0]; gi = g[0]; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf = (int128_t)u * fi + (int128_t)v * gi; + cg = (int128_t)q * fi + (int128_t)r * gi; VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; @@ -1253,8 +1253,8 @@ static void secp256k1_scalar_update_fg_62_var(int len, int64_t *f, int64_t *g, c fi = f[i]; gi = g[i]; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf += (int128_t)u * fi + (int128_t)v * gi; + cg += (int128_t)q * fi + (int128_t)r * gi; f[i - 1] = (int64_t)cf & M62; cf >>= 62; g[i - 1] = (int64_t)cg & M62; cg >>= 62; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 40536c827b..9c964dae91 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -804,15 +804,15 @@ static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { - uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 30; ++i) { VERIFY_CHECK((f & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << i); - VERIFY_CHECK((q * f0 + r * g0) == -g << i); + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); c1 = -(g & (eta >> 31)); @@ -850,7 +850,7 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; #endif - uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -877,8 +877,8 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (30 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (30 - i)); + VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); if ((int32_t)eta < 0) { eta = -eta; @@ -924,14 +924,14 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t 0, 0, 0, 65536 }; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; - int64_t cd = 0, ce = 0; + int64_t cd, ce; int i; di = d[0]; ei = e[0]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd = (int64_t)u * di + (int64_t)v * ei; + ce = (int64_t)q * di + (int64_t)r * ei; /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). 
*/ @@ -949,8 +949,8 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t di = d[i]; ei = e[i]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd += (int64_t)u * di + (int64_t)v * ei; + ce += (int64_t)q * di + (int64_t)r * ei; cd += (int64_t)P[i] * md; ce += (int64_t)P[i] * me; @@ -967,14 +967,14 @@ static void secp256k1_scalar_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; - int64_t cf = 0, cg = 0; + int64_t cf, cg; int i; fi = f[0]; gi = g[0]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; VERIFY_CHECK(((int32_t)cf & M30) == 0); VERIFY_CHECK(((int32_t)cg & M30) == 0); @@ -987,8 +987,8 @@ static void secp256k1_scalar_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { fi = f[i]; gi = g[i]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; f[i - 1] = (int32_t)cf & M30; cf >>= 30; g[i - 1] = (int32_t)cg & M30; cg >>= 30; @@ -1003,7 +1003,7 @@ static void secp256k1_scalar_update_fg_30_var(int len, int32_t *f, int32_t *g, c const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; int32_t fi, gi; - int64_t cf = 0, cg = 0; + int64_t cf, cg; int i; VERIFY_CHECK(len > 0); @@ -1011,8 +1011,8 @@ static void secp256k1_scalar_update_fg_30_var(int len, int32_t *f, int32_t *g, c fi = f[0]; gi = g[0]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; @@ -1022,8 +1022,8 @@ static void secp256k1_scalar_update_fg_30_var(int len, int32_t *f, int32_t *g, c fi = f[i]; gi = g[i]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; f[i - 1] = (int32_t)cf & M30; cf >>= 30; g[i - 1] = (int32_t)cg & M30; cg >>= 30; From 65550c1f6d44da2bd9d72d6fd2256a6cba0fd828 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 18 Aug 2020 00:50:29 +0700 Subject: [PATCH 30/34] Try 128 byte table of inverses --- src/field_10x26_impl.h | 35 +++++++++++++++++++++-------------- src/scalar_8x32_impl.h | 35 +++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 3fd44bc5da..4b3e35a14a 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1292,10 +1292,25 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t #if 1 static const uint8_t debruijn[32] = { - 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26, - 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; + 0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A, + 0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B, + 0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B }; #endif + static const uint8_t inv256[128] = { + 0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59, + 0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31, + 0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 
0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89, + 0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61, + 0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9, + 0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91, + 0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9, + 0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1, + 0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19, + 0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1, + 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 + }; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -1333,25 +1348,17 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t z = v; v = r; r = -z; } -#if 1 - /* Handle up to 3 divsteps at once, subject to eta and i. */ + /* Handle up to 8 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); - m = (UINT32_MAX >> (32 - limit)) & 7U; + m = (UINT32_MAX >> (32 - limit)) & 255U; + + w = (g * inv256[(f >> 1) & 127]) & m; - /* Note that f * f == 1 mod 8, for any f. */ - w = (-f * g) & m; g += f * w; q += u * w; r += v * w; VERIFY_CHECK((g & m) == 0); -#else - g += f; - q += u; - r += v; - - VERIFY_CHECK((g & 1) == 0); -#endif } t[0] = (int32_t)u; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 9c964dae91..4f11bd19cf 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -846,10 +846,25 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint #if 1 static const uint8_t debruijn[32] = { - 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26, - 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; + 0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A, + 0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B, + 0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B }; #endif + static const uint8_t inv256[128] = { + 0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59, + 0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31, + 0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89, + 0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61, + 0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9, + 0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91, + 0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9, + 0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1, + 0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19, + 0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1, + 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 + }; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -887,25 +902,17 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint z = v; v = r; r = -z; } -#if 1 - /* Handle up to 3 divsteps at once, subject to eta and i. */ + /* Handle up to 8 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); - m = (UINT32_MAX >> (32 - limit)) & 7U; + m = (UINT32_MAX >> (32 - limit)) & 255U; + + w = (g * inv256[(f >> 1) & 127]) & m; - /* Note that f * f == 1 mod 8, for any f. 
*/ - w = (-f * g) & m; g += f * w; q += u * w; r += v * w; VERIFY_CHECK((g & m) == 0); -#else - g += f; - q += u; - r += v; - - VERIFY_CHECK((g & 1) == 0); -#endif } t[0] = (int32_t)u; From 5ccfc30aaf78bbfa1e3c7f171c5a7e425a2bad5c Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 25 Aug 2020 14:55:41 +0700 Subject: [PATCH 31/34] Avoid redundant calculation --- src/field_10x26_impl.h | 5 ++--- src/field_5x52_impl.h | 5 ++--- src/scalar_4x64_impl.h | 5 ++--- src/scalar_8x32_impl.h | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 4b3e35a14a..7faec253ac 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1260,7 +1260,8 @@ static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - c1 = -(g & (eta >> 31)); + c2 = -(g & 1); + c1 = c2 & ((int32_t)eta >> 31); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1273,8 +1274,6 @@ static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, eta = (eta ^ c1) - c1 - 1; - c2 = -(g & 1); - g += (f & c2); g >>= 1; q += (u & c2); u <<= 1; r += (v & c2); v <<= 1; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 15c4fc2a8b..73ffe44b2b 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -568,7 +568,8 @@ static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - c1 = -(g & (eta >> 63)); + c2 = -(g & 1); + c1 = c2 & ((int64_t)eta >> 63); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -581,8 +582,6 @@ static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, eta = (eta ^ c1) - c1 - 1; - c2 = -(g & 1); - g += (f & c2); g >>= 1; q += (u & c2); u <<= 1; r += (v & c2); v <<= 1; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 1563dad1f9..94b33f8cd3 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1020,7 +1020,8 @@ static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - c1 = -(g & (eta >> 63)); + c2 = -(g & 1); + c1 = c2 & ((int64_t)eta >> 63); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1033,8 +1034,6 @@ static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t eta = (eta ^ c1) - c1 - 1; - c2 = -(g & 1); - g += (f & c2); g >>= 1; q += (u & c2); u <<= 1; r += (v & c2); v <<= 1; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 4f11bd19cf..e7fb988947 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -814,7 +814,8 @@ static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - c1 = -(g & (eta >> 31)); + c2 = -(g & 1); + c1 = c2 & ((int32_t)eta >> 31); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -827,8 +828,6 @@ static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t eta = (eta ^ c1) - c1 - 1; - c2 = -(g & 1); - g += (f & c2); g >>= 1; q += (u & c2); u <<= 1; r += (v & c2); v <<= 1; From cbd2d57dcee044de9a1fabc8887ff090a2fa4482 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 9 Sep 2020 14:31:42 +0700 Subject: [PATCH 32/34] Faster const-time divsteps --- src/field_10x26_impl.h | 26 +++++++++++++++----------- 
src/field_5x52_impl.h | 26 +++++++++++++++----------- src/scalar_4x64_impl.h | 26 +++++++++++++++----------- src/scalar_8x32_impl.h | 26 +++++++++++++++----------- 4 files changed, 60 insertions(+), 44 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 7faec253ac..16e28c7821 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1260,23 +1260,27 @@ static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); + c1 = (int32_t)eta >> 31; c2 = -(g & 1); - c1 = c2 & ((int32_t)eta >> 31); - x = (f ^ g) & c1; - f ^= x; g ^= x; g ^= c1; g -= c1; + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; - y = (u ^ q) & c1; - u ^= y; q ^= y; q ^= c1; q -= c1; + g += x & c2; + q += y & c2; + r += z & c2; - z = (v ^ r) & c1; - v ^= z; r ^= z; r ^= c1; r -= c1; + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); - eta = (eta ^ c1) - c1 - 1; + f += g & c1; + u += q & c1; + v += r & c1; - g += (f & c2); g >>= 1; - q += (u & c2); u <<= 1; - r += (v & c2); v <<= 1; + g >>= 1; + u <<= 1; + v <<= 1; } t[0] = (int32_t)u; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 73ffe44b2b..7d70e1a387 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -568,23 +568,27 @@ static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); + c1 = (int64_t)eta >> 63; c2 = -(g & 1); - c1 = c2 & ((int64_t)eta >> 63); - x = (f ^ g) & c1; - f ^= x; g ^= x; g ^= c1; g -= c1; + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; - y = (u ^ q) & c1; - u ^= y; q ^= y; q ^= c1; q -= c1; + g += x & c2; + q += y & c2; + r += z & c2; - z = (v ^ r) & c1; - v ^= z; r ^= z; r ^= c1; r -= c1; + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); - eta = (eta ^ c1) - c1 - 1; + f += g & c1; + u += q & c1; + v += r & c1; - g += (f & c2); g >>= 1; - q += (u & c2); u <<= 1; - r += (v & c2); v <<= 1; + g >>= 1; + u <<= 1; + v <<= 1; } t[0] = (int64_t)u; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 94b33f8cd3..00cf4842cf 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1020,23 +1020,27 @@ static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); + c1 = (int64_t)eta >> 63; c2 = -(g & 1); - c1 = c2 & ((int64_t)eta >> 63); - x = (f ^ g) & c1; - f ^= x; g ^= x; g ^= c1; g -= c1; + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; - y = (u ^ q) & c1; - u ^= y; q ^= y; q ^= c1; q -= c1; + g += x & c2; + q += y & c2; + r += z & c2; - z = (v ^ r) & c1; - v ^= z; r ^= z; r ^= c1; r -= c1; + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); - eta = (eta ^ c1) - c1 - 1; + f += g & c1; + u += q & c1; + v += r & c1; - g += (f & c2); g >>= 1; - q += (u & c2); u <<= 1; - r += (v & c2); v <<= 1; + g >>= 1; + u <<= 1; + v <<= 1; } t[0] = (int64_t)u; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index e7fb988947..baf647d83f 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -814,23 +814,27 @@ static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); + c1 = (int32_t)eta >> 31; c2 = -(g & 1); - c1 = c2 & ((int32_t)eta >> 31); - x = (f ^ g) & c1; - f ^= x; g ^= x; g ^= c1; g -= c1; + x = (f ^ c1) - c1; + y = 
(u ^ c1) - c1; + z = (v ^ c1) - c1; - y = (u ^ q) & c1; - u ^= y; q ^= y; q ^= c1; q -= c1; + g += x & c2; + q += y & c2; + r += z & c2; - z = (v ^ r) & c1; - v ^= z; r ^= z; r ^= c1; r -= c1; + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); - eta = (eta ^ c1) - c1 - 1; + f += g & c1; + u += q & c1; + v += r & c1; - g += (f & c2); g >>= 1; - q += (u & c2); u <<= 1; - r += (v & c2); v <<= 1; + g >>= 1; + u <<= 1; + v <<= 1; } t[0] = (int32_t)u; From 85da7a9e4d0fb3b2251572f4bc683f65fd54f6e7 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 10 Nov 2020 22:52:04 +0700 Subject: [PATCH 33/34] Rework _update_de I/O bounds --- src/field_5x52_impl.h | 132 ++++++++++++++++++++++++----------------- src/scalar_4x64_impl.h | 123 ++++++++++++++++++++++---------------- 2 files changed, 149 insertions(+), 106 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 7d70e1a387..83bdaf4281 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -498,43 +498,62 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static void secp256k1_fe_normalize_62(int64_t *r, int64_t cond_negate) { + /* P == 2^256 - C62 */ + const int64_t C62 = 0x1000003D1LL; + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3], r4 = r[4]; + int64_t c, cond_add; + + cond_add = r4 >> 63; + + c = r0 - (C62 & cond_add); + r0 = c & M62; c >>= 62; + c += r1; + r1 = c & M62; c >>= 62; + c += r2; + r2 = c & M62; c >>= 62; + c += r3; + r3 = c & M62; c >>= 62; + c += r4 + (256 & cond_add); + r4 = c; + + cond_add = (c >> 63) ^ cond_negate; + + c = (r0 ^ cond_negate) - cond_negate - (C62 & cond_add); + r[0] = c & M62; c >>= 62; + c += (r1 ^ cond_negate) - cond_negate; + r[1] = c & M62; c >>= 62; + c += (r2 ^ cond_negate) - cond_negate; + r[2] = c & M62; c >>= 62; + c += (r3 ^ cond_negate) - cond_negate; + r[3] = c & M62; c >>= 62; + c += (r4 ^ cond_negate) - cond_negate + (256 & cond_add); + r[4] = c; + + VERIFY_CHECK(c >> 8 == 0); +} + static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { const uint64_t M52 = UINT64_MAX >> 12; const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; - uint64_t r0, r1, r2, r3, r4, t; - - t = (int64_t)a4 >> 8; - /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); - VERIFY_CHECK(t == 0 || t == -(uint64_t)1); - - /* Add 2P if a4 is "negative". 
*/ - r0 = 0xFFFFDFFFFF85EULL & t; - r1 = 0xFFFFFFFFFFFFFULL & t; - r2 = 0xFFFFFFFFFFFFFULL & t; - r3 = 0xFFFFFFFFFFFFFULL & t; - r4 = 0x1FFFFFFFFFFFFULL & t; - - r0 += a0 & M52; - r1 += (a0 >> 52 | a1 << 10) & M52; - r2 += (a1 >> 42 | a2 << 20) & M52; - r3 += (a2 >> 32 | a3 << 30) & M52; - r4 += (a3 >> 22 | a4 << 40); - - r->n[0] = r0; - r->n[1] = r1; - r->n[2] = r2; - r->n[3] = r3; - r->n[4] = r4; + VERIFY_CHECK(a4 >> 8 == 0); + + r->n[0] = a0 & M52; + r->n[1] = (a0 >> 52 | a1 << 10) & M52; + r->n[2] = (a1 >> 42 | a2 << 20) & M52; + r->n[3] = (a2 >> 32 | a3 << 30) & M52; + r->n[4] = (a3 >> 22 | a4 << 40); #ifdef VERIFY r->magnitude = 1; - r->normalized = 0; + r->normalized = 1; secp256k1_fe_verify(r); #endif } @@ -679,22 +698,38 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) /* P == 2^256 - C62 */ const int64_t C62 = 0x1000003D1LL; - /* I62 == -P^-1 mod 2^62 */ - const int64_t I62 = 0x1838091DD2253531LL; + /* I62 == P^-1 mod 2^62 */ + const int64_t I62 = 0x27C7F6E22DDACACFLL; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4]; const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; - int64_t md, me; + int64_t md, me, sd, se; int128_t cd, ce; + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^62). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d4 >> 63; + se = e4 >> 63; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + cd = (int128_t)u * d0 + (int128_t)v * e0; ce = (int128_t)q * d0 + (int128_t)r * e0; - /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me - * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ - md = (I62 * 4 * (int64_t)cd) >> 2; - me = (I62 * 4 * (int64_t)ce) >> 2; + /* + * Subtract from md/me an extra term in the range [0, 2^62) such that the low 62 bits of each + * sum of products will be 0. This allows clean division by 2^62. On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. 
+ */ + + md -= (I62 * (int64_t)cd + md) & M62; + me -= (I62 * (int64_t)ce + me) & M62; cd -= (int128_t)C62 * md; ce -= (int128_t)C62 * me; @@ -821,8 +856,8 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_fe b0, b1; - int i, sign; + secp256k1_fe b0; + int i; uint64_t eta; #ifdef VERIFY int zero_in; @@ -855,19 +890,12 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); - sign = (f[0] >> 1) & 1; - - secp256k1_fe_decode_62(&b0, d); - - secp256k1_fe_negate(&b1, &b0, 1); - secp256k1_fe_cmov(&b0, &b1, sign); - secp256k1_fe_normalize_weak(&b0); + secp256k1_fe_normalize_62(d, f[4] >> 63); + secp256k1_fe_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(r) == !zero_in); #endif - - *r = b0; } static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { @@ -882,7 +910,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_fe b; - int i, j, len = 5, sign; + int i, j, len = 5; uint64_t eta; int64_t cond, fn, gn; #ifdef VERIFY @@ -938,20 +966,12 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ - sign = (f[0] >> 1) & 1; - - secp256k1_fe_decode_62(&b, d); - - if (sign) { - secp256k1_fe_negate(&b, &b, 1); - secp256k1_fe_normalize_weak(&b); - } + secp256k1_fe_normalize_62(d, f[len - 1] >> 63); + secp256k1_fe_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(r) == !zero_in); #endif - - *r = b; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 00cf4842cf..a053aa70c7 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -962,33 +962,55 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL ); +static void secp256k1_scalar_normalize_62(int64_t *r, int64_t cond_negate) { + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; + int64_t r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3], r4 = r[4]; + int64_t c, cond_add; + + cond_add = r4 >> 63; + + c = r0 + (P[0] & cond_add); + r0 = c & M62; c >>= 62; + c += r1 + (P[1] & cond_add); + r1 = c & M62; c >>= 62; + c += r2 + (P[2] & cond_add); + r2 = c & M62; c >>= 62; + c += r3; + r3 = c & M62; c >>= 62; + c += r4 + (P[4] & cond_add); + r4 = c; + + cond_add = (c >> 63) ^ cond_negate; + + c = (r0 ^ cond_negate) - cond_negate + (P[0] & cond_add); + r[0] = c & M62; c >>= 62; + c += (r1 ^ cond_negate) - cond_negate + (P[1] & cond_add); + r[1] = c & M62; c >>= 62; + c += (r2 ^ cond_negate) - cond_negate + (P[2] & cond_add); + r[2] = c & M62; c >>= 62; + c += (r3 ^ cond_negate) - cond_negate; + r[3] = c & M62; c >>= 62; + c += (r4 ^ cond_negate) - cond_negate + (P[4] & cond_add); + r[4] = c; + + VERIFY_CHECK(c >> 8 == 0); +} + static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { const uint64_t a0 = a[0], 
a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; - uint64_t r0, r1, r2, r3; - secp256k1_scalar u; - /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); - VERIFY_CHECK((int64_t)a4 >> 8 == 0 || (int64_t)a4 >> 8 == -(int64_t)1); - - r0 = a0 | a1 << 62; - r1 = a1 >> 2 | a2 << 60; - r2 = a2 >> 4 | a3 << 58; - r3 = a3 >> 6 | a4 << 56; + VERIFY_CHECK(a4 >> 8 == 0); - r->d[0] = r0; - r->d[1] = r1; - r->d[2] = r2; - r->d[3] = r3; - - secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); - - secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); - secp256k1_scalar_cmov(r, &u, a4 >> 63); + r->d[0] = a0 | a1 << 62; + r->d[1] = a1 >> 2 | a2 << 60; + r->d[2] = a2 >> 4 | a3 << 58; + r->d[3] = a3 >> 6 | a4 << 56; } static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { @@ -1129,23 +1151,39 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t *t) { - /* I62 == -P^-1 mod 2^62 */ - const int64_t I62 = 0x0B0DFF665588B13FLL; + /* I62 == P^-1 mod 2^62 */ + const int64_t I62 = 0x34F20099AA774EC1LL; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4]; const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; - int64_t md, me; + int64_t md, me, sd, se; int128_t cd, ce; + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^62). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d4 >> 63; + se = e4 >> 63; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + cd = (int128_t)u * d0 + (int128_t)v * e0; ce = (int128_t)q * d0 + (int128_t)r * e0; - /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me - * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ - md = (I62 * 4 * (int64_t)cd) >> 2; - me = (I62 * 4 * (int64_t)ce) >> 2; + /* + * Subtract from md/me an extra term in the range [0, 2^62) such that the low 62 bits of each + * sum of products will be 0. This allows clean division by 2^62. On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. + */ + + md -= (I62 * (int64_t)cd + md) & M62; + me -= (I62 * (int64_t)ce + me) & M62; cd += (int128_t)P[0] * md; ce += (int128_t)P[0] * me; @@ -1289,15 +1327,13 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b0; - int i, sign; + int i; uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b0 = *x; - secp256k1_scalar_encode_62(g, &b0); + secp256k1_scalar_encode_62(g, x); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
* @@ -1318,16 +1354,12 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); - sign = (f[0] >> 1) & 1; - - secp256k1_scalar_decode_62(&b0, d); - secp256k1_scalar_cond_negate(&b0, sign); + secp256k1_scalar_normalize_62(d, f[4] >> 63); + secp256k1_scalar_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(r) == !zero_in); #endif - - *r = b0; } SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { @@ -1346,16 +1378,14 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b; - int i, j, len = 5, sign; + int i, j, len = 5; uint64_t eta; int64_t cond, fn, gn; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b = *x; - secp256k1_scalar_encode_62(g, &b); + secp256k1_scalar_encode_62(g, x); /* The paper uses 'delta'; eta == -delta (a performance tweak). * @@ -1398,19 +1428,12 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ - sign = (f[0] >> 1) & 1; - - secp256k1_scalar_decode_62(&b, d); - - if (sign) { - secp256k1_scalar_negate(&b, &b); - } + secp256k1_scalar_normalize_62(d, f[len - 1] >> 63); + secp256k1_scalar_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(r) == !zero_in); #endif - - *r = b; } #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ From c9b77178270ddae9457069bb419f911ea1b9b63b Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 11 Nov 2020 16:46:36 +0700 Subject: [PATCH 34/34] Rework _update_de for 32bit --- src/field_10x26_impl.h | 171 ++++++++++++++++++++++++----------------- src/field_5x52_impl.h | 4 +- src/scalar_8x32_impl.h | 157 +++++++++++++++++++++++-------------- 3 files changed, 199 insertions(+), 133 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 16e28c7821..9841e2ba4b 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1164,16 +1164,65 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static void secp256k1_fe_normalize_30(int32_t *r, int32_t cond_negate) { + /* P == 2^256 - 2^32 - C30 */ + const int32_t C30 = 0x3D1L; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3], r4 = r[4], + r5 = r[5], r6 = r[6], r7 = r[7], r8 = r[8]; + int32_t c, cond_add; + + cond_add = r8 >> 31; + + c = r0 - (C30 & cond_add); + r0 = c & M30; c >>= 30; + c += r1 - (4 & cond_add);; + r1 = c & M30; c >>= 30; + c += r2; + r2 = c & M30; c >>= 30; + c += r3; + r3 = c & M30; c >>= 30; + c += r4; + r4 = c & M30; c >>= 30; + c += r5; + r5 = c & M30; c >>= 30; + c += r6; + r6 = c & M30; c >>= 30; + c += r7; + r7 = c & M30; c >>= 30; + c += r8 + (65536 & cond_add); + r8 = c; + + cond_add = (c >> 31) ^ cond_negate; + + c = (r0 ^ cond_negate) - cond_negate - (C30 & cond_add); + r[0] = c & M30; c >>= 30; + c += (r1 ^ cond_negate) - cond_negate - (4 & cond_add); + r[1] = c & M30; c >>= 30; + c += (r2 ^ cond_negate) - cond_negate; + r[2] = c & M30; c >>= 30; + c += (r3 ^ cond_negate) - cond_negate; + r[3] = c & 
M30; c >>= 30; + c += (r4 ^ cond_negate) - cond_negate; + r[4] = c & M30; c >>= 30; + c += (r5 ^ cond_negate) - cond_negate; + r[5] = c & M30; c >>= 30; + c += (r6 ^ cond_negate) - cond_negate; + r[6] = c & M30; c >>= 30; + c += (r7 ^ cond_negate) - cond_negate; + r[7] = c & M30; c >>= 30; + c += (r8 ^ cond_negate) - cond_negate + (65536 & cond_add); + r[8] = c; + + VERIFY_CHECK(c >> 16 == 0); +} + static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { const uint32_t M26 = UINT32_MAX >> 6; const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; - uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, t; - - t = (int32_t)a8 >> 16; - /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 30 == 0); VERIFY_CHECK(a1 >> 30 == 0); VERIFY_CHECK(a2 >> 30 == 0); @@ -1182,45 +1231,22 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); - VERIFY_CHECK(t == 0 || t == -(uint32_t)1); - - /* Add 2P if a8 is "negative". */ - r0 = 0x3FFF85EUL & t; - r1 = 0x3FFFF7FUL & t; - r2 = 0x3FFFFFFUL & t; - r3 = 0x3FFFFFFUL & t; - r4 = 0x3FFFFFFUL & t; - r5 = 0x3FFFFFFUL & t; - r6 = 0x3FFFFFFUL & t; - r7 = 0x3FFFFFFUL & t; - r8 = 0x3FFFFFFUL & t; - r9 = 0x07FFFFFUL & t; - - r0 += a0 & M26; - r1 += (a0 >> 26 | a1 << 4) & M26; - r2 += (a1 >> 22 | a2 << 8) & M26; - r3 += (a2 >> 18 | a3 << 12) & M26; - r4 += (a3 >> 14 | a4 << 16) & M26; - r5 += (a4 >> 10 | a5 << 20) & M26; - r6 += (a5 >> 6 | a6 << 24) & M26; - r7 += (a6 >> 2 ) & M26; - r8 += (a6 >> 28 | a7 << 2) & M26; - r9 += (a7 >> 24 | a8 << 6); - - r->n[0] = r0; - r->n[1] = r1; - r->n[2] = r2; - r->n[3] = r3; - r->n[4] = r4; - r->n[5] = r5; - r->n[6] = r6; - r->n[7] = r7; - r->n[8] = r8; - r->n[9] = r9; + VERIFY_CHECK(a8 >> 16 == 0); + + r->n[0] = a0 & M26; + r->n[1] = (a0 >> 26 | a1 << 4) & M26; + r->n[2] = (a1 >> 22 | a2 << 8) & M26; + r->n[3] = (a2 >> 18 | a3 << 12) & M26; + r->n[4] = (a3 >> 14 | a4 << 16) & M26; + r->n[5] = (a4 >> 10 | a5 << 20) & M26; + r->n[6] = (a5 >> 6 | a6 << 24) & M26; + r->n[7] = (a6 >> 2 ) & M26; + r->n[8] = (a6 >> 28 | a7 << 2) & M26; + r->n[9] = (a7 >> 24 | a8 << 6); #ifdef VERIFY r->magnitude = 1; - r->normalized = 0; + r->normalized = 1; secp256k1_fe_verify(r); #endif } @@ -1375,25 +1401,41 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) { /* P == 2^256 - 2^32 - C30 */ - const int64_t C30 = 0x3D1L; - /* I30 == -P^-1 mod 2^30 */ - const int32_t I30 = 0x12253531L; + const int32_t C30 = 0x3D1L; + /* I30 == P^-1 mod 2^30 */ + const int32_t I30 = 0x2DDACACFL; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; - int32_t di, ei, md, me; + int32_t di, ei, md, me, sd, se; int64_t cd, ce; int i; + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^30). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d[8] >> 31; + se = e[8] >> 31; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + di = d[0]; ei = e[0]; cd = (int64_t)u * di + (int64_t)v * ei; ce = (int64_t)q * di + (int64_t)r * ei; - /* Calculate the multiples of P to add, to zero the 30 bottom bits. 
We choose md, me - * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */ - md = (I30 * 4 * (int32_t)cd) >> 2; - me = (I30 * 4 * (int32_t)ce) >> 2; + /* + * Subtract from md/me an extra term in the range [0, 2^30) such that the low 30 bits of each + * sum of products will be 0. This allows clean division by 2^30. On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. + */ + + md -= (I30 * (int32_t)cd + md) & M30; + me -= (I30 * (int32_t)ce + me) & M30; cd -= (int64_t)C30 * md; ce -= (int64_t)C30 * me; @@ -1513,8 +1555,8 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { int32_t f[9] = { 0x3FFFFC2F, 0x3FFFFFFB, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; - secp256k1_fe b0, b1; - int i, sign; + secp256k1_fe b0; + int i; uint32_t eta; #ifdef VERIFY int zero_in; @@ -1547,19 +1589,12 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0); - sign = (f[0] >> 1) & 1; - - secp256k1_fe_decode_30(&b0, d); - - secp256k1_fe_negate(&b1, &b0, 2); - secp256k1_fe_cmov(&b0, &b1, sign); - secp256k1_fe_normalize_weak(&b0); + secp256k1_fe_normalize_30(d, f[8] >> 31); + secp256k1_fe_decode_30(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_is_zero(r) == !zero_in); #endif - - *r = b0; } static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { @@ -1574,7 +1609,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; secp256k1_fe b; - int i, j, len = 9, sign; + int i, j, len = 9; uint32_t eta; int32_t cond, fn, gn; #ifdef VERIFY @@ -1630,20 +1665,12 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. 
*/ - sign = (f[0] >> 1) & 1; - - secp256k1_fe_decode_30(&b, d); - - if (sign) { - secp256k1_fe_negate(&b, &b, 1); - secp256k1_fe_normalize_weak(&b); - } + secp256k1_fe_normalize_30(d, f[len - 1] >> 31); + secp256k1_fe_decode_30(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_is_zero(r) == !zero_in); #endif - - *r = b; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 83bdaf4281..1af2c15fce 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -894,7 +894,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(r) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_is_zero(r) == !zero_in); #endif } @@ -970,7 +970,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(r) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_is_zero(r) == !zero_in); #endif } diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index baf647d83f..ba52b7d54a 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -738,14 +738,65 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL ); +static void secp256k1_scalar_normalize_30(int32_t *r, int32_t cond_negate) { + const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, + 0, 0, 0, 65536 }; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3], r4 = r[4], + r5 = r[5], r6 = r[6], r7 = r[7], r8 = r[8]; + int32_t c, cond_add; + + cond_add = r8 >> 31; + + c = r0 + (P[0] & cond_add); + r0 = c & M30; c >>= 30; + c += r1 + (P[1] & cond_add); + r1 = c & M30; c >>= 30; + c += r2 + (P[2] & cond_add); + r2 = c & M30; c >>= 30; + c += r3 + (P[3] & cond_add); + r3 = c & M30; c >>= 30; + c += r4 + (P[4] & cond_add); + r4 = c & M30; c >>= 30; + c += r5; + r5 = c & M30; c >>= 30; + c += r6; + r6 = c & M30; c >>= 30; + c += r7; + r7 = c & M30; c >>= 30; + c += r8 + (P[8] & cond_add); + r8 = c; + + cond_add = (c >> 31) ^ cond_negate; + + c = (r0 ^ cond_negate) - cond_negate + (P[0] & cond_add); + r[0] = c & M30; c >>= 30; + c += (r1 ^ cond_negate) - cond_negate + (P[1] & cond_add); + r[1] = c & M30; c >>= 30; + c += (r2 ^ cond_negate) - cond_negate + (P[2] & cond_add); + r[2] = c & M30; c >>= 30; + c += (r3 ^ cond_negate) - cond_negate + (P[3] & cond_add); + r[3] = c & M30; c >>= 30; + c += (r4 ^ cond_negate) - cond_negate + (P[4] & cond_add); + r[4] = c & M30; c >>= 30; + c += (r5 ^ cond_negate) - cond_negate; + r[5] = c & M30; c >>= 30; + c += (r6 ^ cond_negate) - cond_negate; + r[6] = c & M30; c >>= 30; + c += (r7 ^ cond_negate) - cond_negate; + r[7] = c & M30; c >>= 30; + c += (r8 ^ cond_negate) - cond_negate + (P[8] & cond_add); + r[8] = c; + + VERIFY_CHECK(c >> 16 == 0); +} + + static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; - uint32_t r0, r1, r2, r3, r4, r5, r6, r7; - secp256k1_scalar u; - /* a must be in the range [-2^256, 2^256). 
*/ VERIFY_CHECK(a0 >> 30 == 0); VERIFY_CHECK(a1 >> 30 == 0); VERIFY_CHECK(a2 >> 30 == 0); @@ -754,30 +805,16 @@ static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); - VERIFY_CHECK((int32_t)a8 >> 16 == 0 || (int32_t)a8 >> 16 == -(int32_t)1); - - r0 = a0 | a1 << 30; - r1 = a1 >> 2 | a2 << 28; - r2 = a2 >> 4 | a3 << 26; - r3 = a3 >> 6 | a4 << 24; - r4 = a4 >> 8 | a5 << 22; - r5 = a5 >> 10 | a6 << 20; - r6 = a6 >> 12 | a7 << 18; - r7 = a7 >> 14 | a8 << 16; - - r->d[0] = r0; - r->d[1] = r1; - r->d[2] = r2; - r->d[3] = r3; - r->d[4] = r4; - r->d[5] = r5; - r->d[6] = r6; - r->d[7] = r7; - - secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); - - secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); - secp256k1_scalar_cmov(r, &u, a8 >> 31); + VERIFY_CHECK(a8 >> 16 == 0); + + r->d[0] = a0 | a1 << 30; + r->d[1] = a1 >> 2 | a2 << 28; + r->d[2] = a2 >> 4 | a3 << 26; + r->d[3] = a3 >> 6 | a4 << 24; + r->d[4] = a4 >> 8 | a5 << 22; + r->d[5] = a5 >> 10 | a6 << 20; + r->d[6] = a6 >> 12 | a7 << 18; + r->d[7] = a7 >> 14 | a8 << 16; } static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { @@ -928,25 +965,42 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t *t) { - /* I30 == -P^-1 mod 2^30 */ - const int32_t I30 = 0x1588B13FL; + /* I30 == P^-1 mod 2^30 */ + const int32_t I30 = 0x2A774EC1L; const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, 0, 0, 0, 65536 }; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); - int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t di, ei, md, me, sd, se; int64_t cd, ce; int i; + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^30). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d[8] >> 31; + se = e[8] >> 31; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + di = d[0]; ei = e[0]; cd = (int64_t)u * di + (int64_t)v * ei; ce = (int64_t)q * di + (int64_t)r * ei; - /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me - * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */ - md = (I30 * 4 * (int32_t)cd) >> 2; - me = (I30 * 4 * (int32_t)ce) >> 2; + /* + * Subtract from md/me an extra term in the range [0, 2^30) such that the low 30 bits of each + * sum of products will be 0. This allows clean division by 2^30. On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. + */ + + md -= (I30 * (int32_t)cd + md) & M30; + me -= (I30 * (int32_t)ce + me) & M30; cd += (int64_t)P[0] * md; ce += (int64_t)P[0] * me; @@ -1065,15 +1119,13 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b0; - int i, sign; + int i; uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b0 = *x; - secp256k1_scalar_encode_30(g, &b0); + secp256k1_scalar_encode_30(g, x); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
* @@ -1094,16 +1146,12 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0); - sign = (f[0] >> 1) & 1; - - secp256k1_scalar_decode_30(&b0, d); - secp256k1_scalar_cond_negate(&b0, sign); + secp256k1_scalar_normalize_30(d, f[8] >> 31); + secp256k1_scalar_decode_30(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(r) == !zero_in); #endif - - *r = b0; } SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { @@ -1122,16 +1170,14 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b; - int i, j, len = 9, sign; + int i, j, len = 9; uint32_t eta; int32_t cond, fn, gn; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b = *x; - secp256k1_scalar_encode_30(g, &b); + secp256k1_scalar_encode_30(g, x); /* The paper uses 'delta'; eta == -delta (a performance tweak). * @@ -1174,19 +1220,12 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ - sign = (f[0] >> 1) & 1; - - secp256k1_scalar_decode_30(&b, d); - - if (sign) { - secp256k1_scalar_negate(&b, &b); - } + secp256k1_scalar_normalize_30(d, f[len - 1] >> 31); + secp256k1_scalar_decode_30(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(r) == !zero_in); #endif - - *r = b; } #endif /* SECP256K1_SCALAR_REPR_IMPL_H */