From 9fbe4854fe9b937f151fc1dac656a54ba8a64222 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 15 Jul 2020 14:34:11 +0700 Subject: [PATCH 01/34] "safegcd" field and scalar inversion - see "Fast constant-time gcd computation and modular inversion" by Daniel J. Bernstein and Bo-Yin Yang https://gcd.cr.yp.to() --- src/field_10x26.h | 2 + src/field_5x52_impl.h | 363 ++++++++++++++++++++++++++++++++ src/field_impl.h | 2 + src/scalar_4x64_impl.h | 457 +++++++++++++++++++++++++++++++++++++++++ src/scalar_8x32.h | 2 + src/scalar_impl.h | 4 +- src/util.h | 1 + 7 files changed, 830 insertions(+), 1 deletion(-) diff --git a/src/field_10x26.h b/src/field_10x26.h index 5ff03c8abc..312a94c3ae 100644 --- a/src/field_10x26.h +++ b/src/field_10x26.h @@ -47,4 +47,6 @@ typedef struct { #define SECP256K1_FE_STORAGE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }} #define SECP256K1_FE_STORAGE_CONST_GET(d) d.n[7], d.n[6], d.n[5], d.n[4],d.n[3], d.n[2], d.n[1], d.n[0] +#define SECP256K1_FE_INV_DEFAULT + #endif /* SECP256K1_FIELD_REPR_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 71a38f915b..a08032afb7 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -498,4 +498,367 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( + 0x0E90A100UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, + 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL +); + +static void secp256k1_fe_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { + + int64_t z0, z1, z2, z3; + int128_t tt; + + tt = (int128_t)a0 * b0 + + (int128_t)c0 * d0; + z0 = (int64_t)tt; tt -= z0; tt >>= 64; + + tt += (int128_t)a0 * b1 + + (int128_t)a1 * b0 + + (int128_t)c0 * d1 + + (int128_t)c1 * d0; + z1 = (int64_t)tt; tt -= z1; tt >>= 64; + + tt += (int128_t)a1 * b1 + + (int128_t)c1 * d1; + z2 = (int64_t)tt; tt -= z2; tt >>= 64; + + z3 = (int64_t)tt; + + t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; +} + +static void secp256k1_fe_combine_1s(int64_t *t) { + + int64_t a = t[0], b = t[1], c = t[2], d = t[3], + e = t[4], f = t[5], g = t[6], h = t[7]; + int128_t I, J, K, L; + + I = (int128_t)e * a + (int128_t)f * c; + J = (int128_t)e * b + (int128_t)f * d; + K = (int128_t)g * a + (int128_t)h * c; + L = (int128_t)g * b + (int128_t)h * d; + + a = (int64_t)I; I -= a; I >>= 64; b = (int64_t)I; + c = (int64_t)J; J -= c; J >>= 64; d = (int64_t)J; + e = (int64_t)K; K -= e; K >>= 64; f = (int64_t)K; + g = (int64_t)L; L -= g; L >>= 64; h = (int64_t)L; + + t[0] = a; t[1] = b; t[2] = c; t[3] = d; + t[4] = e; t[5] = f; t[6] = g; t[7] = h; +} + +static void secp256k1_fe_combine_2s(int64_t *t) { + + int64_t a0 = t[ 0], a1 = t[ 1]; + int64_t b0 = t[ 2], b1 = t[ 3]; + int64_t c0 = t[ 4], c1 = t[ 5]; + int64_t d0 = t[ 6], d1 = t[ 7]; + int64_t e0 = t[ 8], e1 = t[ 9]; + int64_t f0 = t[10], f1 = t[11]; + int64_t g0 = t[12], g1 = t[13]; + int64_t h0 = t[14], h1 = t[15]; + + secp256k1_fe_mul_add(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_fe_mul_add(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_fe_mul_add(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_fe_mul_add(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); +} + +static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { + + uint64_t u0, u1, u2, u3, u4; + uint64_t r0, r1, r2, r3, r4; + + /* TODO Need proper carry chain */ + + u0 = (uint64_t)t[0]; + u1 = 
(uint64_t)t[1] - (u0 >> 63); + u2 = (uint64_t)t[2] - (u1 >> 63); + u3 = (uint64_t)t[3] - (u2 >> 63); + u4 = - (u3 >> 63); + + r0 = 0xFFFFEFFFFFC2FULL * 2; + r1 = 0xFFFFFFFFFFFFFULL * 2; + r2 = 0xFFFFFFFFFFFFFULL * 2; + r3 = 0xFFFFFFFFFFFFFULL * 2; + r4 = 0x0FFFFFFFFFFFFULL * 2; + + r0 += u0 & 0xFFFFFFFFFFFFFULL; + r1 += u0 >> 52 | ((u1 << 12) & 0xFFFFFFFFFFFFFULL); + r2 += u1 >> 40 | ((u2 << 24) & 0xFFFFFFFFFFFFFULL); + r3 += u2 >> 28 | ((u3 << 36) & 0xFFFFFFFFFFFFFULL); + r4 += u3 >> 16 | (u4 << 48); + + r->n[0] = r0; + r->n[1] = r1; + r->n[2] = r2; + r->n[3] = r3; + r->n[4] = r4; + +#ifdef VERIFY + /* TODO Probably 2 is enough? */ + r->magnitude = 3; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { + + const uint64_t M62 = UINT64_MAX >> 2; + const uint64_t *n = &a->n[0]; + uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4]; + +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif + + r[0] = (a0 | a1 << 52) & M62; + r[1] = (a1 >> 10 | a2 << 42) & M62; + r[2] = (a2 >> 20 | a3 << 32) & M62; + r[3] = (a3 >> 30 | a4 << 22) & M62; + r[4] = a4 >> 40; +} + +static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f, uint64_t g, int64_t *t) { + + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t c1, c2, x, y, z; + int i; + + for (i = 0; i < 62; ++i) { + + c1 = -(g & (eta >> 15)); + + x = (f ^ g) & c1; + f ^= x; g ^= x; g ^= c1; g -= c1; + + y = (u ^ q) & c1; + u ^= y; q ^= y; q ^= c1; q -= c1; + + z = (v ^ r) & c1; + v ^= z; r ^= z; r ^= c1; r -= c1; + + eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + + c2 = -(g & 1); + + g += (f & c2); g >>= 1; + q += (u & c2); u <<= 1; + r += (v & c2); v <<= 1; + } + + t[0] = (int64_t)u; + t[1] = (int64_t)v; + t[2] = (int64_t)q; + t[3] = (int64_t)r; + + return eta; +} + +static void secp256k1_fe_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) { + + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + int128_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + VERIFY_CHECK(((int64_t)cf & M62) == 0); + VERIFY_CHECK(((int64_t)cg & M62) == 0); + + cf >>= 62; + cg >>= 62; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + f[i - 1] = (int64_t)cf & M62; cf >>= 62; + g[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + + f[i - 1] = (int64_t)cf; + g[i - 1] = (int64_t)cg; +} + +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { + +#if 1 + + /* TODO Check for a == 0? */ + + int64_t t[12 * 4]; + int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + int64_t g[5]; + secp256k1_fe b0, d0, a1, b1, c1, d1; + int i, len, sign; + int16_t eta; + + /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input + * by 2^768, and then the output by 2^24. */ + /* Instead of dividing the output by 2^744, we scale the input. */ + secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + secp256k1_fe_normalize(&b0); + secp256k1_fe_encode_62(&g[0], &b0); + + eta = -1; + + for (i = 0; i < 12; ++i) { + eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], &t[i * 4]); + len = i <= 6 ? 5 : i >= 10 ? 
1 : 11 - i; + secp256k1_fe_update_fg(len, f, g, &t[i * 4]); + } + + /* At this point, f must equal +/- 1 (the GCD). */ + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 16; + secp256k1_fe_combine_1s(&t[tOff + 0]); + secp256k1_fe_combine_1s(&t[tOff + 8]); + secp256k1_fe_combine_2s(&t[tOff + 0]); + } + + /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ + secp256k1_fe_decode_matrix(&b0, &t[4]); + /* secp256k1_fe_decode_matrix(&c0, &t[8]); */ + secp256k1_fe_decode_matrix(&d0, &t[12]); + + secp256k1_fe_decode_matrix(&a1, &t[16]); + secp256k1_fe_decode_matrix(&b1, &t[20]); + secp256k1_fe_decode_matrix(&c1, &t[24]); + secp256k1_fe_decode_matrix(&d1, &t[28]); + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + secp256k1_fe_mul(&c1, &c1, &b0); + secp256k1_fe_mul(&d1, &d1, &d0); + + b0 = a1; secp256k1_fe_add(&b0, &b1); + d0 = c1; secp256k1_fe_add(&d0, &d1); + + secp256k1_fe_decode_matrix(&a1, &t[32]); + secp256k1_fe_decode_matrix(&b1, &t[36]); + /* secp256k1_fe_decode_matrix(&c1, &t[40]); */ + /* secp256k1_fe_decode_matrix(&d1, &t[44]); */ + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + /* secp256k1_fe_mul(&c1, &c1, &b0); */ + /* secp256k1_fe_mul(&d1, &d1, &d0); */ + + b0 = a1; secp256k1_fe_add(&b0, &b1); + /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + + secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_cmov(&b0, &b1, sign); + secp256k1_fe_normalize_weak(&b0); + + *r = b0; + +#else + + secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; + int j; + + /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in + * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: + * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + */ + + secp256k1_fe_sqr(&x2, a); + secp256k1_fe_mul(&x2, &x2, a); + + secp256k1_fe_sqr(&x3, &x2); + secp256k1_fe_mul(&x3, &x3, a); + + x6 = x3; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x6, &x6); + } + secp256k1_fe_mul(&x6, &x6, &x3); + + x9 = x6; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x9, &x9); + } + secp256k1_fe_mul(&x9, &x9, &x3); + + x11 = x9; + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&x11, &x11); + } + secp256k1_fe_mul(&x11, &x11, &x2); + + x22 = x11; + for (j=0; j<11; j++) { + secp256k1_fe_sqr(&x22, &x22); + } + secp256k1_fe_mul(&x22, &x22, &x11); + + x44 = x22; + for (j=0; j<22; j++) { + secp256k1_fe_sqr(&x44, &x44); + } + secp256k1_fe_mul(&x44, &x44, &x22); + + x88 = x44; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x88, &x88); + } + secp256k1_fe_mul(&x88, &x88, &x44); + + x176 = x88; + for (j=0; j<88; j++) { + secp256k1_fe_sqr(&x176, &x176); + } + secp256k1_fe_mul(&x176, &x176, &x88); + + x220 = x176; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x220, &x220); + } + secp256k1_fe_mul(&x220, &x220, &x44); + + x223 = x220; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x223, &x223); + } + secp256k1_fe_mul(&x223, &x223, &x3); + + /* The final result is then assembled using a sliding window over the blocks. 
*/ + + t1 = x223; + for (j=0; j<23; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x22); + for (j=0; j<5; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, a); + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x2); + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(r, a, &t1); +#endif +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_impl.h b/src/field_impl.h index 485921a60e..c2b1cd2df2 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -136,6 +136,7 @@ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a) { return secp256k1_fe_equal(&t1, a); } +#if defined(SECP256K1_FE_INV_DEFAULT) static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; int j; @@ -225,6 +226,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { } secp256k1_fe_mul(r, a, &t1); } +#endif static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #if defined(USE_FIELD_INV_BUILTIN) diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8f539c4bc6..1fcf1ba37c 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -957,4 +957,461 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1); } +static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALAR_CONST( + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFDUL, + 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL +); + +static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CONST( + 0x4E165355UL, 0x5D800C18UL, 0xEF116DB1UL, 0xB31347F1UL, + 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL +); + +static void secp256k1_scalar_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { + + int64_t z0, z1, z2, z3; + int128_t tt; + + tt = (int128_t)a0 * b0 + + (int128_t)c0 * d0; + z0 = (int64_t)tt; tt -= z0; tt >>= 64; + + tt += (int128_t)a0 * b1 + + (int128_t)a1 * b0 + + (int128_t)c0 * d1 + + (int128_t)c1 * d0; + z1 = (int64_t)tt; tt -= z1; tt >>= 64; + + tt += (int128_t)a1 * b1 + + (int128_t)c1 * d1; + z2 = (int64_t)tt; tt -= z2; tt >>= 64; + + z3 = (int64_t)tt; + + t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; +} + +static void secp256k1_scalar_combine_1s(int64_t *t) { + + int64_t a = t[0], b = t[1], c = t[2], d = t[3], + e = t[4], f = t[5], g = t[6], h = t[7]; + int128_t I, J, K, L; + + I = (int128_t)e * a + (int128_t)f * c; + J = (int128_t)e * b + (int128_t)f * d; + K = (int128_t)g * a + (int128_t)h * c; + L = (int128_t)g * b + (int128_t)h * d; + + a = (int64_t)I; I -= a; I >>= 64; b = (int64_t)I; + c = (int64_t)J; J -= c; J >>= 64; d = (int64_t)J; + e = (int64_t)K; K -= e; K >>= 64; f = (int64_t)K; + g = (int64_t)L; L -= g; L >>= 64; h = (int64_t)L; + + t[0] = a; t[1] = b; t[2] = c; t[3] = d; + t[4] = e; t[5] = f; t[6] = g; t[7] = h; +} + +static void secp256k1_scalar_combine_2s(int64_t *t) { + + int64_t a0 = t[ 0], a1 = t[ 1]; + int64_t b0 = t[ 2], b1 = t[ 3]; + int64_t c0 = t[ 4], c1 = t[ 5]; + int64_t d0 = t[ 6], d1 = t[ 7]; + int64_t e0 = t[ 8], e1 = t[ 9]; + int64_t f0 = t[10], f1 = t[11]; + int64_t g0 = t[12], g1 = t[13]; + int64_t h0 = t[14], h1 = t[15]; + + secp256k1_scalar_mul_add(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_scalar_mul_add(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + 
secp256k1_scalar_mul_add(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_scalar_mul_add(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); +} + +static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { + +#if 1 + + uint64_t r0, r1, r2, r3; + int flag; + secp256k1_scalar u; + + /* TODO Need proper carry chain */ + r0 = (uint64_t)t[0]; + r1 = (uint64_t)t[1] - (r0 >> 63); + r2 = (uint64_t)t[2] - (r1 >> 63); + r3 = (uint64_t)t[3] - (r2 >> 63); + + flag = (int)(r3 >> 63); + + r->d[0] = r0; + r->d[1] = r1; + r->d[2] = r2; + r->d[3] = r3; + + secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); + secp256k1_scalar_cmov(r, &u, flag); + +#else + + uint64_t u0, u1, u2, u3, u4; + uint64_t r0, r1, r2, r3, r4; + + u0 = (uint64_t)t[0]; + u1 = (uint64_t)t[1] - (u0 >> 63); + u2 = (uint64_t)t[2] - (u1 >> 63); + u3 = (uint64_t)t[3] - (u2 >> 63); + u4 = - (u3 >> 63); + + r0 = 0xFFFFEFFFFFC2FULL * 2; + r1 = 0xFFFFFFFFFFFFFULL * 2; + r2 = 0xFFFFFFFFFFFFFULL * 2; + r3 = 0xFFFFFFFFFFFFFULL * 2; + r4 = 0x0FFFFFFFFFFFFULL * 2; + + r0 += u0 & 0xFFFFFFFFFFFFFULL; + r1 += u0 >> 52 | ((u1 << 12) & 0xFFFFFFFFFFFFFULL); + r2 += u1 >> 40 | ((u2 << 24) & 0xFFFFFFFFFFFFFULL); + r3 += u2 >> 28 | ((u3 << 36) & 0xFFFFFFFFFFFFFULL); + r4 += u3 >> 16 | (u4 << 48); + + r->n[0] = r0; + r->n[1] = r1; + r->n[2] = r2; + r->n[3] = r3; + r->n[4] = r4; + +#endif +} + +static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { + + const uint64_t M62 = UINT64_MAX >> 2; + const uint64_t *d = &a->d[0]; + uint64_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3]; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif + + r[0] = a0 & M62; + r[1] = (a0 >> 62 | a1 << 2) & M62; + r[2] = (a1 >> 60 | a2 << 4) & M62; + r[3] = (a2 >> 58 | a3 << 6) & M62; + r[4] = a3 >> 56; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif +} + +static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f, uint64_t g, int64_t *t) { + + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t c1, c2, x, y, z; + int i; + + for (i = 0; i < 62; ++i) { + + c1 = -(g & (eta >> 15)); + + x = (f ^ g) & c1; + f ^= x; g ^= x; g ^= c1; g -= c1; + + y = (u ^ q) & c1; + u ^= y; q ^= y; q ^= c1; q -= c1; + + z = (v ^ r) & c1; + v ^= z; r ^= z; r ^= c1; r -= c1; + + eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + + c2 = -(g & 1); + + g += (f & c2); g >>= 1; + q += (u & c2); u <<= 1; + r += (v & c2); v <<= 1; + } + + t[0] = (int64_t)u; + t[1] = (int64_t)v; + t[2] = (int64_t)q; + t[3] = (int64_t)r; + + return eta; +} + +static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) { + + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + int128_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + VERIFY_CHECK(((int64_t)cf & M62) == 0); + VERIFY_CHECK(((int64_t)cg & M62) == 0); + + cf >>= 62; + cg >>= 62; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + f[i - 1] = (int64_t)cf & M62; cf >>= 62; + g[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + + f[i - 1] = (int64_t)cf; + g[i - 1] = (int64_t)cg; +} + +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + +#if defined(EXHAUSTIVE_TEST_ORDER) + int i; + *r = 0; + for (i = 0; i < 
EXHAUSTIVE_TEST_ORDER; i++) + if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) + *r = i; + /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus + * have a composite group order; fix it in exhaustive_tests.c). */ + VERIFY_CHECK(*r != 0); +} +#elif 1 + + /* TODO Check for x == 0? */ + + int64_t t[12 * 4]; + int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + int64_t g[5]; + secp256k1_scalar b0, d0, a1, b1, c1, d1; + int i, len, sign; + int16_t eta; + + /* Instead of dividing the output by 2^744, we scale the input. */ + secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + secp256k1_scalar_encode_62(&g[0], &b0); + + eta = -1; + + for (i = 0; i < 12; ++i) { + eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], &t[i * 4]); + len = i <= 6 ? 5 : i >= 10 ? 1 : 11 - i; + secp256k1_scalar_update_fg(len, f, g, &t[i * 4]); + } + + /* At this point, f must equal +/- 1 (the GCD). */ + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 16; + secp256k1_scalar_combine_1s(&t[tOff + 0]); + secp256k1_scalar_combine_1s(&t[tOff + 8]); + secp256k1_scalar_combine_2s(&t[tOff + 0]); + } + + /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ + secp256k1_scalar_decode_matrix(&b0, &t[4]); + /* secp256k1_scalar_decode_matrix(&c0, &t[8]); */ + secp256k1_scalar_decode_matrix(&d0, &t[12]); + + secp256k1_scalar_decode_matrix(&a1, &t[16]); + secp256k1_scalar_decode_matrix(&b1, &t[20]); + secp256k1_scalar_decode_matrix(&c1, &t[24]); + secp256k1_scalar_decode_matrix(&d1, &t[28]); + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + secp256k1_scalar_mul(&c1, &c1, &b0); + secp256k1_scalar_mul(&d1, &d1, &d0); + + secp256k1_scalar_add(&b0, &a1, &b1); + secp256k1_scalar_add(&d0, &c1, &d1); + + secp256k1_scalar_decode_matrix(&a1, &t[32]); + secp256k1_scalar_decode_matrix(&b1, &t[36]); + /* secp256k1_scalar_decode_matrix(&c1, &t[40]); */ + /* secp256k1_scalar_decode_matrix(&d1, &t[44]); */ + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + /* secp256k1_scalar_mul(&c1, &c1, &b0); */ + /* secp256k1_scalar_mul(&d1, &d1, &d0); */ + + secp256k1_scalar_add(&b0, &a1, &b1); + /* secp256k1_scalar_add(&d0, &c1, &d1); */ + + secp256k1_scalar_cond_negate(&b0, sign); + + *r = b0; +} +#else + secp256k1_scalar *t; + int i; + /* First compute xN as x ^ (2^N - 1) for some values of N, + * and uM as x ^ M for some values of M. 
*/ + secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; + secp256k1_scalar u2, u5, u9, u11, u13; + + secp256k1_scalar_sqr(&u2, x); + secp256k1_scalar_mul(&x2, &u2, x); + secp256k1_scalar_mul(&u5, &u2, &x2); + secp256k1_scalar_mul(&x3, &u5, &u2); + secp256k1_scalar_mul(&u9, &x3, &u2); + secp256k1_scalar_mul(&u11, &u9, &u2); + secp256k1_scalar_mul(&u13, &u11, &u2); + + secp256k1_scalar_sqr(&x6, &u13); + secp256k1_scalar_sqr(&x6, &x6); + secp256k1_scalar_mul(&x6, &x6, &u11); + + secp256k1_scalar_sqr(&x8, &x6); + secp256k1_scalar_sqr(&x8, &x8); + secp256k1_scalar_mul(&x8, &x8, &x2); + + secp256k1_scalar_sqr(&x14, &x8); + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(&x14, &x14); + } + secp256k1_scalar_mul(&x14, &x14, &x6); + + secp256k1_scalar_sqr(&x28, &x14); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x28, &x28); + } + secp256k1_scalar_mul(&x28, &x28, &x14); + + secp256k1_scalar_sqr(&x56, &x28); + for (i = 0; i < 27; i++) { + secp256k1_scalar_sqr(&x56, &x56); + } + secp256k1_scalar_mul(&x56, &x56, &x28); + + secp256k1_scalar_sqr(&x112, &x56); + for (i = 0; i < 55; i++) { + secp256k1_scalar_sqr(&x112, &x112); + } + secp256k1_scalar_mul(&x112, &x112, &x56); + + secp256k1_scalar_sqr(&x126, &x112); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x126, &x126); + } + secp256k1_scalar_mul(&x126, &x126, &x14); + + /* Then accumulate the final result (t starts at x126). */ + t = &x126; + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 10; i++) { /* 0000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 9; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(t, t); + } + 
secp256k1_scalar_mul(t, t, &x2); /* 11 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 10; i++) { /* 000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, x); /* 1 */ + for (i = 0; i < 8; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +} +#endif + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_8x32.h b/src/scalar_8x32.h index 2c9a348e24..10c55f1f8b 100644 --- a/src/scalar_8x32.h +++ b/src/scalar_8x32.h @@ -16,4 +16,6 @@ typedef struct { #define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{(d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7)}} +#define SECP256K1_SCALAR_INV_DEFAULT + #endif /* SECP256K1_SCALAR_REPR_H */ diff --git a/src/scalar_impl.h b/src/scalar_impl.h index 70cd73db06..2318fcb0fd 100644 --- a/src/scalar_impl.h +++ b/src/scalar_impl.h @@ -61,6 +61,7 @@ static int secp256k1_scalar_set_b32_seckey(secp256k1_scalar *r, const unsigned c return (!overflow) & (!secp256k1_scalar_is_zero(r)); } +#if defined(SECP256K1_SCALAR_INV_DEFAULT) static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(EXHAUSTIVE_TEST_ORDER) int i; @@ -225,11 +226,12 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar } secp256k1_scalar_mul(r, t, &x6); /* 111111 */ } +#endif +#endif SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { return !(a->d[0] & 1); } -#endif static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(USE_SCALAR_INV_BUILTIN) diff --git a/src/util.h b/src/util.h index 8289e23e0c..b4f7b77344 100644 --- a/src/util.h +++ b/src/util.h @@ -176,6 +176,7 @@ static SECP256K1_INLINE void *manual_alloc(void** prealloc_ptr, size_t alloc_siz # else # define SECP256K1_GNUC_EXT # endif +SECP256K1_GNUC_EXT typedef __int128 int128_t; SECP256K1_GNUC_EXT typedef unsigned __int128 uint128_t; #endif From 4fab082c9a9c7cea2ce2b770f1ba9b74a9bffdda Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 15 Jul 2020 15:34:53 +0700 Subject: [PATCH 02/34] Fix secp256k1_scalar_is_even/scalar_low issue --- src/scalar_4x64_impl.h | 9 ++++++++- src/scalar_impl.h | 4 ++-- src/scalar_low.h | 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 1fcf1ba37c..8ed839e9d8 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1179,7 +1179,6 @@ static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t } static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { - #if defined(EXHAUSTIVE_TEST_ORDER) int i; *r = 0; @@ -1259,6 +1258,10 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *r = b0; } + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} #else secp256k1_scalar *t; int i; @@ -1412,6 +1415,10 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar } secp256k1_scalar_mul(r, t, &x6); /* 111111 */ } + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} #endif #endif 
/* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_impl.h b/src/scalar_impl.h index 2318fcb0fd..a63b735491 100644 --- a/src/scalar_impl.h +++ b/src/scalar_impl.h @@ -226,12 +226,12 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar } secp256k1_scalar_mul(r, t, &x6); /* 111111 */ } -#endif -#endif SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { return !(a->d[0] & 1); } +#endif +#endif static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(USE_SCALAR_INV_BUILTIN) diff --git a/src/scalar_low.h b/src/scalar_low.h index 2794a7f171..c31ca35376 100644 --- a/src/scalar_low.h +++ b/src/scalar_low.h @@ -14,4 +14,6 @@ typedef uint32_t secp256k1_scalar; #define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) (d0) +#define SECP256K1_SCALAR_INV_DEFAULT + #endif /* SECP256K1_SCALAR_REPR_H */ From 0b90a57f7e7e6a4e5035ae8f86b0639ef5362109 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Thu, 16 Jul 2020 12:53:23 +0700 Subject: [PATCH 03/34] TODOs and comments --- src/field_5x52_impl.h | 26 ++++++++++++++------ src/scalar_4x64_impl.h | 56 +++++++++++++----------------------------- 2 files changed, 35 insertions(+), 47 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index a08032afb7..0e6f21d299 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -505,6 +505,11 @@ static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( static void secp256k1_fe_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { + /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. + * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and + * writes the 252-bit signed result to [t[0],t[1],t[2],t[3]]. + */ + int64_t z0, z1, z2, z3; int128_t tt; @@ -568,15 +573,21 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { uint64_t u0, u1, u2, u3, u4; uint64_t r0, r1, r2, r3, r4; + int128_t cc; - /* TODO Need proper carry chain */ + cc = t[0]; + u0 = (uint64_t)cc; cc >>= 64; + cc += t[1]; + u1 = (uint64_t)cc; cc >>= 64; + cc += t[2]; + u2 = (uint64_t)cc; cc >>= 64; + cc += t[3]; + u3 = (uint64_t)cc; cc >>= 64; + u4 = (uint64_t)cc; - u0 = (uint64_t)t[0]; - u1 = (uint64_t)t[1] - (u0 >> 63); - u2 = (uint64_t)t[2] - (u1 >> 63); - u3 = (uint64_t)t[3] - (u2 >> 63); - u4 = - (u3 >> 63); + VERIFY_CHECK(u4 == 0 || u4 == UINT64_MAX); + /* Add twice the field prime in case u4 is non-zero (which represents -2^256). */ r0 = 0xFFFFEFFFFFC2FULL * 2; r1 = 0xFFFFFFFFFFFFFULL * 2; r2 = 0xFFFFFFFFFFFFFULL * 2; @@ -596,8 +607,7 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { r->n[4] = r4; #ifdef VERIFY - /* TODO Probably 2 is enough? */ - r->magnitude = 3; + r->magnitude = 2; r->normalized = 0; secp256k1_fe_verify(r); #endif diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8ed839e9d8..8f5eea587d 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -969,6 +969,11 @@ static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CO static void secp256k1_scalar_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { + /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. + * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and + * writes the 252-bit signed result to [t[0],t[1],t[2],t[3]]. 
+ */ + int64_t z0, z1, z2, z3; int128_t tt; @@ -1030,19 +1035,23 @@ static void secp256k1_scalar_combine_2s(int64_t *t) { static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { -#if 1 - uint64_t r0, r1, r2, r3; int flag; secp256k1_scalar u; + int128_t cc; - /* TODO Need proper carry chain */ - r0 = (uint64_t)t[0]; - r1 = (uint64_t)t[1] - (r0 >> 63); - r2 = (uint64_t)t[2] - (r1 >> 63); - r3 = (uint64_t)t[3] - (r2 >> 63); + cc = t[0]; + r0 = (uint64_t)cc; cc >>= 64; + cc += t[1]; + r1 = (uint64_t)cc; cc >>= 64; + cc += t[2]; + r2 = (uint64_t)cc; cc >>= 64; + cc += t[3]; + r3 = (uint64_t)cc; cc >>= 64; - flag = (int)(r3 >> 63); + VERIFY_CHECK(cc == 0 || cc == -1); + + flag = (int)cc & 1; r->d[0] = r0; r->d[1] = r1; @@ -1051,37 +1060,6 @@ static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); secp256k1_scalar_cmov(r, &u, flag); - -#else - - uint64_t u0, u1, u2, u3, u4; - uint64_t r0, r1, r2, r3, r4; - - u0 = (uint64_t)t[0]; - u1 = (uint64_t)t[1] - (u0 >> 63); - u2 = (uint64_t)t[2] - (u1 >> 63); - u3 = (uint64_t)t[3] - (u2 >> 63); - u4 = - (u3 >> 63); - - r0 = 0xFFFFEFFFFFC2FULL * 2; - r1 = 0xFFFFFFFFFFFFFULL * 2; - r2 = 0xFFFFFFFFFFFFFULL * 2; - r3 = 0xFFFFFFFFFFFFFULL * 2; - r4 = 0x0FFFFFFFFFFFFULL * 2; - - r0 += u0 & 0xFFFFFFFFFFFFFULL; - r1 += u0 >> 52 | ((u1 << 12) & 0xFFFFFFFFFFFFFULL); - r2 += u1 >> 40 | ((u2 << 24) & 0xFFFFFFFFFFFFFULL); - r3 += u2 >> 28 | ((u3 << 36) & 0xFFFFFFFFFFFFFULL); - r4 += u3 >> 16 | (u4 << 48); - - r->n[0] = r0; - r->n[1] = r1; - r->n[2] = r2; - r->n[3] = r3; - r->n[4] = r4; - -#endif } static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { From 0c3869a46d3eccf0324f4c40c076215093de0ab5 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 18 Jul 2020 11:39:17 +0700 Subject: [PATCH 04/34] VERIFY_CHECK _divsteps_62 loop invariant --- src/field_5x52_impl.h | 7 +++++-- src/scalar_4x64_impl.h | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 0e6f21d299..758e6f000d 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -630,14 +630,17 @@ static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { r[4] = a4 >> 40; } -static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f, uint64_t g, int64_t *t) { +static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; - uint64_t c1, c2, x, y, z; + uint64_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 62; ++i) { + VERIFY_CHECK((u * f0 + v * g0) == -f << i); + VERIFY_CHECK((q * f0 + r * g0) == -g << i); + c1 = -(g & (eta >> 15)); x = (f ^ g) & c1; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8f5eea587d..08ccd68175 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1083,14 +1083,17 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { #endif } -static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f, uint64_t g, int64_t *t) { +static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; - uint64_t c1, c2, x, y, z; + uint64_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 62; ++i) { + VERIFY_CHECK((u * f0 + v * g0) == -f << i); + VERIFY_CHECK((q * f0 + r * g0) == -g << i); + c1 = -(g & (eta >> 15)); x = (f ^ g) & c1; 
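For reference, the invariant asserted by the new VERIFY_CHECKs above can be exercised outside the library. The following is a minimal standalone sketch (not part of the patch): it repeats the 62-iteration constant-time divstep loop on arbitrary sample inputs (f0 must be odd, eta starts at -1) and checks the matrix relation with assert() standing in for VERIFY_CHECK. The helper name divsteps_62_check and the sample values are illustrative only.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Standalone sketch: run the 62-step constant-time divstep loop and check
     * the loop invariant added in this patch. All arithmetic wraps mod 2^64. */
    static void divsteps_62_check(uint16_t eta, uint64_t f0, uint64_t g0) {
        uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1;
        uint64_t c1, c2, f = f0, g = g0, x, y, z;
        int i;

        for (i = 0; i < 62; ++i) {
            /* The transition matrix [[u,v],[q,r]] applied to the original
             * (f0, g0) reproduces the current (f, g), scaled by -2^i. */
            assert(u * f0 + v * g0 == -f << i);
            assert(q * f0 + r * g0 == -g << i);

            /* Conditionally swap/negate when eta is negative and g is odd. */
            c1 = -(g & (eta >> 15));
            x = (f ^ g) & c1;
            f ^= x; g ^= x; g ^= c1; g -= c1;
            y = (u ^ q) & c1;
            u ^= y; q ^= y; q ^= c1; q -= c1;
            z = (v ^ r) & c1;
            v ^= z; r ^= z; r ^= c1; r -= c1;
            eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1;

            /* Conditionally add f to g when g is odd, then halve g. */
            c2 = -(g & 1);
            g += (f & c2); g >>= 1;
            q += (u & c2); u <<= 1;
            r += (v & c2); v <<= 1;
        }
    }

    int main(void) {
        divsteps_62_check((uint16_t)-1, 0x3FD25E8CD0364141ULL, 0x1234567890ABCDEFULL);
        printf("divstep invariant held for 62 iterations\n");
        return 0;
    }

Compiled on its own with a C99 compiler, this performs the same branchless updates as secp256k1_scalar_divsteps_62 / secp256k1_fe_divsteps_62 and aborts if the invariant is violated.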
From 11b525c71c599c3dccd48b7e345a88ea052391eb Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 21 Jul 2020 17:00:04 +0700 Subject: [PATCH 05/34] More checks and comments --- src/field_5x52_impl.h | 30 +++++++++++++++++++++++++++--- src/scalar_4x64_impl.h | 26 +++++++++++++++++++++++--- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 758e6f000d..1f2b24ddcd 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -638,6 +638,7 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 for (i = 0; i < 62; ++i) { + VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); @@ -710,7 +711,9 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #if 1 - /* TODO Check for a == 0? */ + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ int64_t t[12 * 4]; int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, @@ -719,14 +722,22 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe b0, d0, a1, b1, c1, d1; int i, len, sign; int16_t eta; +#ifdef VERIFY + int zero_in; +#endif /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input * by 2^768, and then the output by 2^24. */ - /* Instead of dividing the output by 2^744, we scale the input. */ + /* Instead of dividing the output by 2^744, scale the input. */ secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); secp256k1_fe_normalize(&b0); secp256k1_fe_encode_62(&g[0], &b0); +#ifdef VERIFY + zero_in = secp256k1_fe_is_zero(&b0); +#endif + + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -1; for (i = 0; i < 12; ++i) { @@ -735,7 +746,16 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_update_fg(len, f, g, &t[i * 4]); } - /* At this point, f must equal +/- 1 (the GCD). */ + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divstep_62 are combined to get the + * Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divstep_62 introduce an extra factor of 2^62 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). + */ + + VERIFY_CHECK(g[0] == 0); + sign = (f[0] >> 1) & 1; for (i = 0; i < 3; ++i) { @@ -780,6 +800,10 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); +#endif + *r = b0; #else diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 08ccd68175..b153f73050 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1091,6 +1091,7 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, for (i = 0; i < 62; ++i) { + VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); @@ -1172,7 +1173,9 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar } #elif 1 - /* TODO Check for x == 0? 
*/ + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ int64_t t[12 * 4]; int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, @@ -1181,11 +1184,15 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, len, sign; int16_t eta; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif - /* Instead of dividing the output by 2^744, we scale the input. */ + /* Instead of dividing the output by 2^744, scale the input. */ secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); secp256k1_scalar_encode_62(&g[0], &b0); + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -1; for (i = 0; i < 12; ++i) { @@ -1194,7 +1201,16 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar_update_fg(len, f, g, &t[i * 4]); } - /* At this point, f must equal +/- 1 (the GCD). */ + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divstep_62 are combined to get the + * Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divstep_62 introduce an extra factor of 2^62 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). + */ + + VERIFY_CHECK(g[0] == 0); + sign = (f[0] >> 1) & 1; for (i = 0; i < 3; ++i) { @@ -1237,6 +1253,10 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar_cond_negate(&b0, sign); +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); +#endif + *r = b0; } From 3ae7179ad78398c23f27b41dd94baa7c0662c964 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 21 Jul 2020 20:44:13 +0700 Subject: [PATCH 06/34] Update f,g at full length until proper analysis --- src/field_5x52_impl.h | 15 ++++++--------- src/scalar_4x64_impl.h | 15 ++++++--------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 1f2b24ddcd..6f632e2079 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -670,15 +670,13 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 return eta; } -static void secp256k1_fe_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; int128_t cf = 0, cg = 0; int i; - VERIFY_CHECK(len > 0); - fi = f[0]; gi = g[0]; @@ -691,7 +689,7 @@ static void secp256k1_fe_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) cf >>= 62; cg >>= 62; - for (i = 1; i < len; ++i) { + for (i = 1; i < 5; ++i) { fi = f[i]; gi = g[i]; @@ -703,8 +701,8 @@ static void secp256k1_fe_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) g[i - 1] = (int64_t)cg & M62; cg >>= 62; } - f[i - 1] = (int64_t)cf; - g[i - 1] = (int64_t)cg; + f[4] = (int64_t)cf; + g[4] = (int64_t)cg; } static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { @@ -720,7 +718,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_fe b0, d0, a1, b1, c1, d1; - int i, len, sign; + int i, sign; 
int16_t eta; #ifdef VERIFY int zero_in; @@ -742,8 +740,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], &t[i * 4]); - len = i <= 6 ? 5 : i >= 10 ? 1 : 11 - i; - secp256k1_fe_update_fg(len, f, g, &t[i * 4]); + secp256k1_fe_update_fg(f, g, &t[i * 4]); } /* At this point sufficient iterations have been performed that g must have reached 0 diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index b153f73050..91e3a73848 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1123,15 +1123,13 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, return eta; } -static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_scalar_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; int128_t cf = 0, cg = 0; int i; - VERIFY_CHECK(len > 0); - fi = f[0]; gi = g[0]; @@ -1144,7 +1142,7 @@ static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t cf >>= 62; cg >>= 62; - for (i = 1; i < len; ++i) { + for (i = 1; i < 5; ++i) { fi = f[i]; gi = g[i]; @@ -1156,8 +1154,8 @@ static void secp256k1_scalar_update_fg(int len, int64_t *f, int64_t *g, int64_t g[i - 1] = (int64_t)cg & M62; cg >>= 62; } - f[i - 1] = (int64_t)cf; - g[i - 1] = (int64_t)cg; + f[4] = (int64_t)cf; + g[4] = (int64_t)cg; } static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { @@ -1182,7 +1180,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_scalar b0, d0, a1, b1, c1, d1; - int i, len, sign; + int i, sign; int16_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); @@ -1197,8 +1195,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar for (i = 0; i < 12; ++i) { eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], &t[i * 4]); - len = i <= 6 ? 5 : i >= 10 ? 
1 : 11 - i; - secp256k1_scalar_update_fg(len, f, g, &t[i * 4]); + secp256k1_scalar_update_fg(f, g, &t[i * 4]); } /* At this point sufficient iterations have been performed that g must have reached 0 From 2f643ad31d2a0c9ec58f6be302a302ee3f2a98f4 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 22 Jul 2020 00:13:20 +0700 Subject: [PATCH 07/34] Initial 32bit safegcd - definitely needs bounds analysis --- src/field_10x26.h | 2 - src/field_10x26_impl.h | 543 +++++++++++++++++++++++++++++++++++++ src/field_5x52_impl.h | 18 +- src/scalar_4x64_impl.h | 20 +- src/scalar_8x32.h | 2 - src/scalar_8x32_impl.h | 596 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1158 insertions(+), 23 deletions(-) diff --git a/src/field_10x26.h b/src/field_10x26.h index 312a94c3ae..5ff03c8abc 100644 --- a/src/field_10x26.h +++ b/src/field_10x26.h @@ -47,6 +47,4 @@ typedef struct { #define SECP256K1_FE_STORAGE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }} #define SECP256K1_FE_STORAGE_CONST_GET(d) d.n[7], d.n[6], d.n[5], d.n[4],d.n[3], d.n[2], d.n[1], d.n[0] -#define SECP256K1_FE_INV_DEFAULT - #endif /* SECP256K1_FIELD_REPR_H */ diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 651500ee8e..52995de68c 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1164,4 +1164,547 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( + 0x0E90A100UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, + 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL +); + +static void secp256k1_fe_mul_add_2(int32_t a0, int32_t a1, int32_t b0, int32_t b1, int32_t c0, int32_t c1, int32_t d0, int32_t d1, int32_t *t) { + + /* Each [a0,a1], etc. pair is a ??-bit signed value e.g. a0 + a1 * 2^32. + * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and + * writes the ???-bit signed result to [t[0],t[1],t[2],t[3]]. 
+ */ + + int32_t z0, z1, z2, z3; + int64_t tt; + + tt = (int64_t)a0 * b0 + + (int64_t)c0 * d0; + z0 = (int32_t)tt; tt -= z0; tt >>= 32; + + tt += (int64_t)a0 * b1 + + (int64_t)a1 * b0 + + (int64_t)c0 * d1 + + (int64_t)c1 * d0; + z1 = (int32_t)tt; tt -= z1; tt >>= 32; + + tt += (int64_t)a1 * b1 + + (int64_t)c1 * d1; + z2 = (int32_t)tt; tt -= z2; tt >>= 32; + + z3 = (int32_t)tt; + + t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; +} + +static void secp256k1_fe_mul_add_4(int32_t* tIn, int xPos, int yPos, int uPos, int vPos, int32_t *tOut, int zzPos) { + int32_t y0 = tIn[yPos + 0]; + int32_t y1 = tIn[yPos + 1]; + int32_t y2 = tIn[yPos + 2]; + int32_t y3 = tIn[yPos + 3]; + int32_t v0 = tIn[vPos + 0]; + int32_t v1 = tIn[vPos + 1]; + int32_t v2 = tIn[vPos + 2]; + int32_t v3 = tIn[vPos + 3]; + int32_t xVal, uVal; + int32_t z0, z1, z2, z3, z4, z5, z6, z7; + int64_t c; + + xVal = tIn[xPos]; + uVal = tIn[uPos]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0; + z0 = (int32_t)c; c -= z0; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1; + z1 = (int32_t)c; c -= z1; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3; + z3 = (int32_t)c; c -= z3; c >>= 32; + z4 = (int32_t)c; + + xVal = tIn[xPos + 1]; + uVal = tIn[uPos + 1]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z1; + z1 = (int32_t)c; c -= z1; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + z5 = (int32_t)c; + + xVal = tIn[xPos + 2]; + uVal = tIn[uPos + 2]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z5; + z5 = (int32_t)c; c -= z5; c >>= 32; + z6 = (int32_t)c; + + xVal = tIn[xPos + 3]; + uVal = tIn[uPos + 3]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z5; + z5 = (int32_t)c; c -= z5; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z6; + z6 = (int32_t)c; c -= z6; c >>= 32; + z7 = (int32_t)c; + + tOut[zzPos + 0] = z0; + tOut[zzPos + 1] = z1; + tOut[zzPos + 2] = z2; + tOut[zzPos + 3] = z3; + tOut[zzPos + 4] = z4; + tOut[zzPos + 5] = z5; + tOut[zzPos + 6] = z6; + tOut[zzPos + 7] = z7; +} + +static void secp256k1_fe_combine_1s(int32_t *t) { + + int32_t a = t[0], b = t[1], c = t[2], d = t[3], + e = t[4], f = t[5], g = t[6], h = t[7]; + int64_t I, J, K, L; + + I = (int64_t)e * a + (int64_t)f * c; + J = (int64_t)e * b + (int64_t)f * d; + K = (int64_t)g * a + (int64_t)h * c; + L = (int64_t)g * b + (int64_t)h * d; + + a = (int32_t)I; I -= a; I >>= 32; b = (int32_t)I; + c = (int32_t)J; J -= c; J >>= 32; d = (int32_t)J; + e = (int32_t)K; K -= e; K >>= 32; f = (int32_t)K; + g = (int32_t)L; L -= g; L >>= 32; h = (int32_t)L; + + t[0] = a; t[1] = b; t[2] = c; t[3] = d; + t[4] = e; t[5] = f; t[6] = g; t[7] = h; +} + +static void secp256k1_fe_combine_2s(int32_t *t) { + + int32_t a0 = t[ 0], a1 = t[ 1]; + int32_t b0 = t[ 2], b1 = t[ 3]; + 
int32_t c0 = t[ 4], c1 = t[ 5]; + int32_t d0 = t[ 6], d1 = t[ 7]; + int32_t e0 = t[ 8], e1 = t[ 9]; + int32_t f0 = t[10], f1 = t[11]; + int32_t g0 = t[12], g1 = t[13]; + int32_t h0 = t[14], h1 = t[15]; + + secp256k1_fe_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_fe_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_fe_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_fe_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); +} + +static void secp256k1_fe_combine_4s(int32_t *t) +{ + int32_t tmp[32]; + + int aPos = 0; + int bPos = 4; + int cPos = 8; + int dPos = 12; + int ePos = 16; + int fPos = 20; + int gPos = 24; + int hPos = 28; + + int IPos = 0; + int JPos = 8; + int KPos = 16; + int LPos = 24; + + secp256k1_fe_mul_add_4(t, ePos, aPos, fPos, cPos, tmp, IPos); + secp256k1_fe_mul_add_4(t, ePos, bPos, fPos, dPos, tmp, JPos); + secp256k1_fe_mul_add_4(t, gPos, aPos, hPos, cPos, tmp, KPos); + secp256k1_fe_mul_add_4(t, gPos, bPos, hPos, dPos, tmp, LPos); + + memcpy(t, tmp, 32 * sizeof(int32_t)); +} + +static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int32_t *t) { + + uint32_t u0, u1, u2, u3, u4, u5, u6, u7, u8; + uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; + int64_t cc; + + cc = t[0]; + u0 = (uint32_t)cc; cc >>= 32; + cc += t[1]; + u1 = (uint32_t)cc; cc >>= 32; + cc += t[2]; + u2 = (uint32_t)cc; cc >>= 32; + cc += t[3]; + u3 = (uint32_t)cc; cc >>= 32; + cc += t[4]; + u4 = (uint32_t)cc; cc >>= 32; + cc += t[5]; + u5 = (uint32_t)cc; cc >>= 32; + cc += t[6]; + u6 = (uint32_t)cc; cc >>= 32; + cc += t[7]; + u7 = (uint32_t)cc; cc >>= 32; + u8 = (uint32_t)cc; + + VERIFY_CHECK(u8 == 0 || u8 == UINT32_MAX); + + /* Add twice the field prime in case u8 is non-zero (which represents -2^256). */ + r0 = 0x3FFFC2FUL * 2; + r1 = 0x3FFFFBFUL * 2; + r2 = 0x3FFFFFFUL * 2; + r3 = 0x3FFFFFFUL * 2; + r4 = 0x3FFFFFFUL * 2; + r5 = 0x3FFFFFFUL * 2; + r6 = 0x3FFFFFFUL * 2; + r7 = 0x3FFFFFFUL * 2; + r8 = 0x3FFFFFFUL * 2; + r9 = 0x03FFFFFUL * 2; + + r0 += ( u0 ) & 0x3FFFFFFUL; + r1 += (u0 >> 26 | u1 << 6) & 0x3FFFFFFUL; + r2 += (u1 >> 20 | u2 << 12) & 0x3FFFFFFUL; + r3 += (u2 >> 14 | u3 << 18) & 0x3FFFFFFUL; + r4 += (u3 >> 8 | u4 << 24) & 0x3FFFFFFUL; + r5 += (u4 >> 2 ) & 0x3FFFFFFUL; + r6 += (u4 >> 28 | u5 << 4) & 0x3FFFFFFUL; + r7 += (u5 >> 22 | u6 << 10) & 0x3FFFFFFUL; + r8 += (u6 >> 16 | u7 << 16) & 0x3FFFFFFUL; + r9 += (u7 >> 10 | u8 << 22); + + r->n[0] = r0; + r->n[1] = r1; + r->n[2] = r2; + r->n[3] = r3; + r->n[4] = r4; + r->n[5] = r5; + r->n[6] = r6; + r->n[7] = r7; + r->n[8] = r8; + r->n[9] = r9; + +#ifdef VERIFY + r->magnitude = 2; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_encode_31(int32_t *r, const secp256k1_fe *a) { + + const uint32_t M31 = UINT32_MAX >> 1; + const uint32_t *n = &a->n[0]; + uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4], + a5 = n[5], a6 = n[6], a7 = n[7], a8 = n[8], a9 = n[9]; + +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif + + r[0] = (a0 | a1 << 26) & M31; + r[1] = (a1 >> 5 | a2 << 21) & M31; + r[2] = (a2 >> 10 | a3 << 16) & M31; + r[3] = (a3 >> 15 | a4 << 11) & M31; + r[4] = (a4 >> 20 | a5 << 6) & M31; + r[5] = (a5 >> 25 | a6 << 1 + | a7 << 27) & M31; + r[6] = (a7 >> 4 | a8 << 22) & M31; + r[7] = (a8 >> 9 | a9 << 17) & M31; + r[8] = a9 >> 14; +} + +static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { + + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t c1, c2, f = f0, g = g0, x, y, z; + int i; + + for (i = 0; i < 31; 
++i) { + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << i); + VERIFY_CHECK((q * f0 + r * g0) == -g << i); + + c1 = -(g & (eta >> 15)); + + x = (f ^ g) & c1; + f ^= x; g ^= x; g ^= c1; g -= c1; + + y = (u ^ q) & c1; + u ^= y; q ^= y; q ^= c1; q -= c1; + + z = (v ^ r) & c1; + v ^= z; r ^= z; r ^= c1; r -= c1; + + eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + + c2 = -(g & 1); + + g += (f & c2); g >>= 1; + q += (u & c2); u <<= 1; + r += (v & c2); v <<= 1; + } + + t[0] = (int32_t)u; + t[1] = (int32_t)v; + t[2] = (int32_t)q; + t[3] = (int32_t)r; + + return eta; +} + +static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { + + const int32_t M31 = (int32_t)(UINT32_MAX >> 1); + int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + int64_t cf = 0, cg = 0; + int i; + + fi = f[0]; + gi = g[0]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + VERIFY_CHECK(((int32_t)cf & M31) == 0); + VERIFY_CHECK(((int32_t)cg & M31) == 0); + + cf >>= 31; + cg >>= 31; + + for (i = 1; i < 9; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + f[i - 1] = (int32_t)cf & M31; cf >>= 31; + g[i - 1] = (int32_t)cg & M31; cg >>= 31; + } + + f[8] = (int32_t)cf; + g[8] = (int32_t)cg; +} + +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { + +#if 1 + + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ + + int32_t t[24 * 4]; + int32_t f[9] = { 0x7FFFFC2FL, 0x7FFFFFFDL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, + 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t g[9]; + secp256k1_fe b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in; +#endif + + /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input + * by 2^768, and then the output by 2^24. */ + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + secp256k1_fe_normalize(&b0); + secp256k1_fe_encode_31(&g[0], &b0); + +#ifdef VERIFY + zero_in = secp256k1_fe_is_zero(&b0); +#endif + + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; + + for (i = 0; i < 24; ++i) { + eta = secp256k1_fe_divsteps_31(eta, f[0], g[0], &t[i * 4]); + secp256k1_fe_update_fg(f, g, &t[i * 4]); + } + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divstep_31 are combined to get the + * Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divstep_31 introduce an extra factor of 2^31 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
+ */ + + VERIFY_CHECK(g[0] == 0); + + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 32; + secp256k1_fe_combine_1s(&t[tOff + 0]); + secp256k1_fe_combine_1s(&t[tOff + 8]); + secp256k1_fe_combine_1s(&t[tOff + 16]); + secp256k1_fe_combine_1s(&t[tOff + 24]); + secp256k1_fe_combine_2s(&t[tOff + 0]); + secp256k1_fe_combine_2s(&t[tOff + 16]); + secp256k1_fe_combine_4s(&t[tOff + 0]); + } + + /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ + secp256k1_fe_decode_matrix(&b0, &t[8]); + /* secp256k1_fe_decode_matrix(&c0, &t[16]); */ + secp256k1_fe_decode_matrix(&d0, &t[24]); + + secp256k1_fe_decode_matrix(&a1, &t[32]); + secp256k1_fe_decode_matrix(&b1, &t[40]); + secp256k1_fe_decode_matrix(&c1, &t[48]); + secp256k1_fe_decode_matrix(&d1, &t[56]); + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + secp256k1_fe_mul(&c1, &c1, &b0); + secp256k1_fe_mul(&d1, &d1, &d0); + + b0 = a1; secp256k1_fe_add(&b0, &b1); + d0 = c1; secp256k1_fe_add(&d0, &d1); + + secp256k1_fe_decode_matrix(&a1, &t[64]); + secp256k1_fe_decode_matrix(&b1, &t[72]); + /* secp256k1_fe_decode_matrix(&c1, &t[80]); */ + /* secp256k1_fe_decode_matrix(&d1, &t[88]); */ + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + /* secp256k1_fe_mul(&c1, &c1, &b0); */ + /* secp256k1_fe_mul(&d1, &d1, &d0); */ + + b0 = a1; secp256k1_fe_add(&b0, &b1); + /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + + secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_cmov(&b0, &b1, sign); + secp256k1_fe_normalize_weak(&b0); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); +#endif + + *r = b0; + +#else + + secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; + int j; + + /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in + * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: + * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + */ + + secp256k1_fe_sqr(&x2, a); + secp256k1_fe_mul(&x2, &x2, a); + + secp256k1_fe_sqr(&x3, &x2); + secp256k1_fe_mul(&x3, &x3, a); + + x6 = x3; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x6, &x6); + } + secp256k1_fe_mul(&x6, &x6, &x3); + + x9 = x6; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x9, &x9); + } + secp256k1_fe_mul(&x9, &x9, &x3); + + x11 = x9; + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&x11, &x11); + } + secp256k1_fe_mul(&x11, &x11, &x2); + + x22 = x11; + for (j=0; j<11; j++) { + secp256k1_fe_sqr(&x22, &x22); + } + secp256k1_fe_mul(&x22, &x22, &x11); + + x44 = x22; + for (j=0; j<22; j++) { + secp256k1_fe_sqr(&x44, &x44); + } + secp256k1_fe_mul(&x44, &x44, &x22); + + x88 = x44; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x88, &x88); + } + secp256k1_fe_mul(&x88, &x88, &x44); + + x176 = x88; + for (j=0; j<88; j++) { + secp256k1_fe_sqr(&x176, &x176); + } + secp256k1_fe_mul(&x176, &x176, &x88); + + x220 = x176; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x220, &x220); + } + secp256k1_fe_mul(&x220, &x220, &x44); + + x223 = x220; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x223, &x223); + } + secp256k1_fe_mul(&x223, &x223, &x3); + + /* The final result is then assembled using a sliding window over the blocks. 
*/ + + t1 = x223; + for (j=0; j<23; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x22); + for (j=0; j<5; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, a); + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x2); + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(r, a, &t1); +#endif +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 6f632e2079..03d10b1dce 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -503,7 +503,7 @@ static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL ); -static void secp256k1_fe_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { +static void secp256k1_fe_mul_add_2(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and @@ -563,10 +563,10 @@ static void secp256k1_fe_combine_2s(int64_t *t) { int64_t g0 = t[12], g1 = t[13]; int64_t h0 = t[14], h1 = t[15]; - secp256k1_fe_mul_add(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_fe_mul_add(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_fe_mul_add(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_fe_mul_add(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); + secp256k1_fe_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_fe_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_fe_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_fe_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); } static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { @@ -595,10 +595,10 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { r4 = 0x0FFFFFFFFFFFFULL * 2; r0 += u0 & 0xFFFFFFFFFFFFFULL; - r1 += u0 >> 52 | ((u1 << 12) & 0xFFFFFFFFFFFFFULL); - r2 += u1 >> 40 | ((u2 << 24) & 0xFFFFFFFFFFFFFULL); - r3 += u2 >> 28 | ((u3 << 36) & 0xFFFFFFFFFFFFFULL); - r4 += u3 >> 16 | (u4 << 48); + r1 += (u0 >> 52 | u1 << 12) & 0xFFFFFFFFFFFFFULL; + r2 += (u1 >> 40 | u2 << 24) & 0xFFFFFFFFFFFFFULL; + r3 += (u2 >> 28 | u3 << 36) & 0xFFFFFFFFFFFFFULL; + r4 += (u3 >> 16 | u4 << 48); r->n[0] = r0; r->n[1] = r1; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 91e3a73848..6dea776161 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -967,7 +967,7 @@ static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CO 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL ); -static void secp256k1_scalar_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { +static void secp256k1_scalar_mul_add_2(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. 
* This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and @@ -1027,10 +1027,10 @@ static void secp256k1_scalar_combine_2s(int64_t *t) { int64_t g0 = t[12], g1 = t[13]; int64_t h0 = t[14], h1 = t[15]; - secp256k1_scalar_mul_add(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_scalar_mul_add(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_scalar_mul_add(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_scalar_mul_add(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); + secp256k1_scalar_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_scalar_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_scalar_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_scalar_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); } static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { @@ -1041,13 +1041,13 @@ static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { int128_t cc; cc = t[0]; - r0 = (uint64_t)cc; cc >>= 64; + r0 = (uint64_t)cc; cc >>= 64; cc += t[1]; - r1 = (uint64_t)cc; cc >>= 64; + r1 = (uint64_t)cc; cc >>= 64; cc += t[2]; - r2 = (uint64_t)cc; cc >>= 64; + r2 = (uint64_t)cc; cc >>= 64; cc += t[3]; - r3 = (uint64_t)cc; cc >>= 64; + r3 = (uint64_t)cc; cc >>= 64; VERIFY_CHECK(cc == 0 || cc == -1); @@ -1188,7 +1188,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar /* Instead of dividing the output by 2^744, scale the input. */ secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); - secp256k1_scalar_encode_62(&g[0], &b0); + secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -1; diff --git a/src/scalar_8x32.h b/src/scalar_8x32.h index 10c55f1f8b..2c9a348e24 100644 --- a/src/scalar_8x32.h +++ b/src/scalar_8x32.h @@ -16,6 +16,4 @@ typedef struct { #define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{(d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7)}} -#define SECP256K1_SCALAR_INV_DEFAULT - #endif /* SECP256K1_SCALAR_REPR_H */ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 3c372f34fe..cd5a985213 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -733,4 +733,600 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[7] = (r->d[7] & mask0) | (a->d[7] & mask1); } +static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALAR_CONST( + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFDUL, + 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL +); + +static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CONST( + 0x4E165355UL, 0x5D800C18UL, 0xEF116DB1UL, 0xB31347F1UL, + 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL +); + +static void secp256k1_scalar_mul_add_2(int32_t a0, int32_t a1, int32_t b0, int32_t b1, int32_t c0, int32_t c1, int32_t d0, int32_t d1, int32_t *t) { + + /* Each [a0,a1], etc. pair is a ??-bit signed value e.g. a0 + a1 * 2^32. + * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and + * writes the ???-bit signed result to [t[0],t[1],t[2],t[3]]. 
+ */ + + int32_t z0, z1, z2, z3; + int64_t tt; + + tt = (int64_t)a0 * b0 + + (int64_t)c0 * d0; + z0 = (int32_t)tt; tt -= z0; tt >>= 32; + + tt += (int64_t)a0 * b1 + + (int64_t)a1 * b0 + + (int64_t)c0 * d1 + + (int64_t)c1 * d0; + z1 = (int32_t)tt; tt -= z1; tt >>= 32; + + tt += (int64_t)a1 * b1 + + (int64_t)c1 * d1; + z2 = (int32_t)tt; tt -= z2; tt >>= 32; + + z3 = (int32_t)tt; + + t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; +} + +static void secp256k1_scalar_mul_add_4(int32_t* tIn, int xPos, int yPos, int uPos, int vPos, int32_t *tOut, int zzPos) { + int32_t y0 = tIn[yPos + 0]; + int32_t y1 = tIn[yPos + 1]; + int32_t y2 = tIn[yPos + 2]; + int32_t y3 = tIn[yPos + 3]; + int32_t v0 = tIn[vPos + 0]; + int32_t v1 = tIn[vPos + 1]; + int32_t v2 = tIn[vPos + 2]; + int32_t v3 = tIn[vPos + 3]; + int32_t xVal, uVal; + int32_t z0, z1, z2, z3, z4, z5, z6, z7; + int64_t c; + + xVal = tIn[xPos]; + uVal = tIn[uPos]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0; + z0 = (int32_t)c; c -= z0; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1; + z1 = (int32_t)c; c -= z1; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3; + z3 = (int32_t)c; c -= z3; c >>= 32; + z4 = (int32_t)c; + + xVal = tIn[xPos + 1]; + uVal = tIn[uPos + 1]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z1; + z1 = (int32_t)c; c -= z1; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + z5 = (int32_t)c; + + xVal = tIn[xPos + 2]; + uVal = tIn[uPos + 2]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z2; + z2 = (int32_t)c; c -= z2; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z5; + z5 = (int32_t)c; c -= z5; c >>= 32; + z6 = (int32_t)c; + + xVal = tIn[xPos + 3]; + uVal = tIn[uPos + 3]; + + c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z3; + z3 = (int32_t)c; c -= z3; c >>= 32; + + c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z4; + z4 = (int32_t)c; c -= z4; c >>= 32; + + c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z5; + z5 = (int32_t)c; c -= z5; c >>= 32; + + c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z6; + z6 = (int32_t)c; c -= z6; c >>= 32; + z7 = (int32_t)c; + + tOut[zzPos + 0] = z0; + tOut[zzPos + 1] = z1; + tOut[zzPos + 2] = z2; + tOut[zzPos + 3] = z3; + tOut[zzPos + 4] = z4; + tOut[zzPos + 5] = z5; + tOut[zzPos + 6] = z6; + tOut[zzPos + 7] = z7; +} + +static void secp256k1_scalar_combine_1s(int32_t *t) { + + int32_t a = t[0], b = t[1], c = t[2], d = t[3], + e = t[4], f = t[5], g = t[6], h = t[7]; + int64_t I, J, K, L; + + I = (int64_t)e * a + (int64_t)f * c; + J = (int64_t)e * b + (int64_t)f * d; + K = (int64_t)g * a + (int64_t)h * c; + L = (int64_t)g * b + (int64_t)h * d; + + a = (int32_t)I; I -= a; I >>= 32; b = (int32_t)I; + c = (int32_t)J; J -= c; J >>= 32; d = (int32_t)J; + e = (int32_t)K; K -= e; K >>= 32; f = (int32_t)K; + g = (int32_t)L; L -= g; L >>= 32; h = (int32_t)L; + + t[0] = a; t[1] = b; t[2] = c; t[3] = d; + t[4] = e; t[5] = f; t[6] = g; t[7] = h; +} + +static void secp256k1_scalar_combine_2s(int32_t *t) { + + int32_t a0 = t[ 0], a1 = t[ 1]; + int32_t b0 = t[ 2], b1 = t[ 
3]; + int32_t c0 = t[ 4], c1 = t[ 5]; + int32_t d0 = t[ 6], d1 = t[ 7]; + int32_t e0 = t[ 8], e1 = t[ 9]; + int32_t f0 = t[10], f1 = t[11]; + int32_t g0 = t[12], g1 = t[13]; + int32_t h0 = t[14], h1 = t[15]; + + secp256k1_scalar_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); + secp256k1_scalar_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); + secp256k1_scalar_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); + secp256k1_scalar_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); +} + +static void secp256k1_scalar_combine_4s(int32_t *t) +{ + int32_t tmp[32]; + + int aPos = 0; + int bPos = 4; + int cPos = 8; + int dPos = 12; + int ePos = 16; + int fPos = 20; + int gPos = 24; + int hPos = 28; + + int IPos = 0; + int JPos = 8; + int KPos = 16; + int LPos = 24; + + secp256k1_scalar_mul_add_4(t, ePos, aPos, fPos, cPos, tmp, IPos); + secp256k1_scalar_mul_add_4(t, ePos, bPos, fPos, dPos, tmp, JPos); + secp256k1_scalar_mul_add_4(t, gPos, aPos, hPos, cPos, tmp, KPos); + secp256k1_scalar_mul_add_4(t, gPos, bPos, hPos, dPos, tmp, LPos); + + memcpy(t, tmp, 32 * sizeof(int32_t)); +} + +static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int32_t *t) { + + uint32_t r0, r1, r2, r3, r4, r5, r6, r7; + int flag; + secp256k1_scalar u; + int64_t cc; + + cc = t[0]; + r0 = (uint32_t)cc; cc >>= 32; + cc += t[1]; + r1 = (uint32_t)cc; cc >>= 32; + cc += t[2]; + r2 = (uint32_t)cc; cc >>= 32; + cc += t[3]; + r3 = (uint32_t)cc; cc >>= 32; + cc += t[4]; + r4 = (uint32_t)cc; cc >>= 32; + cc += t[5]; + r5 = (uint32_t)cc; cc >>= 32; + cc += t[6]; + r6 = (uint32_t)cc; cc >>= 32; + cc += t[7]; + r7 = (uint32_t)cc; cc >>= 32; + + VERIFY_CHECK(cc == 0 || cc == -1); + + flag = (int)cc & 1; + + r->d[0] = r0; + r->d[1] = r1; + r->d[2] = r2; + r->d[3] = r3; + r->d[4] = r4; + r->d[5] = r5; + r->d[6] = r6; + r->d[7] = r7; + + secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); + secp256k1_scalar_cmov(r, &u, flag); +} + +static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { + + const uint32_t M31 = UINT32_MAX >> 1; + const uint32_t *d = &a->d[0]; + uint32_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3], + a4 = d[4], a5 = d[5], a6 = d[6], a7 = d[7]; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif + + r[0] = a0 & M31; + r[1] = (a0 >> 31 | a1 << 1) & M31; + r[2] = (a1 >> 30 | a2 << 2) & M31; + r[3] = (a2 >> 29 | a3 << 3) & M31; + r[4] = (a3 >> 28 | a4 << 4) & M31; + r[5] = (a4 >> 27 | a5 << 5) & M31; + r[6] = (a5 >> 26 | a6 << 6) & M31; + r[7] = (a6 >> 25 | a7 << 7) & M31; + r[8] = a7 >> 24; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif +} + +static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { + + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t c1, c2, f = f0, g = g0, x, y, z; + int i; + + for (i = 0; i < 31; ++i) { + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << i); + VERIFY_CHECK((q * f0 + r * g0) == -g << i); + + c1 = -(g & (eta >> 15)); + + x = (f ^ g) & c1; + f ^= x; g ^= x; g ^= c1; g -= c1; + + y = (u ^ q) & c1; + u ^= y; q ^= y; q ^= c1; q -= c1; + + z = (v ^ r) & c1; + v ^= z; r ^= z; r ^= c1; r -= c1; + + eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + + c2 = -(g & 1); + + g += (f & c2); g >>= 1; + q += (u & c2); u <<= 1; + r += (v & c2); v <<= 1; + } + + t[0] = (int32_t)u; + t[1] = (int32_t)v; + t[2] = (int32_t)q; + t[3] = (int32_t)r; + + return eta; +} + +static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, 
int32_t *t) { + + const int32_t M31 = (int32_t)(UINT32_MAX >> 1); + int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + int64_t cf = 0, cg = 0; + int i; + + fi = f[0]; + gi = g[0]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + VERIFY_CHECK(((int32_t)cf & M31) == 0); + VERIFY_CHECK(((int32_t)cg & M31) == 0); + + cf >>= 31; + cg >>= 31; + + for (i = 1; i < 9; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + f[i - 1] = (int32_t)cf & M31; cf >>= 31; + g[i - 1] = (int32_t)cg & M31; cg >>= 31; + } + + f[8] = (int32_t)cf; + g[8] = (int32_t)cg; +} + +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { +#if defined(EXHAUSTIVE_TEST_ORDER) + int i; + *r = 0; + for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++) + if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) + *r = i; + /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus + * have a composite group order; fix it in exhaustive_tests.c). */ + VERIFY_CHECK(*r != 0); +} +#elif 1 + + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ + + int32_t t[24 * 4]; + int32_t f[9] = { 0x50364141L, 0x7FA4BD19L, 0x3D2280EEL, 0x5576E735L, 0x7FFFFFEBL, + 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t g[9]; + secp256k1_scalar b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + secp256k1_scalar_encode_31(g, &b0); + + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; + + for (i = 0; i < 24; ++i) { + eta = secp256k1_scalar_divsteps_31(eta, f[0], g[0], &t[i * 4]); + secp256k1_scalar_update_fg(f, g, &t[i * 4]); + } + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divstep_31 are combined to get the + * Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divstep_31 introduce an extra factor of 2^31 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
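+ * (24 iterations of 31 divsteps each give 744 divsteps in total, enough to satisfy
+ * the paper's worst-case bound for 256-bit inputs.)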
+ */ + + VERIFY_CHECK(g[0] == 0); + + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 32; + secp256k1_scalar_combine_1s(&t[tOff + 0]); + secp256k1_scalar_combine_1s(&t[tOff + 8]); + secp256k1_scalar_combine_1s(&t[tOff + 16]); + secp256k1_scalar_combine_1s(&t[tOff + 24]); + secp256k1_scalar_combine_2s(&t[tOff + 0]); + secp256k1_scalar_combine_2s(&t[tOff + 16]); + secp256k1_scalar_combine_4s(&t[tOff + 0]); + } + + /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ + secp256k1_scalar_decode_matrix(&b0, &t[8]); + /* secp256k1_scalar_decode_matrix(&c0, &t[16]); */ + secp256k1_scalar_decode_matrix(&d0, &t[24]); + + secp256k1_scalar_decode_matrix(&a1, &t[32]); + secp256k1_scalar_decode_matrix(&b1, &t[40]); + secp256k1_scalar_decode_matrix(&c1, &t[48]); + secp256k1_scalar_decode_matrix(&d1, &t[56]); + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + secp256k1_scalar_mul(&c1, &c1, &b0); + secp256k1_scalar_mul(&d1, &d1, &d0); + + secp256k1_scalar_add(&b0, &a1, &b1); + secp256k1_scalar_add(&d0, &c1, &d1); + + secp256k1_scalar_decode_matrix(&a1, &t[64]); + secp256k1_scalar_decode_matrix(&b1, &t[72]); + /* secp256k1_scalar_decode_matrix(&c1, &t[80]); */ + /* secp256k1_scalar_decode_matrix(&d1, &t[88]); */ + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + /* secp256k1_scalar_mul(&c1, &c1, &b0); */ + /* secp256k1_scalar_mul(&d1, &d1, &d0); */ + + secp256k1_scalar_add(&b0, &a1, &b1); + /* secp256k1_scalar_add(&d0, &c1, &d1); */ + + secp256k1_scalar_cond_negate(&b0, sign); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); +#endif + + *r = b0; +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} +#else + secp256k1_scalar *t; + int i; + /* First compute xN as x ^ (2^N - 1) for some values of N, + * and uM as x ^ M for some values of M. */ + secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; + secp256k1_scalar u2, u5, u9, u11, u13; + + secp256k1_scalar_sqr(&u2, x); + secp256k1_scalar_mul(&x2, &u2, x); + secp256k1_scalar_mul(&u5, &u2, &x2); + secp256k1_scalar_mul(&x3, &u5, &u2); + secp256k1_scalar_mul(&u9, &x3, &u2); + secp256k1_scalar_mul(&u11, &u9, &u2); + secp256k1_scalar_mul(&u13, &u11, &u2); + + secp256k1_scalar_sqr(&x6, &u13); + secp256k1_scalar_sqr(&x6, &x6); + secp256k1_scalar_mul(&x6, &x6, &u11); + + secp256k1_scalar_sqr(&x8, &x6); + secp256k1_scalar_sqr(&x8, &x8); + secp256k1_scalar_mul(&x8, &x8, &x2); + + secp256k1_scalar_sqr(&x14, &x8); + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(&x14, &x14); + } + secp256k1_scalar_mul(&x14, &x14, &x6); + + secp256k1_scalar_sqr(&x28, &x14); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x28, &x28); + } + secp256k1_scalar_mul(&x28, &x28, &x14); + + secp256k1_scalar_sqr(&x56, &x28); + for (i = 0; i < 27; i++) { + secp256k1_scalar_sqr(&x56, &x56); + } + secp256k1_scalar_mul(&x56, &x56, &x28); + + secp256k1_scalar_sqr(&x112, &x56); + for (i = 0; i < 55; i++) { + secp256k1_scalar_sqr(&x112, &x112); + } + secp256k1_scalar_mul(&x112, &x112, &x56); + + secp256k1_scalar_sqr(&x126, &x112); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x126, &x126); + } + secp256k1_scalar_mul(&x126, &x126, &x14); + + /* Then accumulate the final result (t starts at x126). 
*/ + t = &x126; + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 10; i++) { /* 0000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 9; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x2); /* 11 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 10; i++) { /* 000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, x); /* 1 */ + for (i = 0; i < 8; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} +#endif + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ From b29e51e1eef281c60b6d5449d249ae5aa688e23b Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Mon, 27 Jul 2020 14:57:40 +0700 Subject: [PATCH 08/34] Minor cleanup --- src/field_10x26_impl.h | 8 ++++---- src/field_5x52_impl.h | 8 ++++---- src/scalar_4x64_impl.h | 6 +++--- src/scalar_8x32_impl.h | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 52995de68c..9173d5ae36 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1537,7 +1537,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* 
Instead of dividing the output by 2^744, scale the input. */ secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_31(&g[0], &b0); + secp256k1_fe_encode_31(g, &b0); #ifdef VERIFY zero_in = secp256k1_fe_is_zero(&b0); @@ -1553,9 +1553,9 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divstep_31 are combined to get the - * Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divstep_31 introduce an extra factor of 2^31 each, so there is a total extra + * values i.e. +/- 1. The matrix outputs from each _divsteps_31 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_31 introduce an extra factor of 2^31 each, so there is a total extra * factor of 2^744 to account for (by scaling the input and/or output accordingly). */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 03d10b1dce..f73f356257 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -729,7 +729,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Instead of dividing the output by 2^744, scale the input. */ secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_62(&g[0], &b0); + secp256k1_fe_encode_62(g, &b0); #ifdef VERIFY zero_in = secp256k1_fe_is_zero(&b0); @@ -745,9 +745,9 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divstep_62 are combined to get the - * Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divstep_62 introduce an extra factor of 2^62 each, so there is a total extra + * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra * factor of 2^744 to account for (by scaling the input and/or output accordingly). */ diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 6dea776161..43fc415eeb 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1200,9 +1200,9 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divstep_62 are combined to get the - * Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divstep_62 introduce an extra factor of 2^62 each, so there is a total extra + * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
*/ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index cd5a985213..d0593939ed 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -1106,9 +1106,9 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divstep_31 are combined to get the - * Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divstep_31 introduce an extra factor of 2^31 each, so there is a total extra + * values i.e. +/- 1. The matrix outputs from each _divsteps_31 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_31 introduce an extra factor of 2^31 each, so there is a total extra * factor of 2^744 to account for (by scaling the input and/or output accordingly). */ From 3519dccfe4aedac46d73d11736151ee3f7966deb Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 28 Jul 2020 01:22:58 +0700 Subject: [PATCH 09/34] Initial _inv_var implementations --- src/field_10x26_impl.h | 208 ++++++++++++++++++----------- src/field_5x52_impl.h | 204 ++++++++++++++++++----------- src/field_impl.h | 2 + src/scalar_4x64_impl.h | 283 +++++++++++++++++++--------------------- src/scalar_8x32_impl.h | 287 ++++++++++++++++++++--------------------- src/scalar_impl.h | 2 + src/scalar_low.h | 1 + 7 files changed, 539 insertions(+), 448 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 9173d5ae36..c502c772e6 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1478,6 +1478,58 @@ static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int3 return eta; } +static int secp256k1_fe_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { + + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t f = f0, g = g0, m, w, x, y, z; + int i = 31, limit, zeros; + + for (;;) { + + /* Use a sentinel bit to count zeros only up to i. */ + zeros = __builtin_ctzl(g | (UINT32_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); + + if ((int16_t)eta < 0) { + eta = -eta; + x = f; f = g; g = -x; + y = u; u = q; q = -y; + z = v; v = r; r = -z; + } + + /* Handle up to 3 divsteps at once, subject to eta and i. */ + limit = (eta + 1) > i ? i : (eta + 1); + m = (UINT32_MAX >> (32 - limit)) & 7U; + + /* Note that f * f == 1 mod 8, for any f. */ + w = (-f * g) & m; + g += f * w; + q += u * w; + r += v * w; + } + + t[0] = (int32_t)u; + t[1] = (int32_t)v; + t[2] = (int32_t)q; + t[3] = (int32_t)r; + + return eta; +} + static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { const int32_t M31 = (int32_t)(UINT32_MAX >> 1); @@ -1515,8 +1567,6 @@ static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { -#if 1 - /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ @@ -1614,97 +1664,107 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #endif *r = b0; +} -#else - - secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; - int j; +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { - /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in - * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: - * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - secp256k1_fe_sqr(&x2, a); - secp256k1_fe_mul(&x2, &x2, a); + int32_t t[24 * 4]; + int32_t f[9] = { 0x7FFFFC2FL, 0x7FFFFFFDL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, + 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t g[9]; + secp256k1_fe b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in; +#endif - secp256k1_fe_sqr(&x3, &x2); - secp256k1_fe_mul(&x3, &x3, a); + /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input + * by 2^768, and then the output by 2^24. */ + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + secp256k1_fe_normalize(&b0); + secp256k1_fe_encode_31(g, &b0); - x6 = x3; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x6, &x6); - } - secp256k1_fe_mul(&x6, &x6, &x3); +#ifdef VERIFY + zero_in = secp256k1_fe_is_zero(&b0); +#endif - x9 = x6; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x9, &x9); - } - secp256k1_fe_mul(&x9, &x9, &x3); + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; - x11 = x9; - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&x11, &x11); + for (i = 0; i < 24; ++i) { + eta = secp256k1_fe_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); + secp256k1_fe_update_fg(f, g, &t[i * 4]); } - secp256k1_fe_mul(&x11, &x11, &x2); - x22 = x11; - for (j=0; j<11; j++) { - secp256k1_fe_sqr(&x22, &x22); - } - secp256k1_fe_mul(&x22, &x22, &x11); + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divsteps_31_var are combined to + * get the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_31_var introduce an extra factor of 2^31 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
+ */ - x44 = x22; - for (j=0; j<22; j++) { - secp256k1_fe_sqr(&x44, &x44); - } - secp256k1_fe_mul(&x44, &x44, &x22); + VERIFY_CHECK(g[0] == 0); - x88 = x44; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x88, &x88); - } - secp256k1_fe_mul(&x88, &x88, &x44); + sign = (f[0] >> 1) & 1; - x176 = x88; - for (j=0; j<88; j++) { - secp256k1_fe_sqr(&x176, &x176); + for (i = 0; i < 3; ++i) { + int tOff = i * 32; + secp256k1_fe_combine_1s(&t[tOff + 0]); + secp256k1_fe_combine_1s(&t[tOff + 8]); + secp256k1_fe_combine_1s(&t[tOff + 16]); + secp256k1_fe_combine_1s(&t[tOff + 24]); + secp256k1_fe_combine_2s(&t[tOff + 0]); + secp256k1_fe_combine_2s(&t[tOff + 16]); + secp256k1_fe_combine_4s(&t[tOff + 0]); } - secp256k1_fe_mul(&x176, &x176, &x88); - x220 = x176; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x220, &x220); - } - secp256k1_fe_mul(&x220, &x220, &x44); + /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ + secp256k1_fe_decode_matrix(&b0, &t[8]); + /* secp256k1_fe_decode_matrix(&c0, &t[16]); */ + secp256k1_fe_decode_matrix(&d0, &t[24]); - x223 = x220; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x223, &x223); - } - secp256k1_fe_mul(&x223, &x223, &x3); + secp256k1_fe_decode_matrix(&a1, &t[32]); + secp256k1_fe_decode_matrix(&b1, &t[40]); + secp256k1_fe_decode_matrix(&c1, &t[48]); + secp256k1_fe_decode_matrix(&d1, &t[56]); + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + secp256k1_fe_mul(&c1, &c1, &b0); + secp256k1_fe_mul(&d1, &d1, &d0); - /* The final result is then assembled using a sliding window over the blocks. */ + b0 = a1; secp256k1_fe_add(&b0, &b1); + d0 = c1; secp256k1_fe_add(&d0, &d1); - t1 = x223; - for (j=0; j<23; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x22); - for (j=0; j<5; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, a); - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x2); - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(r, a, &t1); + secp256k1_fe_decode_matrix(&a1, &t[64]); + secp256k1_fe_decode_matrix(&b1, &t[72]); + /* secp256k1_fe_decode_matrix(&c1, &t[80]); */ + /* secp256k1_fe_decode_matrix(&d1, &t[88]); */ + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + /* secp256k1_fe_mul(&c1, &c1, &b0); */ + /* secp256k1_fe_mul(&d1, &d1, &d0); */ + + b0 = a1; secp256k1_fe_add(&b0, &b1); + /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + + secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_cmov(&b0, &b1, sign); + secp256k1_fe_normalize_weak(&b0); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); #endif + + *r = b0; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index f73f356257..0cc7e80056 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -670,6 +670,58 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 return eta; } +static int secp256k1_fe_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { + + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t f = f0, g = g0, m, w, x, y, z; + int i = 62, limit, zeros; + + for (;;) { + + /* Use a sentinel bit to count zeros only up to i. 
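+ * (ORing in UINT64_MAX << i plants a set bit at position i, so the count returned
+ * by __builtin_ctzll never exceeds the number of divsteps still remaining.)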
*/ + zeros = __builtin_ctzll(g | (UINT64_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); + + if ((int16_t)eta < 0) { + eta = -eta; + x = f; f = g; g = -x; + y = u; u = q; q = -y; + z = v; v = r; r = -z; + } + + /* Handle up to 3 divsteps at once, subject to eta and i. */ + limit = (eta + 1) > i ? i : (eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 7U; + + /* Note that f * f == 1 mod 8, for any f. */ + w = (-f * g) & m; + g += f * w; + q += u * w; + r += v * w; + } + + t[0] = (int64_t)u; + t[1] = (int64_t)v; + t[2] = (int64_t)q; + t[3] = (int64_t)r; + + return eta; +} + static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); @@ -707,8 +759,6 @@ static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { -#if 1 - /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ @@ -802,97 +852,103 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #endif *r = b0; +} -#else - - secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; - int j; +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { - /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in - * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: - * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - secp256k1_fe_sqr(&x2, a); - secp256k1_fe_mul(&x2, &x2, a); + int64_t t[12 * 4]; + int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + int64_t g[5]; + secp256k1_fe b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in; +#endif - secp256k1_fe_sqr(&x3, &x2); - secp256k1_fe_mul(&x3, &x3, a); + /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input + * by 2^768, and then the output by 2^24. */ + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + secp256k1_fe_normalize(&b0); + secp256k1_fe_encode_62(g, &b0); - x6 = x3; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x6, &x6); - } - secp256k1_fe_mul(&x6, &x6, &x3); +#ifdef VERIFY + zero_in = secp256k1_fe_is_zero(&b0); +#endif - x9 = x6; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x9, &x9); - } - secp256k1_fe_mul(&x9, &x9, &x3); + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; - x11 = x9; - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&x11, &x11); + for (i = 0; i < 12; ++i) { + eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); + secp256k1_fe_update_fg(f, g, &t[i * 4]); } - secp256k1_fe_mul(&x11, &x11, &x2); - x22 = x11; - for (j=0; j<11; j++) { - secp256k1_fe_sqr(&x22, &x22); - } - secp256k1_fe_mul(&x22, &x22, &x11); + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. 
The matrix outputs from each _divsteps_62 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). + */ - x44 = x22; - for (j=0; j<22; j++) { - secp256k1_fe_sqr(&x44, &x44); - } - secp256k1_fe_mul(&x44, &x44, &x22); + VERIFY_CHECK(g[0] == 0); - x88 = x44; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x88, &x88); - } - secp256k1_fe_mul(&x88, &x88, &x44); + sign = (f[0] >> 1) & 1; - x176 = x88; - for (j=0; j<88; j++) { - secp256k1_fe_sqr(&x176, &x176); + for (i = 0; i < 3; ++i) { + int tOff = i * 16; + secp256k1_fe_combine_1s(&t[tOff + 0]); + secp256k1_fe_combine_1s(&t[tOff + 8]); + secp256k1_fe_combine_2s(&t[tOff + 0]); } - secp256k1_fe_mul(&x176, &x176, &x88); - x220 = x176; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x220, &x220); - } - secp256k1_fe_mul(&x220, &x220, &x44); + /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ + secp256k1_fe_decode_matrix(&b0, &t[4]); + /* secp256k1_fe_decode_matrix(&c0, &t[8]); */ + secp256k1_fe_decode_matrix(&d0, &t[12]); - x223 = x220; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x223, &x223); - } - secp256k1_fe_mul(&x223, &x223, &x3); + secp256k1_fe_decode_matrix(&a1, &t[16]); + secp256k1_fe_decode_matrix(&b1, &t[20]); + secp256k1_fe_decode_matrix(&c1, &t[24]); + secp256k1_fe_decode_matrix(&d1, &t[28]); + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + secp256k1_fe_mul(&c1, &c1, &b0); + secp256k1_fe_mul(&d1, &d1, &d0); - /* The final result is then assembled using a sliding window over the blocks. */ + b0 = a1; secp256k1_fe_add(&b0, &b1); + d0 = c1; secp256k1_fe_add(&d0, &d1); - t1 = x223; - for (j=0; j<23; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x22); - for (j=0; j<5; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, a); - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x2); - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(r, a, &t1); + secp256k1_fe_decode_matrix(&a1, &t[32]); + secp256k1_fe_decode_matrix(&b1, &t[36]); + /* secp256k1_fe_decode_matrix(&c1, &t[40]); */ + /* secp256k1_fe_decode_matrix(&d1, &t[44]); */ + + secp256k1_fe_mul(&a1, &a1, &b0); + secp256k1_fe_mul(&b1, &b1, &d0); + /* secp256k1_fe_mul(&c1, &c1, &b0); */ + /* secp256k1_fe_mul(&d1, &d1, &d0); */ + + b0 = a1; secp256k1_fe_add(&b0, &b1); + /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + + secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_cmov(&b0, &b1, sign); + secp256k1_fe_normalize_weak(&b0); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); #endif + + *r = b0; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_impl.h b/src/field_impl.h index c2b1cd2df2..ef15a0fc85 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -228,6 +228,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { } #endif +#if defined(SECP256K1_FE_INV_VAR_DEFAULT) static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #if defined(USE_FIELD_INV_BUILTIN) secp256k1_fe_inv(r, a); @@ -264,6 +265,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #error "Please select field inverse implementation" #endif } +#endif static void secp256k1_fe_inv_all_var(secp256k1_fe *r, const secp256k1_fe *a, size_t len) { secp256k1_fe u; diff --git a/src/scalar_4x64_impl.h 
b/src/scalar_4x64_impl.h index 43fc415eeb..f51580a19d 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1123,6 +1123,58 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, return eta; } +static int secp256k1_scalar_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { + + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t f = f0, g = g0, m, w, x, y, z; + int i = 62, limit, zeros; + + for (;;) { + + /* Use a sentinel bit to count zeros only up to i. */ + zeros = __builtin_ctzll(g | (UINT64_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); + + if ((int16_t)eta < 0) { + eta = -eta; + x = f; f = g; g = -x; + y = u; u = q; q = -y; + z = v; v = r; r = -z; + } + + /* Handle up to 3 divsteps at once, subject to eta and i. */ + limit = (eta + 1) > i ? i : (eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 7U; + + /* Note that f * f == 1 mod 8, for any f. */ + w = (-f * g) & m; + g += f * w; + q += u * w; + r += v * w; + } + + t[0] = (int64_t)u; + t[1] = (int64_t)v; + t[2] = (int64_t)q; + t[3] = (int64_t)r; + + return eta; +} + static void secp256k1_scalar_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); @@ -1169,7 +1221,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar * have a composite group order; fix it in exhaustive_tests.c). */ VERIFY_CHECK(*r != 0); } -#elif 1 +#else /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. @@ -1260,163 +1312,94 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { return !(a->d[0] & 1); } -#else - secp256k1_scalar *t; - int i; - /* First compute xN as x ^ (2^N - 1) for some values of N, - * and uM as x ^ M for some values of M. */ - secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; - secp256k1_scalar u2, u5, u9, u11, u13; - - secp256k1_scalar_sqr(&u2, x); - secp256k1_scalar_mul(&x2, &u2, x); - secp256k1_scalar_mul(&u5, &u2, &x2); - secp256k1_scalar_mul(&x3, &u5, &u2); - secp256k1_scalar_mul(&u9, &x3, &u2); - secp256k1_scalar_mul(&u11, &u9, &u2); - secp256k1_scalar_mul(&u13, &u11, &u2); - - secp256k1_scalar_sqr(&x6, &u13); - secp256k1_scalar_sqr(&x6, &x6); - secp256k1_scalar_mul(&x6, &x6, &u11); - - secp256k1_scalar_sqr(&x8, &x6); - secp256k1_scalar_sqr(&x8, &x8); - secp256k1_scalar_mul(&x8, &x8, &x2); - - secp256k1_scalar_sqr(&x14, &x8); - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(&x14, &x14); - } - secp256k1_scalar_mul(&x14, &x14, &x6); +#endif - secp256k1_scalar_sqr(&x28, &x14); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x28, &x28); - } - secp256k1_scalar_mul(&x28, &x28, &x14); +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { - secp256k1_scalar_sqr(&x56, &x28); - for (i = 0; i < 27; i++) { - secp256k1_scalar_sqr(&x56, &x56); - } - secp256k1_scalar_mul(&x56, &x56, &x28); + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
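+ * This variable-time variant matches secp256k1_scalar_inverse except that it uses
+ * _divsteps_62_var, whose running time depends on the value being inverted.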
+ */ - secp256k1_scalar_sqr(&x112, &x56); - for (i = 0; i < 55; i++) { - secp256k1_scalar_sqr(&x112, &x112); - } - secp256k1_scalar_mul(&x112, &x112, &x56); + int64_t t[12 * 4]; + int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + int64_t g[5]; + secp256k1_scalar b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif - secp256k1_scalar_sqr(&x126, &x112); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x126, &x126); - } - secp256k1_scalar_mul(&x126, &x126, &x14); + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + secp256k1_scalar_encode_62(g, &b0); - /* Then accumulate the final result (t starts at x126). */ - t = &x126; - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 10; i++) { /* 0000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 9; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x2); /* 11 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 10; i++) { /* 000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00000 */ - secp256k1_scalar_sqr(t, t); + /* The paper uses 'delta'; eta == -delta (a performance tweak). 
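+ * (Tracking eta rather than delta turns the paper's "delta > 0 and g odd" test into
+ * a simple sign check on eta, which is cheap to convert into a bit mask.)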
*/ + eta = -1; + + for (i = 0; i < 12; ++i) { + eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); + secp256k1_scalar_update_fg(f, g, &t[i * 4]); } - secp256k1_scalar_mul(t, t, x); /* 1 */ - for (i = 0; i < 8; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get + * the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). + */ + + VERIFY_CHECK(g[0] == 0); + + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 16; + secp256k1_scalar_combine_1s(&t[tOff + 0]); + secp256k1_scalar_combine_1s(&t[tOff + 8]); + secp256k1_scalar_combine_2s(&t[tOff + 0]); } - secp256k1_scalar_mul(r, t, &x6); /* 111111 */ -} -SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { - return !(a->d[0] & 1); -} + /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ + secp256k1_scalar_decode_matrix(&b0, &t[4]); + /* secp256k1_scalar_decode_matrix(&c0, &t[8]); */ + secp256k1_scalar_decode_matrix(&d0, &t[12]); + + secp256k1_scalar_decode_matrix(&a1, &t[16]); + secp256k1_scalar_decode_matrix(&b1, &t[20]); + secp256k1_scalar_decode_matrix(&c1, &t[24]); + secp256k1_scalar_decode_matrix(&d1, &t[28]); + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + secp256k1_scalar_mul(&c1, &c1, &b0); + secp256k1_scalar_mul(&d1, &d1, &d0); + + secp256k1_scalar_add(&b0, &a1, &b1); + secp256k1_scalar_add(&d0, &c1, &d1); + + secp256k1_scalar_decode_matrix(&a1, &t[32]); + secp256k1_scalar_decode_matrix(&b1, &t[36]); + /* secp256k1_scalar_decode_matrix(&c1, &t[40]); */ + /* secp256k1_scalar_decode_matrix(&d1, &t[44]); */ + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + /* secp256k1_scalar_mul(&c1, &c1, &b0); */ + /* secp256k1_scalar_mul(&d1, &d1, &d0); */ + + secp256k1_scalar_add(&b0, &a1, &b1); + /* secp256k1_scalar_add(&d0, &c1, &d1); */ + + secp256k1_scalar_cond_negate(&b0, sign); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); #endif + *r = b0; +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index d0593939ed..02d03f4504 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -1029,6 +1029,58 @@ static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, return eta; } +static int secp256k1_scalar_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { + + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t f = f0, g = g0, m, w, x, y, z; + int i = 31, limit, zeros; + + for (;;) { + + /* Use a sentinel bit to count zeros only up to i. */ + zeros = __builtin_ctzl(g | (UINT32_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); + + if ((int16_t)eta < 0) { + eta = -eta; + x = f; f = g; g = -x; + y = u; u = q; q = -y; + z = v; v = r; r = -z; + } + + /* Handle up to 3 divsteps at once, subject to eta and i. 
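+ * (m keeps at most the 3 lowest bits allowed by 'limit'; w below is chosen so that
+ * g + f*w ends in that many zero bits, which the next pass of the loop shifts out.)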
*/ + limit = (eta + 1) > i ? i : (eta + 1); + m = (UINT32_MAX >> (32 - limit)) & 7U; + + /* Note that f * f == 1 mod 8, for any f. */ + w = (-f * g) & m; + g += f * w; + q += u * w; + r += v * w; + } + + t[0] = (int32_t)u; + t[1] = (int32_t)v; + t[2] = (int32_t)q; + t[3] = (int32_t)r; + + return eta; +} + static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, int32_t *t) { const int32_t M31 = (int32_t)(UINT32_MAX >> 1); @@ -1075,7 +1127,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar * have a composite group order; fix it in exhaustive_tests.c). */ VERIFY_CHECK(*r != 0); } -#elif 1 +#else /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. @@ -1170,163 +1222,98 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { return !(a->d[0] & 1); } -#else - secp256k1_scalar *t; - int i; - /* First compute xN as x ^ (2^N - 1) for some values of N, - * and uM as x ^ M for some values of M. */ - secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; - secp256k1_scalar u2, u5, u9, u11, u13; - - secp256k1_scalar_sqr(&u2, x); - secp256k1_scalar_mul(&x2, &u2, x); - secp256k1_scalar_mul(&u5, &u2, &x2); - secp256k1_scalar_mul(&x3, &u5, &u2); - secp256k1_scalar_mul(&u9, &x3, &u2); - secp256k1_scalar_mul(&u11, &u9, &u2); - secp256k1_scalar_mul(&u13, &u11, &u2); - - secp256k1_scalar_sqr(&x6, &u13); - secp256k1_scalar_sqr(&x6, &x6); - secp256k1_scalar_mul(&x6, &x6, &u11); - - secp256k1_scalar_sqr(&x8, &x6); - secp256k1_scalar_sqr(&x8, &x8); - secp256k1_scalar_mul(&x8, &x8, &x2); - - secp256k1_scalar_sqr(&x14, &x8); - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(&x14, &x14); - } - secp256k1_scalar_mul(&x14, &x14, &x6); +#endif - secp256k1_scalar_sqr(&x28, &x14); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x28, &x28); - } - secp256k1_scalar_mul(&x28, &x28, &x14); +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { - secp256k1_scalar_sqr(&x56, &x28); - for (i = 0; i < 27; i++) { - secp256k1_scalar_sqr(&x56, &x56); - } - secp256k1_scalar_mul(&x56, &x56, &x28); + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + */ - secp256k1_scalar_sqr(&x112, &x56); - for (i = 0; i < 55; i++) { - secp256k1_scalar_sqr(&x112, &x112); - } - secp256k1_scalar_mul(&x112, &x112, &x56); + int32_t t[24 * 4]; + int32_t f[9] = { 0x50364141L, 0x7FA4BD19L, 0x3D2280EEL, 0x5576E735L, 0x7FFFFFEBL, + 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t g[9]; + secp256k1_scalar b0, d0, a1, b1, c1, d1; + int i, sign; + int16_t eta; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif - secp256k1_scalar_sqr(&x126, &x112); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x126, &x126); - } - secp256k1_scalar_mul(&x126, &x126, &x14); + /* Instead of dividing the output by 2^744, scale the input. */ + secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + secp256k1_scalar_encode_31(g, &b0); - /* Then accumulate the final result (t starts at x126). 
*/ - t = &x126; - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 10; i++) { /* 0000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 9; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x2); /* 11 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 10; i++) { /* 000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00000 */ - secp256k1_scalar_sqr(t, t); + /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + eta = -1; + + for (i = 0; i < 24; ++i) { + eta = secp256k1_scalar_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); + secp256k1_scalar_update_fg(f, g, &t[i * 4]); } - secp256k1_scalar_mul(t, t, x); /* 1 */ - for (i = 0; i < 8; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1. The matrix outputs from each _divsteps_31_var are combined to + * get the Bézout coefficients, and thus the modular inverse. The matrix outputs of + * _divsteps_31_var introduce an extra factor of 2^31 each, so there is a total extra + * factor of 2^744 to account for (by scaling the input and/or output accordingly). 
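+ * (For a nonzero input f ends up as +/-1 in signed-31-bit-limb form: +1 has
+ * f[0] == 1 while -1 has f[0] == 0x7FFFFFFF, so bit 1 of f[0] records the sign
+ * used for the final conditional negation.)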
+ */ + + VERIFY_CHECK(g[0] == 0); + + sign = (f[0] >> 1) & 1; + + for (i = 0; i < 3; ++i) { + int tOff = i * 32; + secp256k1_scalar_combine_1s(&t[tOff + 0]); + secp256k1_scalar_combine_1s(&t[tOff + 8]); + secp256k1_scalar_combine_1s(&t[tOff + 16]); + secp256k1_scalar_combine_1s(&t[tOff + 24]); + secp256k1_scalar_combine_2s(&t[tOff + 0]); + secp256k1_scalar_combine_2s(&t[tOff + 16]); + secp256k1_scalar_combine_4s(&t[tOff + 0]); } - secp256k1_scalar_mul(r, t, &x6); /* 111111 */ -} -SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { - return !(a->d[0] & 1); -} + /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ + secp256k1_scalar_decode_matrix(&b0, &t[8]); + /* secp256k1_scalar_decode_matrix(&c0, &t[16]); */ + secp256k1_scalar_decode_matrix(&d0, &t[24]); + + secp256k1_scalar_decode_matrix(&a1, &t[32]); + secp256k1_scalar_decode_matrix(&b1, &t[40]); + secp256k1_scalar_decode_matrix(&c1, &t[48]); + secp256k1_scalar_decode_matrix(&d1, &t[56]); + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + secp256k1_scalar_mul(&c1, &c1, &b0); + secp256k1_scalar_mul(&d1, &d1, &d0); + + secp256k1_scalar_add(&b0, &a1, &b1); + secp256k1_scalar_add(&d0, &c1, &d1); + + secp256k1_scalar_decode_matrix(&a1, &t[64]); + secp256k1_scalar_decode_matrix(&b1, &t[72]); + /* secp256k1_scalar_decode_matrix(&c1, &t[80]); */ + /* secp256k1_scalar_decode_matrix(&d1, &t[88]); */ + + secp256k1_scalar_mul(&a1, &a1, &b0); + secp256k1_scalar_mul(&b1, &b1, &d0); + /* secp256k1_scalar_mul(&c1, &c1, &b0); */ + /* secp256k1_scalar_mul(&d1, &d1, &d0); */ + + secp256k1_scalar_add(&b0, &a1, &b1); + /* secp256k1_scalar_add(&d0, &c1, &d1); */ + + secp256k1_scalar_cond_negate(&b0, sign); + +#ifdef VERIFY + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); #endif + *r = b0; +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_impl.h b/src/scalar_impl.h index a63b735491..69f31f6c51 100644 --- a/src/scalar_impl.h +++ b/src/scalar_impl.h @@ -233,6 +233,7 @@ SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) #endif #endif +#if defined(SECP256K1_SCALAR_INV_VAR_DEFAULT) static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(USE_SCALAR_INV_BUILTIN) secp256k1_scalar_inverse(r, x); @@ -253,6 +254,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc #error "Please select scalar inverse implementation" #endif } +#endif #ifdef USE_ENDOMORPHISM #if defined(EXHAUSTIVE_TEST_ORDER) diff --git a/src/scalar_low.h b/src/scalar_low.h index c31ca35376..53ea913203 100644 --- a/src/scalar_low.h +++ b/src/scalar_low.h @@ -15,5 +15,6 @@ typedef uint32_t secp256k1_scalar; #define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) (d0) #define SECP256K1_SCALAR_INV_DEFAULT +#define SECP256K1_SCALAR_INV_VAR_DEFAULT #endif /* SECP256K1_SCALAR_REPR_H */ From bd184711c8897e36ca7a24ee5f502dcc8c6ccabb Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Fri, 31 Jul 2020 22:04:11 +0700 Subject: [PATCH 10/34] Simplify type of 'eta' --- src/field_10x26_impl.h | 20 ++++++++++---------- src/field_5x52_impl.h | 20 ++++++++++---------- src/scalar_4x64_impl.h | 20 ++++++++++---------- src/scalar_8x32_impl.h | 20 ++++++++++---------- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index c502c772e6..8d954a5360 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1438,7 +1438,7 @@ static void 
secp256k1_fe_encode_31(int32_t *r, const secp256k1_fe *a) { r[8] = a9 >> 14; } -static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_fe_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t c1, c2, f = f0, g = g0, x, y, z; @@ -1450,7 +1450,7 @@ static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int3 VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); - c1 = -(g & (eta >> 15)); + c1 = -(g & (eta >> 31)); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1461,7 +1461,7 @@ static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int3 z = (v ^ r) & c1; v ^= z; r ^= z; r ^= c1; r -= c1; - eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + eta = (eta ^ c1) - c1 - 1; c2 = -(g & 1); @@ -1478,7 +1478,7 @@ static int secp256k1_fe_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int3 return eta; } -static int secp256k1_fe_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_fe_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; @@ -1504,7 +1504,7 @@ static int secp256k1_fe_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); - if ((int16_t)eta < 0) { + if ((int32_t)eta < 0) { eta = -eta; x = f; f = g; g = -x; y = u; u = q; q = -y; @@ -1512,7 +1512,7 @@ static int secp256k1_fe_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, } /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = (eta + 1) > i ? i : (eta + 1); + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT32_MAX >> (32 - limit)) & 7U; /* Note that f * f == 1 mod 8, for any f. */ @@ -1577,7 +1577,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { int32_t g[9]; secp256k1_fe b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint32_t eta; #ifdef VERIFY int zero_in; #endif @@ -1594,7 +1594,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). */ - eta = -1; + eta = -(uint32_t)1; for (i = 0; i < 24; ++i) { eta = secp256k1_fe_divsteps_31(eta, f[0], g[0], &t[i * 4]); @@ -1678,7 +1678,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { int32_t g[9]; secp256k1_fe b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint32_t eta; #ifdef VERIFY int zero_in; #endif @@ -1695,7 +1695,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). 
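For reference, the step being iterated is the divstep from the Bernstein-Yang paper; a minimal, non-constant-time sketch (illustrative names only, assuming arithmetic right shift of signed values) is:

    /* One divstep on (delta, f, g) with f odd, as defined in the paper:
     *   (delta, f, g) -> (1 - delta, g, (g - f)/2)         if delta > 0 and g is odd
     *   (delta, f, g) -> (1 + delta, f, (g + (g&1)*f)/2)   otherwise
     * The library stores eta == -delta, so the "delta > 0" test becomes a
     * sign-bit test (eta >> 31 here, eta >> 63 in the 64-bit code). */
    static void divstep_ref(int32_t *delta, int32_t *f, int32_t *g) {
        if (*delta > 0 && (*g & 1)) {
            int32_t tmp = *f;
            *delta = 1 - *delta;
            *f = *g;
            *g = (*g - tmp) >> 1;
        } else {
            *delta = 1 + *delta;
            *g = (*g + (*g & 1) * *f) >> 1;
        }
    }

Each _divsteps_31 call performs 31 of these branchlessly and records their net effect on (f, g) as the matrix written to t.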
*/ - eta = -1; + eta = -(uint32_t)1; for (i = 0; i < 24; ++i) { eta = secp256k1_fe_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 0cc7e80056..8a56aeac38 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -630,7 +630,7 @@ static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { r[4] = a4 >> 40; } -static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t c1, c2, f = f0, g = g0, x, y, z; @@ -642,7 +642,7 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); - c1 = -(g & (eta >> 15)); + c1 = -(g & (eta >> 63)); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -653,7 +653,7 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 z = (v ^ r) & c1; v ^= z; r ^= z; r ^= c1; r -= c1; - eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + eta = (eta ^ c1) - c1 - 1; c2 = -(g & 1); @@ -670,7 +670,7 @@ static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int6 return eta; } -static int secp256k1_fe_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t f = f0, g = g0, m, w, x, y, z; @@ -696,7 +696,7 @@ static int secp256k1_fe_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); - if ((int16_t)eta < 0) { + if ((int64_t)eta < 0) { eta = -eta; x = f; f = g; g = -x; y = u; u = q; q = -y; @@ -704,7 +704,7 @@ static int secp256k1_fe_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, } /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = (eta + 1) > i ? i : (eta + 1); + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT64_MAX >> (64 - limit)) & 7U; /* Note that f * f == 1 mod 8, for any f. */ @@ -769,7 +769,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { int64_t g[5]; secp256k1_fe b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint64_t eta; #ifdef VERIFY int zero_in; #endif @@ -786,7 +786,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). */ - eta = -1; + eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], &t[i * 4]); @@ -866,7 +866,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { int64_t g[5]; secp256k1_fe b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint64_t eta; #ifdef VERIFY int zero_in; #endif @@ -883,7 +883,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). 
*/ - eta = -1; + eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index f51580a19d..8ce50a9ad3 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1083,7 +1083,7 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { #endif } -static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t c1, c2, f = f0, g = g0, x, y, z; @@ -1095,7 +1095,7 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); - c1 = -(g & (eta >> 15)); + c1 = -(g & (eta >> 63)); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1106,7 +1106,7 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, z = (v ^ r) & c1; v ^= z; r ^= z; r ^= c1; r -= c1; - eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + eta = (eta ^ c1) - c1 - 1; c2 = -(g & 1); @@ -1123,7 +1123,7 @@ static int secp256k1_scalar_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, return eta; } -static int secp256k1_scalar_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t f = f0, g = g0, m, w, x, y, z; @@ -1149,7 +1149,7 @@ static int secp256k1_scalar_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); - if ((int16_t)eta < 0) { + if ((int64_t)eta < 0) { eta = -eta; x = f; f = g; g = -x; y = u; u = q; q = -y; @@ -1157,7 +1157,7 @@ static int secp256k1_scalar_divsteps_62_var(uint16_t eta, uint64_t f0, uint64_t } /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = (eta + 1) > i ? i : (eta + 1); + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT64_MAX >> (64 - limit)) & 7U; /* Note that f * f == 1 mod 8, for any f. */ @@ -1233,7 +1233,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar int64_t g[5]; secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1243,7 +1243,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). */ - eta = -1; + eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], &t[i * 4]); @@ -1326,7 +1326,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int64_t g[5]; secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1336,7 +1336,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
*/ - eta = -1; + eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 02d03f4504..54fd10f385 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -989,7 +989,7 @@ static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { #endif } -static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_scalar_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t c1, c2, f = f0, g = g0, x, y, z; @@ -1001,7 +1001,7 @@ static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, VERIFY_CHECK((u * f0 + v * g0) == -f << i); VERIFY_CHECK((q * f0 + r * g0) == -g << i); - c1 = -(g & (eta >> 15)); + c1 = -(g & (eta >> 31)); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1012,7 +1012,7 @@ static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, z = (v ^ r) & c1; v ^= z; r ^= z; r ^= c1; r -= c1; - eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1; + eta = (eta ^ c1) - c1 - 1; c2 = -(g & 1); @@ -1029,7 +1029,7 @@ static int secp256k1_scalar_divsteps_31(uint16_t eta, uint32_t f0, uint32_t g0, return eta; } -static int secp256k1_scalar_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_scalar_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; @@ -1055,7 +1055,7 @@ static int secp256k1_scalar_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); - if ((int16_t)eta < 0) { + if ((int32_t)eta < 0) { eta = -eta; x = f; f = g; g = -x; y = u; u = q; q = -y; @@ -1063,7 +1063,7 @@ static int secp256k1_scalar_divsteps_31_var(uint16_t eta, uint32_t f0, uint32_t } /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = (eta + 1) > i ? i : (eta + 1); + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT32_MAX >> (32 - limit)) & 7U; /* Note that f * f == 1 mod 8, for any f. */ @@ -1139,7 +1139,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar int32_t g[9]; secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1149,7 +1149,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar secp256k1_scalar_encode_31(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). */ - eta = -1; + eta = -(uint32_t)1; for (i = 0; i < 24; ++i) { eta = secp256k1_scalar_divsteps_31(eta, f[0], g[0], &t[i * 4]); @@ -1236,7 +1236,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int32_t g[9]; secp256k1_scalar b0, d0, a1, b1, c1, d1; int i, sign; - int16_t eta; + uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1246,7 +1246,7 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc secp256k1_scalar_encode_31(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
*/ - eta = -1; + eta = -(uint32_t)1; for (i = 0; i < 24; ++i) { eta = secp256k1_scalar_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); From b8c7390bc3a68cfda16e78a0914f47eb1a0bfe4f Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 8 Aug 2020 16:58:43 +0700 Subject: [PATCH 11/34] =?UTF-8?q?field=5F5x52:=20update=20B=C3=A9zout=20co?= =?UTF-8?q?efficients=20on-the-fly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/field_5x52_impl.h | 316 +++++++++++++++--------------------------- 1 file changed, 113 insertions(+), 203 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 8a56aeac38..c6e6dd0b0c 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -498,107 +498,29 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } -static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( - 0x0E90A100UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, - 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL -); +static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { -static void secp256k1_fe_mul_add_2(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { - - /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. - * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and - * writes the 252-bit signed result to [t[0],t[1],t[2],t[3]]. - */ - - int64_t z0, z1, z2, z3; - int128_t tt; - - tt = (int128_t)a0 * b0 - + (int128_t)c0 * d0; - z0 = (int64_t)tt; tt -= z0; tt >>= 64; - - tt += (int128_t)a0 * b1 - + (int128_t)a1 * b0 - + (int128_t)c0 * d1 - + (int128_t)c1 * d0; - z1 = (int64_t)tt; tt -= z1; tt >>= 64; - - tt += (int128_t)a1 * b1 - + (int128_t)c1 * d1; - z2 = (int64_t)tt; tt -= z2; tt >>= 64; - - z3 = (int64_t)tt; - - t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; -} - -static void secp256k1_fe_combine_1s(int64_t *t) { - - int64_t a = t[0], b = t[1], c = t[2], d = t[3], - e = t[4], f = t[5], g = t[6], h = t[7]; - int128_t I, J, K, L; - - I = (int128_t)e * a + (int128_t)f * c; - J = (int128_t)e * b + (int128_t)f * d; - K = (int128_t)g * a + (int128_t)h * c; - L = (int128_t)g * b + (int128_t)h * d; - - a = (int64_t)I; I -= a; I >>= 64; b = (int64_t)I; - c = (int64_t)J; J -= c; J >>= 64; d = (int64_t)J; - e = (int64_t)K; K -= e; K >>= 64; f = (int64_t)K; - g = (int64_t)L; L -= g; L >>= 64; h = (int64_t)L; + const uint64_t M52 = UINT64_MAX >> 12; + uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + uint64_t r0, r1, r2, r3, r4; - t[0] = a; t[1] = b; t[2] = c; t[3] = d; - t[4] = e; t[5] = f; t[6] = g; t[7] = h; -} + VERIFY_CHECK(a0 >> 62 == 0); + VERIFY_CHECK(a1 >> 62 == 0); + VERIFY_CHECK(a2 >> 62 == 0); + VERIFY_CHECK(a3 >> 62 == 0); -static void secp256k1_fe_combine_2s(int64_t *t) { - - int64_t a0 = t[ 0], a1 = t[ 1]; - int64_t b0 = t[ 2], b1 = t[ 3]; - int64_t c0 = t[ 4], c1 = t[ 5]; - int64_t d0 = t[ 6], d1 = t[ 7]; - int64_t e0 = t[ 8], e1 = t[ 9]; - int64_t f0 = t[10], f1 = t[11]; - int64_t g0 = t[12], g1 = t[13]; - int64_t h0 = t[14], h1 = t[15]; - - secp256k1_fe_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_fe_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_fe_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_fe_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); -} + /* Add a multiple of the field prime in case u4 is "negative". 
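An aid to reading the packing below (illustrative breakdown only): d is held as five signed 62-bit limbs, so its value is a0 + a1*2^62 + a2*2^124 + a3*2^186 + a4*2^248, and the assignments that follow re-slice that integer, after adding 8 times the prime to keep it non-negative, into the usual 52-bit field limbs:

    n[0] <- bits   0..51  :  a0
    n[1] <- bits  52..103 :  a0 >> 52 | a1 << 10
    n[2] <- bits 104..155 :  a1 >> 42 | a2 << 20
    n[3] <- bits 156..207 :  a2 >> 32 | a3 << 30
    n[4] <- bits 208..    :  a3 >> 22 | a4 << 40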
*/ + r0 = 0xFFFFEFFFFFC2FULL * 8; + r1 = 0xFFFFFFFFFFFFFULL * 8; + r2 = 0xFFFFFFFFFFFFFULL * 8; + r3 = 0xFFFFFFFFFFFFFULL * 8; + r4 = 0x0FFFFFFFFFFFFULL * 8; -static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { - - uint64_t u0, u1, u2, u3, u4; - uint64_t r0, r1, r2, r3, r4; - int128_t cc; - - cc = t[0]; - u0 = (uint64_t)cc; cc >>= 64; - cc += t[1]; - u1 = (uint64_t)cc; cc >>= 64; - cc += t[2]; - u2 = (uint64_t)cc; cc >>= 64; - cc += t[3]; - u3 = (uint64_t)cc; cc >>= 64; - u4 = (uint64_t)cc; - - VERIFY_CHECK(u4 == 0 || u4 == UINT64_MAX); - - /* Add twice the field prime in case u4 is non-zero (which represents -2^256). */ - r0 = 0xFFFFEFFFFFC2FULL * 2; - r1 = 0xFFFFFFFFFFFFFULL * 2; - r2 = 0xFFFFFFFFFFFFFULL * 2; - r3 = 0xFFFFFFFFFFFFFULL * 2; - r4 = 0x0FFFFFFFFFFFFULL * 2; - - r0 += u0 & 0xFFFFFFFFFFFFFULL; - r1 += (u0 >> 52 | u1 << 12) & 0xFFFFFFFFFFFFFULL; - r2 += (u1 >> 40 | u2 << 24) & 0xFFFFFFFFFFFFFULL; - r3 += (u2 >> 28 | u3 << 36) & 0xFFFFFFFFFFFFFULL; - r4 += (u3 >> 16 | u4 << 48); + r0 += a0 & M52; + r1 += (a0 >> 52 | a1 << 10) & M52; + r2 += (a1 >> 42 | a2 << 20) & M52; + r3 += (a2 >> 32 | a3 << 30) & M52; + r4 += (a3 >> 22 | a4 << 40); r->n[0] = r0; r->n[1] = r1; @@ -607,7 +529,7 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) { r->n[4] = r4; #ifdef VERIFY - r->magnitude = 2; + r->magnitude = 7; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -722,6 +644,67 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t return eta; } +static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { + + /* I64 == -P^-1 mod 2^64 */ + const int64_t I64 = 0xD838091DD2253531LL; + const int64_t C64 = 0x1000003D1LL; + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + int128_t cd = 0, ce = 0; + int i; + + di = d[0]; + ei = e[0]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ + md = ((int128_t)I64 * (int64_t)cd) & M62; + me = ((int128_t)I64 * (int64_t)ce) & M62; + + /* P == 2^256 - C64; subtract C64 products here. */ + cd -= (int128_t)C64 * md; + ce -= (int128_t)C64 * me; + + VERIFY_CHECK(((int64_t)cd & M62) == 0); + VERIFY_CHECK(((int64_t)ce & M62) == 0); + + cd >>= 62; + ce >>= 62; + + for (i = 1; i < 4; ++i) { + + di = d[i]; + ei = e[i]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + d[i - 1] = (int64_t)cd & M62; cd >>= 62; + e[i - 1] = (int64_t)ce & M62; ce >>= 62; + } + + { + di = d[4]; + ei = e[4]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + /* In the final iteration, add the 2^256 products. */ + cd += (int128_t)md << 8; + ce += (int128_t)me << 8; + + d[3] = (int64_t)cd & M62; cd >>= 62; + e[3] = (int64_t)ce & M62; ce >>= 62; + } + + d[4] = (int64_t)cd; + e[4] = (int64_t)ce; +} + static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); @@ -763,21 +746,20 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
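A sketch of why d ends up holding the inverse (not text from the patch; it assumes the reading that _update_de applies to (d, e), modulo P, the same transition matrix that _update_fg applies to (f, g)): initially (f, g) = (P, a) and (d, e) = (0, 1), so

    f == d * a (mod P)   and   g == e * a (mod P).

Each iteration replaces both pairs by the same linear combination and divides by 2^62; for (d, e) that division is made exact by adding the md*P and me*P multiples, and since 2^62 is invertible modulo P both congruences are preserved. When g reaches 0 and f reaches +/- 1, the first congruence reads +/- 1 == d * a (mod P), i.e. d == +/- a^-1 (mod P). This is also why the 2^744 pre-scaling of earlier revisions is no longer needed: the division by 2^62 is carried out exactly at every step instead of being deferred to the end.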
*/ - int64_t t[12 * 4]; + int64_t t[4]; + int64_t d[5] = { 0, 0, 0, 0, 0 }; + int64_t e[5] = { 1, 0, 0, 0, 0 }; int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_fe b0, d0, a1, b1, c1, d1; + secp256k1_fe b0, b1; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in; #endif - /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input - * by 2^768, and then the output by 2^24. */ - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + b0 = *a; secp256k1_fe_normalize(&b0); secp256k1_fe_encode_62(g, &b0); @@ -789,61 +771,23 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { - eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], &t[i * 4]); - secp256k1_fe_update_fg(f, g, &t[i * 4]); + eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], t); + secp256k1_fe_update_de(d, e, t); + secp256k1_fe_update_fg(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK(g[0] == 0); + VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 16; - secp256k1_fe_combine_1s(&t[tOff + 0]); - secp256k1_fe_combine_1s(&t[tOff + 8]); - secp256k1_fe_combine_2s(&t[tOff + 0]); - } - - /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ - secp256k1_fe_decode_matrix(&b0, &t[4]); - /* secp256k1_fe_decode_matrix(&c0, &t[8]); */ - secp256k1_fe_decode_matrix(&d0, &t[12]); - - secp256k1_fe_decode_matrix(&a1, &t[16]); - secp256k1_fe_decode_matrix(&b1, &t[20]); - secp256k1_fe_decode_matrix(&c1, &t[24]); - secp256k1_fe_decode_matrix(&d1, &t[28]); - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - secp256k1_fe_mul(&c1, &c1, &b0); - secp256k1_fe_mul(&d1, &d1, &d0); - - b0 = a1; secp256k1_fe_add(&b0, &b1); - d0 = c1; secp256k1_fe_add(&d0, &d1); - - secp256k1_fe_decode_matrix(&a1, &t[32]); - secp256k1_fe_decode_matrix(&b1, &t[36]); - /* secp256k1_fe_decode_matrix(&c1, &t[40]); */ - /* secp256k1_fe_decode_matrix(&d1, &t[44]); */ + secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - /* secp256k1_fe_mul(&c1, &c1, &b0); */ - /* secp256k1_fe_mul(&d1, &d1, &d0); */ - - b0 = a1; secp256k1_fe_add(&b0, &b1); - /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ - - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 7); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -860,21 +804,20 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
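Two things distinguish the variable-time path that follows (an aside, not from the patch): the loop exits as soon as every limb of g is zero, and _divsteps_62_var batches runs of divsteps in which g is even. The batching rests on the fact quoted in its comment that f * f == 1 (mod 8) for any odd f: with m covering limit <= 3 low bits, w = (-f * g) & m gives f * w == -g (mod 2^limit), so g += f * w clears those low bits of g and the following shift retires up to three divsteps at once.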
*/ - int64_t t[12 * 4]; + int64_t t[4]; + int64_t d[5] = { 0, 0, 0, 0, 0 }; + int64_t e[5] = { 1, 0, 0, 0, 0 }; int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_fe b0, d0, a1, b1, c1, d1; + secp256k1_fe b0, b1; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in; #endif - /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input - * by 2^768, and then the output by 2^24. */ - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + b0 = *a; secp256k1_fe_normalize(&b0); secp256k1_fe_encode_62(g, &b0); @@ -886,61 +829,28 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { - eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); - secp256k1_fe_update_fg(f, g, &t[i * 4]); + eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); + secp256k1_fe_update_de(d, e, t); + secp256k1_fe_update_fg(f, g, t); + + if (g[0] == 0) { + if ((g[1] | g[2] | g[3] | g[4]) == 0) { + break; + } + } } - /* At this point sufficient iterations have been performed that g must have reached 0 - * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + VERIFY_CHECK(i < 12); - VERIFY_CHECK(g[0] == 0); + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. 
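To make the sign test below concrete (illustrative values; they assume a non-zero input so that f ends at +/- 1): in the signed 62-bit-limb representation produced by _update_fg_62 the bottom limbs are masked to 62 bits, so

    f == +1  ->  f[0] == 0x0000000000000001,  (f[0] >> 1) & 1 == 0
    f == -1  ->  f[0] == 0x3FFFFFFFFFFFFFFF,  (f[0] >> 1) & 1 == 1

and bit 1 of f[0] is exactly the sign that the conditional negation of the decoded d must undo.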
+ */ sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 16; - secp256k1_fe_combine_1s(&t[tOff + 0]); - secp256k1_fe_combine_1s(&t[tOff + 8]); - secp256k1_fe_combine_2s(&t[tOff + 0]); - } - - /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ - secp256k1_fe_decode_matrix(&b0, &t[4]); - /* secp256k1_fe_decode_matrix(&c0, &t[8]); */ - secp256k1_fe_decode_matrix(&d0, &t[12]); - - secp256k1_fe_decode_matrix(&a1, &t[16]); - secp256k1_fe_decode_matrix(&b1, &t[20]); - secp256k1_fe_decode_matrix(&c1, &t[24]); - secp256k1_fe_decode_matrix(&d1, &t[28]); - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - secp256k1_fe_mul(&c1, &c1, &b0); - secp256k1_fe_mul(&d1, &d1, &d0); - - b0 = a1; secp256k1_fe_add(&b0, &b1); - d0 = c1; secp256k1_fe_add(&d0, &d1); - - secp256k1_fe_decode_matrix(&a1, &t[32]); - secp256k1_fe_decode_matrix(&b1, &t[36]); - /* secp256k1_fe_decode_matrix(&c1, &t[40]); */ - /* secp256k1_fe_decode_matrix(&d1, &t[44]); */ - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - /* secp256k1_fe_mul(&c1, &c1, &b0); */ - /* secp256k1_fe_mul(&d1, &d1, &d0); */ - - b0 = a1; secp256k1_fe_add(&b0, &b1); - /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 7); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); From 64a4912c436bc44432443938af8c6dbfaf02f911 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 8 Aug 2020 19:38:06 +0700 Subject: [PATCH 12/34] =?UTF-8?q?field=5F10x26:=20update=20B=C3=A9zout=20c?= =?UTF-8?q?oefficients=20on-the-fly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/field_10x26_impl.h | 552 +++++++++++++---------------------------- src/field_5x52_impl.h | 48 ++-- 2 files changed, 191 insertions(+), 409 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 8d954a5360..30c5653c78 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1164,238 +1164,44 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } -static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST( - 0x0E90A100UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, - 0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL -); +static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { -static void secp256k1_fe_mul_add_2(int32_t a0, int32_t a1, int32_t b0, int32_t b1, int32_t c0, int32_t c1, int32_t d0, int32_t d1, int32_t *t) { - - /* Each [a0,a1], etc. pair is a ??-bit signed value e.g. a0 + a1 * 2^32. - * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and - * writes the ???-bit signed result to [t[0],t[1],t[2],t[3]]. 
- */ - - int32_t z0, z1, z2, z3; - int64_t tt; - - tt = (int64_t)a0 * b0 - + (int64_t)c0 * d0; - z0 = (int32_t)tt; tt -= z0; tt >>= 32; - - tt += (int64_t)a0 * b1 - + (int64_t)a1 * b0 - + (int64_t)c0 * d1 - + (int64_t)c1 * d0; - z1 = (int32_t)tt; tt -= z1; tt >>= 32; - - tt += (int64_t)a1 * b1 - + (int64_t)c1 * d1; - z2 = (int32_t)tt; tt -= z2; tt >>= 32; - - z3 = (int32_t)tt; - - t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; -} - -static void secp256k1_fe_mul_add_4(int32_t* tIn, int xPos, int yPos, int uPos, int vPos, int32_t *tOut, int zzPos) { - int32_t y0 = tIn[yPos + 0]; - int32_t y1 = tIn[yPos + 1]; - int32_t y2 = tIn[yPos + 2]; - int32_t y3 = tIn[yPos + 3]; - int32_t v0 = tIn[vPos + 0]; - int32_t v1 = tIn[vPos + 1]; - int32_t v2 = tIn[vPos + 2]; - int32_t v3 = tIn[vPos + 3]; - int32_t xVal, uVal; - int32_t z0, z1, z2, z3, z4, z5, z6, z7; - int64_t c; - - xVal = tIn[xPos]; - uVal = tIn[uPos]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0; - z0 = (int32_t)c; c -= z0; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1; - z1 = (int32_t)c; c -= z1; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3; - z3 = (int32_t)c; c -= z3; c >>= 32; - z4 = (int32_t)c; - - xVal = tIn[xPos + 1]; - uVal = tIn[uPos + 1]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z1; - z1 = (int32_t)c; c -= z1; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - z5 = (int32_t)c; - - xVal = tIn[xPos + 2]; - uVal = tIn[uPos + 2]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z5; - z5 = (int32_t)c; c -= z5; c >>= 32; - z6 = (int32_t)c; - - xVal = tIn[xPos + 3]; - uVal = tIn[uPos + 3]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z5; - z5 = (int32_t)c; c -= z5; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z6; - z6 = (int32_t)c; c -= z6; c >>= 32; - z7 = (int32_t)c; - - tOut[zzPos + 0] = z0; - tOut[zzPos + 1] = z1; - tOut[zzPos + 2] = z2; - tOut[zzPos + 3] = z3; - tOut[zzPos + 4] = z4; - tOut[zzPos + 5] = z5; - tOut[zzPos + 6] = z6; - tOut[zzPos + 7] = z7; -} - -static void secp256k1_fe_combine_1s(int32_t *t) { - - int32_t a = t[0], b = t[1], c = t[2], d = t[3], - e = t[4], f = t[5], g = t[6], h = t[7]; - int64_t I, J, K, L; - - I = (int64_t)e * a + (int64_t)f * c; - J = (int64_t)e * b + (int64_t)f * d; - K = (int64_t)g * a + (int64_t)h * c; - L = (int64_t)g * b + (int64_t)h * d; - - a = (int32_t)I; I -= a; I >>= 32; b = (int32_t)I; - c = (int32_t)J; J -= c; J >>= 32; d = (int32_t)J; - e = (int32_t)K; K -= e; K >>= 32; f = (int32_t)K; - g = (int32_t)L; L -= g; L >>= 32; h = (int32_t)L; - - t[0] = a; t[1] = b; t[2] = c; t[3] = d; - t[4] = e; t[5] = f; t[6] = g; t[7] = h; -} - -static void secp256k1_fe_combine_2s(int32_t *t) { - - int32_t a0 = t[ 0], a1 = t[ 1]; - int32_t b0 = t[ 2], b1 = t[ 3]; - 
int32_t c0 = t[ 4], c1 = t[ 5]; - int32_t d0 = t[ 6], d1 = t[ 7]; - int32_t e0 = t[ 8], e1 = t[ 9]; - int32_t f0 = t[10], f1 = t[11]; - int32_t g0 = t[12], g1 = t[13]; - int32_t h0 = t[14], h1 = t[15]; - - secp256k1_fe_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_fe_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_fe_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_fe_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); -} - -static void secp256k1_fe_combine_4s(int32_t *t) -{ - int32_t tmp[32]; - - int aPos = 0; - int bPos = 4; - int cPos = 8; - int dPos = 12; - int ePos = 16; - int fPos = 20; - int gPos = 24; - int hPos = 28; - - int IPos = 0; - int JPos = 8; - int KPos = 16; - int LPos = 24; - - secp256k1_fe_mul_add_4(t, ePos, aPos, fPos, cPos, tmp, IPos); - secp256k1_fe_mul_add_4(t, ePos, bPos, fPos, dPos, tmp, JPos); - secp256k1_fe_mul_add_4(t, gPos, aPos, hPos, cPos, tmp, KPos); - secp256k1_fe_mul_add_4(t, gPos, bPos, hPos, dPos, tmp, LPos); - - memcpy(t, tmp, 32 * sizeof(int32_t)); -} - -static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int32_t *t) { - - uint32_t u0, u1, u2, u3, u4, u5, u6, u7, u8; + const uint32_t M26 = UINT32_MAX >> 6; + uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], + a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; - int64_t cc; - - cc = t[0]; - u0 = (uint32_t)cc; cc >>= 32; - cc += t[1]; - u1 = (uint32_t)cc; cc >>= 32; - cc += t[2]; - u2 = (uint32_t)cc; cc >>= 32; - cc += t[3]; - u3 = (uint32_t)cc; cc >>= 32; - cc += t[4]; - u4 = (uint32_t)cc; cc >>= 32; - cc += t[5]; - u5 = (uint32_t)cc; cc >>= 32; - cc += t[6]; - u6 = (uint32_t)cc; cc >>= 32; - cc += t[7]; - u7 = (uint32_t)cc; cc >>= 32; - u8 = (uint32_t)cc; - - VERIFY_CHECK(u8 == 0 || u8 == UINT32_MAX); - - /* Add twice the field prime in case u8 is non-zero (which represents -2^256). */ - r0 = 0x3FFFC2FUL * 2; - r1 = 0x3FFFFBFUL * 2; - r2 = 0x3FFFFFFUL * 2; - r3 = 0x3FFFFFFUL * 2; - r4 = 0x3FFFFFFUL * 2; - r5 = 0x3FFFFFFUL * 2; - r6 = 0x3FFFFFFUL * 2; - r7 = 0x3FFFFFFUL * 2; - r8 = 0x3FFFFFFUL * 2; - r9 = 0x03FFFFFUL * 2; - - r0 += ( u0 ) & 0x3FFFFFFUL; - r1 += (u0 >> 26 | u1 << 6) & 0x3FFFFFFUL; - r2 += (u1 >> 20 | u2 << 12) & 0x3FFFFFFUL; - r3 += (u2 >> 14 | u3 << 18) & 0x3FFFFFFUL; - r4 += (u3 >> 8 | u4 << 24) & 0x3FFFFFFUL; - r5 += (u4 >> 2 ) & 0x3FFFFFFUL; - r6 += (u4 >> 28 | u5 << 4) & 0x3FFFFFFUL; - r7 += (u5 >> 22 | u6 << 10) & 0x3FFFFFFUL; - r8 += (u6 >> 16 | u7 << 16) & 0x3FFFFFFUL; - r9 += (u7 >> 10 | u8 << 22); + + VERIFY_CHECK(a0 >> 30 == 0); + VERIFY_CHECK(a1 >> 30 == 0); + VERIFY_CHECK(a2 >> 30 == 0); + VERIFY_CHECK(a3 >> 30 == 0); + VERIFY_CHECK(a4 >> 30 == 0); + VERIFY_CHECK(a5 >> 30 == 0); + VERIFY_CHECK(a6 >> 30 == 0); + VERIFY_CHECK(a7 >> 30 == 0); + + /* Add a multiple of the field prime in case u4 is "negative". 
*/ + r0 = 0x3FFFC2FUL * 8; + r1 = 0x3FFFFBFUL * 8; + r2 = 0x3FFFFFFUL * 8; + r3 = 0x3FFFFFFUL * 8; + r4 = 0x3FFFFFFUL * 8; + r5 = 0x3FFFFFFUL * 8; + r6 = 0x3FFFFFFUL * 8; + r7 = 0x3FFFFFFUL * 8; + r8 = 0x3FFFFFFUL * 8; + r9 = 0x03FFFFFUL * 8; + + r0 += a0 & M26; + r1 += (a0 >> 26 | a1 << 4) & M26; + r2 += (a1 >> 22 | a2 << 8) & M26; + r3 += (a2 >> 18 | a3 << 12) & M26; + r4 += (a3 >> 14 | a4 << 16) & M26; + r5 += (a4 >> 10 | a5 << 20) & M26; + r6 += (a5 >> 6 | a6 << 24) & M26; + r7 += (a6 >> 2 ) & M26; + r8 += (a6 >> 28 | a7 << 2) & M26; + r9 += (a7 >> 24 | a8 << 6); r->n[0] = r0; r->n[1] = r1; @@ -1409,15 +1215,15 @@ static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int32_t *t) { r->n[9] = r9; #ifdef VERIFY - r->magnitude = 2; + r->magnitude = 7; r->normalized = 0; secp256k1_fe_verify(r); #endif } -static void secp256k1_fe_encode_31(int32_t *r, const secp256k1_fe *a) { +static void secp256k1_fe_encode_30(int32_t *r, const secp256k1_fe *a) { - const uint32_t M31 = UINT32_MAX >> 1; + const uint32_t M30 = UINT32_MAX >> 2; const uint32_t *n = &a->n[0]; uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4], a5 = n[5], a6 = n[6], a7 = n[7], a8 = n[8], a9 = n[9]; @@ -1426,25 +1232,25 @@ static void secp256k1_fe_encode_31(int32_t *r, const secp256k1_fe *a) { VERIFY_CHECK(a->normalized); #endif - r[0] = (a0 | a1 << 26) & M31; - r[1] = (a1 >> 5 | a2 << 21) & M31; - r[2] = (a2 >> 10 | a3 << 16) & M31; - r[3] = (a3 >> 15 | a4 << 11) & M31; - r[4] = (a4 >> 20 | a5 << 6) & M31; - r[5] = (a5 >> 25 | a6 << 1 - | a7 << 27) & M31; - r[6] = (a7 >> 4 | a8 << 22) & M31; - r[7] = (a8 >> 9 | a9 << 17) & M31; - r[8] = a9 >> 14; + r[0] = (a0 | a1 << 26) & M30; + r[1] = (a1 >> 4 | a2 << 22) & M30; + r[2] = (a2 >> 8 | a3 << 18) & M30; + r[3] = (a3 >> 12 | a4 << 14) & M30; + r[4] = (a4 >> 16 | a5 << 10) & M30; + r[5] = (a5 >> 20 | a6 << 6) & M30; + r[6] = (a6 >> 24 | a7 << 2 + | a8 << 28) & M30; + r[7] = (a8 >> 2 | a9 << 24) & M30; + r[8] = a9 >> 6; } -static uint32_t secp256k1_fe_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t c1, c2, f = f0, g = g0, x, y, z; int i; - for (i = 0; i < 31; ++i) { + for (i = 0; i < 30; ++i) { VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == -f << i); @@ -1478,11 +1284,11 @@ static uint32_t secp256k1_fe_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, return eta; } -static uint32_t secp256k1_fe_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; - int i = 31, limit, zeros; + int i = 30, limit, zeros; for (;;) { @@ -1501,8 +1307,8 @@ static uint32_t secp256k1_fe_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); + VERIFY_CHECK((u * f0 + v * g0) == -f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (30 - i)); if ((int32_t)eta < 0) { eta = -eta; @@ -1530,9 +1336,74 @@ static uint32_t secp256k1_fe_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t return eta; } -static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { +static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, 
int32_t *t) { + + /* I30 == -P^-1 mod 2^30 */ + const int32_t I30 = 0x12253531L; + const int32_t C30 = 0x3D1L; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + int64_t cd = 0, ce = 0; + int i; + + di = d[0]; + ei = e[0]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + /* Calculate the multiples of P to add, to zero the 30 bottom bits. */ + md = ((int64_t)I30 * (int32_t)cd) & M30; + me = ((int64_t)I30 * (int32_t)ce) & M30; + + /* P == 2^256 - 2^32 - C30; subtract products of C30 here. */ + cd -= (int64_t)C30 * md; + ce -= (int64_t)C30 * me; + + VERIFY_CHECK(((int32_t)cd & M30) == 0); + VERIFY_CHECK(((int32_t)ce & M30) == 0); + + cd >>= 30; + ce >>= 30; + + /* Subtract products of 2^32. */ + cd -= (int64_t)md << 2; + ce -= (int64_t)me << 2; + + for (i = 1; i < 8; ++i) { + + di = d[i]; + ei = e[i]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + d[i - 1] = (int32_t)cd & M30; cd >>= 30; + e[i - 1] = (int32_t)ce & M30; ce >>= 30; + } + + /* Add products of 2^256. */ + cd += (int64_t)md << 16; + ce += (int64_t)me << 16; + + { + di = d[8]; + ei = e[8]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + d[7] = (int32_t)cd & M30; cd >>= 30; + e[7] = (int32_t)ce & M30; ce >>= 30; + } + + d[8] = (int32_t)cd; + e[8] = (int32_t)ce; +} + +static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { - const int32_t M31 = (int32_t)(UINT32_MAX >> 1); + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; int64_t cf = 0, cg = 0; int i; @@ -1543,11 +1414,11 @@ static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - VERIFY_CHECK(((int32_t)cf & M31) == 0); - VERIFY_CHECK(((int32_t)cg & M31) == 0); + VERIFY_CHECK(((int32_t)cf & M30) == 0); + VERIFY_CHECK(((int32_t)cg & M30) == 0); - cf >>= 31; - cg >>= 31; + cf >>= 30; + cg >>= 30; for (i = 1; i < 9; ++i) { @@ -1557,8 +1428,8 @@ static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - f[i - 1] = (int32_t)cf & M31; cf >>= 31; - g[i - 1] = (int32_t)cg & M31; cg >>= 31; + f[i - 1] = (int32_t)cf & M30; cf >>= 30; + g[i - 1] = (int32_t)cg & M30; cg >>= 30; } f[8] = (int32_t)cf; @@ -1568,26 +1439,24 @@ static void secp256k1_fe_update_fg(int32_t *f, int32_t *g, int32_t *t) { static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int32_t t[24 * 4]; - int32_t f[9] = { 0x7FFFFC2FL, 0x7FFFFFFDL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, - 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t t[4]; + int32_t d[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t e[9] = { 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t f[9] = { 0x3FFFFC2F, 0x3FFFFFFB, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, + 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; - secp256k1_fe b0, d0, a1, b1, c1, d1; + secp256k1_fe b0, b1; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in; #endif - /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input - * by 2^768, and then the output by 2^24. 
*/ - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + b0 = *a; secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_31(g, &b0); + secp256k1_fe_encode_30(g, &b0); #ifdef VERIFY zero_in = secp256k1_fe_is_zero(&b0); @@ -1596,66 +1465,23 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -(uint32_t)1; - for (i = 0; i < 24; ++i) { - eta = secp256k1_fe_divsteps_31(eta, f[0], g[0], &t[i * 4]); - secp256k1_fe_update_fg(f, g, &t[i * 4]); + for (i = 0; i < 25; ++i) { + eta = secp256k1_fe_divsteps_30(eta, f[0], g[0], t); + secp256k1_fe_update_de_30(d, e, t); + secp256k1_fe_update_fg_30(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_31 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_31 introduce an extra factor of 2^31 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK(g[0] == 0); + VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0); sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 32; - secp256k1_fe_combine_1s(&t[tOff + 0]); - secp256k1_fe_combine_1s(&t[tOff + 8]); - secp256k1_fe_combine_1s(&t[tOff + 16]); - secp256k1_fe_combine_1s(&t[tOff + 24]); - secp256k1_fe_combine_2s(&t[tOff + 0]); - secp256k1_fe_combine_2s(&t[tOff + 16]); - secp256k1_fe_combine_4s(&t[tOff + 0]); - } - - /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ - secp256k1_fe_decode_matrix(&b0, &t[8]); - /* secp256k1_fe_decode_matrix(&c0, &t[16]); */ - secp256k1_fe_decode_matrix(&d0, &t[24]); - - secp256k1_fe_decode_matrix(&a1, &t[32]); - secp256k1_fe_decode_matrix(&b1, &t[40]); - secp256k1_fe_decode_matrix(&c1, &t[48]); - secp256k1_fe_decode_matrix(&d1, &t[56]); - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - secp256k1_fe_mul(&c1, &c1, &b0); - secp256k1_fe_mul(&d1, &d1, &d0); - - b0 = a1; secp256k1_fe_add(&b0, &b1); - d0 = c1; secp256k1_fe_add(&d0, &d1); - - secp256k1_fe_decode_matrix(&a1, &t[64]); - secp256k1_fe_decode_matrix(&b1, &t[72]); - /* secp256k1_fe_decode_matrix(&c1, &t[80]); */ - /* secp256k1_fe_decode_matrix(&d1, &t[88]); */ + secp256k1_fe_decode_30(&b0, d); - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - /* secp256k1_fe_mul(&c1, &c1, &b0); */ - /* secp256k1_fe_mul(&d1, &d1, &d0); */ - - b0 = a1; secp256k1_fe_add(&b0, &b1); - /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ - - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 7); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -1669,26 +1495,24 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
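A note on the loop bounds here and in the constant-time version above (the 741 figure is the paper's bound for 256-bit inputs): Bernstein and Yang show that at most 741 divsteps are needed, so any fixed schedule covering at least that many is sufficient: 24 x 31 = 744 in the earlier 31-bit-limb revisions, 12 x 62 = 744 in the 64-bit code, and 25 x 30 = 750 after the switch to 30-bit limbs here. The variable-time path shares the same worst case but normally leaves the loop much earlier via the g == 0 check.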
*/ - int32_t t[24 * 4]; - int32_t f[9] = { 0x7FFFFC2FL, 0x7FFFFFFDL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, - 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t t[4]; + int32_t d[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t e[9] = { 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t f[9] = { 0x3FFFFC2F, 0x3FFFFFFB, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, + 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; - secp256k1_fe b0, d0, a1, b1, c1, d1; + secp256k1_fe b0, b1; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in; #endif - /* TODO 2^256 (mod p) is small, so it could be faster to multiply the input - * by 2^768, and then the output by 2^24. */ - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744); + b0 = *a; secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_31(g, &b0); + secp256k1_fe_encode_30(g, &b0); #ifdef VERIFY zero_in = secp256k1_fe_is_zero(&b0); @@ -1697,66 +1521,28 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -(uint32_t)1; - for (i = 0; i < 24; ++i) { - eta = secp256k1_fe_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); - secp256k1_fe_update_fg(f, g, &t[i * 4]); - } + for (i = 0; i < 25; ++i) { + eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); + secp256k1_fe_update_de_30(d, e, t); + secp256k1_fe_update_fg_30(f, g, t); - /* At this point sufficient iterations have been performed that g must have reached 0 - * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_31_var are combined to - * get the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_31_var introduce an extra factor of 2^31 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ - - VERIFY_CHECK(g[0] == 0); - - sign = (f[0] >> 1) & 1; - - for (i = 0; i < 3; ++i) { - int tOff = i * 32; - secp256k1_fe_combine_1s(&t[tOff + 0]); - secp256k1_fe_combine_1s(&t[tOff + 8]); - secp256k1_fe_combine_1s(&t[tOff + 16]); - secp256k1_fe_combine_1s(&t[tOff + 24]); - secp256k1_fe_combine_2s(&t[tOff + 0]); - secp256k1_fe_combine_2s(&t[tOff + 16]); - secp256k1_fe_combine_4s(&t[tOff + 0]); + if (g[0] == 0) { + if ((g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0) { + break; + } + } } - /* secp256k1_fe_decode_matrix(&a0, &t[0]); */ - secp256k1_fe_decode_matrix(&b0, &t[8]); - /* secp256k1_fe_decode_matrix(&c0, &t[16]); */ - secp256k1_fe_decode_matrix(&d0, &t[24]); - - secp256k1_fe_decode_matrix(&a1, &t[32]); - secp256k1_fe_decode_matrix(&b1, &t[40]); - secp256k1_fe_decode_matrix(&c1, &t[48]); - secp256k1_fe_decode_matrix(&d1, &t[56]); - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - secp256k1_fe_mul(&c1, &c1, &b0); - secp256k1_fe_mul(&d1, &d1, &d0); + VERIFY_CHECK(i < 25); - b0 = a1; secp256k1_fe_add(&b0, &b1); - d0 = c1; secp256k1_fe_add(&d0, &d1); + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. 
*/ - secp256k1_fe_decode_matrix(&a1, &t[64]); - secp256k1_fe_decode_matrix(&b1, &t[72]); - /* secp256k1_fe_decode_matrix(&c1, &t[80]); */ - /* secp256k1_fe_decode_matrix(&d1, &t[88]); */ - - secp256k1_fe_mul(&a1, &a1, &b0); - secp256k1_fe_mul(&b1, &b1, &d0); - /* secp256k1_fe_mul(&c1, &c1, &b0); */ - /* secp256k1_fe_mul(&d1, &d1, &d0); */ + sign = (f[0] >> 1) & 1; - b0 = a1; secp256k1_fe_add(&b0, &b1); - /* d0 = c1; secp256k1_fe_add(&d0, &d1); */ + secp256k1_fe_decode_30(&b0, d); - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 7); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index c6e6dd0b0c..105eaa9500 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -644,11 +644,11 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t return eta; } -static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { +static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { - /* I64 == -P^-1 mod 2^64 */ - const int64_t I64 = 0xD838091DD2253531LL; - const int64_t C64 = 0x1000003D1LL; + /* I62 == -P^-1 mod 2^62 */ + const int64_t I62 = 0x1838091DD2253531LL; + const int64_t C62 = 0x1000003D1LL; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int128_t cd = 0, ce = 0; @@ -661,12 +661,12 @@ static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { ce -= (int128_t)q * di + (int128_t)r * ei; /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ - md = ((int128_t)I64 * (int64_t)cd) & M62; - me = ((int128_t)I64 * (int64_t)ce) & M62; + md = ((int128_t)I62 * (int64_t)cd) & M62; + me = ((int128_t)I62 * (int64_t)ce) & M62; - /* P == 2^256 - C64; subtract C64 products here. */ - cd -= (int128_t)C64 * md; - ce -= (int128_t)C64 * me; + /* P == 2^256 - C62; subtract products of C62 here. */ + cd -= (int128_t)C62 * md; + ce -= (int128_t)C62 * me; VERIFY_CHECK(((int64_t)cd & M62) == 0); VERIFY_CHECK(((int64_t)ce & M62) == 0); @@ -686,6 +686,10 @@ static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { e[i - 1] = (int64_t)ce & M62; ce >>= 62; } + /* Add products of 2^256. */ + cd += (int128_t)md << 8; + ce += (int128_t)me << 8; + { di = d[4]; ei = e[4]; @@ -693,10 +697,6 @@ static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { cd -= (int128_t)u * di + (int128_t)v * ei; ce -= (int128_t)q * di + (int128_t)r * ei; - /* In the final iteration, add the 2^256 products. */ - cd += (int128_t)md << 8; - ce += (int128_t)me << 8; - d[3] = (int64_t)cd & M62; cd >>= 62; e[3] = (int64_t)ce & M62; ce >>= 62; } @@ -705,7 +705,7 @@ static void secp256k1_fe_update_de(int64_t *d, int64_t *e, int64_t *t) { e[4] = (int64_t)ce; } -static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; @@ -743,8 +743,7 @@ static void secp256k1_fe_update_fg(int64_t *f, int64_t *g, int64_t *t) { static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ int64_t t[4]; int64_t d[5] = { 0, 0, 0, 0, 0 }; @@ -772,14 +771,13 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], t); - secp256k1_fe_update_de(d, e, t); - secp256k1_fe_update_fg(f, g, t); + secp256k1_fe_update_de_62(d, e, t); + secp256k1_fe_update_fg_62(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1, and d now contains +/- the modular inverse. - */ + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); @@ -801,8 +799,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ int64_t t[4]; int64_t d[5] = { 0, 0, 0, 0, 0 }; @@ -830,8 +827,8 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { for (i = 0; i < 12; ++i) { eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); - secp256k1_fe_update_de(d, e, t); - secp256k1_fe_update_fg(f, g, t); + secp256k1_fe_update_de_62(d, e, t); + secp256k1_fe_update_fg_62(f, g, t); if (g[0] == 0) { if ((g[1] | g[2] | g[3] | g[4]) == 0) { @@ -843,8 +840,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { VERIFY_CHECK(i < 12); /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of - * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. - */ + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ sign = (f[0] >> 1) & 1; From e5f2d29cbb7a98fe295f957240138378558dae32 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 8 Aug 2020 22:08:18 +0700 Subject: [PATCH 13/34] =?UTF-8?q?scalar=5F4x64:=20update=20B=C3=A9zout=20c?= =?UTF-8?q?oefficients=20on-the-fly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/field_10x26_impl.h | 4 +- src/field_5x52_impl.h | 4 +- src/scalar_4x64_impl.h | 300 ++++++++++++++--------------------------- src/scalar_8x32_impl.h | 4 - 4 files changed, 107 insertions(+), 205 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 30c5653c78..2a1eaed394 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1353,8 +1353,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { ce -= (int64_t)q * di + (int64_t)r * ei; /* Calculate the multiples of P to add, to zero the 30 bottom bits. */ - md = ((int64_t)I30 * (int32_t)cd) & M30; - me = ((int64_t)I30 * (int32_t)ce) & M30; + md = (I30 * (int32_t)cd) & M30; + me = (I30 * (int32_t)ce) & M30; /* P == 2^256 - 2^32 - C30; subtract products of C30 here. */ cd -= (int64_t)C30 * md; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 105eaa9500..44188699b2 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -661,8 +661,8 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { ce -= (int128_t)q * di + (int128_t)r * ei; /* Calculate the multiples of P to add, to zero the 62 bottom bits. 
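Why adding these multiples clears the low bits (a self-contained toy check follows; the modulus, variable names and main() are illustrative only, not library code): with I62 == -P^-1 mod 2^62 and md == (I62 * cd) mod 2^62, we have cd + md*P == cd - cd*P^-1*P == 0 (mod 2^62), so the subsequent shift by 62 is exact; the field code realises md*P as the two terms -C62*md and +md*2^256 because P == 2^256 - C62.

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint64_t M62 = UINT64_MAX >> 2;
        uint64_t p = 0x3FD25E8CD0364141ULL;   /* any odd modulus works for the identity */
        uint64_t cd = 0x0123456789ABCDEFULL;  /* arbitrary value to be made divisible by 2^62 */
        uint64_t inv = 1, i62, md;
        int i;
        /* Hensel/Newton lifting: after 6 doublings of precision, inv == p^-1 mod 2^64. */
        for (i = 0; i < 6; ++i) inv *= 2 - p * inv;
        i62 = (0 - inv) & M62;                /* -p^-1 mod 2^62 */
        md = (i62 * cd) & M62;
        printf("%d\n", (int)(((cd + md * p) & M62) == 0)); /* prints 1 */
        return 0;
    }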
*/ - md = ((int128_t)I62 * (int64_t)cd) & M62; - me = ((int128_t)I62 * (int64_t)ce) & M62; + md = (I62 * (int64_t)cd) & M62; + me = (I62 * (int64_t)ce) & M62; /* P == 2^256 - C62; subtract products of C62 here. */ cd -= (int128_t)C62 * md; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8ce50a9ad3..7b526cb90f 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -962,104 +962,40 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL ); -static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CONST( - 0x4E165355UL, 0x5D800C18UL, 0xEF116DB1UL, 0xB31347F1UL, - 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL -); - -static void secp256k1_scalar_mul_add_2(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) { - - /* Each [a0,a1], etc. pair is a 126-bit signed value e.g. a0 + a1 * 2^64. - * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and - * writes the 252-bit signed result to [t[0],t[1],t[2],t[3]]. - */ - - int64_t z0, z1, z2, z3; - int128_t tt; - - tt = (int128_t)a0 * b0 - + (int128_t)c0 * d0; - z0 = (int64_t)tt; tt -= z0; tt >>= 64; - - tt += (int128_t)a0 * b1 - + (int128_t)a1 * b0 - + (int128_t)c0 * d1 - + (int128_t)c1 * d0; - z1 = (int64_t)tt; tt -= z1; tt >>= 64; - - tt += (int128_t)a1 * b1 - + (int128_t)c1 * d1; - z2 = (int64_t)tt; tt -= z2; tt >>= 64; - - z3 = (int64_t)tt; - - t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; -} - -static void secp256k1_scalar_combine_1s(int64_t *t) { - - int64_t a = t[0], b = t[1], c = t[2], d = t[3], - e = t[4], f = t[5], g = t[6], h = t[7]; - int128_t I, J, K, L; - - I = (int128_t)e * a + (int128_t)f * c; - J = (int128_t)e * b + (int128_t)f * d; - K = (int128_t)g * a + (int128_t)h * c; - L = (int128_t)g * b + (int128_t)h * d; - - a = (int64_t)I; I -= a; I >>= 64; b = (int64_t)I; - c = (int64_t)J; J -= c; J >>= 64; d = (int64_t)J; - e = (int64_t)K; K -= e; K >>= 64; f = (int64_t)K; - g = (int64_t)L; L -= g; L >>= 64; h = (int64_t)L; - - t[0] = a; t[1] = b; t[2] = c; t[3] = d; - t[4] = e; t[5] = f; t[6] = g; t[7] = h; -} - -static void secp256k1_scalar_combine_2s(int64_t *t) { - - int64_t a0 = t[ 0], a1 = t[ 1]; - int64_t b0 = t[ 2], b1 = t[ 3]; - int64_t c0 = t[ 4], c1 = t[ 5]; - int64_t d0 = t[ 6], d1 = t[ 7]; - int64_t e0 = t[ 8], e1 = t[ 9]; - int64_t f0 = t[10], f1 = t[11]; - int64_t g0 = t[12], g1 = t[13]; - int64_t h0 = t[14], h1 = t[15]; - - secp256k1_scalar_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_scalar_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_scalar_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_scalar_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); -} - -static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int64_t *t) { +static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { + uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3; - int flag; + int64_t t; secp256k1_scalar u; - int128_t cc; - - cc = t[0]; - r0 = (uint64_t)cc; cc >>= 64; - cc += t[1]; - r1 = (uint64_t)cc; cc >>= 64; - cc += t[2]; - r2 = (uint64_t)cc; cc >>= 64; - cc += t[3]; - r3 = (uint64_t)cc; cc >>= 64; - VERIFY_CHECK(cc == 0 || cc == -1); + VERIFY_CHECK(a0 >> 62 == 0); + VERIFY_CHECK(a1 >> 62 == 0); + VERIFY_CHECK(a2 >> 62 == 0); + VERIFY_CHECK(a3 >> 62 == 0); - flag = (int)cc & 1; + r0 = a0 | a1 << 62; + r1 = a1 >> 2 | a2 << 60; + 
r2 = a2 >> 4 | a3 << 58; + r3 = a3 >> 6 | a4 << 56; r->d[0] = r0; r->d[1] = r1; r->d[2] = r2; r->d[3] = r3; + secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); + + t = (int64_t)a4 >> 8; + + VERIFY_CHECK(t == 1 || t == 0 || t == -1); + secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); - secp256k1_scalar_cmov(r, &u, flag); + secp256k1_scalar_cmov(r, &u, a4 >> 63); + + t += a4 >> 63; + + secp256k1_scalar_reduce(r, t); } static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { @@ -1077,10 +1013,6 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { r[2] = (a1 >> 60 | a2 << 4) & M62; r[3] = (a2 >> 58 | a3 << 6) & M62; r[4] = a3 >> 56; - -#ifdef VERIFY - VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); -#endif } static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { @@ -1175,7 +1107,56 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint return eta; } -static void secp256k1_scalar_update_fg(int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, int64_t *t) { + + /* I62 == -P^-1 mod 2^62 */ + const int64_t I62 = 0x0B0DFF665588B13FLL; + const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, + 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + int128_t cd = 0, ce = 0; + int i; + + di = d[0]; + ei = e[0]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ + md = (I62 * (int64_t)cd) & M62; + me = (I62 * (int64_t)ce) & M62; + + cd += (int128_t)P[0] * md; + ce += (int128_t)P[0] * me; + + VERIFY_CHECK(((int64_t)cd & M62) == 0); + VERIFY_CHECK(((int64_t)ce & M62) == 0); + + cd >>= 62; + ce >>= 62; + + for (i = 1; i < 5; ++i) { + + di = d[i]; + ei = e[i]; + + cd -= (int128_t)u * di + (int128_t)v * ei; + ce -= (int128_t)q * di + (int128_t)r * ei; + + cd += (int128_t)P[i] * md; + ce += (int128_t)P[i] * me; + + d[i - 1] = (int64_t)cd & M62; cd >>= 62; + e[i - 1] = (int64_t)ce & M62; ce >>= 62; + } + + d[4] = (int64_t)cd; + e[4] = (int64_t)ce; +} + +static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; @@ -1224,82 +1205,42 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar #else /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int64_t t[12 * 4]; + int64_t t[4]; + int64_t d[5] = { 0, 0, 0, 0, 0 }; + int64_t e[5] = { 1, 0, 0, 0, 0 }; int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b0, d0, a1, b1, c1, d1; + secp256k1_scalar b0; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + b0 = *x; secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
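 *
 * For reference, the paper's divstep is
 *   divstep(delta, f, g) = (1 - delta, g, (g - f) / 2)            if delta > 0 and g odd,
 *                          (1 + delta, f, (g + (g & 1) * f) / 2)  otherwise,
 * and 741 divsteps suffice for 256-bit inputs, so the 12 * 62 = 744 performed below
 * are enough.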
*/ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { - eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], &t[i * 4]); - secp256k1_scalar_update_fg(f, g, &t[i * 4]); + eta = secp256k1_scalar_divsteps_62(eta, f[0], g[0], t); + secp256k1_scalar_update_de_62(d, e, t); + secp256k1_scalar_update_fg_62(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK(g[0] == 0); + VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 16; - secp256k1_scalar_combine_1s(&t[tOff + 0]); - secp256k1_scalar_combine_1s(&t[tOff + 8]); - secp256k1_scalar_combine_2s(&t[tOff + 0]); - } - - /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ - secp256k1_scalar_decode_matrix(&b0, &t[4]); - /* secp256k1_scalar_decode_matrix(&c0, &t[8]); */ - secp256k1_scalar_decode_matrix(&d0, &t[12]); - - secp256k1_scalar_decode_matrix(&a1, &t[16]); - secp256k1_scalar_decode_matrix(&b1, &t[20]); - secp256k1_scalar_decode_matrix(&c1, &t[24]); - secp256k1_scalar_decode_matrix(&d1, &t[28]); - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - secp256k1_scalar_mul(&c1, &c1, &b0); - secp256k1_scalar_mul(&d1, &d1, &d0); - - secp256k1_scalar_add(&b0, &a1, &b1); - secp256k1_scalar_add(&d0, &c1, &d1); - - secp256k1_scalar_decode_matrix(&a1, &t[32]); - secp256k1_scalar_decode_matrix(&b1, &t[36]); - /* secp256k1_scalar_decode_matrix(&c1, &t[40]); */ - /* secp256k1_scalar_decode_matrix(&d1, &t[44]); */ - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - /* secp256k1_scalar_mul(&c1, &c1, &b0); */ - /* secp256k1_scalar_mul(&d1, &d1, &d0); */ - - secp256k1_scalar_add(&b0, &a1, &b1); - /* secp256k1_scalar_add(&d0, &c1, &d1); */ - + secp256k1_scalar_decode_62(&b0, d); secp256k1_scalar_cond_negate(&b0, sign); #ifdef VERIFY @@ -1317,82 +1258,47 @@ SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int64_t t[12 * 4]; + int64_t t[4]; + int64_t d[5] = { 0, 0, 0, 0, 0 }; + int64_t e[5] = { 1, 0, 0, 0, 0 }; int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b0, d0, a1, b1, c1, d1; + secp256k1_scalar b0; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); + b0 = *x; secp256k1_scalar_encode_62(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
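 *
 * Unlike the constant-time routine above, this variant checks g and leaves the loop
 * as soon as it reaches zero; variable timing is acceptable here.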
*/ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { - eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], &t[i * 4]); - secp256k1_scalar_update_fg(f, g, &t[i * 4]); + eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], t); + secp256k1_scalar_update_de_62(d, e, t); + secp256k1_scalar_update_fg_62(f, g, t); + + if (g[0] == 0) { + if ((g[1] | g[2] | g[3] | g[4]) == 0) { + break; + } + } } - /* At this point sufficient iterations have been performed that g must have reached 0 - * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_62 are combined to get - * the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_62 introduce an extra factor of 2^62 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + VERIFY_CHECK(i < 12); - VERIFY_CHECK(g[0] == 0); + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 16; - secp256k1_scalar_combine_1s(&t[tOff + 0]); - secp256k1_scalar_combine_1s(&t[tOff + 8]); - secp256k1_scalar_combine_2s(&t[tOff + 0]); - } - - /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ - secp256k1_scalar_decode_matrix(&b0, &t[4]); - /* secp256k1_scalar_decode_matrix(&c0, &t[8]); */ - secp256k1_scalar_decode_matrix(&d0, &t[12]); - - secp256k1_scalar_decode_matrix(&a1, &t[16]); - secp256k1_scalar_decode_matrix(&b1, &t[20]); - secp256k1_scalar_decode_matrix(&c1, &t[24]); - secp256k1_scalar_decode_matrix(&d1, &t[28]); - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - secp256k1_scalar_mul(&c1, &c1, &b0); - secp256k1_scalar_mul(&d1, &d1, &d0); - - secp256k1_scalar_add(&b0, &a1, &b1); - secp256k1_scalar_add(&d0, &c1, &d1); - - secp256k1_scalar_decode_matrix(&a1, &t[32]); - secp256k1_scalar_decode_matrix(&b1, &t[36]); - /* secp256k1_scalar_decode_matrix(&c1, &t[40]); */ - /* secp256k1_scalar_decode_matrix(&d1, &t[44]); */ - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - /* secp256k1_scalar_mul(&c1, &c1, &b0); */ - /* secp256k1_scalar_mul(&d1, &d1, &d0); */ - - secp256k1_scalar_add(&b0, &a1, &b1); - /* secp256k1_scalar_add(&d0, &c1, &d1); */ - + secp256k1_scalar_decode_62(&b0, d); secp256k1_scalar_cond_negate(&b0, sign); #ifdef VERIFY diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 54fd10f385..f47b19b287 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -983,10 +983,6 @@ static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { r[6] = (a5 >> 26 | a6 << 6) & M31; r[7] = (a6 >> 25 | a7 << 7) & M31; r[8] = a7 >> 24; - -#ifdef VERIFY - VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); -#endif } static uint32_t secp256k1_scalar_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { From 34bec400bffc1c12a8cd8cc9163bc3b1a9947f64 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 8 Aug 2020 23:40:40 +0700 Subject: [PATCH 14/34] =?UTF-8?q?scalar=5F8x32:=20update=20B=C3=A9zout=20c?= =?UTF-8?q?oefficients=20on-the-fly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/scalar_8x32_impl.h | 502 ++++++++++++----------------------------- 1 file changed, 146 insertions(+), 356 deletions(-) diff --git a/src/scalar_8x32_impl.h 
b/src/scalar_8x32_impl.h index f47b19b287..6cc23f90aa 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -738,217 +738,31 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL ); -static const secp256k1_scalar SECP256K1_SCALAR_TWO_POW_744 = SECP256K1_SCALAR_CONST( - 0x4E165355UL, 0x5D800C18UL, 0xEF116DB1UL, 0xB31347F1UL, - 0x6D77C2DCUL, 0x0E3E8029UL, 0x59BA208FUL, 0xFD01F4F7UL -); - -static void secp256k1_scalar_mul_add_2(int32_t a0, int32_t a1, int32_t b0, int32_t b1, int32_t c0, int32_t c1, int32_t d0, int32_t d1, int32_t *t) { - - /* Each [a0,a1], etc. pair is a ??-bit signed value e.g. a0 + a1 * 2^32. - * This method calculates ([a0,a1] * [c0,c1]) + ([b0,b1] * [d0,d1]), and - * writes the ???-bit signed result to [t[0],t[1],t[2],t[3]]. - */ - - int32_t z0, z1, z2, z3; - int64_t tt; - - tt = (int64_t)a0 * b0 - + (int64_t)c0 * d0; - z0 = (int32_t)tt; tt -= z0; tt >>= 32; - - tt += (int64_t)a0 * b1 - + (int64_t)a1 * b0 - + (int64_t)c0 * d1 - + (int64_t)c1 * d0; - z1 = (int32_t)tt; tt -= z1; tt >>= 32; - - tt += (int64_t)a1 * b1 - + (int64_t)c1 * d1; - z2 = (int32_t)tt; tt -= z2; tt >>= 32; - - z3 = (int32_t)tt; - - t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3; -} - -static void secp256k1_scalar_mul_add_4(int32_t* tIn, int xPos, int yPos, int uPos, int vPos, int32_t *tOut, int zzPos) { - int32_t y0 = tIn[yPos + 0]; - int32_t y1 = tIn[yPos + 1]; - int32_t y2 = tIn[yPos + 2]; - int32_t y3 = tIn[yPos + 3]; - int32_t v0 = tIn[vPos + 0]; - int32_t v1 = tIn[vPos + 1]; - int32_t v2 = tIn[vPos + 2]; - int32_t v3 = tIn[vPos + 3]; - int32_t xVal, uVal; - int32_t z0, z1, z2, z3, z4, z5, z6, z7; - int64_t c; - - xVal = tIn[xPos]; - uVal = tIn[uPos]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0; - z0 = (int32_t)c; c -= z0; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1; - z1 = (int32_t)c; c -= z1; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3; - z3 = (int32_t)c; c -= z3; c >>= 32; - z4 = (int32_t)c; - - xVal = tIn[xPos + 1]; - uVal = tIn[uPos + 1]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z1; - z1 = (int32_t)c; c -= z1; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - z5 = (int32_t)c; - - xVal = tIn[xPos + 2]; - uVal = tIn[uPos + 2]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z2; - z2 = (int32_t)c; c -= z2; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z5; - z5 = (int32_t)c; c -= z5; c >>= 32; - z6 = (int32_t)c; - - xVal = tIn[xPos + 3]; - uVal = tIn[uPos + 3]; - - c = (int64_t)xVal * y0 + (int64_t)uVal * v0 + z3; - z3 = (int32_t)c; c -= z3; c >>= 32; - - c += (int64_t)xVal * y1 + (int64_t)uVal * v1 + z4; - z4 = (int32_t)c; c -= z4; c >>= 32; - - c += (int64_t)xVal * y2 + (int64_t)uVal * v2 + z5; - z5 = (int32_t)c; c -= z5; c >>= 32; - - c += (int64_t)xVal * y3 + (int64_t)uVal * v3 + z6; - z6 = (int32_t)c; c -= z6; c >>= 32; - z7 = (int32_t)c; - - tOut[zzPos + 0] = z0; - tOut[zzPos + 1] = z1; - tOut[zzPos + 2] = z2; - tOut[zzPos + 3] = z3; 
- tOut[zzPos + 4] = z4; - tOut[zzPos + 5] = z5; - tOut[zzPos + 6] = z6; - tOut[zzPos + 7] = z7; -} - -static void secp256k1_scalar_combine_1s(int32_t *t) { - - int32_t a = t[0], b = t[1], c = t[2], d = t[3], - e = t[4], f = t[5], g = t[6], h = t[7]; - int64_t I, J, K, L; - - I = (int64_t)e * a + (int64_t)f * c; - J = (int64_t)e * b + (int64_t)f * d; - K = (int64_t)g * a + (int64_t)h * c; - L = (int64_t)g * b + (int64_t)h * d; - - a = (int32_t)I; I -= a; I >>= 32; b = (int32_t)I; - c = (int32_t)J; J -= c; J >>= 32; d = (int32_t)J; - e = (int32_t)K; K -= e; K >>= 32; f = (int32_t)K; - g = (int32_t)L; L -= g; L >>= 32; h = (int32_t)L; - - t[0] = a; t[1] = b; t[2] = c; t[3] = d; - t[4] = e; t[5] = f; t[6] = g; t[7] = h; -} - -static void secp256k1_scalar_combine_2s(int32_t *t) { - - int32_t a0 = t[ 0], a1 = t[ 1]; - int32_t b0 = t[ 2], b1 = t[ 3]; - int32_t c0 = t[ 4], c1 = t[ 5]; - int32_t d0 = t[ 6], d1 = t[ 7]; - int32_t e0 = t[ 8], e1 = t[ 9]; - int32_t f0 = t[10], f1 = t[11]; - int32_t g0 = t[12], g1 = t[13]; - int32_t h0 = t[14], h1 = t[15]; - - secp256k1_scalar_mul_add_2(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]); - secp256k1_scalar_mul_add_2(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]); - secp256k1_scalar_mul_add_2(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]); - secp256k1_scalar_mul_add_2(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]); -} - -static void secp256k1_scalar_combine_4s(int32_t *t) -{ - int32_t tmp[32]; - - int aPos = 0; - int bPos = 4; - int cPos = 8; - int dPos = 12; - int ePos = 16; - int fPos = 20; - int gPos = 24; - int hPos = 28; - - int IPos = 0; - int JPos = 8; - int KPos = 16; - int LPos = 24; - - secp256k1_scalar_mul_add_4(t, ePos, aPos, fPos, cPos, tmp, IPos); - secp256k1_scalar_mul_add_4(t, ePos, bPos, fPos, dPos, tmp, JPos); - secp256k1_scalar_mul_add_4(t, gPos, aPos, hPos, cPos, tmp, KPos); - secp256k1_scalar_mul_add_4(t, gPos, bPos, hPos, dPos, tmp, LPos); - - memcpy(t, tmp, 32 * sizeof(int32_t)); -} - -static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int32_t *t) { +static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { + uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], + a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7; - int flag; + int32_t t; secp256k1_scalar u; - int64_t cc; - - cc = t[0]; - r0 = (uint32_t)cc; cc >>= 32; - cc += t[1]; - r1 = (uint32_t)cc; cc >>= 32; - cc += t[2]; - r2 = (uint32_t)cc; cc >>= 32; - cc += t[3]; - r3 = (uint32_t)cc; cc >>= 32; - cc += t[4]; - r4 = (uint32_t)cc; cc >>= 32; - cc += t[5]; - r5 = (uint32_t)cc; cc >>= 32; - cc += t[6]; - r6 = (uint32_t)cc; cc >>= 32; - cc += t[7]; - r7 = (uint32_t)cc; cc >>= 32; - - VERIFY_CHECK(cc == 0 || cc == -1); - - flag = (int)cc & 1; + + VERIFY_CHECK(a0 >> 30 == 0); + VERIFY_CHECK(a1 >> 30 == 0); + VERIFY_CHECK(a2 >> 30 == 0); + VERIFY_CHECK(a3 >> 30 == 0); + VERIFY_CHECK(a4 >> 30 == 0); + VERIFY_CHECK(a5 >> 30 == 0); + VERIFY_CHECK(a6 >> 30 == 0); + VERIFY_CHECK(a7 >> 30 == 0); + + r0 = a0 | a1 << 30; + r1 = a1 >> 2 | a2 << 28; + r2 = a2 >> 4 | a3 << 26; + r3 = a3 >> 6 | a4 << 24; + r4 = a4 >> 8 | a5 << 22; + r5 = a5 >> 10 | a6 << 20; + r6 = a6 >> 12 | a7 << 18; + r7 = a7 >> 14 | a8 << 16; r->d[0] = r0; r->d[1] = r1; @@ -959,13 +773,23 @@ static void secp256k1_scalar_decode_matrix(secp256k1_scalar *r, int32_t *t) { r->d[6] = r6; r->d[7] = r7; + secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); + + t = (int32_t)a8 >> 16; + + VERIFY_CHECK(t == 1 || t == 0 || t == -1); + secp256k1_scalar_add(&u, r, 
&SECP256K1_SCALAR_NEG_TWO_POW_256); - secp256k1_scalar_cmov(r, &u, flag); + secp256k1_scalar_cmov(r, &u, a8 >> 31); + + t += a8 >> 31; + + secp256k1_scalar_reduce(r, t); } -static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { +static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { - const uint32_t M31 = UINT32_MAX >> 1; + const uint32_t M30 = UINT32_MAX >> 2; const uint32_t *d = &a->d[0]; uint32_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3], a4 = d[4], a5 = d[5], a6 = d[6], a7 = d[7]; @@ -974,24 +798,24 @@ static void secp256k1_scalar_encode_31(int32_t *r, const secp256k1_scalar *a) { VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); #endif - r[0] = a0 & M31; - r[1] = (a0 >> 31 | a1 << 1) & M31; - r[2] = (a1 >> 30 | a2 << 2) & M31; - r[3] = (a2 >> 29 | a3 << 3) & M31; - r[4] = (a3 >> 28 | a4 << 4) & M31; - r[5] = (a4 >> 27 | a5 << 5) & M31; - r[6] = (a5 >> 26 | a6 << 6) & M31; - r[7] = (a6 >> 25 | a7 << 7) & M31; - r[8] = a7 >> 24; + r[0] = a0 & M30; + r[1] = (a0 >> 30 | a1 << 2) & M30; + r[2] = (a1 >> 28 | a2 << 4) & M30; + r[3] = (a2 >> 26 | a3 << 6) & M30; + r[4] = (a3 >> 24 | a4 << 8) & M30; + r[5] = (a4 >> 22 | a5 << 10) & M30; + r[6] = (a5 >> 20 | a6 << 12) & M30; + r[7] = (a6 >> 18 | a7 << 14) & M30; + r[8] = a7 >> 16; } -static uint32_t secp256k1_scalar_divsteps_31(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t c1, c2, f = f0, g = g0, x, y, z; int i; - for (i = 0; i < 31; ++i) { + for (i = 0; i < 30; ++i) { VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == -f << i); @@ -1025,11 +849,11 @@ static uint32_t secp256k1_scalar_divsteps_31(uint32_t eta, uint32_t f0, uint32_t return eta; } -static uint32_t secp256k1_scalar_divsteps_31_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; - int i = 31, limit, zeros; + int i = 30, limit, zeros; for (;;) { @@ -1048,8 +872,8 @@ static uint32_t secp256k1_scalar_divsteps_31_var(uint32_t eta, uint32_t f0, uint VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (31 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (31 - i)); + VERIFY_CHECK((u * f0 + v * g0) == -f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == -g << (30 - i)); if ((int32_t)eta < 0) { eta = -eta; @@ -1077,9 +901,58 @@ static uint32_t secp256k1_scalar_divsteps_31_var(uint32_t eta, uint32_t f0, uint return eta; } -static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, int32_t *t) { +static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { + + /* I30 == -P^-1 mod 2^30 */ + const int32_t I30 = 0x1588B13FL; + const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, + 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + int64_t cd = 0, ce = 0; + int i; + + di = d[0]; + ei = e[0]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + /* Calculate the multiples of P to add, to zero the 30 bottom bits. 
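 * As in the field code, these multiples make cd and ce divisible by 2^30; in effect
 * each call divides d and e by 2^30 modulo the group order, cancelling the 2^30
 * factor that the matching _divsteps_30 call introduces, which is why the input no
 * longer needs to be pre-scaled.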
*/ + md = (I30 * (int32_t)cd) & M30; + me = (I30 * (int32_t)ce) & M30; + + cd += (int64_t)P[0] * md; + ce += (int64_t)P[0] * me; + + VERIFY_CHECK(((int32_t)cd & M30) == 0); + VERIFY_CHECK(((int32_t)ce & M30) == 0); + + cd >>= 30; + ce >>= 30; + + for (i = 1; i < 9; ++i) { + + di = d[i]; + ei = e[i]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + cd += (int64_t)P[i] * md; + ce += (int64_t)P[i] * me; + + d[i - 1] = (int32_t)cd & M30; cd >>= 30; + e[i - 1] = (int32_t)ce & M30; ce >>= 30; + } + + d[8] = (int32_t)cd; + e[8] = (int32_t)ce; +} + +static void secp256k1_scalar_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { - const int32_t M31 = (int32_t)(UINT32_MAX >> 1); + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; int64_t cf = 0, cg = 0; int i; @@ -1090,11 +963,11 @@ static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - VERIFY_CHECK(((int32_t)cf & M31) == 0); - VERIFY_CHECK(((int32_t)cg & M31) == 0); + VERIFY_CHECK(((int32_t)cf & M30) == 0); + VERIFY_CHECK(((int32_t)cg & M30) == 0); - cf >>= 31; - cg >>= 31; + cf >>= 30; + cg >>= 30; for (i = 1; i < 9; ++i) { @@ -1104,8 +977,8 @@ static void secp256k1_scalar_update_fg(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - f[i - 1] = (int32_t)cf & M31; cf >>= 31; - g[i - 1] = (int32_t)cg & M31; cg >>= 31; + f[i - 1] = (int32_t)cf & M30; cf >>= 30; + g[i - 1] = (int32_t)cg & M30; cg >>= 30; } f[8] = (int32_t)cf; @@ -1126,86 +999,42 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar #else /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int32_t t[24 * 4]; - int32_t f[9] = { 0x50364141L, 0x7FA4BD19L, 0x3D2280EEL, 0x5576E735L, 0x7FFFFFEBL, - 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t t[4]; + int32_t d[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t e[9] = { 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, + 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b0, d0, a1, b1, c1, d1; + secp256k1_scalar b0; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); - secp256k1_scalar_encode_31(g, &b0); + b0 = *x; + secp256k1_scalar_encode_30(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). */ eta = -(uint32_t)1; - for (i = 0; i < 24; ++i) { - eta = secp256k1_scalar_divsteps_31(eta, f[0], g[0], &t[i * 4]); - secp256k1_scalar_update_fg(f, g, &t[i * 4]); + for (i = 0; i < 25; ++i) { + eta = secp256k1_scalar_divsteps_30(eta, f[0], g[0], t); + secp256k1_scalar_update_de_30(d, e, t); + secp256k1_scalar_update_fg_30(f, g, t); } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_31 are combined to get - * the Bézout coefficients, and thus the modular inverse. 
The matrix outputs of - * _divsteps_31 introduce an extra factor of 2^31 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK(g[0] == 0); + VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0); sign = (f[0] >> 1) & 1; - for (i = 0; i < 3; ++i) { - int tOff = i * 32; - secp256k1_scalar_combine_1s(&t[tOff + 0]); - secp256k1_scalar_combine_1s(&t[tOff + 8]); - secp256k1_scalar_combine_1s(&t[tOff + 16]); - secp256k1_scalar_combine_1s(&t[tOff + 24]); - secp256k1_scalar_combine_2s(&t[tOff + 0]); - secp256k1_scalar_combine_2s(&t[tOff + 16]); - secp256k1_scalar_combine_4s(&t[tOff + 0]); - } - - /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ - secp256k1_scalar_decode_matrix(&b0, &t[8]); - /* secp256k1_scalar_decode_matrix(&c0, &t[16]); */ - secp256k1_scalar_decode_matrix(&d0, &t[24]); - - secp256k1_scalar_decode_matrix(&a1, &t[32]); - secp256k1_scalar_decode_matrix(&b1, &t[40]); - secp256k1_scalar_decode_matrix(&c1, &t[48]); - secp256k1_scalar_decode_matrix(&d1, &t[56]); - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - secp256k1_scalar_mul(&c1, &c1, &b0); - secp256k1_scalar_mul(&d1, &d1, &d0); - - secp256k1_scalar_add(&b0, &a1, &b1); - secp256k1_scalar_add(&d0, &c1, &d1); - - secp256k1_scalar_decode_matrix(&a1, &t[64]); - secp256k1_scalar_decode_matrix(&b1, &t[72]); - /* secp256k1_scalar_decode_matrix(&c1, &t[80]); */ - /* secp256k1_scalar_decode_matrix(&d1, &t[88]); */ - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - /* secp256k1_scalar_mul(&c1, &c1, &b0); */ - /* secp256k1_scalar_mul(&d1, &d1, &d0); */ - - secp256k1_scalar_add(&b0, &a1, &b1); - /* secp256k1_scalar_add(&d0, &c1, &d1); */ - + secp256k1_scalar_decode_30(&b0, d); secp256k1_scalar_cond_negate(&b0, sign); #ifdef VERIFY @@ -1223,86 +1052,47 @@ SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. - */ + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - int32_t t[24 * 4]; - int32_t f[9] = { 0x50364141L, 0x7FA4BD19L, 0x3D2280EEL, 0x5576E735L, 0x7FFFFFEBL, - 0x7FFFFFFFL, 0x7FFFFFFFL, 0x7FFFFFFFL, 0xFFL }; + int32_t t[4]; + int32_t d[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t e[9] = { 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, + 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b0, d0, a1, b1, c1, d1; + secp256k1_scalar b0; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - /* Instead of dividing the output by 2^744, scale the input. */ - secp256k1_scalar_mul(&b0, x, &SECP256K1_SCALAR_TWO_POW_744); - secp256k1_scalar_encode_31(g, &b0); + b0 = *x; + secp256k1_scalar_encode_30(g, &b0); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
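 *
 * With 30 divsteps per call, at most 25 calls (750 divsteps) are needed to cover the
 * 741-divstep bound for 256-bit inputs.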
*/ eta = -(uint32_t)1; - for (i = 0; i < 24; ++i) { - eta = secp256k1_scalar_divsteps_31_var(eta, f[0], g[0], &t[i * 4]); - secp256k1_scalar_update_fg(f, g, &t[i * 4]); - } - - /* At this point sufficient iterations have been performed that g must have reached 0 - * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g - * values i.e. +/- 1. The matrix outputs from each _divsteps_31_var are combined to - * get the Bézout coefficients, and thus the modular inverse. The matrix outputs of - * _divsteps_31_var introduce an extra factor of 2^31 each, so there is a total extra - * factor of 2^744 to account for (by scaling the input and/or output accordingly). - */ - - VERIFY_CHECK(g[0] == 0); + for (i = 0; i < 25; ++i) { + eta = secp256k1_scalar_divsteps_30_var(eta, f[0], g[0], t); + secp256k1_scalar_update_de_30(d, e, t); + secp256k1_scalar_update_fg_30(f, g, t); - sign = (f[0] >> 1) & 1; - - for (i = 0; i < 3; ++i) { - int tOff = i * 32; - secp256k1_scalar_combine_1s(&t[tOff + 0]); - secp256k1_scalar_combine_1s(&t[tOff + 8]); - secp256k1_scalar_combine_1s(&t[tOff + 16]); - secp256k1_scalar_combine_1s(&t[tOff + 24]); - secp256k1_scalar_combine_2s(&t[tOff + 0]); - secp256k1_scalar_combine_2s(&t[tOff + 16]); - secp256k1_scalar_combine_4s(&t[tOff + 0]); + if (g[0] == 0) { + if ((g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0) { + break; + } + } } - /* secp256k1_scalar_decode_matrix(&a0, &t[0]); */ - secp256k1_scalar_decode_matrix(&b0, &t[8]); - /* secp256k1_scalar_decode_matrix(&c0, &t[16]); */ - secp256k1_scalar_decode_matrix(&d0, &t[24]); + VERIFY_CHECK(i < 25); - secp256k1_scalar_decode_matrix(&a1, &t[32]); - secp256k1_scalar_decode_matrix(&b1, &t[40]); - secp256k1_scalar_decode_matrix(&c1, &t[48]); - secp256k1_scalar_decode_matrix(&d1, &t[56]); + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - secp256k1_scalar_mul(&c1, &c1, &b0); - secp256k1_scalar_mul(&d1, &d1, &d0); - - secp256k1_scalar_add(&b0, &a1, &b1); - secp256k1_scalar_add(&d0, &c1, &d1); - - secp256k1_scalar_decode_matrix(&a1, &t[64]); - secp256k1_scalar_decode_matrix(&b1, &t[72]); - /* secp256k1_scalar_decode_matrix(&c1, &t[80]); */ - /* secp256k1_scalar_decode_matrix(&d1, &t[88]); */ - - secp256k1_scalar_mul(&a1, &a1, &b0); - secp256k1_scalar_mul(&b1, &b1, &d0); - /* secp256k1_scalar_mul(&c1, &c1, &b0); */ - /* secp256k1_scalar_mul(&d1, &d1, &d0); */ - - secp256k1_scalar_add(&b0, &a1, &b1); - /* secp256k1_scalar_add(&d0, &c1, &d1); */ + sign = (f[0] >> 1) & 1; + secp256k1_scalar_decode_30(&b0, d); secp256k1_scalar_cond_negate(&b0, sign); #ifdef VERIFY From bfd7a0fbd6ef2d60d12ce63e2ce2921aa8424c1b Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 16:44:33 +0700 Subject: [PATCH 15/34] Alternate var-time divsteps code --- src/field_10x26_impl.h | 6 ++++++ src/field_5x52_impl.h | 6 ++++++ src/scalar_4x64_impl.h | 6 ++++++ src/scalar_8x32_impl.h | 6 ++++++ 4 files changed, 24 insertions(+) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 2a1eaed394..d174bd4cd4 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1317,6 +1317,7 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t z = v; v = r; r = -z; } +#if 1 /* Handle up to 3 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? 
i : ((int)eta + 1); m = (UINT32_MAX >> (32 - limit)) & 7U; @@ -1326,6 +1327,11 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g += f * w; q += u * w; r += v * w; +#else + g += f; + q += u; + r += v; +#endif } t[0] = (int32_t)u; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 44188699b2..cc1b95a208 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -625,6 +625,7 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t z = v; v = r; r = -z; } +#if 1 /* Handle up to 3 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT64_MAX >> (64 - limit)) & 7U; @@ -634,6 +635,11 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g += f * w; q += u * w; r += v * w; +#else + g += f; + q += u; + r += v; +#endif } t[0] = (int64_t)u; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 7b526cb90f..15f4460dfc 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1088,6 +1088,7 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint z = v; v = r; r = -z; } +#if 1 /* Handle up to 3 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT64_MAX >> (64 - limit)) & 7U; @@ -1097,6 +1098,11 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint g += f * w; q += u * w; r += v * w; +#else + g += f; + q += u; + r += v; +#endif } t[0] = (int64_t)u; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 6cc23f90aa..5d21cb1233 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -882,6 +882,7 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint z = v; v = r; r = -z; } +#if 1 /* Handle up to 3 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); m = (UINT32_MAX >> (32 - limit)) & 7U; @@ -891,6 +892,11 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint g += f * w; q += u * w; r += v * w; +#else + g += f; + q += u; + r += v; +#endif } t[0] = (int32_t)u; From f873c3b503bf8e7c360bf8b440488e07aeb3ec2a Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 17:40:44 +0700 Subject: [PATCH 16/34] Add comments regarding small inputs --- src/field_10x26_impl.h | 11 +++++++++-- src/field_5x52_impl.h | 11 +++++++++-- src/scalar_4x64_impl.h | 11 +++++++++-- src/scalar_8x32_impl.h | 11 +++++++++-- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index d174bd4cd4..fadc3dda75 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1468,7 +1468,11 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { zero_in = secp256k1_fe_is_zero(&b0); #endif - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { @@ -1524,7 +1528,10 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { zero_in = secp256k1_fe_is_zero(&b0); #endif - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). 
+ * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index cc1b95a208..c7a6af7c86 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -772,7 +772,11 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { zero_in = secp256k1_fe_is_zero(&b0); #endif - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { @@ -828,7 +832,10 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { zero_in = secp256k1_fe_is_zero(&b0); #endif - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 15f4460dfc..8272670426 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1229,7 +1229,11 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar b0 = *x; secp256k1_scalar_encode_62(g, &b0); - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { @@ -1282,7 +1286,10 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc b0 = *x; secp256k1_scalar_encode_62(g, &b0); - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 5d21cb1233..1db6d52f15 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -1023,7 +1023,11 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar b0 = *x; secp256k1_scalar_encode_30(g, &b0); - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { @@ -1076,7 +1080,10 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc b0 = *x; secp256k1_scalar_encode_30(g, &b0); - /* The paper uses 'delta'; eta == -delta (a performance tweak). */ + /* The paper uses 'delta'; eta == -delta (a performance tweak). 
+ * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { From 17982d820ee780c5ca488f62661b83376dd5afb7 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 18:37:17 +0700 Subject: [PATCH 17/34] Avoid left shift of signed values --- src/field_10x26_impl.h | 8 ++++---- src/field_5x52_impl.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index fadc3dda75..6f7e9fa1a9 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1373,8 +1373,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { ce >>= 30; /* Subtract products of 2^32. */ - cd -= (int64_t)md << 2; - ce -= (int64_t)me << 2; + cd -= (int64_t)4 * md; + ce -= (int64_t)4 * me; for (i = 1; i < 8; ++i) { @@ -1389,8 +1389,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { } /* Add products of 2^256. */ - cd += (int64_t)md << 16; - ce += (int64_t)me << 16; + cd += (int64_t)65536 * md; + ce += (int64_t)65536 * me; { di = d[8]; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index c7a6af7c86..03e1f855dd 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -693,8 +693,8 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { } /* Add products of 2^256. */ - cd += (int128_t)md << 8; - ce += (int128_t)me << 8; + cd += (int128_t)256 * md; + ce += (int128_t)256 * me; { di = d[4]; From 06d568a7e6596aae9a6837bb2eee3108347cbc3a Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 20:40:46 +0700 Subject: [PATCH 18/34] Add alternative to __builtin_ctz intrinsics - lookup tables based on de Bruijn sequences --- src/field_10x26_impl.h | 14 +++++++++++++- src/field_5x52_impl.h | 17 ++++++++++++++++- src/scalar_4x64_impl.h | 17 ++++++++++++++++- src/scalar_8x32_impl.h | 14 +++++++++++++- 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 6f7e9fa1a9..6ffcfc8c7a 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1286,6 +1286,12 @@ static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +#if 1 + static const uint8_t debruijn[32] = { + 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26, + 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; +#endif + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -1293,7 +1299,13 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t for (;;) { /* Use a sentinel bit to count zeros only up to i. 
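 * ORing in UINT32_MAX << i forces a set bit at position i, so the count of trailing
 * zeros can never exceed i. In the table-based branch below, x & -x isolates the
 * lowest set bit; multiplying by the de Bruijn constant and keeping the top 5 bits
 * yields a distinct index for each bit position, e.g. x & -x == 8 gives
 * (8 * 0x04D7651F) >> 27 == 4 and debruijn[4] == 3.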
*/ - zeros = __builtin_ctzl(g | (UINT32_MAX << i)); + x = g | (UINT32_MAX << i); + +#if 0 + zeros = __builtin_ctzl(x); +#else + zeros = debruijn[((x & -x) * 0x04D7651F) >> 27]; +#endif g >>= zeros; u <<= zeros; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 03e1f855dd..f8914d9564 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -594,14 +594,29 @@ static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +#if 1 + static const uint8_t debruijn[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 + }; +#endif + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t f = f0, g = g0, m, w, x, y, z; int i = 62, limit, zeros; for (;;) { + x = g | (UINT64_MAX << i); + /* Use a sentinel bit to count zeros only up to i. */ - zeros = __builtin_ctzll(g | (UINT64_MAX << i)); +#if 0 + zeros = __builtin_ctzll(x); +#else + zeros = debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58]; +#endif g >>= zeros; u <<= zeros; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 8272670426..b8be7ab166 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1057,14 +1057,29 @@ static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { +#if 1 + static const uint8_t debruijn[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 + }; +#endif + uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; uint64_t f = f0, g = g0, m, w, x, y, z; int i = 62, limit, zeros; for (;;) { + x = g | (UINT64_MAX << i); + /* Use a sentinel bit to count zeros only up to i. */ - zeros = __builtin_ctzll(g | (UINT64_MAX << i)); +#if 0 + zeros = __builtin_ctzll(x); +#else + zeros = debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58]; +#endif g >>= zeros; u <<= zeros; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 1db6d52f15..a9265ac4e1 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -851,6 +851,12 @@ static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { +#if 1 + static const uint8_t debruijn[32] = { + 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26, + 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; +#endif + uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -858,7 +864,13 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint for (;;) { /* Use a sentinel bit to count zeros only up to i. 
*/ - zeros = __builtin_ctzl(g | (UINT32_MAX << i)); + x = g | (UINT32_MAX << i); + +#if 0 + zeros = __builtin_ctzl(x); +#else + zeros = debruijn[((x & -x) * 0x04D7651F) >> 27]; +#endif g >>= zeros; u <<= zeros; From 16509ca068410cb9c3bd45e7a17b5de996baea1a Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 21:54:40 +0700 Subject: [PATCH 19/34] Write primes in signed-digit form --- src/scalar_4x64_impl.h | 3 +-- src/scalar_8x32_impl.h | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index b8be7ab166..a0e42d41e1 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1132,8 +1132,7 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, int64_t *t) { /* I62 == -P^-1 mod 2^62 */ const int64_t I62 = 0x0B0DFF665588B13FLL; - const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, - 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; + const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int128_t cd = 0, ce = 0; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index a9265ac4e1..4b42294799 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -923,8 +923,9 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { /* I30 == -P^-1 mod 2^30 */ const int32_t I30 = 0x1588B13FL; - const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, - 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; + const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, + 0, 0, 0, 65536 }; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int64_t cd = 0, ce = 0; From 40c815ebe16cdd02d0a0f79122a94ab10969702c Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 9 Aug 2020 22:28:17 +0700 Subject: [PATCH 20/34] Unify _update_de_ methods --- src/field_10x26_impl.h | 31 +++++++------------------------ src/field_5x52_impl.h | 26 +++++++------------------- src/scalar_8x32_impl.h | 1 - 3 files changed, 14 insertions(+), 44 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 6ffcfc8c7a..48551a89f1 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1358,7 +1358,7 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { /* I30 == -P^-1 mod 2^30 */ const int32_t I30 = 0x12253531L; - const int32_t C30 = 0x3D1L; + const int32_t P[9] = { -0x3D1L, -4L, 0, 0, 0, 0, 0, 0, 65536 }; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int64_t cd = 0, ce = 0; @@ -1374,9 +1374,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { md = (I30 * (int32_t)cd) & M30; me = (I30 * (int32_t)ce) & M30; - /* P == 2^256 - 2^32 - C30; subtract products of C30 here. */ - cd -= (int64_t)C30 * md; - ce -= (int64_t)C30 * me; + cd += (int64_t)P[0] * md; + ce += (int64_t)P[0] * me; VERIFY_CHECK(((int32_t)cd & M30) == 0); VERIFY_CHECK(((int32_t)ce & M30) == 0); @@ -1384,11 +1383,7 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { cd >>= 30; ce >>= 30; - /* Subtract products of 2^32. 
*/ - cd -= (int64_t)4 * md; - ce -= (int64_t)4 * me; - - for (i = 1; i < 8; ++i) { + for (i = 1; i < 9; ++i) { di = d[i]; ei = e[i]; @@ -1396,25 +1391,13 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { cd -= (int64_t)u * di + (int64_t)v * ei; ce -= (int64_t)q * di + (int64_t)r * ei; + cd += (int64_t)P[i] * md; + ce += (int64_t)P[i] * me; + d[i - 1] = (int32_t)cd & M30; cd >>= 30; e[i - 1] = (int32_t)ce & M30; ce >>= 30; } - /* Add products of 2^256. */ - cd += (int64_t)65536 * md; - ce += (int64_t)65536 * me; - - { - di = d[8]; - ei = e[8]; - - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; - - d[7] = (int32_t)cd & M30; cd >>= 30; - e[7] = (int32_t)ce & M30; ce >>= 30; - } - d[8] = (int32_t)cd; e[8] = (int32_t)ce; } diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index f8914d9564..4289208a75 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -669,7 +669,7 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { /* I62 == -P^-1 mod 2^62 */ const int64_t I62 = 0x1838091DD2253531LL; - const int64_t C62 = 0x1000003D1LL; + int64_t P[5] = { -0x1000003D1LL, 0, 0, 0, 256 }; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int128_t cd = 0, ce = 0; @@ -686,8 +686,8 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { me = (I62 * (int64_t)ce) & M62; /* P == 2^256 - C62; subtract products of C62 here. */ - cd -= (int128_t)C62 * md; - ce -= (int128_t)C62 * me; + cd += (int128_t)P[0] * md; + ce += (int128_t)P[0] * me; VERIFY_CHECK(((int64_t)cd & M62) == 0); VERIFY_CHECK(((int64_t)ce & M62) == 0); @@ -695,7 +695,7 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { cd >>= 62; ce >>= 62; - for (i = 1; i < 4; ++i) { + for (i = 1; i < 5; ++i) { di = d[i]; ei = e[i]; @@ -703,25 +703,13 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { cd -= (int128_t)u * di + (int128_t)v * ei; ce -= (int128_t)q * di + (int128_t)r * ei; + cd += (int128_t)P[i] * md; + ce += (int128_t)P[i] * me; + d[i - 1] = (int64_t)cd & M62; cd >>= 62; e[i - 1] = (int64_t)ce & M62; ce >>= 62; } - /* Add products of 2^256. 
*/ - cd += (int128_t)256 * md; - ce += (int128_t)256 * me; - - { - di = d[4]; - ei = e[4]; - - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; - - d[3] = (int64_t)cd & M62; cd >>= 62; - e[3] = (int64_t)ce & M62; ce >>= 62; - } - d[4] = (int64_t)cd; e[4] = (int64_t)ce; } diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 4b42294799..9b0cfcea4f 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -925,7 +925,6 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { const int32_t I30 = 0x1588B13FL; const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, 0, 0, 0, 65536 }; - const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; int64_t cd = 0, ce = 0; From dc58f4f094120aa044453f4457aa0439e6b5717c Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 11 Aug 2020 02:31:00 +0700 Subject: [PATCH 21/34] Redo update_de methods --- src/field_10x26_impl.h | 59 +++++++++++++--------- src/field_5x52_impl.h | 109 +++++++++++++++++++++++------------------ src/scalar_4x64_impl.h | 109 ++++++++++++++++++++++++----------------- src/scalar_8x32_impl.h | 17 +++---- 4 files changed, 169 insertions(+), 125 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 48551a89f1..b47f79dc94 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1167,8 +1167,8 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { const uint32_t M26 = UINT32_MAX >> 6; - uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], - a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; + const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], + a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; VERIFY_CHECK(a0 >> 30 == 0); @@ -1225,8 +1225,8 @@ static void secp256k1_fe_encode_30(int32_t *r, const secp256k1_fe *a) { const uint32_t M30 = UINT32_MAX >> 2; const uint32_t *n = &a->n[0]; - uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4], - a5 = n[5], a6 = n[6], a7 = n[7], a8 = n[8], a9 = n[9]; + const uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4], + a5 = n[5], a6 = n[6], a7 = n[7], a8 = n[8], a9 = n[9]; #ifdef VERIFY VERIFY_CHECK(a->normalized); @@ -1354,13 +1354,15 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t return eta; } -static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { +static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) { + /* P == 2^256 - 2^32 - C30 */ + const int64_t C30 = 0x3D1L; /* I30 == -P^-1 mod 2^30 */ const int32_t I30 = 0x12253531L; - const int32_t P[9] = { -0x3D1L, -4L, 0, 0, 0, 0, 0, 0, 65536 }; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); - int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t di, ei, md, me; int64_t cd = 0, ce = 0; int i; @@ -1374,16 +1376,16 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { md = (I30 * (int32_t)cd) & M30; me = (I30 * (int32_t)ce) & M30; - cd += (int64_t)P[0] * md; - ce += (int64_t)P[0] * me; + cd -= (int64_t)C30 * md; + ce -= (int64_t)C30 * me; - VERIFY_CHECK(((int32_t)cd & M30) == 0); - VERIFY_CHECK(((int32_t)ce & M30) == 0); + VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30; + 
VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30; - cd >>= 30; - ce >>= 30; + cd -= (int64_t)4 * md; + ce -= (int64_t)4 * me; - for (i = 1; i < 9; ++i) { + for (i = 1; i < 8; ++i) { di = d[i]; ei = e[i]; @@ -1391,21 +1393,33 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, int32_t *t) { cd -= (int64_t)u * di + (int64_t)v * ei; ce -= (int64_t)q * di + (int64_t)r * ei; - cd += (int64_t)P[i] * md; - ce += (int64_t)P[i] * me; - d[i - 1] = (int32_t)cd & M30; cd >>= 30; e[i - 1] = (int32_t)ce & M30; ce >>= 30; } + { + di = d[8]; + ei = e[8]; + + cd -= (int64_t)u * di + (int64_t)v * ei; + ce -= (int64_t)q * di + (int64_t)r * ei; + + cd += (int64_t)65536 * md; + ce += (int64_t)65536 * me; + + d[7] = (int32_t)cd & M30; cd >>= 30; + e[7] = (int32_t)ce & M30; ce >>= 30; + } + d[8] = (int32_t)cd; e[8] = (int32_t)ce; } -static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { +static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, const int32_t *t) { const int32_t M30 = (int32_t)(UINT32_MAX >> 2); - int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t fi, gi; int64_t cf = 0, cg = 0; int i; @@ -1415,11 +1429,8 @@ static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { cf -= (int64_t)u * fi + (int64_t)v * gi; cg -= (int64_t)q * fi + (int64_t)r * gi; - VERIFY_CHECK(((int32_t)cf & M30) == 0); - VERIFY_CHECK(((int32_t)cg & M30) == 0); - - cf >>= 30; - cg >>= 30; + VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; + VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; for (i = 1; i < 9; ++i) { diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 4289208a75..b15fcdcfd5 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -501,7 +501,7 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { const uint64_t M52 = UINT64_MAX >> 12; - uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3, r4; VERIFY_CHECK(a0 >> 62 == 0); @@ -539,7 +539,7 @@ static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { const uint64_t M62 = UINT64_MAX >> 2; const uint64_t *n = &a->n[0]; - uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4]; + const uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4]; #ifdef VERIFY VERIFY_CHECK(a->normalized); @@ -665,85 +665,100 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t return eta; } -static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, int64_t *t) { +static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) { + /* P == 2^256 - C62 */ + const int64_t C62 = 0x1000003D1LL; /* I62 == -P^-1 mod 2^62 */ const int64_t I62 = 0x1838091DD2253531LL; - int64_t P[5] = { -0x1000003D1LL, 0, 0, 0, 256 }; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); - int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4]; + const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; + int64_t md, me; int128_t cd = 0, ce = 0; - int i; - - di = d[0]; - ei = e[0]; - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; + cd -= (int128_t)u * d0 + (int128_t)v * e0; + ce -= (int128_t)q * d0 + 
(int128_t)r * e0; /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ md = (I62 * (int64_t)cd) & M62; me = (I62 * (int64_t)ce) & M62; - /* P == 2^256 - C62; subtract products of C62 here. */ - cd += (int128_t)P[0] * md; - ce += (int128_t)P[0] * me; + cd -= (int128_t)C62 * md; + ce -= (int128_t)C62 * me; - VERIFY_CHECK(((int64_t)cd & M62) == 0); - VERIFY_CHECK(((int64_t)ce & M62) == 0); + VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; + VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; - cd >>= 62; - ce >>= 62; + cd -= (int128_t)u * d1 + (int128_t)v * e1; + ce -= (int128_t)q * d1 + (int128_t)r * e1; - for (i = 1; i < 5; ++i) { + d[0] = (int64_t)cd & M62; cd >>= 62; + e[0] = (int64_t)ce & M62; ce >>= 62; - di = d[i]; - ei = e[i]; + cd -= (int128_t)u * d2 + (int128_t)v * e2; + ce -= (int128_t)q * d2 + (int128_t)r * e2; - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; + d[1] = (int64_t)cd & M62; cd >>= 62; + e[1] = (int64_t)ce & M62; ce >>= 62; - cd += (int128_t)P[i] * md; - ce += (int128_t)P[i] * me; + cd -= (int128_t)u * d3 + (int128_t)v * e3; + ce -= (int128_t)q * d3 + (int128_t)r * e3; - d[i - 1] = (int64_t)cd & M62; cd >>= 62; - e[i - 1] = (int64_t)ce & M62; ce >>= 62; - } + d[2] = (int64_t)cd & M62; cd >>= 62; + e[2] = (int64_t)ce & M62; ce >>= 62; + + cd -= (int128_t)u * d4 + (int128_t)v * e4; + ce -= (int128_t)q * d4 + (int128_t)r * e4; + + cd += (int128_t)256 * md; + ce += (int128_t)256 * me; + + d[3] = (int64_t)cd & M62; cd >>= 62; + e[3] = (int64_t)ce & M62; ce >>= 62; d[4] = (int64_t)cd; e[4] = (int64_t)ce; } -static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, const int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); - int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + const int64_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4]; + const int64_t g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4]; + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int128_t cf = 0, cg = 0; - int i; - fi = f[0]; - gi = g[0]; + cf -= (int128_t)u * f0 + (int128_t)v * g0; + cg -= (int128_t)q * f0 + (int128_t)r * g0; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; - VERIFY_CHECK(((int64_t)cf & M62) == 0); - VERIFY_CHECK(((int64_t)cg & M62) == 0); + cf -= (int128_t)u * f1 + (int128_t)v * g1; + cg -= (int128_t)q * f1 + (int128_t)r * g1; - cf >>= 62; - cg >>= 62; + f[0] = (int64_t)cf & M62; cf >>= 62; + g[0] = (int64_t)cg & M62; cg >>= 62; - for (i = 1; i < 5; ++i) { + cf -= (int128_t)u * f2 + (int128_t)v * g2; + cg -= (int128_t)q * f2 + (int128_t)r * g2; - fi = f[i]; - gi = g[i]; + f[1] = (int64_t)cf & M62; cf >>= 62; + g[1] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf -= (int128_t)u * f3 + (int128_t)v * g3; + cg -= (int128_t)q * f3 + (int128_t)r * g3; - f[i - 1] = (int64_t)cf & M62; cf >>= 62; - g[i - 1] = (int64_t)cg & M62; cg >>= 62; - } + f[2] = (int64_t)cf & M62; cf >>= 62; + g[2] = (int64_t)cg & M62; cg >>= 62; + + cf -= (int128_t)u * f4 + (int128_t)v * g4; + cg -= (int128_t)q * f4 + (int128_t)r * g4; + + f[3] = (int64_t)cf & M62; cf >>= 62; + g[3] = (int64_t)cg & M62; cg >>= 62; f[4] = (int64_t)cf; g[4] = (int64_t)cg; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 
a0e42d41e1..bbde844eae 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -964,7 +964,7 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { - uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3; int64_t t; secp256k1_scalar u; @@ -1002,7 +1002,7 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { const uint64_t M62 = UINT64_MAX >> 2; const uint64_t *d = &a->d[0]; - uint64_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3]; + const uint64_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3]; #ifdef VERIFY VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); @@ -1128,21 +1128,20 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint return eta; } -static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, int64_t *t) { +static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t *t) { /* I62 == -P^-1 mod 2^62 */ const int64_t I62 = 0x0B0DFF665588B13FLL; - const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); - int64_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; + const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4]; + const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; + int64_t md, me; int128_t cd = 0, ce = 0; - int i; - - di = d[0]; - ei = e[0]; - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; + cd -= (int128_t)u * d0 + (int128_t)v * e0; + ce -= (int128_t)q * d0 + (int128_t)r * e0; /* Calculate the multiples of P to add, to zero the 62 bottom bits. 
*/ md = (I62 * (int64_t)cd) & M62; @@ -1151,61 +1150,83 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, int64_t *t) { cd += (int128_t)P[0] * md; ce += (int128_t)P[0] * me; - VERIFY_CHECK(((int64_t)cd & M62) == 0); - VERIFY_CHECK(((int64_t)ce & M62) == 0); + VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; + VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; - cd >>= 62; - ce >>= 62; + cd -= (int128_t)u * d1 + (int128_t)v * e1; + ce -= (int128_t)q * d1 + (int128_t)r * e1; - for (i = 1; i < 5; ++i) { + cd += (int128_t)P[1] * md; + ce += (int128_t)P[1] * me; - di = d[i]; - ei = e[i]; + d[0] = (int64_t)cd & M62; cd >>= 62; + e[0] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * di + (int128_t)v * ei; - ce -= (int128_t)q * di + (int128_t)r * ei; + cd -= (int128_t)u * d2 + (int128_t)v * e2; + ce -= (int128_t)q * d2 + (int128_t)r * e2; - cd += (int128_t)P[i] * md; - ce += (int128_t)P[i] * me; + cd += (int128_t)P[2] * md; + ce += (int128_t)P[2] * me; - d[i - 1] = (int64_t)cd & M62; cd >>= 62; - e[i - 1] = (int64_t)ce & M62; ce >>= 62; - } + d[1] = (int64_t)cd & M62; cd >>= 62; + e[1] = (int64_t)ce & M62; ce >>= 62; + + cd -= (int128_t)u * d3 + (int128_t)v * e3; + ce -= (int128_t)q * d3 + (int128_t)r * e3; + + d[2] = (int64_t)cd & M62; cd >>= 62; + e[2] = (int64_t)ce & M62; ce >>= 62; + + cd -= (int128_t)u * d4 + (int128_t)v * e4; + ce -= (int128_t)q * d4 + (int128_t)r * e4; + + cd += (int128_t)P[4] * md; + ce += (int128_t)P[4] * me; + + d[3] = (int64_t)cd & M62; cd >>= 62; + e[3] = (int64_t)ce & M62; ce >>= 62; d[4] = (int64_t)cd; e[4] = (int64_t)ce; } -static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, int64_t *t) { +static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, const int64_t *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); - int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; + const int64_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4]; + const int64_t g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4]; + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int128_t cf = 0, cg = 0; - int i; - fi = f[0]; - gi = g[0]; + cf -= (int128_t)u * f0 + (int128_t)v * g0; + cg -= (int128_t)q * f0 + (int128_t)r * g0; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; - VERIFY_CHECK(((int64_t)cf & M62) == 0); - VERIFY_CHECK(((int64_t)cg & M62) == 0); + cf -= (int128_t)u * f1 + (int128_t)v * g1; + cg -= (int128_t)q * f1 + (int128_t)r * g1; - cf >>= 62; - cg >>= 62; + f[0] = (int64_t)cf & M62; cf >>= 62; + g[0] = (int64_t)cg & M62; cg >>= 62; - for (i = 1; i < 5; ++i) { + cf -= (int128_t)u * f2 + (int128_t)v * g2; + cg -= (int128_t)q * f2 + (int128_t)r * g2; - fi = f[i]; - gi = g[i]; + f[1] = (int64_t)cf & M62; cf >>= 62; + g[1] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf -= (int128_t)u * f3 + (int128_t)v * g3; + cg -= (int128_t)q * f3 + (int128_t)r * g3; - f[i - 1] = (int64_t)cf & M62; cf >>= 62; - g[i - 1] = (int64_t)cg & M62; cg >>= 62; - } + f[2] = (int64_t)cf & M62; cf >>= 62; + g[2] = (int64_t)cg & M62; cg >>= 62; + + cf -= (int128_t)u * f4 + (int128_t)v * g4; + cg -= (int128_t)q * f4 + (int128_t)r * g4; + + f[3] = (int64_t)cf & M62; cf >>= 62; + g[3] = (int64_t)cg & M62; cg >>= 62; f[4] = (int64_t)cf; g[4] = (int64_t)cg; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 
9b0cfcea4f..6b422a6334 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -740,8 +740,8 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { - uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], - a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; + const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], + a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7; int32_t t; secp256k1_scalar u; @@ -791,8 +791,8 @@ static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { const uint32_t M30 = UINT32_MAX >> 2; const uint32_t *d = &a->d[0]; - uint32_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3], - a4 = d[4], a5 = d[5], a6 = d[6], a7 = d[7]; + const uint32_t a0 = d[0], a1 = d[1], a2 = d[2], a3 = d[3], + a4 = d[4], a5 = d[5], a6 = d[6], a7 = d[7]; #ifdef VERIFY VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); @@ -919,7 +919,7 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint return eta; } -static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { +static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t *t) { /* I30 == -P^-1 mod 2^30 */ const int32_t I30 = 0x1588B13FL; @@ -943,11 +943,8 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, int32_t *t) { cd += (int64_t)P[0] * md; ce += (int64_t)P[0] * me; - VERIFY_CHECK(((int32_t)cd & M30) == 0); - VERIFY_CHECK(((int32_t)ce & M30) == 0); - - cd >>= 30; - ce >>= 30; + VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30; + VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30; for (i = 1; i < 9; ++i) { From 132c76dc3a7c2f6553c89e684977b6e706a2846a Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 12 Aug 2020 02:27:26 +0700 Subject: [PATCH 22/34] Faster 64bit _inv_var, why not? --- src/field_5x52_impl.h | 25 +++++++++++++------------ src/scalar_4x64_impl.h | 25 +++++++++++++------------ 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index b15fcdcfd5..1513768d07 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -638,23 +638,24 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t x = f; f = g; g = -x; y = u; u = q; q = -y; z = v; v = r; r = -z; - } -#if 1 - /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = ((int)eta + 1) > i ? i : ((int)eta + 1); - m = (UINT64_MAX >> (64 - limit)) & 7U; + /* Handle up to 6 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 63U; + + w = (f * g * (f * f - 2)) & m; + } else { + /* Handle up to 4 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 15U; + + w = f + (((f + 1) & 4) << 1); + w = (-w * g) & m; + } - /* Note that f * f == 1 mod 8, for any f. 
*/ - w = (-f * g) & m; g += f * w; q += u * w; r += v * w; -#else - g += f; - q += u; - r += v; -#endif } t[0] = (int64_t)u; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index bbde844eae..4679a45a88 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1101,23 +1101,24 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint x = f; f = g; g = -x; y = u; u = q; q = -y; z = v; v = r; r = -z; - } -#if 1 - /* Handle up to 3 divsteps at once, subject to eta and i. */ - limit = ((int)eta + 1) > i ? i : ((int)eta + 1); - m = (UINT64_MAX >> (64 - limit)) & 7U; + /* Handle up to 6 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 63U; + + w = (f * g * (f * f - 2)) & m; + } else { + /* Handle up to 4 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 15U; + + w = f + (((f + 1) & 4) << 1); + w = (-w * g) & m; + } - /* Note that f * f == 1 mod 8, for any f. */ - w = (-f * g) & m; g += f * w; q += u * w; r += v * w; -#else - g += f; - q += u; - r += v; -#endif } t[0] = (int64_t)u; From 2f6dfa21464b63f9097ab07199aee3dea674d214 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 12 Aug 2020 17:32:22 +0700 Subject: [PATCH 23/34] Get better control over the range of d, e --- src/field_10x26_impl.h | 39 +++++++++++++++++++++------------------ src/field_5x52_impl.h | 27 +++++++++++++++------------ src/scalar_4x64_impl.h | 18 ++++++------------ src/scalar_8x32_impl.h | 18 ++++++------------ 4 files changed, 48 insertions(+), 54 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index b47f79dc94..9bf05b85a2 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1171,6 +1171,7 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; + /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 30 == 0); VERIFY_CHECK(a1 >> 30 == 0); VERIFY_CHECK(a2 >> 30 == 0); @@ -1179,18 +1180,19 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); - - /* Add a multiple of the field prime in case u4 is "negative". */ - r0 = 0x3FFFC2FUL * 8; - r1 = 0x3FFFFBFUL * 8; - r2 = 0x3FFFFFFUL * 8; - r3 = 0x3FFFFFFUL * 8; - r4 = 0x3FFFFFFUL * 8; - r5 = 0x3FFFFFFUL * 8; - r6 = 0x3FFFFFFUL * 8; - r7 = 0x3FFFFFFUL * 8; - r8 = 0x3FFFFFFUL * 8; - r9 = 0x03FFFFFUL * 8; + VERIFY_CHECK((int32_t)a8 >> 16 == 0 || (int32_t)a8 >> 16 == -(int32_t)1); + + /* Add a multiple of the field prime in case a8 is "negative". */ + r0 = 0x3FFFC2FUL * 2; + r1 = 0x3FFFFBFUL * 2; + r2 = 0x3FFFFFFUL * 2; + r3 = 0x3FFFFFFUL * 2; + r4 = 0x3FFFFFFUL * 2; + r5 = 0x3FFFFFFUL * 2; + r6 = 0x3FFFFFFUL * 2; + r7 = 0x3FFFFFFUL * 2; + r8 = 0x3FFFFFFUL * 2; + r9 = 0x03FFFFFUL * 2; r0 += a0 & M26; r1 += (a0 >> 26 | a1 << 4) & M26; @@ -1215,7 +1217,7 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { r->n[9] = r9; #ifdef VERIFY - r->magnitude = 7; + r->magnitude = 2; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -1372,9 +1374,10 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) cd -= (int64_t)u * di + (int64_t)v * ei; ce -= (int64_t)q * di + (int64_t)r * ei; - /* Calculate the multiples of P to add, to zero the 30 bottom bits. 
*/ - md = (I30 * (int32_t)cd) & M30; - me = (I30 * (int32_t)ce) & M30; + /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me + * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */ + md = (I30 * 4 * (int32_t)cd) >> 2; + me = (I30 * 4 * (int32_t)ce) >> 2; cd -= (int64_t)C30 * md; ce -= (int64_t)C30 * me; @@ -1497,7 +1500,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_30(&b0, d); - secp256k1_fe_negate(&b1, &b0, 7); + secp256k1_fe_negate(&b1, &b0, 2); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -1561,7 +1564,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_30(&b0, d); - secp256k1_fe_negate(&b1, &b0, 7); + secp256k1_fe_negate(&b1, &b0, 2); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 1513768d07..219c281405 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -504,17 +504,19 @@ static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3, r4; + /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); + VERIFY_CHECK((int64_t)a4 >> 8 == 0 || (int64_t)a4 >> 8 == -(int64_t)1); - /* Add a multiple of the field prime in case u4 is "negative". */ - r0 = 0xFFFFEFFFFFC2FULL * 8; - r1 = 0xFFFFFFFFFFFFFULL * 8; - r2 = 0xFFFFFFFFFFFFFULL * 8; - r3 = 0xFFFFFFFFFFFFFULL * 8; - r4 = 0x0FFFFFFFFFFFFULL * 8; + /* Add a multiple of the field prime in case a4 is "negative". */ + r0 = 0xFFFFEFFFFFC2FULL * 2; + r1 = 0xFFFFFFFFFFFFFULL * 2; + r2 = 0xFFFFFFFFFFFFFULL * 2; + r3 = 0xFFFFFFFFFFFFFULL * 2; + r4 = 0x0FFFFFFFFFFFFULL * 2; r0 += a0 & M52; r1 += (a0 >> 52 | a1 << 10) & M52; @@ -529,7 +531,7 @@ static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { r->n[4] = r4; #ifdef VERIFY - r->magnitude = 7; + r->magnitude = 2; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -682,9 +684,10 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) cd -= (int128_t)u * d0 + (int128_t)v * e0; ce -= (int128_t)q * d0 + (int128_t)r * e0; - /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ - md = (I62 * (int64_t)cd) & M62; - me = (I62 * (int64_t)ce) & M62; + /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me + * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). 
*/ + md = (I62 * 4 * (int64_t)cd) >> 2; + me = (I62 * 4 * (int64_t)ce) >> 2; cd -= (int128_t)C62 * md; ce -= (int128_t)C62 * me; @@ -814,7 +817,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_negate(&b1, &b0, 7); + secp256k1_fe_negate(&b1, &b0, 2); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -878,7 +881,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_negate(&b1, &b0, 7); + secp256k1_fe_negate(&b1, &b0, 2); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 4679a45a88..ee5e128187 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -966,13 +966,14 @@ static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; uint64_t r0, r1, r2, r3; - int64_t t; secp256k1_scalar u; + /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); + VERIFY_CHECK((int64_t)a4 >> 8 == 0 || (int64_t)a4 >> 8 == -(int64_t)1); r0 = a0 | a1 << 62; r1 = a1 >> 2 | a2 << 60; @@ -986,16 +987,8 @@ static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); - t = (int64_t)a4 >> 8; - - VERIFY_CHECK(t == 1 || t == 0 || t == -1); - secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); secp256k1_scalar_cmov(r, &u, a4 >> 63); - - t += a4 >> 63; - - secp256k1_scalar_reduce(r, t); } static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { @@ -1144,9 +1137,10 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t cd -= (int128_t)u * d0 + (int128_t)v * e0; ce -= (int128_t)q * d0 + (int128_t)r * e0; - /* Calculate the multiples of P to add, to zero the 62 bottom bits. */ - md = (I62 * (int64_t)cd) & M62; - me = (I62 * (int64_t)ce) & M62; + /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me + * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ + md = (I62 * 4 * (int64_t)cd) >> 2; + me = (I62 * 4 * (int64_t)ce) >> 2; cd += (int128_t)P[0] * md; ce += (int128_t)P[0] * me; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 6b422a6334..8ba26ed95f 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -743,9 +743,9 @@ static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; uint32_t r0, r1, r2, r3, r4, r5, r6, r7; - int32_t t; secp256k1_scalar u; + /* a must be in the range [-2^256, 2^256). 
*/ VERIFY_CHECK(a0 >> 30 == 0); VERIFY_CHECK(a1 >> 30 == 0); VERIFY_CHECK(a2 >> 30 == 0); @@ -754,6 +754,7 @@ static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); + VERIFY_CHECK((int32_t)a8 >> 16 == 0 || (int32_t)a8 >> 16 == -(int32_t)1); r0 = a0 | a1 << 30; r1 = a1 >> 2 | a2 << 28; @@ -775,16 +776,8 @@ static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); - t = (int32_t)a8 >> 16; - - VERIFY_CHECK(t == 1 || t == 0 || t == -1); - secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); secp256k1_scalar_cmov(r, &u, a8 >> 31); - - t += a8 >> 31; - - secp256k1_scalar_reduce(r, t); } static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { @@ -936,9 +929,10 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t cd -= (int64_t)u * di + (int64_t)v * ei; ce -= (int64_t)q * di + (int64_t)r * ei; - /* Calculate the multiples of P to add, to zero the 30 bottom bits. */ - md = (I30 * (int32_t)cd) & M30; - me = (I30 * (int32_t)ce) & M30; + /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me + * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */ + md = (I30 * 4 * (int32_t)cd) >> 2; + me = (I30 * 4 * (int32_t)ce) >> 2; cd += (int64_t)P[0] * md; ce += (int64_t)P[0] * me; From 90743d29ab93c0d74d60f2c947b45686284da2b8 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Thu, 13 Aug 2020 17:16:23 +0700 Subject: [PATCH 24/34] Verify the expected zeros are produced --- src/field_10x26_impl.h | 4 ++++ src/field_5x52_impl.h | 2 ++ src/scalar_4x64_impl.h | 2 ++ src/scalar_8x32_impl.h | 4 ++++ 4 files changed, 12 insertions(+) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 9bf05b85a2..e1a4e43011 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1341,10 +1341,14 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t g += f * w; q += u * w; r += v * w; + + VERIFY_CHECK((g & m) == 0); #else g += f; q += u; r += v; + + VERIFY_CHECK((g & 1) == 0); #endif } diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 219c281405..082886a150 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -658,6 +658,8 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t g += f * w; q += u * w; r += v * w; + + VERIFY_CHECK((g & m) == 0); } t[0] = (int64_t)u; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index ee5e128187..a76a510d31 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1112,6 +1112,8 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint g += f * w; q += u * w; r += v * w; + + VERIFY_CHECK((g & m) == 0); } t[0] = (int64_t)u; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 8ba26ed95f..2b34d13aed 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -897,10 +897,14 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint g += f * w; q += u * w; r += v * w; + + VERIFY_CHECK((g & m) == 0); #else g += f; q += u; r += v; + + VERIFY_CHECK((g & 1) == 0); #endif } From 5de2c833907f4ddb94d9d13dc37214734eebfda4 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Thu, 13 Aug 2020 20:08:16 +0700 Subject: [PATCH 25/34] _inv_var conditional negations - tighten result magnitude for 
_fe_decode methods --- src/field_10x26_impl.h | 55 ++++++++++++++++++++++-------------------- src/field_5x52_impl.h | 45 ++++++++++++++++++---------------- src/scalar_4x64_impl.h | 17 +++++++------ src/scalar_8x32_impl.h | 17 +++++++------ 4 files changed, 73 insertions(+), 61 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index e1a4e43011..593266a967 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1169,7 +1169,9 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { const uint32_t M26 = UINT32_MAX >> 6; const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; - uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; + uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, t; + + t = (int32_t)a8 >> 16; /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 30 == 0); @@ -1180,19 +1182,19 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); - VERIFY_CHECK((int32_t)a8 >> 16 == 0 || (int32_t)a8 >> 16 == -(int32_t)1); - - /* Add a multiple of the field prime in case a8 is "negative". */ - r0 = 0x3FFFC2FUL * 2; - r1 = 0x3FFFFBFUL * 2; - r2 = 0x3FFFFFFUL * 2; - r3 = 0x3FFFFFFUL * 2; - r4 = 0x3FFFFFFUL * 2; - r5 = 0x3FFFFFFUL * 2; - r6 = 0x3FFFFFFUL * 2; - r7 = 0x3FFFFFFUL * 2; - r8 = 0x3FFFFFFUL * 2; - r9 = 0x03FFFFFUL * 2; + VERIFY_CHECK(t == 0 || t == -(uint32_t)1); + + /* Add 2P if a8 is "negative". */ + r0 = 0x3FFF85EUL & t; + r1 = 0x3FFFF7FUL & t; + r2 = 0x3FFFFFFUL & t; + r3 = 0x3FFFFFFUL & t; + r4 = 0x3FFFFFFUL & t; + r5 = 0x3FFFFFFUL & t; + r6 = 0x3FFFFFFUL & t; + r7 = 0x3FFFFFFUL & t; + r8 = 0x3FFFFFFUL & t; + r9 = 0x07FFFFFUL & t; r0 += a0 & M26; r1 += (a0 >> 26 | a1 << 4) & M26; @@ -1217,7 +1219,7 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { r->n[9] = r9; #ifdef VERIFY - r->magnitude = 2; + r->magnitude = 1; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -1526,19 +1528,19 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { int32_t f[9] = { 0x3FFFFC2F, 0x3FFFFFFB, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; - secp256k1_fe b0, b1; + secp256k1_fe b; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in; #endif - b0 = *a; - secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_30(g, &b0); + b = *a; + secp256k1_fe_normalize(&b); + secp256k1_fe_encode_30(g, &b); #ifdef VERIFY - zero_in = secp256k1_fe_is_zero(&b0); + zero_in = secp256k1_fe_is_zero(&b); #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). 
@@ -1566,17 +1568,18 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { sign = (f[0] >> 1) & 1; - secp256k1_fe_decode_30(&b0, d); + secp256k1_fe_decode_30(&b, d); - secp256k1_fe_negate(&b1, &b0, 2); - secp256k1_fe_cmov(&b0, &b1, sign); - secp256k1_fe_normalize_weak(&b0); + if (sign) { + secp256k1_fe_negate(&b, &b, 1); + secp256k1_fe_normalize_weak(&b); + } #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b) == !zero_in); #endif - *r = b0; + *r = b; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 082886a150..2d89e4fcdf 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -502,21 +502,23 @@ static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { const uint64_t M52 = UINT64_MAX >> 12; const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; - uint64_t r0, r1, r2, r3, r4; + uint64_t r0, r1, r2, r3, r4, t; + + t = (int64_t)a4 >> 8; /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); - VERIFY_CHECK((int64_t)a4 >> 8 == 0 || (int64_t)a4 >> 8 == -(int64_t)1); + VERIFY_CHECK(t == 0 || t == -(uint64_t)1); - /* Add a multiple of the field prime in case a4 is "negative". */ - r0 = 0xFFFFEFFFFFC2FULL * 2; - r1 = 0xFFFFFFFFFFFFFULL * 2; - r2 = 0xFFFFFFFFFFFFFULL * 2; - r3 = 0xFFFFFFFFFFFFFULL * 2; - r4 = 0x0FFFFFFFFFFFFULL * 2; + /* Add 2P if a4 is "negative". */ + r0 = 0xFFFFDFFFFF85EULL & t; + r1 = 0xFFFFFFFFFFFFFULL & t; + r2 = 0xFFFFFFFFFFFFFULL & t; + r3 = 0xFFFFFFFFFFFFFULL & t; + r4 = 0x1FFFFFFFFFFFFULL & t; r0 += a0 & M52; r1 += (a0 >> 52 | a1 << 10) & M52; @@ -531,7 +533,7 @@ static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { r->n[4] = r4; #ifdef VERIFY - r->magnitude = 2; + r->magnitude = 1; r->normalized = 0; secp256k1_fe_verify(r); #endif @@ -819,7 +821,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(&b0, d); - secp256k1_fe_negate(&b1, &b0, 2); + secp256k1_fe_negate(&b1, &b0, 1); secp256k1_fe_cmov(&b0, &b1, sign); secp256k1_fe_normalize_weak(&b0); @@ -841,19 +843,19 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_fe b0, b1; + secp256k1_fe b; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in; #endif - b0 = *a; - secp256k1_fe_normalize(&b0); - secp256k1_fe_encode_62(g, &b0); + b = *a; + secp256k1_fe_normalize(&b); + secp256k1_fe_encode_62(g, &b); #ifdef VERIFY - zero_in = secp256k1_fe_is_zero(&b0); + zero_in = secp256k1_fe_is_zero(&b); #endif /* The paper uses 'delta'; eta == -delta (a performance tweak). 
@@ -881,17 +883,18 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { sign = (f[0] >> 1) & 1; - secp256k1_fe_decode_62(&b0, d); + secp256k1_fe_decode_62(&b, d); - secp256k1_fe_negate(&b1, &b0, 2); - secp256k1_fe_cmov(&b0, &b1, sign); - secp256k1_fe_normalize_weak(&b0); + if (sign) { + secp256k1_fe_negate(&b, &b, 1); + secp256k1_fe_normalize_weak(&b); + } #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b) == !zero_in); #endif - *r = b0; + *r = b; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index a76a510d31..ee44b36bfd 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1308,15 +1308,15 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b0; + secp256k1_scalar b; int i, sign; uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b0 = *x; - secp256k1_scalar_encode_62(g, &b0); + b = *x; + secp256k1_scalar_encode_62(g, &b); /* The paper uses 'delta'; eta == -delta (a performance tweak). * @@ -1343,14 +1343,17 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc sign = (f[0] >> 1) & 1; - secp256k1_scalar_decode_62(&b0, d); - secp256k1_scalar_cond_negate(&b0, sign); + secp256k1_scalar_decode_62(&b, d); + + if (sign) { + secp256k1_scalar_negate(&b, &b); + } #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b) == !zero_in); #endif - *r = b0; + *r = b; } #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 2b34d13aed..d1009cc870 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -1077,15 +1077,15 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b0; + secp256k1_scalar b; int i, sign; uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b0 = *x; - secp256k1_scalar_encode_30(g, &b0); + b = *x; + secp256k1_scalar_encode_30(g, &b); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
* @@ -1112,14 +1112,17 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc sign = (f[0] >> 1) & 1; - secp256k1_scalar_decode_30(&b0, d); - secp256k1_scalar_cond_negate(&b0, sign); + secp256k1_scalar_decode_30(&b, d); + + if (sign) { + secp256k1_scalar_negate(&b, &b); + } #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(&b) == !zero_in); #endif - *r = b0; + *r = b; } #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ From 308fd32e004991991a52d91a9c847ddca98b69ef Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 15 Aug 2020 14:07:05 +0700 Subject: [PATCH 26/34] Experiment with f,g shortening in inv_var --- src/field_10x26_impl.h | 71 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 593266a967..baf1ab94be 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1457,6 +1457,41 @@ static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, const int32_t *t) g[8] = (int32_t)cg; } +static void secp256k1_fe_update_fg_30_var(int len, int32_t *f, int32_t *g, const int32_t *t) { + + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t fi, gi; + int64_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; + VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + f[i - 1] = (int32_t)cf & M30; cf >>= 30; + g[i - 1] = (int32_t)cg & M30; cg >>= 30; + } + + f[len - 1] = (int32_t)cf; + g[len - 1] = (int32_t)cg; +} + static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and @@ -1519,6 +1554,8 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { +#define IS_THIS_FASTER 1 + /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ @@ -1531,6 +1568,10 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe b; int i, sign; uint32_t eta; +#if IS_THIS_FASTER + int j, len = 9; + int32_t cond, fn, gn; +#endif #ifdef VERIFY int zero_in; #endif @@ -1550,6 +1591,35 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { +#if IS_THIS_FASTER + eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); + secp256k1_fe_update_de_30(d, e, t); + secp256k1_fe_update_fg_30_var(len, f, g, t); + + if (g[0] == 0) { + cond = 0; + for (j = 1; j < len; ++j) { + cond |= g[j]; + } + if (cond == 0) { + break; + } + } + + fn = f[len - 1]; + gn = g[len - 1]; + + cond = ((int32_t)len - 2) >> 31; + cond |= fn ^ (fn >> 31); + cond |= gn ^ (gn >> 31); + + if (cond == 0) + { + f[len - 2] |= (uint32_t)fn << 30; + g[len - 2] |= (uint32_t)gn << 30; + --len; + } +#else eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); secp256k1_fe_update_de_30(d, e, t); secp256k1_fe_update_fg_30(f, g, t); @@ -1559,6 +1629,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { break; } } +#endif } VERIFY_CHECK(i < 25); From ff0cf1124c630be4f4cd980ce333bf463ad2c289 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 15 Aug 2020 14:28:42 +0700 Subject: [PATCH 27/34] f,g shortening for 64bit field --- src/field_5x52_impl.h | 71 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 2d89e4fcdf..1d80926e8e 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -772,6 +772,41 @@ static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, const int64_t *t) g[4] = (int64_t)cg; } +static void secp256k1_fe_update_fg_62_var(int len, int64_t *f, int64_t *g, const int64_t *t) { + + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; + int64_t fi, gi; + int128_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + f[i - 1] = (int64_t)cf & M62; cf >>= 62; + g[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + + f[len - 1] = (int64_t)cf; + g[len - 1] = (int64_t)cg; +} + static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { /* Modular inversion based on the paper "Fast constant-time gcd computation and @@ -834,6 +869,8 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { +#define IS_THIS_FASTER 1 + /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ @@ -846,6 +883,10 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe b; int i, sign; uint64_t eta; +#if IS_THIS_FASTER + int j, len = 5; + int64_t cond, fn, gn; +#endif #ifdef VERIFY int zero_in; #endif @@ -865,6 +906,35 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { +#if IS_THIS_FASTER + eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); + secp256k1_fe_update_de_62(d, e, t); + secp256k1_fe_update_fg_62_var(len, f, g, t); + + if (g[0] == 0) { + cond = 0; + for (j = 1; j < len; ++j) { + cond |= g[j]; + } + if (cond == 0) { + break; + } + } + + fn = f[len - 1]; + gn = g[len - 1]; + + cond = ((int64_t)len - 2) >> 63; + cond |= fn ^ (fn >> 63); + cond |= gn ^ (gn >> 63); + + if (cond == 0) + { + f[len - 2] |= fn << 62; + g[len - 2] |= gn << 62; + --len; + } +#else eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); secp256k1_fe_update_de_62(d, e, t); secp256k1_fe_update_fg_62(f, g, t); @@ -874,6 +944,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { break; } } +#endif } VERIFY_CHECK(i < 12); From b51a1b55d9f90f792766cd1d2c76d4e4442a49ae Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 16 Aug 2020 12:53:59 +0700 Subject: [PATCH 28/34] THIS_IS_FASTER --- src/field_10x26_impl.h | 23 +++------------- src/field_5x52_impl.h | 25 +++--------------- src/scalar_4x64_impl.h | 60 +++++++++++++++++++++++++++++++++++++++--- src/scalar_8x32_impl.h | 60 +++++++++++++++++++++++++++++++++++++++--- 4 files changed, 121 insertions(+), 47 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index baf1ab94be..cc29cdbaa2 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1554,8 +1554,6 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { -#define IS_THIS_FASTER 1 - /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
*/ @@ -1566,12 +1564,9 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; secp256k1_fe b; - int i, sign; + int i, j, len = 9, sign; uint32_t eta; -#if IS_THIS_FASTER - int j, len = 9; int32_t cond, fn, gn; -#endif #ifdef VERIFY int zero_in; #endif @@ -1591,7 +1586,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { -#if IS_THIS_FASTER + eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); secp256k1_fe_update_de_30(d, e, t); secp256k1_fe_update_fg_30_var(len, f, g, t); @@ -1613,23 +1608,11 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { cond |= fn ^ (fn >> 31); cond |= gn ^ (gn >> 31); - if (cond == 0) - { + if (cond == 0) { f[len - 2] |= (uint32_t)fn << 30; g[len - 2] |= (uint32_t)gn << 30; --len; } -#else - eta = secp256k1_fe_divsteps_30_var(eta, f[0], g[0], t); - secp256k1_fe_update_de_30(d, e, t); - secp256k1_fe_update_fg_30(f, g, t); - - if (g[0] == 0) { - if ((g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0) { - break; - } - } -#endif } VERIFY_CHECK(i < 25); diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 1d80926e8e..da1391aea4 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -869,8 +869,6 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { -#define IS_THIS_FASTER 1 - /* Modular inversion based on the paper "Fast constant-time gcd computation and * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ @@ -881,12 +879,9 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_fe b; - int i, sign; + int i, j, len = 5, sign; uint64_t eta; -#if IS_THIS_FASTER - int j, len = 5; int64_t cond, fn, gn; -#endif #ifdef VERIFY int zero_in; #endif @@ -906,7 +901,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { -#if IS_THIS_FASTER + eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); secp256k1_fe_update_de_62(d, e, t); secp256k1_fe_update_fg_62_var(len, f, g, t); @@ -915,7 +910,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { cond = 0; for (j = 1; j < len; ++j) { cond |= g[j]; - } + } if (cond == 0) { break; } @@ -928,23 +923,11 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { cond |= fn ^ (fn >> 63); cond |= gn ^ (gn >> 63); - if (cond == 0) - { + if (cond == 0) { f[len - 2] |= fn << 62; g[len - 2] |= gn << 62; --len; } -#else - eta = secp256k1_fe_divsteps_62_var(eta, f[0], g[0], t); - secp256k1_fe_update_de_62(d, e, t); - secp256k1_fe_update_fg_62(f, g, t); - - if (g[0] == 0) { - if ((g[1] | g[2] | g[3] | g[4]) == 0) { - break; - } - } -#endif } VERIFY_CHECK(i < 12); diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index ee44b36bfd..80c90cd347 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1229,6 +1229,41 @@ static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, const int64_t g[4] = (int64_t)cg; } +static void secp256k1_scalar_update_fg_62_var(int len, int64_t *f, int64_t *g, const int64_t *t) { + + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; + int64_t fi, gi; + int128_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + 
gi = g[0]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int128_t)u * fi + (int128_t)v * gi; + cg -= (int128_t)q * fi + (int128_t)r * gi; + + f[i - 1] = (int64_t)cf & M62; cf >>= 62; + g[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + + f[len - 1] = (int64_t)cf; + g[len - 1] = (int64_t)cg; +} + static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(EXHAUSTIVE_TEST_ORDER) int i; @@ -1309,8 +1344,9 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_scalar b; - int i, sign; + int i, j, len = 5, sign; uint64_t eta; + int64_t cond, fn, gn; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1325,15 +1361,33 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc eta = -(uint64_t)1; for (i = 0; i < 12; ++i) { + eta = secp256k1_scalar_divsteps_62_var(eta, f[0], g[0], t); secp256k1_scalar_update_de_62(d, e, t); - secp256k1_scalar_update_fg_62(f, g, t); + secp256k1_scalar_update_fg_62_var(len, f, g, t); if (g[0] == 0) { - if ((g[1] | g[2] | g[3] | g[4]) == 0) { + cond = 0; + for (j = 1; j < len; ++j) { + cond |= g[j]; + } + if (cond == 0) { break; } } + + fn = f[len - 1]; + gn = g[len - 1]; + + cond = ((int64_t)len - 2) >> 63; + cond |= fn ^ (fn >> 63); + cond |= gn ^ (gn >> 63); + + if (cond == 0) { + f[len - 2] |= fn << 62; + g[len - 2] |= gn << 62; + --len; + } } VERIFY_CHECK(i < 12); diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index d1009cc870..40536c827b 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -998,6 +998,41 @@ static void secp256k1_scalar_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { g[8] = (int32_t)cg; } +static void secp256k1_scalar_update_fg_30_var(int len, int32_t *f, int32_t *g, const int32_t *t) { + + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t fi, gi; + int64_t cf = 0, cg = 0; + int i; + + VERIFY_CHECK(len > 0); + + fi = f[0]; + gi = g[0]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; + VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; + + for (i = 1; i < len; ++i) { + + fi = f[i]; + gi = g[i]; + + cf -= (int64_t)u * fi + (int64_t)v * gi; + cg -= (int64_t)q * fi + (int64_t)r * gi; + + f[i - 1] = (int32_t)cf & M30; cf >>= 30; + g[i - 1] = (int32_t)cg & M30; cg >>= 30; + } + + f[len - 1] = (int32_t)cf; + g[len - 1] = (int32_t)cg; +} + static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { #if defined(EXHAUSTIVE_TEST_ORDER) int i; @@ -1078,8 +1113,9 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; secp256k1_scalar b; - int i, sign; + int i, j, len = 9, sign; uint32_t eta; + int32_t cond, fn, gn; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif @@ -1094,15 +1130,33 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc eta = -(uint32_t)1; for (i = 0; i < 25; ++i) { + eta = secp256k1_scalar_divsteps_30_var(eta, f[0], g[0], t); secp256k1_scalar_update_de_30(d, e, t); - secp256k1_scalar_update_fg_30(f, 
g, t); + secp256k1_scalar_update_fg_30_var(len, f, g, t); if (g[0] == 0) { - if ((g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0) { + cond = 0; + for (j = 1; j < len; ++j) { + cond |= g[j]; + } + if (cond == 0) { break; } } + + fn = f[len - 1]; + gn = g[len - 1]; + + cond = ((int32_t)len - 2) >> 31; + cond |= fn ^ (fn >> 31); + cond |= gn ^ (gn >> 31); + + if (cond == 0) { + f[len - 2] |= (uint32_t)fn << 30; + g[len - 2] |= (uint32_t)gn << 30; + --len; + } } VERIFY_CHECK(i < 25); From 1baff2caec31ff709b9404edd8a7b7ea380d29d4 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 16 Aug 2020 19:07:25 +0700 Subject: [PATCH 29/34] Accentuate the positive (Eliminate the negative) --- src/field_10x26_impl.h | 46 ++++++++++++++--------------- src/field_5x52_impl.h | 66 +++++++++++++++++++++--------------------- src/scalar_4x64_impl.h | 66 +++++++++++++++++++++--------------------- src/scalar_8x32_impl.h | 42 +++++++++++++-------------- 4 files changed, 110 insertions(+), 110 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index cc29cdbaa2..3fd44bc5da 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1250,15 +1250,15 @@ static void secp256k1_fe_encode_30(int32_t *r, const secp256k1_fe *a) { static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { - uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 30; ++i) { VERIFY_CHECK((f & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << i); - VERIFY_CHECK((q * f0 + r * g0) == -g << i); + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); c1 = -(g & (eta >> 31)); @@ -1296,7 +1296,7 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; #endif - uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -1323,8 +1323,8 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (30 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (30 - i)); + VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); if ((int32_t)eta < 0) { eta = -eta; @@ -1371,14 +1371,14 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; int32_t di, ei, md, me; - int64_t cd = 0, ce = 0; + int64_t cd, ce; int i; di = d[0]; ei = e[0]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd = (int64_t)u * di + (int64_t)v * ei; + ce = (int64_t)q * di + (int64_t)r * ei; /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). 
*/ @@ -1399,8 +1399,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) di = d[i]; ei = e[i]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd += (int64_t)u * di + (int64_t)v * ei; + ce += (int64_t)q * di + (int64_t)r * ei; d[i - 1] = (int32_t)cd & M30; cd >>= 30; e[i - 1] = (int32_t)ce & M30; ce >>= 30; @@ -1410,8 +1410,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) di = d[8]; ei = e[8]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd += (int64_t)u * di + (int64_t)v * ei; + ce += (int64_t)q * di + (int64_t)r * ei; cd += (int64_t)65536 * md; ce += (int64_t)65536 * me; @@ -1429,14 +1429,14 @@ static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, const int32_t *t) const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; int32_t fi, gi; - int64_t cf = 0, cg = 0; + int64_t cf, cg; int i; fi = f[0]; gi = g[0]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; @@ -1446,8 +1446,8 @@ static void secp256k1_fe_update_fg_30(int32_t *f, int32_t *g, const int32_t *t) fi = f[i]; gi = g[i]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; f[i - 1] = (int32_t)cf & M30; cf >>= 30; g[i - 1] = (int32_t)cg & M30; cg >>= 30; @@ -1462,7 +1462,7 @@ static void secp256k1_fe_update_fg_30_var(int len, int32_t *f, int32_t *g, const const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; int32_t fi, gi; - int64_t cf = 0, cg = 0; + int64_t cf, cg; int i; VERIFY_CHECK(len > 0); @@ -1470,8 +1470,8 @@ static void secp256k1_fe_update_fg_30_var(int len, int32_t *f, int32_t *g, const fi = f[0]; gi = g[0]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; @@ -1481,8 +1481,8 @@ static void secp256k1_fe_update_fg_30_var(int len, int32_t *f, int32_t *g, const fi = f[i]; gi = g[i]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; f[i - 1] = (int32_t)cf & M30; cf >>= 30; g[i - 1] = (int32_t)cg & M30; cg >>= 30; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index da1391aea4..15c4fc2a8b 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -558,15 +558,15 @@ static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) { static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { - uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 62; ++i) { VERIFY_CHECK((f & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << i); - VERIFY_CHECK((q * f0 + r * g0) == -g << i); + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); c1 = -(g & (eta >> 63)); @@ -607,7 +607,7 @@ static uint64_t 
secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t }; #endif - uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t f = f0, g = g0, m, w, x, y, z; int i = 62, limit, zeros; @@ -634,8 +634,8 @@ static uint64_t secp256k1_fe_divsteps_62_var(uint64_t eta, uint64_t f0, uint64_t VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); + VERIFY_CHECK((u * f0 + v * g0) == f << (62 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (62 - i)); if ((int64_t)eta < 0) { eta = -eta; @@ -683,10 +683,10 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int64_t md, me; - int128_t cd = 0, ce = 0; + int128_t cd, ce; - cd -= (int128_t)u * d0 + (int128_t)v * e0; - ce -= (int128_t)q * d0 + (int128_t)r * e0; + cd = (int128_t)u * d0 + (int128_t)v * e0; + ce = (int128_t)q * d0 + (int128_t)r * e0; /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ @@ -699,26 +699,26 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; - cd -= (int128_t)u * d1 + (int128_t)v * e1; - ce -= (int128_t)q * d1 + (int128_t)r * e1; + cd += (int128_t)u * d1 + (int128_t)v * e1; + ce += (int128_t)q * d1 + (int128_t)r * e1; d[0] = (int64_t)cd & M62; cd >>= 62; e[0] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d2 + (int128_t)v * e2; - ce -= (int128_t)q * d2 + (int128_t)r * e2; + cd += (int128_t)u * d2 + (int128_t)v * e2; + ce += (int128_t)q * d2 + (int128_t)r * e2; d[1] = (int64_t)cd & M62; cd >>= 62; e[1] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d3 + (int128_t)v * e3; - ce -= (int128_t)q * d3 + (int128_t)r * e3; + cd += (int128_t)u * d3 + (int128_t)v * e3; + ce += (int128_t)q * d3 + (int128_t)r * e3; d[2] = (int64_t)cd & M62; cd >>= 62; e[2] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d4 + (int128_t)v * e4; - ce -= (int128_t)q * d4 + (int128_t)r * e4; + cd += (int128_t)u * d4 + (int128_t)v * e4; + ce += (int128_t)q * d4 + (int128_t)r * e4; cd += (int128_t)256 * md; ce += (int128_t)256 * me; @@ -736,34 +736,34 @@ static void secp256k1_fe_update_fg_62(int64_t *f, int64_t *g, const int64_t *t) const int64_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4]; const int64_t g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; - int128_t cf = 0, cg = 0; + int128_t cf, cg; - cf -= (int128_t)u * f0 + (int128_t)v * g0; - cg -= (int128_t)q * f0 + (int128_t)r * g0; + cf = (int128_t)u * f0 + (int128_t)v * g0; + cg = (int128_t)q * f0 + (int128_t)r * g0; VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; - cf -= (int128_t)u * f1 + (int128_t)v * g1; - cg -= (int128_t)q * f1 + (int128_t)r * g1; + cf += (int128_t)u * f1 + (int128_t)v * g1; + cg += (int128_t)q * f1 + (int128_t)r * g1; f[0] = (int64_t)cf & M62; cf >>= 62; g[0] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f2 + (int128_t)v * g2; - cg -= (int128_t)q * f2 + (int128_t)r * g2; + cf += (int128_t)u * f2 + (int128_t)v * g2; + cg += (int128_t)q * f2 + (int128_t)r * g2; f[1] = (int64_t)cf & M62; cf 
>>= 62; g[1] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f3 + (int128_t)v * g3; - cg -= (int128_t)q * f3 + (int128_t)r * g3; + cf += (int128_t)u * f3 + (int128_t)v * g3; + cg += (int128_t)q * f3 + (int128_t)r * g3; f[2] = (int64_t)cf & M62; cf >>= 62; g[2] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f4 + (int128_t)v * g4; - cg -= (int128_t)q * f4 + (int128_t)r * g4; + cf += (int128_t)u * f4 + (int128_t)v * g4; + cg += (int128_t)q * f4 + (int128_t)r * g4; f[3] = (int64_t)cf & M62; cf >>= 62; g[3] = (int64_t)cg & M62; cg >>= 62; @@ -777,7 +777,7 @@ static void secp256k1_fe_update_fg_62_var(int len, int64_t *f, int64_t *g, const const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int64_t fi, gi; - int128_t cf = 0, cg = 0; + int128_t cf, cg; int i; VERIFY_CHECK(len > 0); @@ -785,8 +785,8 @@ static void secp256k1_fe_update_fg_62_var(int len, int64_t *f, int64_t *g, const fi = f[0]; gi = g[0]; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf = (int128_t)u * fi + (int128_t)v * gi; + cg = (int128_t)q * fi + (int128_t)r * gi; VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; @@ -796,8 +796,8 @@ static void secp256k1_fe_update_fg_62_var(int len, int64_t *f, int64_t *g, const fi = f[i]; gi = g[i]; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf += (int128_t)u * fi + (int128_t)v * gi; + cg += (int128_t)q * fi + (int128_t)r * gi; f[i - 1] = (int64_t)cf & M62; cf >>= 62; g[i - 1] = (int64_t)cg & M62; cg >>= 62; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 80c90cd347..1563dad1f9 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1010,15 +1010,15 @@ static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, int64_t *t) { - uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 62; ++i) { VERIFY_CHECK((f & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << i); - VERIFY_CHECK((q * f0 + r * g0) == -g << i); + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); c1 = -(g & (eta >> 63)); @@ -1059,7 +1059,7 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint }; #endif - uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1; + uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t f = f0, g = g0, m, w, x, y, z; int i = 62, limit, zeros; @@ -1086,8 +1086,8 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (62 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (62 - i)); + VERIFY_CHECK((u * f0 + v * g0) == f << (62 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (62 - i)); if ((int64_t)eta < 0) { eta = -eta; @@ -1134,10 +1134,10 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int64_t md, me; - int128_t cd = 0, ce = 0; + int128_t cd, ce; - cd -= (int128_t)u * d0 + (int128_t)v * e0; - ce -= (int128_t)q * d0 + (int128_t)r * e0; + cd = (int128_t)u * d0 + (int128_t)v * e0; + ce = (int128_t)q * d0 + (int128_t)r * 
e0; /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ @@ -1150,8 +1150,8 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; - cd -= (int128_t)u * d1 + (int128_t)v * e1; - ce -= (int128_t)q * d1 + (int128_t)r * e1; + cd += (int128_t)u * d1 + (int128_t)v * e1; + ce += (int128_t)q * d1 + (int128_t)r * e1; cd += (int128_t)P[1] * md; ce += (int128_t)P[1] * me; @@ -1159,8 +1159,8 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t d[0] = (int64_t)cd & M62; cd >>= 62; e[0] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d2 + (int128_t)v * e2; - ce -= (int128_t)q * d2 + (int128_t)r * e2; + cd += (int128_t)u * d2 + (int128_t)v * e2; + ce += (int128_t)q * d2 + (int128_t)r * e2; cd += (int128_t)P[2] * md; ce += (int128_t)P[2] * me; @@ -1168,14 +1168,14 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t d[1] = (int64_t)cd & M62; cd >>= 62; e[1] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d3 + (int128_t)v * e3; - ce -= (int128_t)q * d3 + (int128_t)r * e3; + cd += (int128_t)u * d3 + (int128_t)v * e3; + ce += (int128_t)q * d3 + (int128_t)r * e3; d[2] = (int64_t)cd & M62; cd >>= 62; e[2] = (int64_t)ce & M62; ce >>= 62; - cd -= (int128_t)u * d4 + (int128_t)v * e4; - ce -= (int128_t)q * d4 + (int128_t)r * e4; + cd += (int128_t)u * d4 + (int128_t)v * e4; + ce += (int128_t)q * d4 + (int128_t)r * e4; cd += (int128_t)P[4] * md; ce += (int128_t)P[4] * me; @@ -1193,34 +1193,34 @@ static void secp256k1_scalar_update_fg_62(int64_t *f, int64_t *g, const int64_t const int64_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4]; const int64_t g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; - int128_t cf = 0, cg = 0; + int128_t cf, cg; - cf -= (int128_t)u * f0 + (int128_t)v * g0; - cg -= (int128_t)q * f0 + (int128_t)r * g0; + cf = (int128_t)u * f0 + (int128_t)v * g0; + cg = (int128_t)q * f0 + (int128_t)r * g0; VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; - cf -= (int128_t)u * f1 + (int128_t)v * g1; - cg -= (int128_t)q * f1 + (int128_t)r * g1; + cf += (int128_t)u * f1 + (int128_t)v * g1; + cg += (int128_t)q * f1 + (int128_t)r * g1; f[0] = (int64_t)cf & M62; cf >>= 62; g[0] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f2 + (int128_t)v * g2; - cg -= (int128_t)q * f2 + (int128_t)r * g2; + cf += (int128_t)u * f2 + (int128_t)v * g2; + cg += (int128_t)q * f2 + (int128_t)r * g2; f[1] = (int64_t)cf & M62; cf >>= 62; g[1] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f3 + (int128_t)v * g3; - cg -= (int128_t)q * f3 + (int128_t)r * g3; + cf += (int128_t)u * f3 + (int128_t)v * g3; + cg += (int128_t)q * f3 + (int128_t)r * g3; f[2] = (int64_t)cf & M62; cf >>= 62; g[2] = (int64_t)cg & M62; cg >>= 62; - cf -= (int128_t)u * f4 + (int128_t)v * g4; - cg -= (int128_t)q * f4 + (int128_t)r * g4; + cf += (int128_t)u * f4 + (int128_t)v * g4; + cg += (int128_t)q * f4 + (int128_t)r * g4; f[3] = (int64_t)cf & M62; cf >>= 62; g[3] = (int64_t)cg & M62; cg >>= 62; @@ -1234,7 +1234,7 @@ static void secp256k1_scalar_update_fg_62_var(int len, int64_t *f, int64_t *g, c const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; int64_t fi, gi; - int128_t cf 
= 0, cg = 0; + int128_t cf, cg; int i; VERIFY_CHECK(len > 0); @@ -1242,8 +1242,8 @@ static void secp256k1_scalar_update_fg_62_var(int len, int64_t *f, int64_t *g, c fi = f[0]; gi = g[0]; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf = (int128_t)u * fi + (int128_t)v * gi; + cg = (int128_t)q * fi + (int128_t)r * gi; VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; @@ -1253,8 +1253,8 @@ static void secp256k1_scalar_update_fg_62_var(int len, int64_t *f, int64_t *g, c fi = f[i]; gi = g[i]; - cf -= (int128_t)u * fi + (int128_t)v * gi; - cg -= (int128_t)q * fi + (int128_t)r * gi; + cf += (int128_t)u * fi + (int128_t)v * gi; + cg += (int128_t)q * fi + (int128_t)r * gi; f[i - 1] = (int64_t)cf & M62; cf >>= 62; g[i - 1] = (int64_t)cg & M62; cg >>= 62; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 40536c827b..9c964dae91 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -804,15 +804,15 @@ static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, int32_t *t) { - uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 30; ++i) { VERIFY_CHECK((f & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << i); - VERIFY_CHECK((q * f0 + r * g0) == -g << i); + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); c1 = -(g & (eta >> 31)); @@ -850,7 +850,7 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; #endif - uint32_t u = -(uint32_t)1, v = 0, q = 0, r = -(uint32_t)1; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -877,8 +877,8 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); - VERIFY_CHECK((u * f0 + v * g0) == -f << (30 - i)); - VERIFY_CHECK((q * f0 + r * g0) == -g << (30 - i)); + VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); if ((int32_t)eta < 0) { eta = -eta; @@ -924,14 +924,14 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t 0, 0, 0, 65536 }; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; - int64_t cd = 0, ce = 0; + int64_t cd, ce; int i; di = d[0]; ei = e[0]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd = (int64_t)u * di + (int64_t)v * ei; + ce = (int64_t)q * di + (int64_t)r * ei; /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). 
*/ @@ -949,8 +949,8 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t di = d[i]; ei = e[i]; - cd -= (int64_t)u * di + (int64_t)v * ei; - ce -= (int64_t)q * di + (int64_t)r * ei; + cd += (int64_t)u * di + (int64_t)v * ei; + ce += (int64_t)q * di + (int64_t)r * ei; cd += (int64_t)P[i] * md; ce += (int64_t)P[i] * me; @@ -967,14 +967,14 @@ static void secp256k1_scalar_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi; - int64_t cf = 0, cg = 0; + int64_t cf, cg; int i; fi = f[0]; gi = g[0]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; VERIFY_CHECK(((int32_t)cf & M30) == 0); VERIFY_CHECK(((int32_t)cg & M30) == 0); @@ -987,8 +987,8 @@ static void secp256k1_scalar_update_fg_30(int32_t *f, int32_t *g, int32_t *t) { fi = f[i]; gi = g[i]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; f[i - 1] = (int32_t)cf & M30; cf >>= 30; g[i - 1] = (int32_t)cg & M30; cg >>= 30; @@ -1003,7 +1003,7 @@ static void secp256k1_scalar_update_fg_30_var(int len, int32_t *f, int32_t *g, c const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; int32_t fi, gi; - int64_t cf = 0, cg = 0; + int64_t cf, cg; int i; VERIFY_CHECK(len > 0); @@ -1011,8 +1011,8 @@ static void secp256k1_scalar_update_fg_30_var(int len, int32_t *f, int32_t *g, c fi = f[0]; gi = g[0]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; @@ -1022,8 +1022,8 @@ static void secp256k1_scalar_update_fg_30_var(int len, int32_t *f, int32_t *g, c fi = f[i]; gi = g[i]; - cf -= (int64_t)u * fi + (int64_t)v * gi; - cg -= (int64_t)q * fi + (int64_t)r * gi; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; f[i - 1] = (int32_t)cf & M30; cf >>= 30; g[i - 1] = (int32_t)cg & M30; cg >>= 30; From 65550c1f6d44da2bd9d72d6fd2256a6cba0fd828 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 18 Aug 2020 00:50:29 +0700 Subject: [PATCH 30/34] Try 128 byte table of inverses --- src/field_10x26_impl.h | 35 +++++++++++++++++++++-------------- src/scalar_8x32_impl.h | 35 +++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 3fd44bc5da..4b3e35a14a 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1292,10 +1292,25 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t #if 1 static const uint8_t debruijn[32] = { - 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26, - 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; + 0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A, + 0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B, + 0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B }; #endif + static const uint8_t inv256[128] = { + 0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59, + 0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31, + 0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 
0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89, + 0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61, + 0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9, + 0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91, + 0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9, + 0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1, + 0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19, + 0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1, + 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 + }; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -1333,25 +1348,17 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t z = v; v = r; r = -z; } -#if 1 - /* Handle up to 3 divsteps at once, subject to eta and i. */ + /* Handle up to 8 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); - m = (UINT32_MAX >> (32 - limit)) & 7U; + m = (UINT32_MAX >> (32 - limit)) & 255U; + + w = (g * inv256[(f >> 1) & 127]) & m; - /* Note that f * f == 1 mod 8, for any f. */ - w = (-f * g) & m; g += f * w; q += u * w; r += v * w; VERIFY_CHECK((g & m) == 0); -#else - g += f; - q += u; - r += v; - - VERIFY_CHECK((g & 1) == 0); -#endif } t[0] = (int32_t)u; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 9c964dae91..4f11bd19cf 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -846,10 +846,25 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint #if 1 static const uint8_t debruijn[32] = { - 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26, - 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27 }; + 0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A, + 0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B, + 0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B }; #endif + static const uint8_t inv256[128] = { + 0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59, + 0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31, + 0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89, + 0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61, + 0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9, + 0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91, + 0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9, + 0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1, + 0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19, + 0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1, + 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 + }; + uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m, w, x, y, z; int i = 30, limit, zeros; @@ -887,25 +902,17 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint z = v; v = r; r = -z; } -#if 1 - /* Handle up to 3 divsteps at once, subject to eta and i. */ + /* Handle up to 8 divsteps at once, subject to eta and i. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); - m = (UINT32_MAX >> (32 - limit)) & 7U; + m = (UINT32_MAX >> (32 - limit)) & 255U; + + w = (g * inv256[(f >> 1) & 127]) & m; - /* Note that f * f == 1 mod 8, for any f. 
*/ - w = (-f * g) & m; g += f * w; q += u * w; r += v * w; VERIFY_CHECK((g & m) == 0); -#else - g += f; - q += u; - r += v; - - VERIFY_CHECK((g & 1) == 0); -#endif } t[0] = (int32_t)u; From 5ccfc30aaf78bbfa1e3c7f171c5a7e425a2bad5c Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 25 Aug 2020 14:55:41 +0700 Subject: [PATCH 31/34] Avoid redundant calculation --- src/field_10x26_impl.h | 5 ++--- src/field_5x52_impl.h | 5 ++--- src/scalar_4x64_impl.h | 5 ++--- src/scalar_8x32_impl.h | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 4b3e35a14a..7faec253ac 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1260,7 +1260,8 @@ static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - c1 = -(g & (eta >> 31)); + c2 = -(g & 1); + c1 = c2 & ((int32_t)eta >> 31); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1273,8 +1274,6 @@ static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, eta = (eta ^ c1) - c1 - 1; - c2 = -(g & 1); - g += (f & c2); g >>= 1; q += (u & c2); u <<= 1; r += (v & c2); v <<= 1; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 15c4fc2a8b..73ffe44b2b 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -568,7 +568,8 @@ static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - c1 = -(g & (eta >> 63)); + c2 = -(g & 1); + c1 = c2 & ((int64_t)eta >> 63); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -581,8 +582,6 @@ static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, eta = (eta ^ c1) - c1 - 1; - c2 = -(g & 1); - g += (f & c2); g >>= 1; q += (u & c2); u <<= 1; r += (v & c2); v <<= 1; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 1563dad1f9..94b33f8cd3 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1020,7 +1020,8 @@ static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - c1 = -(g & (eta >> 63)); + c2 = -(g & 1); + c1 = c2 & ((int64_t)eta >> 63); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -1033,8 +1034,6 @@ static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t eta = (eta ^ c1) - c1 - 1; - c2 = -(g & 1); - g += (f & c2); g >>= 1; q += (u & c2); u <<= 1; r += (v & c2); v <<= 1; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 4f11bd19cf..e7fb988947 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -814,7 +814,8 @@ static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - c1 = -(g & (eta >> 31)); + c2 = -(g & 1); + c1 = c2 & ((int32_t)eta >> 31); x = (f ^ g) & c1; f ^= x; g ^= x; g ^= c1; g -= c1; @@ -827,8 +828,6 @@ static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t eta = (eta ^ c1) - c1 - 1; - c2 = -(g & 1); - g += (f & c2); g >>= 1; q += (u & c2); u <<= 1; r += (v & c2); v <<= 1; From cbd2d57dcee044de9a1fabc8887ff090a2fa4482 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 9 Sep 2020 14:31:42 +0700 Subject: [PATCH 32/34] Faster const-time divsteps --- src/field_10x26_impl.h | 26 +++++++++++++++----------- 
src/field_5x52_impl.h | 26 +++++++++++++++----------- src/scalar_4x64_impl.h | 26 +++++++++++++++----------- src/scalar_8x32_impl.h | 26 +++++++++++++++----------- 4 files changed, 60 insertions(+), 44 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 7faec253ac..16e28c7821 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1260,23 +1260,27 @@ static uint32_t secp256k1_fe_divsteps_30(uint32_t eta, uint32_t f0, uint32_t g0, VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); + c1 = (int32_t)eta >> 31; c2 = -(g & 1); - c1 = c2 & ((int32_t)eta >> 31); - x = (f ^ g) & c1; - f ^= x; g ^= x; g ^= c1; g -= c1; + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; - y = (u ^ q) & c1; - u ^= y; q ^= y; q ^= c1; q -= c1; + g += x & c2; + q += y & c2; + r += z & c2; - z = (v ^ r) & c1; - v ^= z; r ^= z; r ^= c1; r -= c1; + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); - eta = (eta ^ c1) - c1 - 1; + f += g & c1; + u += q & c1; + v += r & c1; - g += (f & c2); g >>= 1; - q += (u & c2); u <<= 1; - r += (v & c2); v <<= 1; + g >>= 1; + u <<= 1; + v <<= 1; } t[0] = (int32_t)u; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 73ffe44b2b..7d70e1a387 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -568,23 +568,27 @@ static uint64_t secp256k1_fe_divsteps_62(uint64_t eta, uint64_t f0, uint64_t g0, VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); + c1 = (int64_t)eta >> 63; c2 = -(g & 1); - c1 = c2 & ((int64_t)eta >> 63); - x = (f ^ g) & c1; - f ^= x; g ^= x; g ^= c1; g -= c1; + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; - y = (u ^ q) & c1; - u ^= y; q ^= y; q ^= c1; q -= c1; + g += x & c2; + q += y & c2; + r += z & c2; - z = (v ^ r) & c1; - v ^= z; r ^= z; r ^= c1; r -= c1; + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); - eta = (eta ^ c1) - c1 - 1; + f += g & c1; + u += q & c1; + v += r & c1; - g += (f & c2); g >>= 1; - q += (u & c2); u <<= 1; - r += (v & c2); v <<= 1; + g >>= 1; + u <<= 1; + v <<= 1; } t[0] = (int64_t)u; diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 94b33f8cd3..00cf4842cf 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -1020,23 +1020,27 @@ static uint64_t secp256k1_scalar_divsteps_62(uint64_t eta, uint64_t f0, uint64_t VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); + c1 = (int64_t)eta >> 63; c2 = -(g & 1); - c1 = c2 & ((int64_t)eta >> 63); - x = (f ^ g) & c1; - f ^= x; g ^= x; g ^= c1; g -= c1; + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; - y = (u ^ q) & c1; - u ^= y; q ^= y; q ^= c1; q -= c1; + g += x & c2; + q += y & c2; + r += z & c2; - z = (v ^ r) & c1; - v ^= z; r ^= z; r ^= c1; r -= c1; + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); - eta = (eta ^ c1) - c1 - 1; + f += g & c1; + u += q & c1; + v += r & c1; - g += (f & c2); g >>= 1; - q += (u & c2); u <<= 1; - r += (v & c2); v <<= 1; + g >>= 1; + u <<= 1; + v <<= 1; } t[0] = (int64_t)u; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index e7fb988947..baf647d83f 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -814,23 +814,27 @@ static uint32_t secp256k1_scalar_divsteps_30(uint32_t eta, uint32_t f0, uint32_t VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); + c1 = (int32_t)eta >> 31; c2 = -(g & 1); - c1 = c2 & ((int32_t)eta >> 31); - x = (f ^ g) & c1; - f ^= x; g ^= x; g ^= c1; g -= c1; + x = (f ^ c1) - c1; + y = 
(u ^ c1) - c1; + z = (v ^ c1) - c1; - y = (u ^ q) & c1; - u ^= y; q ^= y; q ^= c1; q -= c1; + g += x & c2; + q += y & c2; + r += z & c2; - z = (v ^ r) & c1; - v ^= z; r ^= z; r ^= c1; r -= c1; + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); - eta = (eta ^ c1) - c1 - 1; + f += g & c1; + u += q & c1; + v += r & c1; - g += (f & c2); g >>= 1; - q += (u & c2); u <<= 1; - r += (v & c2); v <<= 1; + g >>= 1; + u <<= 1; + v <<= 1; } t[0] = (int32_t)u; From 85da7a9e4d0fb3b2251572f4bc683f65fd54f6e7 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 10 Nov 2020 22:52:04 +0700 Subject: [PATCH 33/34] Rework _update_de I/O bounds --- src/field_5x52_impl.h | 132 ++++++++++++++++++++++++----------------- src/scalar_4x64_impl.h | 123 ++++++++++++++++++++++---------------- 2 files changed, 149 insertions(+), 106 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 7d70e1a387..83bdaf4281 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -498,43 +498,62 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static void secp256k1_fe_normalize_62(int64_t *r, int64_t cond_negate) { + /* P == 2^256 - C62 */ + const int64_t C62 = 0x1000003D1LL; + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3], r4 = r[4]; + int64_t c, cond_add; + + cond_add = r4 >> 63; + + c = r0 - (C62 & cond_add); + r0 = c & M62; c >>= 62; + c += r1; + r1 = c & M62; c >>= 62; + c += r2; + r2 = c & M62; c >>= 62; + c += r3; + r3 = c & M62; c >>= 62; + c += r4 + (256 & cond_add); + r4 = c; + + cond_add = (c >> 63) ^ cond_negate; + + c = (r0 ^ cond_negate) - cond_negate - (C62 & cond_add); + r[0] = c & M62; c >>= 62; + c += (r1 ^ cond_negate) - cond_negate; + r[1] = c & M62; c >>= 62; + c += (r2 ^ cond_negate) - cond_negate; + r[2] = c & M62; c >>= 62; + c += (r3 ^ cond_negate) - cond_negate; + r[3] = c & M62; c >>= 62; + c += (r4 ^ cond_negate) - cond_negate + (256 & cond_add); + r[4] = c; + + VERIFY_CHECK(c >> 8 == 0); +} + static void secp256k1_fe_decode_62(secp256k1_fe *r, const int64_t *a) { const uint64_t M52 = UINT64_MAX >> 12; const uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; - uint64_t r0, r1, r2, r3, r4, t; - - t = (int64_t)a4 >> 8; - /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); - VERIFY_CHECK(t == 0 || t == -(uint64_t)1); - - /* Add 2P if a4 is "negative". 
*/ - r0 = 0xFFFFDFFFFF85EULL & t; - r1 = 0xFFFFFFFFFFFFFULL & t; - r2 = 0xFFFFFFFFFFFFFULL & t; - r3 = 0xFFFFFFFFFFFFFULL & t; - r4 = 0x1FFFFFFFFFFFFULL & t; - - r0 += a0 & M52; - r1 += (a0 >> 52 | a1 << 10) & M52; - r2 += (a1 >> 42 | a2 << 20) & M52; - r3 += (a2 >> 32 | a3 << 30) & M52; - r4 += (a3 >> 22 | a4 << 40); - - r->n[0] = r0; - r->n[1] = r1; - r->n[2] = r2; - r->n[3] = r3; - r->n[4] = r4; + VERIFY_CHECK(a4 >> 8 == 0); + + r->n[0] = a0 & M52; + r->n[1] = (a0 >> 52 | a1 << 10) & M52; + r->n[2] = (a1 >> 42 | a2 << 20) & M52; + r->n[3] = (a2 >> 32 | a3 << 30) & M52; + r->n[4] = (a3 >> 22 | a4 << 40); #ifdef VERIFY r->magnitude = 1; - r->normalized = 0; + r->normalized = 1; secp256k1_fe_verify(r); #endif } @@ -679,22 +698,38 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t) /* P == 2^256 - C62 */ const int64_t C62 = 0x1000003D1LL; - /* I62 == -P^-1 mod 2^62 */ - const int64_t I62 = 0x1838091DD2253531LL; + /* I62 == P^-1 mod 2^62 */ + const int64_t I62 = 0x27C7F6E22DDACACFLL; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4]; const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; - int64_t md, me; + int64_t md, me, sd, se; int128_t cd, ce; + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^62). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d4 >> 63; + se = e4 >> 63; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + cd = (int128_t)u * d0 + (int128_t)v * e0; ce = (int128_t)q * d0 + (int128_t)r * e0; - /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me - * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ - md = (I62 * 4 * (int64_t)cd) >> 2; - me = (I62 * 4 * (int64_t)ce) >> 2; + /* + * Subtract from md/me an extra term in the range [0, 2^62) such that the low 62 bits of each + * sum of products will be 0. This allows clean division by 2^62. On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. 
+ */ + + md -= (I62 * (int64_t)cd + md) & M62; + me -= (I62 * (int64_t)ce + me) & M62; cd -= (int128_t)C62 * md; ce -= (int128_t)C62 * me; @@ -821,8 +856,8 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_fe b0, b1; - int i, sign; + secp256k1_fe b0; + int i; uint64_t eta; #ifdef VERIFY int zero_in; @@ -855,19 +890,12 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); - sign = (f[0] >> 1) & 1; - - secp256k1_fe_decode_62(&b0, d); - - secp256k1_fe_negate(&b1, &b0, 1); - secp256k1_fe_cmov(&b0, &b1, sign); - secp256k1_fe_normalize_weak(&b0); + secp256k1_fe_normalize_62(d, f[4] >> 63); + secp256k1_fe_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(r) == !zero_in); #endif - - *r = b0; } static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { @@ -882,7 +910,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; secp256k1_fe b; - int i, j, len = 5, sign; + int i, j, len = 5; uint64_t eta; int64_t cond, fn, gn; #ifdef VERIFY @@ -938,20 +966,12 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ - sign = (f[0] >> 1) & 1; - - secp256k1_fe_decode_62(&b, d); - - if (sign) { - secp256k1_fe_negate(&b, &b, 1); - secp256k1_fe_normalize_weak(&b); - } + secp256k1_fe_normalize_62(d, f[len - 1] >> 63); + secp256k1_fe_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(r) == !zero_in); #endif - - *r = b; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 00cf4842cf..a053aa70c7 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -962,33 +962,55 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL ); +static void secp256k1_scalar_normalize_62(int64_t *r, int64_t cond_negate) { + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; + int64_t r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3], r4 = r[4]; + int64_t c, cond_add; + + cond_add = r4 >> 63; + + c = r0 + (P[0] & cond_add); + r0 = c & M62; c >>= 62; + c += r1 + (P[1] & cond_add); + r1 = c & M62; c >>= 62; + c += r2 + (P[2] & cond_add); + r2 = c & M62; c >>= 62; + c += r3; + r3 = c & M62; c >>= 62; + c += r4 + (P[4] & cond_add); + r4 = c; + + cond_add = (c >> 63) ^ cond_negate; + + c = (r0 ^ cond_negate) - cond_negate + (P[0] & cond_add); + r[0] = c & M62; c >>= 62; + c += (r1 ^ cond_negate) - cond_negate + (P[1] & cond_add); + r[1] = c & M62; c >>= 62; + c += (r2 ^ cond_negate) - cond_negate + (P[2] & cond_add); + r[2] = c & M62; c >>= 62; + c += (r3 ^ cond_negate) - cond_negate; + r[3] = c & M62; c >>= 62; + c += (r4 ^ cond_negate) - cond_negate + (P[4] & cond_add); + r[4] = c; + + VERIFY_CHECK(c >> 8 == 0); +} + static void secp256k1_scalar_decode_62(secp256k1_scalar *r, const int64_t *a) { const uint64_t a0 = a[0], 
a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; - uint64_t r0, r1, r2, r3; - secp256k1_scalar u; - /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 62 == 0); VERIFY_CHECK(a1 >> 62 == 0); VERIFY_CHECK(a2 >> 62 == 0); VERIFY_CHECK(a3 >> 62 == 0); - VERIFY_CHECK((int64_t)a4 >> 8 == 0 || (int64_t)a4 >> 8 == -(int64_t)1); - - r0 = a0 | a1 << 62; - r1 = a1 >> 2 | a2 << 60; - r2 = a2 >> 4 | a3 << 58; - r3 = a3 >> 6 | a4 << 56; + VERIFY_CHECK(a4 >> 8 == 0); - r->d[0] = r0; - r->d[1] = r1; - r->d[2] = r2; - r->d[3] = r3; - - secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); - - secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); - secp256k1_scalar_cmov(r, &u, a4 >> 63); + r->d[0] = a0 | a1 << 62; + r->d[1] = a1 >> 2 | a2 << 60; + r->d[2] = a2 >> 4 | a3 << 58; + r->d[3] = a3 >> 6 | a4 << 56; } static void secp256k1_scalar_encode_62(int64_t *r, const secp256k1_scalar *a) { @@ -1129,23 +1151,39 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t *t) { - /* I62 == -P^-1 mod 2^62 */ - const int64_t I62 = 0x0B0DFF665588B13FLL; + /* I62 == P^-1 mod 2^62 */ + const int64_t I62 = 0x34F20099AA774EC1LL; const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 }; const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4]; const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4]; const int64_t u = t[0], v = t[1], q = t[2], r = t[3]; - int64_t md, me; + int64_t md, me, sd, se; int128_t cd, ce; + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^62). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d4 >> 63; + se = e4 >> 63; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + cd = (int128_t)u * d0 + (int128_t)v * e0; ce = (int128_t)q * d0 + (int128_t)r * e0; - /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me - * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */ - md = (I62 * 4 * (int64_t)cd) >> 2; - me = (I62 * 4 * (int64_t)ce) >> 2; + /* + * Subtract from md/me an extra term in the range [0, 2^62) such that the low 62 bits of each + * sum of products will be 0. This allows clean division by 2^62. On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. + */ + + md -= (I62 * (int64_t)cd + md) & M62; + me -= (I62 * (int64_t)ce + me) & M62; cd += (int128_t)P[0] * md; ce += (int128_t)P[0] * me; @@ -1289,15 +1327,13 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b0; - int i, sign; + int i; uint64_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b0 = *x; - secp256k1_scalar_encode_62(g, &b0); + secp256k1_scalar_encode_62(g, x); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
* @@ -1318,16 +1354,12 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4]) == 0); - sign = (f[0] >> 1) & 1; - - secp256k1_scalar_decode_62(&b0, d); - secp256k1_scalar_cond_negate(&b0, sign); + secp256k1_scalar_normalize_62(d, f[4] >> 63); + secp256k1_scalar_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(r) == !zero_in); #endif - - *r = b0; } SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { @@ -1346,16 +1378,14 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int64_t f[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, 0x3FFFFFFFFFFFFFEBLL, 0x3FFFFFFFFFFFFFFFLL, 0xFFLL }; int64_t g[5]; - secp256k1_scalar b; - int i, j, len = 5, sign; + int i, j, len = 5; uint64_t eta; int64_t cond, fn, gn; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b = *x; - secp256k1_scalar_encode_62(g, &b); + secp256k1_scalar_encode_62(g, x); /* The paper uses 'delta'; eta == -delta (a performance tweak). * @@ -1398,19 +1428,12 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ - sign = (f[0] >> 1) & 1; - - secp256k1_scalar_decode_62(&b, d); - - if (sign) { - secp256k1_scalar_negate(&b, &b); - } + secp256k1_scalar_normalize_62(d, f[len - 1] >> 63); + secp256k1_scalar_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(r) == !zero_in); #endif - - *r = b; } #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ From c9b77178270ddae9457069bb419f911ea1b9b63b Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 11 Nov 2020 16:46:36 +0700 Subject: [PATCH 34/34] Rework _update_de for 32bit --- src/field_10x26_impl.h | 171 ++++++++++++++++++++++++----------------- src/field_5x52_impl.h | 4 +- src/scalar_8x32_impl.h | 157 +++++++++++++++++++++++-------------- 3 files changed, 199 insertions(+), 133 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 16e28c7821..9841e2ba4b 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1164,16 +1164,65 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static void secp256k1_fe_normalize_30(int32_t *r, int32_t cond_negate) { + /* P == 2^256 - 2^32 - C30 */ + const int32_t C30 = 0x3D1L; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3], r4 = r[4], + r5 = r[5], r6 = r[6], r7 = r[7], r8 = r[8]; + int32_t c, cond_add; + + cond_add = r8 >> 31; + + c = r0 - (C30 & cond_add); + r0 = c & M30; c >>= 30; + c += r1 - (4 & cond_add);; + r1 = c & M30; c >>= 30; + c += r2; + r2 = c & M30; c >>= 30; + c += r3; + r3 = c & M30; c >>= 30; + c += r4; + r4 = c & M30; c >>= 30; + c += r5; + r5 = c & M30; c >>= 30; + c += r6; + r6 = c & M30; c >>= 30; + c += r7; + r7 = c & M30; c >>= 30; + c += r8 + (65536 & cond_add); + r8 = c; + + cond_add = (c >> 31) ^ cond_negate; + + c = (r0 ^ cond_negate) - cond_negate - (C30 & cond_add); + r[0] = c & M30; c >>= 30; + c += (r1 ^ cond_negate) - cond_negate - (4 & cond_add); + r[1] = c & M30; c >>= 30; + c += (r2 ^ cond_negate) - cond_negate; + r[2] = c & M30; c >>= 30; + c += (r3 ^ cond_negate) - cond_negate; + r[3] = c & 
M30; c >>= 30; + c += (r4 ^ cond_negate) - cond_negate; + r[4] = c & M30; c >>= 30; + c += (r5 ^ cond_negate) - cond_negate; + r[5] = c & M30; c >>= 30; + c += (r6 ^ cond_negate) - cond_negate; + r[6] = c & M30; c >>= 30; + c += (r7 ^ cond_negate) - cond_negate; + r[7] = c & M30; c >>= 30; + c += (r8 ^ cond_negate) - cond_negate + (65536 & cond_add); + r[8] = c; + + VERIFY_CHECK(c >> 16 == 0); +} + static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { const uint32_t M26 = UINT32_MAX >> 6; const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; - uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, t; - - t = (int32_t)a8 >> 16; - /* a must be in the range [-2^256, 2^256). */ VERIFY_CHECK(a0 >> 30 == 0); VERIFY_CHECK(a1 >> 30 == 0); VERIFY_CHECK(a2 >> 30 == 0); @@ -1182,45 +1231,22 @@ static void secp256k1_fe_decode_30(secp256k1_fe *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); - VERIFY_CHECK(t == 0 || t == -(uint32_t)1); - - /* Add 2P if a8 is "negative". */ - r0 = 0x3FFF85EUL & t; - r1 = 0x3FFFF7FUL & t; - r2 = 0x3FFFFFFUL & t; - r3 = 0x3FFFFFFUL & t; - r4 = 0x3FFFFFFUL & t; - r5 = 0x3FFFFFFUL & t; - r6 = 0x3FFFFFFUL & t; - r7 = 0x3FFFFFFUL & t; - r8 = 0x3FFFFFFUL & t; - r9 = 0x07FFFFFUL & t; - - r0 += a0 & M26; - r1 += (a0 >> 26 | a1 << 4) & M26; - r2 += (a1 >> 22 | a2 << 8) & M26; - r3 += (a2 >> 18 | a3 << 12) & M26; - r4 += (a3 >> 14 | a4 << 16) & M26; - r5 += (a4 >> 10 | a5 << 20) & M26; - r6 += (a5 >> 6 | a6 << 24) & M26; - r7 += (a6 >> 2 ) & M26; - r8 += (a6 >> 28 | a7 << 2) & M26; - r9 += (a7 >> 24 | a8 << 6); - - r->n[0] = r0; - r->n[1] = r1; - r->n[2] = r2; - r->n[3] = r3; - r->n[4] = r4; - r->n[5] = r5; - r->n[6] = r6; - r->n[7] = r7; - r->n[8] = r8; - r->n[9] = r9; + VERIFY_CHECK(a8 >> 16 == 0); + + r->n[0] = a0 & M26; + r->n[1] = (a0 >> 26 | a1 << 4) & M26; + r->n[2] = (a1 >> 22 | a2 << 8) & M26; + r->n[3] = (a2 >> 18 | a3 << 12) & M26; + r->n[4] = (a3 >> 14 | a4 << 16) & M26; + r->n[5] = (a4 >> 10 | a5 << 20) & M26; + r->n[6] = (a5 >> 6 | a6 << 24) & M26; + r->n[7] = (a6 >> 2 ) & M26; + r->n[8] = (a6 >> 28 | a7 << 2) & M26; + r->n[9] = (a7 >> 24 | a8 << 6); #ifdef VERIFY r->magnitude = 1; - r->normalized = 0; + r->normalized = 1; secp256k1_fe_verify(r); #endif } @@ -1375,25 +1401,41 @@ static uint32_t secp256k1_fe_divsteps_30_var(uint32_t eta, uint32_t f0, uint32_t static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t) { /* P == 2^256 - 2^32 - C30 */ - const int64_t C30 = 0x3D1L; - /* I30 == -P^-1 mod 2^30 */ - const int32_t I30 = 0x12253531L; + const int32_t C30 = 0x3D1L; + /* I30 == P^-1 mod 2^30 */ + const int32_t I30 = 0x2DDACACFL; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; - int32_t di, ei, md, me; + int32_t di, ei, md, me, sd, se; int64_t cd, ce; int i; + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^30). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d[8] >> 31; + se = e[8] >> 31; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + di = d[0]; ei = e[0]; cd = (int64_t)u * di + (int64_t)v * ei; ce = (int64_t)q * di + (int64_t)r * ei; - /* Calculate the multiples of P to add, to zero the 30 bottom bits. 
We choose md, me - * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */ - md = (I30 * 4 * (int32_t)cd) >> 2; - me = (I30 * 4 * (int32_t)ce) >> 2; + /* + * Subtract from md/me an extra term in the range [0, 2^30) such that the low 30 bits of each + * sum of products will be 0. This allows clean division by 2^30. On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. + */ + + md -= (I30 * (int32_t)cd + md) & M30; + me -= (I30 * (int32_t)ce + me) & M30; cd -= (int64_t)C30 * md; ce -= (int64_t)C30 * me; @@ -1513,8 +1555,8 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { int32_t f[9] = { 0x3FFFFC2F, 0x3FFFFFFB, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; - secp256k1_fe b0, b1; - int i, sign; + secp256k1_fe b0; + int i; uint32_t eta; #ifdef VERIFY int zero_in; @@ -1547,19 +1589,12 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0); - sign = (f[0] >> 1) & 1; - - secp256k1_fe_decode_30(&b0, d); - - secp256k1_fe_negate(&b1, &b0, 2); - secp256k1_fe_cmov(&b0, &b1, sign); - secp256k1_fe_normalize_weak(&b0); + secp256k1_fe_normalize_30(d, f[8] >> 31); + secp256k1_fe_decode_30(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_is_zero(r) == !zero_in); #endif - - *r = b0; } static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { @@ -1574,7 +1609,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0xFFFF }; int32_t g[9]; secp256k1_fe b; - int i, j, len = 9, sign; + int i, j, len = 9; uint32_t eta; int32_t cond, fn, gn; #ifdef VERIFY @@ -1630,20 +1665,12 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. 
*/ - sign = (f[0] >> 1) & 1; - - secp256k1_fe_decode_30(&b, d); - - if (sign) { - secp256k1_fe_negate(&b, &b, 1); - secp256k1_fe_normalize_weak(&b); - } + secp256k1_fe_normalize_30(d, f[len - 1] >> 31); + secp256k1_fe_decode_30(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(&b) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_is_zero(r) == !zero_in); #endif - - *r = b; } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 83bdaf4281..1af2c15fce 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -894,7 +894,7 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(r) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_is_zero(r) == !zero_in); #endif } @@ -970,7 +970,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { secp256k1_fe_decode_62(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_fe_normalizes_to_zero(r) == !zero_in); + VERIFY_CHECK(!secp256k1_fe_is_zero(r) == !zero_in); #endif } diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index baf647d83f..ba52b7d54a 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -738,14 +738,65 @@ static const secp256k1_scalar SECP256K1_SCALAR_NEG_TWO_POW_256 = SECP256K1_SCALA 0x755DB9CDUL, 0x5E914077UL, 0x7FA4BD19UL, 0xA06C8282UL ); +static void secp256k1_scalar_normalize_30(int32_t *r, int32_t cond_negate) { + const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, + 0, 0, 0, 65536 }; + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3], r4 = r[4], + r5 = r[5], r6 = r[6], r7 = r[7], r8 = r[8]; + int32_t c, cond_add; + + cond_add = r8 >> 31; + + c = r0 + (P[0] & cond_add); + r0 = c & M30; c >>= 30; + c += r1 + (P[1] & cond_add); + r1 = c & M30; c >>= 30; + c += r2 + (P[2] & cond_add); + r2 = c & M30; c >>= 30; + c += r3 + (P[3] & cond_add); + r3 = c & M30; c >>= 30; + c += r4 + (P[4] & cond_add); + r4 = c & M30; c >>= 30; + c += r5; + r5 = c & M30; c >>= 30; + c += r6; + r6 = c & M30; c >>= 30; + c += r7; + r7 = c & M30; c >>= 30; + c += r8 + (P[8] & cond_add); + r8 = c; + + cond_add = (c >> 31) ^ cond_negate; + + c = (r0 ^ cond_negate) - cond_negate + (P[0] & cond_add); + r[0] = c & M30; c >>= 30; + c += (r1 ^ cond_negate) - cond_negate + (P[1] & cond_add); + r[1] = c & M30; c >>= 30; + c += (r2 ^ cond_negate) - cond_negate + (P[2] & cond_add); + r[2] = c & M30; c >>= 30; + c += (r3 ^ cond_negate) - cond_negate + (P[3] & cond_add); + r[3] = c & M30; c >>= 30; + c += (r4 ^ cond_negate) - cond_negate + (P[4] & cond_add); + r[4] = c & M30; c >>= 30; + c += (r5 ^ cond_negate) - cond_negate; + r[5] = c & M30; c >>= 30; + c += (r6 ^ cond_negate) - cond_negate; + r[6] = c & M30; c >>= 30; + c += (r7 ^ cond_negate) - cond_negate; + r[7] = c & M30; c >>= 30; + c += (r8 ^ cond_negate) - cond_negate + (P[8] & cond_add); + r[8] = c; + + VERIFY_CHECK(c >> 16 == 0); +} + + static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { const uint32_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4], a5 = a[5], a6 = a[6], a7 = a[7], a8 = a[8]; - uint32_t r0, r1, r2, r3, r4, r5, r6, r7; - secp256k1_scalar u; - /* a must be in the range [-2^256, 2^256). 
*/ VERIFY_CHECK(a0 >> 30 == 0); VERIFY_CHECK(a1 >> 30 == 0); VERIFY_CHECK(a2 >> 30 == 0); @@ -754,30 +805,16 @@ static void secp256k1_scalar_decode_30(secp256k1_scalar *r, const int32_t *a) { VERIFY_CHECK(a5 >> 30 == 0); VERIFY_CHECK(a6 >> 30 == 0); VERIFY_CHECK(a7 >> 30 == 0); - VERIFY_CHECK((int32_t)a8 >> 16 == 0 || (int32_t)a8 >> 16 == -(int32_t)1); - - r0 = a0 | a1 << 30; - r1 = a1 >> 2 | a2 << 28; - r2 = a2 >> 4 | a3 << 26; - r3 = a3 >> 6 | a4 << 24; - r4 = a4 >> 8 | a5 << 22; - r5 = a5 >> 10 | a6 << 20; - r6 = a6 >> 12 | a7 << 18; - r7 = a7 >> 14 | a8 << 16; - - r->d[0] = r0; - r->d[1] = r1; - r->d[2] = r2; - r->d[3] = r3; - r->d[4] = r4; - r->d[5] = r5; - r->d[6] = r6; - r->d[7] = r7; - - secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); - - secp256k1_scalar_add(&u, r, &SECP256K1_SCALAR_NEG_TWO_POW_256); - secp256k1_scalar_cmov(r, &u, a8 >> 31); + VERIFY_CHECK(a8 >> 16 == 0); + + r->d[0] = a0 | a1 << 30; + r->d[1] = a1 >> 2 | a2 << 28; + r->d[2] = a2 >> 4 | a3 << 26; + r->d[3] = a3 >> 6 | a4 << 24; + r->d[4] = a4 >> 8 | a5 << 22; + r->d[5] = a5 >> 10 | a6 << 20; + r->d[6] = a6 >> 12 | a7 << 18; + r->d[7] = a7 >> 14 | a8 << 16; } static void secp256k1_scalar_encode_30(int32_t *r, const secp256k1_scalar *a) { @@ -928,25 +965,42 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t *t) { - /* I30 == -P^-1 mod 2^30 */ - const int32_t I30 = 0x1588B13FL; + /* I30 == P^-1 mod 2^30 */ + const int32_t I30 = 0x2A774EC1L; const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, 0, 0, 0, 65536 }; const int32_t M30 = (int32_t)(UINT32_MAX >> 2); - int32_t u = t[0], v = t[1], q = t[2], r = t[3], di, ei, md, me; + const int32_t u = t[0], v = t[1], q = t[2], r = t[3]; + int32_t di, ei, md, me, sd, se; int64_t cd, ce; int i; + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^30). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d[8] >> 31; + se = e[8] >> 31; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + di = d[0]; ei = e[0]; cd = (int64_t)u * di + (int64_t)v * ei; ce = (int64_t)q * di + (int64_t)r * ei; - /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me - * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */ - md = (I30 * 4 * (int32_t)cd) >> 2; - me = (I30 * 4 * (int32_t)ce) >> 2; + /* + * Subtract from md/me an extra term in the range [0, 2^30) such that the low 30 bits of each + * sum of products will be 0. This allows clean division by 2^30. On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. + */ + + md -= (I30 * (int32_t)cd + md) & M30; + me -= (I30 * (int32_t)ce + me) & M30; cd += (int64_t)P[0] * md; ce += (int64_t)P[0] * me; @@ -1065,15 +1119,13 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b0; - int i, sign; + int i; uint32_t eta; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b0 = *x; - secp256k1_scalar_encode_30(g, &b0); + secp256k1_scalar_encode_30(g, x); /* The paper uses 'delta'; eta == -delta (a performance tweak). 
* @@ -1094,16 +1146,12 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar VERIFY_CHECK((g[0] | g[1] | g[2] | g[3] | g[4] | g[5] | g[6] | g[7] | g[8]) == 0); - sign = (f[0] >> 1) & 1; - - secp256k1_scalar_decode_30(&b0, d); - secp256k1_scalar_cond_negate(&b0, sign); + secp256k1_scalar_normalize_30(d, f[8] >> 31); + secp256k1_scalar_decode_30(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b0) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(r) == !zero_in); #endif - - *r = b0; } SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { @@ -1122,16 +1170,14 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc int32_t f[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, 0x3FFFFEBAL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0x3FFFFFFFL, 0xFFFFL }; int32_t g[9]; - secp256k1_scalar b; - int i, j, len = 9, sign; + int i, j, len = 9; uint32_t eta; int32_t cond, fn, gn; #ifdef VERIFY int zero_in = secp256k1_scalar_is_zero(x); #endif - b = *x; - secp256k1_scalar_encode_30(g, &b); + secp256k1_scalar_encode_30(g, x); /* The paper uses 'delta'; eta == -delta (a performance tweak). * @@ -1174,19 +1220,12 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ - sign = (f[0] >> 1) & 1; - - secp256k1_scalar_decode_30(&b, d); - - if (sign) { - secp256k1_scalar_negate(&b, &b); - } + secp256k1_scalar_normalize_30(d, f[len - 1] >> 31); + secp256k1_scalar_decode_30(r, d); #ifdef VERIFY - VERIFY_CHECK(!secp256k1_scalar_is_zero(&b) == !zero_in); + VERIFY_CHECK(!secp256k1_scalar_is_zero(r) == !zero_in); #endif - - *r = b; } #endif /* SECP256K1_SCALAR_REPR_IMPL_H */