@@ -222,6 +222,7 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) {
 #endif
 }
 
+#if !defined(EXHAUSTIVE_TEST_ORDER)
 static void secp256k1_scalar_pow2_div(secp256k1_scalar *r, const secp256k1_scalar *a, int k) {
     static const secp256k1_scalar lookup[16] = {
         SECP256K1_SCALAR_CONST(
@@ -293,9 +294,108 @@ static void secp256k1_scalar_pow2_div(secp256k1_scalar *r, const secp256k1_scalar *a, int k) {
     VERIFY_CHECK(k == 0);
 }
 
+SECP256K1_INLINE static int secp256k1_scalar_shr_zeros(secp256k1_scalar *r) {
+    int n, k = 0;
+
+    /* Cap each pass at 15 trailing zeros: ORing in bit 15 keeps the ctz
+     * argument nonzero and within what secp256k1_scalar_shr_int can shift. */
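+    /* Illustrative example (not part of the algorithm): if d[0] ends in
+     * binary ...0100, ctz reports 2 and two zeros are shifted out; the loop
+     * stops once the low bit is set. */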
+    while ((n = __builtin_ctz(r->d[0] | (1 << 15)))) {
+        k += n;
+        secp256k1_scalar_shr_int(r, n);
+    }
+
+    return k;
+}
+
+static int secp256k1_scalar_eea_inverse(secp256k1_scalar *r, const secp256k1_scalar *n) {
+    secp256k1_scalar u, v, i, j, acomp, negx;
+    secp256k1_scalar *a, *b, *x0, *x1, *tmp;
+    int ka, kb;
+
+    /* Zero is not invertible. */
+    if (secp256k1_scalar_is_zero(n)) {
+        secp256k1_scalar_set_int(r, 0);
+        return 0;
+    }
+
+    /**
+     * The extended Euclidean algorithm computes x, y and gcd(a, b) such that
+     *     a*x + b*y = gcd(a, b)
+     * If we run this algorithm with b = p, we solve a*x + p*y = gcd(a, p).
+     * We note that:
+     *  - The order p is prime, so gcd(a, p) = 1.
+     *  - We compute modulo p, and y*p = 0 mod p.
+     * So the equation simplifies to a*x = 1, i.e. x = a^-1.
+     */
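+    /* A tiny worked instance (illustrative, not part of the algorithm):
+     * with p = 11 and a = 3, one finds 3*4 + 11*(-1) = 1, so x = 4 = 3^-1. */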
+
+    /* a = n */
+    u = *n;
+    a = &u;
+
+    /* Because 2 is not a common factor between a and b, we can detect
+     * multiples of 2 using the LSB and eliminate them aggressively. */
+    ka = secp256k1_scalar_shr_zeros(a);
+
+    /* b = p - a */
+    secp256k1_scalar_negate(&v, a);
+    b = &v;
+
+    /* x0 = 1 */
+    secp256k1_scalar_set_int(&i, 1);
+    secp256k1_scalar_negate(&negx, &i);
+    x0 = &i;
+
+    /* x1 = 0 */
+    secp256k1_scalar_set_int(&j, 0);
+    x1 = &j;
+
+    if (secp256k1_scalar_is_one(a)) {
+        goto done;
+    }
+
+    /* For a and b, we use two's complement math and ensure no overflow happens. */
+    secp256k1_scalar_complement(&acomp, a);
+    goto bzero;
+
+    while (!secp256k1_scalar_is_one(a)) {
+        secp256k1_scalar_complement(&acomp, a);
+        secp256k1_scalar_negate(&negx, x0);
+
+        VERIFY_CHECK(secp256k1_scalar_cmp_var(b, a) > 0);
+        do {
+            secp256k1_scalar_binadd(b, b, &acomp);
+
+bzero:
+            /* We ensure that a and b are odd, so b must be even after subtracting a. */
+            VERIFY_CHECK(secp256k1_scalar_is_even(b));
+            kb = secp256k1_scalar_shr_zeros(b);
+            secp256k1_scalar_add(x1, x1, &negx);
+            secp256k1_scalar_pow2_div(x1, x1, kb);
+        } while (secp256k1_scalar_cmp_var(b, a) > 0);
+
+        /* a and b can never be equal, so if we exited, it is because a > b. */
+        VERIFY_CHECK(secp256k1_scalar_cmp_var(a, b) > 0);
+
+        /* In order to speed things up, we only swap pointers. */
+        tmp = a;
+        a = b;
+        b = tmp;
+
+        tmp = x0;
+        x0 = x1;
+        x1 = tmp;
+    }
+
+done:
+    secp256k1_scalar_pow2_div(r, x0, ka);
+    return 1;
+}
+#endif
+
 static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) {
-#if defined(USE_SCALAR_INV_BUILTIN)
+#if defined(EXHAUSTIVE_TEST_ORDER)
     secp256k1_scalar_inverse(r, x);
+#elif defined(USE_SCALAR_INV_BUILTIN)
+    secp256k1_scalar_eea_inverse(r, x);
 #elif defined(USE_SCALAR_INV_NUM)
     unsigned char b[32];
     secp256k1_num n, m;
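
For intuition, here is a toy re-implementation of the same odd-only extended-GCD inverse on plain uint64_t values, with the goto/complement control flow unrolled into ordinary loops and modular halving standing in for secp256k1_scalar_pow2_div. It is a sketch, not the library code: eea_inverse_u64, half_mod and the prime 1000003 are all illustrative, and it assumes an odd prime p below 2^63.

#include <stdint.h>
#include <stdio.h>

/* Halve x modulo an odd p: if x is odd, x + p is even (assumes p < 2^63). */
static uint64_t half_mod(uint64_t x, uint64_t p) {
    return (x & 1) ? (x + p) >> 1 : x >> 1;
}

/* Toy modular inverse by binary extended GCD, mirroring the structure of
 * secp256k1_scalar_eea_inverse: keep a odd, subtract it from b, and halve
 * out the factors of two while tracking the Bezout coefficients mod p. */
static uint64_t eea_inverse_u64(uint64_t n, uint64_t p) {
    uint64_t a = n % p, b, x0 = 1, x1, t;
    int ka = 0;

    if (a == 0) return 0; /* zero is not invertible */

    /* Strip trailing zeros from a; undone by ka halvings at the end. */
    while ((a & 1) == 0) { a >>= 1; ka++; }

    /* b = p - a = a * (-1) mod p, so start with x1 = -1; b is even. */
    b = p - a;
    x1 = p - 1;
    while ((b & 1) == 0) { b >>= 1; x1 = half_mod(x1, p); }

    /* Invariants: a == n'*x0 and b == n'*x1 (mod p), n' the stripped input. */
    while (a != 1) {
        while (b > a) {
            b -= a;                 /* both odd, so b becomes even */
            x1 = (x1 + p - x0) % p; /* x1 -= x0 (mod p) */
            while ((b & 1) == 0) { b >>= 1; x1 = half_mod(x1, p); }
        }
        /* Now b < a (equality would contradict gcd(a, b) = 1): swap roles. */
        t = a; a = b; b = t;
        t = x0; x0 = x1; x1 = t;
    }

    /* a == 1 == n'*x0 (mod p); compensate the initial stripping of n. */
    while (ka--) x0 = half_mod(x0, p);
    return x0;
}

int main(void) {
    uint64_t p = 1000003, n = 123456; /* small odd prime, arbitrary n */
    uint64_t inv = eea_inverse_u64(n, p);
    printf("%llu (should be 1)\n", (unsigned long long)(n * inv % p));
    return 0;
}

The invariants are the ones the patch relies on: a == n'*x0 and b == n'*x1 (mod p), where n' is the input with its trailing zero bits stripped; once a reaches 1, x0 is the inverse of n', and the final ka halvings turn it into the inverse of n.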