flintlib
diff --git a/‎src/mpn_extras.h‎
Lines changed: 5 additions & 8 deletions b/‎src/mpn_extras.h‎
Lines changed: 5 additions & 8 deletions
diff --git a/‎src/mpn_extras/mulhigh.c‎
Lines changed: 181 additions & 80 deletions b/‎src/mpn_extras/mulhigh.c‎
Lines changed: 181 additions & 80 deletions
@@ -275,19 +275,18 @@ mp_limb_t _flint_mpn_mulhigh_basecase(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
 mp_limb_t _flint_mpn_sqrhigh_basecase_even(mp_ptr, mp_srcptr, mp_size_t);
 mp_limb_t _flint_mpn_sqrhigh_basecase_odd(mp_ptr, mp_srcptr, mp_size_t);
 
+mp_limb_t _flint_mpn_mulhigh(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
+
 /* TODO: Proceed with higher cases */
 MPN_EXTRAS_INLINE
-mp_limb_t flint_mpn_mulhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+mp_limb_t flint_mpn_mulhigh(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
 {
     FLINT_ASSERT(n >= 1);
 
     if (FLINT_HAVE_MULHIGH_FUNC(n)) /* NOTE: Aliasing allowed here */
         return flint_mpn_mulhigh_func_tab[n](rp, xp, yp);
     else
-    {
-        FLINT_ASSERT(rp != xp && rp != yp);
-        return _flint_mpn_mulhigh_basecase(rp, xp, yp, n);
-    }
+        return _flint_mpn_mulhigh(rp, xp, yp, n);
 }
 
 /* TODO: Proceed with higher cases */
@@ -321,9 +320,7 @@ struct mp_limb_pair_t flint_mpn_mulhigh_normalised(mp_ptr rp, mp_srcptr xp, mp_s
 
         FLINT_ASSERT(rp != xp && rp != yp);
 
-        /* TODO */
-        /* ret.m1 = flint_mpn_mulhigh(rp, xp, yp, n); */
-        ret.m1 = flint_mpn_mulhigh_basecase(rp, xp, yp, n);
+        ret.m1 = _flint_mpn_mulhigh(rp, xp, yp, n);
 
         if (rp[n - 1] >> (FLINT_BITS - 1))
         {
 
@@ -9,90 +9,191 @@
     (at your option) any later version.  See <https://www.gnu.org/licenses/>.
 */
 
+#include <string.h> /* For memcpy */
 #include "mpn_extras.h"
 
-#if FLINT_HAVE_ADX
-mp_limb_t flint_mpn_mulhigh_1(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_2(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_3(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_4(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_5(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_6(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_7(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_8(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_9(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_10(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_11(mp_ptr, mp_srcptr, mp_srcptr);
-mp_limb_t flint_mpn_mulhigh_12(mp_ptr, mp_srcptr, mp_srcptr);
-
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_1(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_2(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_3(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_4(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_5(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_6(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_7(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_8(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_9(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_10(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_11(mp_ptr, mp_srcptr, mp_srcptr);
-struct mp_limb_pair_t flint_mpn_mulhigh_normalised_12(mp_ptr, mp_srcptr, mp_srcptr);
-
-mp_limb_t flint_mpn_sqrhigh_1(mp_ptr, mp_srcptr);
-mp_limb_t flint_mpn_sqrhigh_2(mp_ptr, mp_srcptr);
-mp_limb_t flint_mpn_sqrhigh_3(mp_ptr, mp_srcptr);
-mp_limb_t flint_mpn_sqrhigh_4(mp_ptr, mp_srcptr);
-mp_limb_t flint_mpn_sqrhigh_5(mp_ptr, mp_srcptr);
-mp_limb_t flint_mpn_sqrhigh_6(mp_ptr, mp_srcptr);
-mp_limb_t flint_mpn_sqrhigh_7(mp_ptr, mp_srcptr);
-mp_limb_t flint_mpn_sqrhigh_8(mp_ptr, mp_srcptr);
-
-const flint_mpn_mul_func_t flint_mpn_mulhigh_func_tab[] =
-{
-    NULL,
-    flint_mpn_mulhigh_1,
-    flint_mpn_mulhigh_2,
-    flint_mpn_mulhigh_3,
-    flint_mpn_mulhigh_4,
-    flint_mpn_mulhigh_5,
-    flint_mpn_mulhigh_6,
-    flint_mpn_mulhigh_7,
-    flint_mpn_mulhigh_8,
-    flint_mpn_mulhigh_9,
-    flint_mpn_mulhigh_10,
-    flint_mpn_mulhigh_11,
-    flint_mpn_mulhigh_12
-};
-
-const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_normalised_func_tab[] =
+/*
+
+We now define what we mean by a high multiplication, and describe how we go from
+the basecase to larger cases, where Toom-Cook or Schönhage-Strassen
+multiplication is required to outperform `flint_mpn_mul'.
+
+Let {a, n} and {b, n} be two n-limbed (positive) integers whose product is
+{c, 2 n}. For some applications only the high part is required, and so we only
+calculate an "approximation" of the most significant n + 1 limbs. Loosely
+speaking, this means that we only calculate the part of the multiplication that
+contributes directly to the most significant n + 1 limbs, so carries from the
+omitted lower part are disregarded. As a result, the approximation of
+c[n - 1, ..., 2 n - 1] is never larger than the real value, and the error is at
+most ~n ULPs in the least significant limb of the approximation.
+
+With {c, 2 n} denoting the product of {a, n} and {b, n}, let {d, n + 1} denote
+the high product. We visualise the high multiplication of two 10-limbed integers
+with the following diagram:
+
+                               0 1 2 3 4 5 6 7 8 9
+                             0                 h x
+                             1               h x x
+                             2             h x x x
+                             3           h x x x x
+                             4         h x x x x x
+                             5       h x x x x x x
+                             6     h x x x x x x x
+                             7   h x x x x x x x x
+                             8 h x x x x x x x x x
+                             9 x x x x x x x x x x
+
+Here `h' means that only the higher part of this entry was calculated, and `x'
+means that the full product of the limbs was calculated.
+
+To utilise multiplication algorithms that exploit symmetries, we divide this
+figure into four different parts:
+
+                               0 1 2 3 4 5 6 7 8
+                             0          |    h x
+                             1          |  h x x
+                             2          |h x x x
+                             3  _ _ _ _h|x_x_x_x
+     n = 9:                  4       h x|x x x x
+                             5     h x x|x x x x
+                             6   h x x x|x x x x
+                             7 h x x x x|x x x x
+                             8 x x x x x|x x x x
+
+                              0 1 2 3 4 5 6 7 8 9
+                            0          |      h x
+                            1          |    h x x
+                            2          |  h x x x
+                            3          |h x x x x
+     n = 10:                4  _ _ _ _h|x_x_x_x_x
+                            5       h x|x x x x x
+                            6     h x x|x x x x x
+                            7   h x x x|x x x x x
+                            8 h x x x x|x x x x x
+                            9 x x x x x|x x x x x
+
+Observe that we have only one multi-limbed full multiplication, two multi-limbed
+high multiplications and one single-limbed high multiplication.
+
+*/
+
+#if FLINT_HAVE_NATIVE_MPN_MULHIGH_BASECASE && FLINT_HAVE_NATIVE_2ADD_N_INPLACE
+
+#if !defined(__amd64__)
+# error
+#endif
+
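+/* mulhigh(p, u, v): set p to the high limb of the double-limb product u * v; the
+ * low limb is written to a scratch variable and discarded.
+ * NOTE: As we will not reuse factors in mulhigh, we utilize mul instead of mulx
+ * to save a few bytes. */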
+#define mulhigh(p, u, v) \
+  do { \
+    ulong _scr; \
+    __asm__("mulq\t%3" \
+      : "=a" (_scr), "=d" (p) \
+      : "%0" ((ulong)(u)), "rm" ((ulong)(v))); \
+  } while (0)
+
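+/* Add the single limb x to the multi-limb number at rp in place, propagating the
+ * carry upwards. NOTE: Assumes no carry out of the top limb -- the propagation
+ * loop has no length bound and only stops at a limb that does not wrap to zero. */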
+#define flint_mpn_add_1(rp, x) \
+  do { \
+    ulong __rp_save = (rp)[0]; \
+    (rp)[0] += (x); \
+    if (__rp_save > (rp)[0]) \
+    { \
+      slong __ix = 0; \
+      do \
+      { \
+        __ix++; \
+        (rp)[__ix] += 1; \
+      } while ((rp)[__ix] == UWORD(0)); \
+    } \
+  } while (0)
+
+#define RECURSIVE_THRESHOLD 59
+#define _RECURSIVE_THRESHOLD 47
+#define FALLBACK_THRESHOLD 330
+
+FLINT_STATIC_NOINLINE
+mp_limb_t _flint_mpn_mulhigh_rec(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr scr)
 {
-    NULL,
-    flint_mpn_mulhigh_normalised_1,
-    flint_mpn_mulhigh_normalised_2,
-    flint_mpn_mulhigh_normalised_3,
-    flint_mpn_mulhigh_normalised_4,
-    flint_mpn_mulhigh_normalised_5,
-    flint_mpn_mulhigh_normalised_6,
-    flint_mpn_mulhigh_normalised_7,
-    flint_mpn_mulhigh_normalised_8,
-    flint_mpn_mulhigh_normalised_9,
-    flint_mpn_mulhigh_normalised_10,
-    flint_mpn_mulhigh_normalised_11,
-    flint_mpn_mulhigh_normalised_12
-};
-
-const flint_mpn_sqr_func_t flint_mpn_sqrhigh_func_tab[] =
+    if (n < _RECURSIVE_THRESHOLD)
+        return _flint_mpn_mulhigh_basecase(rp, xp, yp, n);
+    else
+    {
+        mp_size_t np1o2 = (n + 1) / 2;
+        mp_size_t no2 = n / 2;
+        mp_limb_t c0, c1 = 0, ret;
+        mp_ptr hl, hr;
+
+        /* Top left */
+        mulhigh(c0, xp[no2 - 1], yp[np1o2 - 1]);
+
+        /* Bottom right */
+        _flint_mpn_mul(rp, xp + no2, np1o2, yp + np1o2, no2);
+
+        /* Bottom left */
+        hr = scr;
+        ret = _flint_mpn_mulhigh_rec(hr, xp + no2, yp, np1o2, hr + np1o2);
+        add_ssaaaa(c1, c0, c1, c0, 0, ret);
+
+        /* Top right */
+        hl = scr + np1o2;
+        hl[np1o2 - 1] = 0;
+        ret = _flint_mpn_mulhigh_rec(hl, xp, yp + np1o2, no2, hl + np1o2);
+        add_ssaaaa(c1, c0, c1, c0, 0, ret);
+
+        /* Add c1 to rp */
+        flint_mpn_add_1(rp, c1);
+
+        /* Add both high multiplications to rp */
+        ret = flint_mpn_2add_n_inplace(rp, hr, hl, np1o2);
+
+        /* Add carry from addition to rp */
+        flint_mpn_add_1(rp + np1o2, ret);
+
+        return c0;
+    }
+}
+
+mp_limb_t _flint_mpn_mulhigh(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
 {
-    NULL,
-    flint_mpn_sqrhigh_1,
-    flint_mpn_sqrhigh_2,
-    flint_mpn_sqrhigh_3,
-    flint_mpn_sqrhigh_4,
-    flint_mpn_sqrhigh_5,
-    flint_mpn_sqrhigh_6,
-    flint_mpn_sqrhigh_7,
-    flint_mpn_sqrhigh_8
-};
+    FLINT_ASSERT(n > FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH);
+
+    if (n < RECURSIVE_THRESHOLD)
+    {
+        FLINT_ASSERT(rp != xp && rp != yp);
+        return _flint_mpn_mulhigh_basecase(rp, xp, yp, n);
+    }
+    else if (n < FALLBACK_THRESHOLD)
+    {
+        mp_limb_t ret;
+        mp_ptr scr;
+
+        FLINT_ASSERT(rp != xp && rp != yp);
+
+        scr = flint_malloc(2 * sizeof(mp_limb_t) * n);
+        ret = _flint_mpn_mulhigh_rec(rp, xp, yp, n, scr);
+        flint_free(scr);
+
+        return ret;
+    }
+    else
+    {
+        /* Aliasing is okay */
+        mp_ptr tmp;
+        mp_limb_t ret;
+
+        tmp = flint_malloc(2 * sizeof(mp_limb_t) * n);
+
+        _flint_mpn_mul_n(tmp, xp, yp, n);
+        memcpy(rp, tmp + n, sizeof(mp_limb_t) * n);
+        ret = tmp[n - 1];
+
+        flint_free(tmp);
+
+        return ret;
+    }
+}
 #else
 typedef int this_file_is_empty;
 #endif