flintlib
diff --git a/‎src/mpn_extras.h‎
Lines changed: 15 additions & 8 deletions b/‎src/mpn_extras.h‎
Lines changed: 15 additions & 8 deletions
diff --git a/‎src/mpn_extras/broadwell/2add_n_inplace.asm‎
Lines changed: 204 additions & 0 deletions b/‎src/mpn_extras/broadwell/2add_n_inplace.asm‎
Lines changed: 204 additions & 0 deletions
@@ -118,6 +118,16 @@ extern "C" {
 double
 flint_mpn_get_d(mp_srcptr ptr, mp_size_t size, mp_size_t sign, long exp);
 
+/* Addition ******************************************************************/
+
+#if FLINT_HAVE_ADX
+# define FLINT_HAVE_NATIVE_2ADD_N_INPLACE 1
+
+/* Simultaneously adds two n-limbed integers onto result and returns carry. */
+/* NOTE: Requires n >= 4 */
+mp_limb_t flint_mpn_2add_n_inplace(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
+#endif
+
 /* General multiplication ****************************************************/
 
 #ifdef FLINT_HAVE_FFT_SMALL
@@ -265,19 +275,18 @@ mp_limb_t _flint_mpn_mulhigh_basecase(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
 mp_limb_t _flint_mpn_sqrhigh_basecase_even(mp_ptr, mp_srcptr, mp_size_t);
 mp_limb_t _flint_mpn_sqrhigh_basecase_odd(mp_ptr, mp_srcptr, mp_size_t);
 
+mp_limb_t _flint_mpn_mulhigh(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
+
 /* TODO: Proceed with higher cases */
 MPN_EXTRAS_INLINE
-mp_limb_t flint_mpn_mulhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+mp_limb_t flint_mpn_mulhigh(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
 {
     FLINT_ASSERT(n >= 1);
 
     if (FLINT_HAVE_MULHIGH_FUNC(n)) /* NOTE: Aliasing allowed here */
         return flint_mpn_mulhigh_func_tab[n](rp, xp, yp);
     else
-    {
-        FLINT_ASSERT(rp != xp && rp != yp);
-        return _flint_mpn_mulhigh_basecase(rp, xp, yp, n);
-    }
+        return _flint_mpn_mulhigh(rp, xp, yp, n);
 }
 
 /* TODO: Proceed with higher cases */
@@ -311,9 +320,7 @@ struct mp_limb_pair_t flint_mpn_mulhigh_normalised(mp_ptr rp, mp_srcptr xp, mp_s
 
         FLINT_ASSERT(rp != xp && rp != yp);
 
-        /* TODO */
-        /* ret.m1 = flint_mpn_mulhigh(rp, xp, yp, n); */
-        ret.m1 = flint_mpn_mulhigh_basecase(rp, xp, yp, n);
+        ret.m1 = _flint_mpn_mulhigh(rp, xp, yp, n);
 
         if (rp[n - 1] >> (FLINT_BITS - 1))
         {
 
@@ -0,0 +1,204 @@
+dnl
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+dnl
+
+include(`config.m4')
+
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`bp', `%rdx')
+define(`n', `%rcx')
+
+define(`s0', `%r8')
+define(`s1', `%r9')
+define(`s2', `%r10')
+define(`s3', `%r11')
+
+define(`sx', `%rax')
+
+dnl NOTE: This function requires n >= 4
+
+dnl NOTE: This function could easily not be inplace without pushing registers,
+dnl       but currently I do not know if this function is going to be used on
+dnl       other functions than multiplications.
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(flint_mpn_2add_n_inplace)
+	mov	R32(n), R32(s1)
+	lea	L(tab)(%rip), s0
+	shr	$3, R32(n)
+	and	$7, R32(s1)
+	xor	R32(sx), R32(sx)
+
+ifdef(`PIC',
+`	movslq	(s0,s1,4), s1
+	lea	(s1,s0), s0
+	jmp	*s0
+',`
+	jmp	*(s0,s1,8)
+')
+
+L(p1):	mov	0*8(rp), s3
+	mov	1*8(rp), s0
+	mov	2*8(rp), s1
+	adcx	0*8(ap), s3
+	adox	0*8(bp), s3
+	adcx	1*8(ap), s0
+	adox	1*8(bp), s0
+	adcx	2*8(ap), s1
+	lea	1*8(rp), rp
+	lea	1*8(bp), bp
+	lea	1*8(ap), ap
+	mov	s3, -1*8(rp)
+	jmp	L(a0)
+
+L(p0):	mov	0*8(rp), s0
+	mov	1*8(rp), s1
+	adcx	0*8(ap), s0
+	adox	0*8(bp), s0
+	adcx	1*8(ap), s1
+	jmp	L(a0)
+
+L(p5):	mov	0*8(rp), s3
+	mov	1*8(rp), s0
+	mov	2*8(rp), s1
+	adcx	0*8(ap), s3
+	adox	0*8(bp), s3
+	adcx	1*8(ap), s0
+	adox	1*8(bp), s0
+	adcx	2*8(ap), s1
+	lea	-3*8(rp), rp
+	lea	-3*8(bp), bp
+	lea	-3*8(ap), ap
+	mov	s3, 3*8(rp)
+	jmp	L(a4)
+
+L(p4):	mov	0*8(rp), s0
+	mov	1*8(rp), s1
+	adcx	0*8(ap), s0
+	adox	0*8(bp), s0
+	adcx	1*8(ap), s1
+	lea	-4*8(rp), rp
+	lea	-4*8(bp), bp
+	lea	-4*8(ap), ap
+	jmp	L(a4)
+
+L(p7):	mov	0*8(rp), s1
+	mov	1*8(rp), s2
+	mov	2*8(rp), s3
+	adcx	0*8(ap), s1
+	adox	0*8(bp), s1
+	adcx	1*8(ap), s2
+	adox	1*8(bp), s2
+	adcx	2*8(ap), s3
+	lea	-1*8(rp), rp
+	lea	-1*8(bp), bp
+	lea	-1*8(ap), ap
+	mov	s1, 1*8(rp)
+	jmp	L(a6)
+
+L(p6):	mov	0*8(rp), s2
+	mov	1*8(rp), s3
+	adcx	0*8(ap), s2
+	adox	0*8(bp), s2
+	adcx	1*8(ap), s3
+	lea	-2*8(rp), rp
+	lea	-2*8(bp), bp
+	lea	-2*8(ap), ap
+	jmp	L(a6)
+
+L(p3):	mov	0*8(rp), s1
+	mov	1*8(rp), s2
+	mov	2*8(rp), s3
+	adcx	0*8(ap), s1
+	adox	0*8(bp), s1
+	adcx	1*8(ap), s2
+	adox	1*8(bp), s2
+	adcx	2*8(ap), s3
+	lea	3*8(rp), rp
+	lea	3*8(bp), bp
+	lea	3*8(ap), ap
+	mov	s1, -3*8(rp)
+	jmp	L(a2)
+
+L(p2):	mov	0*8(rp), s2
+	mov	1*8(rp), s3
+	adcx	0*8(ap), s2
+	adox	0*8(bp), s2
+	adcx	1*8(ap), s3
+	lea	2*8(rp), rp
+	lea	2*8(bp), bp
+	lea	2*8(ap), ap
+	C jmp	L(a2)
+
+	C n = 12 -> n = 1, kx = 4
+	C 2, 3, 4, 5
+
+	ALIGN(32)
+L(a2):	mov	0*8(rp), s0	C 01 start
+	mov	1*8(rp), s1
+	adox	-1*8(bp), s3
+	adcx	0*8(ap), s0
+	mov	s2, -2*8(rp)
+	mov	s3, -1*8(rp)	C 23 end
+	adox	0*8(bp), s0
+	adcx	1*8(ap), s1
+L(a0):	mov	2*8(rp), s2	C 23 start
+	mov	3*8(rp), s3
+	adox	1*8(bp), s1
+	adcx	2*8(ap), s2
+	lea	-1(n), R32(n)
+	mov	s0, 0*8(rp)
+	mov	s1, 1*8(rp)	C 01 end
+	adox	2*8(bp), s2
+	adcx	3*8(ap), s3
+L(a6):	mov	4*8(rp), s0	C 01 start
+	mov	5*8(rp), s1
+	adox	3*8(bp), s3
+	adcx	4*8(ap), s0
+	mov	s2, 2*8(rp)
+	mov	s3, 3*8(rp)	C 23 end
+	adox	4*8(bp), s0
+	adcx	5*8(ap), s1
+L(a4):	mov	6*8(rp), s2	C 23 start
+	mov	7*8(rp), s3
+	adox	5*8(bp), s1
+	adcx	6*8(ap), s2
+	mov	s0, 4*8(rp)
+	mov	s1, 5*8(rp)	C 01 end
+	adox	6*8(bp), s2
+	adcx	7*8(ap), s3
+	jrcxz	L(end)
+	lea	8*8(bp), bp
+	lea	8*8(ap), ap
+	lea	8*8(rp), rp
+	jmp	L(a2)
+
+L(end):	adox	7*8(bp), s3
+	mov	s2, 6*8(rp)
+	mov	s3, 7*8(rp)
+	seto	R8(sx)
+	adc	R32(n), R32(sx)		C n = 0
+
+	ret
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(p0), L(tab))
+	JMPENT(	L(p1), L(tab))
+	JMPENT(	L(p2), L(tab))
+	JMPENT(	L(p3), L(tab))
+	JMPENT(	L(p4), L(tab))
+	JMPENT(	L(p5), L(tab))
+	JMPENT(	L(p6), L(tab))
+	JMPENT(	L(p7), L(tab))
+	TEXT
+EPILOGUE()