Skip to content

Commit d8b5320

Browse files
authored
Merge pull request #1794 from albinahlback/mulhigh_generic
Mulhigh generic
2 parents 0e5f9fa + 7c9d0e8 commit d8b5320

File tree

14 files changed

+867
-273
lines changed

14 files changed

+867
-273
lines changed

src/mpn_extras.h

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,16 @@ extern "C" {
118118
double
119119
flint_mpn_get_d(mp_srcptr ptr, mp_size_t size, mp_size_t sign, long exp);
120120

121+
/* Addition ******************************************************************/
122+
123+
#if FLINT_HAVE_ADX
124+
# define FLINT_HAVE_NATIVE_2ADD_N_INPLACE 1
125+
126+
/* Adds two n-limb integers (the two mp_srcptr operands) onto the n-limb
   integer at the mp_ptr operand, in place and in a single pass, and returns
   the carry out of the top limb. Since two operands are added at once the
   carry can be 0, 1 or 2. Only declared when FLINT_HAVE_ADX is set. */
127+
/* NOTE: Requires n >= 4 */
128+
mp_limb_t flint_mpn_2add_n_inplace(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
129+
#endif
130+
121131
/* General multiplication ****************************************************/
122132

123133
#ifdef FLINT_HAVE_FFT_SMALL
@@ -265,19 +275,18 @@ mp_limb_t _flint_mpn_mulhigh_basecase(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
265275
mp_limb_t _flint_mpn_sqrhigh_basecase_even(mp_ptr, mp_srcptr, mp_size_t);
266276
mp_limb_t _flint_mpn_sqrhigh_basecase_odd(mp_ptr, mp_srcptr, mp_size_t);
267277

278+
mp_limb_t _flint_mpn_mulhigh(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
279+
268280
/* TODO: Proceed with higher cases */
269281
/* Size dispatcher: requires n >= 1. When FLINT_HAVE_MULHIGH_FUNC(n) holds,
   the call goes to entry n of flint_mpn_mulhigh_func_tab, and on that path
   rp may alias xp or yp. For every other n the call is forwarded to
   _flint_mpn_mulhigh and its return value is passed through unchanged.
   NOTE(review): the fallback path no longer asserts rp != xp && rp != yp;
   confirm that _flint_mpn_mulhigh tolerates aliasing, or that callers never
   alias for those sizes. */
MPN_EXTRAS_INLINE
270-
mp_limb_t flint_mpn_mulhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
282+
mp_limb_t flint_mpn_mulhigh(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
271283
{
272284
FLINT_ASSERT(n >= 1);
273285

274286
if (FLINT_HAVE_MULHIGH_FUNC(n)) /* NOTE: Aliasing allowed here */
275287
return flint_mpn_mulhigh_func_tab[n](rp, xp, yp);
276288
else
277-
{
278-
FLINT_ASSERT(rp != xp && rp != yp);
279-
return _flint_mpn_mulhigh_basecase(rp, xp, yp, n);
280-
}
289+
return _flint_mpn_mulhigh(rp, xp, yp, n);
281290
}
282291

283292
/* TODO: Proceed with higher cases */
@@ -311,9 +320,7 @@ struct mp_limb_pair_t flint_mpn_mulhigh_normalised(mp_ptr rp, mp_srcptr xp, mp_s
311320

312321
FLINT_ASSERT(rp != xp && rp != yp);
313322

314-
/* TODO */
315-
/* ret.m1 = flint_mpn_mulhigh(rp, xp, yp, n); */
316-
ret.m1 = flint_mpn_mulhigh_basecase(rp, xp, yp, n);
323+
ret.m1 = _flint_mpn_mulhigh(rp, xp, yp, n);
317324

318325
if (rp[n - 1] >> (FLINT_BITS - 1))
319326
{
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
dnl
2+
dnl Copyright (C) 2024 Albin Ahlbäck
3+
dnl
4+
dnl This file is part of FLINT.
5+
dnl
6+
dnl FLINT is free software: you can redistribute it and/or modify it under
7+
dnl the terms of the GNU Lesser General Public License (LGPL) as published
8+
dnl by the Free Software Foundation; either version 3 of the License, or
9+
dnl (at your option) any later version. See <https://www.gnu.org/licenses/>.
10+
dnl
11+
12+
include(`config.m4')
13+
14+
dnl Operands. rp is both read and written (the sum is accumulated in
dnl place), ap and bp are only read, and n is the limb count.
dnl NOTE(review): rdi, rsi, rdx, rcx is the System V AMD64 integer argument
dnl order; confirm that no other calling convention has to be supported.
define(`rp', `%rdi')
15+
define(`ap', `%rsi')
16+
define(`bp', `%rdx')
17+
define(`n', `%rcx')
18+
19+
dnl Scratch registers holding the limbs in flight. s0 and s1 are also used
dnl as temporaries for the jump-table dispatch before any limb is loaded.
define(`s0', `%r8')
20+
define(`s1', `%r9')
21+
define(`s2', `%r10')
22+
define(`s3', `%r11')
23+
24+
dnl Return value (carry out). Zeroed on entry and filled in at the very end.
define(`sx', `%rax')
25+
26+
dnl flint_mpn_2add_n_inplace(rp, ap, bp, n)
dnl
dnl Adds the n-limb operands at ap and bp onto the n limbs at rp, in place,
dnl and returns the carry out of the top limb in sx. The result is the sum
dnl of the final OF and CF, i.e. 0, 1 or 2.
dnl
dnl Two carry chains run in parallel: adcx adds the ap limbs using only CF,
dnl and adox adds the bp limbs using only OF. Between the initial xor and
dnl the final seto/adc the code uses nothing but mov, lea, jmp and jrcxz,
dnl none of which modify the flags, so both chains survive the dispatch, the
dnl pointer updates and the loop branch.
dnl
dnl NOTE(review): n is split through R32(n) only, so presumably just its low
dnl 32 bits are significant -- confirm that is acceptable for all callers.
dnl
dnl NOTE: This function requires n >= 4
27+
28+
dnl NOTE: This function could easily be made non-inplace without pushing
29+
dnl registers, but it is not yet known whether it will be used by anything
30+
dnl other than multiplication routines.
31+
32+
TEXT
33+
34+
ALIGN(16)
35+
PROLOGUE(flint_mpn_2add_n_inplace)
36+
C Split n: s1 = n mod 8 selects the entry stub and n becomes n / 8, the
C number of counted loop passes. The xor zeroes sx and leaves both CF and
C OF clear, which is the starting state for the two carry chains.
mov R32(n), R32(s1)
37+
lea L(tab)(%rip), s0
38+
shr $3, R32(n)
39+
and $7, R32(s1)
40+
xor R32(sx), R32(sx)
41+
42+
C Indirect jump through L(tab) to the stub for this value of n mod 8.
ifdef(`PIC',
43+
` movslq (s0,s1,4), s1
44+
lea (s1,s0), s0
45+
jmp *s0
46+
',`
47+
jmp *(s0,s1,8)
48+
')
49+
50+
C Entry stubs, one per residue of n mod 8. Each stub loads the first two
C or three limbs of rp, starts both carry chains on them, adjusts rp, ap
C and bp where needed so that the remaining limbs line up with the offsets
C used in the loop body, and enters the loop at the matching label. Stubs
C that consume three limbs store the first result limb themselves; the
C other limbs stay in registers and are finished and stored by the loop.
L(p1): mov 0*8(rp), s3
51+
mov 1*8(rp), s0
52+
mov 2*8(rp), s1
53+
adcx 0*8(ap), s3
54+
adox 0*8(bp), s3
55+
adcx 1*8(ap), s0
56+
adox 1*8(bp), s0
57+
adcx 2*8(ap), s1
58+
lea 1*8(rp), rp
59+
lea 1*8(bp), bp
60+
lea 1*8(ap), ap
61+
mov s3, -1*8(rp)
62+
jmp L(a0)
63+
64+
L(p0): mov 0*8(rp), s0
65+
mov 1*8(rp), s1
66+
adcx 0*8(ap), s0
67+
adox 0*8(bp), s0
68+
adcx 1*8(ap), s1
69+
jmp L(a0)
70+
71+
L(p5): mov 0*8(rp), s3
72+
mov 1*8(rp), s0
73+
mov 2*8(rp), s1
74+
adcx 0*8(ap), s3
75+
adox 0*8(bp), s3
76+
adcx 1*8(ap), s0
77+
adox 1*8(bp), s0
78+
adcx 2*8(ap), s1
79+
lea -3*8(rp), rp
80+
lea -3*8(bp), bp
81+
lea -3*8(ap), ap
82+
mov s3, 3*8(rp)
83+
jmp L(a4)
84+
85+
L(p4): mov 0*8(rp), s0
86+
mov 1*8(rp), s1
87+
adcx 0*8(ap), s0
88+
adox 0*8(bp), s0
89+
adcx 1*8(ap), s1
90+
lea -4*8(rp), rp
91+
lea -4*8(bp), bp
92+
lea -4*8(ap), ap
93+
jmp L(a4)
94+
95+
L(p7): mov 0*8(rp), s1
96+
mov 1*8(rp), s2
97+
mov 2*8(rp), s3
98+
adcx 0*8(ap), s1
99+
adox 0*8(bp), s1
100+
adcx 1*8(ap), s2
101+
adox 1*8(bp), s2
102+
adcx 2*8(ap), s3
103+
lea -1*8(rp), rp
104+
lea -1*8(bp), bp
105+
lea -1*8(ap), ap
106+
mov s1, 1*8(rp)
107+
jmp L(a6)
108+
109+
L(p6): mov 0*8(rp), s2
110+
mov 1*8(rp), s3
111+
adcx 0*8(ap), s2
112+
adox 0*8(bp), s2
113+
adcx 1*8(ap), s3
114+
lea -2*8(rp), rp
115+
lea -2*8(bp), bp
116+
lea -2*8(ap), ap
117+
jmp L(a6)
118+
119+
L(p3): mov 0*8(rp), s1
120+
mov 1*8(rp), s2
121+
mov 2*8(rp), s3
122+
adcx 0*8(ap), s1
123+
adox 0*8(bp), s1
124+
adcx 1*8(ap), s2
125+
adox 1*8(bp), s2
126+
adcx 2*8(ap), s3
127+
lea 3*8(rp), rp
128+
lea 3*8(bp), bp
129+
lea 3*8(ap), ap
130+
mov s1, -3*8(rp)
131+
jmp L(a2)
132+
133+
L(p2): mov 0*8(rp), s2
134+
mov 1*8(rp), s3
135+
adcx 0*8(ap), s2
136+
adox 0*8(bp), s2
137+
adcx 1*8(ap), s3
138+
lea 2*8(rp), rp
139+
lea 2*8(bp), bp
140+
lea 2*8(ap), ap
141+
C No jump is needed here: L(p2) falls through into L(a2).
C jmp L(a2)
142+
143+
C n = 12 -> n = 1, kx = 4
144+
C 2, 3, 4, 5
145+
146+
C Main loop, 8 limbs per pass, handled as four pipelined limb pairs. Each
C quarter loads the next pair from rp, applies the pending adox to the
C upper limb of the previous pair and stores that pair, then continues
C both chains on the new pair. The "NN start" and "NN end" remarks mark
C where a register pair becomes live and where it is written back.
ALIGN(32)
147+
L(a2): mov 0*8(rp), s0 C 01 start
148+
mov 1*8(rp), s1
149+
adox -1*8(bp), s3
150+
adcx 0*8(ap), s0
151+
mov s2, -2*8(rp)
152+
mov s3, -1*8(rp) C 23 end
153+
adox 0*8(bp), s0
154+
adcx 1*8(ap), s1
155+
L(a0): mov 2*8(rp), s2 C 23 start
156+
mov 3*8(rp), s3
157+
adox 1*8(bp), s1
158+
adcx 2*8(ap), s2
159+
C Count this pass. lea is used so that CF and OF are left untouched.
C Entries at L(a6) and L(a4) come in below this point, so their first,
C partial pass is not counted.
lea -1(n), R32(n)
160+
mov s0, 0*8(rp)
161+
mov s1, 1*8(rp) C 01 end
162+
adox 2*8(bp), s2
163+
adcx 3*8(ap), s3
164+
L(a6): mov 4*8(rp), s0 C 01 start
165+
mov 5*8(rp), s1
166+
adox 3*8(bp), s3
167+
adcx 4*8(ap), s0
168+
mov s2, 2*8(rp)
169+
mov s3, 3*8(rp) C 23 end
170+
adox 4*8(bp), s0
171+
adcx 5*8(ap), s1
172+
L(a4): mov 6*8(rp), s2 C 23 start
173+
mov 7*8(rp), s3
174+
adox 5*8(bp), s1
175+
adcx 6*8(ap), s2
176+
mov s0, 4*8(rp)
177+
mov s1, 5*8(rp) C 01 end
178+
adox 6*8(bp), s2
179+
adcx 7*8(ap), s3
180+
C Leave the loop once no passes remain. jrcxz tests the counter without
C reading or writing the flags.
jrcxz L(end)
181+
lea 8*8(bp), bp
182+
lea 8*8(ap), ap
183+
lea 8*8(rp), rp
184+
jmp L(a2)
185+
186+
C Tail: apply the last pending adox, store the final limb pair, then fold
C both carries into the return value. seto puts OF into the low byte of
C sx (the rest of sx is still zero from the xor on entry); n is zero
C here, so the adc adds only CF on top of it.
L(end): adox 7*8(bp), s3
187+
mov s2, 6*8(rp)
188+
mov s3, 7*8(rp)
189+
seto R8(sx)
190+
adc R32(n), R32(sx) C n = 0
191+
192+
ret
193+
C Dispatch table indexed by n mod 8.
JUMPTABSECT
194+
ALIGN(8)
195+
L(tab): JMPENT( L(p0), L(tab))
196+
JMPENT( L(p1), L(tab))
197+
JMPENT( L(p2), L(tab))
198+
JMPENT( L(p3), L(tab))
199+
JMPENT( L(p4), L(tab))
200+
JMPENT( L(p5), L(tab))
201+
JMPENT( L(p6), L(tab))
202+
JMPENT( L(p7), L(tab))
203+
TEXT
204+
EPILOGUE()

0 commit comments

Comments
 (0)