@@ -38,22 +38,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
3838
3939 __asm__ __volatile__
4040 (
41- "vmovddup (%2), %%xmm0 \n\t" // alpha
41+ "vbroadcastsd (%2), %%ymm0 \n\t" // alpha
4242
4343 "addq $128, %1 \n\t"
4444
4545 "cmpq $0, %0 \n\t"
4646 "je 4f \n\t"
4747
48- "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
49- "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
50- "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
51- "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
48+ "vmulpd -128(%1), %%ymm0, %%ymm4 \n\t"
49+ "vmulpd -96(%1), %%ymm0, %%ymm5 \n\t"
5250
53- "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t"
54- "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t"
55- "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t"
56- "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t"
51+ "vmulpd -64(%1), %%ymm0, %%ymm6 \n\t"
52+ "vmulpd -32(%1), %%ymm0, %%ymm7 \n\t"
5753
5854 "subq $1 , %0 \n\t"
5955 "jz 2f \n\t"
@@ -62,26 +58,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
6258 "1: \n\t"
6359 // "prefetcht0 640(%1) \n\t"
6460
65- "vmovups %%xmm4 ,-128(%1) \n\t"
66- "vmovups %%xmm5 ,-112(%1) \n\t"
67- "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t"
68- "vmovups %%xmm6 , -96(%1) \n\t"
69- "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t"
70- "vmovups %%xmm7 , -80(%1) \n\t"
71- "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t"
61+ "vmovups %%ymm4 ,-128(%1) \n\t"
62+ "vmovups %%ymm5 , -96(%1) \n\t"
63+ "vmulpd 0(%1), %%ymm0, %%ymm4 \n\t"
7264
7365 // "prefetcht0 704(%1) \n\t"
7466
75- "vmovups %%xmm8 , -64(%1) \n\t"
76- "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t"
77- "vmovups %%xmm9 , -48(%1) \n\t"
78- "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t"
79- "vmovups %%xmm10 , -32(%1) \n\t"
80- "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t"
81- "vmovups %%xmm11 , -16(%1) \n\t"
67+ "vmovups %%ymm6 , -64(%1) \n\t"
68+ "vmulpd 32(%1), %%ymm0, %%ymm5 \n\t"
69+ "vmovups %%ymm7 , -32(%1) \n\t"
8270
83- "vmulpd 96 (%1), %%xmm0 , %%xmm10 \n\t"
84- "vmulpd 112 (%1), %%xmm0 , %%xmm11 \n\t"
71+ "vmulpd 64 (%1), %%ymm0 , %%ymm6 \n\t"
72+ "vmulpd 96 (%1), %%ymm0 , %%ymm7 \n\t"
8573
8674
8775 "addq $128, %1 \n\t"
@@ -90,15 +78,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
9078
9179 "2: \n\t"
9280
93- "vmovups %%xmm4 ,-128(%1) \n\t"
94- "vmovups %%xmm5 ,-112(%1) \n\t"
95- "vmovups %%xmm6 , -96(%1) \n\t"
96- "vmovups %%xmm7 , -80(%1) \n\t"
81+ "vmovups %%ymm4 ,-128(%1) \n\t"
82+ "vmovups %%ymm5 , -96(%1) \n\t"
9783
98- "vmovups %%xmm8 , -64(%1) \n\t"
99- "vmovups %%xmm9 , -48(%1) \n\t"
100- "vmovups %%xmm10 , -32(%1) \n\t"
101- "vmovups %%xmm11 , -16(%1) \n\t"
84+ "vmovups %%ymm6 , -64(%1) \n\t"
85+ "vmovups %%ymm7 , -32(%1) \n\t"
10286
10387 "addq $128, %1 \n\t"
10488
@@ -107,15 +91,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
10791 "cmpq $8 ,%3 \n\t"
10892 "jne 5f \n\t"
10993
110- "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
111- "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
112- "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
113- "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
94+ "vmulpd -128(%1), %%ymm0, %%ymm4 \n\t"
95+ "vmulpd -96(%1), %%ymm0, %%ymm5 \n\t"
11496
115- "vmovups %%xmm4 ,-128(%1) \n\t"
116- "vmovups %%xmm5 ,-112(%1) \n\t"
117- "vmovups %%xmm6 , -96(%1) \n\t"
118- "vmovups %%xmm7 , -80(%1) \n\t"
97+ "vmovups %%ymm4 ,-128(%1) \n\t"
98+ "vmovups %%ymm5 , -96(%1) \n\t"
11999
120100 "5: \n\t"
121101
@@ -149,7 +129,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
149129
150130 __asm__ __volatile__
151131 (
152- "vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t"
132+ "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t"
153133
154134 "addq $128, %1 \n\t"
155135
@@ -159,15 +139,11 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
159139 ".p2align 4 \n\t"
160140 "1: \n\t"
161141
162- "vmovups %%xmm0 ,-128(%1) \n\t"
163- "vmovups %%xmm0 ,-112(%1) \n\t"
164- "vmovups %%xmm0 , -96(%1) \n\t"
165- "vmovups %%xmm0 , -80(%1) \n\t"
142+ "vmovups %%ymm0 , -128(%1) \n\t"
143+ "vmovups %%ymm0 , -96(%1) \n\t"
166144
167- "vmovups %%xmm0 , -64(%1) \n\t"
168- "vmovups %%xmm0 , -48(%1) \n\t"
169- "vmovups %%xmm0 , -32(%1) \n\t"
170- "vmovups %%xmm0 , -16(%1) \n\t"
145+ "vmovups %%ymm0 , -64(%1) \n\t"
146+ "vmovups %%ymm0 , -32(%1) \n\t"
171147
172148 "addq $128, %1 \n\t"
173149 "subq $1 , %0 \n\t"
@@ -178,10 +154,8 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
178154 "cmpq $8 ,%3 \n\t"
179155 "jne 4f \n\t"
180156
181- "vmovups %%xmm0 ,-128(%1) \n\t"
182- "vmovups %%xmm0 ,-112(%1) \n\t"
183- "vmovups %%xmm0 , -96(%1) \n\t"
184- "vmovups %%xmm0 , -80(%1) \n\t"
157+ "vmovups %%ymm0 ,-128(%1) \n\t"
158+ "vmovups %%ymm0 , -96(%1) \n\t"
185159
186160 "4: \n\t"
187161
0 commit comments