@@ -114,6 +114,35 @@ namespace xsimd
114114#endif
115115
116116#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
117+
// Reads extended control register 0 (XCR0) with XGETBV and returns its low
// 32 bits.
//
// The caller must first make sure the OS has enabled XGETBV (the OSXSAVE
// CPUID flag); executing the instruction otherwise raises an
// invalid-opcode fault.
auto get_xcr0_low = []() noexcept
{
    uint32_t xcr0 = 0;

#if defined(_MSC_VER) && _MSC_VER >= 1400

    // _xgetbv returns the full 64-bit register; only the low half is kept.
    xcr0 = static_cast<uint32_t>(_xgetbv(0));

#elif defined(__GNUC__)

    // XGETBV takes the register index in ECX and returns the value in
    // EDX:EAX. Expressing this through constraints works unchanged on both
    // i386 and x86-64, so no per-architecture clobber list is needed.
    uint32_t xcr0_high = 0;
    __asm__(
        "xgetbv"
        : "=a"(xcr0), "=d"(xcr0_high)
        : "c"(0));
    static_cast<void>(xcr0_high); // upper half is intentionally unused

#else /* neither MSVC >= 1400 nor a GNU-compatible compiler */
#error "Unsupported compiler: reading XCR0 requires MSVC (_MSC_VER >= 1400) or a GNU-compatible compiler"
#endif
    return xcr0;
};
145+
117146 auto get_cpuid = [](int reg[4 ], int level, int count = 0 ) noexcept
118147 {
119148
@@ -148,43 +177,67 @@ namespace xsimd
148177
149178 get_cpuid (regs1, 0x1 );
150179
151- sse2 = regs1[3 ] >> 26 & 1 ;
152- sse3 = regs1[2 ] >> 0 & 1 ;
153- ssse3 = regs1[2 ] >> 9 & 1 ;
154- sse4_1 = regs1[2 ] >> 19 & 1 ;
155- sse4_2 = regs1[2 ] >> 20 & 1 ;
156- fma3_sse42 = regs1[2 ] >> 12 & 1 ;
180+ // OS can explicitly disable the usage of SSE/AVX extensions
181+ // by setting an appropriate flag in CR0 register
182+ //
183+ // https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html
184+
185+ unsigned sse_state_os_enabled = 1 ;
186+ unsigned avx_state_os_enabled = 1 ;
187+ unsigned avx512_state_os_enabled = 1 ;
188+
189+ // OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit
190+ // 18] to enable XSETBV/XGETBV instructions to access XCR0 and
191+ // to support processor extended state management using
192+ // XSAVE/XRSTOR.
193+ bool osxsave = regs1[2 ] >> 27 & 1 ;
194+ if (osxsave)
195+ {
196+
197+ uint32_t xcr0 = get_xcr0_low ();
198+
199+ sse_state_os_enabled = xcr0 >> 1 & 1 ;
200+ avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled;
201+ avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled;
202+ }
203+
204+ sse2 = regs1[3 ] >> 26 & sse_state_os_enabled;
205+ sse3 = regs1[2 ] >> 0 & sse_state_os_enabled;
206+ ssse3 = regs1[2 ] >> 9 & sse_state_os_enabled;
207+ sse4_1 = regs1[2 ] >> 19 & sse_state_os_enabled;
208+ sse4_2 = regs1[2 ] >> 20 & sse_state_os_enabled;
209+ fma3_sse42 = regs1[2 ] >> 12 & sse_state_os_enabled;
157210
158- avx = regs1[2 ] >> 28 & 1 ;
211+ avx = regs1[2 ] >> 28 & avx_state_os_enabled ;
159212 fma3_avx = avx && fma3_sse42;
160213
161214 int regs8[4 ];
162215 get_cpuid (regs8, 0x80000001 );
163- fma4 = regs8[2 ] >> 16 & 1 ;
216+ fma4 = regs8[2 ] >> 16 & avx_state_os_enabled ;
164217
165218 // sse4a = regs[2] >> 6 & 1;
166219
167220 // xop = regs[2] >> 11 & 1;
168221
169222 int regs7[4 ];
170223 get_cpuid (regs7, 0x7 );
171- avx2 = regs7[1 ] >> 5 & 1 ;
224+ avx2 = regs7[1 ] >> 5 & avx_state_os_enabled ;
172225
173226 int regs7a[4 ];
174227 get_cpuid (regs7a, 0x7 , 0x1 );
175- avxvnni = regs7a[0 ] >> 4 & 1 ;
228+ avxvnni = regs7a[0 ] >> 4 & avx_state_os_enabled ;
176229
177230 fma3_avx2 = avx2 && fma3_sse42;
178231
179- avx512f = regs7[1 ] >> 16 & 1 ;
180- avx512cd = regs7[1 ] >> 28 & 1 ;
181- avx512dq = regs7[1 ] >> 17 & 1 ;
182- avx512bw = regs7[1 ] >> 30 & 1 ;
183- avx512er = regs7[1 ] >> 27 & 1 ;
184- avx512pf = regs7[1 ] >> 26 & 1 ;
185- avx512ifma = regs7[1 ] >> 21 & 1 ;
186- avx512vbmi = regs7[2 ] >> 1 & 1 ;
187- avx512vnni_bw = regs7[2 ] >> 11 & 1 ;
232+ avx512f = regs7[1 ] >> 16 & avx512_state_os_enabled ;
233+ avx512cd = regs7[1 ] >> 28 & avx512_state_os_enabled ;
234+ avx512dq = regs7[1 ] >> 17 & avx512_state_os_enabled ;
235+ avx512bw = regs7[1 ] >> 30 & avx512_state_os_enabled ;
236+ avx512er = regs7[1 ] >> 27 & avx512_state_os_enabled ;
237+ avx512pf = regs7[1 ] >> 26 & avx512_state_os_enabled ;
238+ avx512ifma = regs7[1 ] >> 21 & avx512_state_os_enabled ;
239+ avx512vbmi = regs7[2 ] >> 1 & avx512_state_os_enabled ;
240+ avx512vnni_bw = regs7[2 ] >> 11 & avx512_state_os_enabled ;
188241 avx512vnni_vbmi = avx512vbmi && avx512vnni_bw;
189242#endif
190243 }
0 commit comments