Skip to content

Commit e9234cd

Browse files
authored
Merge pull request #451 from serge-sans-paille/feature/test-avx512-vs2015
Fix AVX 512 build under VS2015
2 parents 1bd6d8c + 52517c9 commit e9234cd

10 files changed

+110
-33
lines changed

.appveyor.yml

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,22 @@ platform:
66
- x64
77

88
environment:
9+
global:
10+
MINICONDA: C:\xsimd-conda
911
matrix:
10-
- MINICONDA: C:\xsimd-conda
12+
- JOB: "AVX2"
13+
CXXFLAGS: "/arch:AVX2"
14+
VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\vcvarsall.bat"
15+
RUNTEST: ".\\test_xsimd"
16+
- JOB: "AVX512"
17+
CXXFLAGS: "/arch:AVX512"
18+
APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
19+
VCVARSALL: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat"
20+
RUNTEST: "ECHO"
1121

1222
init:
1323
- "ECHO %MINICONDA%"
14-
- C:\"Program Files (x86)"\"Microsoft Visual Studio 14.0"\VC\vcvarsall.bat %PLATFORM%
24+
- call "%VCVARSALL%" %PLATFORM%
1525
- ps: if($env:Platform -eq "x64"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe' C:\Miniconda.exe; echo "Done"}
1626
- ps: if($env:Platform -eq "x86"){Start-FileDownload 'http://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86.exe' C:\Miniconda.exe; echo "Done"}
1727
- cmd: C:\Miniconda.exe /S /D=C:\xsimd-conda
@@ -27,4 +37,4 @@ install:
2737
- cd test
2838

2939
build_script:
30-
- .\test_xsimd
40+
- "%RUNTEST%"

include/xsimd/math/xsimd_rounding.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -329,13 +329,13 @@ namespace xsimd
329329

330330
static inline batch_type ceil(const batch_type& x)
331331
{
332-
auto res = _mm512_ceil_ps(x);
332+
auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF);
333333
return res;
334334
}
335335

336336
static inline batch_type floor(const batch_type& x)
337337
{
338-
auto res = _mm512_floor_ps(x);
338+
auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF);
339339
return res;
340340
}
341341

@@ -359,13 +359,13 @@ namespace xsimd
359359

360360
static inline batch_type ceil(const batch_type& x)
361361
{
362-
auto res = _mm512_ceil_pd(x);
362+
auto res = _mm512_roundscale_pd(x, _MM_FROUND_TO_POS_INF);
363363
return res;
364364
}
365365

366366
static inline batch_type floor(const batch_type& x)
367367
{
368-
auto res = _mm512_floor_pd(x);
368+
auto res = _mm512_roundscale_pd(x, _MM_FROUND_TO_NEG_INF);
369369
return res;
370370
}
371371

include/xsimd/types/xsimd_avx512_double.hpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -449,8 +449,11 @@ namespace xsimd
449449

450450
static batch_type abs(const batch_type& rhs)
451451
{
452-
return (__m512d)(_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
453-
(__m512i)((__m512d)(rhs))));
452+
__m512d rhs_asd = (__m512d)rhs;
453+
__m512i rhs_asi = *reinterpret_cast<__m512i*>(&rhs_asd);
454+
__m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
455+
rhs_asi);
456+
return *reinterpret_cast<__m512d*>(&res_asi);
454457
}
455458

456459
static batch_type fabs(const batch_type& rhs)
@@ -487,7 +490,7 @@ namespace xsimd
487490
{
488491
__m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
489492
__m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
490-
__m256d res1 = tmp1 + tmp2;
493+
__m256d res1 = _mm256_add_pd(tmp1, tmp2);
491494
return xsimd::hadd(batch<double, 4>(res1));
492495
}
493496

@@ -498,7 +501,7 @@ namespace xsimd
498501
{ \
499502
auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
500503
auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
501-
res ## I = (tmp1 + tmp2); \
504+
res ## I = _mm512_add_pd(tmp1, tmp2); \
502505
} \
503506

504507
step1(1, row[0], row[2]);
@@ -511,12 +514,12 @@ namespace xsimd
511514
batch<double, 8> tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0));
512515
batch<double, 8> tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1));
513516

514-
batch<double, 8> resx1 = (tmp5 + tmp6);
517+
batch<double, 8> resx1 = _mm512_add_pd(tmp5, tmp6);
515518

516519
batch<double, 8> tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0));
517520
batch<double, 8> tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1));
518521

519-
batch<double, 8> resx2 = (tmp7 + tmp8);
522+
batch<double, 8> resx2 = _mm512_add_pd(tmp7, tmp8);
520523

521524
batch<double, 8> tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000);
522525
batch<double, 8> tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111);

include/xsimd/types/xsimd_avx512_float.hpp

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -472,8 +472,11 @@ namespace xsimd
472472

473473
static batch_type abs(const batch_type& rhs)
474474
{
475-
return (__m512)(_mm512_and_epi32((__m512i)((__m512)(rhs)),
476-
_mm512_set1_epi32(0x7fffffff)));
475+
__m512 rhs_asf = (__m512)rhs;
476+
__m512i rhs_asi = *reinterpret_cast<__m512i*>(&rhs_asf);
477+
__m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),
478+
rhs_asi);
479+
return *reinterpret_cast<__m512*>(&res_asi);
477480
}
478481

479482
static batch_type fabs(const batch_type& rhs)
@@ -510,7 +513,7 @@ namespace xsimd
510513
{
511514
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
512515
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
513-
__m256 res1 = tmp1 + tmp2;
516+
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
514517
return xsimd::hadd(batch<float, 8>(res1));
515518
}
516519

@@ -524,7 +527,7 @@ namespace xsimd
524527
{ \
525528
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
526529
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
527-
res ## I = tmp1 + tmp2; \
530+
res ## I = _mm512_add_ps(tmp1, tmp2); \
528531
} \
529532

530533
XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
@@ -548,17 +551,17 @@ namespace xsimd
548551
batch<float, 16> tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
549552
batch<float, 16> tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
550553
\
551-
batch<float, 16> resx1 = tmp1 + tmp2; \
554+
batch<float, 16> resx1 = _mm512_add_ps(tmp1, tmp2); \
552555
\
553556
batch<float, 16> tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
554557
batch<float, 16> tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
555558
\
556-
batch<float, 16> resx2 = tmp3 + tmp4; \
559+
batch<float, 16> resx2 = _mm512_add_ps(tmp3, tmp4); \
557560
\
558561
batch<float, 16> tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
559562
batch<float, 16> tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
560563
\
561-
batch<float, 16> resx3 = tmp5 + tmp6; \
564+
batch<float, 16> resx3 = _mm512_add_ps(tmp5, tmp6); \
562565
\
563566
halfx ## I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
564567
_mm512_extractf32x8_ps(resx3, 1)); \
@@ -576,7 +579,20 @@ namespace xsimd
576579

577580
static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
578581
{
582+
#if !defined(_MSC_VER)
579583
return _mm512_mask_blend_ps(cond, b, a);
584+
#else
585+
__m512i mcondi = _mm512_maskz_broadcastd_epi32 ((__mmask16)cond, _mm_set1_epi32(~0));
586+
__m512 mcond = *reinterpret_cast<__m512*>(&mcondi);
587+
XSIMD_SPLITPS_AVX512(mcond);
588+
XSIMD_SPLITPS_AVX512(a);
589+
XSIMD_SPLITPS_AVX512(b);
590+
591+
auto res_lo = _mm256_blendv_ps(b_low, a_low, mcond_low);
592+
auto res_hi = _mm256_blendv_ps(b_high, a_high, mcond_high);
593+
594+
XSIMD_RETURN_MERGEDPS_AVX(res_lo, res_hi);
595+
#endif
580596
}
581597

582598
static batch_bool_type isnan(const batch_type& x)

include/xsimd/types/xsimd_avx512_int16.hpp

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -316,19 +316,17 @@ namespace xsimd
316316

317317
static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
318318
{
319-
#if defined(XSIMD_AVX512BW_AVAILABLE)
320-
// Some compilers are not happy with passing directly a and b to the intrinsics
321-
// See https://github.com/xtensor-stack/xsimd/issues/315
322-
__m512i ma = a;
323-
__m512i mb = b;
324-
return _mm512_mask_blend_epi16(cond, mb, ma);
319+
#if defined(XSIMD_AVX512BW_AVAILABLE) && !defined(_MSC_VER)
320+
auto res = _mm512_mask_blend_epi16((__mmask32)cond, (__m512i)b, (__m512i)a);
321+
return batch_type(res);
325322
#else
326-
XSIMD_SPLIT_AVX512(cond);
323+
__m512i mcond = _mm512_maskz_broadcastw_epi16((__mmask32)cond, _mm_set1_epi32(~0));
324+
XSIMD_SPLIT_AVX512(mcond);
327325
XSIMD_SPLIT_AVX512(a);
328326
XSIMD_SPLIT_AVX512(b);
329327

330-
auto res_lo = _mm256_blendv_epi8(b_low, a_low, cond_low);
331-
auto res_hi = _mm256_blendv_epi8(b_high, a_high, cond_high);
328+
auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low);
329+
auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high);
332330

333331
XSIMD_RETURN_MERGED_AVX(res_lo, res_hi);
334332
#endif

include/xsimd/types/xsimd_avx512_int32.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ namespace xsimd
236236
// TODO Why not _mm512_reduce_add_...?
237237
__m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0);
238238
__m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 1);
239-
__m256i res1 = tmp1 + tmp2;
239+
__m256i res1 = _mm256_add_epi32(tmp1, tmp2);
240240
return xsimd::hadd(batch<int32_t, 8>(res1));
241241
}
242242

include/xsimd/types/xsimd_avx512_int64.hpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,13 +293,25 @@ namespace xsimd
293293
{
294294
__m256i tmp1 = _mm512_extracti32x8_epi32(rhs, 0);
295295
__m256i tmp2 = _mm512_extracti32x8_epi32(rhs, 1);
296-
__m256i res1 = tmp1 + tmp2;
296+
__m256i res1 = _mm256_add_epi64(tmp1, tmp2);
297297
return xsimd::hadd(batch<int64_t, 4>(res1));
298298
}
299299

300300
static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
301301
{
302+
#if !defined(_MSC_VER)
302303
return _mm512_mask_blend_epi64(cond, b, a);
304+
#else
305+
__m512i mcond = _mm512_maskz_broadcastq_epi64((__mmask8)cond, _mm_set1_epi32(~0));
306+
XSIMD_SPLIT_AVX512(mcond);
307+
XSIMD_SPLIT_AVX512(a);
308+
XSIMD_SPLIT_AVX512(b);
309+
310+
auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low);
311+
auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high);
312+
313+
XSIMD_RETURN_MERGED_AVX(res_lo, res_hi);
314+
#endif
303315
}
304316
};
305317

include/xsimd/types/xsimd_avx512_int_base.hpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,29 @@ namespace xsimd
1818
{
1919

2020
#define XSIMD_SPLIT_AVX512(avx_name) \
21-
__m256i avx_name##_low = _mm512_castsi512_si256(avx_name); \
22-
__m256i avx_name##_high = _mm512_extracti64x4_epi64(avx_name, 1) \
21+
__m256i avx_name##_low = _mm512_castsi512_si256((__m512i)avx_name); \
22+
__m256i avx_name##_high = _mm512_extracti64x4_epi64((__m512i)avx_name, 1) \
23+
24+
#define XSIMD_SPLITPS_AVX512(avx_name) \
25+
__m256 avx_name##_low = _mm512_castps512_ps256((__m512)avx_name); \
26+
__m256 avx_name##_high = _mm512_extractf32x8_ps((__m512)avx_name, 1) \
27+
28+
#define XSIMD_SPLITPD_AVX512(avx_name) \
29+
__m256d avx_name##_low = _mm512_castpd512_pd256((__m512d)avx_name); \
30+
__m256d avx_name##_high = _mm512_extractf64x4_pd((__m512d)avx_name, 1) \
2331

2432
#define XSIMD_RETURN_MERGED_AVX(res_low, res_high) \
2533
__m512i result = _mm512_castsi256_si512(res_low); \
2634
return _mm512_inserti64x4(result, res_high, 1) \
2735

36+
#define XSIMD_RETURN_MERGEDPS_AVX(res_low, res_high) \
37+
__m512 result = _mm512_castps256_ps512(res_low); \
38+
return _mm512_insertf32x8(result, res_high, 1) \
39+
40+
#define XSIMD_RETURN_MERGEDPD_AVX(res_low, res_high) \
41+
__m512d result = _mm512_castpd256_pd512(res_low); \
42+
return _mm512_insertf64x4(result, res_high, 1) \
43+
2844
#define XSIMD_APPLY_AVX2_FUNCTION(N, func, avx_lhs, avx_rhs) \
2945
XSIMD_SPLIT_AVX512(avx_lhs); \
3046
XSIMD_SPLIT_AVX512(avx_rhs); \

include/xsimd/types/xsimd_avx_conversion.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,17 @@ namespace xsimd
123123
XSIMD_BATCH_CAST_INTRINSIC(int32_t, uint16_t, 8, _mm256_cvtepi32_epi16)
124124
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, int16_t, 8, _mm256_cvtepi32_epi16)
125125
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, uint16_t, 8, _mm256_cvtepi32_epi16)
126+
#if defined(_MSC_VER)
127+
namespace detail {
128+
static inline __m256 xsimd_mm256_cvtepu32_ps(__m256i a)
129+
{
130+
return _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_castsi256_si512(a)));
131+
}
132+
}
133+
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, detail::xsimd_mm256_cvtepu32_ps)
134+
#else
126135
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 8, _mm256_cvtepu32_ps)
136+
#endif
127137
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, double, 4, _mm256_cvtepu32_pd)
128138
XSIMD_BATCH_CAST_INTRINSIC(int64_t, int32_t, 4, _mm256_cvtepi64_epi32)
129139
XSIMD_BATCH_CAST_INTRINSIC(int64_t, uint32_t, 4, _mm256_cvtepi64_epi32)

include/xsimd/types/xsimd_sse_conversion.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,19 @@ namespace xsimd
8787
XSIMD_BATCH_CAST_IMPLICIT(uint64_t, int64_t, 2)
8888
XSIMD_BATCH_CAST_INTRINSIC(float, int32_t, 4, _mm_cvttps_epi32)
8989
#if defined(XSIMD_AVX512VL_AVAILABLE)
90+
91+
#if defined(_MSC_VER)
92+
namespace detail {
93+
static inline __m128 xsimd_mm_cvtepu32_ps(__m128i a)
94+
{
95+
return _mm512_castps512_ps128(_mm512_cvtepu32_ps(_mm512_castsi128_si512(a)));
96+
}
97+
}
98+
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, detail::xsimd_mm_cvtepu32_ps)
99+
#else
90100
XSIMD_BATCH_CAST_INTRINSIC(uint32_t, float, 4, _mm_cvtepu32_ps)
101+
#endif
102+
91103
XSIMD_BATCH_CAST_INTRINSIC(float, uint32_t, 4, _mm_cvttps_epu32)
92104
#if defined(XSIMD_AVX512DQ_AVAILABLE)
93105
XSIMD_BATCH_CAST_INTRINSIC(int64_t, double, 2, _mm_cvtepi64_pd)

0 commit comments

Comments
 (0)