Commit 2b34485

[Graph] remove CPU vector warnings. (intel#1070)
1 parent f8a6ea6 commit 2b34485

18 files changed, +396 -264 lines changed


intel_extension_for_transformers/backends/neural_engine/graph/layers/ele_wise.h

Lines changed: 11 additions & 23 deletions
@@ -2,7 +2,7 @@
 
 #include <math.h>
 #include "core/data_types.h"
-#include "vectors/cpu/simd.h"
+#include "vectors/ele_wise.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -11,41 +11,29 @@ extern "C" {
 // fundamental operations
 //
 
-inline static void ne_vec_set_i8(const int n, int8_t* x, const int8_t v) {
-  for (int i = 0; i < n; ++i) x[i] = v;
-}
+inline static void ne_vec_set_i8(const int n, int8_t* x, const int8_t v) { ne_vec_set_i8_(n, x, v); }
 
-inline static void ne_vec_set_i16(const int n, int16_t* x, const int16_t v) {
-  for (int i = 0; i < n; ++i) x[i] = v;
-}
+inline static void ne_vec_set_i16(const int n, int16_t* x, const int16_t v) { ne_vec_set_i16_(n, x, v); }
 
-inline static void ne_vec_set_i32(const int n, int32_t* x, const int32_t v) {
-  for (int i = 0; i < n; ++i) x[i] = v;
-}
+inline static void ne_vec_set_i32(const int n, int32_t* x, const int32_t v) { ne_vec_set_i32_(n, x, v); }
 
-inline static void ne_vec_set_f16(const int n, ne_fp16_t* x, const int32_t v) {
-  for (int i = 0; i < n; ++i) x[i] = v;
-}
+inline static void ne_vec_set_f16(const int n, ne_fp16_t* x, const int32_t v) { ne_vec_set_f16_(n, x, v); }
 
 inline static void ne_vec_add_f32(const int n, float* z, const float* x, const float* y) {
-  for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
+  ne_vec_add_f32_(n, z, x, y);
 }
 inline static void ne_vec_add1_f32(const int n, float* z, const float* x, const float v) {
   for (int i = 0; i < n; ++i) z[i] = x[i] + v;
 }
-inline static void ne_vec_acc_f32(const int n, float* y, const float* x) {
-  for (int i = 0; i < n; ++i) y[i] += x[i];
-}
+inline static void ne_vec_acc_f32(const int n, float* y, const float* x) { ne_vec_acc_f32_(n, y, x); }
 inline static void ne_vec_acc1_f32(const int n, float* y, const float v) {
   for (int i = 0; i < n; ++i) y[i] += v;
 }
 inline static void ne_vec_sub_f32(const int n, float* z, const float* x, const float* y) {
-  for (int i = 0; i < n; ++i) z[i] = x[i] - y[i];
+  ne_vec_sub_f32_(n, z, x, y);
 }
 
-inline static void ne_vec_set_f32(const int n, float* x, const float v) {
-  for (int i = 0; i < n; ++i) x[i] = v;
-}
+inline static void ne_vec_set_f32(const int n, float* x, const float v) { ne_vec_set_f32_(n, x, v); }
 
 inline static void ne_vec_cpy_f32(const int n, float* y, const float* x) {
   for (int i = 0; i < n; ++i) y[i] = x[i];
@@ -54,10 +42,10 @@ inline static void ne_vec_neg_f32(const int n, float* y, const float* x) {
   for (int i = 0; i < n; ++i) y[i] = -x[i];
 }
 inline static void ne_vec_mul_f32(const int n, float* z, const float* x, const float* y) {
-  for (int i = 0; i < n; ++i) z[i] = x[i] * y[i];
+  ne_vec_mul_f32_(n, z, x, y);
 }
 inline static void ne_vec_div_f32(const int n, float* z, const float* x, const float* y) {
-  for (int i = 0; i < n; ++i) z[i] = x[i] / y[i];
+  ne_vec_div_f32_(n, z, x, y);
 }
 
 inline static void ne_vec_mad_f32(const int n, float* __restrict y, const float* __restrict x, const float v) {
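
Note: the ne_vec_*_ helpers that these wrappers now forward to come from vectors/ele_wise.h and are built as part of the ne_vec library (see the CMake change below); their definitions are not part of this commit view. A minimal, hypothetical sketch of the assumed shape, with the element-wise loops moved out of the header so that vectorization warnings are emitted only once, inside the warning-checked library:

// Hypothetical sketch of the ne_vec_*_ helpers; the real definitions live in
// vectors/ele_wise.cpp / ele_wise.h, which are not shown in this commit view.

// Built once inside the ne_vec target, so any auto-vectorization warnings are
// reported there instead of in every file that includes layers/ele_wise.h.
void ne_vec_set_f32_(const int n, float* x, const float v) {
  for (int i = 0; i < n; ++i) x[i] = v;
}

void ne_vec_add_f32_(const int n, float* z, const float* x, const float* y) {
  for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
}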

intel_extension_for_transformers/backends/neural_engine/graph/vectors/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -17,5 +17,5 @@ if (NE_GPU)
   add_subdirectory(gpu)
 endif()
 
-add_library_w_warning(ne_vec ele_reduce.cpp)
+add_library_w_warning(ne_vec ele_reduce.cpp ele_wise.cpp)
 target_link_libraries(ne_vec PUBLIC cpu_vec)

intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec.hpp

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@
 #include "vec_base.hpp"
 #include "vec_compare.hpp"
 #include "vec_convert.hpp"
-#include "vec_load.hpp"
 #include "vec_set.hpp"
 
 #endif // ENGINE_EXECUTOR_INCLUDE_VEC_HPP_

intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.cpp

Lines changed: 45 additions & 33 deletions
@@ -12,27 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "vec_load.hpp"
+#include "vec_store.hpp"
 #include "vec_arithmetic.hpp"
+#include "cmath"
 
-inline fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y) {
+fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y) {
 #if __AVX512F__
-  return _mm512_sub_ps(x, y);
+  return {_mm512_sub_ps(x.first, y.first)};
 #else
   return {_mm256_sub_ps(x.first, y.first), _mm256_sub_ps(x.second, y.second)};
 #endif
 }
 
-inline fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) {
+fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) {
 #if __AVX512F__
-  return _mm512_fmsub_ps(x, y, z);
+  return {_mm512_fmsub_ps(x.first, y.first, z.first)};
 #else
   return {_mm256_fmsub_ps(x.first, y.first, z.first), _mm256_fmsub_ps(x.second, y.second, z.second)};
 #endif
 }
 
-inline fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z) {
+fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z) {
 #if __AVX512F__
-  return _mm512_maskz_fmsub_ps(mask, x, y, z);
+  return {_mm512_maskz_fmsub_ps(mask, x.first, y.first, z.first)};
 #else
   __m256 first, second;
   MASK_DECORATOR(_mm256_blend_ps, _mm256_setzero_ps(), _mm256_fmsub_ps(x.first, y.first, z.first), mask & 255, first);
@@ -42,33 +45,33 @@ inline fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z) {
 #endif
 }
 
-inline fp32x16 add_fp32x16(fp32x16 x, fp32x16 y) {
+fp32x16 add_fp32x16(fp32x16 x, fp32x16 y) {
 #if __AVX512F__
-  return _mm512_add_ps(x, y);
+  return {_mm512_add_ps(x.first, y.first)};
 #else
   return {_mm256_add_ps(x.first, y.first), _mm256_add_ps(x.second, y.second)};
 #endif
 }
 
-inline fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) {
+fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) {
 #if __AVX512F__
-  return _mm512_fmadd_ps(x, y, z);
+  return {_mm512_fmadd_ps(x.first, y.first, z.first)};
 #else
   return {_mm256_fmadd_ps(x.first, y.first, z.first), _mm256_fmadd_ps(x.second, y.second, z.second)};
 #endif
 }
 
-inline fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y) {
+fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y) {
 #if __AVX512F__
-  return _mm512_mul_ps(x, y);
+  return {_mm512_mul_ps(x.first, y.first)};
 #else
   return {_mm256_mul_ps(x.first, y.first), _mm256_mul_ps(x.second, y.second)};
 #endif
 }
 
-inline fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y) {
+fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y) {
 #if __AVX512F__
-  return _mm512_maskz_mul_ps(mask, x, y);
+  return {_mm512_maskz_mul_ps(mask, x.first, y.first)};
 #else
   __m256 first, second;
   MASK_DECORATOR(_mm256_blend_ps, _mm256_setzero_ps(), _mm256_mul_ps(x.first, y.first), mask & 255, first);
@@ -78,31 +81,31 @@ inline fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y) {
 }
 
 template <int rounding>
-inline fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y) {
+fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y) {
   static_assert(rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
                     rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) ||
                     rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
                     rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_CUR_DIRECTION),
                 "ERROR: Not support rounding");
 #if __AVX512F__
-  return _mm512_mul_round_ps(x, y, rounding);
+  return {_mm512_mul_round_ps(x.first, y.first, rounding)};
 #else
   return {_mm256_round_ps(_mm256_mul_ps(x.first, y.first), rounding),
           _mm256_round_ps(_mm256_mul_ps(x.second, y.second), rounding)};
 #endif
 }
 
-inline fp32x16 div_fp32x16(fp32x16 x, fp32x16 y) {
+fp32x16 div_fp32x16(fp32x16 x, fp32x16 y) {
 #if __AVX512F__
-  return _mm512_div_ps(x, y);
+  return {_mm512_div_ps(x.first, y.first)};
 #else
   return {_mm256_div_ps(x.first, y.first), _mm256_div_ps(x.second, y.second)};
 #endif
 }
 
-inline float reduce_add_fp32x16(fp32x16 x) {
+float reduce_add_fp32x16(fp32x16 x) {
 #if __AVX512F__
-  return _mm512_reduce_add_ps(x);
+  return {_mm512_reduce_add_ps(x.first)};
 #else
   const __m256 x256 = _mm256_add_ps(x.first, x.second);
   const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x256, 1), _mm256_castps256_ps128(x256));
@@ -112,46 +115,55 @@ inline float reduce_add_fp32x16(fp32x16 x) {
 #endif
 }
 
-inline fp32x16 sqrt_fp32x16(fp32x16 x) {
+fp32x16 sqrt_fp32x16(fp32x16 x) {
 #if __AVX512F__
-  return _mm512_sqrt_ps(x);
+  return {_mm512_sqrt_ps(x.first)};
 #else
   return {_mm256_sqrt_ps(x.first), _mm256_sqrt_ps(x.second)};
 #endif
 }
 
-inline fp32x16 rsqrt14_fp32x16(fp32x16 x) {
+fp32x16 rsqrt14_fp32x16(fp32x16 x) {
 #if __AVX512F__
-  return _mm512_rsqrt14_ps(x);
+  return {_mm512_rsqrt14_ps(x.first)};
 #else
   // the max relative error is 6x than avx512
   return {_mm256_rsqrt_ps(x.first), _mm256_rsqrt_ps(x.second)};
 #endif
 }
-inline fp32x16 ceil_fp32x16(fp32x16 x) {
+fp32x16 ceil_fp32x16(fp32x16 x) {
 #if __AVX512F__
-  return _mm512_ceil_ps(x);
+  return {_mm512_ceil_ps(x.first)};
 #else
   // the max relative error is 6x than avx512
   return {_mm256_ceil_ps(x.first), _mm256_ceil_ps(x.second)};
 #endif
 }
 
-inline fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y) {
+fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y) {
 #if __AVX512F__
-  return _mm512_scalef_ps(x, y);
+  return {_mm512_scalef_ps(x.first, y.first)};
 #else
   // No intrinsic
-  assert("No intrinsic");
-  return {_mm256_rsqrt_ps(x.first), _mm256_rsqrt_ps(x.second)};
+  float* vec_x = new float[16];
+  float* vec_y = new float[16];
+  float* vec_z = new float[16];
+  store_fp32x16(vec_x, x);
+  store_fp32x16(vec_y, y);
+  for (int i = 0; i < 16; i++) vec_z[i] = vec_x[i] * exp2(vec_y[i]);
+  fp32x16 res = load_fp32x16(vec_z);
+  delete[] vec_x;
+  delete[] vec_y;
+  delete[] vec_z;
+  return res;
 #endif
 }
 
-inline float dot_fp32x16(fp32x16 x, fp32x16 y) { return reduce_add_fp32x16(mul_fp32x16(x, y)); }
+float dot_fp32x16(fp32x16 x, fp32x16 y) { return reduce_add_fp32x16(mul_fp32x16(x, y)); }
 
-inline fp32x16 abs_fp32x16(fp32x16 x) {
+fp32x16 abs_fp32x16(fp32x16 x) {
 #if __AVX512F__
-  return _mm512_abs_ps(x);
+  return {_mm512_abs_ps(x.first)};
 #else
   return {_mm256_castsi256_ps(_mm256_abs_epi32(_mm256_castps_si256(x.first))),
           _mm256_castsi256_ps(_mm256_abs_epi32(_mm256_castps_si256(x.second)))};
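
Note: the added .first/.second accesses imply that fp32x16 is a small wrapper type rather than a raw register, which is why every AVX-512 branch now brace-initializes its return value. The actual definition lives in vec_base.hpp and is not shown here; a minimal sketch under that assumption:

// Assumed layout of fp32x16 (the real definition is in vec_base.hpp, not in
// this diff): one 512-bit register with AVX-512, otherwise two 256-bit halves.
#include <immintrin.h>

struct fp32x16 {
#if __AVX512F__
  __m512 first;   // lanes 0..15
#else
  __m256 first;   // lanes 0..7
  __m256 second;  // lanes 8..15
#endif
};

Under that assumption, the new AVX2 fallback for scale_fp32x16 spills both operands to memory and computes vec_x[i] * exp2(vec_y[i]) per lane, approximating _mm512_scalef_ps, which scales each element of x by two raised to the (floored) corresponding element of y.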

intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.hpp

Lines changed: 16 additions & 16 deletions
@@ -17,50 +17,50 @@
 
 #include "vec_base.hpp"
 
-inline fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y);
+fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y);
 REGISTER_KERNEL_T(sub_fp32x16, fp32x16, fp32x16, fp32x16);
 
-inline fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z);
+fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z);
 REGISTER_KERNEL_T(fmsub_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16);
 
-inline fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z);
+fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z);
 
-inline fp32x16 add_fp32x16(fp32x16 x, fp32x16 y);
+fp32x16 add_fp32x16(fp32x16 x, fp32x16 y);
 REGISTER_KERNEL_T(add_fp32x16, fp32x16, fp32x16, fp32x16);
 
-inline fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z);
+fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z);
 REGISTER_KERNEL_T(fmadd_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16);
 
-inline fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y);
+fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y);
 REGISTER_KERNEL_T(mul_fp32x16, fp32x16, fp32x16, fp32x16);
 
-inline fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y);
+fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y);
 
 template <int rounding>
-inline fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y);
+fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y);
 
-inline fp32x16 div_fp32x16(fp32x16 x, fp32x16 y);
+fp32x16 div_fp32x16(fp32x16 x, fp32x16 y);
 REGISTER_KERNEL_T(div_fp32x16, fp32x16, fp32x16, fp32x16);
 
-inline float reduce_add_fp32x16(fp32x16 x);
+float reduce_add_fp32x16(fp32x16 x);
 REGISTER_KERNEL_T(reduce_add_fp32x16, float, fp32x16);
 
-inline fp32x16 sqrt_fp32x16(fp32x16 x);
+fp32x16 sqrt_fp32x16(fp32x16 x);
 REGISTER_KERNEL_T(sqrt_fp32x16, fp32x16, fp32x16);
 
-inline fp32x16 rsqrt14_fp32x16(fp32x16 x);
+fp32x16 rsqrt14_fp32x16(fp32x16 x);
 REGISTER_KERNEL_T(rsqrt14_fp32x16, fp32x16, fp32x16);
 
-inline fp32x16 ceil_fp32x16(fp32x16 x);
+fp32x16 ceil_fp32x16(fp32x16 x);
 REGISTER_KERNEL_T(ceil_fp32x16, fp32x16, fp32x16);
 
-inline fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y);
+fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y);
 REGISTER_KERNEL_T(scale_fp32x16, fp32x16, fp32x16, fp32x16);
 
-inline float dot_fp32x16(fp32x16 x, fp32x16 y);
+float dot_fp32x16(fp32x16 x, fp32x16 y);
 REGISTER_KERNEL_T(dot_fp32x16, float, fp32x16, fp32x16);
 
-inline fp32x16 abs_fp32x16(fp32x16 x);
+fp32x16 abs_fp32x16(fp32x16 x);
 REGISTER_KERNEL_T(abs_fp32x16, fp32x16, fp32x16);
 
 #endif // ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_
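
Note: with inline removed, each kernel has a single definition inside the cpu_vec/ne_vec library, presumably so the intrinsics are compiled only in that warning-checked target rather than in every translation unit that includes this header. An illustrative usage sketch, assuming the load_fp32x16/store_fp32x16 helpers seen in vec_arithmetic.cpp above take a plain float pointer:

// Illustrative only: add two 16-float buffers through the fp32x16 kernels.
#include "vec_arithmetic.hpp"
#include "vec_load.hpp"
#include "vec_store.hpp"

void add16(float* a, float* b, float* out) {
  fp32x16 va = load_fp32x16(a);             // assumed: fp32x16 load_fp32x16(float*)
  fp32x16 vb = load_fp32x16(b);
  store_fp32x16(out, add_fp32x16(va, vb));  // assumed: store_fp32x16(float*, fp32x16)
}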
