Skip to content

Commit 6f0a64b

Browse files
committed
remove array util inline
1 parent 4793946 commit 6f0a64b

File tree

2 files changed

+141
-113
lines changed

2 files changed

+141
-113
lines changed

cpp/fury/util/array_util.cc

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
#include "fury/util/platform.h"
22+
#include <cstdint>
23+
24+
namespace fury {
25+
#if defined(FURY_HAS_NEON)
26+
uint16_t getMaxValue(const uint16_t *arr, size_t length) {
27+
if (length == 0) {
28+
return 0; // Return 0 for empty arrays
29+
}
30+
uint16x8_t max_val = vdupq_n_u16(0); // Initialize max vector to zero
31+
32+
size_t i = 0;
33+
for (; i + 8 <= length; i += 8) {
34+
uint16x8_t current_val = vld1q_u16(&arr[i]);
35+
max_val = vmaxq_u16(max_val, current_val); // Max operation
36+
}
37+
38+
// Find the max value in the resulting vector
39+
uint16_t temp[8];
40+
vst1q_u16(temp, max_val);
41+
uint16_t max_neon = temp[0];
42+
for (int j = 1; j < 8; j++) {
43+
if (temp[j] > max_neon) {
44+
max_neon = temp[j];
45+
}
46+
}
47+
48+
// Handle remaining elements
49+
for (; i < length; i++) {
50+
if (arr[i] > max_neon) {
51+
max_neon = arr[i];
52+
}
53+
}
54+
return max_neon;
55+
}
56+
57+
void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
58+
size_t i = 0;
59+
for (; i + 7 < length; i += 8) {
60+
uint16x8_t src = vld1q_u16(&from[i]);
61+
uint8x8_t result = vmovn_u16(src);
62+
vst1_u8(&to[i], result);
63+
}
64+
65+
// Fallback for the remainder
66+
for (; i < length; ++i) {
67+
to[i] = static_cast<uint8_t>(from[i]);
68+
}
69+
}
70+
#elif defined(FURY_HAS_SSE2)
71+
// Returns the largest element of arr[0..length), or 0 when length is 0.
//
// Full groups of eight values are reduced with SSE2 vector operations; the
// remaining (length % 8) elements are handled by a scalar loop.
//
// Only SSE2 instructions are used here. The unsigned 16-bit maximum
// instruction (_mm_max_epu16) belongs to SSE4.1, so it must not appear in a
// code path that is selected by an SSE2-only feature check.
uint16_t getMaxValue(const uint16_t *arr, size_t length) {
  if (length == 0) {
    return 0; // Return 0 for empty arrays
  }

  // Per-lane running maximum; zero is the identity for an unsigned max.
  __m128i max_val = _mm_setzero_si128();

  size_t i = 0;
  for (; i + 8 <= length; i += 8) {
    const __m128i current_val =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(&arr[i]));
    // SSE2-only unsigned max: the saturating subtraction yields
    // max(max_val - current_val, 0), and adding current_val back gives
    // max(max_val, current_val). The sum never exceeds 0xFFFF, so the
    // wrapping add cannot overflow.
    max_val = _mm_add_epi16(current_val, _mm_subs_epu16(max_val, current_val));
  }

  // Spill the eight lanes and reduce them to a single scalar.
  uint16_t temp[8];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(temp), max_val);
  uint16_t max_sse = temp[0];
  for (int j = 1; j < 8; j++) {
    if (temp[j] > max_sse) {
      max_sse = temp[j];
    }
  }

  // Handle remaining elements
  for (; i < length; i++) {
    if (arr[i] > max_sse) {
      max_sse = arr[i];
    }
  }
  return max_sse;
}
102+
103+
// Copies `length` elements from `from` into `to`, keeping only the low byte
// of every 16-bit value. Eight elements at a time are processed with SSE2;
// the remainder is converted with a plain cast.
void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
  // Clearing the high byte first keeps the saturating pack below from
  // clamping values above 0xFF to 0xFF.
  const __m128i low_byte_mask = _mm_set1_epi16(0xFF);

  size_t pos = 0;
  while (pos + 7 < length) {
    const __m128i wide =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(from + pos));
    const __m128i masked = _mm_and_si128(wide, low_byte_mask);
    const __m128i packed = _mm_packus_epi16(masked, masked);
    // Only the lower 64 bits (eight output bytes) are written.
    _mm_storel_epi64(reinterpret_cast<__m128i *>(to + pos), packed);
    pos += 8;
  }

  // Scalar tail for the last (length % 8) elements.
  while (pos < length) {
    to[pos] = static_cast<uint8_t>(from[pos]);
    ++pos;
  }
}
118+
#else
119+
// Portable fallback: returns the largest element of arr[0..length), or 0
// when length is 0.
uint16_t getMaxValue(const uint16_t *arr, size_t length) {
  if (length == 0) {
    return 0;
  }

  // Seed with the first element, then scan the rest.
  uint16_t best = arr[0];
  const uint16_t *const end = arr + length;
  for (const uint16_t *cur = arr + 1; cur != end; ++cur) {
    if (best < *cur) {
      best = *cur;
    }
  }
  return best;
}
131+
132+
// Portable fallback (no SSE2/NEON): copies `length` elements from `from`
// into `to`, narrowing each 16-bit value to 8 bits with a plain cast.
void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
  const uint16_t *src = from;
  uint8_t *dst = to;
  for (size_t remaining = length; remaining != 0; --remaining) {
    *dst++ = static_cast<uint8_t>(*src++);
  }
}
138+
#endif
139+
} // namespace fury

cpp/fury/util/array_util.h

Lines changed: 2 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -22,118 +22,7 @@
2222
#include <cstdint>
2323

2424
namespace fury {
25-
#if defined(FURY_HAS_NEON)
26-
inline uint16_t getMaxValue(const uint16_t *arr, size_t length) {
27-
if (length == 0) {
28-
return 0; // Return 0 for empty arrays
29-
}
30-
uint16x8_t max_val = vdupq_n_u16(0); // Initialize max vector to zero
25+
// Returns the largest value among the first `length` elements of `arr`,
// or 0 when `length` is 0.
uint16_t getMaxValue(const uint16_t *arr, size_t length);
3126

32-
size_t i = 0;
33-
for (; i + 8 <= length; i += 8) {
34-
uint16x8_t current_val = vld1q_u16(&arr[i]);
35-
max_val = vmaxq_u16(max_val, current_val); // Max operation
36-
}
37-
38-
// Find the max value in the resulting vector
39-
uint16_t temp[8];
40-
vst1q_u16(temp, max_val);
41-
uint16_t max_neon = temp[0];
42-
for (int j = 1; j < 8; j++) {
43-
if (temp[j] > max_neon) {
44-
max_neon = temp[j];
45-
}
46-
}
47-
48-
// Handle remaining elements
49-
for (; i < length; i++) {
50-
if (arr[i] > max_neon) {
51-
max_neon = arr[i];
52-
}
53-
}
54-
return max_neon;
55-
}
56-
57-
inline void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
58-
size_t i = 0;
59-
for (; i + 7 < length; i += 8) {
60-
uint16x8_t src = vld1q_u16(&from[i]);
61-
uint8x8_t result = vmovn_u16(src);
62-
vst1_u8(&to[i], result);
63-
}
64-
65-
// Fallback for the remainder
66-
for (; i < length; ++i) {
67-
to[i] = static_cast<uint8_t>(from[i]);
68-
}
69-
}
70-
#elif defined(FURY_HAS_SSE2)
71-
inline uint16_t getMaxValue(const uint16_t *arr, size_t length) {
72-
if (length == 0) {
73-
return 0; // Return 0 for empty arrays
74-
}
75-
76-
__m128i max_val = _mm_setzero_si128(); // Initialize max vector with zeros
77-
78-
size_t i = 0;
79-
for (; i + 8 <= length; i += 8) {
80-
__m128i current_val = _mm_loadu_si128((__m128i *)&arr[i]);
81-
max_val = _mm_max_epu16(max_val, current_val); // Max operation
82-
}
83-
84-
// Find the max value in the resulting vector
85-
uint16_t temp[8];
86-
_mm_storeu_si128((__m128i *)temp, max_val);
87-
uint16_t max_sse = temp[0];
88-
for (int j = 1; j < 8; j++) {
89-
if (temp[j] > max_sse) {
90-
max_sse = temp[j];
91-
}
92-
}
93-
94-
// Handle remaining elements
95-
for (; i < length; i++) {
96-
if (arr[i] > max_sse) {
97-
max_sse = arr[i];
98-
}
99-
}
100-
return max_sse;
101-
}
102-
103-
inline void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
104-
size_t i = 0;
105-
__m128i mask = _mm_set1_epi16(0xFF); // Mask to zero out the high byte
106-
for (; i + 7 < length; i += 8) {
107-
__m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&from[i]));
108-
__m128i result = _mm_and_si128(src, mask);
109-
_mm_storel_epi64(reinterpret_cast<__m128i *>(&to[i]),
110-
_mm_packus_epi16(result, result));
111-
}
112-
113-
// Fallback for the remainder
114-
for (; i < length; ++i) {
115-
to[i] = static_cast<uint8_t>(from[i]);
116-
}
117-
}
118-
#else
119-
inline uint16_t getMaxValue(const uint16_t *arr, size_t length) {
120-
if (length == 0) {
121-
return 0; // Return 0 for empty arrays
122-
}
123-
uint16_t max_val = arr[0];
124-
for (size_t i = 1; i < length; i++) {
125-
if (arr[i] > max_val) {
126-
max_val = arr[i];
127-
}
128-
}
129-
return max_val;
130-
}
131-
132-
inline void copyArray(const uint16_t *from, uint8_t *to, size_t length) {
133-
// Fallback for systems without SSE2/NEON
134-
for (size_t i = 0; i < length; ++i) {
135-
to[i] = static_cast<uint8_t>(from[i]);
136-
}
137-
}
138-
#endif
27+
// Copies `length` elements from `from` into `to`, narrowing each 16-bit
// value to 8 bits; only the low byte of every element is kept.
void copyArray(const uint16_t *from, uint8_t *to, size_t length);
13928
} // namespace fury

0 commit comments

Comments
 (0)