Skip to content

Commit 616fedd

Browse files
committed
Improvement in strlen, SIMD strlen, improved CMake to support AVX in Xcode
1 parent 51db77c commit 616fedd

File tree

7 files changed

+94
-20
lines changed

7 files changed

+94
-20
lines changed

benchmark/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,18 @@ if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo")
1515
endif()
1616
endif()
1717

18+
# set_xcode_properties(<target>)
#
# When the project is generated for Xcode, sets the AVX-related Xcode
# attributes on <target> and passes "-mavx -mavx2" as its extra C and C++
# compiler flags.  With any other generator the call does nothing.
#
# Kept as a macro (not a function), so it runs in the caller's scope; no
# variables are set here, so nothing leaks into that scope.
macro(set_xcode_properties TARGET_NAME)
    if("${CMAKE_GENERATOR}" STREQUAL "Xcode")
        set_property(TARGET ${TARGET_NAME} PROPERTY
            XCODE_ATTRIBUTE_ENABLE_AVX YES)
        set_property(TARGET ${TARGET_NAME} PROPERTY
            XCODE_ATTRIBUTE_ENABLE_AVX2 YES)
        set_property(TARGET ${TARGET_NAME} PROPERTY
            XCODE_ATTRIBUTE_OTHER_CPLUSPLUSFLAGS "-mavx -mavx2")
        set_property(TARGET ${TARGET_NAME} PROPERTY
            XCODE_ATTRIBUTE_OTHER_CFLAGS "-mavx -mavx2")
    endif()
endmacro()
29+
1830
include_directories(
1931
../test/inc
2032
dependencies/google_benchmark/include
@@ -36,14 +48,17 @@ add_executable(
3648
egyptian.cpp
3749
# RobinHood.benchmark.cpp
3850
)
51+
set_xcode_properties(catch2Benchmark)
3952

4053
add_executable(
4154
zoo-google-benchmark benchmark_main.cpp cfs.cpp cfs/cfs_utility.cpp
4255
)
56+
set_xcode_properties(zoo-google-benchmark)
4357

4458
add_executable(
4559
zoo-atoi-benchmark benchmark_main.cpp bm-swar.cpp atoi.cpp
4660
)
61+
set_xcode_properties(zoo-atoi-benchmark)
4762

4863
target_link_libraries(zoo-google-benchmark benchmark::benchmark)
4964
target_link_libraries(zoo-atoi-benchmark benchmark::benchmark)

benchmark/atoi-corpus.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,10 @@ struct CorpusStringLength {
121121

122122
#define STRLEN_CORPUS_X_LIST \
123123
X(LIBC_STRLEN, strlen) \
124-
X(ZOO_NATURAL_STRLEN, zoo::c_strLength) \
125-
X(ZOO_MANUAL_STRLEN, zoo::c_strLength_ManualComparison) \
124+
X(ZOO_STRLEN, zoo::c_strLength) \
125+
X(ZOO_NATURAL_STRLEN, zoo::c_strLength_natural) \
126+
X(ZOO_MANUAL_STRLEN, zoo::c_strLength_manualComparison) \
127+
X(ZOO_AVX, zoo::avx2_strlen) \
126128
X(GENERIC_GLIBC_STRLEN, STRLEN_old)
127129

128130
#define X(Typename, FunctionToCall) \

benchmark/atoi.cpp

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include "zoo/swar/SWAR.h"
22
#include "zoo/swar/associative_iteration.h"
33

4+
#include <immintrin.h>
5+
46
#include <stdint.h>
57
#include <string.h>
68
#include <stdlib.h>
@@ -53,10 +55,36 @@ auto lemire_as_zoo_swar(const char *chars) {
5355
namespace zoo {
5456

5557
std::size_t c_strLength(const char *s) {
56-
std::size_t rv = 0;
58+
using S = swar::SWAR<8, std::size_t>;
59+
constexpr auto
60+
MSBs = S{S::MostSignificantBit},
61+
Ones = S{S::LeastSignificantBit};
62+
S bytes;
63+
for(auto base = s;; base += 8) {
64+
memcpy(&bytes.m_v, base, 8);
65+
auto firstNullTurnsOnMSB = bytes - Ones;
66+
// The first lane with a null will borrow and turn its MSB on when one is
67+
// subtracted from it.
68+
// The borrowing from the first null interferes with the subsequent
69+
// lanes, that's why we focus on the first null.
70+
// The lanes previous to the first null might keep their MSB on after
71+
// subtracting one (if their value is greater than 0x80).
72+
// This provides a way to detect the first null: It is the first lane
73+
// in firstNullTurnsOnMSB that "flipped on" its MSB
74+
auto cheapestInversionOfMSBs = ~bytes;
75+
auto firstMSBsOnIsFirstNull =
76+
firstNullTurnsOnMSB & cheapestInversionOfMSBs;
77+
auto onlyMSBs = zoo::swar::convertToBooleanSWAR(firstMSBsOnIsFirstNull);
78+
if(onlyMSBs) { // there is a null!
79+
auto firstNullIndex = onlyMSBs.lsbIndex();
80+
return firstNullIndex + (base - s);
81+
}
82+
}
83+
}
84+
85+
std::size_t c_strLength_natural(const char *s) {
5786
using S = swar::SWAR<8, std::size_t>;
5887
S bytes;
59-
constexpr auto MSBs = S{S::MostSignificantBit};
6088
for(auto base = s;; base += 8) {
6189
memcpy(&bytes.m_v, base, 8);
6290
auto nulls = zoo::swar::equals(bytes, S{0});
@@ -67,29 +95,52 @@ std::size_t c_strLength(const char *s) {
6795
}
6896
}
6997

70-
std::size_t c_strLength_ManualComparison(const char *s) {
71-
std::size_t rv = 0;
98+
std::size_t c_strLength_manualComparison(const char *s) {
7299
using S = swar::SWAR<8, std::size_t>;
73100
S bytes;
74101
constexpr auto MSBs = S{S::MostSignificantBit};
75102
for(auto base = s;; base += 8) {
76103
memcpy(&bytes.m_v, base, 8);
77104
// A null byte is detected in two steps:
78105
// 1. it has the MSB off, and
79-
// the least significant bits are also off.
106+
// 2. the least significant bits are also off.
80107
// The swar library allows the detection of lsbs off
81108
// By comparing greater equal to 0,
82109
// 0 can only be greater-equal to a byte with LSBs 0
83110
auto haveMSB_cleared = bytes ^ MSBs;
84111
auto lsbNulls = zoo::swar::greaterEqual_MSB_off(S{0}, bytes & ~MSBs);
85-
auto nulls = swar::asBooleanSWAR(haveMSB_cleared & lsbNulls);
86-
if(nulls) {
112+
auto nulls = haveMSB_cleared & lsbNulls;
113+
if(nulls.value()) {
87114
auto firstNullIndex = nulls.lsbIndex();
88115
return firstNullIndex + (base - s);
89116
}
90117
}
91118
}
92119

120+
/// \brief Computes the length of a null-terminated string with AVX2,
/// examining 32 bytes per iteration.
///
/// The string is scanned in 32-byte blocks whose addresses are multiples of
/// 32.  Such a block never straddles a page boundary, so the scan cannot
/// fault on memory past the terminator, which a 32-byte read starting at an
/// arbitrary address could.  The bytes of the first block that precede
/// \p str are loaded but removed from the comparison result.
///
/// The target attribute lets this function compile even when AVX2 is not
/// enabled for the whole translation unit; it must still only be executed on
/// a CPU that supports AVX2.
///
/// \param str pointer to a null-terminated string
/// \return the number of bytes that precede the first null byte
__attribute__((target("avx2")))
size_t avx2_strlen(const char* str) {
    constexpr uintptr_t BlockSize = sizeof(__m256i);
    const __m256i zero = _mm256_setzero_si256();

    const auto start = reinterpret_cast<uintptr_t>(str);
    // Round the address down to the enclosing aligned block.
    auto block = start & ~(BlockSize - 1);
    // Only the first block can contain bytes that are not part of the string.
    auto bytesBeforeStart = static_cast<unsigned>(start - block);

    for (;; block += BlockSize, bytesBeforeStart = 0) {
        const __m256i data =
            _mm256_load_si256(reinterpret_cast<const __m256i *>(block));
        // Bit i of the mask is set when byte i of the block is '\0'.
        const auto nullBytes = static_cast<unsigned>(
            _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, zero))
        );
        // Discard the matches that lie before the start of the string.
        const auto nullsInString =
            (nullBytes >> bytesBeforeStart) << bytesBeforeStart;
        if (nullsInString) {
            // The lowest set bit is the first terminator at or after str.
            return static_cast<size_t>(
                block + __builtin_ctz(nullsInString) - start
            );
        }
    }
}
143+
93144
}
94145

95146
/// \brief This is the last non-platform specific "generic" strlen in GLibC.

benchmark/atoi.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ uint32_t lemire_as_zoo_swar(const char *chars);
77
namespace zoo {
88

99
std::size_t c_strLength(const char *s);
10-
std::size_t c_strLength_ManualComparison(const char *s);
10+
std::size_t c_strLength_natural(const char *s);
11+
std::size_t c_strLength_manualComparison(const char *s);
12+
std::size_t avx2_strlen(const char* str);
1113

1214
}
1315

benchmark/bm-swar.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,15 @@ void runBenchmark(benchmark::State &s) {
2727
}
2828

2929

30-
#define X(Typename, FunctionToCall) \
31-
BENCHMARK(runBenchmark<CORPUS, Invoke##Typename>);
32-
33-
#define CORPUS Corpus8DecimalDigits
30+
#define X(Typename, _) \
31+
BENCHMARK(runBenchmark<Corpus8DecimalDigits, Invoke##Typename>);
3432
PARSE8BYTES_CORPUS_X_LIST
35-
#undef CORPUS
33+
#undef X
3634

37-
#define CORPUS CorpusStringLength
35+
#define X(Typename, _) \
36+
BENCHMARK(runBenchmark<CorpusStringLength, Invoke##Typename>);
3837
STRLEN_CORPUS_X_LIST
39-
#undef CORPUS
40-
4138
#undef X
39+
40+
using RepeatZooStrlen = InvokeZOO_STRLEN;
41+
BENCHMARK(runBenchmark<CorpusStringLength, RepeatZooStrlen>);

benchmark/catch2swar-demo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") {
3232
#undef X
3333
REQUIRE(fromLemire == fromZoo);
3434
REQUIRE(fromLIBC == fromZoo);
35+
REQUIRE(fromZOO_STRLEN == fromLIBC_STRLEN);
3536
REQUIRE(fromLIBC_STRLEN == fromZOO_NATURAL_STRLEN);
3637
REQUIRE(fromZOO_NATURAL_STRLEN == fromZOO_MANUAL_STRLEN);
3738
REQUIRE(fromGENERIC_GLIBC_STRLEN == fromZOO_NATURAL_STRLEN);
39+
REQUIRE(fromZOO_AVX == fromZOO_STRLEN);
3840

3941
auto haveTheRoleOfMemoryBarrier = -1;
4042
#define X(Type, Fun) \

inc/zoo/swar/SWAR.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,12 +365,14 @@ struct BooleanSWAR: SWAR<NBits, T> {
365365

366366
template<int NB, typename TT>
367367
friend constexpr BooleanSWAR<NB, TT>
368-
asBooleanSWAR(SWAR<NB, TT> arg) noexcept;
368+
convertToBooleanSWAR(SWAR<NB, TT> arg) noexcept;
369369
};
370370

371371
template<int NBits, typename T>
372372
constexpr BooleanSWAR<NBits, T>
373-
asBooleanSWAR(SWAR<NBits, T> arg) noexcept { return arg; }
373+
convertToBooleanSWAR(SWAR<NBits, T> arg) noexcept {
374+
return SWAR<NBits, T>{SWAR<NBits, T>::MostSignificantBit} & arg;
375+
}
374376

375377
template<int N, int NBits, typename T>
376378
constexpr BooleanSWAR<NBits, T>

0 commit comments

Comments
 (0)