Skip to content

Commit 2f616ec

Browse files
authored
Merge pull request #73 from thecppzoo/jp/arm-strlen
ARM Strlen from Godbolt
2 parents a648b99 + 9b5cf33 commit 2f616ec

File tree

4 files changed

+69
-1
lines changed

4 files changed

+69
-1
lines changed

benchmark/atoi-corpus.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,21 @@ struct CorpusStringLength {
127127
#define AVX2_STRLEN_CORPUS_X_LIST /* nothing */
128128
#endif
129129

130+
// NEON_STRLEN_CORPUS_X_LIST contributes the X(ZOO_NEON, zoo::neon_strlen)
// entry to the strlen X-macro list when ZOO_CONFIGURED_TO_USE_NEON() is
// nonzero, and expands to nothing otherwise.
#if ZOO_CONFIGURED_TO_USE_NEON()
131+
#define NEON_STRLEN_CORPUS_X_LIST \
132+
X(ZOO_NEON, zoo::neon_strlen)
133+
#else
134+
#define NEON_STRLEN_CORPUS_X_LIST /* nothing */
135+
#endif
136+
137+
130138
// X-macro list of strlen implementations: each entry has the form
// X(Typename, FunctionToCall). The platform-conditional AVX2 and NEON
// sub-lists are appended at the end; each of them may expand to nothing.
#define STRLEN_CORPUS_X_LIST \
131139
X(LIBC_STRLEN, strlen) \
132140
X(ZOO_STRLEN, zoo::c_strLength) \
133141
X(ZOO_NATURAL_STRLEN, zoo::c_strLength_natural) \
134142
X(GENERIC_GLIBC_STRLEN, STRLEN_old) \
135-
AVX2_STRLEN_CORPUS_X_LIST
143+
AVX2_STRLEN_CORPUS_X_LIST \
144+
NEON_STRLEN_CORPUS_X_LIST
136145

137146
// For each X(Typename, FunctionToCall) entry, defines a functor struct named
// Invoke<Typename> whose call operator passes the C string to FunctionToCall
// and returns its result.
// NOTE(review): operator() returns int, while e.g. zoo::neon_strlen is declared
// to return std::size_t, so the result is narrowed here -- confirm this is
// intended before relying on lengths that do not fit in an int.
#define X(Typename, FunctionToCall) \
138147
struct Invoke##Typename { int operator()(const char *p) { return FunctionToCall(p); } };

benchmark/atoi.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,3 +325,52 @@ STRLEN_old (const char *str)
325325
}
326326
}
327327
}
328+
329+
330+
#if ZOO_CONFIGURED_TO_USE_NEON()
331+
332+
#include <arm_neon.h>
333+
334+
namespace zoo {
335+
336+
/// \brief Computes the length of the null-terminated string \p str with NEON,
/// examining one 16-byte block per iteration.
///
/// The first block comes from blockAlignedLoad (defined elsewhere); from its use
/// here it is expected to fill the vector with the block that contains \p str and
/// to report that block's base address together with how many bytes \p str lies
/// past that base. The lanes that precede \p str are masked out so that a zero
/// byte before the string is never mistaken for the terminator.
///
/// \note uses the key technique of shifting by 4 and narrowing from 16 to 8 bit lanes in
/// aarch64/strlen.S at
/// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/aarch64/strlen.S;h=ab2a576cdb5665e596b791299af3f4abecb73c0e;hb=HEAD
///
/// \param str pointer to the first character of a null-terminated string
/// \return the number of characters that precede the terminating null byte
std::size_t neon_strlen(const char *str) {
    const uint8x16_t zero = vdupq_n_u8(0);
    uint8x16_t data;
    auto [alignedBase, misalignment] = blockAlignedLoad(str, &data);

    // Condenses the 16 per-byte comparison results of `data` against zero into
    // a 64 bit mask that holds one nibble per input byte.
    auto compareAndConvertResultsToNibbles = [&]() {
        // Every byte lane of cmp is 0xFF where data holds a null, 0x00 elsewhere.
        auto cmp = vceqq_u8(data, zero);
        // Seen as 8 lanes of 16 bits, each lane packs two neighbouring byte
        // results. Shifting each 16 bit lane right by 4 and keeping only its
        // low 8 bits (the "narrowing") leaves one nibble from each of the two
        // bytes, so 16 byte results fit in 64 bits. With the little-endian
        // lane layout, nibble i of the mask corresponds to byte i of the block.
        // The reinterpret is explicit so that the code does not depend on the
        // compiler accepting lax conversions between vector types.
        auto straddle8bitLanePairAndNarrowToBytes =
            vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
        return vget_lane_u64(vreinterpret_u64_u8(straddle8bitLanePairAndNarrowToBytes), 0);
    };

    auto nibbles = compareAndConvertResultsToNibbles();
    // Discard the results of the lanes located before the start of the string.
    auto misalignmentNibbleMask = (~uint64_t(0)) << (misalignment * 4);
    nibbles &= misalignmentNibbleMask;
    for(;;) {
        if(nibbles) {
            // The mask is 64 bits wide: the 64 bit count-trailing-zeros is
            // required, the 32 bit __builtin_ctz would drop the nibbles of
            // bytes 8..15 and be undefined when only those are set.
            auto trailingZeroBits = __builtin_ctzll(nibbles);
            // Four mask bits per byte of input.
            auto nonNullByteCount = trailingZeroBits / 4;
            return alignedBase + nonNullByteCount - str;
        }
        // No terminator in this block, move on to the next 16 bytes.
        alignedBase += sizeof(uint8x16_t);
        memcpy(&data, alignedBase, sizeof(uint8x16_t));
        nibbles = compareAndConvertResultsToNibbles();
    }
}
374+
375+
}
376+
#endif

benchmark/atoi.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ std::size_t c_strLength_natural(const char *s);
1818
std::size_t avx2_strlen(const char* str);
1919
#endif
2020

21+
/// NEON implementation of strlen; declared only when
/// ZOO_CONFIGURED_TO_USE_NEON() is nonzero.
/// \param str pointer to a null-terminated string
/// \return the length of the string as std::size_t
#if ZOO_CONFIGURED_TO_USE_NEON()
22+
std::size_t neon_strlen(const char* str);
23+
#endif
24+
2125
}
2226

2327
std::size_t

inc/zoo/pp/platform.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
#define ZOO_CONFIGURED_TO_USE_AVX() 0
88
#endif
99

10+
// ZOO_CONFIGURED_TO_USE_NEON() expands to 1 when the compiler defines a NEON
// feature macro (__ARM_NEON or __ARM_NEON__) and the target is 64 bit ARM
// (__aarch64__ or _M_ARM64); it expands to 0 otherwise. It is a function-like
// macro so that it can be tested with #if even when the feature is absent.
#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64))
11+
#define ZOO_CONFIGURED_TO_USE_NEON() 1
12+
#else
13+
#define ZOO_CONFIGURED_TO_USE_NEON() 0
14+
#endif
15+
1016
#ifdef _MSC_VER
1117
#define MSVC_EMPTY_BASES __declspec(empty_bases)
1218
#else

0 commit comments

Comments
 (0)