@@ -325,3 +325,52 @@ STRLEN_old (const char *str)
325325 }
326326 }
327327}
328+
329+
330+ #if ZOO_CONFIGURED_TO_USE_NEON()
331+
332+ #include < arm_neon.h>
333+
334+ namespace zoo {
335+
336+ // / \note uses the key technique of shifting by 4 and narrowing from 16 to 8 bit lanes in
337+ // / aarch64/strlen.S at
338+ // / https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/aarch64/strlen.S;h=ab2a576cdb5665e596b791299af3f4abecb73c0e;hb=HEAD
339+ std::size_t neon_strlen (const char *str) {
340+ const uint8x16_t zero = vdupq_n_u8 (0 );
341+ size_t offset = 0 ;
342+ uint8x16_t data;
343+ auto [alignedBase, misalignment] = blockAlignedLoad (str, &data);
344+
345+ auto compareAndConvertResultsToNibbles = [&]() {
346+ auto cmp = vceqq_u8 (data, zero);
347+ // The result looks like, in hexadecimal digits, like this:
348+ // [ AA, BB, CC, DD, EE, FF, GG, HH, ... ] with each
349+ // variable A, B, ... either 0xF or 0x0.
350+ // instead of 16x8 bit results, we can see that as
351+ // 8 16 bit results like this
352+ // [ AABB, CCDD, EEFF, GGHH, ... ]
353+ // If we shift out a nibble from each element (shift right by 4):
354+ // [ ABB0, CDD0, EFF0, GHH0, ... ]
355+ // Narrowing from 16 to eight, we would get
356+ // [ AB, CD, EF, GH, ... ]
357+ auto straddle8bitLanePairAndNarrowToBytes = vshrn_n_u16 (cmp, 4 );
358+ return vget_lane_u64 (vreinterpret_u64_u8 (straddle8bitLanePairAndNarrowToBytes), 0 );
359+ };
360+ auto nibbles = compareAndConvertResultsToNibbles ();
361+ auto misalignmentNibbleMask = (~uint64_t (0 )) << (misalignment * 4 );
362+ nibbles &= misalignmentNibbleMask;
363+ for (;;) {
364+ if (nibbles) {
365+ auto trailingZeroBits = __builtin_ctz (nibbles);
366+ auto nonNullByteCount = trailingZeroBits / 4 ;
367+ return alignedBase + offset + nonNullByteCount - str;
368+ }
369+ alignedBase += sizeof (uint8x16_t );
370+ memcpy (&data, alignedBase, sizeof (uint8x16_t ));
371+ nibbles = compareAndConvertResultsToNibbles ();
372+ }
373+ }
374+
375+ }
376+ #endif
0 commit comments