From e8c53fabc7f34a71efa17b1f9f00a9af60895d5c Mon Sep 17 00:00:00 2001 From: Eddie Date: Thu, 4 Jul 2024 02:11:52 +0100 Subject: [PATCH 1/4] Draft of string comparison --- benchmark/c_str-functions/c_str.cpp | 127 ++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/benchmark/c_str-functions/c_str.cpp b/benchmark/c_str-functions/c_str.cpp index 645f275..7ed8978 100644 --- a/benchmark/c_str-functions/c_str.cpp +++ b/benchmark/c_str-functions/c_str.cpp @@ -554,3 +554,130 @@ std::size_t neon_strlen(const char *str) { } #endif + +namespace zoo { + +//template +int c_strCmp(const char *a, const char *b) { + using S = swar::SWAR<8>; + S as, bs; + auto [aB, aM] = blockAlignedLoad(a, &as); + auto [bB, bM] = blockAlignedLoad(b, &bs); + + auto misalignmentDifference = aM - bM; + // to establish the loop invariant there is the need to fill the + // bytes of the blocks that do not belong to the inputs. + // the bytes that do not belong are those up to the misalignment. + // let's say: + // **NOTE: THE DIAGRAMS ARE IN LITTLE ENDIAN!** + // [ a0, a1, a2, a3, a4, a5, a6, a7 ] + // ^ ^ misalignment = 3 + // | base of A + // [ b0, b1, b2, b3, b4, b5, b6, b7 ] + // ^ ^ misalignment of b = 2 + // | base of B + // The bytes that really belong to A are + // [ ?, ?, ?, a3, a4, a5, a6, a7 ] + // To avoid the first three bytes interfering in the comparison, we + // fill them with lanes of all ones: + // [ ~0, ~0, ~0, a3, a4, a5, a6, a7 ], for this, we do this: + // [ 0, 0, 0, ~0, ..., ~0 ] = S{S::AllOnes}.shiftLanesLeft(3) = SLL + // [ ~0, ~0, ~0, 0, ..., 0 ] = ~SLL + // [ ~0, ~0, ~0, a3, a4, a5, a6, a7 ] = ASL | ~SLL + // now, we can use all the bytes in as. + // We need to do something similar for bs, but because bs is less misaligned + // we will process the bytes we can in this iteration, but we have to + // leave a remainder: + // [ 0, b0, b1, b2, b3, b4, b5, b6 ] = bs.shiftLanesLeft(3 - 2) = BSL + // [~0, ~0, ~0, b2, b3, b4, b5, b6 ] = BSL | ~SLL + // [ ?, ?, ?, ?, ?, ?, ?, b6 ] = remainder for the next iteration + + // The prefix mma means "more mis-aligned", lma "less mis-aligned" + const char *mmaBase, *lmaBase; + S mmaBytes, lmaBytes, lmaRemainder; + int returnMultiplier; + auto loopInvariantMaker = + [&]( + auto largerMisalignment, + auto mmaBa, auto mmaBy, auto lmaBa, auto lmaBy, + int reM + ) { + // a is more misaligned than b, a provides less bytes + auto initialFiller = + ~S{S::AllOnes}.shiftLanesLeft(largerMisalignment); + mmaBase = mmaBa; + mmaBytes = mmaBy | initialFiller; + lmaBase = lmaBa; + auto lmaAdjusted = lmaBy.shiftLanesLeft(misalignmentDifference); + lmaBytes = lmaAdjusted | initialFiller; + lmaRemainder = + lmaBy | + S{S::AllOnes}.shiftLanesRight(S::Lanes - misalignmentDifference); + returnMultiplier = reM; + }; + if(0 <= misalignmentDifference) { + loopInvariantMaker(aM, aB, as, bB, bs, 1); + } else { + misalignmentDifference = -misalignmentDifference; + loopInvariantMaker(bM, bB, bs, aB, as, -1); + } + auto nulls = [](S bytes) { + return swar::constantIsGreaterEqual<0>(bytes); + }; + for(;;) { + // invariant: + // 1. ready to compare mmaBytes with lmaBytes + // 2. there is at least one byte of input in both mmaBytes and lmaBytes + // 3. mmaBytes and lmaBytes are equal + // 4. There is no null in the bytes + // 5. There is no null in the significant bytes in the remainder + // Step 1: determine if the swars are different + auto exor = mmaBytes ^ lmaBytes; + if(!exor.value()) { + // There is a difference. Will terminate + // There are several cases. + // Is any string terminated? + auto + mNulls = nulls(mmaBytes), + lNulls = nulls(lmaBytes); + auto thereIsANull = mNulls | lNulls; + auto returner = + [&](S s) { + auto firstNullIndex = s.lsbIndex(); + return returnMultiplier * ( + mmaBytes - lmaBytes + ).shiftLanesRight(firstNullIndex).value(); + }; + if(thereIsANull) { + return returner(thereIsANull); + } + auto diffs = swar::constantIsGreaterEqual<0>(exor); + return returner(diffs); + } + // despite equality, we might have reached the end of the strings, + // this needs to be tested explicitly + if(nulls(mmaBytes)) { return 0; } + // preparation of the next iteration, grab a block from mmaBase + mmaBase += sizeof(S); + memcpy(&mmaBytes.m_v, mmaBase, sizeof(S)); + // there can be a null in the lmaRemainder, thus we can't just + // load more bytes + if(nulls(lmaRemainder)) { + // prepare the next iteration knowing it will terminate: + lmaBytes = + lmaRemainder.shiftLanesRight(S::Lanes - misalignmentDifference); + continue; + } + lmaBase += sizeof(S); + auto remShifted = + lmaRemainder.shiftLanesRight(S::Lanes - misalignmentDifference); + memcpy(&lmaRemainder.m_v, lmaBase, sizeof(S)); + auto newBytes = lmaRemainder.shiftLanesLeft(misalignmentDifference); + lmaBytes = remShifted | newBytes; + // note: if there are nulls in the lmaRemainder part that was + // copied to newBytes, they will be compared against mmaBytes + // and thus taken into account + } +} + +} From 05beda80dabee3e7ba1e76d6aa194ce74d77838e Mon Sep 17 00:00:00 2001 From: Eddie Date: Thu, 4 Jul 2024 10:16:27 +0100 Subject: [PATCH 2/4] Corrects known return val issue --- benchmark/c_str-functions/c_str.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmark/c_str-functions/c_str.cpp b/benchmark/c_str-functions/c_str.cpp index 7ed8978..d3c1e37 100644 --- a/benchmark/c_str-functions/c_str.cpp +++ b/benchmark/c_str-functions/c_str.cpp @@ -644,9 +644,11 @@ int c_strCmp(const char *a, const char *b) { auto returner = [&](S s) { auto firstNullIndex = s.lsbIndex(); - return returnMultiplier * ( - mmaBytes - lmaBytes - ).shiftLanesRight(firstNullIndex).value(); + auto + comparison = mmaBytes - lmaBytes, + inLeast = comparison.shiftLanesRight(firstNullIndex), + onlyLeast = inLeast & S{S::LeastSignificantLaneMask}; + return returnMultiplier * int8_t(onlyLeast.value()); }; if(thereIsANull) { return returner(thereIsANull); From 17c70e3e3c50fec25ed5bda7aa9337dcdca0d2b1 Mon Sep 17 00:00:00 2001 From: Eddie Date: Sat, 6 Jul 2024 22:06:58 +0100 Subject: [PATCH 3/4] Fixes issue with non-libc++ support for __uint128_t --- inc/zoo/swar/SWAR.h | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 60ba954..4021ae1 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -39,9 +39,34 @@ constexpr uint64_t popcount(uint64_t a) noexcept { >::execute(a); } +template +struct ToUnsigned_impl { + using type = T; +}; + +template +struct ToUnsigned_impl>> { + using type = std::make_unsigned_t; +}; + +template<> +struct ToUnsigned_impl<__int128_t, void> { + using type = __uint128_t; +}; + +template<> +struct ToUnsigned_impl<__uint128_t, void> { + using type = __uint128_t; +}; + +static_assert(std::is_same_v::type, unsigned>); + +template +using ToUnsigned = typename ToUnsigned_impl::type; + /// Index into the bits of the type T that contains the MSB. template -constexpr std::make_unsigned_t msbIndex(T v) noexcept { +constexpr ToUnsigned msbIndex(T v) noexcept { return meta::logFloor(v); } @@ -49,7 +74,7 @@ constexpr std::make_unsigned_t msbIndex(T v) noexcept { /// /// \todo incorporate __builtin_ctzg when it is more widely available template -constexpr std::make_unsigned_t lsbIndex(T v) noexcept { +constexpr ToUnsigned lsbIndex(T v) noexcept { // This check should be SFINAE, but supporting all sorts // of base types is an ongoing task, we put a bare-minimum // temporary preventive measure with static_assert @@ -77,7 +102,9 @@ constexpr __uint128_t lsbIndex(__uint128_t v) noexcept { /// Certain computational workloads can be materially sped up using SWAR techniques. template struct SWAR { - using type = std::make_unsigned_t; + using type = + // std::make_unsigned_t; + ToUnsigned; constexpr static auto Literal = Literals; constexpr static inline type NBits = NBits_, From d1d718b1bb05de6a844d7fcfcd535afb1337a241 Mon Sep 17 00:00:00 2001 From: Eddie Date: Sat, 6 Jul 2024 22:24:47 +0100 Subject: [PATCH 4/4] Windows build solution --- inc/zoo/pp/platform.h | 2 ++ inc/zoo/swar/SWAR.h | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/inc/zoo/pp/platform.h b/inc/zoo/pp/platform.h index e612066..886a4de 100644 --- a/inc/zoo/pp/platform.h +++ b/inc/zoo/pp/platform.h @@ -20,8 +20,10 @@ #endif #ifdef _MSC_VER +#define ZOO_WINDOWS_BUILD() 1 #define MSVC_EMPTY_BASES __declspec(empty_bases) #else +#define ZOO_WINDOWS_BUILD() 0 #define MSVC_EMPTY_BASES #endif diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 4021ae1..e41c887 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -3,6 +3,7 @@ /// \file Swar.h SWAR operations #include "zoo/meta/log.h" +#include "zoo/pp/platform.h" #include #include @@ -49,6 +50,7 @@ struct ToUnsigned_impl>> { using type = std::make_unsigned_t; }; +#if !ZOO_WINDOWS_BUILD() template<> struct ToUnsigned_impl<__int128_t, void> { using type = __uint128_t; @@ -58,6 +60,7 @@ template<> struct ToUnsigned_impl<__uint128_t, void> { using type = __uint128_t; }; +#endif static_assert(std::is_same_v::type, unsigned>); @@ -79,7 +82,7 @@ constexpr ToUnsigned lsbIndex(T v) noexcept { // of base types is an ongoing task, we put a bare-minimum // temporary preventive measure with static_assert static_assert(sizeof(T) <= 8, "Unsupported"); - #ifdef _MSC_VER + #if ZOO_WINDOWS_BUILD() // ~v & (v - 1) turns on all trailing zeroes, zeroes the rest return meta::logFloor(1 + (~v & (v - 1))); #else @@ -87,7 +90,7 @@ constexpr ToUnsigned lsbIndex(T v) noexcept { #endif } -#ifndef _MSC_VER +#if !ZOO_WINDOWS_BUILD() constexpr __uint128_t lsbIndex(__uint128_t v) noexcept { auto low = (v << 64) >> 64; if(low) { return __builtin_ctzll(low); }