From 530543ab7af3d88cd9bd6fe93ca9426aaa49ff7c Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Mon, 26 Feb 2024 22:42:10 -0800 Subject: [PATCH 01/26] some progress... --- inc/zoo/swar/associative_iteration.h | 57 ++++++++++++++++++++++++++++ test/swar/BasicOperations.cpp | 14 +++++++ 2 files changed, 71 insertions(+) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index e63c2869..b08698fc 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -205,6 +205,27 @@ constexpr auto associativeOperatorIterated_regressive( return result; } +template +constexpr auto multiplication_scalar( + T multiplicand, T multiplier +) { + auto operation = [](auto left, auto right, auto counts) { + return counts ? left + right : left; + }; + + auto halver = [](auto counts) { + return counts >> 1; + }; + + multiplier = multiplier + 1; + + return associativeOperatorIterated_regressive( + multiplicand, 1, multiplier, 0, operation, + ActualBits, halver + ); +} + + template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( SWAR multiplicand, SWAR multiplier @@ -228,6 +249,31 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( ); } +template +constexpr auto expo_OverflowUnsafe_SpecificBitCount( + SWAR x, + SWAR exponent +) { + using S = SWAR; + + auto operation = [](auto left, auto right, auto counts) { + const auto product = multiplication_OverflowUnsafe_SpecificBitCount(left, right); + const auto mask = makeLaneMaskFromMSB(counts); + return (mask & product) | (left & ~mask); + }; + + auto halver = [](auto counts) { + auto msbCleared = counts & ~S{S::MostSignificantBit}; + return S{msbCleared.value() << 1}; + }; + + exponent = S{exponent.value() << (NB - ActualBits)}; + return associativeOperatorIterated_regressive( + x, S{1}, exponent, S{S::MostSignificantBit}, operation, + ActualBits, halver + ); +} + /// \note Not removed yet because it 
is an example of "progressive" associative exponentiation template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( @@ -261,6 +307,17 @@ constexpr auto multiplication_OverflowUnsafe( ); } +template +constexpr auto expo_OverflowUnsafe( + SWAR base, + SWAR exponent +) { + return + expo_OverflowUnsafe_SpecificBitCount( + base, exponent + ); +} + template struct SWAR_Pair{ SWAR even, odd; diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index af5a1016..09970e01 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -41,8 +41,22 @@ static_assert( multiplication_OverflowUnsafe_SpecificBitCount<3>(Micand, Mplier).value() ); +TEST_CASE("Jamie's expo") { + constexpr auto base = SWAR<4>{0b0010'0011}; // 2 | 3 + constexpr auto exponent = SWAR<4>{0b0011'0010}; // 3 | 2 + constexpr auto expected = SWAR<4>{0b1000'1001}; // 8 | 9 + // static_assert( + // expected.value() == expo_OverflowUnsafe(base, exponent).value() + // ); + auto actual = expo_OverflowUnsafe(base, exponent); + CHECK(expected.value() == actual.value()); + auto as_bits = std::bitset<8>(expected.value()); + printf("Expected: %s\n", as_bits.to_string().c_str()); + printf("Actual: %s\n", std::bitset<8>(actual.value()).to_string().c_str()); } +} // namespace Multiplication + #define HE(nbits, t, v0, v1) \ static_assert(horizontalEquality(\ SWAR(v0),\ From c6008427d7b18061cd551ce4728188d8fefd74c1 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Mon, 26 Feb 2024 22:49:09 -0800 Subject: [PATCH 02/26] hmmm --- inc/zoo/swar/associative_iteration.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index b08698fc..6fa838c6 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -257,9 +257,18 @@ constexpr auto expo_OverflowUnsafe_SpecificBitCount( using S = 
SWAR; auto operation = [](auto left, auto right, auto counts) { - const auto product = multiplication_OverflowUnsafe_SpecificBitCount(left, right); const auto mask = makeLaneMaskFromMSB(counts); - return (mask & product) | (left & ~mask); + const auto antiMask = ~mask; + + const auto product = multiplication_OverflowUnsafe_SpecificBitCount(left, right); + /* + * if (count) + * return product; + * else + * return left; + * */ + return (product & mask) | (left & antiMask); + }; auto halver = [](auto counts) { From 3b9f000c4d3379b78563d767cf24e6e926fe2a7d Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Mon, 26 Feb 2024 22:52:56 -0800 Subject: [PATCH 03/26] add wip from fmtgp --- inc/zoo/swar/associative_iteration.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 6fa838c6..2cb1effd 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -249,6 +249,24 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( ); } + +/* + // extended from mathematics to generic programming + + template constexpr T exp_acc(T r, T a, T n) { + for (;;) { + if (is_odd(n)) { + r = multiply(r, a); + if (n == 1) { + return r; + } + } + n = half(n); + a = multiply(a, a); + } + } +*/ + template constexpr auto expo_OverflowUnsafe_SpecificBitCount( SWAR x, @@ -268,9 +286,9 @@ constexpr auto expo_OverflowUnsafe_SpecificBitCount( * return left; * */ return (product & mask) | (left & antiMask); - }; + // halver should work same as multiplication... i think... 
auto halver = [](auto counts) { auto msbCleared = counts & ~S{S::MostSignificantBit}; return S{msbCleared.value() << 1}; From 07952f533d4f196f2750f8f045ee16b68b48525e Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Mon, 26 Feb 2024 22:54:01 -0800 Subject: [PATCH 04/26] add notes --- inc/zoo/swar/associative_iteration.h | 1 + 1 file changed, 1 insertion(+) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 2cb1effd..d6ffc389 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -252,6 +252,7 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( /* // extended from mathematics to generic programming + // see https://github.com/jamierpond/fmtgp/blob/main/2_first_algo/main.cpp template constexpr T exp_acc(T r, T a, T n) { for (;;) { From bc27994e27c5f4f5583b88718e05040bf88d90a5 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Mon, 26 Feb 2024 23:31:25 -0800 Subject: [PATCH 05/26] add explicity typing to make compiler happy --- inc/zoo/swar/associative_iteration.h | 18 +++++++++--------- test/swar/BasicOperations.cpp | 16 +++++++++------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index d6ffc389..284809fc 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -69,7 +69,7 @@ template constexpr auto makeLaneMaskFromMSB(SWAR input) { using S = SWAR; auto msb = input & S{S::MostSignificantBit}; - auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; + auto msbCopiedToLSB = S{static_cast(msb.value() >> (NB - 1))}; return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); } @@ -239,10 +239,10 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( auto halver = [](auto counts) { auto msbCleared = counts & ~S{S::MostSignificantBit}; - return 
S{msbCleared.value() << 1}; + return S{static_cast(msbCleared.value() << 1)}; }; - multiplier = S{multiplier.value() << (NB - ActualBits)}; + multiplier = S{static_cast(multiplier.value() << (NB - ActualBits))}; return associativeOperatorIterated_regressive( multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, ActualBits, halver @@ -278,24 +278,24 @@ constexpr auto expo_OverflowUnsafe_SpecificBitCount( auto operation = [](auto left, auto right, auto counts) { const auto mask = makeLaneMaskFromMSB(counts); const auto antiMask = ~mask; - - const auto product = multiplication_OverflowUnsafe_SpecificBitCount(left, right); + const auto product = + multiplication_OverflowUnsafe_SpecificBitCount(left, right); /* * if (count) * return product; - * else + * else * return left; - * */ + */ return (product & mask) | (left & antiMask); }; // halver should work same as multiplication... i think... auto halver = [](auto counts) { auto msbCleared = counts & ~S{S::MostSignificantBit}; - return S{msbCleared.value() << 1}; + return S{static_cast(msbCleared.value() << 1)}; }; - exponent = S{exponent.value() << (NB - ActualBits)}; + exponent = S{static_cast(exponent.value() << (NB - ActualBits))}; return associativeOperatorIterated_regressive( x, S{1}, exponent, S{S::MostSignificantBit}, operation, ActualBits, halver diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 09970e01..73a2e6d7 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -41,18 +41,20 @@ static_assert( multiplication_OverflowUnsafe_SpecificBitCount<3>(Micand, Mplier).value() ); -TEST_CASE("Jamie's expo") { - constexpr auto base = SWAR<4>{0b0010'0011}; // 2 | 3 - constexpr auto exponent = SWAR<4>{0b0011'0010}; // 3 | 2 - constexpr auto expected = SWAR<4>{0b1000'1001}; // 8 | 9 +TEST_CASE("Jamie's wip expo") { + // the LSB lanes seem to be correct, but the MSB lanes are not... 
+ constexpr auto base = SWAR<8, u32>{0b0001'0011}; // 2 | 3 + constexpr auto exponent = SWAR<8, u32>{0b0001'0010}; // 3 | 2 + constexpr auto expected = SWAR<8, u32>{0b0001'1001}; // 8 | 9 // static_assert( // expected.value() == expo_OverflowUnsafe(base, exponent).value() // ); auto actual = expo_OverflowUnsafe(base, exponent); CHECK(expected.value() == actual.value()); - auto as_bits = std::bitset<8>(expected.value()); - printf("Expected: %s\n", as_bits.to_string().c_str()); - printf("Actual: %s\n", std::bitset<8>(actual.value()).to_string().c_str()); + auto expected_as_bits = std::bitset<32>(expected.value()); + auto actual_as_bits = std::bitset<32>(actual.value()); + printf("expected: %s\n", expected_as_bits.to_string().c_str()); + printf("actual: %s\n", actual_as_bits.to_string().c_str()); } } // namespace Multiplication From 264dce9ed49523f92e467062e3dbceac81f5606c Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Mon, 26 Feb 2024 23:38:34 -0800 Subject: [PATCH 06/26] cleanup --- inc/zoo/swar/associative_iteration.h | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 284809fc..771a4e17 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -205,27 +205,6 @@ constexpr auto associativeOperatorIterated_regressive( return result; } -template -constexpr auto multiplication_scalar( - T multiplicand, T multiplier -) { - auto operation = [](auto left, auto right, auto counts) { - return counts ? 
left + right : left; - }; - - auto halver = [](auto counts) { - return counts >> 1; - }; - - multiplier = multiplier + 1; - - return associativeOperatorIterated_regressive( - multiplicand, 1, multiplier, 0, operation, - ActualBits, halver - ); -} - - template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( SWAR multiplicand, SWAR multiplier From 994391726d2eeb760f4a72607e4869177fed73b0 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Mon, 26 Feb 2024 23:59:37 -0800 Subject: [PATCH 07/26] neutral lane wise --- inc/zoo/meta/BitmaskMaker.h | 1 + inc/zoo/swar/associative_iteration.h | 15 ++++++++++++--- test/swar/BasicOperations.cpp | 9 +++------ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/inc/zoo/meta/BitmaskMaker.h b/inc/zoo/meta/BitmaskMaker.h index 4c8008ac..0c3f26a8 100644 --- a/inc/zoo/meta/BitmaskMaker.h +++ b/inc/zoo/meta/BitmaskMaker.h @@ -42,6 +42,7 @@ struct BitmaskMaker { static_assert(0xF0F0 == BitmaskMaker::value); static_assert(0xEDFEDFED == BitmaskMaker::value); +static_assert(0b0001'0001 == BitmaskMaker::value); }} // zoo::meta diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 771a4e17..41de35c2 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -223,7 +223,11 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( multiplier = S{static_cast(multiplier.value() << (NB - ActualBits))}; return associativeOperatorIterated_regressive( - multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, + multiplicand, + S{0}, + multiplier, + S{S::MostSignificantBit}, + operation, ActualBits, halver ); } @@ -276,8 +280,13 @@ constexpr auto expo_OverflowUnsafe_SpecificBitCount( exponent = S{static_cast(exponent.value() << (NB - ActualBits))}; return associativeOperatorIterated_regressive( - x, S{1}, exponent, S{S::MostSignificantBit}, operation, - ActualBits, halver + x, + 
S{meta::BitmaskMaker().value}, // neutral is lane wise.. + exponent, + S{S::MostSignificantBit}, + operation, + ActualBits, + halver ); } diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 73a2e6d7..6cffe595 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -43,12 +43,9 @@ static_assert( TEST_CASE("Jamie's wip expo") { // the LSB lanes seem to be correct, but the MSB lanes are not... - constexpr auto base = SWAR<8, u32>{0b0001'0011}; // 2 | 3 - constexpr auto exponent = SWAR<8, u32>{0b0001'0010}; // 3 | 2 - constexpr auto expected = SWAR<8, u32>{0b0001'1001}; // 8 | 9 - // static_assert( - // expected.value() == expo_OverflowUnsafe(base, exponent).value() - // ); + constexpr auto base = SWAR<8, u32>{0b0001'0011}; // 1 | 3 + constexpr auto exponent = SWAR<8, u32>{0b0001'0010}; // 1 | 2 + constexpr auto expected = SWAR<8, u32>{0b0001'1001}; // 1 | 9 auto actual = expo_OverflowUnsafe(base, exponent); CHECK(expected.value() == actual.value()); auto expected_as_bits = std::bitset<32>(expected.value()); From acdaa6f39af15f1b769214e29a44eeeb872c1f64 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Tue, 27 Feb 2024 19:34:54 -0800 Subject: [PATCH 08/26] wip --- test/swar/BasicOperations.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 6cffe595..a3138b5f 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -43,9 +43,9 @@ static_assert( TEST_CASE("Jamie's wip expo") { // the LSB lanes seem to be correct, but the MSB lanes are not... 
- constexpr auto base = SWAR<8, u32>{0b0001'0011}; // 1 | 3 - constexpr auto exponent = SWAR<8, u32>{0b0001'0010}; // 1 | 2 - constexpr auto expected = SWAR<8, u32>{0b0001'1001}; // 1 | 9 + constexpr auto base = SWAR<16, u32>{0b0000'0001'0011}; // 1 | 3 + constexpr auto exponent = SWAR<16, u32>{0b0000'0001'0010}; // 1 | 2 + constexpr auto expected = SWAR<16, u32>{0b0000'0001'1001}; // 1 | 9 auto actual = expo_OverflowUnsafe(base, exponent); CHECK(expected.value() == actual.value()); auto expected_as_bits = std::bitset<32>(expected.value()); From 4eace36922e0216f2916ad971e88b5b91b2f554e Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 28 Feb 2024 15:02:18 -0800 Subject: [PATCH 09/26] woop seems to be working! --- .gitignore | 2 + CMakeLists.txt | 0 inc/zoo/swar/associative_iteration.h | 94 +++++++++++++++++++++++++--- test/swar/BasicOperations.cpp | 18 +++--- 4 files changed, 96 insertions(+), 18 deletions(-) create mode 100644 CMakeLists.txt diff --git a/.gitignore b/.gitignore index c8f9a2b8..30bd14b3 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ test/.vscode build .cache +.idea +**cmake-build** diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 41de35c2..d65d496a 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -1,8 +1,16 @@ #ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H #define ZOO_SWAR_ASSOCIATIVE_ITERATION_H +#include #include "zoo/swar/SWAR.h" +// include std::cout etc + +template +void print(Args... args) { + (std::cout << ... 
<< args); +} + namespace zoo::swar { /// \note This code should be substituted by an application of "progressive" algebraic iteration @@ -191,20 +199,89 @@ template< typename CountHalver > constexpr auto associativeOperatorIterated_regressive( - Base base, Base neutral, IterationCount count, IterationCount forSquaring, - Operator op, unsigned log2Count, CountHalver ch + const Base base, // 4 + const Base neutral, // 1 + IterationCount count, // 2 + const IterationCount forSquaring, // ?? + const Operator op, // plus + unsigned log2Count, // big number + const CountHalver ch // halver ) { - auto result = neutral; - if(!log2Count) { return result; } - for(;;) { - result = op(result, base, count); + auto result = neutral; // result = 1 + if (!log2Count) { return result; } // still going + for (;;) { + result = op(result, base, count); // result = 1 + 4 + if constexpr (std::is_same_v) { + print("result1: ", result, "\n"); + } if(!--log2Count) { break; } result = op(result, result, forSquaring); + if constexpr (std::is_same_v) { + print("result2: ", result, "\n"); + } count = ch(count); } return result; } + +template +constexpr auto multiply(T a , T b) { + auto operation = [](auto left, auto right, auto count) { + if (count) { + return left + right; + } else { + return left; + } + }; + + auto halver = [](auto count) { + return count >> 1; + }; + + constexpr auto numBits = sizeof(T) * 8; + return associativeOperatorIterated_regressive( + a, // base + 0, // neutral + b, // count + 1, // forSquaring, pretty sure this is where i am not understanding + operation, // operation + numBits, // log2Count + halver // halver + ); +} + +// static_assert(multiply(2, 3) == 6); + +template +constexpr auto expo(T base, T exponent) { + + auto operation = [](auto left, auto right, auto counts) { + if (counts) { + return left * right; + } else { + return left; + } + }; + + auto halver = [](auto counts) { + return counts >> 1; + }; + + constexpr auto numBits = sizeof(T) * 8; + return 
associativeOperatorIterated_regressive( + base, + 1, + exponent, + meta::BitmaskMaker().value << 1, + operation, + numBits, + halver + ); +} + +// static_assert(expo(2, 3) == 8, "expo(2, 3) == 8"); + template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( SWAR multiplicand, SWAR multiplier @@ -228,7 +305,8 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( multiplier, S{S::MostSignificantBit}, operation, - ActualBits, halver + ActualBits, + halver ); } @@ -251,6 +329,8 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( } */ + + template constexpr auto expo_OverflowUnsafe_SpecificBitCount( SWAR x, diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index a3138b5f..4ebc08f9 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -30,7 +30,7 @@ constexpr SWAR<8, u32> Mplier{0xA050301}; // 5*0xA = 5*10 = 50 = 0x32, // 3*5 = 15 = 0xF, // 3*2 = 6, -// 1*1 = 1 +// 1*1 = 1, constexpr auto Expected = 0x320F0601; static_assert( @@ -41,17 +41,13 @@ static_assert( multiplication_OverflowUnsafe_SpecificBitCount<3>(Micand, Mplier).value() ); -TEST_CASE("Jamie's wip expo") { - // the LSB lanes seem to be correct, but the MSB lanes are not... 
- constexpr auto base = SWAR<16, u32>{0b0000'0001'0011}; // 1 | 3 - constexpr auto exponent = SWAR<16, u32>{0b0000'0001'0010}; // 1 | 2 - constexpr auto expected = SWAR<16, u32>{0b0000'0001'1001}; // 1 | 9 - auto actual = expo_OverflowUnsafe(base, exponent); +TEST_CASE("Jamie's totally working exponentiation :D") { + constexpr auto base = SWAR<8, u32>{0x02'01'05'06}; // 2 | 1 | 5 | 6 + constexpr auto exponent = SWAR<8, u32>{0x07'00'02'03}; // 7 | 0 | 2 | 3 + constexpr auto expected = SWAR<8, u32>{0x80'01'19'D8}; // 128 | 1 | 19 | 216 + constexpr auto actual = expo_OverflowUnsafe(base, exponent); + static_assert(expected.value() == actual.value()); CHECK(expected.value() == actual.value()); - auto expected_as_bits = std::bitset<32>(expected.value()); - auto actual_as_bits = std::bitset<32>(actual.value()); - printf("expected: %s\n", expected_as_bits.to_string().c_str()); - printf("actual: %s\n", actual_as_bits.to_string().c_str()); } } // namespace Multiplication From 88360cf6f39c798f9e65950ab0c09e9530ac9123 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 28 Feb 2024 15:06:49 -0800 Subject: [PATCH 10/26] tidy up --- inc/zoo/swar/associative_iteration.h | 32 +--------------------------- test/swar/BasicOperations.cpp | 4 ++-- 2 files changed, 3 insertions(+), 33 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index d65d496a..4f862976 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -225,6 +225,7 @@ constexpr auto associativeOperatorIterated_regressive( } +// What I don't understand is why this doesn't work? 
template constexpr auto multiply(T a , T b) { auto operation = [](auto left, auto right, auto count) { @@ -251,37 +252,6 @@ constexpr auto multiply(T a , T b) { ); } -// static_assert(multiply(2, 3) == 6); - -template -constexpr auto expo(T base, T exponent) { - - auto operation = [](auto left, auto right, auto counts) { - if (counts) { - return left * right; - } else { - return left; - } - }; - - auto halver = [](auto counts) { - return counts >> 1; - }; - - constexpr auto numBits = sizeof(T) * 8; - return associativeOperatorIterated_regressive( - base, - 1, - exponent, - meta::BitmaskMaker().value << 1, - operation, - numBits, - halver - ); -} - -// static_assert(expo(2, 3) == 8, "expo(2, 3) == 8"); - template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( SWAR multiplicand, SWAR multiplier diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 4ebc08f9..cecdde53 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -42,9 +42,9 @@ static_assert( ); TEST_CASE("Jamie's totally working exponentiation :D") { - constexpr auto base = SWAR<8, u32>{0x02'01'05'06}; // 2 | 1 | 5 | 6 + constexpr auto base = SWAR<8, u32>{0x02'00'05'06}; // 2 | 0 | 5 | 6 constexpr auto exponent = SWAR<8, u32>{0x07'00'02'03}; // 7 | 0 | 2 | 3 - constexpr auto expected = SWAR<8, u32>{0x80'01'19'D8}; // 128 | 1 | 19 | 216 + constexpr auto expected = SWAR<8, u32>{0x80'01'19'D8}; // 128 | 1 | 25 | 216 constexpr auto actual = expo_OverflowUnsafe(base, exponent); static_assert(expected.value() == actual.value()); CHECK(expected.value() == actual.value()); From 4801e56df8f59ead375bb004b5e1d06be2f168e4 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 28 Feb 2024 15:10:00 -0800 Subject: [PATCH 11/26] tidy --- inc/zoo/swar/associative_iteration.h | 55 +++++----------------------- test/swar/BasicOperations.cpp | 2 +- 2 files changed, 11 insertions(+), 46 deletions(-) diff --git 
a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 4f862976..2f183f77 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -1,16 +1,8 @@ #ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H #define ZOO_SWAR_ASSOCIATIVE_ITERATION_H -#include #include "zoo/swar/SWAR.h" -// include std::cout etc - -template -void print(Args... args) { - (std::cout << ... << args); -} - namespace zoo::swar { /// \note This code should be substituted by an application of "progressive" algebraic iteration @@ -199,26 +191,20 @@ template< typename CountHalver > constexpr auto associativeOperatorIterated_regressive( - const Base base, // 4 - const Base neutral, // 1 - IterationCount count, // 2 - const IterationCount forSquaring, // ?? - const Operator op, // plus - unsigned log2Count, // big number - const CountHalver ch // halver + const Base base, + const Base neutral, + IterationCount count, + const IterationCount forSquaring, + const Operator op, + unsigned log2Count, + const CountHalver ch ) { - auto result = neutral; // result = 1 - if (!log2Count) { return result; } // still going + auto result = neutral; + if (!log2Count) { return result; } for (;;) { - result = op(result, base, count); // result = 1 + 4 - if constexpr (std::is_same_v) { - print("result1: ", result, "\n"); - } + result = op(result, base, count); if(!--log2Count) { break; } result = op(result, result, forSquaring); - if constexpr (std::is_same_v) { - print("result2: ", result, "\n"); - } count = ch(count); } return result; @@ -280,27 +266,6 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( ); } - -/* - // extended from mathematics to generic programming - // see https://github.com/jamierpond/fmtgp/blob/main/2_first_algo/main.cpp - - template constexpr T exp_acc(T r, T a, T n) { - for (;;) { - if (is_odd(n)) { - r = multiply(r, a); - if (n == 1) { - return r; - } - } - n = half(n); - a = multiply(a, a); - } - } -*/ - - - template 
constexpr auto expo_OverflowUnsafe_SpecificBitCount( SWAR x, diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index cecdde53..83a8419d 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -50,7 +50,7 @@ TEST_CASE("Jamie's totally working exponentiation :D") { CHECK(expected.value() == actual.value()); } -} // namespace Multiplication +} #define HE(nbits, t, v0, v1) \ static_assert(horizontalEquality(\ From 582affe023c35e333216e6bb0237a24ed1be35d1 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 28 Feb 2024 15:10:19 -0800 Subject: [PATCH 12/26] rm --- test/swar/BasicOperations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 83a8419d..3ef027f2 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -30,7 +30,7 @@ constexpr SWAR<8, u32> Mplier{0xA050301}; // 5*0xA = 5*10 = 50 = 0x32, // 3*5 = 15 = 0xF, // 3*2 = 6, -// 1*1 = 1, +// 1*1 = 1 constexpr auto Expected = 0x320F0601; static_assert( From 78be7f711009e5e14d73885895d7588c284c09b4 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 28 Feb 2024 15:58:50 -0800 Subject: [PATCH 13/26] update name --- inc/zoo/swar/associative_iteration.h | 13 ++++++++----- test/swar/BasicOperations.cpp | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 2f183f77..c047648c 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -223,21 +223,24 @@ constexpr auto multiply(T a , T b) { }; auto halver = [](auto count) { - return count >> 1; + return count << 1; }; constexpr auto numBits = sizeof(T) * 8; + constexpr auto msb = 1 << (numBits - 1); return associativeOperatorIterated_regressive( a, // base 0, // neutral b, // count - 1, // 
forSquaring, pretty sure this is where i am not understanding + 0, // forSquaring, pretty sure this is where i am not understanding operation, // operation numBits, // log2Count halver // halver ); } +// static_assert(multiply(3, 4) == 12, "multiply failed"); + template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( SWAR multiplicand, SWAR multiplier @@ -267,7 +270,7 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( } template -constexpr auto expo_OverflowUnsafe_SpecificBitCount( +constexpr auto exponentiation_OverflowUnsafe_SpecificBitCount( SWAR x, SWAR exponent ) { @@ -339,12 +342,12 @@ constexpr auto multiplication_OverflowUnsafe( } template -constexpr auto expo_OverflowUnsafe( +constexpr auto exponentiation_OverflowUnsafe( SWAR base, SWAR exponent ) { return - expo_OverflowUnsafe_SpecificBitCount( + exponentiation_OverflowUnsafe_SpecificBitCount( base, exponent ); } diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 3ef027f2..b07643e8 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -42,10 +42,10 @@ static_assert( ); TEST_CASE("Jamie's totally working exponentiation :D") { - constexpr auto base = SWAR<8, u32>{0x02'00'05'06}; // 2 | 0 | 5 | 6 + constexpr auto base = SWAR<8, u32>{0x02'01'05'06}; // 2 | 0 | 5 | 6 constexpr auto exponent = SWAR<8, u32>{0x07'00'02'03}; // 7 | 0 | 2 | 3 constexpr auto expected = SWAR<8, u32>{0x80'01'19'D8}; // 128 | 1 | 25 | 216 - constexpr auto actual = expo_OverflowUnsafe(base, exponent); + constexpr auto actual = exponentiation_OverflowUnsafe(base, exponent); static_assert(expected.value() == actual.value()); CHECK(expected.value() == actual.value()); } From 48f960464b502fc3cab3bf0494baee17daf5303a Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:03:44 -0800 Subject: [PATCH 14/26] update comment --- test/swar/BasicOperations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index b07643e8..531df746 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -42,7 +42,7 @@ static_assert( ); TEST_CASE("Jamie's totally working exponentiation :D") { - constexpr auto base = SWAR<8, u32>{0x02'01'05'06}; // 2 | 0 | 5 | 6 + constexpr auto base = SWAR<8, u32>{0x02'01'05'06}; // 2 | 1 | 5 | 6 constexpr auto exponent = SWAR<8, u32>{0x07'00'02'03}; // 7 | 0 | 2 | 3 constexpr auto expected = SWAR<8, u32>{0x80'01'19'D8}; // 128 | 1 | 25 | 216 constexpr auto actual = exponentiation_OverflowUnsafe(base, exponent); From bee72027964a2d5868238b04f6b952bf5e154b3e Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:04:30 -0800 Subject: [PATCH 15/26] rm needless format --- inc/zoo/swar/associative_iteration.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index c047648c..c8b81271 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -200,8 +200,8 @@ constexpr auto associativeOperatorIterated_regressive( const CountHalver ch ) { auto result = neutral; - if (!log2Count) { return result; } - for (;;) { + if(!log2Count) { return result; } + for(;;) { result = op(result, base, count); if(!--log2Count) { break; } result = op(result, result, forSquaring); From 332bd67ac7dd46ccf8c487415212f7f21ffedf84 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 7 Mar 2024 20:59:17 -0800 Subject: [PATCH 16/26] wip --- inc/zoo/swar/associative_iteration.h | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index c8b81271..ea544681 100644 --- a/inc/zoo/swar/associative_iteration.h +++ 
b/inc/zoo/swar/associative_iteration.h @@ -222,20 +222,19 @@ constexpr auto multiply(T a , T b) { } }; - auto halver = [](auto count) { + auto updateCount = [](auto count) { return count << 1; }; constexpr auto numBits = sizeof(T) * 8; - constexpr auto msb = 1 << (numBits - 1); return associativeOperatorIterated_regressive( - a, // base - 0, // neutral - b, // count - 0, // forSquaring, pretty sure this is where i am not understanding - operation, // operation - numBits, // log2Count - halver // halver + a, // base + 0, // neutral + b, // count + 1, // forSquaring, pretty sure this is where i am not understanding + operation, // operation + numBits, // log2Count + updateCount // halver ); } @@ -278,16 +277,9 @@ constexpr auto exponentiation_OverflowUnsafe_SpecificBitCount( auto operation = [](auto left, auto right, auto counts) { const auto mask = makeLaneMaskFromMSB(counts); - const auto antiMask = ~mask; const auto product = multiplication_OverflowUnsafe_SpecificBitCount(left, right); - /* - * if (count) - * return product; - * else - * return left; - */ - return (product & mask) | (left & antiMask); + return (product & mask) | (left & ~mask); }; // halver should work same as multiplication... i think... 
@@ -308,7 +300,7 @@ constexpr auto exponentiation_OverflowUnsafe_SpecificBitCount( ); } -/// \note Not removed yet because it is an example of "progressive" associative exponentiation +// \note Not removed yet because it is an example of "progressive" associative exponentiation template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( SWAR multiplicand, From a2e30d6213bc0dd9abf2a515156f9aa4979f219d Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Tue, 19 Mar 2024 11:02:58 -0700 Subject: [PATCH 17/26] generate using lane literals --- inc/zoo/swar/SWAR.h | 14 ++++++++++++++ test/swar/BasicOperations.cpp | 8 +++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 5b3db31b..7eaec1a7 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -5,6 +5,7 @@ #include "zoo/meta/log.h" #include +#include #ifdef _MSC_VER #include @@ -70,6 +71,19 @@ struct SWAR { constexpr T value() const noexcept { return m_v; } + constexpr static T baseFromLaneLiterals(std::initializer_list args) noexcept { + T result = 0; + for (auto arg: args) { + result = (result << NBits) | arg; + } + return result; + } + + constexpr static SWAR fromLaneLiterals(std::initializer_list args) noexcept { + return SWAR(baseFromLaneLiterals(args)); + } + + #define SWAR_UNARY_OPERATORS_X_LIST \ X(SWAR, ~) //constexpr SWAR operator~() const noexcept { return SWAR{~m_v}; } diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 531df746..1ca476a9 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -41,10 +41,12 @@ static_assert( multiplication_OverflowUnsafe_SpecificBitCount<3>(Micand, Mplier).value() ); +static_assert(0b00000010000000110000010100000110 == 0x02'03'05'06); + TEST_CASE("Jamie's totally working exponentiation :D") { - constexpr auto base = SWAR<8, u32>{0x02'01'05'06}; // 2 | 1 | 5 | 6 - constexpr auto exponent 
= SWAR<8, u32>{0x07'00'02'03}; // 7 | 0 | 2 | 3 - constexpr auto expected = SWAR<8, u32>{0x80'01'19'D8}; // 128 | 1 | 25 | 216 + constexpr auto base = SWAR<8, u32>::fromLaneLiterals({2, 3, 5, 6}); // {(2 << 24) + (3 << 16) + (5 << 8) + (6)}; + constexpr auto exponent = SWAR<8, u32>::fromLaneLiterals({7, 0, 2, 3}); // 7 | 0 | 2 | 3 + constexpr auto expected = SWAR<8, u32>::fromLaneLiterals({128, 1, 25, 216}); // 128 | 1 | 25 | 216 constexpr auto actual = exponentiation_OverflowUnsafe(base, exponent); static_assert(expected.value() == actual.value()); CHECK(expected.value() == actual.value()); From 7d8379fdf29b7987f142a7dc280e418bf35acd87 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Tue, 19 Mar 2024 11:04:39 -0700 Subject: [PATCH 18/26] todey --- test/swar/BasicOperations.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 1ca476a9..b0c34df1 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -44,9 +44,9 @@ static_assert( static_assert(0b00000010000000110000010100000110 == 0x02'03'05'06); TEST_CASE("Jamie's totally working exponentiation :D") { - constexpr auto base = SWAR<8, u32>::fromLaneLiterals({2, 3, 5, 6}); // {(2 << 24) + (3 << 16) + (5 << 8) + (6)}; - constexpr auto exponent = SWAR<8, u32>::fromLaneLiterals({7, 0, 2, 3}); // 7 | 0 | 2 | 3 - constexpr auto expected = SWAR<8, u32>::fromLaneLiterals({128, 1, 25, 216}); // 128 | 1 | 25 | 216 + constexpr auto base = SWAR<8, u32>::fromLaneLiterals({2, 3, 5, 6}); + constexpr auto exponent = SWAR<8, u32>::fromLaneLiterals({7, 0, 2, 3}); + constexpr auto expected = SWAR<8, u32>::fromLaneLiterals({128, 1, 25, 216}); constexpr auto actual = exponentiation_OverflowUnsafe(base, exponent); static_assert(expected.value() == actual.value()); CHECK(expected.value() == actual.value()); From 20e7f423cac446493de94b49fe923aa754d6b0d7 Mon Sep 17 00:00:00 2001 
From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Tue, 19 Mar 2024 18:01:01 -0700 Subject: [PATCH 19/26] cleanup --- inc/zoo/swar/SWAR.h | 16 ++- test/swar/BasicOperations.cpp | 230 ++++++++++++++++------------------ 2 files changed, 119 insertions(+), 127 deletions(-) diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 7eaec1a7..2257b210 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -71,7 +71,9 @@ struct SWAR { constexpr T value() const noexcept { return m_v; } - constexpr static T baseFromLaneLiterals(std::initializer_list args) noexcept { + template + constexpr static T baseFromLaneLiterals(const T(&args)[N]) { + static_assert(N == Lanes, "Wrong number of lanes"); T result = 0; for (auto arg: args) { result = (result << NBits) | arg; @@ -79,11 +81,14 @@ struct SWAR { return result; } - constexpr static SWAR fromLaneLiterals(std::initializer_list args) noexcept { - return SWAR(baseFromLaneLiterals(args)); + template + constexpr static SWAR fromLaneLiterals(const T(&args)[N]) { + return SWAR{baseFromLaneLiterals(args)}; } + + #define SWAR_UNARY_OPERATORS_X_LIST \ X(SWAR, ~) //constexpr SWAR operator~() const noexcept { return SWAR{~m_v}; } @@ -508,4 +513,9 @@ static_assert( 0x0706050403020100ull ); + + }} + + +static_assert(zoo::swar::SWAR<8, zoo::swar::u32>::baseFromLaneLiterals({0, 0, 0, 0}) == 0); diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index b0c34df1..40dbb1a5 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -4,7 +4,6 @@ #include - using namespace zoo; using namespace zoo::swar; @@ -21,7 +20,8 @@ constexpr auto Doubled = static_assert(0x090B0D0F == Doubled.even.value()); static_assert(0x080A0C0E == Doubled.odd.value()); -static_assert(PrecisionFixtureTest == halvePrecision(Doubled.even, Doubled.odd).value()); +static_assert(PrecisionFixtureTest == + halvePrecision(Doubled.even, Doubled.odd).value()); constexpr SWAR<8, u32> Micand{0x5030201}; 
constexpr SWAR<8, u32> Mplier{0xA050301}; @@ -33,32 +33,28 @@ constexpr SWAR<8, u32> Mplier{0xA050301}; // 1*1 = 1 constexpr auto Expected = 0x320F0601; -static_assert( - Expected == multiplication_OverflowUnsafe(Micand, Mplier).value() -); +static_assert(Expected == + multiplication_OverflowUnsafe(Micand, Mplier).value()); static_assert( 0x320F0601 != // intentionally use a too-small bit count - multiplication_OverflowUnsafe_SpecificBitCount<3>(Micand, Mplier).value() -); - -static_assert(0b00000010000000110000010100000110 == 0x02'03'05'06); + multiplication_OverflowUnsafe_SpecificBitCount<3>(Micand, Mplier).value()); TEST_CASE("Jamie's totally working exponentiation :D") { - constexpr auto base = SWAR<8, u32>::fromLaneLiterals({2, 3, 5, 6}); - constexpr auto exponent = SWAR<8, u32>::fromLaneLiterals({7, 0, 2, 3}); - constexpr auto expected = SWAR<8, u32>::fromLaneLiterals({128, 1, 25, 216}); - constexpr auto actual = exponentiation_OverflowUnsafe(base, exponent); - static_assert(expected.value() == actual.value()); - CHECK(expected.value() == actual.value()); + using S = SWAR<8, u32>; + constexpr auto base = S::fromLaneLiterals({2, 3, 5, 6}); + constexpr auto exponent = S::fromLaneLiterals({7, 0, 2, 3}); + constexpr auto expected = S::fromLaneLiterals({128, 1, 25, 216}); + constexpr auto actual = exponentiation_OverflowUnsafe(base, exponent); + static_assert(expected.value() == actual.value()); + CHECK(expected.value() == actual.value()); } -} +} // namespace Multiplication -#define HE(nbits, t, v0, v1) \ - static_assert(horizontalEquality(\ - SWAR(v0),\ - SWAR(meta::BitmaskMaker::value)\ - )); +#define HE(nbits, t, v0, v1) \ + static_assert(horizontalEquality( \ + SWAR(v0), \ + SWAR(meta::BitmaskMaker::value))); HE(8, u64, 0x0808'0808'0808'0808, 0x8); HE(4, u64, 0x1111'1111'1111'1111, 0x1); HE(3, u64, 0xFFFF'FFFF'FFFF'FFFF, 0x7); @@ -69,47 +65,42 @@ HE(2, u8, 0xAA, 0x2); #undef HE TEST_CASE("Old version", "[deprecated][swar]") { - SWAR<8, u32> Micand{0x5030201}; 
- SWAR<8, u32> Mplier{0xA050301}; - auto Expected = 0x320F0601; - auto result = - multiplication_OverflowUnsafe_SpecificBitCount_deprecated<4>( - Micand, Mplier - ); - CHECK(Expected == result.value()); + SWAR<8, u32> Micand{0x5030201}; + SWAR<8, u32> Mplier{0xA050301}; + auto Expected = 0x320F0601; + auto result = multiplication_OverflowUnsafe_SpecificBitCount_deprecated<4>( + Micand, Mplier); + CHECK(Expected == result.value()); } TEST_CASE("Parity", "[swar]") { - // For each nibble, E indicates (E)ven and O (O)dd parities - // EEOEEOOO - auto Examples = 0xFF13A7E4; - SWAR<4, u32> casesBy4{Examples}; - SWAR<8, u32> casesBy8{Examples}; - auto by4 = parity(casesBy4); - auto by8 = parity(casesBy8); - CHECK(by4.value() == 0x00800888); - CHECK(by8.value() == 0x00808000); + // For each nibble, E indicates (E)ven and O (O)dd parities + // EEOEEOOO + auto Examples = 0xFF13A7E4; + SWAR<4, u32> casesBy4{Examples}; + SWAR<8, u32> casesBy8{Examples}; + auto by4 = parity(casesBy4); + auto by8 = parity(casesBy8); + CHECK(by4.value() == 0x00800888); + CHECK(by8.value() == 0x00808000); } -TEST_CASE( - "Isolate", - "[swar]" -) { - for (auto i = 0; i < 63; ++i) { - CHECK(i == isolate<8>(i)); - CHECK(i == isolate<8>(0xFF00+i)); - CHECK(i == isolate<8>(0xFFFF00+i)); - } - for (auto i = 0; i < 31; ++i) { - CHECK(i == isolate<7>(i)); - CHECK(i == isolate<7>(0xFF00+i)); - CHECK(i == isolate<7>(0xFFFF00+i)); - } - for (auto i = 0; i < 31; ++i) { - CHECK(i == isolate<11>(i)); - CHECK(i == isolate<11>(0xF800+i)); - CHECK(i == isolate<11>(0xFFF800+i)); - } +TEST_CASE("Isolate", "[swar]") { + for (auto i = 0; i < 63; ++i) { + CHECK(i == isolate<8>(i)); + CHECK(i == isolate<8>(0xFF00 + i)); + CHECK(i == isolate<8>(0xFFFF00 + i)); + } + for (auto i = 0; i < 31; ++i) { + CHECK(i == isolate<7>(i)); + CHECK(i == isolate<7>(0xFF00 + i)); + CHECK(i == isolate<7>(0xFFFF00 + i)); + } + for (auto i = 0; i < 31; ++i) { + CHECK(i == isolate<11>(i)); + CHECK(i == isolate<11>(0xF800 + i)); + CHECK(i == 
isolate<11>(0xFFF800 + i)); + } } static_assert(1 == popcount<5>(0x100ull)); @@ -122,13 +113,13 @@ static_assert(0x210 == popcount<1>(0x320)); static_assert(0x4321 == popcount<2>(0xF754)); static_assert(0x50004 == popcount<4>(0x3E001122)); -static_assert(1 == msbIndex(1ull<<1)); -static_assert(3 == msbIndex(1ull<<3)); -static_assert(5 == msbIndex(1ull<<5)); -static_assert(8 == msbIndex(1ull<<8)); -static_assert(17 == msbIndex(1ull<<17)); -static_assert(30 == msbIndex(1ull<<30)); -static_assert(31 == msbIndex(1ull<<31)); +static_assert(1 == msbIndex(1ull << 1)); +static_assert(3 == msbIndex(1ull << 3)); +static_assert(5 == msbIndex(1ull << 5)); +static_assert(8 == msbIndex(1ull << 8)); +static_assert(17 == msbIndex(1ull << 17)); +static_assert(30 == msbIndex(1ull << 30)); +static_assert(31 == msbIndex(1ull << 31)); namespace { using namespace zoo::meta; @@ -139,7 +130,7 @@ static_assert(0x0808'0808'0808'0808ull == BitmaskMaker::value); static_assert(0x0101'0101'0101'0101ull == BitmaskMaker::value); static_assert(0x0E0E'0E0E'0E0E'0E0Eull == BitmaskMaker::value); static_assert(0x0303'0303'0303'0303ull == BitmaskMaker::value); -} +} // namespace static_assert(0x00 == clearLSB(0x80)); static_assert(0x80 == clearLSB(0xC0)); @@ -227,53 +218,44 @@ static_assert(0x0808'0808 == u32(broadcast<8>(SWAR<8, u32>(0x0000'0008)))); static_assert(0x0B0B'0B0B == u32(broadcast<8>(SWAR<8, u32>(0x0000'000B)))); static_assert(0x0E0E'0E0E == u32(broadcast<8>(SWAR<8, u32>(0x0000'000E)))); static_assert(0x6B6B'6B6B == u32(broadcast<8>(SWAR<8, u32>(0x0000'006B)))); -static_assert(0x0808'0808'0808'0808ull == u64(broadcast<8>(SWAR<8, u64>(0x0000'0000'0000'0008ull)))); +static_assert(0x0808'0808'0808'0808ull == + u64(broadcast<8>(SWAR<8, u64>(0x0000'0000'0000'0008ull)))); -static_assert(1 == lsbIndex(1<<1)); -static_assert(3 == lsbIndex(1<<3)); -static_assert(5 == lsbIndex(1<<5)); -static_assert(8 == lsbIndex(1<<8)); -static_assert(17 == lsbIndex(1<<17)); -static_assert(30 == lsbIndex(1<<30)); 
+static_assert(1 == lsbIndex(1 << 1)); +static_assert(3 == lsbIndex(1 << 3)); +static_assert(5 == lsbIndex(1 << 5)); +static_assert(8 == lsbIndex(1 << 8)); +static_assert(17 == lsbIndex(1 << 17)); +static_assert(30 == lsbIndex(1 << 30)); /* These tests were not catching errors known to have been present -static_assert(0x80880008 == greaterEqual<3>(SWAR<4, uint32_t>(0x3245'1027)).value()); -static_assert(0x88888888 == greaterEqual<0>(SWAR<4, uint32_t>(0x0123'4567)).value()); -static_assert(0x88888888 == greaterEqual<0>(SWAR<4, uint32_t>(0x7654'3210)).value()); -static_assert(0x00000008 == greaterEqual<7>(SWAR<4, uint32_t>(0x0123'4567)).value()); -static_assert(0x80000000 == greaterEqual<7>(SWAR<4, uint32_t>(0x7654'3210)).value()); +static_assert(0x80880008 == greaterEqual<3>(SWAR<4, +uint32_t>(0x3245'1027)).value()); static_assert(0x88888888 == +greaterEqual<0>(SWAR<4, uint32_t>(0x0123'4567)).value()); +static_assert(0x88888888 == greaterEqual<0>(SWAR<4, +uint32_t>(0x7654'3210)).value()); static_assert(0x00000008 == +greaterEqual<7>(SWAR<4, uint32_t>(0x0123'4567)).value()); +static_assert(0x80000000 == greaterEqual<7>(SWAR<4, +uint32_t>(0x7654'3210)).value()); */ // Unusual formatting for easy visual verification. 
-#define GE_MSB_TEST(left, right, result) static_assert(result== greaterEqual_MSB_off<4, u32>(SWAR<4, u32>(left), SWAR<4, u32>(right)).value()); - -GE_MSB_TEST(0x1000'0010, - 0x0111'1101, - 0x8000'0080) -GE_MSB_TEST(0x4333'3343, - 0x4444'4444, - 0x8000'0080) -GE_MSB_TEST(0x0550'0110, - 0x0110'0550, - 0x8888'8008) -GE_MSB_TEST(0x4771'1414, - 0x4641'1774, - 0x8888'8008) - -GE_MSB_TEST(0x0123'4567, - 0x0000'0000, - 0x8888'8888) -GE_MSB_TEST(0x0123'4567, - 0x7777'7777, - 0x0000'0008) - -GE_MSB_TEST(0x0000'0000, - 0x0123'4567, - 0x8000'0000) -GE_MSB_TEST(0x7777'7777, - 0x0123'4567, - 0x8888'8888) +#define GE_MSB_TEST(left, right, result) \ + static_assert(result == greaterEqual_MSB_off<4, u32>(SWAR<4, u32>(left), \ + SWAR<4, u32>(right)) \ + .value()); + +GE_MSB_TEST(0x1000'0010, 0x0111'1101, 0x8000'0080) +GE_MSB_TEST(0x4333'3343, 0x4444'4444, 0x8000'0080) +GE_MSB_TEST(0x0550'0110, 0x0110'0550, 0x8888'8008) +GE_MSB_TEST(0x4771'1414, 0x4641'1774, 0x8888'8008) + +GE_MSB_TEST(0x0123'4567, 0x0000'0000, 0x8888'8888) +GE_MSB_TEST(0x0123'4567, 0x7777'7777, 0x0000'0008) + +GE_MSB_TEST(0x0000'0000, 0x0123'4567, 0x8000'0000) +GE_MSB_TEST(0x7777'7777, 0x0123'4567, 0x8888'8888) // 3 bits on msb side, 5 bits on lsb side. 
using Lanes = SWARWithSubLanes<5, 3, u32>; @@ -284,25 +266,25 @@ static constexpr inline u32 allF = broadcast<8>(S8u32(0x0000'00FFul)).value(); static_assert(allF == Lanes(allF).value()); static_assert(0xFFFF'FFFF == Lanes(allF).value()); -static_assert(0xFFFF'FFE0 == Lanes(allF).least(0,0).value()); -static_assert(0xFFFF'FFE1 == Lanes(allF).least(1,0).value()); -static_assert(0xFFFF'E0FF == Lanes(allF).least(0,1).value()); -static_assert(0xFFFF'E1FF == Lanes(allF).least(1,1).value()); - -static_assert(0xFFE0'FFFF == Lanes(allF).least(0,2).value()); -static_assert(0xFFE1'FFFF == Lanes(allF).least(1,2).value()); -static_assert(0xE0FF'FFFF == Lanes(allF).least(0,3).value()); -static_assert(0xE1FF'FFFF == Lanes(allF).least(1,3).value()); - -static_assert(0xFFFF'FF1F == Lanes(allF).most(0,0).value()); -static_assert(0xFFFF'FF3F == Lanes(allF).most(1,0).value()); -static_assert(0xFFFF'1FFF == Lanes(allF).most(0,1).value()); -static_assert(0xFFFF'3FFF == Lanes(allF).most(1,1).value()); - -static_assert(0xFF1F'FFFF == Lanes(allF).most(0,2).value()); -static_assert(0xFF3F'FFFF == Lanes(allF).most(1,2).value()); -static_assert(0x1FFF'FFFF == Lanes(allF).most(0,3).value()); -static_assert(0x3FFF'FFFF == Lanes(allF).most(1,3).value()); +static_assert(0xFFFF'FFE0 == Lanes(allF).least(0, 0).value()); +static_assert(0xFFFF'FFE1 == Lanes(allF).least(1, 0).value()); +static_assert(0xFFFF'E0FF == Lanes(allF).least(0, 1).value()); +static_assert(0xFFFF'E1FF == Lanes(allF).least(1, 1).value()); + +static_assert(0xFFE0'FFFF == Lanes(allF).least(0, 2).value()); +static_assert(0xFFE1'FFFF == Lanes(allF).least(1, 2).value()); +static_assert(0xE0FF'FFFF == Lanes(allF).least(0, 3).value()); +static_assert(0xE1FF'FFFF == Lanes(allF).least(1, 3).value()); + +static_assert(0xFFFF'FF1F == Lanes(allF).most(0, 0).value()); +static_assert(0xFFFF'FF3F == Lanes(allF).most(1, 0).value()); +static_assert(0xFFFF'1FFF == Lanes(allF).most(0, 1).value()); +static_assert(0xFFFF'3FFF == Lanes(allF).most(1, 
1).value()); + +static_assert(0xFF1F'FFFF == Lanes(allF).most(0, 2).value()); +static_assert(0xFF3F'FFFF == Lanes(allF).most(1, 2).value()); +static_assert(0x1FFF'FFFF == Lanes(allF).most(0, 3).value()); +static_assert(0x3FFF'FFFF == Lanes(allF).most(1, 3).value()); static_assert(0x0000'001f == Lanes(all0).least(31, 0).most(0, 0).value()); static_assert(0x0000'1f00 == Lanes(all0).least(31, 1).most(0, 1).value()); From bf2be04b6c7c94cf8203a5d46a3e0306313b45d6 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 20 Mar 2024 12:47:06 -0700 Subject: [PATCH 20/26] indents --- inc/zoo/swar/SWAR.h | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 2257b210..b2d288d0 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -71,23 +71,20 @@ struct SWAR { constexpr T value() const noexcept { return m_v; } - template - constexpr static T baseFromLaneLiterals(const T(&args)[N]) { - static_assert(N == Lanes, "Wrong number of lanes"); - T result = 0; - for (auto arg: args) { - result = (result << NBits) | arg; - } - return result; - } - - template - constexpr static SWAR fromLaneLiterals(const T(&args)[N]) { - return SWAR{baseFromLaneLiterals(args)}; - } - - + template + constexpr static T baseFromLaneLiterals(const T(&args)[N]) { + static_assert(N == Lanes, "Wrong number of lanes"); + T result = 0; + for (auto arg: args) { + result = (result << NBits) | arg; + } + return result; + } + template + constexpr static SWAR fromLaneLiterals(const T(&args)[N]) { + return SWAR{baseFromLaneLiterals(args)}; + } #define SWAR_UNARY_OPERATORS_X_LIST \ X(SWAR, ~) From 3af74ee4bf6463c067a429003b67af3e44b78e1e Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:01:44 -0700 Subject: [PATCH 21/26] format --- test/swar/BasicOperations.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 40dbb1a5..c45944cb 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -41,8 +41,8 @@ static_assert( TEST_CASE("Jamie's totally working exponentiation :D") { using S = SWAR<8, u32>; - constexpr auto base = S::fromLaneLiterals({2, 3, 5, 6}); - constexpr auto exponent = S::fromLaneLiterals({7, 0, 2, 3}); + constexpr auto base = S::fromLaneLiterals({2, 3, 5, 6}); + constexpr auto exponent = S::fromLaneLiterals({7, 0, 2, 3}); constexpr auto expected = S::fromLaneLiterals({128, 1, 25, 216}); constexpr auto actual = exponentiation_OverflowUnsafe(base, exponent); static_assert(expected.value() == actual.value()); From c83ac4a24189ec0e52833195c336382c16a2066d Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:03:22 -0700 Subject: [PATCH 22/26] format --- test/swar/BasicOperations.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index c45944cb..b934867f 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -41,9 +41,9 @@ static_assert( TEST_CASE("Jamie's totally working exponentiation :D") { using S = SWAR<8, u32>; - constexpr auto base = S::fromLaneLiterals({2, 3, 5, 6}); - constexpr auto exponent = S::fromLaneLiterals({7, 0, 2, 3}); - constexpr auto expected = S::fromLaneLiterals({128, 1, 25, 216}); + constexpr auto base = S::fromLaneLiterals({2, 3, 5, 6}); + constexpr auto exponent = S::fromLaneLiterals({7, 4, 2, 3}); + constexpr auto expected = S::fromLaneLiterals({128, 81, 25, 216}); constexpr auto actual = exponentiation_OverflowUnsafe(base, exponent); static_assert(expected.value() == actual.value()); CHECK(expected.value() == actual.value()); From 4261a23fc7f7cf59bcefa299c59d59d5e11ffb4c Mon Sep 17 00:00:00 2001 From: Jamie Pond 
<73431532+jamierpond@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:05:55 -0700 Subject: [PATCH 23/26] revert accidental change --- inc/zoo/swar/SWAR.h | 5 - test/swar/BasicOperations.cpp | 217 ++++++++++++++++++---------------- 2 files changed, 118 insertions(+), 104 deletions(-) diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index b2d288d0..1715d3e0 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -510,9 +510,4 @@ static_assert( 0x0706050403020100ull ); - - }} - - -static_assert(zoo::swar::SWAR<8, zoo::swar::u32>::baseFromLaneLiterals({0, 0, 0, 0}) == 0); diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index b934867f..5bdee385 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -4,6 +4,7 @@ #include + using namespace zoo; using namespace zoo::swar; @@ -20,8 +21,7 @@ constexpr auto Doubled = static_assert(0x090B0D0F == Doubled.even.value()); static_assert(0x080A0C0E == Doubled.odd.value()); -static_assert(PrecisionFixtureTest == - halvePrecision(Doubled.even, Doubled.odd).value()); +static_assert(PrecisionFixtureTest == halvePrecision(Doubled.even, Doubled.odd).value()); constexpr SWAR<8, u32> Micand{0x5030201}; constexpr SWAR<8, u32> Mplier{0xA050301}; @@ -33,11 +33,15 @@ constexpr SWAR<8, u32> Mplier{0xA050301}; // 1*1 = 1 constexpr auto Expected = 0x320F0601; -static_assert(Expected == - multiplication_OverflowUnsafe(Micand, Mplier).value()); +static_assert( + Expected == multiplication_OverflowUnsafe(Micand, Mplier).value() +); static_assert( 0x320F0601 != // intentionally use a too-small bit count - multiplication_OverflowUnsafe_SpecificBitCount<3>(Micand, Mplier).value()); + multiplication_OverflowUnsafe_SpecificBitCount<3>(Micand, Mplier).value() +); + +static_assert(0b00000010000000110000010100000110 == 0x02'03'05'06); TEST_CASE("Jamie's totally working exponentiation :D") { using S = SWAR<8, u32>; @@ -49,12 +53,13 @@ TEST_CASE("Jamie's totally working exponentiation :D") { 
CHECK(expected.value() == actual.value()); } -} // namespace Multiplication +} -#define HE(nbits, t, v0, v1) \ - static_assert(horizontalEquality( \ - SWAR(v0), \ - SWAR(meta::BitmaskMaker::value))); +#define HE(nbits, t, v0, v1) \ + static_assert(horizontalEquality(\ + SWAR(v0),\ + SWAR(meta::BitmaskMaker::value)\ + )); HE(8, u64, 0x0808'0808'0808'0808, 0x8); HE(4, u64, 0x1111'1111'1111'1111, 0x1); HE(3, u64, 0xFFFF'FFFF'FFFF'FFFF, 0x7); @@ -65,42 +70,47 @@ HE(2, u8, 0xAA, 0x2); #undef HE TEST_CASE("Old version", "[deprecated][swar]") { - SWAR<8, u32> Micand{0x5030201}; - SWAR<8, u32> Mplier{0xA050301}; - auto Expected = 0x320F0601; - auto result = multiplication_OverflowUnsafe_SpecificBitCount_deprecated<4>( - Micand, Mplier); - CHECK(Expected == result.value()); + SWAR<8, u32> Micand{0x5030201}; + SWAR<8, u32> Mplier{0xA050301}; + auto Expected = 0x320F0601; + auto result = + multiplication_OverflowUnsafe_SpecificBitCount_deprecated<4>( + Micand, Mplier + ); + CHECK(Expected == result.value()); } TEST_CASE("Parity", "[swar]") { - // For each nibble, E indicates (E)ven and O (O)dd parities - // EEOEEOOO - auto Examples = 0xFF13A7E4; - SWAR<4, u32> casesBy4{Examples}; - SWAR<8, u32> casesBy8{Examples}; - auto by4 = parity(casesBy4); - auto by8 = parity(casesBy8); - CHECK(by4.value() == 0x00800888); - CHECK(by8.value() == 0x00808000); + // For each nibble, E indicates (E)ven and O (O)dd parities + // EEOEEOOO + auto Examples = 0xFF13A7E4; + SWAR<4, u32> casesBy4{Examples}; + SWAR<8, u32> casesBy8{Examples}; + auto by4 = parity(casesBy4); + auto by8 = parity(casesBy8); + CHECK(by4.value() == 0x00800888); + CHECK(by8.value() == 0x00808000); } -TEST_CASE("Isolate", "[swar]") { - for (auto i = 0; i < 63; ++i) { - CHECK(i == isolate<8>(i)); - CHECK(i == isolate<8>(0xFF00 + i)); - CHECK(i == isolate<8>(0xFFFF00 + i)); - } - for (auto i = 0; i < 31; ++i) { - CHECK(i == isolate<7>(i)); - CHECK(i == isolate<7>(0xFF00 + i)); - CHECK(i == isolate<7>(0xFFFF00 + i)); - } - for 
(auto i = 0; i < 31; ++i) { - CHECK(i == isolate<11>(i)); - CHECK(i == isolate<11>(0xF800 + i)); - CHECK(i == isolate<11>(0xFFF800 + i)); - } +TEST_CASE( + "Isolate", + "[swar]" +) { + for (auto i = 0; i < 63; ++i) { + CHECK(i == isolate<8>(i)); + CHECK(i == isolate<8>(0xFF00+i)); + CHECK(i == isolate<8>(0xFFFF00+i)); + } + for (auto i = 0; i < 31; ++i) { + CHECK(i == isolate<7>(i)); + CHECK(i == isolate<7>(0xFF00+i)); + CHECK(i == isolate<7>(0xFFFF00+i)); + } + for (auto i = 0; i < 31; ++i) { + CHECK(i == isolate<11>(i)); + CHECK(i == isolate<11>(0xF800+i)); + CHECK(i == isolate<11>(0xFFF800+i)); + } } static_assert(1 == popcount<5>(0x100ull)); @@ -113,13 +123,13 @@ static_assert(0x210 == popcount<1>(0x320)); static_assert(0x4321 == popcount<2>(0xF754)); static_assert(0x50004 == popcount<4>(0x3E001122)); -static_assert(1 == msbIndex(1ull << 1)); -static_assert(3 == msbIndex(1ull << 3)); -static_assert(5 == msbIndex(1ull << 5)); -static_assert(8 == msbIndex(1ull << 8)); -static_assert(17 == msbIndex(1ull << 17)); -static_assert(30 == msbIndex(1ull << 30)); -static_assert(31 == msbIndex(1ull << 31)); +static_assert(1 == msbIndex(1ull<<1)); +static_assert(3 == msbIndex(1ull<<3)); +static_assert(5 == msbIndex(1ull<<5)); +static_assert(8 == msbIndex(1ull<<8)); +static_assert(17 == msbIndex(1ull<<17)); +static_assert(30 == msbIndex(1ull<<30)); +static_assert(31 == msbIndex(1ull<<31)); namespace { using namespace zoo::meta; @@ -130,7 +140,7 @@ static_assert(0x0808'0808'0808'0808ull == BitmaskMaker::value); static_assert(0x0101'0101'0101'0101ull == BitmaskMaker::value); static_assert(0x0E0E'0E0E'0E0E'0E0Eull == BitmaskMaker::value); static_assert(0x0303'0303'0303'0303ull == BitmaskMaker::value); -} // namespace +} static_assert(0x00 == clearLSB(0x80)); static_assert(0x80 == clearLSB(0xC0)); @@ -218,44 +228,53 @@ static_assert(0x0808'0808 == u32(broadcast<8>(SWAR<8, u32>(0x0000'0008)))); static_assert(0x0B0B'0B0B == u32(broadcast<8>(SWAR<8, u32>(0x0000'000B)))); 
static_assert(0x0E0E'0E0E == u32(broadcast<8>(SWAR<8, u32>(0x0000'000E)))); static_assert(0x6B6B'6B6B == u32(broadcast<8>(SWAR<8, u32>(0x0000'006B)))); -static_assert(0x0808'0808'0808'0808ull == - u64(broadcast<8>(SWAR<8, u64>(0x0000'0000'0000'0008ull)))); +static_assert(0x0808'0808'0808'0808ull == u64(broadcast<8>(SWAR<8, u64>(0x0000'0000'0000'0008ull)))); -static_assert(1 == lsbIndex(1 << 1)); -static_assert(3 == lsbIndex(1 << 3)); -static_assert(5 == lsbIndex(1 << 5)); -static_assert(8 == lsbIndex(1 << 8)); -static_assert(17 == lsbIndex(1 << 17)); -static_assert(30 == lsbIndex(1 << 30)); +static_assert(1 == lsbIndex(1<<1)); +static_assert(3 == lsbIndex(1<<3)); +static_assert(5 == lsbIndex(1<<5)); +static_assert(8 == lsbIndex(1<<8)); +static_assert(17 == lsbIndex(1<<17)); +static_assert(30 == lsbIndex(1<<30)); /* These tests were not catching errors known to have been present -static_assert(0x80880008 == greaterEqual<3>(SWAR<4, -uint32_t>(0x3245'1027)).value()); static_assert(0x88888888 == -greaterEqual<0>(SWAR<4, uint32_t>(0x0123'4567)).value()); -static_assert(0x88888888 == greaterEqual<0>(SWAR<4, -uint32_t>(0x7654'3210)).value()); static_assert(0x00000008 == -greaterEqual<7>(SWAR<4, uint32_t>(0x0123'4567)).value()); -static_assert(0x80000000 == greaterEqual<7>(SWAR<4, -uint32_t>(0x7654'3210)).value()); +static_assert(0x80880008 == greaterEqual<3>(SWAR<4, uint32_t>(0x3245'1027)).value()); +static_assert(0x88888888 == greaterEqual<0>(SWAR<4, uint32_t>(0x0123'4567)).value()); +static_assert(0x88888888 == greaterEqual<0>(SWAR<4, uint32_t>(0x7654'3210)).value()); +static_assert(0x00000008 == greaterEqual<7>(SWAR<4, uint32_t>(0x0123'4567)).value()); +static_assert(0x80000000 == greaterEqual<7>(SWAR<4, uint32_t>(0x7654'3210)).value()); */ // Unusual formatting for easy visual verification. 
-#define GE_MSB_TEST(left, right, result) \ - static_assert(result == greaterEqual_MSB_off<4, u32>(SWAR<4, u32>(left), \ - SWAR<4, u32>(right)) \ - .value()); - -GE_MSB_TEST(0x1000'0010, 0x0111'1101, 0x8000'0080) -GE_MSB_TEST(0x4333'3343, 0x4444'4444, 0x8000'0080) -GE_MSB_TEST(0x0550'0110, 0x0110'0550, 0x8888'8008) -GE_MSB_TEST(0x4771'1414, 0x4641'1774, 0x8888'8008) - -GE_MSB_TEST(0x0123'4567, 0x0000'0000, 0x8888'8888) -GE_MSB_TEST(0x0123'4567, 0x7777'7777, 0x0000'0008) - -GE_MSB_TEST(0x0000'0000, 0x0123'4567, 0x8000'0000) -GE_MSB_TEST(0x7777'7777, 0x0123'4567, 0x8888'8888) +#define GE_MSB_TEST(left, right, result) static_assert(result== greaterEqual_MSB_off<4, u32>(SWAR<4, u32>(left), SWAR<4, u32>(right)).value()); + +GE_MSB_TEST(0x1000'0010, + 0x0111'1101, + 0x8000'0080) +GE_MSB_TEST(0x4333'3343, + 0x4444'4444, + 0x8000'0080) +GE_MSB_TEST(0x0550'0110, + 0x0110'0550, + 0x8888'8008) +GE_MSB_TEST(0x4771'1414, + 0x4641'1774, + 0x8888'8008) + +GE_MSB_TEST(0x0123'4567, + 0x0000'0000, + 0x8888'8888) +GE_MSB_TEST(0x0123'4567, + 0x7777'7777, + 0x0000'0008) + +GE_MSB_TEST(0x0000'0000, + 0x0123'4567, + 0x8000'0000) +GE_MSB_TEST(0x7777'7777, + 0x0123'4567, + 0x8888'8888) // 3 bits on msb side, 5 bits on lsb side. 
using Lanes = SWARWithSubLanes<5, 3, u32>; @@ -266,25 +285,25 @@ static constexpr inline u32 allF = broadcast<8>(S8u32(0x0000'00FFul)).value(); static_assert(allF == Lanes(allF).value()); static_assert(0xFFFF'FFFF == Lanes(allF).value()); -static_assert(0xFFFF'FFE0 == Lanes(allF).least(0, 0).value()); -static_assert(0xFFFF'FFE1 == Lanes(allF).least(1, 0).value()); -static_assert(0xFFFF'E0FF == Lanes(allF).least(0, 1).value()); -static_assert(0xFFFF'E1FF == Lanes(allF).least(1, 1).value()); - -static_assert(0xFFE0'FFFF == Lanes(allF).least(0, 2).value()); -static_assert(0xFFE1'FFFF == Lanes(allF).least(1, 2).value()); -static_assert(0xE0FF'FFFF == Lanes(allF).least(0, 3).value()); -static_assert(0xE1FF'FFFF == Lanes(allF).least(1, 3).value()); - -static_assert(0xFFFF'FF1F == Lanes(allF).most(0, 0).value()); -static_assert(0xFFFF'FF3F == Lanes(allF).most(1, 0).value()); -static_assert(0xFFFF'1FFF == Lanes(allF).most(0, 1).value()); -static_assert(0xFFFF'3FFF == Lanes(allF).most(1, 1).value()); - -static_assert(0xFF1F'FFFF == Lanes(allF).most(0, 2).value()); -static_assert(0xFF3F'FFFF == Lanes(allF).most(1, 2).value()); -static_assert(0x1FFF'FFFF == Lanes(allF).most(0, 3).value()); -static_assert(0x3FFF'FFFF == Lanes(allF).most(1, 3).value()); +static_assert(0xFFFF'FFE0 == Lanes(allF).least(0,0).value()); +static_assert(0xFFFF'FFE1 == Lanes(allF).least(1,0).value()); +static_assert(0xFFFF'E0FF == Lanes(allF).least(0,1).value()); +static_assert(0xFFFF'E1FF == Lanes(allF).least(1,1).value()); + +static_assert(0xFFE0'FFFF == Lanes(allF).least(0,2).value()); +static_assert(0xFFE1'FFFF == Lanes(allF).least(1,2).value()); +static_assert(0xE0FF'FFFF == Lanes(allF).least(0,3).value()); +static_assert(0xE1FF'FFFF == Lanes(allF).least(1,3).value()); + +static_assert(0xFFFF'FF1F == Lanes(allF).most(0,0).value()); +static_assert(0xFFFF'FF3F == Lanes(allF).most(1,0).value()); +static_assert(0xFFFF'1FFF == Lanes(allF).most(0,1).value()); +static_assert(0xFFFF'3FFF == 
Lanes(allF).most(1,1).value()); + +static_assert(0xFF1F'FFFF == Lanes(allF).most(0,2).value()); +static_assert(0xFF3F'FFFF == Lanes(allF).most(1,2).value()); +static_assert(0x1FFF'FFFF == Lanes(allF).most(0,3).value()); +static_assert(0x3FFF'FFFF == Lanes(allF).most(1,3).value()); static_assert(0x0000'001f == Lanes(all0).least(31, 0).most(0, 0).value()); static_assert(0x0000'1f00 == Lanes(all0).least(31, 1).most(0, 1).value()); From f66514234d6f99300568b2776a51438bf23899e3 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:08:27 -0700 Subject: [PATCH 24/26] merge master --- .gitignore | 9 +- benchmark/CMakeLists.txt | 12 +- benchmark/atoi-corpus.h | 165 ++++++++++++++++++++- benchmark/atoi.cpp | 150 +++++++++++++++++--- benchmark/atoi.h | 19 ++- benchmark/bm-swar.cpp | 11 ++ benchmark/catch2swar-demo.cpp | 61 ++++++++ benchmark/swar/compress.cpp | 95 +++++++++++++ inc/zoo/map/RobinHood.h | 6 +- inc/zoo/map/RobinHoodAlt.h | 2 +- inc/zoo/map/RobinHoodUtil.h | 2 +- inc/zoo/pp/platform.h | 6 + inc/zoo/swar/SWAR.h | 204 +++++++++----------------- inc/zoo/swar/SWARWithSubLanes.h | 121 ++++++++++++++++ inc/zoo/swar/associative_iteration.h | 205 ++++++++++++++++++++++++++- scripts/mock-includes.sh | 15 ++ scripts/redirective.sh | 7 + test/CMakeLists.txt | 2 +- test/swar/BasicOperations.cpp | 181 ++++++++++++++--------- test/swar/sublanes.cpp | 56 ++++++++ 20 files changed, 1096 insertions(+), 233 deletions(-) create mode 100644 benchmark/swar/compress.cpp create mode 100644 inc/zoo/swar/SWARWithSubLanes.h create mode 100644 scripts/mock-includes.sh create mode 100644 scripts/redirective.sh create mode 100644 test/swar/sublanes.cpp diff --git a/.gitignore b/.gitignore index 30bd14b3..800011dc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,3 @@ -# Vscode does not like to build outside of the source tree -# (multiple glitches) .vscode test/.vscode @@ -7,3 +5,10 @@ build .cache .idea **cmake-build** +# Vscode 
does not like to build outside of the source tree +# (multiple glitches) + +.vscode +test/.vscode +build +.cache diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 10db98ed..5d4b1160 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -56,9 +56,19 @@ add_executable( set_xcode_properties(zoo-google-benchmark) add_executable( - zoo-atoi-benchmark benchmark_main.cpp bm-swar.cpp atoi.cpp + zoo-atoi-benchmark benchmark_main.cpp bm-swar.cpp atoi.cpp swar/compress.cpp ) set_xcode_properties(zoo-atoi-benchmark) target_link_libraries(zoo-google-benchmark benchmark::benchmark) target_link_libraries(zoo-atoi-benchmark benchmark::benchmark) + +add_library(zoo-atoi-implementations SHARED atoi.cpp) +add_executable( + zoo-atoi-benchmark-from-dynamic-library benchmark_main.cpp bm-swar.cpp +) +target_link_libraries( + zoo-atoi-benchmark-from-dynamic-library + zoo-atoi-implementations + benchmark::benchmark +) diff --git a/benchmark/atoi-corpus.h b/benchmark/atoi-corpus.h index 78ed2aba..e334e142 100644 --- a/benchmark/atoi-corpus.h +++ b/benchmark/atoi-corpus.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include struct Corpus8DecimalDigits { std::vector asNumbers_; @@ -85,7 +87,7 @@ struct CorpusStringLength { while(count--) { auto length = strSize(generator); - sizes.push_back(length); + sizes.push_back(length + 1); for(auto i = length; i--; ) { allCharacters.append(1, characters(generator)); } @@ -143,10 +145,171 @@ struct CorpusStringLength { AVX2_STRLEN_CORPUS_X_LIST \ NEON_STRLEN_CORPUS_X_LIST +struct CorpusLeadingSpaces { + constexpr static auto CountOfSpaceCharactersAvailable = 6; + constexpr static inline std::array Spaces = + { ' ', '\n', '\t', '\r', '\f', '\v' }; + std::vector skips_; + std::string characters_; + + CorpusLeadingSpaces(std::vector &&skips, std::string &&cs): + skips_{std::move(skips)}, characters_{std::move(cs)} + {} + + template + static auto makeCorpus(G &generator) { + auto count = 1031; // see 
Corpus8DecimalDigits for why 1031 + std::vector sizes; + std::string allCharacters; + std::geometric_distribution<> + spacesCount(1.0/29), + extraCharacters(0.5); + // unrepresentatively very large, but will cross the 32 boundary + // to test 32-byte techniques + std::uniform_int_distribution<> + spacer(0, CountOfSpaceCharactersAvailable - 1), + moreCharacters(0, 255); + + while(count--) { + auto count = spacesCount(generator); + for(auto i = count; i--; ) { + allCharacters.append(1, Spaces[spacer(generator)]); + } + auto extra = moreCharacters(generator); + for(auto i = extra; i--; ) { + allCharacters.append(1, moreCharacters(generator)); + } + sizes.push_back(count + extra + 1); + allCharacters.append(1, '\0'); + } + return CorpusStringLength(std::move(sizes), std::move(allCharacters)); + } + + struct Iterator { + int *skips, *sentinel; + char *cp; + + Iterator &operator++() { + cp += *skips++; + return *this; + } + + char *operator*() { + return cp; + } + + auto next() noexcept { + ++(*this); + return sentinel != skips; + } + }; + + Iterator commence() { + return { + skips_.data(), skips_.data() + skips_.size(), characters_.data() + }; + } +}; + +#define LEADING_SPACES_CORPUS_X_LIST X(GLIB_Spaces, spaces_glibc) X(ZooSpaces, zoo::leadingSpacesCount) + +void (*consumeStrPtr)(const char *, unsigned) = + [](const char *p, unsigned l) { + return; + }; + +struct CorpusAtoi { + constexpr static auto CountOfSpaceCharactersAvailable = 6; + constexpr static inline std::array Spaces = + { ' ', '\n', '\t', '\r', '\f', '\v' }; + std::vector skips_; + std::string characters_; + + CorpusAtoi(std::vector &&skips, std::string &&cs): + skips_{std::move(skips)}, characters_{std::move(cs)} + {} + + template + static auto makeCorpus(G &generator) { + auto count = 1031; // see Corpus8DecimalDigits for why 1031 + std::vector sizes; + std::string allCharacters; + std::geometric_distribution + spacesCount(0.5), + insignificantZeros(0.9); + std::uniform_real_distribution 
numberLogarithmBase10(-2.0, 9.2); + // a maximum of 10^9.2 is ~1.6 billion, within the range. + // negative "logarithms" are for indicating negative numbers up to + // -10^2, or -100 + std::uniform_int_distribution + postNumber('9' + 1, 255), + spacer(0, CountOfSpaceCharactersAvailable - 1); + char conversionBuffer[20]; + + while(count--) { + auto currentLength = allCharacters.size(); + auto count = spacesCount(generator); + for(auto i = count; i--; ) { + allCharacters.append(1, Spaces[spacer(generator)]); + } + auto logBase10 = numberLogarithmBase10(generator); + int negativeSign; + if(0.0 <= logBase10) { + negativeSign = 0; + } else { + allCharacters.append(1, '-'); + logBase10 = -logBase10; + negativeSign = 1; + } + auto iz = insignificantZeros(generator); + for(auto i = iz; i--; ) { + allCharacters.append(1, '0'); + } + int number = exp(logBase10 * M_LN10); + auto n = sprintf(conversionBuffer, "%d%c", number, postNumber(generator)); + if(n < 0) { throw 0; } + allCharacters.append(conversionBuffer); + sizes.push_back(count + negativeSign + iz + n); + consumeStrPtr(allCharacters.c_str() + currentLength, count + negativeSign + iz + n); + } + return CorpusStringLength(std::move(sizes), std::move(allCharacters)); + } + + struct Iterator { + int *skips, *sentinel; + char *cp; + + Iterator &operator++() { + cp += *skips++; + return *this; + } + + char *operator*() { + return cp; + } + + auto next() noexcept { + ++(*this); + return sentinel != skips; + } + }; + + Iterator commence() { + return { + skips_.data(), skips_.data() + skips_.size(), characters_.data() + }; + } +}; + +#define ATOI_CORPUS_X_LIST \ + X(GLIBC_atoi, atoi) X(ZOO_ATOI, zoo::c_strToI) X(COMPARE_ATOI, zoo::compareAtoi) + #define X(Typename, FunctionToCall) \ struct Invoke##Typename { int operator()(const char *p) { return FunctionToCall(p); } }; PARSE8BYTES_CORPUS_X_LIST STRLEN_CORPUS_X_LIST +LEADING_SPACES_CORPUS_X_LIST +ATOI_CORPUS_X_LIST #undef X diff --git a/benchmark/atoi.cpp b/benchmark/atoi.cpp 
index 8d002b08..53f974f4 100644 --- a/benchmark/atoi.cpp +++ b/benchmark/atoi.cpp @@ -10,9 +10,11 @@ #include #include #include - +#include #include +static_assert(~uint32_t(0) == zoo::swar::SWAR<32, uint32_t>::LeastSignificantLaneMask); + // Copied from Daniel Lemire's GitHub at // https://lemire.me/blog/2018/10/03/quickly-parsing-eight-digits/ // https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/ddb082981228f7256e9a4dbbf56fd4a335d78e30/2018/10/03/eightchartoi.c#L26C1-L34C2 @@ -27,14 +29,7 @@ uint32_t parse_eight_digits_swar(const char *chars) { return short100plus >> 32; } -// Note: eight digits can represent from 0 to (10^9) - 1, the logarithm base 2 -// of 10^9 is slightly less than 30, thus, only 30 bits are needed. -uint32_t lemire_as_zoo_swar(const char *chars) { - uint64_t bytes; - memcpy(&bytes, chars, 8); - auto allCharacterZero = zoo::meta::BitmaskMaker::value; - using S8_64 = zoo::swar::SWAR<8, uint64_t>; - S8_64 convertedToIntegers = S8_64{bytes - allCharacterZero}; +uint32_t calculateBase10(zoo::swar::SWAR<8, uint64_t> convertedToIntegers) noexcept { /* the idea is to perform the following multiplication: * NOTE: THE BASE OF THE NUMBERS is 256 (2^8), then 65536 (2^16), 2^32 * convertedToIntegers is IN BASE 256 the number ABCDEFGH @@ -58,6 +53,18 @@ uint32_t lemire_as_zoo_swar(const char *chars) { return uint32_t(by10001base2to32.value() >> 32); } +// Note: eight digits can represent from 0 to (10^9) - 1, the logarithm base 2 +// of 10^9 is slightly less than 30, thus, only 30 bits are needed. 
+uint32_t lemire_as_zoo_swar(const char *chars) noexcept { + uint64_t bytes; + memcpy(&bytes, chars, 8); + auto allCharacterZero = zoo::meta::BitmaskMaker::value; + using S8_64 = zoo::swar::SWAR<8, uint64_t>; + S8_64 convertedToIntegers = S8_64{bytes - allCharacterZero}; + auto rv = calculateBase10(convertedToIntegers); + return rv; +} + std::size_t spaces_glibc(const char *ptr) { auto rv = 0; while(isspace(ptr[rv])) { ++rv; } @@ -66,8 +73,8 @@ std::size_t spaces_glibc(const char *ptr) { namespace zoo { -//constexpr -std::size_t leadingSpacesCount(swar::SWAR<8, uint64_t> bytes) noexcept { +template +std::size_t leadingSpacesCountAligned(S bytes) noexcept { /* space (0x20, ' ') form feed (0x0c, '\f') @@ -85,15 +92,16 @@ std::size_t leadingSpacesCount(swar::SWAR<8, uint64_t> bytes) noexcept { }, ExpressedAsEscapeCodes = { ' ', '\r', '\f', '\v', '\n', '\t' }; static_assert(SpaceCharacters == ExpressedAsEscapeCodes); */ - using S = swar::SWAR<8, uint64_t>; + static_assert(sizeof(S) == alignof(S)); constexpr S Space{meta::BitmaskMaker::value}; auto space = swar::equals(bytes, Space); - auto otherWhiteSpace = - swar::constantIsGreaterEqual<'\r'>(bytes) & - ~swar::constantIsGreaterEqual<'\t' - 1>(bytes); + auto belowEqualCarriageReturn = swar::constantIsGreaterEqual<'\r'>(bytes); + auto belowTab = swar::constantIsGreaterEqual<'\t' - 1>(bytes); + auto otherWhiteSpace = belowEqualCarriageReturn & ~belowTab; auto whiteSpace = space | otherWhiteSpace; - auto notWhiteSpace = S{S::MostSignificantBit} ^ whiteSpace; - return notWhiteSpace.lsbIndex(); + auto notWhiteSpace = ~whiteSpace; + auto rv = notWhiteSpace ? 
notWhiteSpace.lsbIndex() : S::Lanes; + return rv; } /// @brief Loads the "block" containing the pointer, by proper alignment @@ -115,6 +123,110 @@ blockAlignedLoad(PtrT *pointerInsideBlock, Block *b) { return { base, misalignment }; } +std::size_t leadingSpacesCount(const char *p) noexcept { + using S = swar::SWAR<8, uint64_t>; + S bytes; + auto [base, misalignment] = blockAlignedLoad(p, &bytes.m_v); + auto bitDisplacement = 8 * misalignment; + + // deal with misalignment setting the low part to spaces + constexpr static S + AllSpaces{meta::BitmaskMaker::value}, + AllOn = ~S{0}; + // blit the spaces in + auto mask = S{AllOn.value() << bitDisplacement}; + auto misalignedEliminated = bytes & mask; + auto spacesIntroduced = AllSpaces & ~mask; + bytes = spacesIntroduced | misalignedEliminated; + for(;;) { + auto spacesThisBlock = leadingSpacesCountAligned(bytes); + base += spacesThisBlock; + if(8 != spacesThisBlock) { return base - p; } + memcpy(&bytes.m_v, base, 8); + } +} + +auto leadingDigitsCount(const char *p) noexcept { + using S = swar::SWAR<8, uint64_t>; + S bytes; + auto [base, misalignment] = blockAlignedLoad(p, &bytes.m_v); + auto bitDisplacement = 8 * misalignment; + constexpr static S + AllZeroCharacter{meta::BitmaskMaker::value}, + AllOn = ~S{0}; + // blit the zero-characters to the misaligned part + auto mask = S{AllOn.value() << bitDisplacement}; + auto misalignedEliminated = bytes & mask; + auto zeroCharactersIntroduced = AllZeroCharacter & ~mask; + bytes = zeroCharactersIntroduced | misalignedEliminated; + for(;;) { + auto belowOrEqualTo9 = swar::constantIsGreaterEqual<'9'>(bytes); + auto belowCharacter0 = swar::constantIsGreaterEqual<'0' - 1>(bytes); + auto digits = belowOrEqualTo9 & ~belowCharacter0; + auto nonDigits = ~digits; + if(nonDigits) { + auto nonDigitIndex = nonDigits.lsbIndex(); + return base + nonDigitIndex - p; + } + base += 8; + memcpy(&bytes.m_v, base, 8); + } +} + +int c_strToI(const char *str) noexcept { + constexpr static 
std::array LastFactor = { + 1, 10, 100, 1000, + 10'000, 100'000, 1000'000, 10'000'000 + }; + auto leadingSpaces = leadingSpacesCount(str); + auto s = str + leadingSpaces; + auto sign = 1; + switch(*s) { + case '-': sign = -1; + [[fallthrough]]; + case '+': ++s; break; + default: ; + } + using S = swar::SWAR<8, uint64_t>; + S bytes; + auto [base, misalignment] = blockAlignedLoad(s, &bytes.m_v); + auto bitDisplacement = 8 * misalignment; + constexpr static S + AllZeroCharacter{meta::BitmaskMaker::value}, + AllOn = ~S{0}; + // blit the zero-characters to the misaligned part + auto mask = S{AllOn.value() << bitDisplacement}; + auto misalignedEliminated = bytes & mask; + auto zeroCharactersIntroduced = AllZeroCharacter & ~mask; + bytes = zeroCharactersIntroduced | misalignedEliminated; + long accumulator = 0; + + for(;;) { + auto belowOrEqualTo9 = swar::constantIsGreaterEqual<'9'>(bytes); + auto belowCharacter0 = swar::constantIsGreaterEqual<'0' - 1>(bytes); + auto digits = belowOrEqualTo9 & ~belowCharacter0; + auto nonDigits = ~digits; + if(nonDigits) { + auto nonDigitIndex = nonDigits.lsbIndex(); + auto asIntegers = bytes - AllZeroCharacter; // upper lanes garbage + auto integersInHighLanes = + // allow complete clearing of the 8 bytes by doing 2 shifts, + // since it is UB to shift 64 bits. 
+ asIntegers.shiftLanesLeft(7 - nonDigitIndex).shiftLanesLeft(1); + auto inBase10 = calculateBase10(integersInHighLanes); + auto scaledAccumulator = accumulator * LastFactor[nonDigitIndex]; + return int((scaledAccumulator + inBase10) * sign); + } + // all 8 bytes are digits + auto asIntegers = bytes - AllZeroCharacter; + accumulator *= 100'000'000; + auto inBase10 = calculateBase10(asIntegers); + accumulator += inBase10; + base += 8; + memcpy(&bytes.m_v, base, 8); + } +} + /// \brief Helper function to fix the non-string part of block template S adjustMisalignmentFor_strlen(S data, int misalignment) { @@ -145,7 +257,7 @@ std::size_t c_strLength(const char *s) { // It is safe to read within the page where the string occurs, and to // guarantee that, simply make aligned reads because the size of the SWAR // base size will always divide the memory page size - auto [alignedBase, misalignment] = blockAlignedLoad(s, &initialBytes); + auto [alignedBase, misalignment] = blockAlignedLoad(s, &initialBytes.m_v); auto bytes = adjustMisalignmentFor_strlen(initialBytes, misalignment); for(;;) { auto firstNullTurnsOnMSB = bytes - Ones; @@ -172,7 +284,7 @@ std::size_t c_strLength(const char *s) { std::size_t c_strLength_natural(const char *s) { using S = swar::SWAR<8, std::uint64_t>; S initialBytes; - auto [base, misalignment] = blockAlignedLoad(s, &initialBytes); + auto [base, misalignment] = blockAlignedLoad(s, &initialBytes.m_v); auto bytes = adjustMisalignmentFor_strlen(initialBytes, misalignment); for(;;) { auto nulls = zoo::swar::equals(bytes, S{0}); diff --git a/benchmark/atoi.h b/benchmark/atoi.h index 8c1d14b5..3b01c648 100644 --- a/benchmark/atoi.h +++ b/benchmark/atoi.h @@ -1,18 +1,31 @@ +#ifndef ZOO_ATOI_H +#define ZOO_ATOI_H + #include "zoo/swar/SWAR.h" #include "zoo/pp/platform.h" #include uint32_t parse_eight_digits_swar(const char *chars); -uint32_t lemire_as_zoo_swar(const char *chars); +uint32_t lemire_as_zoo_swar(const char *chars) noexcept; std::size_t 
spaces_glibc(const char *ptr); namespace zoo { -std::size_t leadingSpacesCount(swar::SWAR<8, uint64_t> bytes) noexcept; +std::size_t leadingSpacesCount(const char *) noexcept; + std::size_t c_strLength(const char *s); std::size_t c_strLength_natural(const char *s); +int32_t c_strToI(const char *) noexcept; + +inline int compareAtoi(const char *s) { + auto + from_stdlib = atoi(s), + from_zoo = c_strToI(s); + if(from_stdlib != from_zoo) { throw 0; } + return from_stdlib; +} #if ZOO_CONFIGURED_TO_USE_AVX() std::size_t avx2_strlen(const char* str); @@ -26,3 +39,5 @@ std::size_t neon_strlen(const char* str); std::size_t STRLEN_old (const char *str); + +#endif diff --git a/benchmark/bm-swar.cpp b/benchmark/bm-swar.cpp index 5aa77a6f..b79013f6 100644 --- a/benchmark/bm-swar.cpp +++ b/benchmark/bm-swar.cpp @@ -23,9 +23,14 @@ void runBenchmark(benchmark::State &s) { Callable function; for(auto _: s) { goOverCorpus(corpus, function); + benchmark::ClobberMemory(); } } +#define X(Typename, _) \ + BENCHMARK(runBenchmark); + LEADING_SPACES_CORPUS_X_LIST +#undef X #define X(Typename, _) \ BENCHMARK(runBenchmark); @@ -37,5 +42,11 @@ void runBenchmark(benchmark::State &s) { STRLEN_CORPUS_X_LIST #undef X + +#define X(Typename, _) \ + BENCHMARK(runBenchmark); + ATOI_CORPUS_X_LIST +#undef X + using RepeatZooStrlen = InvokeZOO_STRLEN; BENCHMARK(runBenchmark); diff --git a/benchmark/catch2swar-demo.cpp b/benchmark/catch2swar-demo.cpp index 3bcaf204..05282af0 100644 --- a/benchmark/catch2swar-demo.cpp +++ b/benchmark/catch2swar-demo.cpp @@ -4,6 +4,8 @@ #define CATCH_CONFIG_ENABLE_BENCHMARKING #include "catch2/catch.hpp" +#include + TEST_CASE("Atoi benchmarks", "[atoi][swar]") { auto traverse = [](auto &&corpus, auto &&function, auto rv) { @@ -34,7 +36,21 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") { #endif } auto corpus8D = Corpus8DecimalDigits::makeCorpus(g); + auto corpusLeadingSpaces = CorpusLeadingSpaces::makeCorpus(g); + SECTION("Leading Spaces Comparison") { + auto iterator = 
corpusLeadingSpaces.commence(); + do { + auto glibc = spaces_glibc(*iterator); + auto zspc = zoo::leadingSpacesCount(*iterator); + if(glibc != zspc) { + auto v = zoo::leadingSpacesCount(*iterator); + WARN("<<" << *iterator << ">> " << v ); + } + REQUIRE(glibc == zspc); + } while(iterator.next()); + } auto corpusStrlen = CorpusStringLength::makeCorpus(g); + auto corpusAtoi = CorpusAtoi::makeCorpus(g); #define X(Type, Fun) \ auto from##Type = traverse(CORPUS, Invoke##Type{}, 0); @@ -42,10 +58,19 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") { PARSE8BYTES_CORPUS_X_LIST #undef CORPUS + #define CORPUS corpusLeadingSpaces + LEADING_SPACES_CORPUS_X_LIST + #undef CORPUS + #define CORPUS corpusStrlen STRLEN_CORPUS_X_LIST #undef CORPUS + + #define CORPUS corpusAtoi + ATOI_CORPUS_X_LIST + #undef CORPUS #undef X + REQUIRE(fromGLIB_Spaces == fromZooSpaces); REQUIRE(fromLemire == fromZoo); REQUIRE(fromLIBC == fromZoo); REQUIRE(fromZOO_STRLEN == fromLIBC_STRLEN); @@ -54,6 +79,10 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") { #if ZOO_CONFIGURED_TO_USE_AVX() REQUIRE(fromZOO_AVX == fromZOO_STRLEN); #endif + + REQUIRE(fromZooSpaces == fromGLIB_Spaces); + + REQUIRE(fromGLIBC_atoi == fromZOO_ATOI); auto haveTheRoleOfMemoryBarrier = -1; #define X(Type, Fun) \ @@ -70,6 +99,38 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") { #define CORPUS corpusStrlen STRLEN_CORPUS_X_LIST #undef CORPUS + + #define CORPUS corpusLeadingSpaces + LEADING_SPACES_CORPUS_X_LIST + #undef CORPUS #undef X } +TEST_CASE("Atoi correctness", "[swar][atoi]") { + auto empty = ""; + REQUIRE(0 == zoo::c_strToI(empty)); + alignas(8) constexpr char EmptyMisaligned[8] = { 'Q', '\0', '0', '1', '2', '3', '9', '\0' }; + static_assert(8 == alignof(EmptyMisaligned)); + REQUIRE(0 == zoo::c_strToI(EmptyMisaligned + 1)); + auto justSpaces = " \t\t\v "; + REQUIRE(0 == zoo::c_strToI(justSpaces)); + REQUIRE(1239 == zoo::c_strToI(EmptyMisaligned + 2)); + auto hasPositiveSign = "\t\r\t\v+123456"; + REQUIRE(123456 == 
zoo::c_strToI(hasPositiveSign)); + auto hasNegativeSign9nines = "\t\r\t\v-999999999"; + REQUIRE(-999'999'999 == zoo::c_strToI(hasNegativeSign9nines)); + auto excessiveZeroesCloseToIntMax = "+00000000000001987654321"; + REQUIRE(1'987'654'321 == zoo::c_strToI(excessiveZeroesCloseToIntMax)); + char buffer[20]; + sprintf(buffer, "%d", INT_MAX); + REQUIRE(INT_MAX == zoo::c_strToI(buffer)); + sprintf(buffer, "%d", INT_MIN); + REQUIRE(INT_MIN == zoo::c_strToI(buffer)); + std::random_device rd; + std::mt19937 g(rd()); + std::uniform_int_distribution rnd(INT_MIN, INT_MAX); + auto randomNumber = rnd(g); + sprintf(buffer, " %d", randomNumber); + auto glibc = atoi(buffer); + REQUIRE(zoo::c_strToI(buffer) == glibc); +} \ No newline at end of file diff --git a/benchmark/swar/compress.cpp b/benchmark/swar/compress.cpp new file mode 100644 index 00000000..02643e5b --- /dev/null +++ b/benchmark/swar/compress.cpp @@ -0,0 +1,95 @@ +#include "zoo/pp/platform.h" + +#include "zoo/swar/associative_iteration.h" + +#include "benchmark/benchmark.h" + +#include +#include + +#include +#include + +uint64_t sideEffect = 0; + +template +using S = zoo::swar::SWAR; + +enum ExtractionPrimitive { + UseBuiltin, + UseSWAR, + CompareBuiltinAndSWAR +}; + +template +S parallelExtraction(S i, S m) { + if constexpr(UseSWAR == P) { + return compress(i, m); + } else { + constexpr auto LaneCount = 64 / NB; + uint64_t + input = i.value(), + mask = m.value(), + result = 0; + for(auto lane = 0;;) { + auto lowV = input & S::LeastSignificantLaneMask; + auto lowM = mask & S::LeastSignificantLaneMask; + uint64_t tmp; + if constexpr(NB < 32) { + tmp = __builtin_ia32_pext_si(lowV, lowM); + } else { + tmp = __builtin_ia32_pext_di(lowV, lowM); + } + result |= (tmp << (lane * NB)); + ++lane; + if(LaneCount <= lane) { break; } + if constexpr(NB != 8*sizeof(typename S::type)) { + input >>= NB; + mask >>= NB; + } + } + if constexpr(CompareBuiltinAndSWAR == P) { + auto fromSWAR = compress(i, m); + using B = std::bitset<64>; 
+ auto toBinary = [](S what) { return B(what.value()); }; + if(fromSWAR.value() != result) { + std::cout << NB << '\n' << + toBinary(i) << '\n' << + toBinary(m) << "\n---------\n" << + toBinary(S(result)) << '\n' << + toBinary(fromSWAR) << '\n' << std::endl; + } + } + return S{result}; + } +} + +template +void runCompressions(benchmark::State &s) { + using S = zoo::swar::SWAR; + std::random_device rd; + std::mt19937_64 g(rd()); + std::vector inputs, masks; + for(auto count = 1000; --count; ) { + inputs.push_back(g()); + masks.push_back(g()); + } + for(auto _: s) { + auto result = 0; + for(auto c = 1000; c--; ) { + S input{inputs[c]}, mask{masks[c]}; + result ^= parallelExtraction(input, mask).value(); + } + sideEffect = result; + benchmark::ClobberMemory(); + } +} + +#define BIT_SIZE_X_LIST X(4) X(8) X(16) X(32) X(64) + +#define X(nb) \ + BENCHMARK(runCompressions); \ + BENCHMARK(runCompressions); \ + BENCHMARK(runCompressions); + +BIT_SIZE_X_LIST diff --git a/inc/zoo/map/RobinHood.h b/inc/zoo/map/RobinHood.h index 6270ec74..719ba1fa 100644 --- a/inc/zoo/map/RobinHood.h +++ b/inc/zoo/map/RobinHood.h @@ -88,7 +88,7 @@ struct RH_Backend { // significant bits is to be able to call the cheaper version // _MSB_off here - auto theyBreakInvariant = not theyKeepInvariant; + auto theyBreakInvariant = ~theyKeepInvariant; // because we make the assumption of LITTLE ENDIAN byte ordering, // we're interested in the elements up to the first haystack-richer auto firstBreakage = swar::isolateLSB(theyBreakInvariant.value()); @@ -565,7 +565,7 @@ struct RH_Frontend_WithSkarupkeTail { auto haystackPSLs = md.PSLs(); // haystack < needle => !(haystack >= needle) auto breaksRobinHood = - not greaterEqual_MSB_off(haystackPSLs, needlePSLs); + ~greaterEqual_MSB_off(haystackPSLs, needlePSLs); if(!bool(breaksRobinHood)) { // no place for the evicted element found in this swar. 
// increment the PSLs in the needle to check the next haystack @@ -600,7 +600,7 @@ struct RH_Frontend_WithSkarupkeTail { ++mdp; haystackPSLs = mdp->PSLs(); breaksRobinHood = - not greaterEqual_MSB_off(haystackPSLs, needlePSLs); + ~greaterEqual_MSB_off(haystackPSLs, needlePSLs); if(breaksRobinHood) { break; } evictedPSL += MD::NSlots; if(HighestSafePSL < evictedPSL) { diff --git a/inc/zoo/map/RobinHoodAlt.h b/inc/zoo/map/RobinHoodAlt.h index 5f8fa484..9d40df23 100644 --- a/inc/zoo/map/RobinHoodAlt.h +++ b/inc/zoo/map/RobinHoodAlt.h @@ -38,7 +38,7 @@ template struct SlotOperations { // PSLs in a swar with sublanes in the least bits, this guarantee // holds. auto satisfied = greaterEqual_MSB_off(pslHaystack, pslNeedle); - auto broken = not satisfied; + auto broken = ~satisfied; return swar::isolateLSB(broken.value()); } diff --git a/inc/zoo/map/RobinHoodUtil.h b/inc/zoo/map/RobinHoodUtil.h index 44dd022f..1ea11d17 100644 --- a/inc/zoo/map/RobinHoodUtil.h +++ b/inc/zoo/map/RobinHoodUtil.h @@ -1,6 +1,6 @@ #pragma once -#include "zoo/swar/SWAR.h" +#include "zoo/swar/SWARWithSubLanes.h" #include #include diff --git a/inc/zoo/pp/platform.h b/inc/zoo/pp/platform.h index b35b7b38..e6120667 100644 --- a/inc/zoo/pp/platform.h +++ b/inc/zoo/pp/platform.h @@ -13,6 +13,12 @@ #define ZOO_CONFIGURED_TO_USE_NEON() 0 #endif +#ifdef __BMI2__ +#define ZOO_CONFIGURED_TO_USE_BMI() 1 +#else +#define ZOO_CONFIGURED_TO_USE_BMI() 0 +#endif + #ifdef _MSC_VER #define MSVC_EMPTY_BASES __declspec(empty_bases) #else diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 1715d3e0..663a2d6b 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -16,7 +16,7 @@ namespace zoo { namespace swar { using u64 = uint64_t; using u32 = uint32_t; using u16 = uint16_t; -using u8 = uint8_t; +using u8 = std::uint8_t; template constexpr uint64_t popcount(uint64_t a) noexcept { @@ -53,17 +53,24 @@ constexpr std::make_unsigned_t lsbIndex(T v) noexcept { /// Certain computational workloads can be 
materially sped up using SWAR techniques. template struct SWAR { - using type = T; - constexpr static inline std::make_unsigned_t + using type = std::make_unsigned_t; + constexpr static inline type NBits = NBits_, BitWidth = sizeof(T) * 8, Lanes = BitWidth / NBits, NSlots = Lanes, PaddingBitsCount = BitWidth % NBits, SignificantBitsCount = BitWidth - PaddingBitsCount, - AllOnes = ~std::make_unsigned_t{0} >> PaddingBitsCount, + AllOnes = ~std::make_unsigned_t{0} >> PaddingBitsCount, // Also constructed in RobinHood utils: possible bug? LeastSignificantBit = meta::BitmaskMaker{1}, NBits>::value, - MostSignificantBit = LeastSignificantBit << (NBits - 1); + MostSignificantBit = LeastSignificantBit << (NBits - 1), + LeastSignificantLaneMask = + sizeof(T) * 8 == NBits ? // needed to avoid shifting all bits + ~T(0) : + ~(~T(0) << NBits), + // Use LowerBits in favor of ~MostSignificantBit to not pollute + // "don't care" bits when non-power-of-two bit lane sizes are supported + LowerBits = MostSignificantBit - LeastSignificantBit; SWAR() = default; constexpr explicit SWAR(T v): m_v(v) {} @@ -102,14 +109,12 @@ struct SWAR { // Returns lane at position with other lanes cleared. constexpr T isolateLane(int position) const noexcept { - constexpr auto filter = (T(1) << NBits) - 1; - return m_v & (filter << (NBits * position)); + return m_v & (LeastSignificantLaneMask << (NBits * position)); } // Returns lane value at position, in lane 0, rest of SWAR cleared. constexpr T at(int position) const noexcept { - constexpr auto filter = (T(1) << NBits) - 1; - return filter & (m_v >> (NBits * position)); + return LeastSignificantLaneMask & (m_v >> (NBits * position)); } constexpr SWAR clear(int position) const noexcept { @@ -149,129 +154,30 @@ struct SWAR { /// \brief as the name suggests /// \param protectiveMask should clear the bits that would cross the lane. 
- /// The bits that will be cleared are directly related to the count of shifts, it is natural to maintain - /// the protective mask by the caller, otherwise, the mask will be computed on all invocations. - /// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance. - constexpr SWAR - shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() << bitCount}; - } - - /// \param protectiveMask should clear the bits that would cross the lane - /// \sa shiftIntraLaneLeft - constexpr SWAR - shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept { - return SWAR{(*this & protectiveMask).value() >> bitCount}; - } + /// The bits that will be cleared are directly related to the count of + /// shifts, it is natural to maintain the protective mask by the caller, + /// otherwise, the mask would have to be computed in all invocations. + /// We are not sure the optimizer would maintain this mask somewhere, if it + /// were to recalculate it, it would be disastrous for performance + /// \note the \c static_cast are necessary because of narrowing conversions + #define SHIFT_INTRALANE_OP_X_LIST X(Left, <<) X(Right, >>) + #define X(name, op) \ + constexpr SWAR \ + shiftIntraLane##name(int bitCount, SWAR protectiveMask) const noexcept { \ + T shiftC = static_cast(bitCount); \ + auto V = (*this & protectiveMask).value(); \ + auto rv = static_cast(V op shiftC); \ + return SWAR{rv}; \ + } + SHIFT_INTRALANE_OP_X_LIST + #undef X + #undef SHIFT_INTRALANE_OP_X_LIST constexpr SWAR multiply(T multiplier) const noexcept { return SWAR{m_v * multiplier}; } T m_v; }; -// SWAR is a useful abstraction for performing computations in lanes overlaid -// over any given integral type. 
-// Doing additions, subtractions, and compares via SWAR techniques requires an -// extra bit per lane be available past the lane size, _or_ knowledge that both -// of your MSBs are set 0 (leaving space for the operation). Similarly, doing -// multiplications via SWAR techniques require double bits per lane (unless you -// can bind your inputs at half lane size). -// This leads to a useful technique (which we use in the robin hood table) -// where we interleave two related small bit count integers inside of a lane of -// swar. More generally, this is useful because it sometimes allows fast -// operations on side "a" of some lane if side "b" is blitted out, and vice -// versa. In the spirit of separation of concerns, we provide a cut-lane-SWAR -// abstraction here. - -template -struct SWARWithSubLanes: SWAR { - static constexpr inline auto NBitsLeast = NBitsLeast_; - static constexpr inline auto NBitsMost = NBitsMost_; - - using Base = SWAR; - static constexpr inline auto Available = sizeof(T); - static constexpr inline auto LaneBits = NBitsLeast + NBitsMost; - - using Base::Base; - constexpr SWARWithSubLanes(Base b) noexcept: Base(b) {} - constexpr SWARWithSubLanes(T most, T least) noexcept: - Base((most << NBitsLeast) | least) - {} - - // M is most significant bits slice, L is least significant bits slice. - // 0x....M2L2M1L1 or MN|LN||...||M2|L2||M1|L1 - using SL = SWARWithSubLanes; - - static constexpr inline auto LeastOnes = - Base(meta::BitmaskMaker::value); - static constexpr inline auto MostOnes = - Base(LeastOnes.value() << NBitsLeast); - static constexpr inline auto LeastMask = MostOnes - LeastOnes; - static constexpr inline auto MostMask = ~LeastMask; - - constexpr auto least() const noexcept { - return SL{LeastMask & *this}; - } - - // Isolate the least significant bits of the lane at the specified position. 
- constexpr auto least(int pos) const noexcept { - constexpr auto Filter = SL((T(1) << NBitsLeast) - 1); - return Filter.shiftLanesLeft(pos) & *this; - } - - // Returns only the least significant bits at specified position, 'decoded' to their integer value. - constexpr auto leastFlat(int pos) const noexcept { - return least().at(pos); - } - - constexpr auto most() const noexcept { - return SL{MostMask & *this}; - } - - // The most significant bits of the lane at the specified position. - constexpr auto most(int pos) const noexcept { - constexpr auto Filter = - SL(((T(1) << SL::NBitsMost) - 1) << SL::NBitsLeast); - return Filter.shiftLanesLeft(pos) & *this; - } - - // The most significant bits of the lane at the specified position, - // 'decoded' to their integer value. - constexpr auto mostFlat(int pos) const noexcept { - return most().at(pos) >> SL::NBitsLeast; - } - - // Blits most sig bits into least significant bits. Experimental. - constexpr auto flattenMostToLeast(int pos) const noexcept { - return SL(this->m_v >> NBitsLeast) & LeastMask; - } - - // Blits least sig bits into most significant bits. Experimental. 
- constexpr auto promoteLeastToMost(int pos) const noexcept { - return SL(this->m_v << NBitsMost) & MostMask; - } - - // Sets the lsb sublane at |pos| with least significant NBitsLeast of |in| - constexpr auto least(T in, int pos) const noexcept { - constexpr auto filter = (T(1) << LaneBits) - 1; - const auto keep = ~(filter << (LaneBits * pos)) | MostMask.value(); - const auto rdyToInsert = this->m_v & keep; - const auto rval = rdyToInsert | ((in & LeastMask.value()) << (LaneBits * pos)); - return SL(rval); - } - - // Sets the msb sublane at |pos| with least significant NBitsMost of |in| - constexpr auto most(T in, int pos) const noexcept { - constexpr auto filter = (T(1) << LaneBits) - 1; - const auto keep = ~(filter << (LaneBits * pos)) | LeastMask.value(); - const auto rdyToInsert = this->m_v & keep; - const auto insVal = (((in< constexpr auto horizontalEquality(SWAR left, SWAR right) { @@ -340,11 +246,18 @@ constexpr auto broadcast(SWAR v) { /// BooleanSWAR treats the MSB of each SWAR lane as the boolean associated with that lane. template struct BooleanSWAR: SWAR { - // Booleanness is stored in the MSBs - static constexpr auto MaskLaneMSB = - broadcast(SWAR(T(1) << (NBits -1))); - constexpr explicit BooleanSWAR(T v): SWAR(v) {} + using Base = SWAR; + // Booleanness is stored in the MSBs + static constexpr auto MaskMSB = + broadcast(Base(T(1) << (NBits -1))); + static constexpr auto MaskLSB = + broadcast(Base(T(1))); + // Turns off LSB of each lane + static constexpr auto MaskNonLSB = ~MaskLSB; + static constexpr auto MaskNonMSB = ~MaskMSB; + constexpr explicit BooleanSWAR(T v): Base(v) {} + constexpr BooleanSWAR clear(int bit) const noexcept { constexpr auto Bit = T(1) << (NBits - 1); return this->m_v ^ (Bit << (NBits * bit)); } @@ -356,14 +269,31 @@ struct BooleanSWAR: SWAR { /// BooleanSWAR treats the MSB of each lane as the boolean associated with that lane. /// A logical NOT in this circumstance _only_ flips the MSB of each lane. 
This operation is /// not ones or twos complement. + + constexpr auto operator ~() const noexcept { + return BooleanSWAR(Base{Base::MostSignificantBit} ^ *this); + } + constexpr auto operator not() const noexcept { - return BooleanSWAR(MaskLaneMSB ^ *this); + return BooleanSWAR(MaskMSB ^ *this); + } + + #define BOOLEANSWAR_BINARY_LOGIC_OPERATOR_X_LIST X(^) X(&) X(|) + #define X(op) \ + constexpr BooleanSWAR operator op(BooleanSWAR other) const noexcept { return this->Base::operator op(other); } + BOOLEANSWAR_BINARY_LOGIC_OPERATOR_X_LIST + #undef X + + // BooleanSWAR as a mask: BooleanSWAR<4, u16>(0x0800).MSBtoLaneMask() => SWAR<4,u16>(0x0F00) + constexpr auto MSBtoLaneMask() const noexcept { + const auto MSBMinusOne = this->m_v - (this->m_v >> (NBits-1)); // Convert pattern 10* to 01* + return SWAR(MSBMinusOne | this->m_v); // Blit 01* and 10* together for 1* when MSB was on. } explicit constexpr operator bool() const noexcept { return this->m_v; } private: - constexpr BooleanSWAR(SWAR initializer) noexcept: + constexpr BooleanSWAR(Base initializer) noexcept: SWAR(initializer) {} @@ -451,7 +381,8 @@ constantIsGreaterEqual_MSB_off(SWAR subtrahend) noexcept { template constexpr BooleanSWAR greaterEqual_MSB_off(SWAR left, SWAR right) noexcept { - constexpr auto MLMSB = BooleanSWAR::MaskLaneMSB; + constexpr auto MLMSB = SWAR{SWAR::MostSignificantBit}; + auto minuend = MLMSB | left; return MLMSB & (minuend - right); } @@ -459,7 +390,7 @@ greaterEqual_MSB_off(SWAR left, SWAR right) noexcept { template constexpr auto booleans(SWAR arg) noexcept { - return not constantIsGreaterEqual<0>(arg); + return ~constantIsGreaterEqual<0>(arg); } template @@ -471,7 +402,7 @@ differents(SWAR a1, SWAR a2) { template constexpr auto equals(SWAR a1, SWAR a2) { - return not differents(a1, a2); + return ~differents(a1, a2); } /* @@ -485,7 +416,8 @@ constexpr SWAR logarithmFloor(SWAR v) noexcept { constexpr auto LogNBits = meta::logFloor(NBits); static_assert(NBits == (1 << LogNBits), 
"Logarithms of element width not power of two is un-implemented"); auto whole = v.value(); - auto isolationMask = BooleanSWAR::MaskLaneMSB.value(); + auto isolationMask = SWAR::MostSignificantBit; + for(auto groupSize = 1; groupSize < NBits; groupSize <<= 1) { auto shifted = whole >> groupSize; diff --git a/inc/zoo/swar/SWARWithSubLanes.h b/inc/zoo/swar/SWARWithSubLanes.h new file mode 100644 index 00000000..eb50d188 --- /dev/null +++ b/inc/zoo/swar/SWARWithSubLanes.h @@ -0,0 +1,121 @@ +#ifndef ZOO_SWAR_SWARWITHSUBLANES_H +#define ZOO_SWAR_SWARWITHSUBLANES_H + +#include "zoo/swar/SWAR.h" + +namespace zoo { namespace swar { + +/// \brief Allows SWAR Lanes to be treated both as a whole or something with +/// internal structure. + +/// Example: Robin Hood "Haystack" metadata composed of hoisted hash bits and +/// PSL (probe sequence lengths), that are used together or separately. +/// SWAR is a useful abstraction for performing computations in lanes overlaid +/// over any given integral type. +/// To prevent the normal integer operations in a lane to disrrupt the operation +/// in the adjoining lanes, some precautions must be maintained. For example +/// upon an addition of lanes, we either need that the domain of our values +/// does not use the most significant bit (guaranteeing normal addition of +/// lanes won't cross to the upper lane) or that this possibility is explicitly +/// taken into account (see "full addition"). This applies to all operations, +/// including comparisons. +/// Similarly, doing multiplications via SWAR techniques require double bits per +/// lane (unless you can guarantee the values of the input lanes are half lane +/// size). +/// This leads to a useful technique (which we use in the Robin Hood table) +/// where we interleave two related small bit count integers inside of a lane of +/// swar. 
More generally, this is useful because it sometimes allows fast +/// operations on side "a" of some lane if side "b" is blitted out, and vice +/// versa. In the spirit of separation of concerns, we provide a cut-lane-SWAR +/// abstraction here. +template +struct SWARWithSubLanes: SWAR { + static constexpr inline auto NBitsLeast = NBitsLeast_; + static constexpr inline auto NBitsMost = NBitsMost_; + + using Base = SWAR; + static constexpr inline auto Available = sizeof(T); + static constexpr inline auto LaneBits = NBitsLeast + NBitsMost; + + using Base::Base; + constexpr SWARWithSubLanes(Base b) noexcept: Base(b) {} + constexpr SWARWithSubLanes(T most, T least) noexcept: + Base((most << NBitsLeast) | least) + {} + + // M is most significant bits slice, L is least significant bits slice. + // 0x....M2L2M1L1 or MN|LN||...||M2|L2||M1|L1 + using SL = SWARWithSubLanes; + + static constexpr inline auto LeastOnes = + Base(meta::BitmaskMaker::value); + static constexpr inline auto MostOnes = + Base(LeastOnes.value() << NBitsLeast); + static constexpr inline auto LeastMask = MostOnes - LeastOnes; + static constexpr inline auto MostMask = ~LeastMask; + + constexpr auto least() const noexcept { + return SL{LeastMask & *this}; + } + + // Isolate the least significant bits of the lane at the specified position. + constexpr auto least(int pos) const noexcept { + constexpr auto Filter = SL((T(1) << NBitsLeast) - 1); + return Filter.shiftLanesLeft(pos) & *this; + } + + // Returns only the least significant bits at specified position, 'decoded' to their integer value. + constexpr auto leastFlat(int pos) const noexcept { + return least().at(pos); + } + + constexpr auto most() const noexcept { + return SL{MostMask & *this}; + } + + // The most significant bits of the lane at the specified position. 
+ constexpr auto most(int pos) const noexcept { + constexpr auto Filter = + SL(((T(1) << SL::NBitsMost) - 1) << SL::NBitsLeast); + return Filter.shiftLanesLeft(pos) & *this; + } + + // The most significant bits of the lane at the specified position, + // 'decoded' to their integer value. + constexpr auto mostFlat(int pos) const noexcept { + return most().at(pos) >> SL::NBitsLeast; + } + + // Blits most sig bits into least significant bits. Experimental. + constexpr auto flattenMostToLeast(int pos) const noexcept { + return SL(this->m_v >> NBitsLeast) & LeastMask; + } + + // Blits least sig bits into most significant bits. Experimental. + constexpr auto promoteLeastToMost(int pos) const noexcept { + return SL(this->m_v << NBitsMost) & MostMask; + } + + // Sets the lsb sublane at |pos| with least significant NBitsLeast of |in| + constexpr auto least(T in, int pos) const noexcept { + constexpr auto filter = (T(1) << LaneBits) - 1; + const auto keep = ~(filter << (LaneBits * pos)) | MostMask.value(); + const auto rdyToInsert = this->m_v & keep; + const auto rval = rdyToInsert | ((in & LeastMask.value()) << (LaneBits * pos)); + return SL(rval); + } + + // Sets the msb sublane at |pos| with least significant NBitsMost of |in| + constexpr auto most(T in, int pos) const noexcept { + constexpr auto filter = (T(1) << LaneBits) - 1; + const auto keep = ~(filter << (LaneBits * pos)) | LeastMask.value(); + const auto rdyToInsert = this->m_v & keep; + const auto insVal = (((in< + +inline std::ostream &binary(std::ostream &out, uint64_t input, int count) { + while(count--) { + out << (1 & input); + input >>= 1; + } + return out; +} + +template +std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR s) { + using S = zoo::swar::SWAR; + auto shiftCounter = sizeof(B) * 8 / NB; + out << "<|"; + auto v = s.value(); + do { + binary(out, v, NB) << '|'; + + } while(--shiftCounter); + return out << ">"; +} + +#define ZOO_TO_STRING(a) #a +// std::endl is needed within the context of 
debugging: flush the line +#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) std::cout << '"' << (__VA_ARGS__) << "\", \"" << F << ':' << L << "\", \"" << ZOO_TO_STRING(__VA_ARGS__) << "\"" << std::endl; +#define ZOO_TRACEABLE_EXPRESSION(...) ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__) + +#else + +#define ZOO_TRACEABLE_EXPRESSION(...) (void)(__VA_ARGS__) + +#endif + namespace zoo::swar { /// \note This code should be substituted by an application of "progressive" algebraic iteration @@ -11,32 +47,187 @@ template constexpr SWAR parallelSuffix(SWAR input) { using S = SWAR; auto - shiftClearingMask = S{~S::MostSignificantBit}, + shiftClearingMask = S{static_cast(~S::MostSignificantBit)}, doubling = input, result = S{0}; auto bitsToXOR = NB, power = 1; + + #define ZTE(...) + // ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__) for(;;) { + ZTE(doubling); if(1 & bitsToXOR) { + ZTE(result ^ doubling); result = result ^ doubling; + ZTE(doubling.shiftIntraLaneLeft(power, shiftClearingMask)); doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); } + ZTE(bitsToXOR >> 1); bitsToXOR >>= 1; if(!bitsToXOR) { break; } auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); + ZTE(shifted); + ZTE(doubling ^ shifted); doubling = doubling ^ shifted; // 01...1 // 001...1 // 00001...1 // 000000001...1 shiftClearingMask = - shiftClearingMask & S{shiftClearingMask.value() >> power}; + shiftClearingMask & + S{static_cast(shiftClearingMask.value() >> power)}; + ZTE(power << 1); power <<= 1; } + ZTE(input); + #undef ZTE return S{result}; } +/* +Binary compress: A fascinating algorithm. + +Warren (Hacker's Delight) believes Guy L. Steele is the author of the following +binary compression operation, equivalent to Intel's BMI2 instruction PEXT of +"Parallel Extraction" + +From a "mask", a selector of bits from an input, we want to put them together in +the output. 
+ +For example's sake, this is the selector: +Note: this follows the usual 'big endian' convention of denoting the most +significant bit first: +0001 0011 0111 0111 0110 1110 1100 1010 +Imagine the input is the 32-bit or 32-boolean variable expression +abcd efgh ijkl mnop qrst uvxy zABC DEFG +We want the selection + d gh jkl nop rs uvx zA D F +To be compressed into the output +0000 0000 0000 00dg hjkl nopr suvx zADF + +This algorithm will virtually calculate the count of positions that the selected +bits travel to the right, by constructing the binary encoding of that count: +It will identify the positions that will travel an odd number of positions to +the right, these are those whose position-travel-count have the units set. +It will then move those positions by one position to the right, and eliminate +them from the yet-to-move positions. Because it eliminates the positions that +would move an odd count, there remains only positions that move an even number +of positions. Now it finds the positions that move an odd count of /pairs/ of +positions, it moves them 2 positions. This is equivalent to finding the +positions that would have the bit for 2 set in the count of positions to move +right. +Then an odd count of /quartets/ of positions, and moves them 4; +8, 16, 32, ... 
+ +Complete example (32 bits) +Selection mask: +0001 0011 0111 0111 0110 1110 1100 1010 +Input (each letter or variable is a boolean, that can have 0 or 1) +abcd efgh ijkl mnop qrst uvxy zABC DEFG +Selection (using spaces) + d gh jkl nop rs uvx zA D F +Desired result: + dghjklnoprsuvxzADF + +0001 0011 0111 0111 0110 1110 1100 1010 compressionMask +1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 + == groupsize of ~compressionMask +This indicates the positions that have a 0 immediately to the right in + compressionMask +4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the + current position in forParallelSuffix, + last decimal digit +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of + forParallelSuffix +We have just identified the positions that need to move an odd number of +positions. Filter them with positions with a bit set in compressionMask: +0001 0011 0111 0111 0110 1110 1100 1010 compressionMask +---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of + compressionMask by 1 == groupSize +0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits + that will move) +---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize +0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask. +0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of + forParallelSuffix +1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved) +1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero + immediately to their right) +1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved => + had even zeroes to their right) +At this point, we have removed from compressionMask the positions that moved an +odd number of positions and moved them 1 position, +then, we only keep positions that move an even number of positions. 
+Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ... +*/ + +template +constexpr SWAR +compress(SWAR input, SWAR compressionMask) { + // This solution uses the parallel suffix operation as a primary tool: + // For every bit position it indicates an odd number of ones to the right, + // including itself. + // Because we want to detect the "oddness" of groups of zeroes to the right, + // we flip the compression mask. To not count the bit position itself, + // we shift by one. + #define ZTE(...) + // ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__) + ZTE(input); + ZTE(compressionMask); + using S = SWAR; + auto result = input & compressionMask; + auto groupSize = 1; + auto + shiftLeftMask = S{S::LowerBits}, + shiftRightMask = S{S::LowerBits << 1}; + ZTE(~compressionMask); + auto forParallelSuffix = // this is called "mk" in the book + (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask); + ZTE(forParallelSuffix); + // note: forParallelSuffix denotes positions with a zero + // immediately to the right in 'compressionMask' + for(;;) { + ZTE(groupSize); + ZTE(shiftLeftMask); + ZTE(shiftRightMask); + ZTE(result); + auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book + parallelSuffix(forParallelSuffix); + ZTE(oddCountOfGroupsOfZerosToTheRight); + // compress the bits just identified in both the result and the mask + auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight; + ZTE(moving); + compressionMask = + (compressionMask ^ moving) | // clear the moving + moving.shiftIntraLaneRight(groupSize, shiftRightMask); + ZTE(compressionMask); + auto movingFromInput = result & moving; + result = + (result ^ movingFromInput) | // clear the moving from the result + movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask); + auto nextGroupSize = groupSize << 1; + if(NB <= nextGroupSize) { + break; + } + auto evenCountOfGroupsOfZerosToTheRight = + ~oddCountOfGroupsOfZerosToTheRight; + forParallelSuffix = + forParallelSuffix &
evenCountOfGroupsOfZerosToTheRight; + auto newShiftLeftMask = + shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask); + shiftRightMask = + shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask); + shiftLeftMask = newShiftLeftMask; + groupSize = nextGroupSize; + } + ZTE(result); + #undef ZTE + return result; +} + /// \todo because of the desirability of "accumuating" the XORs at the MSB, /// the parallel suffix operation is more suitable. template @@ -154,6 +345,16 @@ fullAddition(SWAR s1, SWAR s2) { return { result, BS{carry.value()}, BS{overflow.value()} }; }; +template +constexpr SWAR +saturatingUnsignedAddition(SWAR s1, SWAR s2) { + const auto additionResult = fullAddition(s1, s2); + // If we carry unsigned, we need to saturate: thus OR the carry bit with the + // lane bits (carry because it happens to be earlier in the struct + // declaration) + return additionResult.carry.MSBtoLaneMask() | additionResult.result; +} + /// \brief Negation is useful only for the signed integer interpretation template constexpr auto negate(SWAR input) { diff --git a/scripts/mock-includes.sh b/scripts/mock-includes.sh new file mode 100644 index 00000000..381abfeb --- /dev/null +++ b/scripts/mock-includes.sh @@ -0,0 +1,15 @@ +COMPILER=$1 +OUTPUT=$2 + +shift +shift + +${COMPILER} -nostdinc -nostdinc++ -E -I${OUTPUT} $* 2>&1 > /dev/null | \ + sed -n 's/^\(.*\)fatal error: '"'"'\(.*\)'"'"' file not found\(.*\)$/\2/p' | + while read FILE + do + ADDENDUM="${OUTPUT}/$FILE" + mkdir -p $(dirname $ADDENDUM) + echo $ADDENDUM + echo "__include_directive__ <$FILE>" > ${ADDENDUM} + done diff --git a/scripts/redirective.sh b/scripts/redirective.sh new file mode 100644 index 00000000..4b6ac454 --- /dev/null +++ b/scripts/redirective.sh @@ -0,0 +1,7 @@ +#! 
/usr/bin/bash + +COMPILER=$1 + +shift + +${COMPILER} -D__include_directive__='#include' -E -CC $* diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 331cfa42..411d9ce1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -111,7 +111,7 @@ else() ) set( SWAR_SOURCES - swar/BasicOperations.cpp + swar/BasicOperations.cpp swar/sublanes.cpp ) set( MAP_SOURCES diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 5bdee385..b835c1d1 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -8,9 +8,25 @@ using namespace zoo; using namespace zoo::swar; -namespace Multiplication { - using S4_64 = SWAR<4, uint64_t>; +using S4_32 = SWAR<4, uint32_t>; +using S4_16 = SWAR<4, uint16_t>; +using S4_8 = SWAR<4, uint8_t>; + +using S8_64 = SWAR<8, uint64_t>; +using S8_32 = SWAR<8, uint32_t>; +using S8_16 = SWAR<8, uint16_t>; +using S8_8 = SWAR<8, uint8_t>; + +using S16_64 = SWAR<16, uint64_t>; +using S16_32 = SWAR<16, uint32_t>; +using S16_16 = SWAR<16, uint16_t>; + +using S32_32 = SWAR<32, uint32_t>; + +using S64_64 = SWAR<64, uint64_t>; + +namespace Multiplication { static_assert(~int64_t(0) == negate(S4_64{S4_64::LeastSignificantBit}).value()); static_assert(0x0F0F0F0F == doublingMask<4, uint32_t>().value()); @@ -97,19 +113,38 @@ TEST_CASE( "[swar]" ) { for (auto i = 0; i < 63; ++i) { - CHECK(i == isolate<8>(i)); - CHECK(i == isolate<8>(0xFF00+i)); - CHECK(i == isolate<8>(0xFFFF00+i)); + CHECK(i == isolate<8>(i)); + CHECK(i == isolate<8>(0xFF00+i)); + CHECK(i == isolate<8>(0xFFFF00+i)); } for (auto i = 0; i < 31; ++i) { - CHECK(i == isolate<7>(i)); - CHECK(i == isolate<7>(0xFF00+i)); - CHECK(i == isolate<7>(0xFFFF00+i)); + CHECK(i == isolate<7>(i)); + CHECK(i == isolate<7>(0xFF00+i)); + CHECK(i == isolate<7>(0xFFFF00+i)); } for (auto i = 0; i < 31; ++i) { - CHECK(i == isolate<11>(i)); - CHECK(i == isolate<11>(0xF800+i)); - CHECK(i == isolate<11>(0xFFF800+i)); + CHECK(i == isolate<11>(i)); + CHECK(i == 
isolate<11>(0xF800+i)); + CHECK(i == isolate<11>(0xFFF800+i)); + } +} + +TEST_CASE("Compress/Expand", "[swar]") { + unsigned + Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010, + ToMove = 0b0101'0101'0101'0101'0101'0101'0101'0101, + // Selection: 1 01 101 101 10 010 01 0 0 + result = 0b0001'0'1'1'0'1'1'0'1'10'0'10'0'1'0'0; + auto q = compress(S32_32{ToMove}, S32_32{Mask}); + CHECK(result == q.value()); + SECTION("Regression 1") { + u64 + input = 0b1010'1001'0110'0001'1001'0000'0010'1010'0100'0111'1110'1001'1111'0001'1110'1011, + mask = 0b0110'0000'0001'0101'0101'1111'0101'1100'0110'1111'0100'0111'0001'1000'0101'0010, + expected =0b0001'0000'0000'0001'0001'0000'0000'0010'0010'0111'0001'0001'0001'0000'0010'0001; + using S = S4_64; + auto v = compress(S{input}, S{mask}); + CHECK(expected == v.value()); } } @@ -276,62 +311,80 @@ GE_MSB_TEST(0x7777'7777, 0x0123'4567, 0x8888'8888) -// 3 bits on msb side, 5 bits on lsb side. -using Lanes = SWARWithSubLanes<5, 3, u32>; -using S8u32 = SWAR<8, u32>; -static constexpr inline u32 all0 = 0; -static constexpr inline u32 allF = broadcast<8>(S8u32(0x0000'00FFul)).value(); - -static_assert(allF == Lanes(allF).value()); -static_assert(0xFFFF'FFFF == Lanes(allF).value()); - -static_assert(0xFFFF'FFE0 == Lanes(allF).least(0,0).value()); -static_assert(0xFFFF'FFE1 == Lanes(allF).least(1,0).value()); -static_assert(0xFFFF'E0FF == Lanes(allF).least(0,1).value()); -static_assert(0xFFFF'E1FF == Lanes(allF).least(1,1).value()); - -static_assert(0xFFE0'FFFF == Lanes(allF).least(0,2).value()); -static_assert(0xFFE1'FFFF == Lanes(allF).least(1,2).value()); -static_assert(0xE0FF'FFFF == Lanes(allF).least(0,3).value()); -static_assert(0xE1FF'FFFF == Lanes(allF).least(1,3).value()); - -static_assert(0xFFFF'FF1F == Lanes(allF).most(0,0).value()); -static_assert(0xFFFF'FF3F == Lanes(allF).most(1,0).value()); -static_assert(0xFFFF'1FFF == Lanes(allF).most(0,1).value()); -static_assert(0xFFFF'3FFF == Lanes(allF).most(1,1).value()); - 
-static_assert(0xFF1F'FFFF == Lanes(allF).most(0,2).value()); -static_assert(0xFF3F'FFFF == Lanes(allF).most(1,2).value()); -static_assert(0x1FFF'FFFF == Lanes(allF).most(0,3).value()); -static_assert(0x3FFF'FFFF == Lanes(allF).most(1,3).value()); - -static_assert(0x0000'001f == Lanes(all0).least(31, 0).most(0, 0).value()); -static_assert(0x0000'1f00 == Lanes(all0).least(31, 1).most(0, 1).value()); -static_assert(0x001f'0000 == Lanes(all0).least(31, 2).most(0, 2).value()); -static_assert(0x1f00'0000 == Lanes(all0).least(31, 3).most(0, 3).value()); - -static_assert(0x0000'00e0 == Lanes(all0).least(0, 0).most(31, 0).value()); -static_assert(0x0000'e000 == Lanes(all0).least(0, 1).most(31, 1).value()); -static_assert(0x00e0'0000 == Lanes(all0).least(0, 2).most(31, 2).value()); -static_assert(0xe000'0000 == Lanes(all0).least(0, 3).most(31, 3).value()); - -static_assert(0x1F1F'1F1F == Lanes(allF).least().value()); -static_assert(0xE0E0'E0E0 == Lanes(allF).most().value()); - -static_assert(0x0000'001F == Lanes(allF).least(0).value()); -static_assert(0x0000'1F00 == Lanes(allF).least(1).value()); -static_assert(0x001F'0000 == Lanes(allF).least(2).value()); -static_assert(0x1F00'0000 == Lanes(allF).least(3).value()); - -static_assert(0x0000'00E0 == Lanes(allF).most(0).value()); -static_assert(0x0000'E000 == Lanes(allF).most(1).value()); -static_assert(0x00E0'0000 == Lanes(allF).most(2).value()); -static_assert(0xE000'0000 == Lanes(allF).most(3).value()); - static_assert(0x123 == SWAR<4, uint32_t>(0x173).blitElement(1, 2).value()); static_assert(0 == isolateLSB(u32(0))); constexpr auto aBooleansWithTrue = booleans(SWAR<4, u32>{0x1}); static_assert(aBooleansWithTrue); -static_assert(!aBooleansWithTrue); // this is a pitfall, but lesser evil? 
+//static_assert(~aBooleansWithTrue); static_assert(false == !bool(aBooleansWithTrue)); + +TEST_CASE( + "fullAddition", + "[swar][signed-swar][unsigned-swar]" +) { + SECTION("fullAddition overflow") { + const auto sum = fullAddition(SWAR<4, u32>(0x0000'1000), SWAR<4, u32>(0x0000'7000)); + CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.carry.value()); + CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.overflow.value()); + CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.result.value()); + } + SECTION("no carry or overflow for safe values") { + const auto sum = fullAddition(SWAR<4, u32>(0x0000'8000), SWAR<4, u32>(0x0000'7000)); + CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.carry.value()); + CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.overflow.value()); + CHECK(SWAR<4, u32>(0x0000'F000).value() == sum.result.value()); + } + SECTION("fullAddition signed overflow") { + const auto sum = fullAddition(SWAR<4, u32>(0x0000'5000), SWAR<4, u32>(0x0000'5000)); + CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.carry.value()); + CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.overflow.value()); + CHECK(SWAR<4, u32>(0x0000'A000).value() == sum.result.value()); + } + SECTION("0x0111 (7) + 0x0111 (7) is 0x1110 (0x1110->0x1101->0x0010) (0xe unsigned, 0x2 signed) (signed and unsigned check)") { + const auto sum = fullAddition(SWAR<4, u32>(0x0000'7000), SWAR<4, u32>(0x0000'7000)); + CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.carry.value()); + CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.overflow.value()); + CHECK(SWAR<4, u32>(0x0000'e000).value() == sum.result.value()); + } + SECTION("both carry and overflow") { + const auto sum = fullAddition(SWAR<4, u32>(0x0000'a000), SWAR<4, u32>(0x0000'a000)); + CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.carry.value()); + CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.overflow.value()); + } +} + +TEST_CASE( + "BooleanSWAR MSBtoLaneMask", + "[swar]" +) { + // BooleanSWAR as a mask: + auto bswar =BooleanSWAR<4, u32>(0x0808'0000); + auto 
mask = S4_32(0x0F0F'0000); + CHECK(bswar.MSBtoLaneMask().value() == mask.value()); +} + +constexpr auto fullAddSumTest = fullAddition(S4_32(0x0111'1101), S4_32(0x1000'0010)); +static_assert( S4_32(0x1111'1111).value() == fullAddSumTest.result.value()); +static_assert( S4_32(0x0000'0000).value() == fullAddSumTest.carry.value()); +static_assert( S4_32(0x0000'0000).value() == fullAddSumTest.overflow.value()); + +// Verify that saturation works (saturates and doesn't saturate as appropriate) +static_assert( S4_16(0x0000).value() == saturatingUnsignedAddition(S4_16(0x0000), S4_16(0x0000)).value()); +static_assert( S4_16(0x0200).value() == saturatingUnsignedAddition(S4_16(0x0100), S4_16(0x0100)).value()); +static_assert( S4_16(0x0400).value() == saturatingUnsignedAddition(S4_16(0x0300), S4_16(0x0100)).value()); +static_assert( S4_16(0x0A00).value() == saturatingUnsignedAddition(S4_16(0x0300), S4_16(0x0700)).value()); +static_assert( S4_16(0x0F00).value() == saturatingUnsignedAddition(S4_16(0x0800), S4_16(0x0700)).value()); +static_assert( S4_16(0x0F00).value() == saturatingUnsignedAddition(S4_16(0x0800), S4_16(0x0800)).value()); + +TEST_CASE( + "saturatingUnsignedAddition", + "[swar][saturation]" +) { + CHECK(SWAR<4, u16>(0x0200).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0100), SWAR<4, u16>(0x0100)).value()); + CHECK(SWAR<4, u16>(0x0400).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0100), SWAR<4, u16>(0x0300)).value()); + CHECK(SWAR<4, u16>(0x0B00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0300)).value()); + CHECK(SWAR<4, u16>(0x0F00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0700)).value()); + CHECK(SWAR<4, u16>(0x0F00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0800)).value()); + CHECK(S4_32(0x0F0C'F000).value() == saturatingUnsignedAddition(S4_32(0x0804'F000), S4_32(0x0808'F000)).value()); +} diff --git a/test/swar/sublanes.cpp b/test/swar/sublanes.cpp 
new file mode 100644 index 00000000..80bc0dae --- /dev/null +++ b/test/swar/sublanes.cpp @@ -0,0 +1,56 @@ +#include "zoo/swar/SWARWithSubLanes.h" + +using namespace zoo; +using namespace zoo::swar; + +// 3 bits on msb side, 5 bits on lsb side. +using Lanes = SWARWithSubLanes<5, 3, u32>; +using S8u32 = SWAR<8, u32>; +static constexpr inline u32 all0 = 0; +static constexpr inline u32 allF = broadcast<8>(S8u32(0x0000'00FFul)).value(); + +static_assert(allF == Lanes(allF).value()); +static_assert(0xFFFF'FFFF == Lanes(allF).value()); + +static_assert(0xFFFF'FFE0 == Lanes(allF).least(0,0).value()); +static_assert(0xFFFF'FFE1 == Lanes(allF).least(1,0).value()); +static_assert(0xFFFF'E0FF == Lanes(allF).least(0,1).value()); +static_assert(0xFFFF'E1FF == Lanes(allF).least(1,1).value()); + +static_assert(0xFFE0'FFFF == Lanes(allF).least(0,2).value()); +static_assert(0xFFE1'FFFF == Lanes(allF).least(1,2).value()); +static_assert(0xE0FF'FFFF == Lanes(allF).least(0,3).value()); +static_assert(0xE1FF'FFFF == Lanes(allF).least(1,3).value()); + +static_assert(0xFFFF'FF1F == Lanes(allF).most(0,0).value()); +static_assert(0xFFFF'FF3F == Lanes(allF).most(1,0).value()); +static_assert(0xFFFF'1FFF == Lanes(allF).most(0,1).value()); +static_assert(0xFFFF'3FFF == Lanes(allF).most(1,1).value()); + +static_assert(0xFF1F'FFFF == Lanes(allF).most(0,2).value()); +static_assert(0xFF3F'FFFF == Lanes(allF).most(1,2).value()); +static_assert(0x1FFF'FFFF == Lanes(allF).most(0,3).value()); +static_assert(0x3FFF'FFFF == Lanes(allF).most(1,3).value()); + +static_assert(0x0000'001f == Lanes(all0).least(31, 0).most(0, 0).value()); +static_assert(0x0000'1f00 == Lanes(all0).least(31, 1).most(0, 1).value()); +static_assert(0x001f'0000 == Lanes(all0).least(31, 2).most(0, 2).value()); +static_assert(0x1f00'0000 == Lanes(all0).least(31, 3).most(0, 3).value()); + +static_assert(0x0000'00e0 == Lanes(all0).least(0, 0).most(31, 0).value()); +static_assert(0x0000'e000 == Lanes(all0).least(0, 1).most(31, 
1).value()); +static_assert(0x00e0'0000 == Lanes(all0).least(0, 2).most(31, 2).value()); +static_assert(0xe000'0000 == Lanes(all0).least(0, 3).most(31, 3).value()); + +static_assert(0x1F1F'1F1F == Lanes(allF).least().value()); +static_assert(0xE0E0'E0E0 == Lanes(allF).most().value()); + +static_assert(0x0000'001F == Lanes(allF).least(0).value()); +static_assert(0x0000'1F00 == Lanes(allF).least(1).value()); +static_assert(0x001F'0000 == Lanes(allF).least(2).value()); +static_assert(0x1F00'0000 == Lanes(allF).least(3).value()); + +static_assert(0x0000'00E0 == Lanes(allF).most(0).value()); +static_assert(0x0000'E000 == Lanes(allF).most(1).value()); +static_assert(0x00E0'0000 == Lanes(allF).most(2).value()); +static_assert(0xE000'0000 == Lanes(allF).most(3).value()); From 449de1f8300523707955ee9463c463536b1511cc Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:30:40 -0700 Subject: [PATCH 25/26] Tidy rm unused multiply more tests --- inc/zoo/swar/associative_iteration.h | 30 ---------------------------- test/swar/BasicOperations.cpp | 20 +++++++++++++++---- 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index cad59b4e..5028a121 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -411,36 +411,6 @@ constexpr auto associativeOperatorIterated_regressive( return result; } - -// What I don't understand is why this doesn't work? 
-template -constexpr auto multiply(T a , T b) { - auto operation = [](auto left, auto right, auto count) { - if (count) { - return left + right; - } else { - return left; - } - }; - - auto updateCount = [](auto count) { - return count << 1; - }; - - constexpr auto numBits = sizeof(T) * 8; - return associativeOperatorIterated_regressive( - a, // base - 0, // neutral - b, // count - 1, // forSquaring, pretty sure this is where i am not understanding - operation, // operation - numBits, // log2Count - updateCount // halver - ); -} - -// static_assert(multiply(3, 4) == 12, "multiply failed"); - template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( SWAR multiplicand, SWAR multiplier diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index b835c1d1..8a8ca464 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -2,6 +2,7 @@ #include "catch2/catch.hpp" +#include #include @@ -59,7 +60,7 @@ static_assert( static_assert(0b00000010000000110000010100000110 == 0x02'03'05'06); -TEST_CASE("Jamie's totally working exponentiation :D") { +TEST_CASE("Expontiation with 8-bit lane width (overflow unsafe)") { using S = SWAR<8, u32>; constexpr auto base = S::fromLaneLiterals({2, 3, 5, 6}); constexpr auto exponent = S::fromLaneLiterals({7, 4, 2, 3}); @@ -69,8 +70,19 @@ TEST_CASE("Jamie's totally working exponentiation :D") { CHECK(expected.value() == actual.value()); } +TEST_CASE("Expontiation with 16-bit lane width (overflow unsafe)") { + using S = SWAR<16, u64>; // Change to 16-bit lane width + constexpr auto base = S::fromLaneLiterals({10, 2, 7, 3}); + constexpr auto exponent = S::fromLaneLiterals({3, 5, 1, 4}); + constexpr auto expected = S::fromLaneLiterals({1000, 32, 7, 81}); + constexpr auto actual = exponentiation_OverflowUnsafe(base, exponent); + static_assert(expected.value() == actual.value()); + CHECK(expected.value() == actual.value()); } +}; + + #define HE(nbits, t, v0, v1) \ 
static_assert(horizontalEquality(\ SWAR(v0),\ @@ -358,7 +370,7 @@ TEST_CASE( "BooleanSWAR MSBtoLaneMask", "[swar]" ) { - // BooleanSWAR as a mask: + // BooleanSWAR as a mask: auto bswar =BooleanSWAR<4, u32>(0x0808'0000); auto mask = S4_32(0x0F0F'0000); CHECK(bswar.MSBtoLaneMask().value() == mask.value()); @@ -385,6 +397,6 @@ TEST_CASE( CHECK(SWAR<4, u16>(0x0400).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0100), SWAR<4, u16>(0x0300)).value()); CHECK(SWAR<4, u16>(0x0B00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0300)).value()); CHECK(SWAR<4, u16>(0x0F00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0700)).value()); - CHECK(SWAR<4, u16>(0x0F00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0800)).value()); - CHECK(S4_32(0x0F0C'F000).value() == saturatingUnsignedAddition(S4_32(0x0804'F000), S4_32(0x0808'F000)).value()); + CHECK(SWAR<4, u16>(0x0F00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0800)).value()); + CHECK(S4_32(0x0F0C'F000).value() == saturatingUnsignedAddition(S4_32(0x0804'F000), S4_32(0x0808'F000)).value()); } From 756a5ee8bc2c65a3d4a7fe6135273f19a3346e76 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:51:32 -0700 Subject: [PATCH 26/26] more tidy --- test/swar/BasicOperations.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 8a8ca464..06e6b58c 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -2,10 +2,6 @@ #include "catch2/catch.hpp" -#include -#include - - using namespace zoo; using namespace zoo::swar;