Merge pull request #74 from thecppzoo/em/swar-expand-compress

thecppzoo · web-flow · commit e5ab9dc4b9fe · 2024-02-24T10:45:11.000-08:00
Implementation of the "compress" operation (Intel's PEXT, "parallel extraction") on a per-lane basis
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,7 @@
-# Vscode does not like to build outside of the source tree
-# (multiple glitches)
-
-.vscode
-test/.vscode
-build
-.cache
+# Vscode does not like to build outside of the source tree
+# (multiple glitches)
+
+.vscode
+test/.vscode
+build
+.cache
diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h
@@ -15,7 +15,7 @@ namespace zoo { namespace swar {
 using u64 = uint64_t;
 using u32 = uint32_t;
 using u16 = uint16_t;
-using u8 = uint8_t;
+using u8 = std::uint8_t;
 
 template<int LogNBits>
 constexpr uint64_t popcount(uint64_t a) noexcept {
@@ -58,7 +58,10 @@ struct SWAR {
         SignificantBitsCount = BitWidth - PaddingBitsCount,
         AllOnes = ~std::make_unsigned_t<T>{0} >> PaddingBitsCount,
         LeastSignificantBit = meta::BitmaskMaker<T, std::make_unsigned_t<T>{1}, NBits>::value,
-        MostSignificantBit = LeastSignificantBit << (NBits - 1);
+        MostSignificantBit = LeastSignificantBit << (NBits - 1),
+        // Use LowerBits in favor of ~MostSignificantBit to not pollute
+        // "don't care" bits when non-power-of-two bit lane sizes are supported
+        LowerBits = MostSignificantBit - LeastSignificantBit;
 
     SWAR() = default;
     constexpr explicit SWAR(T v): m_v(v) {}
@@ -129,20 +132,24 @@ struct SWAR {
 
     /// \brief as the name suggests
     /// \param protectiveMask should clear the bits that would cross the lane.
-    /// The bits that will be cleared are directly related to the count of shifts, it is natural to maintain
-    /// the protective mask by the caller, otherwise, the mask will be computed on all invocations.
-    /// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance.
-    constexpr SWAR
-    shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept {
-        return SWAR{(*this & protectiveMask).value() << bitCount};
-    }
-
-    /// \param protectiveMask should clear the bits that would cross the lane
-    /// \sa shiftIntraLaneLeft
-    constexpr SWAR
-    shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept {
-        return SWAR{(*this & protectiveMask).value() >> bitCount};
-    }
+    /// The bits that will be cleared are directly related to the count of
+    /// shifts, it is natural to maintain the protective mask by the caller,
+    /// otherwise, the mask would have to be computed in all invocations.
+    /// We are not sure the optimizer would maintain this mask somewhere, if it
+    /// were to recalculate it, it would be disastrous for performance
+    /// \note the \c static_cast are necessary because of narrowing conversions
+    #define SHIFT_INTRALANE_OP_X_LIST X(Left, <<) X(Right, >>)
+    #define X(name, op) \
+        constexpr SWAR \
+        shiftIntraLane##name(int bitCount, SWAR protectiveMask) const noexcept { \
+            T shiftC = static_cast<T>(bitCount); \
+            auto V = (*this & protectiveMask).value(); \
+            auto rv = static_cast<T>(V op shiftC); \
+            return SWAR{rv}; \
+        }
+    SHIFT_INTRALANE_OP_X_LIST
+    #undef X
+    #undef SHIFT_INTRALANE_OP_X_LIST
 
     constexpr SWAR
     multiply(T multiplier) const noexcept { return SWAR{m_v * multiplier}; }
diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h
@@ -3,6 +3,42 @@
 
 #include "zoo/swar/SWAR.h"
 
+//#define ZOO_DEVELOPMENT_DEBUGGING
+#ifdef ZOO_DEVELOPMENT_DEBUGGING
+#include <iostream>
+
+inline std::ostream &binary(std::ostream &out, uint64_t input, int count) {
+    while(count--) {
+        out << (1 & input);
+        input >>= 1;
+    }
+    return out;
+}
+
+template<int NB, typename B>
+std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR<NB, B> s) {
+    // Debugging aid: one NB-bit group per lane, least significant lane first
+    auto shiftCounter = sizeof(B) * 8 / NB;
+    out << "<|";
+    auto v = s.value();
+    do {
+        binary(out, v, NB) << '|';
+        if(1 < shiftCounter) { v >>= NB; } // next lane; never shift by the full width
+    } while(--shiftCounter);
+    return out << ">";
+}
+
+#define ZOO_TO_STRING(a) #a
+// std::endl is needed within the context of debugging: flush the line
+#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) std::cout << '"' << (__VA_ARGS__) << "\", \"" <<  F << ':' << L << "\", \"" << ZOO_TO_STRING(__VA_ARGS__) << "\"" << std::endl;
+#define ZOO_TRACEABLE_EXPRESSION(...) ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__)
+
+#else
+
+#define ZOO_TRACEABLE_EXPRESSION(...) __VA_ARGS__
+
+#endif
+
 namespace zoo::swar {
 
 /// \note This code should be substituted by an application of "progressive" algebraic iteration
@@ -11,32 +47,196 @@ template<int NB, typename B>
 constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
     using S = SWAR<NB, B>;
     auto
-        shiftClearingMask = S{~S::MostSignificantBit},
+        shiftClearingMask = S{static_cast<B>(~S::MostSignificantBit)},
         doubling = input,
         result = S{0};
     auto
         bitsToXOR = NB,
         power = 1;
+
+    #define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
     for(;;) {
+        ZTE(doubling);
         if(1 & bitsToXOR) {
-            result = result ^ doubling;
-            doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
+            ZTE(result = result ^ doubling);
+            ZTE(doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask));
         }
-        bitsToXOR >>= 1;
+        ZTE(bitsToXOR >>= 1);
         if(!bitsToXOR) { break; }
         auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
-        doubling = doubling ^ shifted;
+        ZTE(shifted);
+        ZTE(doubling = doubling ^ shifted);
         // 01...1
         // 001...1
         // 00001...1
         // 000000001...1
         shiftClearingMask =
-            shiftClearingMask & S{shiftClearingMask.value() >> power};
-        power <<= 1;
+            shiftClearingMask &
+                S{static_cast<B>(shiftClearingMask.value() >> power)};
+        ZTE(power <<= 1);
     }
+    ZTE(input);
+    #undef ZTE
     return S{result};
 }
 
+/*
+Binary compress: A fascinating algorithm.
+
+Warren (Hacker's Delight) believes Guy L. Steele is the author of the following
+binary compression operation, equivalent to Intel's BMI2 instruction PEXT of
+"Parallel Extraction"
+
+From a "mask", a selector of bits from an input, we want to put them together in
+the output.
+
+For example's sake, this is the selector:
+Note: this follows the usual 'big endian' convention of denoting the most
+significant bit first:
+0001 0011 0111 0111 0110 1110 1100 1010
+Imagine the input is the 32-bit or 32-boolean variable expression
+abcd efgh ijkl mnop qrst uvxy zABC DEFG
+We want the selection
+   d   gh  jkl  nop  rs  uvx  zA   D F
+To be compressed into the output
+0000 0000 0000 00dg hjkl nopr suvx zADF
+
+This algorithm will virtually calculate the count of positions that the selected
+bits travel to the right, by constructing the binary encoding of that count:
+It will identify the positions that will travel an odd number of positions to
+the right, these are those whose position-travel-count have the units set.
+It will then move those positions by one position to the right, and eliminate
+them from the yet-to-move positions.  Because it eliminates the positions that
+would move an odd count, there remains only positions that move an even number
+of positions.  Now it finds the positions that move an odd count of /pairs/ of
+positions, it moves them 2 positions.  This is equivalent to finding the
+positions that would have the bit for 2 set in the count of positions to move
+right.
+Then an odd count of /quartets/ of positions, and moves them 4;
+8, 16, 32, ...
+
+
+Complete example (32 bits)
+Selection mask:
+0001 0011 0111 0111 0110 1110 1100 1010
+Input (each letter or variable is a boolean, that can have 0 or 1)
+abcd efgh ijkl mnop qrst uvxy zABC DEFG
+Selection (using spaces)
+   d   gh  jkl  nop  rs  uvx  zA   D F
+Desired result:
+                     dghjklnoprsuvxzADF
+
+0000 1001 1011 1011 1011 0111 0110 0101 shiftLeft 1
+1111 0110 0100 0100 0100 1000 1001 1010 forParallelSuffix
+
+                           10 1101 1101
+(The same example, worked out step by step:)
+Complete example (32 bits)
+Selection mask:
+0001 0011 0111 0111 0110 1110 1100 1010
+Input (each letter or variable is a boolean, that can have 0 or 1)
+abcd efgh ijkl mnop qrst uvxy zABC DEFG
+Selection (using spaces)
+   d   gh  jkl  nop  rs  uvx  zA   D F
+Desired result:
+                     dghjklnoprsuvxzADF
+
+0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
+1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask
+1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1
+                                            == groupsize of ~compressionMask
+This indicates the positions that have a 0 immediately to the right in
+                                            compressionMask
+3211 0999 8888 7777 6665 5554 4432 2110 number of 1s at and to the right of the
+                                          current position in forParallelSuffix,
+                                          last decimal digit
+1011 0111 0000 1111 0001 1110 0010 0110 mp == parallel suffix of
+                                            forParallelSuffix
+We have just identified the positions that need to move an odd number of
+positions.  Filter them with positions with a bit set in compressionMask:
+0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
+---1 --11 ---- -111 ---- 111- ---- --1- mv == move (compress) these bits of
+                                            compressionMask by 1 == groupSize
+0000 0000 0111 0000 0110 0000 1100 1000 mv ^ compressionMask (clear the bits
+                                            that will move)
+---- 1--1 1--- --11 1--- -111 ---- ---1 mv >> 1 (1 == groupSize)
+0000 1001 1111 0011 1110 0111 1100 1001 pseudo-compressed compressionMask.
+1011 0111 0000 1111 0001 1110 0010 0110 mp == parallel suffix of
+                                            forParallelSuffix
+0100 1000 1111 0000 1110 0001 1101 1001 ~mp == ~parallel suffix (bits not moved)
+1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero
+                                            immediately to their right)
+0100 1000 0001 0000 0010 0000 0100 1000 new forParallelSuffix (also not moved =>
+                                                had even zeroes to their right)
+At this point, we have removed from compressionMask the positions that moved an
+odd number of positions and moved them 1 position,
+then, we only keep positions that move an even number of positions.
+Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ...
+*/
+
+template<int NB, typename B>
+constexpr SWAR<NB, B>
+compress(SWAR<NB, B> input, SWAR<NB, B> compressionMask) {
+    // Per-lane PEXT: gathers the bits of 'input' selected by 'compressionMask'
+    // at the least significant end of each lane, preserving their order.
+    // Primary tool: parallelSuffix, which flags every bit position that has an
+    // odd number of ones at or to its right.  To detect the "oddness" of the
+    // groups of zeroes to the right we flip the compression mask; to not
+    // count the bit position itself we shift left by one.
+    #define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
+    ZTE(input);
+    ZTE(compressionMask);
+    using S = SWAR<NB, B>;
+    auto result = input & compressionMask; // discard the bits not selected
+    auto groupSize = 1; // distance moved in the current round: 1, 2, 4, ...
+    auto
+        shiftLeftMask = S{S::LowerBits}, // protective mask for left shifts
+        shiftRightMask = S{S::LowerBits << 1}; // protective mask for right shifts
+    ZTE(~compressionMask);
+    auto forParallelSuffix = // this is called "mk" in Hacker's Delight
+        (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask);
+    ZTE(forParallelSuffix);
+        // note: forParallelSuffix denotes positions with a zero
+        // immediately to the right in 'compressionMask'
+    for(;;) {
+        ZTE(groupSize);
+        ZTE(shiftLeftMask);
+        ZTE(shiftRightMask);
+        ZTE(result);
+        auto oddCountOfGroupsOfZerosToTheRight =  // called "mp" in the book
+            parallelSuffix(forParallelSuffix);
+        ZTE(oddCountOfGroupsOfZerosToTheRight);
+        // compress the bits just identified, in both the result and the mask
+        auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight;
+        ZTE(moving);
+        compressionMask =
+            (compressionMask ^ moving) | // clear the moving bits
+            moving.shiftIntraLaneRight(groupSize, shiftRightMask);
+        ZTE(compressionMask);
+        auto movingFromInput = result & moving;
+        result =
+            (result ^ movingFromInput) | // clear the moving bits from the result
+            movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask);
+        auto nextGroupSize = groupSize << 1;
+        if(NB <= nextGroupSize) { // a move of nextGroupSize would not fit in a lane
+            break;
+        }
+        auto evenCountOfGroupsOfZerosToTheRight =
+            ~oddCountOfGroupsOfZerosToTheRight;
+        forParallelSuffix = // keep only the positions that did not move
+            forParallelSuffix & evenCountOfGroupsOfZerosToTheRight;
+        auto newShiftLeftMask = // both protective masks widen by groupSize
+            shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask);
+        shiftRightMask =
+            shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask);
+        shiftLeftMask = newShiftLeftMask;
+        groupSize = nextGroupSize;
+    }
+    ZTE(result);
+    #undef ZTE
+    return result;
+}
+
 /// \todo because of the desirability of "accumuating" the XORs at the MSB,
 /// the parallel suffix operation is more suitable.
 template<int NB, typename B>
diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp
@@ -101,6 +101,17 @@ TEST_CASE(
     }
 }
 
+TEST_CASE("Compress/Expand", "[swar]") {
+    unsigned
+        Mask =   0b0001'0011'0111'0111'0110'1110'1100'1010,
+        ToMove = 0b0101'0101'0101'0101'0101'0101'0101'0101,
+        // Selection: 1   01  101  101  10  010  01   0 0
+        result = 0b0001'0'1'1'0'1'1'0'1'10'0'10'0'1'0'0;
+    using S1_32 = SWAR<32, uint32_t>;
+    auto q = compress(S1_32{ToMove}, S1_32{Mask});
+    CHECK(result == q.value());
+}
+
 static_assert(1 == popcount<5>(0x100ull));
 static_assert(1 == popcount<5>(0x010ull));
 static_assert(1 == popcount<5>(0x001ull));