Skip to content

Commit e5ab9dc

Browse files
authored
Merge pull request #74 from thecppzoo/em/swar-expand-compress
Implementation of the "compress" operation (Intel's PEXT, "parallel extraction") on a per-lane basis
2 parents f129574 + d8c2875 commit e5ab9dc

File tree

4 files changed

+248
-30
lines changed

4 files changed

+248
-30
lines changed

.gitignore

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
# Vscode does not like to build outside of the source tree
2-
# (multiple glitches)
3-
4-
.vscode
5-
test/.vscode
6-
build
7-
.cache
1+
# Vscode does not like to build outside of the source tree
2+
# (multiple glitches)
3+
4+
.vscode
5+
test/.vscode
6+
build
7+
.cache

inc/zoo/swar/SWAR.h

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ namespace zoo { namespace swar {
1515
using u64 = uint64_t;
1616
using u32 = uint32_t;
1717
using u16 = uint16_t;
18-
using u8 = uint8_t;
18+
using u8 = std::uint8_t;
1919

2020
template<int LogNBits>
2121
constexpr uint64_t popcount(uint64_t a) noexcept {
@@ -58,7 +58,10 @@ struct SWAR {
5858
SignificantBitsCount = BitWidth - PaddingBitsCount,
5959
AllOnes = ~std::make_unsigned_t<T>{0} >> PaddingBitsCount,
6060
LeastSignificantBit = meta::BitmaskMaker<T, std::make_unsigned_t<T>{1}, NBits>::value,
61-
MostSignificantBit = LeastSignificantBit << (NBits - 1);
61+
MostSignificantBit = LeastSignificantBit << (NBits - 1),
62+
// Use LowerBits in favor of ~MostSignificantBit to not pollute
63+
// "don't care" bits when non-power-of-two bit lane sizes are supported
64+
LowerBits = MostSignificantBit - LeastSignificantBit;
6265

6366
SWAR() = default;
6467
constexpr explicit SWAR(T v): m_v(v) {}
@@ -129,20 +132,24 @@ struct SWAR {
129132

130133
/// \brief as the name suggests
131134
/// \param protectiveMask should clear the bits that would cross the lane.
132-
/// The bits that will be cleared are directly related to the count of shifts, it is natural to maintain
133-
/// the protective mask by the caller, otherwise, the mask will be computed on all invocations.
134-
/// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance.
135-
constexpr SWAR
136-
shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept {
137-
return SWAR{(*this & protectiveMask).value() << bitCount};
138-
}
139-
140-
/// \param protectiveMask should clear the bits that would cross the lane
141-
/// \sa shiftIntraLaneLeft
142-
constexpr SWAR
143-
shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept {
144-
return SWAR{(*this & protectiveMask).value() >> bitCount};
145-
}
135+
/// The bits that will be cleared are directly related to the count of
136+
/// shifts, it is natural to maintain the protective mask by the caller,
137+
/// otherwise, the mask would have to be computed in all invocations.
138+
/// We are not sure the optimizer would maintain this mask somewhere, if it
139+
/// were to recalculate it, it would be disastrous for performance
140+
/// \note the \c static_cast are necessary because of narrowing conversions
141+
#define SHIFT_INTRALANE_OP_X_LIST X(Left, <<) X(Right, >>)
142+
#define X(name, op) \
143+
constexpr SWAR \
144+
shiftIntraLane##name(int bitCount, SWAR protectiveMask) const noexcept { \
145+
T shiftC = static_cast<T>(bitCount); \
146+
auto V = (*this & protectiveMask).value(); \
147+
auto rv = static_cast<T>(V op shiftC); \
148+
return SWAR{rv}; \
149+
}
150+
SHIFT_INTRALANE_OP_X_LIST
151+
#undef X
152+
#undef SHIFT_INTRALANE_OP_X_LIST
146153

147154
constexpr SWAR
148155
multiply(T multiplier) const noexcept { return SWAR{m_v * multiplier}; }

inc/zoo/swar/associative_iteration.h

Lines changed: 207 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,42 @@
33

44
#include "zoo/swar/SWAR.h"
55

6+
//#define ZOO_DEVELOPMENT_DEBUGGING
7+
#ifdef ZOO_DEVELOPMENT_DEBUGGING
8+
#include <iostream>
9+
10+
inline std::ostream &binary(std::ostream &out, uint64_t input, int count) {
11+
while(count--) {
12+
out << (1 & input);
13+
input >>= 1;
14+
}
15+
return out;
16+
}
17+
18+
template<int NB, typename B>
19+
std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR<NB, B> s) {
20+
using S = zoo::swar::SWAR<NB, B>;
21+
auto shiftCounter = sizeof(B) * 8 / NB;
22+
out << "<|";
23+
auto v = s.value();
24+
do {
25+
binary(out, v, NB) << '|';
26+
27+
} while(--shiftCounter);
28+
return out << ">";
29+
}
30+
31+
#define ZOO_TO_STRING(a) #a
32+
// std::endl is needed within the context of debugging: flush the line
33+
#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) std::cout << '"' << (__VA_ARGS__) << "\", \"" << F << ':' << L << "\", \"" << ZOO_TO_STRING(__VA_ARGS__) << "\"" << std::endl;
34+
#define ZOO_TRACEABLE_EXPRESSION(...) ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__)
35+
36+
#else
37+
38+
#define ZOO_TRACEABLE_EXPRESSION(...) __VA_ARGS__
39+
40+
#endif
41+
642
namespace zoo::swar {
743

844
/// \note This code should be substituted by an application of "progressive" algebraic iteration
@@ -11,32 +47,196 @@ template<int NB, typename B>
1147
constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
1248
using S = SWAR<NB, B>;
1349
auto
14-
shiftClearingMask = S{~S::MostSignificantBit},
50+
shiftClearingMask = S{static_cast<B>(~S::MostSignificantBit)},
1551
doubling = input,
1652
result = S{0};
1753
auto
1854
bitsToXOR = NB,
1955
power = 1;
56+
57+
#define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
2058
for(;;) {
59+
ZTE(doubling);
2160
if(1 & bitsToXOR) {
22-
result = result ^ doubling;
23-
doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
61+
ZTE(result = result ^ doubling);
62+
ZTE(doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask));
2463
}
25-
bitsToXOR >>= 1;
64+
ZTE(bitsToXOR >>= 1);
2665
if(!bitsToXOR) { break; }
2766
auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
28-
doubling = doubling ^ shifted;
67+
ZTE(shifted);
68+
ZTE(doubling = doubling ^ shifted);
2969
// 01...1
3070
// 001...1
3171
// 00001...1
3272
// 000000001...1
3373
shiftClearingMask =
34-
shiftClearingMask & S{shiftClearingMask.value() >> power};
35-
power <<= 1;
74+
shiftClearingMask &
75+
S{static_cast<B>(shiftClearingMask.value() >> power)};
76+
ZTE(power <<= 1);
3677
}
78+
ZTE(input);
79+
#undef ZTE
3780
return S{result};
3881
}
3982

83+
/*
84+
Binary compress: A fascinating algorithm.
85+
86+
Warren (Hacker's Delight) believes Guy L. Steele is the author of the following
87+
binary compression operation, equivalent to Intel's BMI2 instruction PEXT, or
88+
"Parallel Extraction"
89+
90+
From a "mask", a selector of bits from an input, we want to put them together in
91+
the output.
92+
93+
For example's sake, this is the selector:
94+
Note: this follows the usual 'big endian' convention of denoting the most
95+
significant bit first:
96+
0001 0011 0111 0111 0110 1110 1100 1010
97+
Imagine the input is the 32-bit or 32-boolean variable expression
98+
abcd efgh ijkl mnop qrst uvxy zABC DEFG
99+
We want the selection
100+
d gh jkl nop rs uvx zA D F
101+
To be compressed into the output
102+
0000 0000 0000 00dg hjkl nopr suvx zADF
103+
104+
This algorithm will virtually calculate the count of positions that the selected
105+
bits travel to the right, by constructing the binary encoding of that count:
106+
It will identify the positions that will travel an odd number of positions to
107+
the right; these are the positions whose position-travel-count has the units bit set.
108+
It will then move those positions by one position to the right, and eliminate
109+
them from the yet-to-move positions. Because it eliminates the positions that
110+
would move an odd count, there remain only positions that move an even number
111+
of positions. Now it finds the positions that move an odd count of /pairs/ of
112+
positions, it moves them 2 positions. This is equivalent to finding the
113+
positions that would have the bit for 2 set in the count of positions to move
114+
right.
115+
Then an odd count of /quartets/ of positions, and moves them 4;
116+
8, 16, 32, ...
117+
118+
119+
Complete example (32 bits)
120+
Selection mask:
121+
0001 0011 0111 0111 0110 1110 1100 1010
122+
Input (each letter or variable is a boolean, that can have 0 or 1)
123+
abcd efgh ijkl mnop qrst uvxy zABC DEFG
124+
Selection (using spaces)
125+
d gh jkl nop rs uvx zA D F
126+
Desired result:
127+
dghjklnoprsuvxzADF
128+
129+
0000 1001 1011 1011 1011 0111 0110 0101 shiftLeft 1
130+
1111 0110 0100 0100 0100 1000 1001 1010 forParallelSuffix
131+
132+
10 1101 1101
133+
The same example again, this time worked through step by step:
134+
Complete example (32 bits)
135+
Selection mask:
136+
0001 0011 0111 0111 0110 1110 1100 1010
137+
Input (each letter or variable is a boolean, that can have 0 or 1)
138+
abcd efgh ijkl mnop qrst uvxy zABC DEFG
139+
Selection (using spaces)
140+
d gh jkl nop rs uvx zA D F
141+
Desired result:
142+
dghjklnoprsuvxzADF
143+
144+
0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
145+
1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask
146+
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1
147+
== groupsize of ~compressionMask
148+
This indicates the positions that have a 0 immediately to the right in
149+
compressionMask
150+
4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the
151+
current position in forParallelSuffix,
152+
last decimal digit
153+
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
154+
forParallelSuffix
155+
We have just identified the positions that need to move an odd number of
156+
positions. Filter them with positions with a bit set in compressionMask:
157+
0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
158+
---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of
159+
compressionMask by 1 == groupSize
160+
0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits
161+
that will move)
162+
---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize
163+
0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask.
164+
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
165+
forParallelSuffix
166+
1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved)
167+
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero
168+
immediately to their right)
169+
1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved =>
170+
had even zeroes to their right)
171+
At this point, we have removed from compressionMask the positions that moved an
172+
odd number of positions and moved them 1 position,
173+
then, we only keep positions that move an even number of positions.
174+
Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ...
175+
*/
176+
177+
template<int NB, typename B>
178+
constexpr SWAR<NB, B>
179+
compress(SWAR<NB, B> input, SWAR<NB, B> compressionMask) {
180+
// This solution uses the parallel suffix operation as a primary tool:
181+
// For every bit position it indicates an odd number of ones to the right,
182+
// including itself.
183+
// Because we want to detect the "oddness" of groups of zeroes to the right,
184+
// we flip the compression mask. To not count the bit position itself,
185+
// we shift by one.
186+
#define ZTE(...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
187+
ZTE(input);
188+
ZTE(compressionMask);
189+
using S = SWAR<NB, B>;
190+
auto result = input & compressionMask;
191+
auto groupSize = 1;
192+
auto
193+
shiftLeftMask = S{S::LowerBits},
194+
shiftRightMask = S{S::LowerBits << 1};
195+
ZTE(~compressionMask);
196+
auto forParallelSuffix = // this is called "mk" in the book
197+
(~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask);
198+
ZTE(forParallelSuffix);
199+
// note: forParallelSuffix denotes positions with a zero
200+
// immediately to the right in 'compressionMask'
201+
for(;;) {
202+
ZTE(groupSize);
203+
ZTE(shiftLeftMask);
204+
ZTE(shiftRightMask);
205+
ZTE(result);
206+
auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book
207+
parallelSuffix(forParallelSuffix);
208+
ZTE(oddCountOfGroupsOfZerosToTheRight);
209+
// compress the bits just identified in both the result and the mask
210+
auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight;
211+
ZTE(moving);
212+
compressionMask =
213+
(compressionMask ^ moving) | // clear the moving
214+
moving.shiftIntraLaneRight(groupSize, shiftRightMask);
215+
ZTE(compressionMask);
216+
auto movingFromInput = result & moving;
217+
result =
218+
(result ^ movingFromInput) | // clear the moving from the result
219+
movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask);
220+
auto nextGroupSize = groupSize << 1;
221+
if(NB <= nextGroupSize) {
222+
break;
223+
}
224+
auto evenCountOfGroupsOfZerosToTheRight =
225+
~oddCountOfGroupsOfZerosToTheRight;
226+
forParallelSuffix =
227+
forParallelSuffix & evenCountOfGroupsOfZerosToTheRight;
228+
auto newShiftLeftMask =
229+
shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask);
230+
shiftRightMask =
231+
shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask);
232+
shiftLeftMask = newShiftLeftMask;
233+
groupSize = nextGroupSize;
234+
}
235+
ZTE(result);
236+
#undef ZTE
237+
return result;
238+
}
239+
40240
/// \todo because of the desirability of "accumulating" the XORs at the MSB,
41241
/// the parallel suffix operation is more suitable.
42242
template<int NB, typename B>

test/swar/BasicOperations.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,17 @@ TEST_CASE(
101101
}
102102
}
103103

104+
TEST_CASE("Compress/Expand", "[swar]") {
105+
unsigned
106+
Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010,
107+
ToMove = 0b0101'0101'0101'0101'0101'0101'0101'0101,
108+
// Selection: 1 01 101 101 10 010 01 0 0
109+
result = 0b0001'0'1'1'0'1'1'0'1'10'0'10'0'1'0'0;
110+
using S1_32 = SWAR<32, uint32_t>;
111+
auto q = compress(S1_32{ToMove}, S1_32{Mask});
112+
CHECK(result == q.value());
113+
}
114+
104115
static_assert(1 == popcount<5>(0x100ull));
105116
static_assert(1 == popcount<5>(0x010ull));
106117
static_assert(1 == popcount<5>(0x001ull));

0 commit comments

Comments
 (0)