Skip to content

Commit efeb812

Browse files
author
Eddie
committed
Compress tested successfully
1 parent 87ddc04 commit efeb812

File tree

3 files changed

+143
-122
lines changed

3 files changed

+143
-122
lines changed

inc/zoo/swar/SWAR.h

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -132,22 +132,24 @@ struct SWAR {
132132

133133
/// \brief as the name suggests
134134
/// \param protectiveMask should clear the bits that would cross the lane.
135-
/// The bits that will be cleared are directly related to the count of shifts, it is natural to maintain
136-
/// the protective mask by the caller, otherwise, the mask will be computed on all invocations.
137-
/// We are not sure the optimizer would maintain this mask somewhere, if it was to recalculate it it would be disastrous for performance.
138-
constexpr SWAR
139-
shiftIntraLaneLeft(int bitCount, SWAR protectiveMask) const noexcept {
140-
T shiftC = static_cast<T>(bitCount); // could be a narrowing conversion
141-
auto V = (*this & protectiveMask).value();
142-
return SWAR{static_cast<T>(V << shiftC)};
143-
}
144-
145-
/// \param protectiveMask should clear the bits that would cross the lane
146-
/// \sa shiftIntraLaneLeft
147-
constexpr SWAR
148-
shiftIntraLaneRight(int bitCount, SWAR protectiveMask) const noexcept {
149-
return SWAR{(*this & protectiveMask).value() >> T{bitCount}};
150-
}
135+
/// The bits that will be cleared are directly related to the count of
136+
/// shifts, it is natural to maintain the protective mask by the caller,
137+
/// otherwise, the mask would have to be computed in all invocations.
138+
/// We are not sure the optimizer would maintain this mask somewhere, if it
139+
/// were to recalculate it, it would be disastrous for performance
140+
/// \note the \c static_cast conversions are necessary because of narrowing conversions
141+
#define SHIFT_INTRALANE_OP_X_LIST X(Left, <<) X(Right, >>)
142+
#define X(name, op) \
143+
constexpr SWAR \
144+
shiftIntraLane##name(int bitCount, SWAR protectiveMask) const noexcept { \
145+
T shiftC = static_cast<T>(bitCount); \
146+
auto V = (*this & protectiveMask).value(); \
147+
auto rv = static_cast<T>(V op shiftC); \
148+
return SWAR{rv}; \
149+
}
150+
SHIFT_INTRALANE_OP_X_LIST
151+
#undef X
152+
#undef SHIFT_INTRALANE_OP_X_LIST
151153

152154
T m_v;
153155
};

inc/zoo/swar/associative_iteration.h

Lines changed: 119 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -80,68 +80,42 @@ constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
8080
return S{result};
8181
}
8282

83-
template<int NB, typename B>
84-
constexpr SWAR<NB, B>
85-
compress(SWAR<NB, B> input, SWAR<NB, B> compressionMask) {
86-
// the only bits turned on in the result are the bits set in the input that
87-
// are moved down (shifted right)
88-
89-
// Following Henry S. Warren Jr.'s Hacker's Delight, Section 7-4
90-
// The compression moves bits right as many positions as there are zeroes
91-
// in the mask "below" it (or to the right).
92-
// We can count the zeroes in the mask in a logarithmic way:
93-
// First detect an odd count of zeroes, move those bits in the input one
94-
// position down (right).
95-
// Then an odd count of *pairs* of zeroes, moving them 2 positions right.
96-
// Then an odd count of *quartets* (nibbles) of zeroes, shifting them 4
97-
// right.
98-
// An odd count of octects (bytes) of zeroes, shifting right 8,
99-
// Odd count of 16 zeroes, >> 16
100-
// ...
101-
//
102-
// This solution will use the parallel suffix operation as a primary tool:
103-
// For every bit postion it indicates an odd number of ones to the right,
104-
// including itself.
105-
// Because we want to detect the "oddity" of groups of zeroes to the right,
106-
// we flip the compression mask. To not count the bit position itself,
107-
// we shift by one.
108-
#define ZTE ZOO_TRACEABLE_EXPRESSION
109-
ZTE(input);
110-
ZTE(compressionMask);
111-
using S = SWAR<NB, B>;
112-
auto result = input;
113-
auto groupSize = 1;
114-
auto shiftLeftMask = S{S::LowerBits};
115-
auto shiftRightMask = S{S::LowerBits << 1};
116-
auto forParallelSuffix = // this is called "mk" in the book
117-
(~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask);
118-
ZTE(forParallelSuffix);
119-
// note: forParallelSuffix denotes positions with a zero
120-
// immediately to the right in the 'mask'
121-
auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book
122-
parallelSuffix(forParallelSuffix);
123-
ZTE(oddCountOfGroupsOfZerosToTheRight);
124-
// compress the bits just identified in both the result and the mask
125-
auto movingFromMask = compressionMask & oddCountOfGroupsOfZerosToTheRight;
126-
ZTE(movingFromMask);
127-
auto movingFromInput = result & oddCountOfGroupsOfZerosToTheRight;
128-
/*compressionMask =
129-
(compressionMask ^ movingFromMask) |
130-
movingFromMask.shiftIntraLaneRight(groupSize, shiftRightMask);*/
131-
result =
132-
(result ^ movingFromInput) |
133-
movingFromInput.shiftIntraLaneLeft(groupSize, shiftRightMask);
134-
135-
auto evenCountOfGroupsOfZerosToTheRight =
136-
~oddCountOfGroupsOfZerosToTheRight;
137-
138-
//auto moved = toMove.shiftIntraLaneRight(1, ~S{S::LeastSignificantBit});
139-
//result = result ^ moved;
140-
return result;
141-
#undef ZTE
142-
}
143-
14483
/*
84+
Binary compress: A fascinating algorithm.
85+
86+
Warren (Hacker's Delight) believes Guy L. Steele is the author of the following
87+
binary compression operation, equivalent to Intel's BMI2 instruction PEXT of
88+
"Parallel Bits Extract"
89+
90+
From a "mask", a selector of bits from an input, we want to put them together in
91+
the output.
92+
93+
For example's sake, this is the selector:
94+
Note: this follows the usual 'big endian' convention of denoting the most
95+
significant bit first:
96+
0001 0011 0111 0111 0110 1110 1100 1010
97+
Imagine the input is the 32-bit or 32-boolean variable expression
98+
abcd efgh ijkl mnop qrst uvxy zABC DEFG
99+
We want the selection
100+
d gh jkl nop rs uvx zA D F
101+
To be compressed into the output
102+
0000 0000 0000 00dg hjkl nopr suvx zADF
103+
104+
This algorithm will virtually calculate the count of positions that the selected
105+
bits travel to the right, by constructing the binary encoding of that count:
106+
It will identify the positions that will travel an odd number of positions to
107+
the right; these are those whose position-travel-count has its units bit set.
108+
It will then move those positions by one position to the right, and eliminate
109+
them from the yet-to-move positions. Because it eliminates the positions that
110+
would move an odd count, there remain only positions that move an even number
111+
of positions. Now it finds the positions that move an odd count of /pairs/ of
112+
positions, it moves them 2 positions. This is equivalent to finding the
113+
positions that would have the bit for 2 set in the count of positions to move
114+
right.
115+
Then an odd count of /quartets/ of positions, and moves them 4;
116+
8, 16, 32, ...
117+
118+
145119
Complete example (32 bits)
146120
Selection mask:
147121
0001 0011 0111 0111 0110 1110 1100 1010
@@ -169,51 +143,97 @@ Desired result:
169143
170144
0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
171145
1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask
172-
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 == groupsize of ~compressionMask
173-
This indicates the positions that have a 0 immediately to the right in compressionMask
174-
4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the current position in forParallelSuffix, last decimal digit
175-
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix
176-
we have just identified the positions that need to move an odd number of positions
177-
filter those positions to positions that have a bit set in the compressionMask:
146+
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1
147+
== groupsize of ~compressionMask
148+
This indicates the positions that have a 0 immediately to the right in
149+
compressionMask
150+
4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the
151+
current position in forParallelSuffix,
152+
last decimal digit
153+
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
154+
forParallelSuffix
155+
We have just identified the positions that need to move an odd number of
156+
positions. Filter them with positions with a bit set in compressionMask:
178157
0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
179-
---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of the compressionMask by 1 == groupSize
180-
0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits that will move)
158+
---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of
159+
compressionMask by 1 == groupSize
160+
0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits
161+
that will move)
181162
---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize
182163
0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask.
183-
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix
164+
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
165+
forParallelSuffix
184166
1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved)
185-
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero immediately to their right)
186-
1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved => had even zeroes to their right)
187-
At this point, we have removed from compressionMask the positions that moved an odd number of positions and moved them 1 position,
167+
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero
168+
immediately to their right)
169+
1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved =>
170+
had even zeroes to their right)
171+
At this point, we have removed from compressionMask the positions that moved an
172+
odd number of positions and moved them 1 position,
188173
then, we only keep positions that move an even number of positions.
189-
Now, we will repeat these steps but for groups of two zeroes
190-
191-
192-
Binary compress: A fascinating algorithm.
193-
Warren (Hacker's Delight) believes Guy L. Steele is the author of the following binary compression algorithm:
194-
From a "mask", a selector of bits from an input, we want to put them together in the output.
195-
For example's sake, this is the selector:
196-
Note: this follows the usual 'big endian' convention of denoting the most significant bit first
197-
0001 0011 0111 0111 0110 1110 1100 1010
198-
Imagine the input is the 32-bit or 32-boolean variable expression
199-
abcd efgh ijkl mnop qrst uvxy zABC DEFG
200-
We want the selection
201-
d gh jkl nop rs uvx zA D F
202-
To be compressed into the output
203-
0000 0000 0000 00dg hjkl nopr suvx zADF
204-
This algorithm will virtually calculate the count of positions that the selected bits travel to the right,
205-
by constructing the binary encoding of that count:
206-
It will identify the positions that will travel an odd number of positions to the right, these are those
207-
whose position-travel-count have the units set.
208-
It will move those positions by one position to the right, and eliminate them from the yet-to-move positions.
209-
Because it eliminates the positions that would move an odd count, there remains only positions that move
210-
an even number of positions. Now it finds the positions that move an odd count of /pairs/ of positions,
211-
and moves them 2 positions.
212-
then an odd count of /quartets/ of positions, and moves them 4;
213-
8, 16, 32, ...
214-
174+
Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ...
215175
*/
216176

177+
template<int NB, typename B>
178+
constexpr SWAR<NB, B>
179+
compress(SWAR<NB, B> input, SWAR<NB, B> compressionMask) {
180+
// This solution uses the parallel suffix operation as a primary tool:
181+
// For every bit position it indicates an odd number of ones to the right,
182+
// including itself.
183+
// Because we want to detect the "oddness" of groups of zeroes to the right,
184+
// we flip the compression mask. To not count the bit position itself,
185+
// we shift by one.
186+
// #define ZTE ZOO_TRACEABLE_EXPRESSION
187+
ZTE(input);
188+
ZTE(compressionMask);
189+
using S = SWAR<NB, B>;
190+
auto result = input & compressionMask;
191+
auto groupSize = 1;
192+
auto
193+
shiftLeftMask = S{S::LowerBits},
194+
shiftRightMask = S{S::LowerBits << 1};
195+
ZTE(~compressionMask);
196+
auto forParallelSuffix = // this is called "mk" in the book
197+
(~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask);
198+
ZTE(forParallelSuffix);
199+
// note: forParallelSuffix denotes positions with a zero
200+
// immediately to the right in 'compressionMask'
201+
do {
202+
ZTE(groupSize);
203+
ZTE(shiftLeftMask);
204+
ZTE(shiftRightMask);
205+
ZTE(result);
206+
auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book
207+
parallelSuffix(forParallelSuffix);
208+
ZTE(oddCountOfGroupsOfZerosToTheRight);
209+
// compress the bits just identified in both the result and the mask
210+
auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight;
211+
ZTE(moving);
212+
compressionMask =
213+
(compressionMask ^ moving) | // clear the moving
214+
moving.shiftIntraLaneRight(groupSize, shiftRightMask);
215+
ZTE(compressionMask);
216+
auto movingFromInput = result & moving;
217+
result =
218+
(result ^ movingFromInput) | // clear the moving from the result
219+
movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask);
220+
221+
auto evenCountOfGroupsOfZerosToTheRight =
222+
~oddCountOfGroupsOfZerosToTheRight;
223+
forParallelSuffix =
224+
forParallelSuffix & evenCountOfGroupsOfZerosToTheRight;
225+
auto newShiftLeftMask =
226+
shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask);
227+
shiftRightMask =
228+
shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask);
229+
shiftLeftMask = newShiftLeftMask;
230+
groupSize <<= 1;
231+
} while(groupSize < NB);
232+
ZTE(result);
233+
#undef ZTE
234+
return result;
235+
}
236+
217237

218238
/// \todo because of the desirability of "accumuating" the XORs at the MSB,
219239
/// the parallel suffix operation is more suitable.

test/swar/BasicOperations.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,15 +102,14 @@ TEST_CASE(
102102
}
103103

104104
TEST_CASE("Compress/Expand", "[swar]") {
105-
unsigned Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010;
106-
unsigned ToMove = 0x55555555;
105+
unsigned
106+
Mask = 0b0001'0011'0111'0111'0110'1110'1100'1010,
107+
ToMove = 0b0101'0101'0101'0101'0101'0101'0101'0101,
108+
// Selection: 1 01 101 101 10 010 01 0 0
109+
result = 0b0001'0'1'1'0'1'1'0'1'10'0'10'0'1'0'0;
107110
using S1_32 = SWAR<32, uint32_t>;
108111
auto q = compress(S1_32{ToMove}, S1_32{Mask});
109-
CHECK(0 != q.value());
110-
using S2_8 = SWAR<2, uint8_t>;
111-
auto r = compress(S2_8{0b10'10'10'10}, S2_8{0b11'10'00'00});
112-
S2_8 expected{0b10'01'00'00};
113-
CHECK(expected.value() == r.value());
112+
CHECK(result == q.value());
114113
}
115114

116115
static_assert(1 == popcount<5>(0x100ull));

0 commit comments

Comments
 (0)