@@ -80,68 +80,42 @@ constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
8080 return S{result};
8181}
8282
83- template <int NB, typename B>
84- constexpr SWAR<NB, B>
85- compress (SWAR<NB, B> input, SWAR<NB, B> compressionMask) {
86- // the only bits turned on in the result are the bits set in the input that
87- // are moved down (shifted right)
88-
89- // Following Henry S. Warren Jr.'s Hacker's Delight, Section 7-4
90- // The compression moves bits right as many positions as there are zeroes
91- // in the mask "below" it (or to the right).
92- // We can count the zeroes in the mask in a logarithmic way:
93- // First detect an odd count of zeroes, move those bits in the input one
94- // position down (right).
95- // Then an odd count of *pairs* of zeroes, moving them 2 positions right.
96- // Then an odd count of *quartets* (nibbles) of zeroes, shifting them 4
97- // right.
98- // An odd count of octets (bytes) of zeroes, shifting right 8,
99- // Odd count of 16 zeroes, >> 16
100- // ...
101- //
102- // This solution will use the parallel suffix operation as a primary tool:
103- // For every bit position it indicates an odd number of ones to the right,
104- // including itself.
105- // Because we want to detect the "oddity" of groups of zeroes to the right,
106- // we flip the compression mask. To not count the bit position itself,
107- // we shift by one.
108- #define ZTE ZOO_TRACEABLE_EXPRESSION
109- ZTE (input);
110- ZTE (compressionMask);
111- using S = SWAR<NB, B>;
112- auto result = input;
113- auto groupSize = 1 ;
114- auto shiftLeftMask = S{S::LowerBits};
115- auto shiftRightMask = S{S::LowerBits << 1 };
116- auto forParallelSuffix = // this is called "mk" in the book
117- (~compressionMask).shiftIntraLaneLeft (groupSize, shiftLeftMask);
118- ZTE (forParallelSuffix);
119- // note: forParallelSuffix denotes positions with a zero
120- // immediately to the right in the 'mask'
121- auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book
122- parallelSuffix (forParallelSuffix);
123- ZTE (oddCountOfGroupsOfZerosToTheRight);
124- // compress the bits just identified in both the result and the mask
125- auto movingFromMask = compressionMask & oddCountOfGroupsOfZerosToTheRight;
126- ZTE (movingFromMask);
127- auto movingFromInput = result & oddCountOfGroupsOfZerosToTheRight;
128- /* compressionMask =
129- (compressionMask ^ movingFromMask) |
130- movingFromMask.shiftIntraLaneRight(groupSize, shiftRightMask);*/
131- result =
132- (result ^ movingFromInput) |
133- movingFromInput.shiftIntraLaneLeft (groupSize, shiftRightMask); // NOTE(review): shifts left, while the comment at the top says bits move right -- confirm the intended direction
134-
135- auto evenCountOfGroupsOfZerosToTheRight =
136- ~oddCountOfGroupsOfZerosToTheRight; // NOTE(review): not read by any later statement in this version
137-
138- // auto moved = toMove.shiftIntraLaneRight(1, ~S{S::LeastSignificantBit});
139- // result = result ^ moved;
140- return result;
141- #undef ZTE
142- }
143-
14483/*
84+ Binary compress: A fascinating algorithm.
85+
86+ Warren (Hacker's Delight) believes Guy L. Steele is the author of the following
87+ binary compression operation, equivalent to Intel's BMI2 instruction PEXT, or
88+ "Parallel Bits Extract".
89+
90+ From a "mask", a selector of bits from an input, we want to put them together in
91+ the output.
92+
93+ For example's sake, this is the selector:
94+ Note: this follows the usual 'big endian' convention of denoting the most
95+ significant bit first:
96+ 0001 0011 0111 0111 0110 1110 1100 1010
97+ Imagine the input is the 32-bit or 32-boolean variable expression
98+ abcd efgh ijkl mnop qrst uvxy zABC DEFG
99+ We want the selection
100+ d gh jkl nop rs uvx zA D F
101+ To be compressed into the output
102+ 0000 0000 0000 00dg hjkl nopr suvx zADF
103+
104+ This algorithm will virtually calculate the count of positions that the selected
105+ bits travel to the right, by constructing the binary encoding of that count:
106+ It will identify the positions that will travel an odd number of positions to
107+ the right; these are the ones whose position-travel-count has its units bit set.
108+ It will then move those positions by one position to the right, and eliminate
109+ them from the yet-to-move positions. Because it eliminates the positions that
110+ would move an odd count, there remains only positions that move an even number
111+ of positions. Now it finds the positions that move an odd count of /pairs/ of
112+ positions, it moves them 2 positions. This is equivalent to finding the
113+ positions that would have the bit for 2 set in the count of positions to move
114+ right.
115+ Then an odd count of /quartets/ of positions, and moves them 4;
116+ 8, 16, 32, ...
117+
118+
145119Complete example (32 bits)
146120Selection mask:
1471210001 0011 0111 0111 0110 1110 1100 1010
@@ -169,51 +143,97 @@ Desired result:
169143
1701440001 0011 0111 0111 0110 1110 1100 1010 compressionMask
1711451110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask
172- 1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1 == groupsize of ~compressionMask
173- This indicates the positions that have a 0 immediately to the right in compressionMask
174- 4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the current position in forParallelSuffix, last decimal digit
175- 0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix
176- we have just identified the positions that need to move an odd number of positions
177- filter those positions to positions that have a bit set in the compressionMask:
146+ 1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk ==
147+ ~compressionMask shifted left by 1 (== groupSize)
148+ This indicates the positions that have a 0 immediately to the right in
149+ compressionMask
150+ 4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of the
151+ current position in forParallelSuffix,
152+ last decimal digit
153+ 0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
154+ forParallelSuffix
155+ We have just identified the positions that need to move an odd number of
156+ positions. Filter them with positions with a bit set in compressionMask:
1781570001 0011 0111 0111 0110 1110 1100 1010 compressionMask
179- ---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of the compressionMask by 1 == groupSize
180- 0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits that will move)
158+ ---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of
159+ compressionMask by 1 == groupSize
160+ 0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits
161+ that will move)
181162---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize
1821630001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask.
183- 0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of forParallelSuffix
164+ 0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
165+ forParallelSuffix
1841661011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not moved)
185- 1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero immediately to their right)
186- 1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved => had even zeroes to their right)
187- At this point, we have removed from compressionMask the positions that moved an odd number of positions and moved them 1 position,
167+ 1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero
168+ immediately to their right)
169+ 1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not moved =>
170+ had even zeroes to their right)
171+ At this point, we have removed from compressionMask the positions that moved an
172+ odd number of positions and moved them 1 position,
188173then, we only keep positions that move an even number of positions.
189- Now, we will repeat these steps but for groups of two zeroes
190-
191-
192- Binary compress: A fascinating algorithm.
193- Warren (Hacker's Delight) believes Guy L. Steele is the author of the following binary compression algorithm:
194- From a "mask", a selector of bits from an input, we want to put them together in the output.
195- For example's sake, this is the selector:
196- Note: this follows the usual 'big endian' convention of denoting the most significant bit first
197- 0001 0011 0111 0111 0110 1110 1100 1010
198- Imagine the input is the 32-bit or 32-boolean variable expression
199- abcd efgh ijkl mnop qrst uvxy zABC DEFG
200- We want the selection
201- d gh jkl nop rs uvx zA D F
202- To be compressed into the output
203- 0000 0000 0000 00dg hjkl nopr suvx zADF
204- This algorithm will virtually calculate the count of positions that the selected bits travel to the right,
205- by constructing the binary encoding of that count:
206- It will identify the positions that will travel an odd number of positions to the right, these are those
207- whose position-travel-count have the units set.
208- It will move those positions by one position to the right, and eliminate them from the yet-to-move positions.
209- Because it eliminates the positions that would move an odd count, there remains only positions that move
210- an even number of positions. Now it finds the positions that move an odd count of /pairs/ of positions,
211- and moves them 2 positions.
212- then an odd count of /quartets/ of positions, and moves them 4;
213- 8, 16, 32, ...
214-
174+ Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ...
215175*/
216176
177+ template <int NB, typename B>
178+ constexpr SWAR<NB, B>
179+ compress (SWAR<NB, B> input, SWAR<NB, B> compressionMask) { // binary compress: gathers the bits of input selected by compressionMask, as described in the block comment above
180+ // This solution uses the parallel suffix operation as a primary tool:
181+ // For every bit position it indicates an odd number of ones to the right,
182+ // including itself.
183+ // Because we want to detect the "oddness" of groups of zeroes to the right,
184+ // we flip the compression mask. To not count the bit position itself,
185+ // we shift by one.
186+ // #define ZTE ZOO_TRACEABLE_EXPRESSION -- NOTE(review): this definition is commented out, yet ZTE is still used below and #undef'd at the end; presumably ZTE is defined elsewhere -- confirm
187+ ZTE (input);
188+ ZTE (compressionMask);
189+ using S = SWAR<NB, B>;
190+ auto result = input & compressionMask; // keep only the selected bits of the input
191+ auto groupSize = 1 ; // shift distance for the current round; doubled at the end of every iteration
192+ auto
193+ shiftLeftMask = S{S::LowerBits},
194+ shiftRightMask = S{S::LowerBits << 1 };
195+ ZTE (~compressionMask);
196+ auto forParallelSuffix = // this is called "mk" in the book
197+ (~compressionMask).shiftIntraLaneLeft (groupSize, shiftLeftMask);
198+ ZTE (forParallelSuffix);
199+ // note: forParallelSuffix denotes positions with a zero
200+ // immediately to the right in 'compressionMask'
201+ do {
202+ ZTE (groupSize);
203+ ZTE (shiftLeftMask);
204+ ZTE (shiftRightMask);
205+ ZTE (result);
206+ auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book
207+ parallelSuffix (forParallelSuffix);
208+ ZTE (oddCountOfGroupsOfZerosToTheRight);
209+ // compress the bits just identified in both the result and the mask
210+ auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight; // selected positions that move by groupSize in this round
211+ ZTE (moving);
212+ compressionMask =
213+ (compressionMask ^ moving) | // clear the moving
214+ moving.shiftIntraLaneRight (groupSize, shiftRightMask);
215+ ZTE (compressionMask);
216+ auto movingFromInput = result & moving; // the data bits sitting at the moving positions
217+ result =
218+ (result ^ movingFromInput) | // clear the moving from the result
219+ movingFromInput.shiftIntraLaneRight (groupSize, shiftRightMask);
220+
221+ auto evenCountOfGroupsOfZerosToTheRight =
222+ ~oddCountOfGroupsOfZerosToTheRight;
223+ forParallelSuffix =
224+ forParallelSuffix & evenCountOfGroupsOfZerosToTheRight; // keep only the positions not flagged as odd in this round
225+ auto newShiftLeftMask = // temporary, so that both mask updates below read the previous values
226+ shiftLeftMask.shiftIntraLaneRight (groupSize, shiftRightMask);
227+ shiftRightMask =
228+ shiftRightMask.shiftIntraLaneLeft (groupSize, shiftLeftMask);
229+ shiftLeftMask = newShiftLeftMask;
230+ groupSize <<= 1 ;
231+ } while (groupSize < NB); // rounds run with groupSize 1, 2, 4, ... while it is below NB
232+ ZTE (result);
233+ #undef ZTE
234+ return result;
235+ }
236+
217237
218238// / \todo because of the desirability of "accumulating" the XORs at the MSB,
219239// / the parallel suffix operation is more suitable.
0 commit comments