33
44#include " zoo/swar/SWAR.h"
55
6+ // #define ZOO_DEVELOPMENT_DEBUGGING
7+ #ifdef ZOO_DEVELOPMENT_DEBUGGING
8+ #include < iostream>
9+
/// Debugging aid: writes the `count` lowest bits of `input` to `out` as the
/// characters '0'/'1', starting with the LEAST significant bit (so the text
/// reads in the opposite order of the usual most-significant-first notation).
/// \param out    stream that receives the digits
/// \param input  value whose bits are printed; consumed by right-shifting
/// \param count  how many bits to print; zero or negative prints nothing
/// \return       `out`, to allow chaining
inline std::ostream &binary(std::ostream &out, uint64_t input, int count) {
    // `0 < count` rather than `count--`: a negative count must not loop until
    // the signed counter overflows.
    for (; 0 < count; --count) {
        out << (1 & input);
        input >>= 1;
    }
    return out;
}
17+
18+ template <int NB, typename B>
19+ std::ostream &operator <<(std::ostream &out, zoo::swar::SWAR<NB, B> s) {
20+ using S = zoo::swar::SWAR<NB, B>;
21+ auto shiftCounter = sizeof (B) * 8 / NB;
22+ out << " <|" ;
23+ auto v = s.value ();
24+ do {
25+ binary (out, v, NB) << ' |' ;
26+
27+ } while (--shiftCounter);
28+ return out << " >" ;
29+ }
30+
// Stringizes its complete argument list verbatim. Variadic so that traced
// expressions containing top-level commas (template argument lists, braced
// initializers) are still accepted.
#define ZOO_TO_STRING(...) #__VA_ARGS__
// Evaluates the expression once and prints one line to std::cout containing
// its value, the F:L location and the expression's source text.
// std::endl is needed within the context of debugging: flush the line
// No trailing ';' here: the call site supplies it, exactly as it must for the
// non-debugging definition of ZOO_TRACEABLE_EXPRESSION.
#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) \
    std::cout << '"' << (__VA_ARGS__) << " \" , \" " << F << ':' << L << \
        " \" , \" " << ZOO_TO_STRING(__VA_ARGS__) << " \" " << std::endl
// Debugging flavor: trace the expression together with the file and line of
// the call site.
#define ZOO_TRACEABLE_EXPRESSION(...) \
    ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__)
35+
36+ #else
37+
// Tracing disabled: the macro expands to exactly the traced expression, so
// its side effects still happen but nothing is printed.
#define ZOO_TRACEABLE_EXPRESSION(...) __VA_ARGS__
39+
40+ #endif
41+
642namespace zoo ::swar {
743
844// / \note This code should be substituted by an application of "progressive" algebraic iteration
@@ -11,32 +47,196 @@ template<int NB, typename B>
1147constexpr SWAR<NB, B> parallelSuffix (SWAR<NB, B> input) {
1248 using S = SWAR<NB, B>;
1349 auto
14- shiftClearingMask = S{~S::MostSignificantBit},
50+ shiftClearingMask = S{static_cast <B>( ~S::MostSignificantBit) },
1551 doubling = input,
1652 result = S{0 };
1753 auto
1854 bitsToXOR = NB,
1955 power = 1 ;
56+
57+ #define ZTE (...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
2058 for (;;) {
59+ ZTE (doubling);
2160 if (1 & bitsToXOR) {
22- result = result ^ doubling;
23- doubling = doubling.shiftIntraLaneLeft (power, shiftClearingMask);
61+ ZTE ( result = result ^ doubling) ;
62+ ZTE ( doubling = doubling.shiftIntraLaneLeft (power, shiftClearingMask) );
2463 }
25- bitsToXOR >>= 1 ;
64+ ZTE ( bitsToXOR >>= 1 ) ;
2665 if (!bitsToXOR) { break ; }
2766 auto shifted = doubling.shiftIntraLaneLeft (power, shiftClearingMask);
28- doubling = doubling ^ shifted;
67+ ZTE (shifted);
68+ ZTE (doubling = doubling ^ shifted);
2969 // 01...1
3070 // 001...1
3171 // 00001...1
3272 // 000000001...1
3373 shiftClearingMask =
34- shiftClearingMask & S{shiftClearingMask.value () >> power};
35- power <<= 1 ;
74+ shiftClearingMask &
75+ S{static_cast <B>(shiftClearingMask.value () >> power)};
76+ ZTE (power <<= 1 );
3677 }
78+ ZTE (input);
79+ #undef ZTE
3780 return S{result};
3881}
3982
83+ /*
84+ Binary compress: A fascinating algorithm.
85+
Warren (Hacker's Delight) believes Guy L. Steele is the author of the following
binary compression operation, equivalent to Intel's BMI2 instruction PEXT, or
"Parallel Bits Extract".
89+
90+ From a "mask", a selector of bits from an input, we want to put them together in
91+ the output.
92+
93+ For example's sake, this is the selector:
94+ Note: this follows the usual 'big endian' convention of denoting the most
95+ significant bit first:
96+ 0001 0011 0111 0111 0110 1110 1100 1010
97+ Imagine the input is the 32-bit or 32-boolean variable expression
98+ abcd efgh ijkl mnop qrst uvxy zABC DEFG
We want the selection (unselected positions shown as '-')
---d --gh -jkl -nop -rs- uvx- zA-- D-F-
101+ To be compressed into the output
102+ 0000 0000 0000 00dg hjkl nopr suvx zADF
103+
104+ This algorithm will virtually calculate the count of positions that the selected
105+ bits travel to the right, by constructing the binary encoding of that count:
106+ It will identify the positions that will travel an odd number of positions to
107+ the right, these are those whose position-travel-count have the units set.
108+ It will then move those positions by one position to the right, and eliminate
109+ them from the yet-to-move positions. Because it eliminates the positions that
110+ would move an odd count, there remains only positions that move an even number
111+ of positions. Now it finds the positions that move an odd count of /pairs/ of
112+ positions, it moves them 2 positions. This is equivalent to finding the
113+ positions that would have the bit for 2 set in the count of positions to move
114+ right.
115+ Then an odd count of /quartets/ of positions, and moves them 4;
116+ 8, 16, 32, ...
117+
118+
119+ Complete example (32 bits)
120+ Selection mask:
121+ 0001 0011 0111 0111 0110 1110 1100 1010
122+ Input (each letter or variable is a boolean, that can have 0 or 1)
123+ abcd efgh ijkl mnop qrst uvxy zABC DEFG
Selection (unselected positions shown as '-')
---d --gh -jkl -nop -rs- uvx- zA-- D-F-
126+ Desired result:
127+ dghjklnoprsuvxzADF
128+
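(Abandoned first attempt, kept for reference. Note that the first row below is
the selection mask shifted one position to the RIGHT, despite its label, and
the second row is its complement. The complete walk-through follows.)
0000 1001 1011 1011 1011 0111 0110 0101 shiftLeft 1
1111 0110 0100 0100 0100 1000 1001 1010 forParallelSuffix

10 1101 1101
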
134+ Complete example (32 bits)
135+ Selection mask:
136+ 0001 0011 0111 0111 0110 1110 1100 1010
137+ Input (each letter or variable is a boolean, that can have 0 or 1)
138+ abcd efgh ijkl mnop qrst uvxy zABC DEFG
Selection (unselected positions shown as '-')
---d --gh -jkl -nop -rs- uvx- zA-- D-F-
141+ Desired result:
142+ dghjklnoprsuvxzADF
143+
144+ 0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
145+ 1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask
146+ 1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk == shiftleft 1
147+ == groupsize of ~compressionMask
148+ This indicates the positions that have a 0 immediately to the right in
149+ compressionMask
3211 0999 8888 7777 6665 5554 4432 2110 number of 1s at and to the right of the
                                        current position in forParallelSuffix,
                                        last decimal digit
1011 0111 0000 1111 0001 1110 0010 0110 mp == parallel suffix of
                                        forParallelSuffix
                                        (1 where the count above is odd)
We have just identified the positions that need to move an odd number of
positions. Filter them with positions with a bit set in compressionMask:
0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
---1 --11 ---- -111 ---- 111- ---- --1- mv == move (compress) these bits of
                                        compressionMask by 1 == groupSize
0000 0000 0111 0000 0110 0000 1100 1000 mv ^ compressionMask (clear the bits
                                        that will move)
---- 1--1 1--- --11 1--- -111 ---- ---1 mv >> 1 == groupSize
0000 1001 1111 0011 1110 0111 1100 1001 pseudo-compressed compressionMask.
                                        Every remaining run of zeroes to the
                                        right of a set bit now has even length.
1011 0111 0000 1111 0001 1110 0010 0110 mp == parallel suffix of
                                        forParallelSuffix
0100 1000 1111 0000 1110 0001 1101 1001 ~mp == ~parallel suffix (bits not moved)
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a zero
                                        immediately to their right)
0100 1000 0001 0000 0010 0000 0100 1000 new forParallelSuffix (also not moved =>
                                        had even zeroes to their right)
171+ At this point, we have removed from compressionMask the positions that moved an
172+ odd number of positions and moved them 1 position,
173+ then, we only keep positions that move an even number of positions.
174+ Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ...
175+ */
176+
177+ template <int NB, typename B>
178+ constexpr SWAR<NB, B>
179+ compress (SWAR<NB, B> input, SWAR<NB, B> compressionMask) {
180+ // This solution uses the parallel suffix operation as a primary tool:
181+ // For every bit postion it indicates an odd number of ones to the right,
182+ // including itself.
183+ // Because we want to detect the "oddness" of groups of zeroes to the right,
184+ // we flip the compression mask. To not count the bit position itself,
185+ // we shift by one.
186+ #define ZTE (...) ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
187+ ZTE (input);
188+ ZTE (compressionMask);
189+ using S = SWAR<NB, B>;
190+ auto result = input & compressionMask;
191+ auto groupSize = 1 ;
192+ auto
193+ shiftLeftMask = S{S::LowerBits},
194+ shiftRightMask = S{S::LowerBits << 1 };
195+ ZTE (~compressionMask);
196+ auto forParallelSuffix = // this is called "mk" in the book
197+ (~compressionMask).shiftIntraLaneLeft (groupSize, shiftLeftMask);
198+ ZTE (forParallelSuffix);
199+ // note: forParallelSuffix denotes positions with a zero
200+ // immediately to the right in 'compressionMask'
201+ for (;;) {
202+ ZTE (groupSize);
203+ ZTE (shiftLeftMask);
204+ ZTE (shiftRightMask);
205+ ZTE (result);
206+ auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book
207+ parallelSuffix (forParallelSuffix);
208+ ZTE (oddCountOfGroupsOfZerosToTheRight);
209+ // compress the bits just identified in both the result and the mask
210+ auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight;
211+ ZTE (moving);
212+ compressionMask =
213+ (compressionMask ^ moving) | // clear the moving
214+ moving.shiftIntraLaneRight (groupSize, shiftRightMask);
215+ ZTE (compressionMask);
216+ auto movingFromInput = result & moving;
217+ result =
218+ (result ^ movingFromInput) | // clear the moving from the result
219+ movingFromInput.shiftIntraLaneRight (groupSize, shiftRightMask);
220+ auto nextGroupSize = groupSize << 1 ;
221+ if (NB <= nextGroupSize) {
222+ break ;
223+ }
224+ auto evenCountOfGroupsOfZerosToTheRight =
225+ ~oddCountOfGroupsOfZerosToTheRight;
226+ forParallelSuffix =
227+ forParallelSuffix & evenCountOfGroupsOfZerosToTheRight;
228+ auto newShiftLeftMask =
229+ shiftLeftMask.shiftIntraLaneRight (groupSize, shiftRightMask);
230+ shiftRightMask =
231+ shiftRightMask.shiftIntraLaneLeft (groupSize, shiftLeftMask);
232+ shiftLeftMask = newShiftLeftMask;
233+ groupSize = nextGroupSize;
234+ }
235+ ZTE (result);
236+ #undef ZTE
237+ return result;
238+ }
239+
/// \todo because of the desirability of "accumulating" the XORs at the MSB,
/// the parallel suffix operation is more suitable.
42242template <int NB, typename B>
0 commit comments