Skip to content

Commit 1fe091b

Browse files
authored
Refactoring to new file, introduction of negation (#63)
Co-authored-by: Eddie <eddie see email elsewhere>
1 parent 577ba35 commit 1fe091b

File tree

2 files changed

+262
-293
lines changed

2 files changed

+262
-293
lines changed
Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H
2+
#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H
3+
4+
#include "zoo/swar/SWAR.h"
5+
6+
namespace zoo::swar {
7+
8+
/// \note This code should be substituted by an application of "progressive" algebraic iteration
9+
/// \note There is also parallelPrefix (to be implemented)
10+
template<int NB, typename B>
11+
constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
12+
using S = SWAR<NB, B>;
13+
auto
14+
shiftClearingMask = S{~S::MostSignificantBit},
15+
doubling = input,
16+
result = S{0};
17+
auto
18+
bitsToXOR = NB,
19+
power = 1;
20+
for(;;) {
21+
if(1 & bitsToXOR) {
22+
result = result ^ doubling;
23+
doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
24+
}
25+
bitsToXOR >>= 1;
26+
if(!bitsToXOR) { break; }
27+
auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
28+
doubling = doubling ^ shifted;
29+
// 01...1
30+
// 001...1
31+
// 00001...1
32+
// 000000001...1
33+
shiftClearingMask =
34+
shiftClearingMask & S{shiftClearingMask.value() >> power};
35+
power <<= 1;
36+
}
37+
return S{result};
38+
}
39+
40+
/// \todo because of the desirability of "accumuating" the XORs at the MSB,
41+
/// the parallel suffix operation is more suitable.
42+
template<int NB, typename B>
43+
constexpr SWAR<NB, B> parity(SWAR<NB, B> input) {
44+
using S = SWAR<NB, B>;
45+
auto preResult = parallelSuffix(input);
46+
auto onlyMSB = preResult.value() & S::MostSignificantBit;
47+
return S{onlyMSB};
48+
}
49+
50+
51+
namespace impl {
52+
template<int NB, typename B>
53+
constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR<NB, B> msb, SWAR<NB, B> lsb) {
54+
auto msbCopiedDown = msb - lsb;
55+
auto msbReintroduced = msbCopiedDown | msb;
56+
return msbReintroduced;
57+
}
58+
}
59+
60+
template<int NB, typename B>
61+
constexpr auto makeLaneMaskFromLSB(SWAR<NB, B> input) {
62+
using S = SWAR<NB, B>;
63+
auto lsb = input & S{S::LeastSignificantBit};
64+
auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)};
65+
return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb);
66+
}
67+
68+
template<int NB, typename B>
69+
constexpr auto makeLaneMaskFromMSB(SWAR<NB, B> input) {
70+
using S = SWAR<NB, B>;
71+
auto msb = input & S{S::MostSignificantBit};
72+
auto msbCopiedToLSB = S{msb.value() >> (NB - 1)};
73+
return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB);
74+
}
75+
76+
template<int NB, typename B>
77+
struct ArithmeticResultTriplet {
78+
SWAR<NB, B> result;
79+
BooleanSWAR<NB, B> carry, overflow;
80+
};
81+
82+
83+
template<int NB, typename B>
84+
constexpr ArithmeticResultTriplet<NB, B>
85+
fullAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
86+
using S = SWAR<NB, B>;
87+
constexpr auto
88+
SignBit = S{S::MostSignificantBit},
89+
LowerBits = SignBit - S{S::LeastSignificantBit};
90+
// prevent overflow by clearing the most significant bits
91+
auto
92+
s1prime = LowerBits & s1,
93+
s2prime = LowerBits & s2,
94+
resultPrime = s1prime + s2prime,
95+
s1Sign = SignBit & s1,
96+
s2Sign = SignBit & s2,
97+
signPrime = SignBit & resultPrime,
98+
result = resultPrime ^ s1Sign ^ s2Sign,
99+
// carry is set whenever at least two of the sign bits of s1, s2,
100+
// signPrime are set
101+
carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime),
102+
// overflow: the inputs have the same sign and different to result
103+
// same sign: s1Sign ^ s2Sign
104+
overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result);
105+
using BS = BooleanSWAR<NB, B>;
106+
return { result, BS{carry.value()}, BS{overflow.value()} };
107+
};
108+
109+
110+
template<int NB, typename B>
111+
constexpr auto negate(SWAR<NB, B> input) {
112+
using S = SWAR<NB, B>;
113+
constexpr auto Ones = S{S::LeastSignificantBit};
114+
return fullAddition(~input, Ones).result;
115+
}
116+
117+
/// \brief Performs a generalized iterated application of an associative operator to a base
118+
///
119+
/// In algebra, the repeated application of an operator to a "base" has different names depending on the
120+
/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition",
121+
/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n
122+
/// power".
123+
/// The general term in algebra is "iteration", hence the naming of this function.
124+
/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we
125+
/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very
126+
/// different meaning in programming and especially different in C++.
127+
/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this
128+
/// function leverages the associative property of the operator to "halve" the count of iterations at each step.
129+
/// \note There is a symmetrical operation to be implemented of associative iteration in the
130+
/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb,
131+
/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)"
132+
/// \tparam Operator a callable with three arguments: the left and right arguments to the operation
133+
/// and the count to be used, the "count" is an artifact of this generalization
134+
/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not
135+
/// be a number, the iteration count is part of the execution context to apply the operator
136+
/// \param forSquaring is an artifact of this generalization
137+
/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows
138+
/// there are fewer iterations than what the type of exponent would allow
139+
template<
140+
typename Base, typename IterationCount, typename Operator,
141+
// the critical use of associativity is that it allows halving the
142+
// iteration count
143+
typename CountHalver
144+
>
145+
constexpr auto associativeOperatorIterated_regressive(
146+
Base base, Base neutral, IterationCount count, IterationCount forSquaring,
147+
Operator op, unsigned log2Count, CountHalver ch
148+
) {
149+
auto result = neutral;
150+
if(!log2Count) { return result; }
151+
for(;;) {
152+
result = op(result, base, count);
153+
if(!--log2Count) { break; }
154+
result = op(result, result, forSquaring);
155+
count = ch(count);
156+
}
157+
return result;
158+
}
159+
160+
template<int ActualBits, int NB, typename T>
161+
constexpr auto multiplication_OverflowUnsafe_SpecificBitCount(
162+
SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier
163+
) {
164+
using S = SWAR<NB, T>;
165+
166+
auto operation = [](auto left, auto right, auto counts) {
167+
auto addendums = makeLaneMaskFromMSB(counts);
168+
return left + (addendums & right);
169+
};
170+
171+
auto halver = [](auto counts) {
172+
auto msbCleared = counts & ~S{S::MostSignificantBit};
173+
return S{msbCleared.value() << 1};
174+
};
175+
176+
multiplier = S{multiplier.value() << (NB - ActualBits)};
177+
return associativeOperatorIterated_regressive(
178+
multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation,
179+
ActualBits, halver
180+
);
181+
}
182+
183+
/// \note Not removed yet because it is an example of "progressive" associative exponentiation
184+
template<int ActualBits, int NB, typename T>
185+
constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated(
186+
SWAR<NB, T> multiplicand,
187+
SWAR<NB, T> multiplier
188+
) {
189+
using S = SWAR<NB, T>;
190+
constexpr auto LeastBit = S::LeastSignificantBit;
191+
auto multiplicandDoubling = multiplicand.value();
192+
auto mplier = multiplier.value();
193+
auto product = S{0};
194+
for(auto count = ActualBits;;) {
195+
auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier});
196+
product = product + (multiplicandDoublingMask & S{multiplicandDoubling});
197+
if(!--count) { break; }
198+
multiplicandDoubling <<= 1;
199+
auto leastBitCleared = mplier & ~LeastBit;
200+
mplier = leastBitCleared >> 1;
201+
}
202+
return product;
203+
}
204+
205+
template<int NB, typename T>
206+
constexpr auto multiplication_OverflowUnsafe(
207+
SWAR<NB, T> multiplicand,
208+
SWAR<NB, T> multiplier
209+
) {
210+
return
211+
multiplication_OverflowUnsafe_SpecificBitCount<NB>(
212+
multiplicand, multiplier
213+
);
214+
}
215+
216+
template<int NB, typename T>
217+
struct SWAR_Pair{
218+
SWAR<NB, T> even, odd;
219+
};
220+
221+
template<int NB, typename T>
222+
constexpr SWAR<NB, T> doublingMask() {
223+
using S = SWAR<NB, T>;
224+
static_assert(0 == S::Lanes % 2, "Only even number of elements supported");
225+
using D = SWAR<NB * 2, T>;
226+
return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit};
227+
}
228+
229+
template<int NB, typename T>
230+
constexpr auto doublePrecision(SWAR<NB, T> input) {
231+
using S = SWAR<NB, T>;
232+
static_assert(
233+
0 == S::NSlots % 2,
234+
"Precision can only be doubled for SWARs of even element count"
235+
);
236+
using RV = SWAR<NB * 2, T>;
237+
constexpr auto DM = doublingMask<NB, T>();
238+
return SWAR_Pair<NB * 2, T>{
239+
RV{(input & DM).value()},
240+
RV{(input.value() >> NB) & DM.value()}
241+
};
242+
}
243+
244+
template<int NB, typename T>
245+
constexpr auto halvePrecision(SWAR<NB, T> even, SWAR<NB, T> odd) {
246+
using S = SWAR<NB, T>;
247+
static_assert(0 == NB % 2, "Only even lane-bitcounts supported");
248+
using RV = SWAR<NB/2, T>;
249+
constexpr auto HalvingMask = doublingMask<NB/2, T>();
250+
auto
251+
evenHalf = RV{even.value()} & HalvingMask,
252+
oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2};
253+
return evenHalf | oddHalf;
254+
}
255+
256+
}
257+
258+
#endif

0 commit comments

Comments
 (0)