|
| 1 | +#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H |
| 2 | +#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H |
| 3 | + |
| 4 | +#include "zoo/swar/SWAR.h" |
| 5 | + |
| 6 | +namespace zoo::swar { |
| 7 | + |
| 8 | +/// \note This code should be substituted by an application of "progressive" algebraic iteration |
| 9 | +/// \note There is also parallelPrefix (to be implemented) |
| 10 | +template<int NB, typename B> |
| 11 | +constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) { |
| 12 | + using S = SWAR<NB, B>; |
| 13 | + auto |
| 14 | + shiftClearingMask = S{~S::MostSignificantBit}, |
| 15 | + doubling = input, |
| 16 | + result = S{0}; |
| 17 | + auto |
| 18 | + bitsToXOR = NB, |
| 19 | + power = 1; |
| 20 | + for(;;) { |
| 21 | + if(1 & bitsToXOR) { |
| 22 | + result = result ^ doubling; |
| 23 | + doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask); |
| 24 | + } |
| 25 | + bitsToXOR >>= 1; |
| 26 | + if(!bitsToXOR) { break; } |
| 27 | + auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask); |
| 28 | + doubling = doubling ^ shifted; |
| 29 | + // 01...1 |
| 30 | + // 001...1 |
| 31 | + // 00001...1 |
| 32 | + // 000000001...1 |
| 33 | + shiftClearingMask = |
| 34 | + shiftClearingMask & S{shiftClearingMask.value() >> power}; |
| 35 | + power <<= 1; |
| 36 | + } |
| 37 | + return S{result}; |
| 38 | +} |
| 39 | + |
| 40 | +/// \todo because of the desirability of "accumuating" the XORs at the MSB, |
| 41 | +/// the parallel suffix operation is more suitable. |
| 42 | +template<int NB, typename B> |
| 43 | +constexpr SWAR<NB, B> parity(SWAR<NB, B> input) { |
| 44 | + using S = SWAR<NB, B>; |
| 45 | + auto preResult = parallelSuffix(input); |
| 46 | + auto onlyMSB = preResult.value() & S::MostSignificantBit; |
| 47 | + return S{onlyMSB}; |
| 48 | +} |
| 49 | + |
| 50 | + |
| 51 | +namespace impl { |
| 52 | +template<int NB, typename B> |
| 53 | +constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR<NB, B> msb, SWAR<NB, B> lsb) { |
| 54 | + auto msbCopiedDown = msb - lsb; |
| 55 | + auto msbReintroduced = msbCopiedDown | msb; |
| 56 | + return msbReintroduced; |
| 57 | +} |
| 58 | +} |
| 59 | + |
| 60 | +template<int NB, typename B> |
| 61 | +constexpr auto makeLaneMaskFromLSB(SWAR<NB, B> input) { |
| 62 | + using S = SWAR<NB, B>; |
| 63 | + auto lsb = input & S{S::LeastSignificantBit}; |
| 64 | + auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)}; |
| 65 | + return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb); |
| 66 | +} |
| 67 | + |
| 68 | +template<int NB, typename B> |
| 69 | +constexpr auto makeLaneMaskFromMSB(SWAR<NB, B> input) { |
| 70 | + using S = SWAR<NB, B>; |
| 71 | + auto msb = input & S{S::MostSignificantBit}; |
| 72 | + auto msbCopiedToLSB = S{msb.value() >> (NB - 1)}; |
| 73 | + return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB); |
| 74 | +} |
| 75 | + |
| 76 | +template<int NB, typename B> |
| 77 | +struct ArithmeticResultTriplet { |
| 78 | + SWAR<NB, B> result; |
| 79 | + BooleanSWAR<NB, B> carry, overflow; |
| 80 | +}; |
| 81 | + |
| 82 | + |
| 83 | +template<int NB, typename B> |
| 84 | +constexpr ArithmeticResultTriplet<NB, B> |
| 85 | +fullAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) { |
| 86 | + using S = SWAR<NB, B>; |
| 87 | + constexpr auto |
| 88 | + SignBit = S{S::MostSignificantBit}, |
| 89 | + LowerBits = SignBit - S{S::LeastSignificantBit}; |
| 90 | + // prevent overflow by clearing the most significant bits |
| 91 | + auto |
| 92 | + s1prime = LowerBits & s1, |
| 93 | + s2prime = LowerBits & s2, |
| 94 | + resultPrime = s1prime + s2prime, |
| 95 | + s1Sign = SignBit & s1, |
| 96 | + s2Sign = SignBit & s2, |
| 97 | + signPrime = SignBit & resultPrime, |
| 98 | + result = resultPrime ^ s1Sign ^ s2Sign, |
| 99 | + // carry is set whenever at least two of the sign bits of s1, s2, |
| 100 | + // signPrime are set |
| 101 | + carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime), |
| 102 | + // overflow: the inputs have the same sign and different to result |
| 103 | + // same sign: s1Sign ^ s2Sign |
| 104 | + overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result); |
| 105 | + using BS = BooleanSWAR<NB, B>; |
| 106 | + return { result, BS{carry.value()}, BS{overflow.value()} }; |
| 107 | +}; |
| 108 | + |
| 109 | + |
| 110 | +template<int NB, typename B> |
| 111 | +constexpr auto negate(SWAR<NB, B> input) { |
| 112 | + using S = SWAR<NB, B>; |
| 113 | + constexpr auto Ones = S{S::LeastSignificantBit}; |
| 114 | + return fullAddition(~input, Ones).result; |
| 115 | +} |
| 116 | + |
| 117 | +/// \brief Performs a generalized iterated application of an associative operator to a base |
| 118 | +/// |
| 119 | +/// In algebra, the repeated application of an operator to a "base" has different names depending on the |
| 120 | +/// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", |
| 121 | +/// if * is numeric multiplication, "a * a * a * ... * a" n-times would be called "exponentiation of a to the n |
| 122 | +/// power". |
| 123 | +/// The general term in algebra is "iteration", hence the naming of this function. |
| 124 | +/// Since * and "product" are frequently used in Algebra to denote the application of a general operator, we |
| 125 | +/// keep the option to use the imprecise language of "product, base and exponent". "Iteration" has a very |
| 126 | +/// different meaning in programming and especially different in C++. |
| 127 | +/// There may be iteration over an operator that is not associative (such as quaternion multiplication), this |
| 128 | +/// function leverages the associative property of the operator to "halve" the count of iterations at each step. |
| 129 | +/// \note There is a symmetrical operation to be implemented of associative iteration in the |
| 130 | +/// "progressive" direction: instead of starting with the most significant bit of the count, down to the lsb, |
| 131 | +/// and doing "op(result, base, count)"; going from lsb to msb doing "op(result, square, exponent)" |
| 132 | +/// \tparam Operator a callable with three arguments: the left and right arguments to the operation |
| 133 | +/// and the count to be used, the "count" is an artifact of this generalization |
| 134 | +/// \tparam IterationCount loosely models the "exponent" in "exponentiation", however, it may not |
| 135 | +/// be a number, the iteration count is part of the execution context to apply the operator |
| 136 | +/// \param forSquaring is an artifact of this generalization |
| 137 | +/// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows |
| 138 | +/// there are fewer iterations than what the type of exponent would allow |
| 139 | +template< |
| 140 | + typename Base, typename IterationCount, typename Operator, |
| 141 | + // the critical use of associativity is that it allows halving the |
| 142 | + // iteration count |
| 143 | + typename CountHalver |
| 144 | +> |
| 145 | +constexpr auto associativeOperatorIterated_regressive( |
| 146 | + Base base, Base neutral, IterationCount count, IterationCount forSquaring, |
| 147 | + Operator op, unsigned log2Count, CountHalver ch |
| 148 | +) { |
| 149 | + auto result = neutral; |
| 150 | + if(!log2Count) { return result; } |
| 151 | + for(;;) { |
| 152 | + result = op(result, base, count); |
| 153 | + if(!--log2Count) { break; } |
| 154 | + result = op(result, result, forSquaring); |
| 155 | + count = ch(count); |
| 156 | + } |
| 157 | + return result; |
| 158 | +} |
| 159 | + |
| 160 | +template<int ActualBits, int NB, typename T> |
| 161 | +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( |
| 162 | + SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier |
| 163 | +) { |
| 164 | + using S = SWAR<NB, T>; |
| 165 | + |
| 166 | + auto operation = [](auto left, auto right, auto counts) { |
| 167 | + auto addendums = makeLaneMaskFromMSB(counts); |
| 168 | + return left + (addendums & right); |
| 169 | + }; |
| 170 | + |
| 171 | + auto halver = [](auto counts) { |
| 172 | + auto msbCleared = counts & ~S{S::MostSignificantBit}; |
| 173 | + return S{msbCleared.value() << 1}; |
| 174 | + }; |
| 175 | + |
| 176 | + multiplier = S{multiplier.value() << (NB - ActualBits)}; |
| 177 | + return associativeOperatorIterated_regressive( |
| 178 | + multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation, |
| 179 | + ActualBits, halver |
| 180 | + ); |
| 181 | +} |
| 182 | + |
| 183 | +/// \note Not removed yet because it is an example of "progressive" associative exponentiation |
| 184 | +template<int ActualBits, int NB, typename T> |
| 185 | +constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated( |
| 186 | + SWAR<NB, T> multiplicand, |
| 187 | + SWAR<NB, T> multiplier |
| 188 | +) { |
| 189 | + using S = SWAR<NB, T>; |
| 190 | + constexpr auto LeastBit = S::LeastSignificantBit; |
| 191 | + auto multiplicandDoubling = multiplicand.value(); |
| 192 | + auto mplier = multiplier.value(); |
| 193 | + auto product = S{0}; |
| 194 | + for(auto count = ActualBits;;) { |
| 195 | + auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier}); |
| 196 | + product = product + (multiplicandDoublingMask & S{multiplicandDoubling}); |
| 197 | + if(!--count) { break; } |
| 198 | + multiplicandDoubling <<= 1; |
| 199 | + auto leastBitCleared = mplier & ~LeastBit; |
| 200 | + mplier = leastBitCleared >> 1; |
| 201 | + } |
| 202 | + return product; |
| 203 | +} |
| 204 | + |
| 205 | +template<int NB, typename T> |
| 206 | +constexpr auto multiplication_OverflowUnsafe( |
| 207 | + SWAR<NB, T> multiplicand, |
| 208 | + SWAR<NB, T> multiplier |
| 209 | +) { |
| 210 | + return |
| 211 | + multiplication_OverflowUnsafe_SpecificBitCount<NB>( |
| 212 | + multiplicand, multiplier |
| 213 | + ); |
| 214 | +} |
| 215 | + |
| 216 | +template<int NB, typename T> |
| 217 | +struct SWAR_Pair{ |
| 218 | + SWAR<NB, T> even, odd; |
| 219 | +}; |
| 220 | + |
| 221 | +template<int NB, typename T> |
| 222 | +constexpr SWAR<NB, T> doublingMask() { |
| 223 | + using S = SWAR<NB, T>; |
| 224 | + static_assert(0 == S::Lanes % 2, "Only even number of elements supported"); |
| 225 | + using D = SWAR<NB * 2, T>; |
| 226 | + return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit}; |
| 227 | +} |
| 228 | + |
| 229 | +template<int NB, typename T> |
| 230 | +constexpr auto doublePrecision(SWAR<NB, T> input) { |
| 231 | + using S = SWAR<NB, T>; |
| 232 | + static_assert( |
| 233 | + 0 == S::NSlots % 2, |
| 234 | + "Precision can only be doubled for SWARs of even element count" |
| 235 | + ); |
| 236 | + using RV = SWAR<NB * 2, T>; |
| 237 | + constexpr auto DM = doublingMask<NB, T>(); |
| 238 | + return SWAR_Pair<NB * 2, T>{ |
| 239 | + RV{(input & DM).value()}, |
| 240 | + RV{(input.value() >> NB) & DM.value()} |
| 241 | + }; |
| 242 | +} |
| 243 | + |
| 244 | +template<int NB, typename T> |
| 245 | +constexpr auto halvePrecision(SWAR<NB, T> even, SWAR<NB, T> odd) { |
| 246 | + using S = SWAR<NB, T>; |
| 247 | + static_assert(0 == NB % 2, "Only even lane-bitcounts supported"); |
| 248 | + using RV = SWAR<NB/2, T>; |
| 249 | + constexpr auto HalvingMask = doublingMask<NB/2, T>(); |
| 250 | + auto |
| 251 | + evenHalf = RV{even.value()} & HalvingMask, |
| 252 | + oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2}; |
| 253 | + return evenHalf | oddHalf; |
| 254 | +} |
| 255 | + |
| 256 | +} |
| 257 | + |
| 258 | +#endif |
0 commit comments