Skip to content

Commit d7c0180

Browse files
Saturating unsigned addition and BooleanSWAR.asMask() (#71)
* add saturating unsigned add, saturated add tests, boolean asMask()
1 parent 169f4f3 commit d7c0180

File tree

3 files changed

+119
-14
lines changed

3 files changed

+119
-14
lines changed

inc/zoo/swar/SWAR.h

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ struct SWAR {
6060
NSlots = Lanes,
6161
PaddingBitsCount = BitWidth % NBits,
6262
SignificantBitsCount = BitWidth - PaddingBitsCount,
63-
AllOnes = ~std::make_unsigned_t<T>{0} >> PaddingBitsCount,
63+
AllOnes = ~std::make_unsigned_t<T>{0} >> PaddingBitsCount, // Also constructed in RobinHood utils: possible bug?
6464
LeastSignificantBit = meta::BitmaskMaker<T, std::make_unsigned_t<T>{1}, NBits>::value,
6565
MostSignificantBit = LeastSignificantBit << (NBits - 1),
6666
// Use LowerBits in favor of ~MostSignificantBit to not pollute
@@ -229,8 +229,13 @@ constexpr auto broadcast(SWAR<NBits, T> v) {
229229
template<int NBits, typename T>
230230
struct BooleanSWAR: SWAR<NBits, T> {
231231
// Booleanness is stored in the MSBs
232-
static constexpr auto MaskLaneMSB =
232+
static constexpr auto MaskMSB =
233233
broadcast<NBits, T>(SWAR<NBits, T>(T(1) << (NBits -1)));
234+
static constexpr auto MaskLSB =
235+
broadcast<NBits, T>(SWAR<NBits, T>(T(1)));
236+
// Complement masks: ANDing with them turns off the LSB (respectively the MSB) of each lane
237+
static constexpr auto MaskNonLSB = ~MaskLSB;
238+
static constexpr auto MaskNonMSB = ~MaskMSB;
234239
constexpr explicit BooleanSWAR(T v): SWAR<NBits, T>(v) {}
235240

236241
constexpr BooleanSWAR clear(int bit) const noexcept {
@@ -245,9 +250,15 @@ struct BooleanSWAR: SWAR<NBits, T> {
245250
/// A logical NOT in this circumstance _only_ flips the MSB of each lane. This operation is
246251
/// not a ones' complement or two's complement negation.
247252
constexpr auto operator not() const noexcept {
248-
return BooleanSWAR(MaskLaneMSB ^ *this);
253+
return BooleanSWAR(MaskMSB ^ *this);
249254
}
250255

256+
// BooleanSWAR as a mask: BooleanSWAR<4, u16>(0x0800).MSBtoLaneMask() => SWAR<4,u16>(0x0F00)
257+
constexpr auto MSBtoLaneMask() const noexcept {
258+
const auto MSBMinusOne = this->m_v - (this->m_v >> (NBits-1)); // Convert pattern 10* to 01*
259+
return SWAR<NBits,T>(MSBMinusOne | this->m_v); // Blit 01* and 10* together for 1* when MSB was on.
260+
}
261+
251262
explicit
252263
constexpr operator bool() const noexcept { return this->m_v; }
253264
private:
@@ -339,7 +350,7 @@ constantIsGreaterEqual_MSB_off(SWAR<NBits, T> subtrahend) noexcept {
339350
template<int NBits, typename T>
340351
constexpr BooleanSWAR<NBits, T>
341352
greaterEqual_MSB_off(SWAR<NBits, T> left, SWAR<NBits, T> right) noexcept {
342-
constexpr auto MLMSB = BooleanSWAR<NBits, T>::MaskLaneMSB;
353+
constexpr auto MLMSB = BooleanSWAR<NBits, T>::MaskMSB;
343354
auto minuend = MLMSB | left;
344355
return MLMSB & (minuend - right);
345356
}
@@ -373,7 +384,7 @@ constexpr SWAR<NBits, T> logarithmFloor(SWAR<NBits, T> v) noexcept {
373384
constexpr auto LogNBits = meta::logFloor(NBits);
374385
static_assert(NBits == (1 << LogNBits), "Logarithms of element width not power of two is un-implemented");
375386
auto whole = v.value();
376-
auto isolationMask = BooleanSWAR<NBits, T>::MaskLaneMSB.value();
387+
auto isolationMask = BooleanSWAR<NBits, T>::MaskMSB.value();
377388
for(auto groupSize = 1; groupSize < NBits; groupSize <<= 1) {
378389
auto shifted = whole >> groupSize;
379390

inc/zoo/swar/associative_iteration.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,16 @@ fullAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
354354
return { result, BS{carry.value()}, BS{overflow.value()} };
355355
};
356356

357+
template<int NB, typename B>
358+
constexpr SWAR<NB, B>
359+
saturatingUnsignedAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
360+
const auto additionResult = fullAddition(s1, s2);
361+
// If the unsigned addition carries out of a lane, that lane must saturate:
362+
// widen each carry bit into a full-lane mask and OR it into the result lanes
363+
// (carry is the flag listed before overflow in the struct fullAddition returns).
364+
return additionResult.carry.MSBtoLaneMask() | additionResult.result;
365+
}
366+
357367
/// \brief Negation is useful only for the signed integer interpretation
358368
template<int NB, typename B>
359369
constexpr auto negate(SWAR<NB, B> input) {

test/swar/BasicOperations.cpp

Lines changed: 93 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,20 @@
88
using namespace zoo;
99
using namespace zoo::swar;
1010

11+
using S4_64 = SWAR<4, uint64_t>;
12+
using S4_32 = SWAR<4, uint32_t>;
13+
using S4_16 = SWAR<4, uint16_t>;
14+
using S4_8 = SWAR<4, uint8_t>;
15+
16+
using S8_64 = SWAR<8, uint64_t>;
17+
using S8_32 = SWAR<8, uint32_t>;
18+
using S8_16 = SWAR<8, uint16_t>;
19+
using S8_8 = SWAR<8, uint8_t>;
20+
21+
using S16_64 = SWAR<16, uint64_t>;
22+
using S16_32 = SWAR<16, uint32_t>;
23+
using S16_16 = SWAR<16, uint16_t>;
24+
1125
namespace Multiplication {
1226

1327
using S4_64 = SWAR<4, uint64_t>;
@@ -85,19 +99,19 @@ TEST_CASE(
8599
"[swar]"
86100
) {
87101
for (auto i = 0; i < 63; ++i) {
88-
CHECK(i == isolate<8>(i));
89-
CHECK(i == isolate<8>(0xFF00+i));
90-
CHECK(i == isolate<8>(0xFFFF00+i));
102+
CHECK(i == isolate<8>(i));
103+
CHECK(i == isolate<8>(0xFF00+i));
104+
CHECK(i == isolate<8>(0xFFFF00+i));
91105
}
92106
for (auto i = 0; i < 31; ++i) {
93-
CHECK(i == isolate<7>(i));
94-
CHECK(i == isolate<7>(0xFF00+i));
95-
CHECK(i == isolate<7>(0xFFFF00+i));
107+
CHECK(i == isolate<7>(i));
108+
CHECK(i == isolate<7>(0xFF00+i));
109+
CHECK(i == isolate<7>(0xFFFF00+i));
96110
}
97111
for (auto i = 0; i < 31; ++i) {
98-
CHECK(i == isolate<11>(i));
99-
CHECK(i == isolate<11>(0xF800+i));
100-
CHECK(i == isolate<11>(0xFFF800+i));
112+
CHECK(i == isolate<11>(i));
113+
CHECK(i == isolate<11>(0xF800+i));
114+
CHECK(i == isolate<11>(0xFFF800+i));
101115
}
102116
}
103117

@@ -282,3 +296,73 @@ constexpr auto aBooleansWithTrue = booleans(SWAR<4, u32>{0x1});
282296
static_assert(aBooleansWithTrue);
283297
static_assert(!aBooleansWithTrue); // this is a pitfall, but lesser evil?
284298
static_assert(false == !bool(aBooleansWithTrue));
299+
300+
TEST_CASE(
301+
"fullAddition",
302+
"[swar][signed-swar][unsigned-swar]"
303+
) {
304+
SECTION("fullAddition overflow") {
305+
const auto sum = fullAddition(SWAR<4, u32>(0x0000'1000), SWAR<4, u32>(0x0000'7000));
306+
CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.carry.value());
307+
CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.overflow.value());
308+
CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.result.value());
309+
}
310+
SECTION("no carry or overflow for safe values") {
311+
const auto sum = fullAddition(SWAR<4, u32>(0x0000'8000), SWAR<4, u32>(0x0000'7000));
312+
CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.carry.value());
313+
CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.overflow.value());
314+
CHECK(SWAR<4, u32>(0x0000'F000).value() == sum.result.value());
315+
}
316+
SECTION("fullAddition signed overflow") {
317+
const auto sum = fullAddition(SWAR<4, u32>(0x0000'5000), SWAR<4, u32>(0x0000'5000));
318+
CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.carry.value());
319+
CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.overflow.value());
320+
CHECK(SWAR<4, u32>(0x0000'A000).value() == sum.result.value());
321+
}
322+
SECTION("0x0111 (7) + 0x0111 (7) is 0x1110 (0x1110->0x1101->0x0010) (0xe unsigned, 0x2 signed) (signed and unsigned check)") {
323+
const auto sum = fullAddition(SWAR<4, u32>(0x0000'7000), SWAR<4, u32>(0x0000'7000));
324+
CHECK(SWAR<4, u32>(0x0000'0000).value() == sum.carry.value());
325+
CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.overflow.value());
326+
CHECK(SWAR<4, u32>(0x0000'e000).value() == sum.result.value());
327+
}
328+
SECTION("both carry and overflow") {
329+
const auto sum = fullAddition(SWAR<4, u32>(0x0000'a000), SWAR<4, u32>(0x0000'a000));
330+
CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.carry.value());
331+
CHECK(SWAR<4, u32>(0x0000'8000).value() == sum.overflow.value());
332+
}
333+
}
334+
335+
TEST_CASE(
336+
"BooleanSWAR MSBtoLaneMask",
337+
"[swar]"
338+
) {
339+
// BooleanSWAR as a mask:
340+
auto bswar =BooleanSWAR<4, u32>(0x0808'0000);
341+
auto mask = S4_32(0x0F0F'0000);
342+
CHECK(bswar.MSBtoLaneMask().value() == mask.value());
343+
}
344+
345+
constexpr auto fullAddSumTest = fullAddition(S4_32(0x0111'1101), S4_32(0x1000'0010));
346+
static_assert( S4_32(0x1111'1111).value() == fullAddSumTest.result.value());
347+
static_assert( S4_32(0x0000'0000).value() == fullAddSumTest.carry.value());
348+
static_assert( S4_32(0x0000'0000).value() == fullAddSumTest.overflow.value());
349+
350+
// Verify that saturation works (saturates and doesn't saturate as appropriate)
351+
static_assert( S4_16(0x0000).value() == saturatingUnsignedAddition(S4_16(0x0000), S4_16(0x0000)).value());
352+
static_assert( S4_16(0x0200).value() == saturatingUnsignedAddition(S4_16(0x0100), S4_16(0x0100)).value());
353+
static_assert( S4_16(0x0400).value() == saturatingUnsignedAddition(S4_16(0x0300), S4_16(0x0100)).value());
354+
static_assert( S4_16(0x0A00).value() == saturatingUnsignedAddition(S4_16(0x0300), S4_16(0x0700)).value());
355+
static_assert( S4_16(0x0F00).value() == saturatingUnsignedAddition(S4_16(0x0800), S4_16(0x0700)).value());
356+
static_assert( S4_16(0x0F00).value() == saturatingUnsignedAddition(S4_16(0x0800), S4_16(0x0800)).value());
357+
358+
TEST_CASE(
359+
"saturatingUnsignedAddition",
360+
"[swar][saturation]"
361+
) {
362+
CHECK(SWAR<4, u16>(0x0200).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0100), SWAR<4, u16>(0x0100)).value());
363+
CHECK(SWAR<4, u16>(0x0400).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0100), SWAR<4, u16>(0x0300)).value());
364+
CHECK(SWAR<4, u16>(0x0B00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0300)).value());
365+
CHECK(SWAR<4, u16>(0x0F00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0700)).value());
366+
CHECK(SWAR<4, u16>(0x0F00).value() == saturatingUnsignedAddition(SWAR<4, u16>(0x0800), SWAR<4, u16>(0x0800)).value());
367+
CHECK(S4_32(0x0F0C'F000).value() == saturatingUnsignedAddition(S4_32(0x0804'F000), S4_32(0x0808'F000)).value());
368+
}

0 commit comments

Comments
 (0)