From 9ecef324a12a9cff59927858ba37b4c48261c364 Mon Sep 17 00:00:00 2001
From: Yat Long Poon
Date: Mon, 29 Sep 2025 13:47:48 +0100
Subject: [PATCH] Add PairwiseAdd and ComplexMultiply to SVE microbenchmark

---
 src/benchmarks/micro/sve/ComplexMultiply.cs | 210 ++++++++++++++++++++
 src/benchmarks/micro/sve/PairwiseAdd.cs     | 199 +++++++++++++++++++
 2 files changed, 409 insertions(+)
 create mode 100644 src/benchmarks/micro/sve/ComplexMultiply.cs
 create mode 100644 src/benchmarks/micro/sve/PairwiseAdd.cs

diff --git a/src/benchmarks/micro/sve/ComplexMultiply.cs b/src/benchmarks/micro/sve/ComplexMultiply.cs
new file mode 100644
index 00000000000..569f20c9c28
--- /dev/null
+++ b/src/benchmarks/micro/sve/ComplexMultiply.cs
@@ -0,0 +1,210 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class ComplexMultiply
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve2.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private uint[] _source1;
+        private uint[] _source2;
+        private uint[] _result;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _source1 = ValuesGenerator.Array<uint>(Size * 2);
+            _source2 = ValuesGenerator.Array<uint>(Size * 2);
+            _result = new uint[Size * 2];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            uint[] current = (uint[])_result.Clone();
+            Setup();
+            Scalar();
+            uint[] scalar = (uint[])_result.Clone();
+            // Check that the result is the same as the scalar result.
+            for (int i = 0; i < current.Length; i++)
+            {
+                Debug.Assert(current[i] == scalar[i]);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_112.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (uint* a = _source1, b = _source2, c = _result)
+            {
+                for (int i = 0; i < Size * 2; i += 2)
+                {
+                    // Index i is the real part, i + 1 is the imaginary part.
+                    c[i] = (a[i] * b[i]) - (a[i + 1] * b[i + 1]);
+                    c[i + 1] = (a[i] * b[i + 1]) + (a[i + 1] * b[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128ComplexMultiply()
+        {
+            fixed (uint* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int lmt = (Size * 2) - 8;
+                for (; i <= lmt; i += 8)
+                {
+                    Vector128<uint> cRe = Vector128<uint>.Zero;
+                    Vector128<uint> cIm = Vector128<uint>.Zero;
+
+                    // Load real and imaginary parts separately.
+                    (Vector128<uint> aRe, Vector128<uint> aIm) = AdvSimd.Arm64.Load2xVector128AndUnzip(a + i);
+                    (Vector128<uint> bRe, Vector128<uint> bIm) = AdvSimd.Arm64.Load2xVector128AndUnzip(b + i);
+
+                    // Perform multiplication.
+                    cRe = AdvSimd.MultiplyAdd(cRe, aRe, bRe);
+                    cRe = AdvSimd.MultiplySubtract(cRe, aIm, bIm);
+                    cIm = AdvSimd.MultiplyAdd(cIm, aRe, bIm);
+                    cIm = AdvSimd.MultiplyAdd(cIm, aIm, bRe);
+
+                    // Store the output real and imaginary parts.
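+                    // StoreVectorAndZip performs an ST2-style store: the elements of cRe and
+                    // cIm are interleaved back into the (re, im) memory layout.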
+                    AdvSimd.Arm64.StoreVectorAndZip(c + i, (cRe, cIm));
+                }
+                for (; i < Size * 2; i += 2)
+                {
+                    c[i] = (a[i] * b[i]) - (a[i + 1] * b[i + 1]);
+                    c[i + 1] = (a[i] * b[i + 1]) + (a[i + 1] * b[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveComplexMultiply()
+        {
+            fixed (uint* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int cntw = (int)Sve.Count32BitElements();
+                // Set limit to Size * 2 - cntw * 2.
+                int lmt = (Size - cntw) * 2;
+
+                Vector<uint> pTrue = Sve.CreateTrueMaskUInt32();
+                for (; i <= lmt; i += (cntw << 1))
+                {
+                    Vector<uint> cRe = Vector<uint>.Zero;
+                    Vector<uint> cIm = Vector<uint>.Zero;
+
+                    // Load real and imaginary parts separately.
+                    (Vector<uint> aRe, Vector<uint> aIm) = Sve.Load2xVectorAndUnzip(pTrue, a + i);
+                    (Vector<uint> bRe, Vector<uint> bIm) = Sve.Load2xVectorAndUnzip(pTrue, b + i);
+
+                    // Perform multiplication.
+                    cRe = Sve.MultiplyAdd(cRe, aRe, bRe);
+                    cRe = Sve.MultiplySubtract(cRe, aIm, bIm);
+                    cIm = Sve.MultiplyAdd(cIm, aRe, bIm);
+                    cIm = Sve.MultiplyAdd(cIm, aIm, bRe);
+
+                    // Interleave and store the output real and imaginary parts.
+                    Sve.StoreAndZip(pTrue, c + i, (cRe, cIm));
+                }
+
+                // Handle remaining elements using predicates.
+                lmt = Size * 2;
+                Vector<uint> pLoop = Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                if (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Compute the predicate for the elements starting at i + cntw.
+                    Vector<uint> pTail = Sve.CreateWhileLessThanMask32Bit(i + cntw, lmt);
+                    // Unzip the predicates pLoop and pTail for 2xVector load/store.
+                    Vector<uint> pInner = Sve.UnzipEven(pLoop, pTail);
+
+                    Vector<uint> cRe = Vector<uint>.Zero;
+                    Vector<uint> cIm = Vector<uint>.Zero;
+                    (Vector<uint> aRe, Vector<uint> aIm) = Sve.Load2xVectorAndUnzip(pInner, a + i);
+                    (Vector<uint> bRe, Vector<uint> bIm) = Sve.Load2xVectorAndUnzip(pInner, b + i);
+                    cRe = Sve.MultiplyAdd(cRe, aRe, bRe);
+                    cRe = Sve.MultiplySubtract(cRe, aIm, bIm);
+                    cIm = Sve.MultiplyAdd(cIm, aRe, bIm);
+                    cIm = Sve.MultiplyAdd(cIm, aIm, bRe);
+                    Sve.StoreAndZip(pInner, c + i, (cRe, cIm));
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Sve2ComplexMultiply()
+        {
+            fixed (uint* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+                int lmt = Size - (cntd << 1); // Each iteration consumes two vectors of 64-bit elements.
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                for (; i <= lmt; i += (cntd << 1))
+                {
+                    Vector<uint> c1 = Vector<uint>.Zero;
+                    Vector<uint> c2 = Vector<uint>.Zero;
+
+                    // Read complex numbers as 64-bit then reinterpret as 32-bit vectors.
+                    Vector<uint> a1 = (Vector<uint>)Sve2.LoadVector(pTrue, (ulong*)a + i);
+                    Vector<uint> a2 = (Vector<uint>)Sve2.LoadVector(pTrue, (ulong*)a + i + cntd);
+                    Vector<uint> b1 = (Vector<uint>)Sve2.LoadVector(pTrue, (ulong*)b + i);
+                    Vector<uint> b2 = (Vector<uint>)Sve2.LoadVector(pTrue, (ulong*)b + i + cntd);
+
+                    // Perform multiplication.
+                    c1 = Sve2.MultiplyAddRotateComplex(c1, a1, b1, 0);
+                    c1 = Sve2.MultiplyAddRotateComplex(c1, a1, b1, 1);
+                    c2 = Sve2.MultiplyAddRotateComplex(c2, a2, b2, 0);
+                    c2 = Sve2.MultiplyAddRotateComplex(c2, a2, b2, 1);
+
+                    // Store to output as 64-bit vectors.
+                    Sve2.StoreAndZip(pTrue, (ulong*)c + i, (Vector<ulong>)(c1));
+                    Sve2.StoreAndZip(pTrue, (ulong*)c + i + cntd, (Vector<ulong>)(c2));
+                }
+
+                // Handle remaining elements.
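+                // The while-less-than predicate is true only for lanes whose element index
+                // is still below lmt, so the final partial vector is processed without
+                // reading or writing past the end of the buffers.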
+                lmt = Size;
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, lmt);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    Vector<uint> a1 = (Vector<uint>)Sve2.LoadVector(pLoop, (ulong*)a + i);
+                    Vector<uint> b1 = (Vector<uint>)Sve2.LoadVector(pLoop, (ulong*)b + i);
+                    Vector<uint> c1 = Vector<uint>.Zero;
+                    c1 = Sve2.MultiplyAddRotateComplex(c1, a1, b1, 0);
+                    c1 = Sve2.MultiplyAddRotateComplex(c1, a1, b1, 1);
+                    Sve.StoreAndZip(pLoop, (ulong*)c + i, (Vector<ulong>)(c1));
+
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, lmt);
+                }
+            }
+        }
+
+    }
+}
diff --git a/src/benchmarks/micro/sve/PairwiseAdd.cs b/src/benchmarks/micro/sve/PairwiseAdd.cs
new file mode 100644
index 00000000000..2fc89e6aa6e
--- /dev/null
+++ b/src/benchmarks/micro/sve/PairwiseAdd.cs
@@ -0,0 +1,199 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class PairwiseAdd
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve2.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private int[] _source1;
+        private int[] _source2;
+        private int[] _result;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _source1 = ValuesGenerator.Array<int>(Size * 2);
+            _source2 = new int[Size * 2];
+            for (int i = 0; i < _source2.Length; i++)
+            {
+                _source2[i] = _source1[i] * 2 + 3;
+            }
+            _result = new int[Size * 2];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            int[] current = (int[])_result.Clone();
+            Setup();
+            Scalar();
+            int[] scalar = (int[])_result.Clone();
+            // Check that the result is the same as the scalar result.
+            for (int i = 0; i < current.Length; i++)
+            {
+                Debug.Assert(current[i] == scalar[i]);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_113.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (int* a = _source1, b = _source2, c = _result)
+            {
+                for (int i = 0; i < Size * 2; i += 2)
+                {
+                    c[i] = a[i] + a[i + 1];
+                    c[i + 1] = b[i] + b[i + 1];
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128PairwiseAdd()
+        {
+            fixed (int* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int lmt = (Size * 2) - 8;
+
+                for (; i < lmt; i += 8)
+                {
+                    // Load 2 vectors worth of elements from a and b.
+                    Vector128<int> a0 = AdvSimd.LoadVector128(a + i);
+                    Vector128<int> b0 = AdvSimd.LoadVector128(b + i);
+                    Vector128<int> a1 = AdvSimd.LoadVector128(a + i + 4);
+                    Vector128<int> b1 = AdvSimd.LoadVector128(b + i + 4);
+
+                    // Pairwise add the vectors a and b.
+                    Vector128<int> c0 = AdvSimd.Arm64.AddPairwise(a0, a1);
+                    Vector128<int> c1 = AdvSimd.Arm64.AddPairwise(b0, b1);
+
+                    // Store the results to c.
+                    AdvSimd.Arm64.StoreVectorAndZip(c + i, (c0, c1));
+                }
+
+                // Handle remaining elements.
+                for (; i < Size * 2; i += 2)
+                {
+                    c[i] = a[i] + a[i + 1];
+                    c[i + 1] = b[i] + b[i + 1];
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SvePairwiseAdd()
+        {
+            fixed (int* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int cntw = (int)Sve.Count32BitElements();
+                // Set limit to Size * 2 - cntw * 2.
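+                // Each unrolled iteration consumes 2 * cntw elements, hence the limit below.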
+                int lmt = (Size - cntw) * 2;
+
+                Vector<int> pTrue = Sve.CreateTrueMaskInt32();
+                for (; i <= lmt; i += cntw << 1)
+                {
+                    // Load and unzip 2 vectors worth of elements.
+                    (Vector<int> a0, Vector<int> a1) = Sve.Load2xVectorAndUnzip(pTrue, a + i);
+                    (Vector<int> b0, Vector<int> b1) = Sve.Load2xVectorAndUnzip(pTrue, b + i);
+
+                    // Add the components of a and b respectively.
+                    Vector<int> c0 = Sve.Add(a0, a1);
+                    Vector<int> c1 = Sve.Add(b0, b1);
+
+                    // Interleave and store the results to c.
+                    Sve.StoreAndZip(pTrue, c + i, (c0, c1));
+                }
+
+                // Handle remaining elements using predicates.
+                lmt = Size * 2;
+                Vector<int> pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                if (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Compute the predicate for the elements starting at i + cntw.
+                    Vector<int> pTail = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i + cntw, lmt);
+                    // Unzip the predicates pLoop and pTail for 2xVector load/store.
+                    Vector<int> pInner = Sve.UnzipEven(pLoop, pTail);
+
+                    (Vector<int> a0, Vector<int> a1) = Sve.Load2xVectorAndUnzip(pInner, a + i);
+                    (Vector<int> b0, Vector<int> b1) = Sve.Load2xVectorAndUnzip(pInner, b + i);
+                    Vector<int> c0 = Sve.Add(a0, a1);
+                    Vector<int> c1 = Sve.Add(b0, b1);
+                    Sve.StoreAndZip(pInner, c + i, (c0, c1));
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Sve2PairwiseAdd()
+        {
+            fixed (int* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int cntw = (int)Sve.Count32BitElements();
+                // Set limit to Size * 2 - cntw * 2.
+                int lmt = (Size - cntw) * 2;
+
+                Vector<int> pTrue = Sve.CreateTrueMaskInt32();
+                // Unroll loop to handle 2 vectors at a time.
+                for (; i <= lmt; i += cntw << 1)
+                {
+                    // Load 2 vectors from a and b.
+                    Vector<int> a0 = Sve.LoadVector(pTrue, a + i);
+                    Vector<int> b0 = Sve.LoadVector(pTrue, b + i);
+                    Vector<int> a1 = Sve.LoadVector(pTrue, a + i + cntw);
+                    Vector<int> b1 = Sve.LoadVector(pTrue, b + i + cntw);
+
+                    // Pairwise add the vectors a and b.
+                    Vector<int> c0 = Sve2.AddPairwise(a0, b0);
+                    Vector<int> c1 = Sve2.AddPairwise(a1, b1);
+
+                    // Store the results to c.
+                    Sve.StoreAndZip(pTrue, c + i, c0);
+                    Sve.StoreAndZip(pTrue, c + i + cntw, c1);
+                }
+
+                // Handle remaining elements.
+                lmt = Size * 2;
+                Vector<int> pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    Vector<int> a0 = Sve.LoadVector(pLoop, a + i);
+                    Vector<int> b0 = Sve.LoadVector(pLoop, b + i);
+                    Vector<int> c0 = Sve2.AddPairwise(a0, b0);
+                    Sve.StoreAndZip(pLoop, c + i, c0);
+                    i += cntw;
+                    pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                }
+            }
+        }
+
+
+    }
+}