From 9ecef324a12a9cff59927858ba37b4c48261c364 Mon Sep 17 00:00:00 2001
From: Yat Long Poon
Date: Mon, 29 Sep 2025 13:47:48 +0100
Subject: [PATCH] Add PairwiseAdd and ComplexMultiply to SVE microbenchmark

---
 src/benchmarks/micro/sve/ComplexMultiply.cs | 210 ++++++++++++++++++++
 src/benchmarks/micro/sve/PairwiseAdd.cs     | 199 +++++++++++++++++++
 2 files changed, 409 insertions(+)
 create mode 100644 src/benchmarks/micro/sve/ComplexMultiply.cs
 create mode 100644 src/benchmarks/micro/sve/PairwiseAdd.cs

diff --git a/src/benchmarks/micro/sve/ComplexMultiply.cs b/src/benchmarks/micro/sve/ComplexMultiply.cs
new file mode 100644
index 00000000000..569f20c9c28
--- /dev/null
+++ b/src/benchmarks/micro/sve/ComplexMultiply.cs
@@ -0,0 +1,210 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class ComplexMultiply
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve2.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private uint[] _source1;
+        private uint[] _source2;
+        private uint[] _result;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _source1 = ValuesGenerator.Array<uint>(Size * 2);
+            _source2 = ValuesGenerator.Array<uint>(Size * 2);
+            _result = new uint[Size * 2];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            uint[] current = (uint[])_result.Clone();
+            Setup();
+            Scalar();
+            uint[] scalar = (uint[])_result.Clone();
+            // Check that the result is the same as the scalar result.
+            for (int i = 0; i < current.Length; i++)
+            {
+                Debug.Assert(current[i] == scalar[i]);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_112.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (uint* a = _source1, b = _source2, c = _result)
+            {
+                for (int i = 0; i < Size * 2; i += 2)
+                {
+                    // Index i is the real part, i + 1 is the imaginary part.
+                    c[i] = (a[i] * b[i]) - (a[i + 1] * b[i + 1]);
+                    c[i + 1] = (a[i] * b[i + 1]) + (a[i + 1] * b[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128ComplexMultiply()
+        {
+            fixed (uint* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int lmt = (Size * 2) - 8;
+                for (; i <= lmt; i += 8)
+                {
+                    Vector128<uint> cRe = Vector128<uint>.Zero;
+                    Vector128<uint> cIm = Vector128<uint>.Zero;
+
+                    // Load real and imaginary parts separately.
+                    (Vector128<uint> aRe, Vector128<uint> aIm) = AdvSimd.Arm64.Load2xVector128AndUnzip(a + i);
+                    (Vector128<uint> bRe, Vector128<uint> bIm) = AdvSimd.Arm64.Load2xVector128AndUnzip(b + i);
+
+                    // Perform multiplication.
+                    cRe = AdvSimd.MultiplyAdd(cRe, aRe, bRe);
+                    cRe = AdvSimd.MultiplySubtract(cRe, aIm, bIm);
+                    cIm = AdvSimd.MultiplyAdd(cIm, aRe, bIm);
+                    cIm = AdvSimd.MultiplyAdd(cIm, aIm, bRe);
+
+                    // Store the output real and imaginary parts.
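+                    // StoreVectorAndZip performs an ST2-style store: the elements of cRe and
+                    // cIm are interleaved back into the (re, im) memory layout.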
+                    AdvSimd.Arm64.StoreVectorAndZip(c + i, (cRe, cIm));
+                }
+                for (; i < Size * 2; i += 2)
+                {
+                    c[i] = (a[i] * b[i]) - (a[i + 1] * b[i + 1]);
+                    c[i + 1] = (a[i] * b[i + 1]) + (a[i + 1] * b[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveComplexMultiply()
+        {
+            fixed (uint* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int cntw = (int)Sve.Count32BitElements();
+                // Set limit to Size * 2 - cntw * 2.
+                int lmt = (Size - cntw) * 2;
+
+                Vector<uint> pTrue = Sve.CreateTrueMaskUInt32();
+                for (; i <= lmt; i += (cntw << 1))
+                {
+                    Vector<uint> cRe = Vector<uint>.Zero;
+                    Vector<uint> cIm = Vector<uint>.Zero;
+
+                    // Load real and imaginary parts separately.
+                    (Vector<uint> aRe, Vector<uint> aIm) = Sve.Load2xVectorAndUnzip(pTrue, a + i);
+                    (Vector<uint> bRe, Vector<uint> bIm) = Sve.Load2xVectorAndUnzip(pTrue, b + i);
+
+                    // Perform multiplication.
+                    cRe = Sve.MultiplyAdd(cRe, aRe, bRe);
+                    cRe = Sve.MultiplySubtract(cRe, aIm, bIm);
+                    cIm = Sve.MultiplyAdd(cIm, aRe, bIm);
+                    cIm = Sve.MultiplyAdd(cIm, aIm, bRe);
+
+                    // Interleave and store the output real and imaginary parts.
+                    Sve.StoreAndZip(pTrue, c + i, (cRe, cIm));
+                }
+
+                // Handle remaining elements using predicates.
+                lmt = Size * 2;
+                Vector<uint> pLoop = Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                if (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Compute the predicate for the elements starting at i + cntw.
+                    Vector<uint> pTail = Sve.CreateWhileLessThanMask32Bit(i + cntw, lmt);
+                    // Unzip the predicates pLoop and pTail for 2xVector load/store.
+                    Vector<uint> pInner = Sve.UnzipEven(pLoop, pTail);
+
+                    Vector<uint> cRe = Vector<uint>.Zero;
+                    Vector<uint> cIm = Vector<uint>.Zero;
+                    (Vector<uint> aRe, Vector<uint> aIm) = Sve.Load2xVectorAndUnzip(pInner, a + i);
+                    (Vector<uint> bRe, Vector<uint> bIm) = Sve.Load2xVectorAndUnzip(pInner, b + i);
+                    cRe = Sve.MultiplyAdd(cRe, aRe, bRe);
+                    cRe = Sve.MultiplySubtract(cRe, aIm, bIm);
+                    cIm = Sve.MultiplyAdd(cIm, aRe, bIm);
+                    cIm = Sve.MultiplyAdd(cIm, aIm, bRe);
+                    Sve.StoreAndZip(pInner, c + i, (cRe, cIm));
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Sve2ComplexMultiply()
+        {
+            fixed (uint* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+                int lmt = Size - (cntd << 1); // Each iteration consumes two vectors of 64-bit elements.
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                for (; i <= lmt; i += (cntd << 1))
+                {
+                    Vector<uint> c1 = Vector<uint>.Zero;
+                    Vector<uint> c2 = Vector<uint>.Zero;
+
+                    // Read complex numbers as 64-bit then reinterpret as 32-bit vectors.
+                    Vector<uint> a1 = (Vector<uint>)Sve2.LoadVector(pTrue, (ulong*)a + i);
+                    Vector<uint> a2 = (Vector<uint>)Sve2.LoadVector(pTrue, (ulong*)a + i + cntd);
+                    Vector<uint> b1 = (Vector<uint>)Sve2.LoadVector(pTrue, (ulong*)b + i);
+                    Vector<uint> b2 = (Vector<uint>)Sve2.LoadVector(pTrue, (ulong*)b + i + cntd);
+
+                    // Perform multiplication.
+                    c1 = Sve2.MultiplyAddRotateComplex(c1, a1, b1, 0);
+                    c1 = Sve2.MultiplyAddRotateComplex(c1, a1, b1, 1);
+                    c2 = Sve2.MultiplyAddRotateComplex(c2, a2, b2, 0);
+                    c2 = Sve2.MultiplyAddRotateComplex(c2, a2, b2, 1);
+
+                    // Store to output as 64-bit vectors.
+                    Sve2.StoreAndZip(pTrue, (ulong*)c + i, (Vector<ulong>)(c1));
+                    Sve2.StoreAndZip(pTrue, (ulong*)c + i + cntd, (Vector<ulong>)(c2));
+                }
+
+                // Handle remaining elements.
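+                // The while-less-than predicate is true only for lanes whose element index
+                // is still below lmt, so the final partial vector is processed without
+                // reading or writing past the end of the buffers.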
+                lmt = Size;
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, lmt);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    Vector<uint> a1 = (Vector<uint>)Sve2.LoadVector(pLoop, (ulong*)a + i);
+                    Vector<uint> b1 = (Vector<uint>)Sve2.LoadVector(pLoop, (ulong*)b + i);
+                    Vector<uint> c1 = Vector<uint>.Zero;
+                    c1 = Sve2.MultiplyAddRotateComplex(c1, a1, b1, 0);
+                    c1 = Sve2.MultiplyAddRotateComplex(c1, a1, b1, 1);
+                    Sve.StoreAndZip(pLoop, (ulong*)c + i, (Vector<ulong>)(c1));
+
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, lmt);
+                }
+            }
+        }
+
+    }
+}
diff --git a/src/benchmarks/micro/sve/PairwiseAdd.cs b/src/benchmarks/micro/sve/PairwiseAdd.cs
new file mode 100644
index 00000000000..2fc89e6aa6e
--- /dev/null
+++ b/src/benchmarks/micro/sve/PairwiseAdd.cs
@@ -0,0 +1,199 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class PairwiseAdd
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve2.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private int[] _source1;
+        private int[] _source2;
+        private int[] _result;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _source1 = ValuesGenerator.Array<int>(Size * 2);
+            _source2 = new int[Size * 2];
+            for (int i = 0; i < _source2.Length; i++)
+            {
+                _source2[i] = _source1[i] * 2 + 3;
+            }
+            _result = new int[Size * 2];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            int[] current = (int[])_result.Clone();
+            Setup();
+            Scalar();
+            int[] scalar = (int[])_result.Clone();
+            // Check that the result is the same as the scalar result.
+            for (int i = 0; i < current.Length; i++)
+            {
+                Debug.Assert(current[i] == scalar[i]);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_113.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (int* a = _source1, b = _source2, c = _result)
+            {
+                for (int i = 0; i < Size * 2; i += 2)
+                {
+                    c[i] = a[i] + a[i + 1];
+                    c[i + 1] = b[i] + b[i + 1];
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128PairwiseAdd()
+        {
+            fixed (int* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int lmt = (Size * 2) - 8;
+
+                for (; i < lmt; i += 8)
+                {
+                    // Load 2 vectors worth of elements from a and b.
+                    Vector128<int> a0 = AdvSimd.LoadVector128(a + i);
+                    Vector128<int> b0 = AdvSimd.LoadVector128(b + i);
+                    Vector128<int> a1 = AdvSimd.LoadVector128(a + i + 4);
+                    Vector128<int> b1 = AdvSimd.LoadVector128(b + i + 4);
+
+                    // Pairwise add the vectors a and b.
+                    Vector128<int> c0 = AdvSimd.Arm64.AddPairwise(a0, a1);
+                    Vector128<int> c1 = AdvSimd.Arm64.AddPairwise(b0, b1);
+
+                    // Store the results to c.
+                    AdvSimd.Arm64.StoreVectorAndZip(c + i, (c0, c1));
+                }
+
+                // Handle remaining elements.
+                for (; i < Size * 2; i += 2)
+                {
+                    c[i] = a[i] + a[i + 1];
+                    c[i + 1] = b[i] + b[i + 1];
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SvePairwiseAdd()
+        {
+            fixed (int* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int cntw = (int)Sve.Count32BitElements();
+                // Set limit to Size * 2 - cntw * 2.
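+                // Each unrolled iteration consumes 2 * cntw elements, hence the limit below.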
+                int lmt = (Size - cntw) * 2;
+
+                Vector<int> pTrue = Sve.CreateTrueMaskInt32();
+                for (; i <= lmt; i += cntw << 1)
+                {
+                    // Load and unzip 2 vectors worth of elements.
+                    (Vector<int> a0, Vector<int> a1) = Sve.Load2xVectorAndUnzip(pTrue, a + i);
+                    (Vector<int> b0, Vector<int> b1) = Sve.Load2xVectorAndUnzip(pTrue, b + i);
+
+                    // Add the components of a and b respectively.
+                    Vector<int> c0 = Sve.Add(a0, a1);
+                    Vector<int> c1 = Sve.Add(b0, b1);
+
+                    // Interleave and store the results to c.
+                    Sve.StoreAndZip(pTrue, c + i, (c0, c1));
+                }
+
+                // Handle remaining elements using predicates.
+                lmt = Size * 2;
+                Vector<int> pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                if (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Compute the predicate for the elements starting at i + cntw.
+                    Vector<int> pTail = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i + cntw, lmt);
+                    // Unzip the predicates pLoop and pTail for 2xVector load/store.
+                    Vector<int> pInner = Sve.UnzipEven(pLoop, pTail);
+
+                    (Vector<int> a0, Vector<int> a1) = Sve.Load2xVectorAndUnzip(pInner, a + i);
+                    (Vector<int> b0, Vector<int> b1) = Sve.Load2xVectorAndUnzip(pInner, b + i);
+                    Vector<int> c0 = Sve.Add(a0, a1);
+                    Vector<int> c1 = Sve.Add(b0, b1);
+                    Sve.StoreAndZip(pInner, c + i, (c0, c1));
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Sve2PairwiseAdd()
+        {
+            fixed (int* a = _source1, b = _source2, c = _result)
+            {
+                int i = 0;
+                int cntw = (int)Sve.Count32BitElements();
+                // Set limit to Size * 2 - cntw * 2.
+                int lmt = (Size - cntw) * 2;
+
+                Vector<int> pTrue = Sve.CreateTrueMaskInt32();
+                // Unroll loop to handle 2 vectors at a time.
+                for (; i <= lmt; i += cntw << 1)
+                {
+                    // Load 2 vectors from a and b.
+                    Vector<int> a0 = Sve.LoadVector(pTrue, a + i);
+                    Vector<int> b0 = Sve.LoadVector(pTrue, b + i);
+                    Vector<int> a1 = Sve.LoadVector(pTrue, a + i + cntw);
+                    Vector<int> b1 = Sve.LoadVector(pTrue, b + i + cntw);
+
+                    // Pairwise add the vectors a and b.
+                    Vector<int> c0 = Sve2.AddPairwise(a0, b0);
+                    Vector<int> c1 = Sve2.AddPairwise(a1, b1);
+
+                    // Store the results to c.
+                    Sve.StoreAndZip(pTrue, c + i, c0);
+                    Sve.StoreAndZip(pTrue, c + i + cntw, c1);
+                }
+
+                // Handle remaining elements.
+                lmt = Size * 2;
+                Vector<int> pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    Vector<int> a0 = Sve.LoadVector(pLoop, a + i);
+                    Vector<int> b0 = Sve.LoadVector(pLoop, b + i);
+                    Vector<int> c0 = Sve2.AddPairwise(a0, b0);
+                    Sve.StoreAndZip(pLoop, c + i, c0);
+                    i += cntw;
+                    pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                }
+            }
+        }
+
+
+    }
+}