From 0a4b08ec3261c3a558abcc026e3cc3021dded068 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Tue, 28 May 2024 17:39:45 -0700 Subject: [PATCH 01/10] align --- BitFaster.Caching/Lfu/CmSketchCore.cs | 34 +++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index de255840..3d6b64c9 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -1,6 +1,11 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + + #if !NETSTANDARD2_0 @@ -24,7 +29,7 @@ namespace BitFaster.Caching.Lfu /// /// This is a direct C# translation of FrequencySketch in the Caffeine library by ben.manes@gmail.com (Ben Manes). /// https://github.com/ben-manes/caffeine - public class CmSketchCore + public unsafe class CmSketchCore where T : notnull where I : struct, IsaProbe { @@ -32,6 +37,7 @@ public class CmSketchCore private const long OneMask = 0x1111111111111111L; private long[] table; + private long* tableAddr; private int sampleSize; private int blockMask; private int size; @@ -111,7 +117,15 @@ public void Increment(T value) /// public void Clear() { + #if NET6_0_OR_GREATER + table = GC.AllocateArray(table.Length, true); + GCHandle handle = GCHandle.Alloc(table, GCHandleType.Pinned); + IntPtr pointer = handle.AddrOfPinnedObject(); + + tableAddr = (long*)pointer + pointer.ToInt64() % 32; + #else table = new long[table.Length]; + #endif size = 0; } @@ -120,8 +134,19 @@ private void EnsureCapacity(long maximumSize) { int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); + #if NET6_0_OR_GREATER + // over alloc by 4 to give 32 byte buffer + table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + 4, true); + + GCHandle handle = GCHandle.Alloc(table, GCHandleType.Pinned); + IntPtr pointer = 
handle.AddrOfPinnedObject(); + + tableAddr = (long*)pointer + pointer.ToInt64() % 32; + + #else table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; - blockMask = (int)((uint)table.Length >> 3) - 1; + #endif + blockMask = (int)((uint)(table.Length-4) >> 3) - 1; sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); size = 0; @@ -246,7 +271,7 @@ private unsafe int EstimateFrequencyAvx(T value) Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) - fixed (long* tablePtr = table) + long* tablePtr = tableAddr; { Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); index = Avx2.ShiftLeftLogical(index, 2); @@ -289,7 +314,8 @@ private unsafe void IncrementAvx(T value) Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) - fixed (long* tablePtr = table) + //fixed (long* tablePtr = table) + long* tablePtr = tableAddr; { Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); From 2e259f0a58deca32b308cc5cfbd0a95133bf41be Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Tue, 28 May 2024 18:23:16 -0700 Subject: [PATCH 02/10] unsafe as ptr --- BitFaster.Caching/Lfu/CmSketchCore.cs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 3d6b64c9..9275d584 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -119,10 +119,8 @@ public void Clear() { #if NET6_0_OR_GREATER table = GC.AllocateArray(table.Length, true); - GCHandle handle = GCHandle.Alloc(table, GCHandleType.Pinned); - IntPtr pointer = handle.AddrOfPinnedObject(); - - tableAddr = (long*)pointer + pointer.ToInt64() % 32; + long pointer = (long)Unsafe.AsPointer(ref table[0]); + tableAddr = 
(long*)pointer + pointer % 32; #else table = new long[table.Length]; #endif @@ -137,12 +135,8 @@ private void EnsureCapacity(long maximumSize) #if NET6_0_OR_GREATER // over alloc by 4 to give 32 byte buffer table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + 4, true); - - GCHandle handle = GCHandle.Alloc(table, GCHandleType.Pinned); - IntPtr pointer = handle.AddrOfPinnedObject(); - - tableAddr = (long*)pointer + pointer.ToInt64() % 32; - + long pointer = (long)Unsafe.AsPointer(ref table[0]); + tableAddr = (long*)pointer + pointer % 32; #else table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; #endif From 98ec2d85a0bbd1be765fef9572b33330972cd721 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Fri, 31 May 2024 17:49:58 -0700 Subject: [PATCH 03/10] always pin --- BitFaster.Caching/Lfu/CmSketchCore.cs | 42 +++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 9275d584..b6d21857 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -1,12 +1,7 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; -using System.Reflection; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; - - - #if !NETSTANDARD2_0 using System.Runtime.Intrinsics; @@ -37,7 +32,9 @@ public unsafe class CmSketchCore private const long OneMask = 0x1111111111111111L; private long[] table; +#if NET6_0_OR_GREATER private long* tableAddr; +#endif private int sampleSize; private int blockMask; private int size; @@ -117,13 +114,7 @@ public void Increment(T value) /// public void Clear() { - #if NET6_0_OR_GREATER - table = GC.AllocateArray(table.Length, true); - long pointer = (long)Unsafe.AsPointer(ref table[0]); - tableAddr = (long*)pointer + pointer % 32; - #else - table = new long[table.Length]; - #endif + Array.Clear(table, 0, table.Length); size = 0; 
} @@ -132,15 +123,17 @@ private void EnsureCapacity(long maximumSize) { int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); - #if NET6_0_OR_GREATER - // over alloc by 4 to give 32 byte buffer - table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + 4, true); +#if NET6_0_OR_GREATER + // over alloc by 4 to give 32 bytes padding, tableAddr is then aligned to 32 bytes + const int buffer = 4; + table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + buffer, true); long pointer = (long)Unsafe.AsPointer(ref table[0]); tableAddr = (long*)pointer + pointer % 32; - #else + blockMask = (int)((uint)(table.Length-buffer) >> 3) - 1; +#else table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; - #endif - blockMask = (int)((uint)(table.Length-4) >> 3) - 1; + blockMask = (int)((uint)(table.Length) >> 3) - 1; +#endif sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); size = 0; @@ -265,7 +258,11 @@ private unsafe int EstimateFrequencyAvx(T value) Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + #if NET6_0_OR_GREATER long* tablePtr = tableAddr; + #else + fixed (long* tablePtr = table) + #endif { Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); index = Avx2.ShiftLeftLogical(index, 2); @@ -283,7 +280,7 @@ private unsafe int EstimateFrequencyAvx(T value) .AsUInt16(); // set the zeroed high parts of the long value to ushort.Max -#if NET6_0 +#if NET6_0_OR_GREATER count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); #else count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); @@ -308,8 +305,11 @@ private unsafe void IncrementAvx(T value) Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) - //fixed (long* tablePtr = table) - long* tablePtr = 
tableAddr; + #if NET6_0_OR_GREATER + long* tablePtr = tableAddr; + #else + fixed (long* tablePtr = table) + #endif { Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); From 84459c0c28ecf82531790dadfb722d51ea8b8b8b Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Fri, 31 May 2024 18:18:49 -0700 Subject: [PATCH 04/10] try without pad --- BitFaster.Caching/Lfu/CmSketchCore.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index b6d21857..3139c26f 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -125,11 +125,11 @@ private void EnsureCapacity(long maximumSize) #if NET6_0_OR_GREATER // over alloc by 4 to give 32 bytes padding, tableAddr is then aligned to 32 bytes - const int buffer = 4; - table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + buffer, true); + const int pad = 4; + table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, true); long pointer = (long)Unsafe.AsPointer(ref table[0]); tableAddr = (long*)pointer + pointer % 32; - blockMask = (int)((uint)(table.Length-buffer) >> 3) - 1; + blockMask = (int)((uint)(table.Length-pad) >> 3) - 1; #else table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; blockMask = (int)((uint)(table.Length) >> 3) - 1; From 3bf061dbe72912eabf3e708319b101b893451ec0 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Fri, 31 May 2024 19:13:11 -0700 Subject: [PATCH 05/10] direct --- BitFaster.Caching/Lfu/CmSketchCore.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 3139c26f..e52a377e 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -127,8 +127,10 @@ private void EnsureCapacity(long maximumSize) // over alloc by 4 to give 32 bytes padding, tableAddr is then aligned 
to 32 bytes const int pad = 4; table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, true); - long pointer = (long)Unsafe.AsPointer(ref table[0]); - tableAddr = (long*)pointer + pointer % 32; + + tableAddr = (long*)Unsafe.AsPointer(ref table[0]); + //long pointer = (long)Unsafe.AsPointer(ref table[0]); + //tableAddr = (long*)pointer + pointer % 32; blockMask = (int)((uint)(table.Length-pad) >> 3) - 1; #else table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; From 6877b0dd25f9ad53fb32be366a93b412ca3b3d5e Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Fri, 31 May 2024 19:53:36 -0700 Subject: [PATCH 06/10] cleanup --- BitFaster.Caching/Lfu/CmSketchCore.cs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index e52a377e..18d95e04 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -2,6 +2,8 @@ using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + #if !NETSTANDARD2_0 using System.Runtime.Intrinsics; @@ -129,9 +131,10 @@ private void EnsureCapacity(long maximumSize) table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, true); tableAddr = (long*)Unsafe.AsPointer(ref table[0]); - //long pointer = (long)Unsafe.AsPointer(ref table[0]); - //tableAddr = (long*)pointer + pointer % 32; - blockMask = (int)((uint)(table.Length-pad) >> 3) - 1; + tableAddr = (long*)((long)tableAddr + (long)tableAddr % 32); + + blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; + #else table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; blockMask = (int)((uint)(table.Length) >> 3) - 1; @@ -260,11 +263,11 @@ private unsafe int EstimateFrequencyAvx(T value) Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index blockOffset = 
Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) - #if NET6_0_OR_GREATER +#if NET6_0_OR_GREATER long* tablePtr = tableAddr; - #else +#else fixed (long* tablePtr = table) - #endif +#endif { Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); index = Avx2.ShiftLeftLogical(index, 2); @@ -307,11 +310,11 @@ private unsafe void IncrementAvx(T value) Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) - #if NET6_0_OR_GREATER +#if NET6_0_OR_GREATER long* tablePtr = tableAddr; - #else +#else fixed (long* tablePtr = table) - #endif +#endif { Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); From 209d429b83776c39c4f9cfb90ec172e46a0cba76 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 21 Nov 2024 04:44:16 +0000 Subject: [PATCH 07/10] align 64 --- .../BitFaster.Caching.Benchmarks.csproj | 1 + .../Lfu/CmSketchNoPin.cs | 318 ++++++++++++++++++ .../Lfu/SketchIncrement.cs | 19 +- BitFaster.Caching/Lfu/CmSketchCore.cs | 31 +- 4 files changed, 355 insertions(+), 14 deletions(-) create mode 100644 BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj index f61ff6a9..df10ca09 100644 --- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj +++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj @@ -2,6 +2,7 @@ Exe + 13 net48;net6.0;net8.0 True diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs new file mode 100644 index 00000000..809a3f9b --- /dev/null +++ b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs @@ -0,0 +1,318 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; + +#if NET6_0_OR_GREATER +using System.Runtime.Intrinsics; +using 
System.Runtime.Intrinsics.X86; +#endif + +namespace BitFaster.Caching.Benchmarks.Lfu +{ + internal class CmSketchNoPin + where T : notnull + where I : struct, IsaProbe + { + private const long ResetMask = 0x7777777777777777L; + private const long OneMask = 0x1111111111111111L; + + private long[] table; + private int sampleSize; + private int blockMask; + private int size; + + private readonly IEqualityComparer comparer; + + /// + /// Initializes a new instance of the CmSketch class with the specified maximum size and equality comparer. + /// + /// The maximum size. + /// The equality comparer. + public CmSketchNoPin(long maximumSize, IEqualityComparer comparer) + { + EnsureCapacity(maximumSize); + this.comparer = comparer; + } + + /// + /// Gets the reset sample size. + /// + public int ResetSampleSize => this.sampleSize; + + /// + /// Gets the size. + /// + public int Size => this.size; + + /// + /// Estimate the frequency of the specified value, up to the maximum of 15. + /// + /// The value. + /// The estimated frequency of the value. + public int EstimateFrequency(T value) + { +#if NET48 + return EstimateFrequencyStd(value); +#else + + I isa = default; + + if (isa.IsAvx2Supported) + { + return EstimateFrequencyAvx(value); + } + else + { + return EstimateFrequencyStd(value); + } +#endif + } + + /// + /// Increment the count of the specified value. + /// + /// The value. + public void Increment(T value) + { +#if NET48 + IncrementStd(value); +#else + + I isa = default; + + if (isa.IsAvx2Supported) + { + IncrementAvx(value); + } + else + { + IncrementStd(value); + } +#endif + } + + /// + /// Clears the count for all items. 
+ /// + public void Clear() + { + table = new long[table.Length]; + size = 0; + } + + // [MemberNotNull(nameof(table))] + private void EnsureCapacity(long maximumSize) + { + int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); + + table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; + blockMask = (int)((uint)table.Length >> 3) - 1; + sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); + + size = 0; + } + + private unsafe int EstimateFrequencyStd(T value) + { + var count = stackalloc int[4]; + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + for (int i = 0; i < 4; i++) + { + int h = (int)((uint)counterHash >> (i << 3)); + int index = (h >> 1) & 15; + int offset = h & 1; + count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); + } + return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); + } + + private unsafe void IncrementStd(T value) + { + var index = stackalloc int[8]; + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + for (int i = 0; i < 4; i++) + { + int h = (int)((uint)counterHash >> (i << 3)); + index[i] = (h >> 1) & 15; + int offset = h & 1; + index[i + 4] = block + offset + (i << 1); + } + + bool added = + IncrementAt(index[4], index[0]) + | IncrementAt(index[5], index[1]) + | IncrementAt(index[6], index[2]) + | IncrementAt(index[7], index[3]); + + if (added && (++size == sampleSize)) + { + Reset(); + } + } + + // Applies another round of hashing for additional randomization. + private static int Rehash(int x) + { + x = (int)(x * 0x31848bab); + x ^= (int)((uint)x >> 14); + return x; + } + + // Applies a supplemental hash function to defend against poor quality hash.
+ private static int Spread(int x) + { + x ^= (int)((uint)x >> 17); + x = (int)(x * 0xed5ad4bb); + x ^= (int)((uint)x >> 11); + x = (int)(x * 0xac4c1b51); + x ^= (int)((uint)x >> 15); + return x; + } + + private bool IncrementAt(int i, int j) + { + int offset = j << 2; + long mask = (0xfL << offset); + + if ((table[i] & mask) != mask) + { + table[i] += (1L << offset); + return true; + } + + return false; + } + + private void Reset() + { + // unroll, almost 2x faster + int count0 = 0; + int count1 = 0; + int count2 = 0; + int count3 = 0; + + for (int i = 0; i < table.Length; i += 4) + { + count0 += BitOps.BitCount(table[i] & OneMask); + count1 += BitOps.BitCount(table[i + 1] & OneMask); + count2 += BitOps.BitCount(table[i + 2] & OneMask); + count3 += BitOps.BitCount(table[i + 3] & OneMask); + + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + } + + count0 = (count0 + count1) + (count2 + count3); + + size = (size - (count0 >> 2)) >> 1; + } + +#if NET6_0_OR_GREATER + private unsafe int EstimateFrequencyAvx(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Vector128.Create(counterHash); + h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + + var index = Avx2.ShiftRightLogical(h, 1); + index = Avx2.And(index, Vector128.Create(15)); // j - counter index + Vector128 offset = Avx2.And(h, Vector128.Create(1)); + Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index + blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + + fixed (long* tablePtr = table) + { + Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); + index = 
Avx2.ShiftLeftLogical(index, 2); + + // convert index from int to long via permute + Vector256 indexLong = Vector256.Create(index, Vector128.Zero).AsInt64(); + Vector256 permuteMask2 = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); + indexLong = Avx2.PermuteVar8x32(indexLong.AsInt32(), permuteMask2).AsInt64(); + tableVector = Avx2.ShiftRightLogicalVariable(tableVector, indexLong.AsUInt64()); + tableVector = Avx2.And(tableVector, Vector256.Create(0xfL)); + + Vector256 permuteMask = Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7); + Vector128 count = Avx2.PermuteVar8x32(tableVector.AsInt32(), permuteMask) + .GetLower() + .AsUInt16(); + + // set the zeroed high parts of the long value to ushort.Max +#if NET6_0 + count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); +#else + count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); +#endif + + return Avx2.MinHorizontal(count).GetElement(0); + } + } + + private unsafe void IncrementAvx(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Vector128.Create(counterHash); + h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + + Vector128 index = Avx2.ShiftRightLogical(h, 1); + index = Avx2.And(index, Vector128.Create(15)); // j - counter index + Vector128 offset = Avx2.And(h, Vector128.Create(1)); + Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index + blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + + fixed (long* tablePtr = table) + { + Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); + + // j == index + index = Avx2.ShiftLeftLogical(index, 2); + Vector256 offsetLong = Vector256.Create(index, Vector128.Zero).AsInt64(); + + Vector256 permuteMask = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); + offsetLong = Avx2.PermuteVar8x32(offsetLong.AsInt32(), 
permuteMask).AsInt64(); + + // mask = (0xfL << offset) + Vector256 fifteen = Vector256.Create(0xfL); + Vector256 mask = Avx2.ShiftLeftLogicalVariable(fifteen, offsetLong.AsUInt64()); + + // (table[i] & mask) != mask) + // Note masked is 'equal' - therefore use AndNot below + Vector256 masked = Avx2.CompareEqual(Avx2.And(tableVector, mask), mask); + + // 1L << offset + Vector256 inc = Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong.AsUInt64()); + + // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) + inc = Avx2.AndNot(masked, inc); + + Vector256 result = Avx2.CompareEqual(masked.AsByte(), Vector256.Zero); + bool wasInc = Avx2.MoveMask(result.AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); + + tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); + tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); + tablePtr[blockOffset.GetElement(2)] += inc.GetElement(2); + tablePtr[blockOffset.GetElement(3)] += inc.GetElement(3); + + if (wasInc && (++size == sampleSize)) + { + Reset(); + } + } + } +#endif + } +} diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index eb005032..9228b1dd 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -1,5 +1,6 @@  using System.Collections.Generic; +using Benchly; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; using BitFaster.Caching.Lfu; @@ -7,8 +8,11 @@ namespace BitFaster.Caching.Benchmarks.Lfu { [SimpleJob(RuntimeMoniker.Net60)] + [SimpleJob(RuntimeMoniker.Net80)] + [SimpleJob(RuntimeMoniker.Net90)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] + [ColumnChart(Title = "Sketch Increment ({JOB})")] public class SketchIncrement { const int iterations = 1_048_576; @@ -17,6 +21,7 @@ public class SketchIncrement private CmSketchFlat 
flatAvx; private CmSketchCore blockStd; + private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; [Params(32_768, 524_288, 8_388_608, 134_217_728)] @@ -29,6 +34,7 @@ public void Setup() flatAvx = new CmSketchFlat(Size, EqualityComparer.Default); blockStd = new CmSketchCore(Size, EqualityComparer.Default); + blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); } @@ -41,7 +47,7 @@ public void IncFlat() } } - [Benchmark(OperationsPerInvoke = iterations)] + //[Benchmark(OperationsPerInvoke = iterations)] public void IncFlatAvx() { for (int i = 0; i < iterations; i++) @@ -60,7 +66,16 @@ public void IncBlock() } [Benchmark(OperationsPerInvoke = iterations)] - public void IncBlockAvx() + public void IncBlockAvxNotPinned() + { + for (int i = 0; i < iterations; i++) + { + blockAvxNoPin.Increment(i); + } + } + + [Benchmark(OperationsPerInvoke = iterations)] + public void IncBlockAvxPinned() { for (int i = 0; i < iterations; i++) { diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 18d95e04..fdb5d9f0 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -126,19 +126,26 @@ private void EnsureCapacity(long maximumSize) int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); #if NET6_0_OR_GREATER - // over alloc by 4 to give 32 bytes padding, tableAddr is then aligned to 32 bytes - const int pad = 4; - table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, true); - - tableAddr = (long*)Unsafe.AsPointer(ref table[0]); - tableAddr = (long*)((long)tableAddr + (long)tableAddr % 32); + I isa = default; + if (isa.IsAvx2Supported) + { + // over alloc by 8 to give 64 bytes padding, tableAddr is then aligned to 64 bytes + const int pad = 8; + bool pinned = true; + table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); - blockMask = 
(int)((uint)(table.Length - pad) >> 3) - 1; + tableAddr = (long*)Unsafe.AsPointer(ref table[0]); + tableAddr = (long*)(((long)tableAddr + 63L) & ~63L); -#else - table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; - blockMask = (int)((uint)(table.Length) >> 3) - 1; + blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; + } + else #endif + { + table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; + blockMask = (int)((uint)(table.Length) >> 3) - 1; + } + sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); size = 0; @@ -188,7 +195,7 @@ private unsafe void IncrementStd(T value) } } - // Applies another round of hashing for additional randomization + // Applies another round of hashing for additional randomization. private static int Rehash(int x) { x = (int)(x * 0x31848bab); @@ -196,7 +203,7 @@ private static int Rehash(int x) return x; } - // Applies a supplemental hash functions to defends against poor quality hash. + // Applies a supplemental hash function to defend against poor quality hash.
private static int Spread(int x) { x ^= (int)((uint)x >> 17); From f973e073417ee69aaf327f602790add849951929 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 21 Nov 2024 05:13:02 +0000 Subject: [PATCH 08/10] freq bench --- .../BitFaster.Caching.Benchmarks.csproj | 2 +- .../Lfu/SketchFrequency.cs | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj index df10ca09..789e1174 100644 --- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj +++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj @@ -2,7 +2,7 @@ Exe - 13 + latest net48;net6.0;net8.0 True diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index b97bc19d..4a95f933 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -8,6 +8,8 @@ namespace BitFaster.Caching.Benchmarks.Lfu { [SimpleJob(RuntimeMoniker.Net60)] + [SimpleJob(RuntimeMoniker.Net80)] + [SimpleJob(RuntimeMoniker.Net90)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] [ColumnChart(Title ="Sketch Frequency ({JOB})")] @@ -20,6 +22,7 @@ public class SketchFrequency private CmSketchFlat flatAvx; private CmSketchCore blockStd; + private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; [Params(32_768, 524_288, 8_388_608, 134_217_728)] @@ -32,6 +35,7 @@ public void Setup() flatAvx = new CmSketchFlat(Size, EqualityComparer.Default); blockStd = new CmSketchCore(Size, EqualityComparer.Default); + blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); } @@ -45,7 +49,7 @@ public int FrequencyFlat() return count; } - [Benchmark(OperationsPerInvoke = iterations)] + //[Benchmark(OperationsPerInvoke = 
iterations)] public int FrequencyFlatAvx() { int count = 0; @@ -66,7 +70,17 @@ public int FrequencyBlock() } [Benchmark(OperationsPerInvoke = iterations)] - public int FrequencyBlockAvx() + public int FrequencyBlockAvxNotPinned() + { + int count = 0; + for (int i = 0; i < iterations; i++) + count += blockAvxNoPin.EstimateFrequency(i) > blockAvxNoPin.EstimateFrequency(i + 1) ? 1 : 0; + + return count; + } + + [Benchmark(OperationsPerInvoke = iterations)] + public int FrequencyBlockAvxPinned() { int count = 0; for (int i = 0; i < iterations; i++) From 87f9d4bff1f0ab96e85a4d551f92bc69ba59f5f6 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 21 Nov 2024 06:21:53 +0000 Subject: [PATCH 09/10] fix colors --- BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs | 2 +- BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index 4a95f933..23786535 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -12,7 +12,7 @@ namespace BitFaster.Caching.Benchmarks.Lfu [SimpleJob(RuntimeMoniker.Net90)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] - [ColumnChart(Title ="Sketch Frequency ({JOB})")] + [ColumnChart(Title = "Sketch Frequency ({JOB})", Colors = "#cd5c5c,#fa8072,#ffa07a")] public class SketchFrequency { const int sketchSize = 1_048_576; diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index 9228b1dd..7f30d308 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -12,7 +12,7 @@ namespace BitFaster.Caching.Benchmarks.Lfu [SimpleJob(RuntimeMoniker.Net90)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc
Ratio")] - [ColumnChart(Title = "Sketch Increment ({JOB})")] + [ColumnChart(Title = "Sketch Increment ({JOB})", Colors = "#cd5c5c,#fa8072,#ffa07a")] public class SketchIncrement { const int iterations = 1_048_576; From a7b2a70b678ed7f683abe729b541ba8a803b1188 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 21 Nov 2024 06:27:08 +0000 Subject: [PATCH 10/10] rem comments --- BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs | 2 +- BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index 23786535..b49e5bcf 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -49,7 +49,7 @@ public int FrequencyFlat() return count; } - //[Benchmark(OperationsPerInvoke = iterations)] + [Benchmark(OperationsPerInvoke = iterations)] public int FrequencyFlatAvx() { int count = 0; diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index 7f30d308..6bcd0272 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -47,7 +47,7 @@ public void IncFlat() } } - //[Benchmark(OperationsPerInvoke = iterations)] + [Benchmark(OperationsPerInvoke = iterations)] public void IncFlatAvx() { for (int i = 0; i < iterations; i++)