From 78dfea89eae07adcbe500819b58bef0d77807a61 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sun, 26 May 2024 21:43:59 -0700 Subject: [PATCH 1/7] pot --- .../Lfu/SketchFrequency.cs | 11 ++- .../Lfu/SketchIncrement.cs | 11 ++- ...itFaster.Caching.ThroughputAnalysis.csproj | 2 +- .../BitFaster.Caching.UnitTests.csproj | 2 +- BitFaster.Caching/Lfu/CmSketchCore.cs | 70 ++++++------------- 5 files changed, 41 insertions(+), 55 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index b97bc19d..8d451dce 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -7,10 +7,15 @@ namespace BitFaster.Caching.Benchmarks.Lfu { +#if Windows + [DisassemblyDiagnoser(printSource: true, maxDepth: 4)] +#endif [SimpleJob(RuntimeMoniker.Net60)] + [SimpleJob(RuntimeMoniker.Net80)] + [SimpleJob(RuntimeMoniker.Net90)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] - [ColumnChart(Title ="Sketch Frequency ({JOB})")] + [ColumnChart(Title ="Sketch Frequency ({JOB})", Colors = "#cd5c5c,#fa8072,#ffa07a")] public class SketchFrequency { const int sketchSize = 1_048_576; @@ -22,7 +27,7 @@ public class SketchFrequency private CmSketchCore blockStd; private CmSketchCore blockAvx; - [Params(32_768, 524_288, 8_388_608, 134_217_728)] + [Params(512, 1024, 32_768, 524_288, 8_388_608, 134_217_728)] public int Size { get; set; } [GlobalSetup] @@ -45,7 +50,7 @@ public int FrequencyFlat() return count; } - [Benchmark(OperationsPerInvoke = iterations)] + //[Benchmark(OperationsPerInvoke = iterations)] public int FrequencyFlatAvx() { int count = 0; diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index eb005032..385a625d 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -1,14 +1,21 @@  using System.Collections.Generic; +using Benchly; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; using BitFaster.Caching.Lfu; namespace BitFaster.Caching.Benchmarks.Lfu { +#if Windows + [DisassemblyDiagnoser(printSource: true, maxDepth: 4)] +#endif [SimpleJob(RuntimeMoniker.Net60)] + [SimpleJob(RuntimeMoniker.Net80)] + [SimpleJob(RuntimeMoniker.Net90)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] + [ColumnChart(Title = "Sketch Increment ({JOB})", Colors = "#cd5c5c,#fa8072,#ffa07a")] public class SketchIncrement { const int iterations = 1_048_576; @@ -19,7 +26,7 @@ public class SketchIncrement private CmSketchCore blockStd; private CmSketchCore blockAvx; - [Params(32_768, 524_288, 8_388_608, 134_217_728)] + [Params(512, 1024, 32_768, 524_288, 8_388_608, 134_217_728)] public int Size { get; set; } [GlobalSetup] @@ -41,7 +48,7 @@ public void IncFlat() } } - [Benchmark(OperationsPerInvoke = iterations)] + //[Benchmark(OperationsPerInvoke = iterations)] public void IncFlatAvx() { for (int i = 0; i < iterations; i++) diff --git a/BitFaster.Caching.ThroughputAnalysis/BitFaster.Caching.ThroughputAnalysis.csproj b/BitFaster.Caching.ThroughputAnalysis/BitFaster.Caching.ThroughputAnalysis.csproj index 6f45042f..30fe5a6b 100644 --- a/BitFaster.Caching.ThroughputAnalysis/BitFaster.Caching.ThroughputAnalysis.csproj +++ b/BitFaster.Caching.ThroughputAnalysis/BitFaster.Caching.ThroughputAnalysis.csproj @@ -2,7 +2,7 @@ Exe - net6.0;net8.0 + net6.0;net8.0;net9.0 False 2.0.0 true diff --git a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj index 40a51510..b36db991 100644 --- a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj +++ b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj @@ -1,7 +1,7 @@ - net48;netcoreapp3.1;net6.0 + net48;netcoreapp3.1;net6.0;net8.0 9.0 diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index de255840..30b404ff 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -1,6 +1,8 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; + #if !NETSTANDARD2_0 @@ -172,6 +174,7 @@ private unsafe void IncrementStd(T value) } // Applies another round of hashing for additional randomization + //[MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Rehash(int x) { x = (int)(x * 0x31848bab); @@ -180,6 +183,7 @@ private static int Rehash(int x) } // Applies a supplemental hash functions to defends against poor quality hash. + //[MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Spread(int x) { x ^= (int)((uint)x >> 17); @@ -231,40 +235,28 @@ private void Reset() } #if !NETSTANDARD2_0 + [MethodImpl(MethodImplOptions.AggressiveInlining)] + //[MethodImpl((MethodImplOptions)512)] private unsafe int EstimateFrequencyAvx(T value) { int blockHash = Spread(comparer.GetHashCode(value)); int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - Vector128 h = Vector128.Create(counterHash); - h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); + Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - var index = Avx2.ShiftRightLogical(h, 1); - index = Avx2.And(index, Vector128.Create(15)); // j - counter index - Vector128 offset = Avx2.And(h, Vector128.Create(1)); - Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index - blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + Vector256 indexLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); fixed (long* tablePtr = table) { - Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); - index = Avx2.ShiftLeftLogical(index, 2); - - // convert index from int to long via permute - Vector256 indexLong = Vector256.Create(index, Vector128.Zero).AsInt64(); - Vector256 permuteMask2 = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); - indexLong = Avx2.PermuteVar8x32(indexLong.AsInt32(), permuteMask2).AsInt64(); - tableVector = Avx2.ShiftRightLogicalVariable(tableVector, indexLong.AsUInt64()); - tableVector = Avx2.And(tableVector, Vector256.Create(0xfL)); - - Vector256 permuteMask = Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7); - Vector128 count = Avx2.PermuteVar8x32(tableVector.AsInt32(), permuteMask) + Vector128 count = Avx2.PermuteVar8x32(Avx2.And(Avx2.ShiftRightLogicalVariable(Avx2.GatherVector256(tablePtr, blockOffset, 8), indexLong), Vector256.Create(0xfL)).AsInt32(), Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7)) .GetLower() .AsUInt16(); // set the zeroed high parts of the long value to ushort.Max -#if NET6_0 +#if NET6_0_OR_GREATER count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); #else count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); @@ -274,48 +266,30 @@ private unsafe int EstimateFrequencyAvx(T value) } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + //[MethodImpl((MethodImplOptions)512)] private unsafe void IncrementAvx(T value) { int blockHash = Spread(comparer.GetHashCode(value)); int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - Vector128 h = Vector128.Create(counterHash); - h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); + Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - Vector128 index = Avx2.ShiftRightLogical(h, 1); - index = Avx2.And(index, Vector128.Create(15)); // j - counter index - Vector128 offset = Avx2.And(h, Vector128.Create(1)); - Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index - blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + Vector256 offsetLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); + Vector256 mask = Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), offsetLong); fixed (long* tablePtr = table) { - Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); - - // j == index - index = Avx2.ShiftLeftLogical(index, 2); - Vector256 offsetLong = Vector256.Create(index, Vector128.Zero).AsInt64(); - - Vector256 permuteMask = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); - offsetLong = Avx2.PermuteVar8x32(offsetLong.AsInt32(), permuteMask).AsInt64(); - - // mask = (0xfL << offset) - Vector256 fifteen = Vector256.Create(0xfL); - Vector256 mask = Avx2.ShiftLeftLogicalVariable(fifteen, offsetLong.AsUInt64()); - - // (table[i] & mask) != mask) // Note masked is 'equal' - therefore use AndNot below - Vector256 masked = Avx2.CompareEqual(Avx2.And(tableVector, mask), mask); - - // 1L << offset - Vector256 inc = Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong.AsUInt64()); + Vector256 masked = Avx2.CompareEqual(Avx2.And(Avx2.GatherVector256(tablePtr, blockOffset, 8), mask), mask); // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) - inc = Avx2.AndNot(masked, inc); + Vector256 inc = Avx2.AndNot(masked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong)); - Vector256 result = Avx2.CompareEqual(masked.AsByte(), Vector256.Zero); - bool wasInc = Avx2.MoveMask(result.AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); + bool wasInc = Avx2.MoveMask(Avx2.CompareEqual(masked.AsByte(), Vector256.Zero).AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); From 139b77a182ba9b0e46c288540838fa7bc07e7f92 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Wed, 20 Nov 2024 23:44:05 -0800 Subject: [PATCH 2/7] fix projs --- .../BitFaster.Caching.ThroughputAnalysis.csproj | 2 +- BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/BitFaster.Caching.ThroughputAnalysis/BitFaster.Caching.ThroughputAnalysis.csproj b/BitFaster.Caching.ThroughputAnalysis/BitFaster.Caching.ThroughputAnalysis.csproj index d6d53562..f1f31f1c 100644 --- a/BitFaster.Caching.ThroughputAnalysis/BitFaster.Caching.ThroughputAnalysis.csproj +++ b/BitFaster.Caching.ThroughputAnalysis/BitFaster.Caching.ThroughputAnalysis.csproj @@ -2,7 +2,7 @@ Exe - net6.0;net8.0;net9.0 + net6.0;net8.0 False 2.0.0 true diff --git a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj index 184acc97..d2bcadcf 100644 --- a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj +++ b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj @@ -1,7 +1,7 @@ - net48;netcoreapp3.1;net6.0;net8.0 + net48;netcoreapp3.1;net6.0 9.0 From d7935d228a104dcda2f5542f431309eb590807b8 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Fri, 22 Nov 2024 04:10:00 +0000 Subject: [PATCH 3/7] test --- .../Lfu/CmSketchPinNoOpt.cs | 353 ++++++++++++++++++ .../Lfu/SketchFrequency.cs | 12 + .../Lfu/SketchIncrement.cs | 11 + 3 files changed, 376 insertions(+) create mode 100644 BitFaster.Caching.Benchmarks/Lfu/CmSketchPinNoOpt.cs diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchPinNoOpt.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchPinNoOpt.cs new file mode 100644 index 00000000..2bf973fb --- /dev/null +++ b/BitFaster.Caching.Benchmarks/Lfu/CmSketchPinNoOpt.cs @@ -0,0 +1,353 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Text; +using System.Threading.Tasks; + + +#if NET6_0_OR_GREATER +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + +namespace BitFaster.Caching.Benchmarks.Lfu +{ + public unsafe class CmSketchPinNoOpt + where T : notnull + where I : struct, IsaProbe + { + private const long ResetMask = 0x7777777777777777L; + private const long OneMask = 0x1111111111111111L; + + private long[] table; +#if NET6_0_OR_GREATER + private long* tableAddr; +#endif + private int sampleSize; + private int blockMask; + private int size; + + private readonly IEqualityComparer comparer; + + /// + /// Initializes a new instance of the CmSketch class with the specified maximum size and equality comparer. + /// + /// The maximum size. + /// The equality comparer. + public CmSketchPinNoOpt(long maximumSize, IEqualityComparer comparer) + { + EnsureCapacity(maximumSize); + this.comparer = comparer; + } + + /// + /// Gets the reset sample size. + /// + public int ResetSampleSize => this.sampleSize; + + /// + /// Gets the size. + /// + public int Size => this.size; + + /// + /// Estimate the frequency of the specified value, up to the maximum of 15. + /// + /// The value. + /// The estimated frequency of the value. + public int EstimateFrequency(T value) + { +#if NET48 + return EstimateFrequencyStd(value); +#else + + I isa = default; + + if (isa.IsAvx2Supported) + { + return EstimateFrequencyAvx(value); + } + else + { + return EstimateFrequencyStd(value); + } +#endif + } + + /// + /// Increment the count of the specified value. + /// + /// The value. + public void Increment(T value) + { +#if NET48 + IncrementStd(value); +#else + + I isa = default; + + if (isa.IsAvx2Supported) + { + IncrementAvx(value); + } + else + { + IncrementStd(value); + } +#endif + } + + /// + /// Clears the count for all items. + /// + public void Clear() + { + Array.Clear(table, 0, table.Length); + size = 0; + } + + //[MemberNotNull(nameof(table))] + private void EnsureCapacity(long maximumSize) + { + int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); + +#if NET6_0_OR_GREATER + I isa = default; + if (isa.IsAvx2Supported) + { + // over alloc by 8 to give 64 bytes padding, tableAddr is then aligned to 64 bytes + const int pad = 8; + bool pinned = true; + table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); + + tableAddr = (long*)Unsafe.AsPointer(ref table[0]); + tableAddr = (long*)((long)tableAddr + (long)tableAddr % 64); + + blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; + } + else +#endif + { + table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; + blockMask = (int)((uint)(table.Length) >> 3) - 1; + } + + sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); + + size = 0; + } + + private unsafe int EstimateFrequencyStd(T value) + { + var count = stackalloc int[4]; + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + for (int i = 0; i < 4; i++) + { + int h = (int)((uint)counterHash >> (i << 3)); + int index = (h >> 1) & 15; + int offset = h & 1; + count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); + } + return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); + } + + private unsafe void IncrementStd(T value) + { + var index = stackalloc int[8]; + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + for (int i = 0; i < 4; i++) + { + int h = (int)((uint)counterHash >> (i << 3)); + index[i] = (h >> 1) & 15; + int offset = h & 1; + index[i + 4] = block + offset + (i << 1); + } + + bool added = + IncrementAt(index[4], index[0]) + | IncrementAt(index[5], index[1]) + | IncrementAt(index[6], index[2]) + | IncrementAt(index[7], index[3]); + + if (added && (++size == sampleSize)) + { + Reset(); + } + } + + // Applies another round of hashing for additional randomization. + private static int Rehash(int x) + { + x = (int)(x * 0x31848bab); + x ^= (int)((uint)x >> 14); + return x; + } + + // Applies a supplemental hash function to defend against poor quality hash. + private static int Spread(int x) + { + x ^= (int)((uint)x >> 17); + x = (int)(x * 0xed5ad4bb); + x ^= (int)((uint)x >> 11); + x = (int)(x * 0xac4c1b51); + x ^= (int)((uint)x >> 15); + return x; + } + + private bool IncrementAt(int i, int j) + { + int offset = j << 2; + long mask = (0xfL << offset); + + if ((table[i] & mask) != mask) + { + table[i] += (1L << offset); + return true; + } + + return false; + } + + private void Reset() + { + // unroll, almost 2x faster + int count0 = 0; + int count1 = 0; + int count2 = 0; + int count3 = 0; + + for (int i = 0; i < table.Length; i += 4) + { + count0 += BitOps.BitCount(table[i] & OneMask); + count1 += BitOps.BitCount(table[i + 1] & OneMask); + count2 += BitOps.BitCount(table[i + 2] & OneMask); + count3 += BitOps.BitCount(table[i + 3] & OneMask); + + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + } + + count0 = (count0 + count1) + (count2 + count3); + + size = (size - (count0 >> 2)) >> 1; + } + +#if NET6_0_OR_GREATER + private unsafe int EstimateFrequencyAvx(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Vector128.Create(counterHash); + h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + + var index = Avx2.ShiftRightLogical(h, 1); + index = Avx2.And(index, Vector128.Create(15)); // j - counter index + Vector128 offset = Avx2.And(h, Vector128.Create(1)); + Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index + blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + +#if NET6_0_OR_GREATER + long* tablePtr = tableAddr; +#else + fixed (long* tablePtr = table) +#endif + { + Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); + index = Avx2.ShiftLeftLogical(index, 2); + + // convert index from int to long via permute + Vector256 indexLong = Vector256.Create(index, Vector128.Zero).AsInt64(); + Vector256 permuteMask2 = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); + indexLong = Avx2.PermuteVar8x32(indexLong.AsInt32(), permuteMask2).AsInt64(); + tableVector = Avx2.ShiftRightLogicalVariable(tableVector, indexLong.AsUInt64()); + tableVector = Avx2.And(tableVector, Vector256.Create(0xfL)); + + Vector256 permuteMask = Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7); + Vector128 count = Avx2.PermuteVar8x32(tableVector.AsInt32(), permuteMask) + .GetLower() + .AsUInt16(); + + // set the zeroed high parts of the long value to ushort.Max +#if NET6_0_OR_GREATER + count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); +#else + count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); +#endif + + return Avx2.MinHorizontal(count).GetElement(0); + } + } + + private unsafe void IncrementAvx(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Vector128.Create(counterHash); + h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + + Vector128 index = Avx2.ShiftRightLogical(h, 1); + index = Avx2.And(index, Vector128.Create(15)); // j - counter index + Vector128 offset = Avx2.And(h, Vector128.Create(1)); + Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index + blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + +#if NET6_0_OR_GREATER + long* tablePtr = tableAddr; +#else + fixed (long* tablePtr = table) +#endif + { + Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); + + // j == index + index = Avx2.ShiftLeftLogical(index, 2); + Vector256 offsetLong = Vector256.Create(index, Vector128.Zero).AsInt64(); + + Vector256 permuteMask = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); + offsetLong = Avx2.PermuteVar8x32(offsetLong.AsInt32(), permuteMask).AsInt64(); + + // mask = (0xfL << offset) + Vector256 fifteen = Vector256.Create(0xfL); + Vector256 mask = Avx2.ShiftLeftLogicalVariable(fifteen, offsetLong.AsUInt64()); + + // (table[i] & mask) != mask) + // Note masked is 'equal' - therefore use AndNot below + Vector256 masked = Avx2.CompareEqual(Avx2.And(tableVector, mask), mask); + + // 1L << offset + Vector256 inc = Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong.AsUInt64()); + + // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) + inc = Avx2.AndNot(masked, inc); + + Vector256 result = Avx2.CompareEqual(masked.AsByte(), Vector256.Zero); + bool wasInc = Avx2.MoveMask(result.AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); + + tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); + tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); + tablePtr[blockOffset.GetElement(2)] += inc.GetElement(2); + tablePtr[blockOffset.GetElement(3)] += inc.GetElement(3); + + if (wasInc && (++size == sampleSize)) + { + Reset(); + } + } + } +#endif + } +} diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index 1309580a..1267d90a 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -26,6 +26,7 @@ public class SketchFrequency private CmSketchCore blockStd; private CmSketchNoPin blockAvxNoPin; + private CmSketchPinNoOpt blockAvxPinNoOpt; private CmSketchCore blockAvx; [Params(512, 1024, 32_768, 524_288, 8_388_608, 134_217_728)] @@ -39,6 +40,7 @@ public void Setup() blockStd = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); + blockAvxPinNoOpt = new CmSketchPinNoOpt(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); } @@ -82,6 +84,16 @@ public int FrequencyBlockAvxNotPinned() return count; } + [Benchmark(OperationsPerInvoke = iterations)] + public int FrequencyBlockAvxPinNotOpt() + { + int count = 0; + for (int i = 0; i < iterations; i++) + count += blockAvxPinNoOpt.EstimateFrequency(i) > blockAvx.EstimateFrequency(i + 1) ? 1 : 0; + + return count; + } + [Benchmark(OperationsPerInvoke = iterations)] public int FrequencyBlockAvxPinned() { diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index 983cf66f..71978d29 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -25,6 +25,7 @@ public class SketchIncrement private CmSketchCore blockStd; private CmSketchNoPin blockAvxNoPin; + private CmSketchPinNoOpt blockAvxPinNoOpt; private CmSketchCore blockAvx; [Params(512, 1024, 32_768, 524_288, 8_388_608, 134_217_728)] @@ -38,6 +39,7 @@ public void Setup() blockStd = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); + blockAvxPinNoOpt = new CmSketchPinNoOpt(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); } @@ -77,6 +79,15 @@ public void IncBlockAvxNotPinned() } } + [Benchmark(OperationsPerInvoke = iterations)] + public void IncBlockAvxPinNotOpt() + { + for (int i = 0; i < iterations; i++) + { + blockAvxPinNoOpt.Increment(i); + } + } + [Benchmark(OperationsPerInvoke = iterations)] public void IncBlockAvxPinned() { From 560c6f2c69e859be8dd85bc03c46ac807037b9f5 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Fri, 22 Nov 2024 06:36:07 +0000 Subject: [PATCH 4/7] opt --- .../BitFaster.Caching.HitRateAnalysis.csproj | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/BitFaster.Caching.HitRateAnalysis/BitFaster.Caching.HitRateAnalysis.csproj b/BitFaster.Caching.HitRateAnalysis/BitFaster.Caching.HitRateAnalysis.csproj index 1d760e08..247f9651 100644 --- a/BitFaster.Caching.HitRateAnalysis/BitFaster.Caching.HitRateAnalysis.csproj +++ b/BitFaster.Caching.HitRateAnalysis/BitFaster.Caching.HitRateAnalysis.csproj @@ -2,7 +2,10 @@ Exe - net6.0 + net8.0 + true + true + true From ae1d56703745bb39860032fa25e277e7b0658137 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Tue, 26 Nov 2024 01:44:25 +0000 Subject: [PATCH 5/7] 512 --- .../Lfu/SketchFrequency.cs | 12 + .../Lfu/SketchIncrement.cs | 11 + .../Runner.cs | 6 +- BitFaster.Caching/Lfu/CmSketchCore.cs | 4 +- BitFaster.Caching/Lfu/CmSketchCore512.cs | 337 ++++++++++++++++++ 5 files changed, 365 insertions(+), 5 deletions(-) create mode 100644 BitFaster.Caching/Lfu/CmSketchCore512.cs diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index 1267d90a..0db7920c 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -28,6 +28,7 @@ public class SketchFrequency private CmSketchNoPin blockAvxNoPin; private CmSketchPinNoOpt blockAvxPinNoOpt; private CmSketchCore blockAvx; + private CmSketchCore512 blockAvx512; [Params(512, 1024, 32_768, 524_288, 8_388_608, 134_217_728)] public int Size { get; set; } @@ -42,6 +43,7 @@ public void Setup() blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvxPinNoOpt = new CmSketchPinNoOpt(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); + blockAvx512 = new CmSketchCore512(Size, EqualityComparer.Default); } [Benchmark(Baseline = true, OperationsPerInvoke = iterations)] @@ -103,5 +105,15 @@ public int FrequencyBlockAvxPinned() return count; } + + [Benchmark(OperationsPerInvoke = iterations)] + public int FrequencyBlockAvxPinned512() + { + int count = 0; + for (int i = 0; i < iterations; i++) + count += blockAvx512.EstimateFrequency(i) > blockAvx.EstimateFrequency(i + 1) ? 1 : 0; + + return count; + } } } diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index 71978d29..b4fa8cea 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -27,6 +27,7 @@ public class SketchIncrement private CmSketchNoPin blockAvxNoPin; private CmSketchPinNoOpt blockAvxPinNoOpt; private CmSketchCore blockAvx; + private CmSketchCore512 blockAvx512; [Params(512, 1024, 32_768, 524_288, 8_388_608, 134_217_728)] public int Size { get; set; } @@ -41,6 +42,7 @@ public void Setup() blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvxPinNoOpt = new CmSketchPinNoOpt(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); + blockAvx512 = new CmSketchCore512(Size, EqualityComparer.Default); } [Benchmark(Baseline = true, OperationsPerInvoke = iterations)] @@ -96,5 +98,14 @@ public void IncBlockAvxPinned() blockAvx.Increment(i); } } + + [Benchmark(OperationsPerInvoke = iterations)] + public void IncBlockAvxPinned512() + { + for (int i = 0; i < iterations; i++) + { + blockAvx512.Increment(i); + } + } } } diff --git a/BitFaster.Caching.ThroughputAnalysis/Runner.cs b/BitFaster.Caching.ThroughputAnalysis/Runner.cs index f6caa276..a769015e 100644 --- a/BitFaster.Caching.ThroughputAnalysis/Runner.cs +++ b/BitFaster.Caching.ThroughputAnalysis/Runner.cs @@ -33,10 +33,10 @@ private static void RunTest(Mode mode, int cacheSize) var cachesToTest = new List { - new ClassicLruFactory(capacity), - new MemoryCacheFactory(capacity), + //new ClassicLruFactory(capacity), + //new MemoryCacheFactory(capacity), new FastConcurrentLruFactory(capacity), - new ConcurrentLruFactory(capacity), + //new ConcurrentLruFactory(capacity), new ConcurrentLfuFactory(capacity) }; diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index ec9304c9..9c4333aa 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -258,7 +258,7 @@ private void Reset() #if !NETSTANDARD2_0 [MethodImpl(MethodImplOptions.AggressiveInlining)] - //[MethodImpl((MethodImplOptions)512)] + // [MethodImpl(MethodImplOptions.AggressiveInlining | (MethodImplOptions)512)] private unsafe int EstimateFrequencyAvx(T value) { int blockHash = Spread(comparer.GetHashCode(value)); @@ -293,7 +293,7 @@ private unsafe int EstimateFrequencyAvx(T value) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - //[MethodImpl((MethodImplOptions)512)] + // [MethodImpl(MethodImplOptions.AggressiveInlining | (MethodImplOptions)512)] private unsafe void IncrementAvx(T value) { int blockHash = Spread(comparer.GetHashCode(value)); diff --git a/BitFaster.Caching/Lfu/CmSketchCore512.cs b/BitFaster.Caching/Lfu/CmSketchCore512.cs new file mode 100644 index 00000000..c17bd725 --- /dev/null +++ b/BitFaster.Caching/Lfu/CmSketchCore512.cs @@ -0,0 +1,337 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + + +#if !NETSTANDARD2_0 +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + +namespace BitFaster.Caching.Lfu +{ + /// + /// A probabilistic data structure used to estimate the frequency of a given value. Periodic aging reduces the + /// accumulated count across all values over time, such that a historic popular value will decay to zero frequency + /// over time if it is not accessed. + /// + /// + /// The maximum frequency of an element is limited to 15 (4-bits). Each element is hashed to a 64 byte 'block' + /// consisting of 4 segments of 32 4-bit counters. The 64 byte blocks are the same size as x64 L1 cache lines. + /// While the blocks are not guaranteed to be aligned, this scheme minimizes L1 cache misses resulting in a + /// significant speedup. When supported, a vectorized AVX2 code path provides a further speedup. Together, block + /// and AVX2 are approximately 2x faster than the original implementation. + /// + /// This is a direct C# translation of FrequencySketch in the Caffeine library by ben.manes@gmail.com (Ben Manes). + /// https://github.com/ben-manes/caffeine + public unsafe class CmSketchCore512 + where T : notnull + where I : struct, IsaProbe + { + private const long ResetMask = 0x7777777777777777L; + private const long OneMask = 0x1111111111111111L; + + private long[] table; +#if NET6_0_OR_GREATER + private long* tableAddr; +#endif + private int sampleSize; + private int blockMask; + private int size; + + private readonly IEqualityComparer comparer; + + /// + /// Initializes a new instance of the CmSketch class with the specified maximum size and equality comparer. + /// + /// The maximum size. + /// The equality comparer. + public CmSketchCore512(long maximumSize, IEqualityComparer comparer) + { + EnsureCapacity(maximumSize); + this.comparer = comparer; + } + + /// + /// Gets the reset sample size. + /// + public int ResetSampleSize => this.sampleSize; + + /// + /// Gets the size. + /// + public int Size => this.size; + + /// + /// Estimate the frequency of the specified value, up to the maximum of 15. + /// + /// The value. + /// The estimated frequency of the value. + public int EstimateFrequency(T value) + { +#if NETSTANDARD2_0 + return EstimateFrequencyStd(value); +#else + + I isa = default; + + if (isa.IsAvx2Supported) + { + return EstimateFrequencyAvx(value); + } + else + { + return EstimateFrequencyStd(value); + } +#endif + } + + /// + /// Increment the count of the specified value. + /// + /// The value. + public void Increment(T value) + { +#if NETSTANDARD2_0 + IncrementStd(value); +#else + + I isa = default; + + if (isa.IsAvx2Supported) + { + IncrementAvx(value); + } + else + { + IncrementStd(value); + } +#endif + } + + /// + /// Clears the count for all items. + /// + public void Clear() + { + Array.Clear(table, 0, table.Length); + size = 0; + } + + [MemberNotNull(nameof(table))] + private void EnsureCapacity(long maximumSize) + { + int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); + +#if NET6_0_OR_GREATER + I isa = default; + if (isa.IsAvx2Supported) + { + // over alloc by 8 to give 64 bytes padding, tableAddr is then aligned to 64 bytes + const int pad = 8; + bool pinned = true; + table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); + + tableAddr = (long*)Unsafe.AsPointer(ref table[0]); + tableAddr = (long*)((long)tableAddr + (long)tableAddr % 64); + + blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; + } + else +#endif + { + table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; + blockMask = (int)((uint)(table.Length) >> 3) - 1; + } + + sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); + + size = 0; + } + + private unsafe int EstimateFrequencyStd(T value) + { + var count = stackalloc int[4]; + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + for (int i = 0; i < 4; i++) + { + int h = (int)((uint)counterHash >> (i << 3)); + int index = (h >> 1) & 15; + int offset = h & 1; + count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); + } + return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); + } + + private unsafe void IncrementStd(T value) + { + var index = stackalloc int[8]; + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + for (int i = 0; i < 4; i++) + { + int h = (int)((uint)counterHash >> (i << 3)); + index[i] = (h >> 1) & 15; + int offset = h & 1; + index[i + 4] = block + offset + (i << 1); + } + + bool added = + IncrementAt(index[4], index[0]) + | IncrementAt(index[5], index[1]) + | IncrementAt(index[6], index[2]) + | IncrementAt(index[7], index[3]); + + if (added && (++size == sampleSize)) + { + Reset(); + } + } + + // Applies another round of hashing for additional randomization. + //[MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int Rehash(int x) + { + x = (int)(x * 0x31848bab); + x ^= (int)((uint)x >> 14); + return x; + } + + // Applies a supplemental hash function to defend against poor quality hash. + //[MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int Spread(int x) + { + x ^= (int)((uint)x >> 17); + x = (int)(x * 0xed5ad4bb); + x ^= (int)((uint)x >> 11); + x = (int)(x * 0xac4c1b51); + x ^= (int)((uint)x >> 15); + return x; + } + + private bool IncrementAt(int i, int j) + { + int offset = j << 2; + long mask = (0xfL << offset); + + if ((table[i] & mask) != mask) + { + table[i] += (1L << offset); + return true; + } + + return false; + } + + private void Reset() + { + // unroll, almost 2x faster + int count0 = 0; + int count1 = 0; + int count2 = 0; + int count3 = 0; + + for (int i = 0; i < table.Length; i += 4) + { + count0 += BitOps.BitCount(table[i] & OneMask); + count1 += BitOps.BitCount(table[i + 1] & OneMask); + count2 += BitOps.BitCount(table[i + 2] & OneMask); + count3 += BitOps.BitCount(table[i + 3] & OneMask); + + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + } + + count0 = (count0 + count1) + (count2 + count3); + + size = (size - (count0 >> 2)) >> 1; + } + +#if !NETSTANDARD2_0 + // [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining | (MethodImplOptions)512)] + private unsafe int EstimateFrequencyAvx(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); + Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + + Vector256 indexLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); + +#if NET6_0_OR_GREATER + long* tablePtr = tableAddr; +#else + fixed (long* tablePtr = table) +#endif + { + Vector128 count = Avx2.PermuteVar8x32(Avx2.And(Avx2.ShiftRightLogicalVariable(Avx2.GatherVector256(tablePtr, blockOffset, 8), indexLong), Vector256.Create(0xfL)).AsInt32(), Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7)) + .GetLower() + .AsUInt16(); + + // set the zeroed high parts of the long value to ushort.Max +#if NET6_0_OR_GREATER + count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); +#else + count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); +#endif + + return Avx2.MinHorizontal(count).GetElement(0); + } + } + + //[MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining | (MethodImplOptions)512)] + private unsafe void IncrementAvx(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); + Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + + Vector256 offsetLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); + Vector256 mask = Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), offsetLong); + +#if NET6_0_OR_GREATER + long* tablePtr = tableAddr; +#else + fixed (long* tablePtr = table) +#endif + { + // Note masked is 'equal' - therefore use AndNot below + Vector256 masked = Avx2.CompareEqual(Avx2.And(Avx2.GatherVector256(tablePtr, blockOffset, 8), mask), mask); + + // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) + Vector256 inc = Avx2.AndNot(masked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong)); + + bool wasInc = Avx2.MoveMask(Avx2.CompareEqual(masked.AsByte(), Vector256.Zero).AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); + + tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); + tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); + tablePtr[blockOffset.GetElement(2)] += inc.GetElement(2); + tablePtr[blockOffset.GetElement(3)] += inc.GetElement(3); + + if (wasInc && (++size == sampleSize)) + { + Reset(); + } + } + } +#endif + } +} From 91ee099e6750715fe531591a3555f242c0313afb Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Tue, 26 Nov 2024 19:50:32 -0800 Subject: [PATCH 6/7] cleanup --- .../Lfu/CmSketchPinNoOpt.cs | 353 ------------------ .../Lfu/SketchFrequency.cs | 28 +- .../Lfu/SketchIncrement.cs | 26 +- .../Runner.cs | 6 +- BitFaster.Caching/Lfu/CmSketchCore.cs | 4 - BitFaster.Caching/Lfu/CmSketchCore512.cs | 337 ----------------- 6 files changed, 7 insertions(+), 747 deletions(-) delete mode 100644 BitFaster.Caching.Benchmarks/Lfu/CmSketchPinNoOpt.cs delete mode 100644 BitFaster.Caching/Lfu/CmSketchCore512.cs diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchPinNoOpt.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchPinNoOpt.cs deleted file mode 100644 index 2bf973fb..00000000 --- a/BitFaster.Caching.Benchmarks/Lfu/CmSketchPinNoOpt.cs +++ /dev/null @@ -1,353 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; -using System.Linq; -using System.Runtime.CompilerServices; -using System.Text; -using System.Threading.Tasks; - - -#if NET6_0_OR_GREATER -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif - -namespace BitFaster.Caching.Benchmarks.Lfu -{ - public unsafe class CmSketchPinNoOpt - where T : notnull - where I : struct, IsaProbe - { - private const long ResetMask = 0x7777777777777777L; - private const long OneMask = 0x1111111111111111L; - - private long[] table; -#if NET6_0_OR_GREATER - private long* tableAddr; -#endif - private int sampleSize; - private int blockMask; - private int size; - - private readonly IEqualityComparer comparer; - - /// - /// Initializes a new instance of the CmSketch class with the specified maximum size and equality comparer. - /// - /// The maximum size. - /// The equality comparer. - public CmSketchPinNoOpt(long maximumSize, IEqualityComparer comparer) - { - EnsureCapacity(maximumSize); - this.comparer = comparer; - } - - /// - /// Gets the reset sample size. - /// - public int ResetSampleSize => this.sampleSize; - - /// - /// Gets the size. - /// - public int Size => this.size; - - /// - /// Estimate the frequency of the specified value, up to the maximum of 15. - /// - /// The value. - /// The estimated frequency of the value. - public int EstimateFrequency(T value) - { -#if NET48 - return EstimateFrequencyStd(value); -#else - - I isa = default; - - if (isa.IsAvx2Supported) - { - return EstimateFrequencyAvx(value); - } - else - { - return EstimateFrequencyStd(value); - } -#endif - } - - /// - /// Increment the count of the specified value. - /// - /// The value. - public void Increment(T value) - { -#if NET48 - IncrementStd(value); -#else - - I isa = default; - - if (isa.IsAvx2Supported) - { - IncrementAvx(value); - } - else - { - IncrementStd(value); - } -#endif - } - - /// - /// Clears the count for all items. - /// - public void Clear() - { - Array.Clear(table, 0, table.Length); - size = 0; - } - - //[MemberNotNull(nameof(table))] - private void EnsureCapacity(long maximumSize) - { - int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); - -#if NET6_0_OR_GREATER - I isa = default; - if (isa.IsAvx2Supported) - { - // over alloc by 8 to give 64 bytes padding, tableAddr is then aligned to 64 bytes - const int pad = 8; - bool pinned = true; - table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); - - tableAddr = (long*)Unsafe.AsPointer(ref table[0]); - tableAddr = (long*)((long)tableAddr + (long)tableAddr % 64); - - blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; - } - else -#endif - { - table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; - blockMask = (int)((uint)(table.Length) >> 3) - 1; - } - - sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); - - size = 0; - } - - private unsafe int EstimateFrequencyStd(T value) - { - var count = stackalloc int[4]; - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - int index = (h >> 1) & 15; - int offset = h & 1; - count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); - } - return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); - } - - private unsafe void IncrementStd(T value) - { - var index = stackalloc int[8]; - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - index[i] = (h >> 1) & 15; - int offset = h & 1; - index[i + 4] = block + offset + (i << 1); - } - - bool added = - IncrementAt(index[4], index[0]) - | IncrementAt(index[5], index[1]) - | IncrementAt(index[6], index[2]) - | IncrementAt(index[7], index[3]); - - if (added && (++size == sampleSize)) - { - Reset(); - } - } - - // Applies another round of hashing for additional randomization. - private static int Rehash(int x) - { - x = (int)(x * 0x31848bab); - x ^= (int)((uint)x >> 14); - return x; - } - - // Applies a supplemental hash function to defend against poor quality hash. - private static int Spread(int x) - { - x ^= (int)((uint)x >> 17); - x = (int)(x * 0xed5ad4bb); - x ^= (int)((uint)x >> 11); - x = (int)(x * 0xac4c1b51); - x ^= (int)((uint)x >> 15); - return x; - } - - private bool IncrementAt(int i, int j) - { - int offset = j << 2; - long mask = (0xfL << offset); - - if ((table[i] & mask) != mask) - { - table[i] += (1L << offset); - return true; - } - - return false; - } - - private void Reset() - { - // unroll, almost 2x faster - int count0 = 0; - int count1 = 0; - int count2 = 0; - int count3 = 0; - - for (int i = 0; i < table.Length; i += 4) - { - count0 += BitOps.BitCount(table[i] & OneMask); - count1 += BitOps.BitCount(table[i + 1] & OneMask); - count2 += BitOps.BitCount(table[i + 2] & OneMask); - count3 += BitOps.BitCount(table[i + 3] & OneMask); - - table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; - table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; - } - - count0 = (count0 + count1) + (count2 + count3); - - size = (size - (count0 >> 2)) >> 1; - } - -#if NET6_0_OR_GREATER - private unsafe int EstimateFrequencyAvx(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = Vector128.Create(counterHash); - h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - - var index = Avx2.ShiftRightLogical(h, 1); - index = Avx2.And(index, Vector128.Create(15)); // j - counter index - Vector128 offset = Avx2.And(h, Vector128.Create(1)); - Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index - blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) - -#if NET6_0_OR_GREATER - long* tablePtr = tableAddr; -#else - fixed (long* tablePtr = table) -#endif - { - Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); - index = Avx2.ShiftLeftLogical(index, 2); - - // convert index from int to long via permute - Vector256 indexLong = Vector256.Create(index, Vector128.Zero).AsInt64(); - Vector256 permuteMask2 = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); - indexLong = Avx2.PermuteVar8x32(indexLong.AsInt32(), permuteMask2).AsInt64(); - tableVector = Avx2.ShiftRightLogicalVariable(tableVector, indexLong.AsUInt64()); - tableVector = Avx2.And(tableVector, Vector256.Create(0xfL)); - - Vector256 permuteMask = Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7); - Vector128 count = Avx2.PermuteVar8x32(tableVector.AsInt32(), permuteMask) - .GetLower() - .AsUInt16(); - - // set the zeroed high parts of the long value to ushort.Max -#if NET6_0_OR_GREATER - count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); -#else - count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); -#endif - - return Avx2.MinHorizontal(count).GetElement(0); - } - } - - private unsafe void IncrementAvx(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = Vector128.Create(counterHash); - h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - - Vector128 index = Avx2.ShiftRightLogical(h, 1); - index = Avx2.And(index, Vector128.Create(15)); // j - counter index - Vector128 offset = Avx2.And(h, Vector128.Create(1)); - Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index - blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) - -#if NET6_0_OR_GREATER - long* tablePtr = tableAddr; -#else - fixed (long* tablePtr = table) -#endif - { - Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); - - // j == index - index = Avx2.ShiftLeftLogical(index, 2); - Vector256 offsetLong = Vector256.Create(index, Vector128.Zero).AsInt64(); - - Vector256 permuteMask = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); - offsetLong = Avx2.PermuteVar8x32(offsetLong.AsInt32(), permuteMask).AsInt64(); - - // mask = (0xfL << offset) - Vector256 fifteen = Vector256.Create(0xfL); - Vector256 mask = Avx2.ShiftLeftLogicalVariable(fifteen, offsetLong.AsUInt64()); - - // (table[i] & mask) != mask) - // Note masked is 'equal' - therefore use AndNot below - Vector256 masked = Avx2.CompareEqual(Avx2.And(tableVector, mask), mask); - - // 1L << offset - Vector256 inc = Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong.AsUInt64()); - - // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) - inc = Avx2.AndNot(masked, inc); - - Vector256 result = Avx2.CompareEqual(masked.AsByte(), Vector256.Zero); - bool wasInc = Avx2.MoveMask(result.AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); - - tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); - tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); - tablePtr[blockOffset.GetElement(2)] += inc.GetElement(2); - tablePtr[blockOffset.GetElement(3)] += inc.GetElement(3); - - if (wasInc && (++size == sampleSize)) - { - Reset(); - } - } - } -#endif - } -} diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index 0db7920c..f0bf60ee 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -15,7 +15,7 @@ namespace BitFaster.Caching.Benchmarks.Lfu [SimpleJob(RuntimeMoniker.Net90)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] - [ColumnChart(Title ="Sketch Frequency ({JOB})", Colors = "#cd5c5c,#fa8072,#ffa07a")] + [ColumnChart(Title = "Sketch Frequency ({JOB})", Colors = "#cd5c5c,#fa8072,#ffa07a")] public class SketchFrequency { const int sketchSize = 1_048_576; @@ -26,9 +26,7 @@ public class SketchFrequency private CmSketchCore blockStd; private CmSketchNoPin blockAvxNoPin; - private CmSketchPinNoOpt blockAvxPinNoOpt; private CmSketchCore blockAvx; - private CmSketchCore512 blockAvx512; [Params(512, 1024, 32_768, 524_288, 8_388_608, 134_217_728)] public int Size { get; set; } @@ -41,9 +39,7 @@ public void Setup() blockStd = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); - blockAvxPinNoOpt = new CmSketchPinNoOpt(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); - blockAvx512 = new CmSketchCore512(Size, EqualityComparer.Default); } [Benchmark(Baseline = true, OperationsPerInvoke = iterations)] @@ -56,7 +52,7 @@ public int FrequencyFlat() return count; } - //[Benchmark(OperationsPerInvoke = iterations)] + [Benchmark(OperationsPerInvoke = iterations)] public int FrequencyFlatAvx() { int count = 0; @@ -86,16 +82,6 @@ public int FrequencyBlockAvxNotPinned() return count; } - [Benchmark(OperationsPerInvoke = iterations)] - public int FrequencyBlockAvxPinNotOpt() - { - int count = 0; - for (int i = 0; i < iterations; i++) - count += blockAvxPinNoOpt.EstimateFrequency(i) > blockAvx.EstimateFrequency(i + 1) ? 1 : 0; - - return count; - } - [Benchmark(OperationsPerInvoke = iterations)] public int FrequencyBlockAvxPinned() { @@ -105,15 +91,5 @@ public int FrequencyBlockAvxPinned() return count; } - - [Benchmark(OperationsPerInvoke = iterations)] - public int FrequencyBlockAvxPinned512() - { - int count = 0; - for (int i = 0; i < iterations; i++) - count += blockAvx512.EstimateFrequency(i) > blockAvx.EstimateFrequency(i + 1) ? 1 : 0; - - return count; - } } } diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index b4fa8cea..6f6ab1e7 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -25,11 +25,9 @@ public class SketchIncrement private CmSketchCore blockStd; private CmSketchNoPin blockAvxNoPin; - private CmSketchPinNoOpt blockAvxPinNoOpt; private CmSketchCore blockAvx; - private CmSketchCore512 blockAvx512; - [Params(512, 1024, 32_768, 524_288, 8_388_608, 134_217_728)] + [Params(32_768, 524_288, 8_388_608, 134_217_728)] public int Size { get; set; } [GlobalSetup] @@ -40,9 +38,7 @@ public void Setup() blockStd = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); - blockAvxPinNoOpt = new CmSketchPinNoOpt(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); - blockAvx512 = new CmSketchCore512(Size, EqualityComparer.Default); } [Benchmark(Baseline = true, OperationsPerInvoke = iterations)] @@ -54,7 +50,7 @@ public void IncFlat() } } - //[Benchmark(OperationsPerInvoke = iterations)] + [Benchmark(OperationsPerInvoke = iterations)] public void IncFlatAvx() { for (int i = 0; i < iterations; i++) @@ -81,15 +77,6 @@ public void IncBlockAvxNotPinned() } } - [Benchmark(OperationsPerInvoke = iterations)] - public void IncBlockAvxPinNotOpt() - { - for (int i = 0; i < iterations; i++) - { - blockAvxPinNoOpt.Increment(i); - } - } - [Benchmark(OperationsPerInvoke = iterations)] public void IncBlockAvxPinned() { @@ -98,14 +85,5 @@ public void IncBlockAvxPinned() blockAvx.Increment(i); } } - - [Benchmark(OperationsPerInvoke = iterations)] - public void IncBlockAvxPinned512() - { - for (int i = 0; i < iterations; i++) - { - blockAvx512.Increment(i); - } - } } } diff --git a/BitFaster.Caching.ThroughputAnalysis/Runner.cs b/BitFaster.Caching.ThroughputAnalysis/Runner.cs index a769015e..f6caa276 100644 --- a/BitFaster.Caching.ThroughputAnalysis/Runner.cs +++ b/BitFaster.Caching.ThroughputAnalysis/Runner.cs @@ -33,10 +33,10 @@ private static void RunTest(Mode mode, int cacheSize) var cachesToTest = new List { - //new ClassicLruFactory(capacity), - //new MemoryCacheFactory(capacity), + new ClassicLruFactory(capacity), + new MemoryCacheFactory(capacity), new FastConcurrentLruFactory(capacity), - //new ConcurrentLruFactory(capacity), + new ConcurrentLruFactory(capacity), new ConcurrentLfuFactory(capacity) }; diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 9c4333aa..733b1ea0 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -196,7 +196,6 @@ private unsafe void IncrementStd(T value) } // Applies another round of hashing for additional randomization. - //[MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Rehash(int x) { x = (int)(x * 0x31848bab); @@ -205,7 +204,6 @@ private static int Rehash(int x) } // Applies a supplemental hash function to defend against poor quality hash. - //[MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Spread(int x) { x ^= (int)((uint)x >> 17); @@ -258,7 +256,6 @@ private void Reset() #if !NETSTANDARD2_0 [MethodImpl(MethodImplOptions.AggressiveInlining)] - // [MethodImpl(MethodImplOptions.AggressiveInlining | (MethodImplOptions)512)] private unsafe int EstimateFrequencyAvx(T value) { int blockHash = Spread(comparer.GetHashCode(value)); @@ -293,7 +290,6 @@ private unsafe int EstimateFrequencyAvx(T value) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - // [MethodImpl(MethodImplOptions.AggressiveInlining | (MethodImplOptions)512)] private unsafe void IncrementAvx(T value) { int blockHash = Spread(comparer.GetHashCode(value)); diff --git a/BitFaster.Caching/Lfu/CmSketchCore512.cs b/BitFaster.Caching/Lfu/CmSketchCore512.cs deleted file mode 100644 index c17bd725..00000000 --- a/BitFaster.Caching/Lfu/CmSketchCore512.cs +++ /dev/null @@ -1,337 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; - - -#if !NETSTANDARD2_0 -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif - -namespace BitFaster.Caching.Lfu -{ - /// - /// A probabilistic data structure used to estimate the frequency of a given value. Periodic aging reduces the - /// accumulated count across all values over time, such that a historic popular value will decay to zero frequency - /// over time if it is not accessed. - /// - /// - /// The maximum frequency of an element is limited to 15 (4-bits). Each element is hashed to a 64 byte 'block' - /// consisting of 4 segments of 32 4-bit counters. The 64 byte blocks are the same size as x64 L1 cache lines. - /// While the blocks are not guaranteed to be aligned, this scheme minimizes L1 cache misses resulting in a - /// significant speedup. When supported, a vectorized AVX2 code path provides a further speedup. Together, block - /// and AVX2 are approximately 2x faster than the original implementation. - /// - /// This is a direct C# translation of FrequencySketch in the Caffeine library by ben.manes@gmail.com (Ben Manes). - /// https://github.com/ben-manes/caffeine - public unsafe class CmSketchCore512 - where T : notnull - where I : struct, IsaProbe - { - private const long ResetMask = 0x7777777777777777L; - private const long OneMask = 0x1111111111111111L; - - private long[] table; -#if NET6_0_OR_GREATER - private long* tableAddr; -#endif - private int sampleSize; - private int blockMask; - private int size; - - private readonly IEqualityComparer comparer; - - /// - /// Initializes a new instance of the CmSketch class with the specified maximum size and equality comparer. - /// - /// The maximum size. - /// The equality comparer. - public CmSketchCore512(long maximumSize, IEqualityComparer comparer) - { - EnsureCapacity(maximumSize); - this.comparer = comparer; - } - - /// - /// Gets the reset sample size. - /// - public int ResetSampleSize => this.sampleSize; - - /// - /// Gets the size. - /// - public int Size => this.size; - - /// - /// Estimate the frequency of the specified value, up to the maximum of 15. - /// - /// The value. - /// The estimated frequency of the value. - public int EstimateFrequency(T value) - { -#if NETSTANDARD2_0 - return EstimateFrequencyStd(value); -#else - - I isa = default; - - if (isa.IsAvx2Supported) - { - return EstimateFrequencyAvx(value); - } - else - { - return EstimateFrequencyStd(value); - } -#endif - } - - /// - /// Increment the count of the specified value. - /// - /// The value. - public void Increment(T value) - { -#if NETSTANDARD2_0 - IncrementStd(value); -#else - - I isa = default; - - if (isa.IsAvx2Supported) - { - IncrementAvx(value); - } - else - { - IncrementStd(value); - } -#endif - } - - /// - /// Clears the count for all items. - /// - public void Clear() - { - Array.Clear(table, 0, table.Length); - size = 0; - } - - [MemberNotNull(nameof(table))] - private void EnsureCapacity(long maximumSize) - { - int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); - -#if NET6_0_OR_GREATER - I isa = default; - if (isa.IsAvx2Supported) - { - // over alloc by 8 to give 64 bytes padding, tableAddr is then aligned to 64 bytes - const int pad = 8; - bool pinned = true; - table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); - - tableAddr = (long*)Unsafe.AsPointer(ref table[0]); - tableAddr = (long*)((long)tableAddr + (long)tableAddr % 64); - - blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; - } - else -#endif - { - table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; - blockMask = (int)((uint)(table.Length) >> 3) - 1; - } - - sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); - - size = 0; - } - - private unsafe int EstimateFrequencyStd(T value) - { - var count = stackalloc int[4]; - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - int index = (h >> 1) & 15; - int offset = h & 1; - count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); - } - return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); - } - - private unsafe void IncrementStd(T value) - { - var index = stackalloc int[8]; - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - index[i] = (h >> 1) & 15; - int offset = h & 1; - index[i + 4] = block + offset + (i << 1); - } - - bool added = - IncrementAt(index[4], index[0]) - | IncrementAt(index[5], index[1]) - | IncrementAt(index[6], index[2]) - | IncrementAt(index[7], index[3]); - - if (added && (++size == sampleSize)) - { - Reset(); - } - } - - // Applies another round of hashing for additional randomization. - //[MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int Rehash(int x) - { - x = (int)(x * 0x31848bab); - x ^= (int)((uint)x >> 14); - return x; - } - - // Applies a supplemental hash function to defend against poor quality hash. - //[MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int Spread(int x) - { - x ^= (int)((uint)x >> 17); - x = (int)(x * 0xed5ad4bb); - x ^= (int)((uint)x >> 11); - x = (int)(x * 0xac4c1b51); - x ^= (int)((uint)x >> 15); - return x; - } - - private bool IncrementAt(int i, int j) - { - int offset = j << 2; - long mask = (0xfL << offset); - - if ((table[i] & mask) != mask) - { - table[i] += (1L << offset); - return true; - } - - return false; - } - - private void Reset() - { - // unroll, almost 2x faster - int count0 = 0; - int count1 = 0; - int count2 = 0; - int count3 = 0; - - for (int i = 0; i < table.Length; i += 4) - { - count0 += BitOps.BitCount(table[i] & OneMask); - count1 += BitOps.BitCount(table[i + 1] & OneMask); - count2 += BitOps.BitCount(table[i + 2] & OneMask); - count3 += BitOps.BitCount(table[i + 3] & OneMask); - - table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; - table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; - } - - count0 = (count0 + count1) + (count2 + count3); - - size = (size - (count0 >> 2)) >> 1; - } - -#if !NETSTANDARD2_0 - // [MethodImpl(MethodImplOptions.AggressiveInlining)] - [MethodImpl(MethodImplOptions.AggressiveInlining | (MethodImplOptions)512)] - private unsafe int EstimateFrequencyAvx(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); - Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - - Vector256 indexLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); - -#if NET6_0_OR_GREATER - long* tablePtr = tableAddr; -#else - fixed (long* tablePtr = table) -#endif - { - Vector128 count = Avx2.PermuteVar8x32(Avx2.And(Avx2.ShiftRightLogicalVariable(Avx2.GatherVector256(tablePtr, blockOffset, 8), indexLong), Vector256.Create(0xfL)).AsInt32(), Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7)) - .GetLower() - .AsUInt16(); - - // set the zeroed high parts of the long value to ushort.Max -#if NET6_0_OR_GREATER - count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); -#else - count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); -#endif - - return Avx2.MinHorizontal(count).GetElement(0); - } - } - - //[MethodImpl(MethodImplOptions.AggressiveInlining)] - [MethodImpl(MethodImplOptions.AggressiveInlining | (MethodImplOptions)512)] - private unsafe void IncrementAvx(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); - Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - - Vector256 offsetLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); - Vector256 mask = Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), offsetLong); - -#if NET6_0_OR_GREATER - long* tablePtr = tableAddr; -#else - fixed (long* tablePtr = table) -#endif - { - // Note masked is 'equal' - therefore use AndNot below - Vector256 masked = Avx2.CompareEqual(Avx2.And(Avx2.GatherVector256(tablePtr, blockOffset, 8), mask), mask); - - // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) - Vector256 inc = Avx2.AndNot(masked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong)); - - bool wasInc = Avx2.MoveMask(Avx2.CompareEqual(masked.AsByte(), Vector256.Zero).AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); - - tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); - tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); - tablePtr[blockOffset.GetElement(2)] += inc.GetElement(2); - tablePtr[blockOffset.GetElement(3)] += inc.GetElement(3); - - if (wasInc && (++size == sampleSize)) - { - Reset(); - } - } - } -#endif - } -} From d2b0bc1a9d95ecca4a1f6005653b018c3eb3db50 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Tue, 26 Nov 2024 19:51:44 -0800 Subject: [PATCH 7/7] params --- BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index f0bf60ee..137b9dcd 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -28,7 +28,7 @@ public class SketchFrequency private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; - [Params(512, 1024, 32_768, 524_288, 8_388_608, 134_217_728)] + [Params(32_768, 524_288, 8_388_608, 134_217_728)] public int Size { get; set; } [GlobalSetup]