
Commit dfc8d73

Merge pull request #41 from synopse/main

abouchez / mORMot: asm code refactoring + README precisions

2 parents 30d4be0 + 3016ee8

File tree

2 files changed: +76 -14 lines changed

entries/abouchez/README.md

Lines changed: 58 additions & 4 deletions
@@ -35,7 +35,15 @@ Here are the main ideas behind this implementation proposal:
 - Can optionally output timing statistics and hash value on the console to debug and refine settings (with the `-v` command line switch);
 - Can optionally set each thread affinity to a single core (with the `-a` command line switch).
 
-The "64 bytes cache line" trick is quite unique among all implementations of the "1brc" I have seen in any language - and it does make a noticeable difference in performance. The L1 cache is well known to be the main bottleneck for any efficient in-memory process. We are very lucky the station names are just big enough to fill no more than 64 bytes, with min/max values reduced as 16-bit smallint - resulting in temperature range of -3276.7..+3276.8 which seems fair on our planet according to the IPCC. ;)
+## Why L1 Cache Matters
+
+The "64 bytes cache line" trick is quite unique among all the implementations of the "1brc" I have seen in any language - and it does make a noticeable difference in performance.
+
+The L1 cache is well known in the performance hacking literature to be the main bottleneck of any efficient in-memory process. If you want things to go fast, you should flatter your CPU L1 cache.
+
+We are very lucky that the station names are just big enough to fill no more than 64 bytes, with min/max values reduced to 16-bit smallint - resulting in a temperature range of -3276.7..+3276.8, which seems fair on our planet according to the IPCC. ;)
+
+In our first attempt, the `Station[]` array was in fact not aligned to 64 bytes itself: the RTL `SetLength()` does not align its data to the item size. So the pointer was only aligned to 32 bytes, and any record access could require filling two L1 cache lines. We added some manual alignment of the data structure, and got 5% better performance.
 
 ## Usage

@@ -133,14 +141,60 @@ time ./abouchez measurements.txt -v -t=1
 ```
 This `-t=1` run is for fun: it will run the process in a single thread. It will help to guess how optimized (and lock-free) our parsing code is, and to validate the CPU multi-core abilities. In a perfect world, other `-t=##` runs should show a perfect division of the `real` time by the number of working threads, and the `user` value reported by `time` should remain almost the same when we add threads, up to the number of CPU cores.
 
-## Feedback Needed
+## Back To Reality
+
+Our proposal has been run on the benchmark hardware, using the full automation.
+
+With 30 threads (on a busy system):
+```
+-- SSD --
+Benchmark 1: abouchez
+  Time (mean ± σ):   3.634 s ± 0.099 s  [User: 86.580 s, System: 2.012 s]
+  Range (min … max): 3.530 s … 3.834 s  10 runs
+
+-- HDD --
+Benchmark 1: abouchez
+  Time (mean ± σ):   3.629 s ± 0.102 s  [User: 86.086 s, System: 2.008 s]
+  Range (min … max): 3.497 s … 3.789 s  10 runs
+```
+
+From here on, only the SSD values are shown, because the HDD version triggered the systemd watchdog, which killed the shell and its benchmark executable. But we can see that, once the data is loaded from disk into the RAM cache, there is no difference between a `memmap` file on SSD and on HDD. Linux is a great Operating System for sure.
+
+With 24 threads:
+```
+-- SSD --
+Benchmark 1: abouchez
+  Time (mean ± σ):   2.977 s ± 0.053 s  [User: 53.790 s, System: 1.881 s]
+  Range (min … max): 2.905 s … 3.060 s  10 runs
+```
+
+With 16 threads:
+```
+-- SSD --
+Benchmark 1: abouchez
+  Time (mean ± σ):   2.472 s ± 0.061 s  [User: 27.787 s, System: 1.720 s]
+  Range (min … max): 2.386 s … 2.588 s  10 runs
+```
 
-Here we will put some additional information, once our proposal has been run on the benchmark hardware.
+With 16 threads and thread affinity (the `-a` switch on the command line):
+```
+-- SSD --
+Benchmark 1: abouchez
+  Time (mean ± σ):   3.227 s ± 0.017 s  [User: 39.731 s, System: 1.875 s]
+  Range (min … max): 3.206 s … 3.253 s  10 runs
+```
+
+So it seems we should simply run the benchmark with the `-t=16` option.
 
-Stay tuned!
+This may well be expected:
+
+- The Ryzen CPU has 16 cores with 32 threads, and it makes sense that using only the "real" cores for CPU+RAM intensive work is enough to saturate them;
+- It is a known fact from experience that forcing thread affinity is not a good idea: it is always much better to let a modern Linux kernel schedule the threads onto the CPU cores, because it has a much better knowledge of the actual system load and status - even on a "fair" CPU architecture like AMD Zen.
 
 ## Ending Note
 
+You could disable our tuned asm in the project source code, and lose about 10% by using the general purpose *mORMot* `crc32c()` and `CompareMem()` functions, which already run SSE2/SSE4.2 tuned assembly.
+
 There is a "*pure mORMot*" name lookup version available if you undefine the `CUSTOMHASH` conditional, which is around 40% slower, because it needs to copy the name into the stack before using `TDynArrayHashed`, and has a little more overhead.
 
 Arnaud :D

entries/abouchez/src/brcmormot.lpr

Lines changed: 18 additions & 10 deletions
@@ -5,7 +5,7 @@
 // a dedicated hash table is 40% faster than mORMot generic TDynArrayHashed
 
 {$define CUSTOMASM}
-// a few % faster with some dedicated asm instead of mORMot code on x86_64
+// about 10% faster with some dedicated asm instead of mORMot code on x86_64
 
 {$I mormot.defines.inc}

@@ -101,14 +101,14 @@ procedure TBrcList.Init(max: integer; align: boolean);
   SetLength(StationMem, max); // RTL won't align by 64 bytes
   Station := pointer(StationMem);
   if align then
-    while PtrUInt(Station) and 63 <> 0 do // manual alignment
+    while {%H-}PtrUInt(Station) and 63 <> 0 do // manual alignment
       inc(PByte(Station));
   SetLength(StationHash, HASHSIZE);
 end;

 {$ifdef CUSTOMASM}
 
-function crc32c(buf: PAnsiChar; len: cardinal): PtrUInt; nostackframe; assembler;
+function dohash(buf: PAnsiChar; len: cardinal): PtrUInt; nostackframe; assembler;
 asm
         xor eax, eax // it is enough to hash up to 15 bytes for our purpose
         mov ecx, len
@@ -130,7 +130,7 @@ function crc32c(buf: PAnsiChar; len: cardinal): PtrUInt; nostackframe; assembler
 @z:
 end;
 
-function MemEqual(a, b: pointer; len: PtrInt): integer; nostackframe; assembler;
+function CompareMem(a, b: pointer; len: PtrInt): boolean; nostackframe; assembler;
 asm
         add a, len
         add b, len
@@ -164,9 +164,18 @@ function MemEqual(a, b: pointer; len: PtrInt): integer; nostackframe; assembler;
         mov al, byte ptr [a + len]
         cmp al, byte ptr [b + len]
         je @eq
-@diff:  mov eax, 1
+@diff:  xor eax, eax
         ret
-@eq:    xor eax, eax // 0 = found (most common case of no hash collision)
+@eq:    mov eax, 1 // = found (most common case of no hash collision)
+end;
+
+{$else}
+
+function dohash(buf: PAnsiChar; len: cardinal): PtrUInt; inline;
+begin
+  if len > 16 then
+    len := 16; // it is enough to hash up to 16 bytes for our purpose
+  result := DefaultHasher(0, buf, len); // fast mORMot asm hasher (crc32c)
 end;
 
 {$endif CUSTOMASM}
@@ -176,16 +185,15 @@ function TBrcList.Search(name: pointer; namelen: PtrInt): PBrcStation;
   h, x: PtrUInt;
 begin
   assert(namelen <= SizeOf(TBrcStation.NameText));
-  h := crc32c({$ifndef CUSTOMASM} 0, {$endif} name, namelen);
+  h := dohash(name, namelen);
   repeat
     h := h and (HASHSIZE - 1);
     x := StationHash[h];
     if x = 0 then
       break; // void slot
     result := @Station[x - 1];
     if (result^.NameLen = namelen) and
-       ({$ifdef CUSTOMASM}MemEqual{$else}MemCmp{$endif}(
-          @result^.NameText, name, namelen) = 0) then
+       CompareMem(@result^.NameText, name, namelen) then
       exit; // found
     inc(h); // hash collision: try next slot
   until false;
@@ -460,7 +468,7 @@ function ByStationName(const A, B): integer;
   result := sa.NameLen - sb.NameLen;
 end;
 
-function Average(sum, count: PtrInt): integer;
+function Average(sum, count: PtrInt): PtrInt;
 // sum and result are temperature * 10 (one fixed decimal)
 var
   x, t: PtrInt; // temperature * 100 (two fixed decimals)
