entries/ghatem-fpc/README.md
- -t flag to specify the thread-count (default reads the thread-count available on the CPU)
currently there are 2 versions that can be compiled / run:
- `OneBRC.lpr -> ghatem`: compact record, optimal for the 1B-row / 41k-station test; will fail on the other tests due to overflow
- `OneBRC-largerec.lpr -> ghatem-largerec`: same as OneBRC, but StationData's `count` is UInt32 instead of UInt16. Passes all the tests.
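The UInt16 overflow is easy to reproduce: at 1B rows over ~41k stations, each station sees roughly 24k measurements, which fits in 16 bits, but the larger tests push per-station counts past 65535 and the counter silently wraps. A minimal C sketch of the effect (the function name is hypothetical; only the 16-bit counter width mirrors the compact record):

```c
#include <stdint.h>

/* Hypothetical counter mirroring the UInt16 "count" field of the compact
   StationData record: it silently wraps modulo 65536 once a station
   accumulates more than 65535 rows. */
uint16_t count_rows(long rows) {
    uint16_t count = 0;
    for (long i = 0; i < rows; i++)
        count++;                 /* wraps past 65535 */
    return count;
}
```

For example, 24,000 rows per station survives intact, but 70,000 rows wraps to 70000 mod 65536 = 4464, corrupting the mean — which is why only the UInt32 `largerec` variant passes every test.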
## Hardware + Environment
host:
VM (VirtualBox):
- CPU count: 4 out of 8 (threads, probably)
- 20 GB RAM
## Baseline
the initial implementation (the Delphi baseline found in /baseline) aimed to get a correct output, regardless of performance:
"Make it work, then make it work better".
It turns out even the baseline caused some trouble, namely the `Ceil` implementation was yielding different results between FPC and Delphi (and different results between Delphi Win32/Win64).
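One way to sidestep such cross-compiler `Ceil` differences is to avoid floating-point entirely and round in fixed-point integer math. A hedged C sketch (assuming temperatures are accumulated in tenths of a degree — an assumption about the representation, not necessarily what the baseline does):

```c
#include <stdint.h>

/* Mean of sum/count rounded toward +infinity (Ceil semantics), in tenths
   of a degree, using only integer ops so that every compiler and target
   (FPC/Delphi, Win32/Win64) agrees. Assumes count > 0. */
int64_t mean_tenths_ceil(int64_t sum_tenths, int64_t count) {
    int64_t q = sum_tenths / count;   /* C division truncates toward zero */
    int64_t r = sum_tenths % count;
    if (r > 0)                        /* positive leftover: bump upward */
        q += 1;
    return q;
}
```

For negative sums, truncation toward zero already rounds toward +infinity, so only a positive remainder needs the bump: `mean_tenths_ceil(-7, 2)` yields -3 (i.e. ceil(-3.5)).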
The idea:
- -> data about the same station will be stored at the same index for all threads' data-arrays
- -> names will also be stored at that same index upon first encounter, and is common to all threads
- no locking needs to occur when the key is already found, since there is no multiple-write occurring
- the data-arrays are pre-allocated, and an atomic-counter will be incremented to know where the next element will be stored.
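The lock-free slot allocation above can be sketched with a C11 atomic (names are hypothetical; the entry is FPC, which would use an `InterLockedIncrement`-style RTL call instead):

```c
#include <stdatomic.h>

/* Shared cursor into the pre-allocated data-arrays. A thread that sees a
   station name for the first time claims the next free index; fetch_add
   hands out a unique slot to each caller without any lock. */
static atomic_int next_slot;

int claim_slot(void) {
    return atomic_fetch_add(&next_slot, 1);
}
```

Because the increment is atomic, two threads racing on first-encounter of different stations still receive distinct indices; only the first-encounter path pays for the atomic, and lookups of already-known keys stay entirely lock-free.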
Thinking again, this is likely similar to the approach mentioned by @synopse in one of his comments.
For the ExtractLineData, three ideas to try implementing:
- avoid using a function, to get rid of the cost of stack checking
- reduce branching: I think it should be possible to go from 3 if-statements to only 1
- unroll the loop (although when I tried this in the past, it did not show any improvements)
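The branch-reduction idea for temperature parsing can be illustrated in C (a sketch, not the entry's FPC code; it assumes the 1BRC input shape `-?\d{1,2}\.\d` and values kept as integer tenths):

```c
/* Parse "4.5", "45.6", "-4.5", "-45.6" into tenths (45, 456, -45, -456)
   using arithmetic on the layout instead of per-character if-statements. */
int parse_temp_tenths(const char *p, int len) {
    int neg = (p[0] == '-');          /* 0 or 1 */
    p   += neg;
    len -= neg;
    int two = (len == 4);             /* 1 for "XX.X", 0 for "X.X" */
    int v = two * 100 * (p[0] - '0')  /* optional tens digit */
          + 10 * (p[two] - '0')       /* ones digit */
          +      (p[two + 2] - '0');  /* tenths digit, skipping the '.' */
    return (v ^ -neg) + neg;          /* branchless conditional negate */
}
```

The sign and the one-vs-two-digit cases become 0/1 integers that index and scale the digits, so the only data-dependent control flow left is whatever the compiler chooses to emit for the comparisons (typically `setcc`, not a jump).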
Edit 2:
- was unable to get rid of the stack_check: removing the function somehow became more expensive; I don't understand why that is.
- I was able to reduce branching to zero in the parsing of temperature
- unrolling the loop was also beneficial, and even more so when the inner if-statement was removed in favor of branchless code
- dictionary improvements were successful and showed a 30% speedup
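A branchless replacement for an inner min/max if-statement could look like this in C (a sketch under the usual caveats: arithmetic right shift of a negative `int` is implementation-defined, though universal on mainstream compilers, and `a - b` must not overflow — which holds for temperatures stored in tenths):

```c
/* Branchless min/max: (a - b) >> 31 is all-ones exactly when a < b, so
   the mask selects a or b with no conditional jump for the branch
   predictor to miss on unpredictable data. */
int bmin(int a, int b) {
    int d = a - b;
    return b + (d & (d >> 31));
}

int bmax(int a, int b) {
    int d = a - b;
    return a - (d & (d >> 31));
}
```

On hot per-row updates like station min/max, the win comes from the temperature sequence being essentially random, making the corresponding branch nearly unpredictable; a plain ternary often compiles to a `cmov` too, so measuring both is worthwhile.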