1111 Baseline.Console;
1212
1313const
14- cNumStations = 41343 ;
14+ cNumStations = 41343 ; // as per the input file
1515 cDictSize = 248071 ; // numstations * 6, next prime number
1616 cThreadCount = 32 ;
1717
@@ -41,7 +41,7 @@ TOneBRCApp = class(TCustomApplication)
4141 TStationData = packed record
4242 Min: SmallInt;
4343 Max: SmallInt;
44- Count: UInt16 ;
44+ Count: UInt32 ;
4545 Sum: Integer;
4646 end ;
4747 PStationData = ^TStationData;
@@ -59,18 +59,42 @@ TOneBRCApp = class(TCustomApplication)
5959
6060 TBRCDictionary = class
6161 private
62+ // keys/values for a Large dictionary:
63+ // values will be the index where the actual record is stored
6264 FHashes: THashes;
6365 FIndexes: TIndexes;
64- FStationNames: TStationNames;
66+
67+ // where the actual records are stored:
68+ // - each thread holds its own data
69+ // - for each thread, pre-allocate as much space as needed
70+ // - all threads store their data at the exact same index
6571 FThreadData: array [0 ..cThreadCount-1 ] of array [0 ..cNumStations-1 ] of TStationData;
72+
73+ // station names are also shared, not lock-protected (in the worst case, the value is written twice)
74+ // stored separately as it is rarely needed
75+ FStationNames: TStationNames;
76+
77+ // points to the next slot in FThreadData where we should fill a newly encountered station
6678 FCounter: TStationCount;
79+
80+ // exclusively to protect FCounter from concurrent-writes
6781 FCS: TCriticalSection;
82+
83+ // searches for a given key; returns whether the key was found, and its storage index
84+ // (or, if not found, which index to use next)
6885 procedure InternalFind (const aKey: Cardinal; out aFound: Boolean; out aIndex: THashSize);
86+
6987 public
7088 constructor Create;
7189 destructor Destroy; override;
90+
91+ // simple wrapper to find station-record pointers
7292 function TryGetValue (const aKey: Cardinal; const aThreadNb: TThreadCount; out aValue: PStationData): Boolean; inline;
93+
94+ // multithread-unprotected: adds the data (temp, name) of a station encountered for the first time
7395 procedure Add (const aHashIdx: THashSize; const aThreadNb: TThreadCount; const aTemp: SmallInt; const aStationName: AnsiString); inline;
96+
97+ // multithread-protected: safely assign a slot for a given key
7498 function AtomicRegisterHash (const aKey: Cardinal): THashSize;
7599 end ;
76100
@@ -89,17 +113,27 @@ TOneBRC = class
89113 FThreads: array of TThread;
90114 FDictionary: TBRCDictionary;
91115
116+ // for a line between idx [aStart; aEnd], returns the station-name length, and the integer-value of temperature
92117 procedure ExtractLineData (const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt); inline;
93118
94119 public
95120 constructor Create (const aThreadCount: TThreadCount);
96121 destructor Destroy; override;
97122 function mORMotMMF (const afilename: string): Boolean;
123+
124+ // initial thread-spawn
98125 procedure DispatchThreads ;
126+
127+ // wait for all threads to complete their work
99128 procedure WaitAll ;
129+
130+ // executed by each thread to process data in the given range
100131 procedure ProcessData (aThreadNb: TThreadCount; aStartIdx: Int64; aEndIdx: Int64);
132+
133+ // merge data from all threads
101134 procedure Merge (aLeft: TThreadCount; aRight: TThreadCount);
102135 procedure MergeAll ;
136+
103137 procedure GenerateOutput ;
104138 property DataSize: Int64 read FDataSize;
105139 end ;
@@ -160,6 +194,9 @@ function TBRCDictionary.AtomicRegisterHash(const aKey: Cardinal): THashSize;
160194var
161195 vFound: Boolean;
162196begin
197+ // must call InternalFind again, within the critical-section,
198+ // to ensure the slot was not taken by another thread
199+ // this function should execute only once per station, so at most 41343 times
163200 FCS.Acquire;
164201 try
165202 InternalFind (aKey, vFound, Result);
@@ -177,14 +214,19 @@ procedure TBRCDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean;
177214var vIdx: Integer;
178215 vOffset: Integer;
179216begin
217+ // Lemire hashing: faster to compute than modulus, but produced more collisions in trials
218+ // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
219+ // thanks Arnaud for the suggestion
180220 vIdx := aKey * cDictSize shr 32 ;
181221
182222 if FHashes[vIdx] = 0 then begin
223+ // empty bucket: key is not present, this slot is free to use
183224 aIndex := vIdx;
184225 aFound := False;
185226 exit;
186227 end ;
187228 if FHashes[vIdx] = aKey then begin
229+ // bucket already holds this key: found match
188230 aIndex := vIdx;
189231 aFound := True;
190232 exit;
@@ -193,7 +235,7 @@ procedure TBRCDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean;
193235 vOffset := 1 ;
194236
195237 while True do begin
196- // quadratic probing, by incrementing vOffset
238+ // probing with a step that grows by 1 each iteration (cumulative offsets 1, 3, 6, ... i.e. triangular/quadratic probing), to minimize clusters
197239 Inc (vIdx, vOffset);
198240 Inc (vOffset);
199241
@@ -293,29 +335,35 @@ procedure TOneBRC.ExtractLineData(const aStart: Int64; const aEnd: Int64; out aL
293335 // repeat with the remaining digits, multiplying by 10^x (skip the '.')
294336 // multiply by -1 upon reaching a '-'
295337
296- // aTemp := (Ord(FData[aEnd]) - c0ascii)
297- // + 10 *(Ord(FData[aEnd-2]) - c0ascii);
298- // vDigit := Ord(FData[aEnd-3]);
299- // if (vDigit >= c0ascii) and (vDigit <= c9ascii) then begin
300- // aTemp := aTemp + 100*(Ord(FData[aEnd-3]) - c0ascii);
301- // vDigit := Ord(FData[aEnd-4]);
302- // if vDigit = cNegAscii then
303- // aTemp := -aTemp;
304- // end
305- // else if vDigit = cNegAscii then
306- // aTemp := -aTemp;
307-
338+ {
339+ aTemp := (Ord(FData[aEnd]) - c0ascii)
340+ + 10 *(Ord(FData[aEnd-2]) - c0ascii);
341+ vDigit := Ord(FData[aEnd-3]);
342+ if (vDigit >= c0ascii) and (vDigit <= c9ascii) then begin
343+ aTemp := aTemp + 100*(Ord(FData[aEnd-3]) - c0ascii);
344+ vDigit := Ord(FData[aEnd-4]);
345+ if vDigit = cNegAscii then
346+ aTemp := -aTemp;
347+ end
348+ else if vDigit = cNegAscii then
349+ aTemp := -aTemp;
350+ }
351+
352+ // ==========
353+ // entire computation is branchless (for readability, see version above)
354+ // avoiding intermediate results also showed better performance
355+
356+ // 0 if -
357+ // 1 if +
358+ // convert range [0;1] to [-1;1] for branchless negation when needed
359+ // if there is a 3rd digit (*100), add it, otherwise multiply by 0 to cancel it out
308360 vIsNeg := Ord (FData[J+1 ] <> ' -' );
309361
310362 aTemp := (
311363 (Ord(FData[aEnd]) - c0ascii)
312364 + 10 *(Ord(FData[aEnd-2 ]) - c0ascii)
313365 + Ord ((J+4 - vIsNeg < aEnd)) * 100 *(Ord(FData[aEnd-3 ]) - c0ascii)
314366 ) * (vIsNeg * 2 - 1 );
315- // if (J+4 - vIsNeg < aEnd) then begin
316- // aTemp := aTemp
317- // end;
318- // aTemp := (vIsNeg * 2 - 1) * aTemp;
319367end ;
320368
321369// ---------------------------------------------------
@@ -350,6 +398,7 @@ procedure TOneBRC.DispatchThreads;
350398 I: TThreadCount;
351399 vRange: Int64;
352400begin
401+ // distribute input equally across available threads
353402 vRange := Trunc (FDataSize / FThreadCount);
354403
355404 for I := 0 to FThreadCount - 1 do begin
@@ -380,6 +429,7 @@ procedure TOneBRC.ProcessData (aThreadNb: TThreadCount; aStartIdx: Int64; aEndId
380429 vLenStationName: ShortInt;
381430 vFound: Boolean;
382431begin
432+ // initialize min/max, else we may get zeroes (due to our Add that fires once per station across all threads)
383433 for I := 0 to cNumStations - 1 do begin
384434 FDictionary.FThreadData[aThreadNb][I].Max := -2000 ;
385435 FDictionary.FThreadData[aThreadNb][I].Min := 2000 ;
@@ -422,7 +472,6 @@ procedure TOneBRC.ProcessData (aThreadNb: TThreadCount; aStartIdx: Int64; aEndId
422472
423473 FDictionary.InternalFind (vHash, vFound, vHashIdx);
424474
425-
426475 if vFound then begin
427476 vData := @FDictionary.FThreadData[aThreadNb][FDictionary.FIndexes[vHashIdx]];
428477 if vTemp < vData^.Min then
@@ -435,9 +484,11 @@ procedure TOneBRC.ProcessData (aThreadNb: TThreadCount; aStartIdx: Int64; aEndId
435484 else begin
436485 // pre-allocated array of records instead of on-the-go allocation
437486 vHashIdx := FDictionary.AtomicRegisterHash (vHash);
487+
438488 // SetString done only once per station name, for later sorting
439489 SetString(vStation, pAnsiChar(@FData[vLineStart]), vLenStationName);
440490
491+ // data can be safely added at the given index, without locking
441492 FDictionary.Add(vHashIdx, aThreadNb, vTemp, vStation);
442493 end ;
443494
@@ -460,6 +511,7 @@ procedure TOneBRC.Merge(aLeft: TThreadCount; aRight: TThreadCount);
460511 vDataL: PStationData;
461512 I: Integer;
462513begin
514+ // accumulate data into Left
463515 for I := 0 to cNumStations - 1 do begin
464516 vDataR := @FDictionary.FThreadData[aRight][I];
465517 vDataL := @FDictionary.FThreadData[aLeft][I];
@@ -478,6 +530,7 @@ procedure TOneBRC.MergeAll;
478530var
479531 I: TThreadCount;
480532begin
533+ // all thread-data is accumulated into index 0
481534 for I := 1 to FThreadCount - 1 do begin
482535 Merge (0 , I);
483536 end ;
@@ -487,6 +540,8 @@ procedure TOneBRC.MergeAll;
487540
488541function MyFormatInt (const aIn: SmallInt): AnsiString; inline;
489542begin
543+ // much faster than FormatFloat
544+ // oddly, IntToStr does not include leading zeroes for both pos and neg numbers
490545 Result := IntToStr(aIn);
491546 Insert (' .' , Result, Length(Result));
492547
0 commit comments