Skip to content

Commit 42d3bff

Browse files
authored
Merge pull request #156 from georges-hatem/main
get rid of slow version + documentation
2 parents bd3f0c7 + c48fc28 commit 42d3bff

File tree

7 files changed

+340
-1902
lines changed

7 files changed

+340
-1902
lines changed

entries/ghatem-fpc/src/OneBRC-dirty.lpi renamed to entries/ghatem-fpc/src/OneBRC-largerec.lpi

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
<Version Value="11"/>
2323
<PathDelim Value="\"/>
2424
<Target>
25-
<Filename Value="..\..\..\bin\ghatem-dirty"/>
25+
<Filename Value="..\..\..\bin\ghatem-largerec"/>
2626
</Target>
2727
<SearchPaths>
2828
<IncludeFiles Value="$(ProjOutDir)"/>
@@ -60,7 +60,7 @@
6060
<Version Value="11"/>
6161
<PathDelim Value="\"/>
6262
<Target>
63-
<Filename Value="..\..\..\bin\ghatem-dirty"/>
63+
<Filename Value="..\..\..\bin\ghatem-largerec"/>
6464
</Target>
6565
<SearchPaths>
6666
<IncludeFiles Value="$(ProjOutDir)"/>
@@ -89,7 +89,7 @@
8989
<Version Value="11"/>
9090
<PathDelim Value="\"/>
9191
<Target>
92-
<Filename Value="..\..\..\bin\ghatem-dirty"/>
92+
<Filename Value="..\..\..\bin\ghatem-largerec"/>
9393
</Target>
9494
<SearchPaths>
9595
<IncludeFiles Value="$(ProjOutDir)"/>
@@ -126,7 +126,7 @@
126126
</RequiredPackages>
127127
<Units Count="1">
128128
<Unit0>
129-
<Filename Value="OneBRC-dirty.lpr"/>
129+
<Filename Value="OneBRC-largerec.lpr"/>
130130
<IsPartOfProject Value="True"/>
131131
</Unit0>
132132
</Units>
@@ -135,7 +135,7 @@
135135
<Version Value="11"/>
136136
<PathDelim Value="\"/>
137137
<Target>
138-
<Filename Value="..\..\..\bin\ghatem-dirty"/>
138+
<Filename Value="..\..\..\bin\ghatem-largerec"/>
139139
</Target>
140140
<SearchPaths>
141141
<IncludeFiles Value="$(ProjOutDir)"/>

entries/ghatem-fpc/src/OneBRC-dirty.lpr renamed to entries/ghatem-fpc/src/OneBRC-largerec.lpr

Lines changed: 76 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
Baseline.Console;
1212

1313
const
14-
cNumStations = 41343;
14+
cNumStations = 41343; // as per the input file
1515
cDictSize = 248071; // numstations * 6, next prime number
1616
cThreadCount = 32;
1717

@@ -41,7 +41,7 @@ TOneBRCApp = class(TCustomApplication)
4141
TStationData = packed record
4242
Min: SmallInt;
4343
Max: SmallInt;
44-
Count: UInt16;
44+
Count: UInt32;
4545
Sum: Integer;
4646
end;
4747
PStationData = ^TStationData;
@@ -59,18 +59,42 @@ TOneBRCApp = class(TCustomApplication)
5959

6060
TBRCDictionary = class
6161
private
62+
// keys/values for a Large dictionary:
63+
// values will be the index where the actual record is stored
6264
FHashes: THashes;
6365
FIndexes: TIndexes;
64-
FStationNames: TStationNames;
66+
67+
// where the actual records are stored:
68+
// - each thread holds its own data
69+
// - for each thread, pre-allocate as much space as needed
70+
// - all threads store their data at the exact same index
6571
FThreadData: array [0..cThreadCount-1] of array [0..cNumStations-1] of TStationData;
72+
73+
// station names are also shared, not lock-protected (in the worst case, the value is written twice)
74+
// stored separately as it is rarely needed
75+
FStationNames: TStationNames;
76+
77+
// points to the next slot in FThreadData where we should fill a newly encountered station
6678
FCounter: TStationCount;
79+
80+
// exclusively to protect FCounter from concurrent-writes
6781
FCS: TCriticalSection;
82+
83+
// searches for a given key; returns whether the key was found, and its storage index
84+
// (or, if not found, which index to use next)
6885
procedure InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: THashSize);
86+
6987
public
7088
constructor Create;
7189
destructor Destroy; override;
90+
91+
// simple wrapper to find station-record pointers
7292
function TryGetValue (const aKey: Cardinal; const aThreadNb: TThreadCount; out aValue: PStationData): Boolean; inline;
93+
94+
// multithread-unprotected: adds the data (temp, name) of a station encountered for the first time
7395
procedure Add (const aHashIdx: THashSize; const aThreadNb: TThreadCount; const aTemp: SmallInt; const aStationName: AnsiString); inline;
96+
97+
// multithread-protected: safely assign a slot for a given key
7498
function AtomicRegisterHash (const aKey: Cardinal): THashSize;
7599
end;
76100

@@ -89,17 +113,27 @@ TOneBRC = class
89113
FThreads: array of TThread;
90114
FDictionary: TBRCDictionary;
91115

116+
// for a line between idx [aStart; aEnd], returns the station-name length, and the integer-value of temperature
92117
procedure ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt); inline;
93118

94119
public
95120
constructor Create (const aThreadCount: TThreadCount);
96121
destructor Destroy; override;
97122
function mORMotMMF (const afilename: string): Boolean;
123+
124+
// initial thread-spawn
98125
procedure DispatchThreads;
126+
127+
// wait for all threads to complete work
99128
procedure WaitAll;
129+
130+
// executed by each thread to process data in the given range
100131
procedure ProcessData (aThreadNb: TThreadCount; aStartIdx: Int64; aEndIdx: Int64);
132+
133+
// merge data from all threads
101134
procedure Merge (aLeft: TThreadCount; aRight: TThreadCount);
102135
procedure MergeAll;
136+
103137
procedure GenerateOutput;
104138
property DataSize: Int64 read FDataSize;
105139
end;
@@ -160,6 +194,9 @@ function TBRCDictionary.AtomicRegisterHash(const aKey: Cardinal): THashSize;
160194
var
161195
vFound: Boolean;
162196
begin
197+
// must call InternalFind again, within the critical-section,
198+
// to ensure the slot was not taken by another thread
199+
// this function should execute only once per station, so at most 41343 times
163200
FCS.Acquire;
164201
try
165202
InternalFind (aKey, vFound, Result);
@@ -177,14 +214,19 @@ procedure TBRCDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean;
177214
var vIdx: Integer;
178215
vOffset: Integer;
179216
begin
217+
// Lemire hashing: faster to compute than modulus, but more collisions from trials
218+
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
219+
// thanks Arnaud for the suggestion
180220
vIdx := aKey * cDictSize shr 32;
181221

182222
if FHashes[vIdx] = 0 then begin
223+
// empty bucket: key not present yet, this slot can be used
183224
aIndex := vIdx;
184225
aFound := False;
185226
exit;
186227
end;
187228
if FHashes[vIdx] = aKey then begin
229+
// found match: this bucket already holds the key
188230
aIndex := vIdx;
189231
aFound := True;
190232
exit;
@@ -193,7 +235,7 @@ procedure TBRCDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean;
193235
vOffset := 1;
194236

195237
while True do begin
196-
// quadratic probing, by incrementing vOffset
238+
// linear (not quadratic) probing, with continuous increments to minimize clusters
197239
Inc (vIdx, vOffset);
198240
Inc (vOffset);
199241

@@ -293,29 +335,35 @@ procedure TOneBRC.ExtractLineData(const aStart: Int64; const aEnd: Int64; out aL
293335
// repeat with the remaining digits, multiplying by 10^x (skip the '.')
294336
// multiply by -1 upon reaching a '-'
295337

296-
//aTemp := (Ord(FData[aEnd]) - c0ascii)
297-
// + 10 *(Ord(FData[aEnd-2]) - c0ascii);
298-
//vDigit := Ord(FData[aEnd-3]);
299-
//if (vDigit >= c0ascii) and (vDigit <= c9ascii) then begin
300-
// aTemp := aTemp + 100*(Ord(FData[aEnd-3]) - c0ascii);
301-
// vDigit := Ord(FData[aEnd-4]);
302-
// if vDigit = cNegAscii then
303-
// aTemp := -aTemp;
304-
//end
305-
//else if vDigit = cNegAscii then
306-
// aTemp := -aTemp;
307-
338+
{
339+
aTemp := (Ord(FData[aEnd]) - c0ascii)
340+
+ 10 *(Ord(FData[aEnd-2]) - c0ascii);
341+
vDigit := Ord(FData[aEnd-3]);
342+
if (vDigit >= c0ascii) and (vDigit <= c9ascii) then begin
343+
aTemp := aTemp + 100*(Ord(FData[aEnd-3]) - c0ascii);
344+
vDigit := Ord(FData[aEnd-4]);
345+
if vDigit = cNegAscii then
346+
aTemp := -aTemp;
347+
end
348+
else if vDigit = cNegAscii then
349+
aTemp := -aTemp;
350+
}
351+
352+
//==========
353+
// entire computation is branchless (for readability, see version above)
354+
// no intermediary results also showed better performance
355+
356+
// 0 if -
357+
// 1 if +
358+
// convert range [0;1] to [-1;1] for branchless negation when needed
359+
// if there is a 3rd digit (*100), add it, otherwise multiply by 0 to cancel it out
308360
vIsNeg := Ord (FData[J+1] <> '-');
309361

310362
aTemp := (
311363
(Ord(FData[aEnd]) - c0ascii)
312364
+ 10 *(Ord(FData[aEnd-2]) - c0ascii)
313365
+ Ord ((J+4 - vIsNeg < aEnd)) * 100*(Ord(FData[aEnd-3]) - c0ascii)
314366
) * (vIsNeg * 2 - 1);
315-
//if (J+4 - vIsNeg < aEnd) then begin
316-
//aTemp := aTemp
317-
//end;
318-
//aTemp := (vIsNeg * 2 - 1) * aTemp;
319367
end;
320368

321369
//---------------------------------------------------
@@ -350,6 +398,7 @@ procedure TOneBRC.DispatchThreads;
350398
I: TThreadCount;
351399
vRange: Int64;
352400
begin
401+
// distribute input equally across available threads
353402
vRange := Trunc (FDataSize / FThreadCount);
354403

355404
for I := 0 to FThreadCount - 1 do begin
@@ -380,6 +429,7 @@ procedure TOneBRC.ProcessData (aThreadNb: TThreadCount; aStartIdx: Int64; aEndId
380429
vLenStationName: ShortInt;
381430
vFound: Boolean;
382431
begin
432+
// initialize min/max, else we may get zeroes (due to our Add that fires once per station across all threads)
383433
for I := 0 to cNumStations - 1 do begin
384434
FDictionary.FThreadData[aThreadNb][I].Max := -2000;
385435
FDictionary.FThreadData[aThreadNb][I].Min := 2000;
@@ -422,7 +472,6 @@ procedure TOneBRC.ProcessData (aThreadNb: TThreadCount; aStartIdx: Int64; aEndId
422472

423473
FDictionary.InternalFind (vHash, vFound, vHashIdx);
424474

425-
426475
if vFound then begin
427476
vData := @FDictionary.FThreadData[aThreadNb][FDictionary.FIndexes[vHashIdx]];
428477
if vTemp < vData^.Min then
@@ -435,9 +484,11 @@ procedure TOneBRC.ProcessData (aThreadNb: TThreadCount; aStartIdx: Int64; aEndId
435484
else begin
436485
// pre-allocated array of records instead of on-the-go allocation
437486
vHashIdx := FDictionary.AtomicRegisterHash (vHash);
487+
438488
// SetString done only once per station name, for later sorting
439489
SetString(vStation, pAnsiChar(@FData[vLineStart]), vLenStationName);
440490

491+
// data can be safely added at the given index, without locking
441492
FDictionary.Add(vHashIdx, aThreadNb, vTemp, vStation);
442493
end;
443494

@@ -460,6 +511,7 @@ procedure TOneBRC.Merge(aLeft: TThreadCount; aRight: TThreadCount);
460511
vDataL: PStationData;
461512
I: Integer;
462513
begin
514+
// accumulate data into Left
463515
for I := 0 to cNumStations - 1 do begin
464516
vDataR := @FDictionary.FThreadData[aRight][I];
465517
vDataL := @FDictionary.FThreadData[aLeft][I];
@@ -478,6 +530,7 @@ procedure TOneBRC.MergeAll;
478530
var
479531
I: TThreadCount;
480532
begin
533+
// all thread-data is accumulated into index 0
481534
for I := 1 to FThreadCount - 1 do begin
482535
Merge (0, I);
483536
end;
@@ -487,6 +540,8 @@ procedure TOneBRC.MergeAll;
487540

488541
function MyFormatInt (const aIn: SmallInt): AnsiString; inline;
489542
begin
543+
// much faster than FormatFloat
544+
// oddly, IntToStr does not include leading zeroes for both pos and neg numbers
490545
Result := IntToStr(aIn);
491546
Insert ('.', Result, Length(Result));
492547

0 commit comments

Comments
 (0)