Merge branch 'main' of https://github.com/EagleAglow/1brc-ObjectPascal

68412542+EagleAglow@users.noreply.github.com · 68412542+EagleAglow@users.noreply.github.com · commit f493b84f8577 · 2024-04-24T10:54:37.000-07:00
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ Grab all your threads, reach out to SIMD,  or pull any other trick, and create t
 <img src="img/1brc.png" alt="1BRC" style="display: block; margin-left: auto; margin-right: auto; margin-bottom:1em; width: 50%;">
 </p>
 
-The text file contains temperature values for a range of weather stations. Each row is one measurement in the format `<string: station name>;<double: measurement>`, with the measurement value having exactly one fractional digit.
+The text file contains temperature values for a range of weather stations. Each row is one measurement in the format `<string: station name>;<double: measurement>`, with the measurement value having exactly one fractional digit. Rows are separated by a single line feed character, LF (ASCII 10), for consistency with the original challenge - and no longer by CR+LF (ASCII 13+10).
 The following shows ten rows as an example:
 
 ```
@@ -119,7 +119,7 @@ C:> CertUtil -hashfile .\data\measurements.txt SHA256
 Get-FileHash .\data\measurements.txt -Algorithm SHA256
 ```
 Expected `SHA256` hash:
-`ebad17b266ee9f5cb3d118531f197e6f68c9ab988abc5cb9506e6257e1a52ce6`
+`2b48bc2fa0b82d748925a820f43f75df01cc06df7447c7571e52d3962e675960`
 
 ## Verify Output File
 
diff --git a/entries/abouchez/src/brcmormot.lpr b/entries/abouchez/src/brcmormot.lpr
@@ -111,7 +111,7 @@ procedure ParseLine(var chunk: TBrcChunk); nostackframe; assembler;
          setne    dl
          mov      eax, dword ptr [r8 + 1]   // eax = xx.x or x.x
          shl      cl, 3
-         lea      r8, [r8 + rdx + 6]        // r8 = next line
+         lea      r8, [r8 + rdx + 5]        // r8 = next line (LF only)
          shl      eax, cl                   // normalized as xx.x
          and      eax, $0f000f0f            // from ascii to digit
          imul     rax, rax, 1 + 10 shl 16 + 100 shl 24
@@ -156,14 +156,14 @@ function CompareMem(a, b: PAnsiChar; l: PtrInt): boolean; nostackframe; assemble
         cmp     rax, rdx
         jl      @by1
         align   8
-@by8:   mov     rcx, qword ptr [rdi + rdx] // branchless for l>=8
+@by8:   mov     rcx, qword ptr [rdi + rdx] // branchless for 8..16 bytes
         cmp     rcx, qword ptr [rsi + rdx]
         jne     @set
         sub     rdx, rax
         jz      @ok
         cmp     rax, rdx
-        jge     @by8
-        mov     rcx, qword ptr [rdi + rax] // may overlap
+        jg      @by8
+        mov     rcx, qword ptr [rdi + rax] // compare last 8 bytes - may overlap
         cmp     rcx, qword ptr [rsi + rax]
 @set:   sete    al
         ret
@@ -194,7 +194,7 @@ procedure ParseLine(var chunk: TBrcChunk); inline;
   // branchless parsing of the temperature
   neg := ord(p[1] <> '-') * 2 - 1;         // neg = +1 or -1
   inc(p, ord(p[1] = '-'));                 // ignore '-' sign
-  chunk.Start := @p[ord(p[2] <> '.') + 6]; // next line
+  chunk.Start := @p[ord(p[2] <> '.') + 5]; // next line (LF only)
   chunk.Value := PtrInt(cardinal((QWord((PCardinal(p + 1)^ shl
                    (byte(ord(p[2] = '.') shl 3))) and $0f000f0f) *
          (1 + 10 shl 16 + 100 shl 24)) shr 24) and cardinal(1023)) * neg;
@@ -216,7 +216,7 @@ function CompareMem(a, b: PAnsiChar; l: PtrInt): boolean;
       dec(l, ptrsiz);
       if l = 0 then
         exit
-      else if l <= ptrsiz then
+      else if l < ptrsiz then
         continue;
       result := PPtrUInt(@a[ptrsiz])^ = PPtrUInt(@b[ptrsiz])^; // may overlap
       exit;
@@ -246,7 +246,7 @@ constructor TBrcThread.Create(owner: TBrcMain);
   FreeOnTerminate := true;
   SetLength(fStation, fOwner.fMax);
   InterlockedIncrement(fOwner.fRunning);
-  inherited Create({suspended=}false);
+  inherited Create({suspended=}false, {stacksize=}16384);
 end;
 
 
@@ -274,7 +274,7 @@ constructor TBrcMain.Create(const fn: TFileName; threads, chunkmb, max: integer;
   if not fullsearch then
     SetLength(fNameHash, fMax);
   SetLength(fNameLine, fMax);
-  // we tried pre-loading a first chunk here but it was not faster
+  // (we tried pre-loading a first chunk here but it was not faster)
   // run the thread workers
   core := 0;
   cores := SystemInfo.dwNumberOfProcessors;
diff --git a/entries/abouchez/src/brcmormotfullcheck.lpr b/entries/abouchez/src/brcmormotfullcheck.lpr
@@ -117,7 +117,7 @@ procedure ParseLine(var chunk: TBrcChunk); nostackframe; assembler;
          setne    dl
          mov      eax, dword ptr [r8 + 1]   // eax = xx.x or x.x
          shl      cl, 3
-         lea      r8, [r8 + rdx + 6]        // r8 = next line
+         lea      r8, [r8 + rdx + 5]        // r8 = next line
          shl      eax, cl                   // normalized as xx.x
          and      eax, $0f000f0f            // from ascii to digit
          imul     rax, rax, 1 + 10 shl 16 + 100 shl 24
@@ -208,7 +208,7 @@ procedure ParseLine(var chunk: TBrcChunk); inline;
   // branchless parsing of the temperature
   neg := ord(p[1] <> '-') * 2 - 1;         // neg = +1 or -1
   inc(p, ord(p[1] = '-'));                 // ignore '-' sign
-  chunk.Start := @p[ord(p[2] <> '.') + 6]; // next line
+  chunk.Start := @p[ord(p[2] <> '.') + 5]; // next line
   chunk.Value := PtrInt(cardinal((QWord((PCardinal(p + 1)^ shl
                    (byte(ord(p[2] = '.') shl 3))) and $0f000f0f) *
          (1 + 10 shl 16 + 100 shl 24)) shr 24) and cardinal(1023)) * neg;
diff --git a/entries/abouchez/src/brcmormotold.lpr b/entries/abouchez/src/brcmormotold.lpr
@@ -306,12 +306,12 @@ procedure TBrcThread.Execute;
       begin
         // note: the PCardinal(p)^ + "shr and $ff" trick is actually slower
         v := (p[0] * 100 + p[1] * 10 + p[3] - (ord('0') * 111)) * neg;
-        p := @p[6]; // also jump ending $13/$10
+        p := @p[5]; // also jump ending #10 (LF only)
       end
       else
       begin
         v := (p[0] * 10 + p[2] - (ord('0') * 11)) * neg; // x.x
-        p := @p[5];
+        p := @p[4];
       end;
       // store the value
       {$ifdef CUSTOMHASH}
diff --git a/entries/abouchez/src/brcmormotpertheadht.lpr b/entries/abouchez/src/brcmormotpertheadht.lpr
@@ -112,7 +112,7 @@ procedure ParseLine(var chunk: TBrcChunk); nostackframe; assembler;
          setne    dl
          mov      eax, dword ptr [r8 + 1]   // eax = xx.x or x.x
          shl      cl, 3
-         lea      r8, [r8 + rdx + 6]        // r8 = next line
+         lea      r8, [r8 + rdx + 5]        // r8 = next line
          shl      eax, cl                   // normalized as xx.x
          and      eax, $0f000f0f            // from ascii to digit
          imul     rax, rax, 1 + 10 shl 16 + 100 shl 24
@@ -164,7 +164,7 @@ procedure ParseLine(var chunk: TBrcChunk); inline;
   // branchless parsing of the temperature
   neg := ord(p[1] <> '-') * 2 - 1;         // neg = +1 or -1
   inc(p, ord(p[1] = '-'));                 // ignore '-' sign
-  chunk.Start := @p[ord(p[2] <> '.') + 6]; // next line
+  chunk.Start := @p[ord(p[2] <> '.') + 5]; // next line
   chunk.Value := PtrInt(cardinal((QWord((PCardinal(p + 1)^ shl
                    (byte(ord(p[2] = '.') shl 3))) and $0f000f0f) *
          (1 + 10 shl 16 + 100 shl 24)) shr 24) and cardinal(1023)) * neg;
diff --git a/entries/abouchez/src/brcmormotsharedht.lpr b/entries/abouchez/src/brcmormotsharedht.lpr
@@ -411,12 +411,12 @@ procedure TBrcThread.Execute;
       begin
         // note: the PCardinal(p)^ + "shr and $ff" trick is actually slower
         v := (p[0] * 100 + p[1] * 10 + p[3] - (ord('0') * 111)) * neg;
-        p := @p[6]; // also jump ending $13/$10
+        p := @p[5]; // also jump ending #10 (LF only, no #13 any more)
       end
       else
       begin
         v := (p[0] * 10 + p[2] - (ord('0') * 11)) * neg; // x.x
-        p := @p[5];
+        p := @p[4];
       end;
       // store the value
       if s^.Count = 0 then
diff --git a/entries/ghatem-fpc/README.md b/entries/ghatem-fpc/README.md
@@ -224,3 +224,12 @@ I implemented my own Dictionary class consisting of two arrays. We compute the m
 
 edit:
 quadratic probing improved performance even further. we could probably do better with 2-level hashing, but finding such a hash function is going to take a lot of trials, this is probably acceptable results
+
+**ACTUAL TIMING (busy machine): ~4 seconds as per gcarreno**
+
+## v.4 (2024-04-24)
+
+a few performance improvements, and measurements as per gcarreno on a busy machine:
+ - using mORMot's `crc32c` function instead of the native `crc32`, time dropped to 3.8 seconds
+ - I had removed my pre-allocated records implementation. restored it in the custom dictionary class, time dropped to 3.2 seconds
+ - skipping a few chars that we don't need to bother with, no timing yet
diff --git a/entries/ghatem-fpc/src/OneBRCproj.lpi b/entries/ghatem-fpc/src/OneBRCproj.lpi
@@ -45,7 +45,6 @@
           <Linking>
             <Debugging>
               <DebugInfoType Value="dsDwarf3"/>
-              <UseHeaptrc Value="True"/>
               <TrashVariables Value="True"/>
               <UseValgrind Value="True"/>
               <UseExternalDbgSyms Value="True"/>
diff --git a/entries/ghatem-fpc/src/onebrc.pas b/entries/ghatem-fpc/src/onebrc.pas
@@ -318,7 +318,7 @@ procedure TOneBRC.ProcessData (aThreadNb: UInt16; aStartIdx: Int64; aEndIdx: Int
   // the given ending point might be in the middle of a line:
   // find the beginning of that line (the last block works well)
   while True do begin
-    if FData[aEndIdx] <> #13 then
+    if FData[aEndIdx] <> #10 then
       Dec (aEndIdx)
     else
       break;
@@ -328,7 +328,7 @@ procedure TOneBRC.ProcessData (aThreadNb: UInt16; aStartIdx: Int64; aEndIdx: Int
   vLineStart := i;
 
   while i < aEndIdx do begin
-    if FData[i] = #13 then begin
+    if FData[i] = #10 then begin
       // new line parsed, process its contents
       ExtractLineData (vLineStart, i - 1, vLenStationName, vTemp);
 
@@ -360,8 +360,18 @@ procedure TOneBRC.ProcessData (aThreadNb: UInt16; aStartIdx: Int64; aEndIdx: Int
         FStationsDicts[aThreadNb].Add (vHash, vData);
       end;
 
-      // next char is #10, so we can skip 2 instead of 1
-      vLineStart := i+2;
+      // we're at a #10: next line starts at the next index
+      vLineStart := i+1;
+
+      // we're at a #10: the shortest possible next line holds
+      // - 2 chars for the name (min)
+      // - 1 semicolon
+      // - 3 chars for the temp (min)
+      // i.e. 6 chars, so the next #10 cannot appear before index i+7:
+      // jump straight there (this already includes the usual Inc (i))
+      // and bypass the Inc (i) at the bottom of the loop
+      Inc (i, 7);
+      continue;
     end;
 
     Inc (i);
diff --git a/entries/hgrosser/README.md b/entries/hgrosser/README.md
@@ -3,24 +3,26 @@
 **1 billion row Challenge entry**
 
 ## Version
-Version 1.51
+Version 1.60
 
 ## How to compile
 The program was developed with FPC 3.2.2 and Lazarus 2.2.4
 
+There is a new Conditional "noCR", which is necessary if the input file has no CRs
+
 ## How to start
 ```
-Usage:   <path to input file> <bit-width for hash-list (14..28)>
-Example: hgrosser measurements.txt 15
+Usage:   <path to input file> <bit-width for hash-list (16..28)>
+Example: hgrosser measurements.txt 16
  - bit-width for hash-list: sets the size of the hash list, e.g. '16' => 65536 entries
 ```
-There are no switches like `-i` etc, only values.
+There are no switches like `-i` etc, only 2 values.
 
 ### Optimizing the 2nd command line parameter
 
-In theory the program should run faster with greater bit-widths for the hash-list (because of less collisions), but on my own computer (8 GB RAM) in praxis a small value of 15 is the fastest way, allthough this causes many collisions.
+In theory the program should run faster with greater bit-widths for the hash-list (because of fewer collisions), but on my own computer (8 GB RAM) in practice a small value of 16 is the fastest, although this causes many collisions.
 
-Please (if possible) try all values from 14 to 24 (maybe in a for-loop). Thanks a lot.
+Please (if possible) try all values from 16 to 22 (maybe in a for-loop). Thanks a lot.
 
 ## How the program works
 The Program works with 1 thread.
@@ -37,3 +39,4 @@ To speed things up:
 - Version 1.00: initial version
 - Version 1.50: hash-list optimized, small improvements in parsing the file
 - Version 1.51: small improvements in asm function
+- Version 1.60: hash-list optimized, some minor improvements, Conditional "noCR" added
diff --git a/entries/hgrosser/src/1brc.pas b/entries/hgrosser/src/1brc.pas
diff --git a/generator/Lazarus/src/generator.lpr b/generator/Lazarus/src/generator.lpr