Faster approach with flatter nested structure and SIMD search of leaf elements

maneatingape · maneatingape · commit 29ec867567f7 · 2025-10-19T14:23:39.000+01:00
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 # Advent of Code [![checks-badge]][checks-link] [![docs-badge]][docs-link]
 
 Blazing fast Rust solutions for every [Advent of Code] puzzle from 2015 to 2024, taking
-**502 milliseconds** to solve all 500 stars. Each solution is carefully optimized for performance
+**501 milliseconds** to solve all 500 stars. Each solution is carefully optimized for performance
 while ensuring the code remains concise, readable, and idiomatic.
 
 ## Features
@@ -67,7 +67,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 
 | Year | [2015](#2015) | [2016](#2016) | [2017](#2017) | [2018](#2018) | [2019](#2019) | [2020](#2020) | [2021](#2021) | [2022](#2022) | [2023](#2023) | [2024](#2024) |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| Benchmark (ms) | 15 | 111 | 82 | 35 | 15 | 220 | 8 | 7 | 5 | 4 |
+| Benchmark (ms) | 15 | 111 | 82 | 35 | 15 | 220 | 8 | 6 | 5 | 4 |
 
 ## 2024
 
@@ -158,7 +158,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 | 17 | [Pyroclastic Flow](https://adventofcode.com/2022/day/17) | [Source](src/year2022/day17.rs) | 71 |
 | 18 | [Boiling Boulders](https://adventofcode.com/2022/day/18) | [Source](src/year2022/day18.rs) | 52 |
 | 19 | [Not Enough Minerals](https://adventofcode.com/2022/day/19) | [Source](src/year2022/day19.rs) | 74 |
-| 20 | [Grove Positioning System](https://adventofcode.com/2022/day/20) | [Source](src/year2022/day20.rs) | 3785 |
+| 20 | [Grove Positioning System](https://adventofcode.com/2022/day/20) | [Source](src/year2022/day20.rs) | 2685 |
 | 21 | [Monkey Math](https://adventofcode.com/2022/day/21) | [Source](src/year2022/day21.rs) | 64 |
 | 22 | [Monkey Map](https://adventofcode.com/2022/day/22) | [Source](src/year2022/day22.rs) | 61 |
 | 23 | [Unstable Diffusion](https://adventofcode.com/2022/day/23) | [Source](src/year2022/day23.rs) | 1521 |
diff --git a/src/year2022/day20.rs b/src/year2022/day20.rs
@@ -1,9 +1,9 @@
 //! # Grove Positioning System
 //!
-//! We store the numbers in a triple nested `vec` of `vec` of `vec`. The initial size of each
-//! vector is ∛5000 ~= 17, so that numbers are spread as evenly as possible.
+//! We store the numbers in an array of `vec`s. The initial size of each vector is 20
+//! so that numbers are spread as evenly as possible.
 //!
-//! Numbes are stored in the leaf `vec`s. This greatly reduces the time to insert, remove and find
+//! Using multiple leaf `vec`s greatly reduces the time to insert, remove and find
 //! numbers, compared to storing all numbers in a single flat `vec`. Some further optimizations:
 //! * The first and second level indices of a number change only when it moves, so these can be
 //!   stored in a lookup array for fast access.
@@ -14,7 +14,17 @@
 //! commit history) that used an [order statistic tree](https://en.wikipedia.org/wiki/Order_statistic_tree),
 //! although perhaps adding [balancing rotations](https://en.wikipedia.org/wiki/Tree_rotation)
 //! to the tree would make it faster.
+//!
+//! Leaf `vec`s are padded to a multiple of 64 elements to speed up searching for numbers. A SIMD
+//! variant can compare 64 elements against the target simultaneously.
 use crate::util::parse::*;
+use std::array::from_fn;
+use std::iter::repeat_n;
+
+struct PaddedVec {
+    size: usize,
+    vec: Vec<u16>,
+}
 
 pub fn parse(input: &str) -> Vec<i64> {
     input.iter_signed().collect()
@@ -32,72 +42,70 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
     // Important nuance, size is one less because we don't consider the moving number.
     let size = input.len() - 1;
     // Another nuance, input contain duplicate numbers, so use index to refer to each number uniquely.
-    let indices: Vec<_> = (0..input.len()).collect();
+    let indices: Vec<_> = (0..input.len() as u16).collect();
     // Pre-process the numbers, coverting any negative indices to positive indices that will wrap.
     // For example, -1 becomes 4998.
     let numbers: Vec<_> =
-        input.iter().map(|n| (n * key).rem_euclid(size as i64) as usize).collect();
-
-    // Store first and second level indices.
-    let mut lookup = Vec::new();
-    // Triple nested vec of numbers.
-    let mut mixed = Vec::new();
-    // Size of each first level element for convenience.
-    let mut skip = Vec::new();
-
-    // Break 5000 numbers into roughly equals chunks at each level. 289 = 17 * 17.
-    for first in indices.chunks(289) {
-        let mut outer = Vec::new();
-
-        for second in first.chunks(17) {
-            // Initial first and second level indices.
-            (0..second.len()).for_each(|_| lookup.push((mixed.len(), outer.len())));
-
-            // Leave some extra room, as mixing won't balance evenly.
-            let mut inner = Vec::with_capacity(100);
-            inner.extend_from_slice(second);
-
-            outer.push(inner);
-        }
-
-        mixed.push(outer);
-        skip.push(first.len());
+        input.iter().map(|&n| (n * key).rem_euclid(size as i64) as usize).collect();
+    // Store location of each number within `mixed` for faster lookup.
+    let mut lookup = Vec::with_capacity(input.len());
+    // Total count of numbers in each group of 16 consecutive `vec`s for faster lookup.
+    let mut skip = [0; 16];
+    // Break 5000 numbers into roughly equal chunks.
+    let mut mixed: [_; 256] = from_fn(|_| PaddedVec { size: 0, vec: Vec::with_capacity(128) });
+
+    for (second, slice) in indices.chunks(input.len().div_ceil(256)).enumerate() {
+        let size = slice.len();
+
+        mixed[second].size = size;
+        mixed[second].vec.resize(size.next_multiple_of(64), 0);
+        mixed[second].vec[..size].copy_from_slice(slice);
+
+        lookup.extend(repeat_n(second, size));
+        skip[second / 16] += size;
     }
 
     for _ in 0..rounds {
         'mix: for index in 0..input.len() {
             // Quickly find the leaf vector storing the number.
             let number = numbers[index];
-            let (first, second) = lookup[index];
+            let second = lookup[index];
+            let first = second / 16;
+
             // Third level changes as other numbers are added and removed,
             // so needs to be checked each time.
-            let third = mixed[first][second].iter().position(|&i| i == index).unwrap();
+            let third = position(&mixed[second], index as u16);
 
             // Find the offset of the number by adding the size of all previous `vec`s.
             let position = third
                 + skip[..first].iter().sum::<usize>()
-                + mixed[first][..second].iter().map(Vec::len).sum::<usize>();
+                + mixed[16 * first..second].iter().map(|v| v.size).sum::<usize>();
             // Update our position, wrapping around if necessary.
             let mut next = (position + number) % size;
 
             // Remove number from current leaf vector, also updating the first level size.
-            mixed[first][second].remove(third);
+            mixed[second].size -= 1;
+            mixed[second].vec.remove(third);
+            mixed[second].vec.push(0);
             skip[first] -= 1;
 
             // Find our new destination, by checking `vec`s in order until the total elements
             // are greater than our new index.
-            for (first, outer) in mixed.iter_mut().enumerate() {
+            for (first, outer) in mixed.chunks_exact_mut(16).enumerate() {
                 if next > skip[first] {
                     next -= skip[first];
                 } else {
                     for (second, inner) in outer.iter_mut().enumerate() {
-                        if next > inner.len() {
-                            next -= inner.len();
+                        if next > inner.size {
+                            next -= inner.size;
                         } else {
                             // Insert number into its new home.
-                            inner.insert(next, index);
+                            inner.size += 1;
+                            inner.vec.insert(next, index as u16);
+                            inner.vec.resize(inner.size.next_multiple_of(64), 0);
+                            // Update location.
                             skip[first] += 1;
-                            lookup[index] = (first, second);
+                            lookup[index] = 16 * first + second;
                             continue 'mix;
                         }
                     }
@@ -106,12 +114,44 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
         }
     }
 
-    let indices: Vec<_> = mixed.into_iter().flatten().flatten().collect();
-    let zeroth = indices.iter().position(|&i| input[i] == 0).unwrap();
+    let indices: Vec<_> =
+        mixed.into_iter().flat_map(|pv| pv.vec.into_iter().take(pv.size)).collect();
+    let zeroth = indices.iter().position(|&i| input[i as usize] == 0).unwrap();
 
     [1000, 2000, 3000]
         .iter()
         .map(|offset| (zeroth + offset) % indices.len())
-        .map(|index| input[indices[index]] * key)
+        .map(|index| input[indices[index] as usize] * key)
         .sum()
 }
+
+/// The compiler optimizes the position search when the size of the chunk is known.
+#[cfg(not(feature = "simd"))]
+#[inline]
+fn position(haystack: &PaddedVec, needle: u16) -> usize {
+    for (base, slice) in haystack.vec.chunks_exact(64).enumerate() {
+        if let Some(offset) = slice.iter().position(|&i| i == needle) {
+            return 64 * base + offset;
+        }
+    }
+
+    unreachable!()
+}
+
+/// Search 64 lanes simultaneously.
+#[cfg(feature = "simd")]
+#[inline]
+fn position(haystack: &PaddedVec, needle: u16) -> usize {
+    use std::simd::cmp::SimdPartialEq as _;
+    use std::simd::*;
+
+    for (base, slice) in haystack.vec.chunks_exact(64).enumerate() {
+        if let Some(offset) =
+            Simd::<u16, 64>::from_slice(slice).simd_eq(Simd::splat(needle)).first_set()
+        {
+            return 64 * base + offset;
+        }
+    }
+
+    unreachable!()
+}