Skip to content

Commit 29ec867

Browse files
committed
Faster approach with flatter nested structure and SIMD search of leaf elements
1 parent 3e10f18 commit 29ec867

File tree

2 files changed

+85
-45
lines changed

2 files changed

+85
-45
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Advent of Code [![checks-badge]][checks-link] [![docs-badge]][docs-link]
22

33
Blazing fast Rust solutions for every [Advent of Code] puzzle from 2015 to 2024, taking
4-
**502 milliseconds** to solve all 500 stars. Each solution is carefully optimized for performance
4+
**501 milliseconds** to solve all 500 stars. Each solution is carefully optimized for performance
55
while ensuring the code remains concise, readable, and idiomatic.
66

77
## Features
@@ -67,7 +67,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
6767

6868
| Year | [2015](#2015) | [2016](#2016) | [2017](#2017) | [2018](#2018) | [2019](#2019) | [2020](#2020) | [2021](#2021) | [2022](#2022) | [2023](#2023) | [2024](#2024) |
6969
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
70-
| Benchmark (ms) | 15 | 111 | 82 | 35 | 15 | 220 | 8 | 7 | 5 | 4 |
70+
| Benchmark (ms) | 15 | 111 | 82 | 35 | 15 | 220 | 8 | 6 | 5 | 4 |
7171

7272
## 2024
7373

@@ -158,7 +158,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
158158
| 17 | [Pyroclastic Flow](https://adventofcode.com/2022/day/17) | [Source](src/year2022/day17.rs) | 71 |
159159
| 18 | [Boiling Boulders](https://adventofcode.com/2022/day/18) | [Source](src/year2022/day18.rs) | 52 |
160160
| 19 | [Not Enough Minerals](https://adventofcode.com/2022/day/19) | [Source](src/year2022/day19.rs) | 74 |
161-
| 20 | [Grove Positioning System](https://adventofcode.com/2022/day/20) | [Source](src/year2022/day20.rs) | 3785 |
161+
| 20 | [Grove Positioning System](https://adventofcode.com/2022/day/20) | [Source](src/year2022/day20.rs) | 2685 |
162162
| 21 | [Monkey Math](https://adventofcode.com/2022/day/21) | [Source](src/year2022/day21.rs) | 64 |
163163
| 22 | [Monkey Map](https://adventofcode.com/2022/day/22) | [Source](src/year2022/day22.rs) | 61 |
164164
| 23 | [Unstable Diffusion](https://adventofcode.com/2022/day/23) | [Source](src/year2022/day23.rs) | 1521 |

src/year2022/day20.rs

Lines changed: 82 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
//! # Grove Positioning System
22
//!
3-
//! We store the numbers in a triple nested `vec` of `vec` of `vec`. The initial size of each
4-
//! vector is ∛5000 ~= 17, so that numbers are spread as evenly as possible.
3+
//! We store the numbers in an array of `vec`s. The initial size of each vector is 20
4+
//! so that numbers are spread as evenly as possible.
55
//!
6-
//! Numbes are stored in the leaf `vec`s. This greatly reduces the time to insert, remove and find
6+
//! Using multiple leaf `vec`s greatly reduces the time to insert, remove and find
77
//! numbers, compared to storing all numbers in a single flat `vec`. Some further optimizations:
88
//! * The first and second level indices of a number change only when it moves, so these can be
99
//! stored in a lookup array for fast access.
@@ -14,7 +14,17 @@
1414
//! commit history) that used an [order statistic tree](https://en.wikipedia.org/wiki/Order_statistic_tree),
1515
//! although perhaps adding [balancing rotations](https://en.wikipedia.org/wiki/Tree_rotation)
1616
//! to the tree would make it faster.
17+
//!
18+
//! Leaf `vec`s are padded to a length that is a multiple of 64 to speed up searching for numbers. A SIMD variant
19+
//! can search for 64 numbers simultaneously.
1720
use crate::util::parse::*;
21+
use std::array::from_fn;
22+
use std::iter::repeat_n;
23+
24+
struct PaddedVec {
25+
size: usize,
26+
vec: Vec<u16>,
27+
}
1828

1929
pub fn parse(input: &str) -> Vec<i64> {
2030
input.iter_signed().collect()
@@ -32,72 +42,70 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
3242
// Important nuance, size is one less because we don't consider the moving number.
3343
let size = input.len() - 1;
3444
// Another nuance, input contains duplicate numbers, so use index to refer to each number uniquely.
35-
let indices: Vec<_> = (0..input.len()).collect();
45+
let indices: Vec<_> = (0..input.len() as u16).collect();
3646
// Pre-process the numbers, converting any negative indices to positive indices that will wrap.
3747
// For example, -1 becomes 4998.
3848
let numbers: Vec<_> =
39-
input.iter().map(|n| (n * key).rem_euclid(size as i64) as usize).collect();
40-
41-
// Store first and second level indices.
42-
let mut lookup = Vec::new();
43-
// Triple nested vec of numbers.
44-
let mut mixed = Vec::new();
45-
// Size of each first level element for convenience.
46-
let mut skip = Vec::new();
47-
48-
// Break 5000 numbers into roughly equals chunks at each level. 289 = 17 * 17.
49-
for first in indices.chunks(289) {
50-
let mut outer = Vec::new();
51-
52-
for second in first.chunks(17) {
53-
// Initial first and second level indices.
54-
(0..second.len()).for_each(|_| lookup.push((mixed.len(), outer.len())));
55-
56-
// Leave some extra room, as mixing won't balance evenly.
57-
let mut inner = Vec::with_capacity(100);
58-
inner.extend_from_slice(second);
59-
60-
outer.push(inner);
61-
}
62-
63-
mixed.push(outer);
64-
skip.push(first.len());
49+
input.iter().map(|&n| (n * key).rem_euclid(size as i64) as usize).collect();
50+
// Store location of each number within `mixed` for faster lookup.
51+
let mut lookup = Vec::with_capacity(input.len());
52+
// Total number of elements in each group of 16 leaf `vec`s for faster lookup.
53+
let mut skip = [0; 16];
54+
// Break 5000 numbers into roughly equal chunks.
55+
let mut mixed: [_; 256] = from_fn(|_| PaddedVec { size: 0, vec: Vec::with_capacity(128) });
56+
57+
for (second, slice) in indices.chunks(input.len().div_ceil(256)).enumerate() {
58+
let size = slice.len();
59+
60+
mixed[second].size = size;
61+
mixed[second].vec.resize(size.next_multiple_of(64), 0);
62+
mixed[second].vec[..size].copy_from_slice(slice);
63+
64+
lookup.extend(repeat_n(second, size));
65+
skip[second / 16] += size;
6566
}
6667

6768
for _ in 0..rounds {
6869
'mix: for index in 0..input.len() {
6970
// Quickly find the leaf vector storing the number.
7071
let number = numbers[index];
71-
let (first, second) = lookup[index];
72+
let second = lookup[index];
73+
let first = second / 16;
74+
7275
// Third level changes as other numbers are added and removed,
7376
// so needs to be checked each time.
74-
let third = mixed[first][second].iter().position(|&i| i == index).unwrap();
77+
let third = position(&mixed[second], index as u16);
7578

7679
// Find the offset of the number by adding the size of all previous `vec`s.
7780
let position = third
7881
+ skip[..first].iter().sum::<usize>()
79-
+ mixed[first][..second].iter().map(Vec::len).sum::<usize>();
82+
+ mixed[16 * first..second].iter().map(|v| v.size).sum::<usize>();
8083
// Update our position, wrapping around if necessary.
8184
let mut next = (position + number) % size;
8285

8386
// Remove number from current leaf vector, also updating the first level size.
84-
mixed[first][second].remove(third);
87+
mixed[second].size -= 1;
88+
mixed[second].vec.remove(third);
89+
mixed[second].vec.push(0);
8590
skip[first] -= 1;
8691

8792
// Find our new destination, by checking `vec`s in order until the total elements
8893
// are greater than our new index.
89-
for (first, outer) in mixed.iter_mut().enumerate() {
94+
for (first, outer) in mixed.chunks_exact_mut(16).enumerate() {
9095
if next > skip[first] {
9196
next -= skip[first];
9297
} else {
9398
for (second, inner) in outer.iter_mut().enumerate() {
94-
if next > inner.len() {
95-
next -= inner.len();
99+
if next > inner.size {
100+
next -= inner.size;
96101
} else {
97102
// Insert number into its new home.
98-
inner.insert(next, index);
103+
inner.size += 1;
104+
inner.vec.insert(next, index as u16);
105+
inner.vec.resize(inner.size.next_multiple_of(64), 0);
106+
// Update location.
99107
skip[first] += 1;
100-
lookup[index] = (first, second);
108+
lookup[index] = 16 * first + second;
101109
continue 'mix;
102110
}
103111
}
@@ -106,12 +114,44 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
106114
}
107115
}
108116

109-
let indices: Vec<_> = mixed.into_iter().flatten().flatten().collect();
110-
let zeroth = indices.iter().position(|&i| input[i] == 0).unwrap();
117+
let indices: Vec<_> =
118+
mixed.into_iter().flat_map(|pv| pv.vec.into_iter().take(pv.size)).collect();
119+
let zeroth = indices.iter().position(|&i| input[i as usize] == 0).unwrap();
111120

112121
[1000, 2000, 3000]
113122
.iter()
114123
.map(|offset| (zeroth + offset) % indices.len())
115-
.map(|index| input[indices[index]] * key)
124+
.map(|index| input[indices[index] as usize] * key)
116125
.sum()
117126
}
127+
128+
/// The compiler optimizes the position search when the size of the chunk is known.
129+
#[cfg(not(feature = "simd"))]
130+
#[inline]
131+
fn position(haystack: &PaddedVec, needle: u16) -> usize {
132+
for (base, slice) in haystack.vec.chunks_exact(64).enumerate() {
133+
if let Some(offset) = slice.iter().position(|&i| i == needle) {
134+
return 64 * base + offset;
135+
}
136+
}
137+
138+
unreachable!()
139+
}
140+
141+
/// Search 64 lanes simultaneously.
142+
#[cfg(feature = "simd")]
143+
#[inline]
144+
fn position(haystack: &PaddedVec, needle: u16) -> usize {
145+
use std::simd::cmp::SimdPartialEq as _;
146+
use std::simd::*;
147+
148+
for (base, slice) in haystack.vec.chunks_exact(64).enumerate() {
149+
if let Some(offset) =
150+
Simd::<u16, 64>::from_slice(slice).simd_eq(Simd::splat(needle)).first_set()
151+
{
152+
return 64 * base + offset;
153+
}
154+
}
155+
156+
unreachable!()
157+
}

0 commit comments

Comments
 (0)