11//! # Grove Positioning System
22//!
3- //! We store the numbers in a triple nested `vec` of `vec` of `vec` . The initial size of each
4- //! vector is ∛5000 ~= 17, so that numbers are spread as evenly as possible.
3+ //! We store the numbers in an array of `vec`s . The initial size of each vector is 20
4+ //! so that numbers are spread as evenly as possible.
55//!
6- //! Numbes are stored in the leaf `vec`s. This greatly reduces the time to insert, remove and find
6+ //! Using multiple leaf `vec`s greatly reduces the time to insert, remove and find
77//! numbers, compared to storing all numbers in a single flat `vec`. Some further optimizations:
88//! * The first and second level indices of a number change only when it moves, so these can be
99//! stored in a lookup array for fast access.
1414//! commit history) that used an [order statistic tree](https://en.wikipedia.org/wiki/Order_statistic_tree),
1515//! although perhaps adding [balancing rotations](https://en.wikipedia.org/wiki/Tree_rotation)
1616//! to the tree would make it faster.
17+ //!
18+ //! Leaf `vec`s are padded to a multiple of 64 elements to speed up searching for numbers. A SIMD
19+ //! variant can compare 64 elements simultaneously.
1720use crate :: util:: parse:: * ;
21+ use std:: array:: from_fn;
22+ use std:: iter:: repeat_n;
23+
/// A leaf vector whose backing storage is zero-padded to a multiple of 64 elements.
///
/// Keeping the storage length a multiple of 64 lets `position` scan it in fixed-size
/// chunks of 64.
#[derive(Debug, Clone)]
struct PaddedVec {
    /// Number of live entries; only `vec[..size]` holds real indices.
    size: usize,
    /// Live entries followed by `0` padding up to the next multiple of 64.
    vec: Vec<u16>,
}
1828
1929pub fn parse ( input : & str ) -> Vec < i64 > {
2030 input. iter_signed ( ) . collect ( )
@@ -32,72 +42,70 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
3242 // Important nuance, size is one less because we don't consider the moving number.
3343 let size = input. len ( ) - 1 ;
3444 // Another nuance, the input contains duplicate numbers, so use index to refer to each number uniquely.
35- let indices: Vec < _ > = ( 0 ..input. len ( ) ) . collect ( ) ;
45+ let indices: Vec < _ > = ( 0 ..input. len ( ) as u16 ) . collect ( ) ;
3646 // Pre-process the numbers, converting any negative indices to positive indices that will wrap.
3747 // For example, -1 becomes 4998.
3848 let numbers: Vec < _ > =
39- input. iter ( ) . map ( |n| ( n * key) . rem_euclid ( size as i64 ) as usize ) . collect ( ) ;
40-
41- // Store first and second level indices.
42- let mut lookup = Vec :: new ( ) ;
43- // Triple nested vec of numbers.
44- let mut mixed = Vec :: new ( ) ;
45- // Size of each first level element for convenience.
46- let mut skip = Vec :: new ( ) ;
47-
48- // Break 5000 numbers into roughly equals chunks at each level. 289 = 17 * 17.
49- for first in indices. chunks ( 289 ) {
50- let mut outer = Vec :: new ( ) ;
51-
52- for second in first. chunks ( 17 ) {
53- // Initial first and second level indices.
54- ( 0 ..second. len ( ) ) . for_each ( |_| lookup. push ( ( mixed. len ( ) , outer. len ( ) ) ) ) ;
55-
56- // Leave some extra room, as mixing won't balance evenly.
57- let mut inner = Vec :: with_capacity ( 100 ) ;
58- inner. extend_from_slice ( second) ;
59-
60- outer. push ( inner) ;
61- }
62-
63- mixed. push ( outer) ;
64- skip. push ( first. len ( ) ) ;
49+ input. iter ( ) . map ( |& n| ( n * key) . rem_euclid ( size as i64 ) as usize ) . collect ( ) ;
50+ // Store location of each number within `mixed` for faster lookup.
51+ let mut lookup = Vec :: with_capacity ( input. len ( ) ) ;
52+ // Total number of elements in each group of 16 leaf `vec`s for faster lookup.
53+ let mut skip = [ 0 ; 16 ] ;
54+ // Break 5000 numbers into roughly equal chunks.
55+ let mut mixed: [ _ ; 256 ] = from_fn ( |_| PaddedVec { size : 0 , vec : Vec :: with_capacity ( 128 ) } ) ;
56+
57+ for ( second, slice) in indices. chunks ( input. len ( ) . div_ceil ( 256 ) ) . enumerate ( ) {
58+ let size = slice. len ( ) ;
59+
60+ mixed[ second] . size = size;
61+ mixed[ second] . vec . resize ( size. next_multiple_of ( 64 ) , 0 ) ;
62+ mixed[ second] . vec [ ..size] . copy_from_slice ( slice) ;
63+
64+ lookup. extend ( repeat_n ( second, size) ) ;
65+ skip[ second / 16 ] += size;
6566 }
6667
6768 for _ in 0 ..rounds {
6869 ' mix: for index in 0 ..input. len ( ) {
6970 // Quickly find the leaf vector storing the number.
7071 let number = numbers[ index] ;
71- let ( first, second) = lookup[ index] ;
72+ let second = lookup[ index] ;
73+ let first = second / 16 ;
74+
7275 // Third level changes as other numbers are added and removed,
7376 // so needs to be checked each time.
74- let third = mixed[ first ] [ second] . iter ( ) . position ( | & i| i == index ) . unwrap ( ) ;
77+ let third = position ( & mixed[ second] , index as u16 ) ;
7578
7679 // Find the offset of the number by adding the size of all previous `vec`s.
7780 let position = third
7881 + skip[ ..first] . iter ( ) . sum :: < usize > ( )
79- + mixed[ first] [ ..second] . iter ( ) . map ( Vec :: len ) . sum :: < usize > ( ) ;
82+ + mixed[ 16 * first..second] . iter ( ) . map ( |v| v . size ) . sum :: < usize > ( ) ;
8083 // Update our position, wrapping around if necessary.
8184 let mut next = ( position + number) % size;
8285
8386 // Remove number from current leaf vector, also updating the first level size.
84- mixed[ first] [ second] . remove ( third) ;
87+ mixed[ second] . size -= 1 ;
88+ mixed[ second] . vec . remove ( third) ;
89+ mixed[ second] . vec . push ( 0 ) ;
8590 skip[ first] -= 1 ;
8691
8792 // Find our new destination, by checking `vec`s in order until the total elements
8893 // are greater than our new index.
89- for ( first, outer) in mixed. iter_mut ( ) . enumerate ( ) {
94+ for ( first, outer) in mixed. chunks_exact_mut ( 16 ) . enumerate ( ) {
9095 if next > skip[ first] {
9196 next -= skip[ first] ;
9297 } else {
9398 for ( second, inner) in outer. iter_mut ( ) . enumerate ( ) {
94- if next > inner. len ( ) {
95- next -= inner. len ( ) ;
99+ if next > inner. size {
100+ next -= inner. size ;
96101 } else {
97102 // Insert number into its new home.
98- inner. insert ( next, index) ;
103+ inner. size += 1 ;
104+ inner. vec . insert ( next, index as u16 ) ;
105+ inner. vec . resize ( inner. size . next_multiple_of ( 64 ) , 0 ) ;
106+ // Update location.
99107 skip[ first] += 1 ;
100- lookup[ index] = ( first, second) ;
108+ lookup[ index] = 16 * first + second;
101109 continue ' mix;
102110 }
103111 }
@@ -106,12 +114,44 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
106114 }
107115 }
108116
109- let indices: Vec < _ > = mixed. into_iter ( ) . flatten ( ) . flatten ( ) . collect ( ) ;
110- let zeroth = indices. iter ( ) . position ( |& i| input[ i] == 0 ) . unwrap ( ) ;
117+ let indices: Vec < _ > =
118+ mixed. into_iter ( ) . flat_map ( |pv| pv. vec . into_iter ( ) . take ( pv. size ) ) . collect ( ) ;
119+ let zeroth = indices. iter ( ) . position ( |& i| input[ i as usize ] == 0 ) . unwrap ( ) ;
111120
112121 [ 1000 , 2000 , 3000 ]
113122 . iter ( )
114123 . map ( |offset| ( zeroth + offset) % indices. len ( ) )
115- . map ( |index| input[ indices[ index] ] * key)
124+ . map ( |index| input[ indices[ index] as usize ] * key)
116125 . sum ( )
117126}
127+
128+ /// The compiler optimizes the position search when the size of the chunk is known.
129+ #[ cfg( not( feature = "simd" ) ) ]
130+ #[ inline]
131+ fn position ( haystack : & PaddedVec , needle : u16 ) -> usize {
132+ for ( base, slice) in haystack. vec . chunks_exact ( 64 ) . enumerate ( ) {
133+ if let Some ( offset) = slice. iter ( ) . position ( |& i| i == needle) {
134+ return 64 * base + offset;
135+ }
136+ }
137+
138+ unreachable ! ( )
139+ }
140+
/// Search 64 lanes simultaneously.
///
/// Each chunk of 64 elements of `haystack.vec` is loaded into a 64-lane vector and
/// compared against `needle` in one operation. Returns the offset of the first matching
/// element within the whole vector.
///
/// NOTE(review): `std::simd` is an unstable API, so this variant presumably needs a
/// nightly toolchain with `portable_simd` enabled at the crate root. Confirm.
///
/// # Panics
///
/// Panics if `needle` does not occur in any full chunk of the storage.
#[cfg(feature = "simd")]
#[inline]
fn position(haystack: &PaddedVec, needle: u16) -> usize {
    use std::simd::cmp::SimdPartialEq as _;
    use std::simd::*;

    // Broadcast the needle once instead of once per chunk.
    let target = Simd::<u16, 64>::splat(needle);

    for (base, slice) in haystack.vec.chunks_exact(64).enumerate() {
        if let Some(offset) = Simd::<u16, 64>::from_slice(slice).simd_eq(target).first_set() {
            // Convert the lane index back to an offset into the whole vector.
            return 64 * base + offset;
        }
    }

    unreachable!("needle must be present in its leaf vec")
}
0 commit comments