@@ -822,90 +822,6 @@ where
822822 merge_sort ( v, & mut is_less) ;
823823}
824824
825- // Sort a small number of elements as fast as possible, without allocations.
826- #[ cfg( not( no_global_oom_handling) ) ]
827- fn stable_sort_small < T , F > ( v : & mut [ T ] , is_less : & mut F )
828- where
829- F : FnMut ( & T , & T ) -> bool ,
830- {
831- let len = v. len ( ) ;
832-
833- // This implementation is really not fit for anything beyond that, and the call is probably a
834- // bug.
835- debug_assert ! ( len <= 40 ) ;
836-
837- if len < 2 {
838- return ;
839- }
840-
841- // It's not clear that using custom code for specific sizes is worth it here.
842- // So we go with the simpler code.
843- let offset = if len <= 6 || !qualifies_for_branchless_sort :: < T > ( ) {
844- 1
845- } else {
846- // Once a certain threshold is reached, it becomes worth it to analyze the input and do
847- // branchless swapping for the first 5 elements.
848-
849- // SAFETY: We just checked that len >= 5
850- unsafe {
851- let arr_ptr = v. as_mut_ptr ( ) ;
852-
853- let should_swap_0_1 = is_less ( & * arr_ptr. add ( 1 ) , & * arr_ptr. add ( 0 ) ) ;
854- let should_swap_1_2 = is_less ( & * arr_ptr. add ( 2 ) , & * arr_ptr. add ( 1 ) ) ;
855- let should_swap_2_3 = is_less ( & * arr_ptr. add ( 3 ) , & * arr_ptr. add ( 2 ) ) ;
856- let should_swap_3_4 = is_less ( & * arr_ptr. add ( 4 ) , & * arr_ptr. add ( 3 ) ) ;
857-
858- let swap_count = should_swap_0_1 as usize
859- + should_swap_1_2 as usize
860- + should_swap_2_3 as usize
861- + should_swap_3_4 as usize ;
862-
863- if swap_count == 0 {
864- // Potentially already sorted. No need to swap, we know the first 5 elements are
865- // already in the right order.
866- 5
867- } else if swap_count == 4 {
868- // Potentially reversed.
869- let mut rev_i = 4 ;
870- while rev_i < ( len - 1 ) {
871- if !is_less ( & * arr_ptr. add ( rev_i + 1 ) , & * arr_ptr. add ( rev_i) ) {
872- break ;
873- }
874- rev_i += 1 ;
875- }
876- rev_i += 1 ;
877- v[ ..rev_i] . reverse ( ) ;
878- insertion_sort_shift_left ( v, rev_i, is_less) ;
879- return ;
880- } else {
881- // Potentially random pattern.
882- branchless_swap ( arr_ptr. add ( 0 ) , arr_ptr. add ( 1 ) , should_swap_0_1) ;
883- branchless_swap ( arr_ptr. add ( 2 ) , arr_ptr. add ( 3 ) , should_swap_2_3) ;
884-
885- if len >= 12 {
886- // This aims to find a good balance between generating more code, which is bad
887- // for cold loops and improving hot code while not increasing mean comparison
888- // count too much.
889- sort8_stable ( & mut v[ 4 ..12 ] , is_less) ;
890- insertion_sort_shift_left ( & mut v[ 4 ..] , 8 , is_less) ;
891- insertion_sort_shift_right ( v, 4 , is_less) ;
892- return ;
893- } else {
894- // Complete the sort network for the first 4 elements.
895- swap_next_if_less ( arr_ptr. add ( 1 ) , is_less) ;
896- swap_next_if_less ( arr_ptr. add ( 2 ) , is_less) ;
897- swap_next_if_less ( arr_ptr. add ( 0 ) , is_less) ;
898- swap_next_if_less ( arr_ptr. add ( 1 ) , is_less) ;
899-
900- 4
901- }
902- }
903- }
904- } ;
905-
906- insertion_sort_shift_left ( v, offset, is_less) ;
907- }
908-
909825#[ cfg( not( no_global_oom_handling) ) ]
910826fn merge_sort < T , F > ( v : & mut [ T ] , is_less : & mut F )
911827where
@@ -918,12 +834,7 @@ where
918834
919835 let len = v. len ( ) ;
920836
921- // Slices of up to this length get sorted using insertion sort.
922- const MAX_NO_ALLOC_SIZE : usize = 20 ;
923-
924- // Short arrays get sorted in-place via insertion sort to avoid allocations.
925- if len <= MAX_NO_ALLOC_SIZE {
926- stable_sort_small ( v, is_less) ;
837+ if len < 2 {
927838 return ;
928839 }
929840
@@ -963,6 +874,11 @@ where
963874 // return without allocating.
964875 return ;
965876 } else if buf_ptr. is_null ( ) {
877+ // Short arrays get sorted in-place via insertion sort to avoid allocations.
878+ if sort_small_stable ( v, start, is_less) {
879+ return ;
880+ }
881+
966882 // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
967883 // shallow copies of the contents of `v` without risking the dtors running on copies if
968884 // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the
@@ -1016,11 +932,7 @@ where
1016932 || ( n >= 3 && runs[ n - 3 ] . len <= runs[ n - 2 ] . len + runs[ n - 1 ] . len )
1017933 || ( n >= 4 && runs[ n - 4 ] . len <= runs[ n - 3 ] . len + runs[ n - 2 ] . len ) )
1018934 {
1019- if n >= 3 && runs[ n - 3 ] . len < runs[ n - 1 ] . len {
1020- Some ( n - 3 )
1021- } else {
1022- Some ( n - 2 )
1023- }
935+ if n >= 3 && runs[ n - 3 ] . len < runs[ n - 1 ] . len { Some ( n - 3 ) } else { Some ( n - 2 ) }
1024936 } else {
1025937 None
1026938 }
@@ -1033,6 +945,67 @@ where
1033945 }
1034946}
1035947
948+ /// Sorts `v` in place and returns `true` if `v` is short enough for the small-sort path; otherwise leaves `v` untouched and returns `false`.
949+ /// `v[start..]` is assumed to be already sorted.
950+ #[ cfg( not( no_global_oom_handling) ) ]
951+ fn sort_small_stable < T , F > ( v : & mut [ T ] , start : usize , is_less : & mut F ) -> bool
952+ where
953+ F : FnMut ( & T , & T ) -> bool ,
954+ {
955+ let len = v. len ( ) ;
956+
957+ if qualifies_for_branchless_sort :: < T > ( ) {
958+ // Testing showed that even though this incurs more comparisons, up to size 32 (4 * 8),
959+ // avoiding the allocation and sticking with simple code is worth it. Going further, e.g. to
960+ // 40, is still worth it for u64 or even types with more expensive comparisons, but risks
961+ // incurring more comparisons than the regular TimSort would.
962+ const MAX_NO_ALLOC_SIZE : usize = 32 ;
963+ if len <= MAX_NO_ALLOC_SIZE {
964+ if len < 8 {
965+ insertion_sort_shift_right ( v, start, is_less) ;
966+ return true ;
967+ }
968+
969+ let mut merge_count = 0 ;
970+ for chunk in v. chunks_exact_mut ( 8 ) {
971+ // SAFETY: chunks_exact_mut guarantees that every chunk has exactly 8 elements.
972+ unsafe {
973+ sort8_stable ( chunk, is_less) ;
974+ }
975+ merge_count += 1 ;
976+ }
977+
978+ let mut swap = mem:: MaybeUninit :: < [ T ; 8 ] > :: uninit ( ) ;
979+ let swap_ptr = swap. as_mut_ptr ( ) as * mut T ;
980+
981+ let mut i = 8 ;
982+ while merge_count > 1 {
983+ // SAFETY: `mid` is `i`, a non-zero multiple of 8, so the left side holds 8 or more
984+ // elements and the right side `v[i..i + 8]` holds exactly 8. Both sides are non-empty
985+ // and the smaller side is always exactly 8 long, the size of `swap`. `i + 8 <= len`
986+ // holds because `merge_count` counts the full 8-element chunks of `v`.
987+ unsafe {
988+ merge ( & mut v[ 0 ..( i + 8 ) ] , i, swap_ptr, is_less) ;
989+ }
990+ i += 8 ;
991+ merge_count -= 1 ;
992+ }
993+
994+ insertion_sort_shift_left ( v, i, is_less) ;
995+
996+ return true ;
997+ }
998+ } else {
999+ const MAX_NO_ALLOC_SIZE : usize = 20 ;
1000+ if len <= MAX_NO_ALLOC_SIZE {
1001+ insertion_sort_shift_right ( v, start, is_less) ;
1002+ return true ;
1003+ }
1004+ }
1005+
1006+ false
1007+ }
1008+
10361009/// Takes a range as denoted by start and end, that is already sorted and extends it if necessary
10371010/// with sorts optimized for smaller ranges such as insertion sort.
10381011#[ cfg( not( no_global_oom_handling) ) ]
@@ -1042,8 +1015,7 @@ where
10421015{
10431016 debug_assert ! ( end > start) ;
10441017
1045- // Testing showed that using MAX_INSERTION here yields the best performance for many types, but
1046- // incurs more total comparisons. A balance between least comparisons and best performance, as
1018+ // This value is a balance between least comparisons and best performance, as
10471019 // influenced by for example cache locality.
10481020 const MIN_INSERTION_RUN : usize = 10 ;
10491021
@@ -1115,6 +1087,7 @@ impl<T> Drop for InsertionHole<T> {
11151087
11161088/// Inserts `v[v.len() - 1]` into pre-sorted sequence `v[..v.len() - 1]` so that whole `v[..]`
11171089/// becomes sorted.
1090+ #[ cfg( not( no_global_oom_handling) ) ]
11181091unsafe fn insert_tail < T , F > ( v : & mut [ T ] , is_less : & mut F )
11191092where
11201093 F : FnMut ( & T , & T ) -> bool ,
@@ -1167,11 +1140,12 @@ where
11671140 }
11681141}
11691142
1170- /// Sort v assuming v[..offset] is already sorted.
1143+ /// Sort `v` assuming ` v[..offset]` is already sorted.
11711144///
11721145/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
11731146/// performance impact. Even improving performance in some cases.
11741147#[ inline( never) ]
1148+ #[ cfg( not( no_global_oom_handling) ) ]
11751149fn insertion_sort_shift_left < T , F > ( v : & mut [ T ] , offset : usize , is_less : & mut F )
11761150where
11771151 F : FnMut ( & T , & T ) -> bool ,
@@ -1195,11 +1169,12 @@ where
11951169 }
11961170}
11971171
1198- /// Sort v assuming v[offset..] is already sorted.
1172+ /// Sort `v` assuming ` v[offset..]` is already sorted.
11991173///
12001174/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
12011175/// performance impact. Even improving performance in some cases.
12021176#[ inline( never) ]
1177+ #[ cfg( not( no_global_oom_handling) ) ]
12031178fn insertion_sort_shift_right < T , F > ( v : & mut [ T ] , offset : usize , is_less : & mut F )
12041179where
12051180 F : FnMut ( & T , & T ) -> bool ,
@@ -1227,6 +1202,7 @@ where
12271202/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
12281203///
12291204/// This is the integral subroutine of insertion sort.
1205+ #[ cfg( not( no_global_oom_handling) ) ]
12301206unsafe fn insert_head < T , F > ( v : & mut [ T ] , is_less : & mut F )
12311207where
12321208 F : FnMut ( & T , & T ) -> bool ,
@@ -1287,6 +1263,10 @@ where
12871263///
12881264/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
12891265/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
1266+ ///
1267+ /// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
1268+ /// performance impact.
1269+ #[ inline( never) ]
12901270#[ cfg( not( no_global_oom_handling) ) ]
12911271unsafe fn merge < T , F > ( v : & mut [ T ] , mid : usize , buf : * mut T , is_less : & mut F )
12921272where
@@ -1506,6 +1486,7 @@ where
15061486/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
15071487/// performance impact.
15081488#[ inline( never) ]
1489+ #[ cfg( not( no_global_oom_handling) ) ]
15091490unsafe fn sort8_stable < T , F > ( v : & mut [ T ] , is_less : & mut F )
15101491where
15111492 F : FnMut ( & T , & T ) -> bool ,
@@ -1559,6 +1540,7 @@ where
15591540 }
15601541}
15611542
1543+ #[ cfg( not( no_global_oom_handling) ) ]
15621544unsafe fn sort24_stable < T , F > ( v : & mut [ T ] , is_less : & mut F )
15631545where
15641546 F : FnMut ( & T , & T ) -> bool ,
0 commit comments