@@ -1196,52 +1196,37 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
11961196
11971197 let mut runs = RunVec :: new ( run_alloc_fn, run_dealloc_fn) ;
11981198
1199- // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
1200- // strange decision, but consider the fact that merges more often go in the opposite direction
1201- // (forwards). According to benchmarks, merging forwards is slightly faster than merging
1202- // backwards. To conclude, identifying runs by traversing backwards improves performance.
1203- let mut end = len;
1204- while end > 0 {
1205- // Find the next natural run, and reverse it if it's strictly descending.
1206- let mut start = end - 1 ;
1207- if start > 0 {
1208- start -= 1 ;
1209-
1210- // SAFETY: The v.get_unchecked must be fed with correct inbound indicies.
1211- unsafe {
1212- if is_less ( v. get_unchecked ( start + 1 ) , v. get_unchecked ( start) ) {
1213- while start > 0 && is_less ( v. get_unchecked ( start) , v. get_unchecked ( start - 1 ) ) {
1214- start -= 1 ;
1215- }
1216- v[ start..end] . reverse ( ) ;
1217- } else {
1218- while start > 0 && !is_less ( v. get_unchecked ( start) , v. get_unchecked ( start - 1 ) )
1219- {
1220- start -= 1 ;
1221- }
1222- }
1223- }
1199+ let mut end = 0 ;
1200+ let mut start = 0 ;
1201+
1202+ // Scan forward. Memory pre-fetching prefers forward scanning vs backwards scanning, and the
1203+ // code-gen is usually better. For the most sensitive types such as integers, these are merged
1204+ // bidirectionally at once. So there is no benefit in scanning backwards.
1205+ while end < len {
1206+ let ( streak_end, was_reversed) = find_streak ( & v[ start..] , is_less) ;
1207+ end += streak_end;
1208+ if was_reversed {
1209+ v[ start..end] . reverse ( ) ;
12241210 }
12251211
12261212 // Insert some more elements into the run if it's too short. Insertion sort is faster than
12271213 // merge sort on short sequences, so this significantly improves performance.
1228- start = provide_sorted_batch ( v, start, end, is_less) ;
1214+ end = provide_sorted_batch ( v, start, end, is_less) ;
12291215
12301216 // Push this run onto the stack.
12311217 runs. push ( TimSortRun { start, len : end - start } ) ;
1232- end = start ;
1218+ start = end ;
12331219
12341220 // Merge some pairs of adjacent runs to satisfy the invariants.
1235- while let Some ( r) = collapse ( runs. as_slice ( ) ) {
1236- let left = runs[ r + 1 ] ;
1237- let right = runs[ r] ;
1238- // SAFETY: `buf_ptr` must hold enough capacity for the shorter of the two sides, and
1239- // neither side may be on length 0.
1221+ while let Some ( r) = collapse ( runs. as_slice ( ) , len) {
1222+ let left = runs[ r] ;
1223+ let right = runs[ r + 1 ] ;
1224+ let merge_slice = & mut v[ left. start ..right. start + right. len ] ;
12401225 unsafe {
1241- merge ( & mut v [ left . start ..right . start + right . len ] , left. len , buf_ptr, is_less) ;
1226+ merge ( merge_slice , left. len , buf_ptr, is_less) ;
12421227 }
1243- runs[ r] = TimSortRun { start : left. start , len : left. len + right. len } ;
1244- runs. remove ( r + 1 ) ;
1228+ runs[ r + 1 ] = TimSortRun { start : left. start , len : left. len + right. len } ;
1229+ runs. remove ( r) ;
12451230 }
12461231 }
12471232
@@ -1263,10 +1248,10 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
12631248 // run starts at index 0, it will always demand a merge operation until the stack is fully
12641249 // collapsed, in order to complete the sort.
12651250 #[ inline]
1266- fn collapse ( runs : & [ TimSortRun ] ) -> Option < usize > {
1251+ fn collapse ( runs : & [ TimSortRun ] , stop : usize ) -> Option < usize > {
12671252 let n = runs. len ( ) ;
12681253 if n >= 2
1269- && ( runs[ n - 1 ] . start == 0
1254+ && ( runs[ n - 1 ] . start + runs [ n - 1 ] . len == stop
12701255 || runs[ n - 2 ] . len <= runs[ n - 1 ] . len
12711256 || ( n >= 3 && runs[ n - 3 ] . len <= runs[ n - 2 ] . len + runs[ n - 1 ] . len )
12721257 || ( n >= 4 && runs[ n - 4 ] . len <= runs[ n - 3 ] . len + runs[ n - 2 ] . len ) )
@@ -1454,33 +1439,70 @@ pub struct TimSortRun {
14541439 start : usize ,
14551440}
14561441
1457- /// Takes a range as denoted by start and end, that is already sorted and extends it to the left if
1442+ /// Takes a range as denoted by start and end, that is already sorted and extends it to the right if
14581443/// necessary with sorts optimized for smaller ranges such as insertion sort.
14591444#[ cfg( not( no_global_oom_handling) ) ]
1460- fn provide_sorted_batch < T , F > ( v : & mut [ T ] , mut start : usize , end : usize , is_less : & mut F ) -> usize
1445+ fn provide_sorted_batch < T , F > ( v : & mut [ T ] , start : usize , mut end : usize , is_less : & mut F ) -> usize
14611446where
14621447 F : FnMut ( & T , & T ) -> bool ,
14631448{
1464- debug_assert ! ( end > start) ;
1449+ let len = v. len ( ) ;
1450+ assert ! ( end >= start && end <= len) ;
14651451
14661452 // This value is a balance between least comparisons and best performance, as
14671453 // influenced by for example cache locality.
14681454 const MIN_INSERTION_RUN : usize = 10 ;
14691455
14701456 // Insert some more elements into the run if it's too short. Insertion sort is faster than
14711457 // merge sort on short sequences, so this significantly improves performance.
1472- let start_found = start;
14731458 let start_end_diff = end - start;
14741459
1475- if start_end_diff < MIN_INSERTION_RUN && start != 0 {
1460+ if start_end_diff < MIN_INSERTION_RUN && end < len {
14761461 // v[start_found..end] are elements that are already sorted in the input. We want to extend
14771462 // the sorted region to the left, so we push up MIN_INSERTION_RUN - 1 to the right. Which is
14781463 // more efficient that trying to push those already sorted elements to the left.
1464+ end = cmp:: min ( start + MIN_INSERTION_RUN , len) ;
1465+ let presorted_start = cmp:: max ( start_end_diff, 1 ) ;
14791466
1480- start = if end >= MIN_INSERTION_RUN { end - MIN_INSERTION_RUN } else { 0 } ;
1467+ insertion_sort_shift_left ( & mut v[ start..end] , presorted_start, is_less) ;
1468+ }
14811469
1482- insertion_sort_shift_right ( & mut v[ start..end] , start_found - start, is_less) ;
1470+ end
1471+ }
1472+
/// Finds a streak of presorted elements starting at the beginning of the slice. Returns the first
/// index that is not part of said streak, and a bool denoting whether the streak was reversed.
/// Streaks can be increasing or decreasing.
fn find_streak<T, F>(v: &[T], is_less: &mut F) -> (usize, bool)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    // Zero or one elements are trivially a non-reversed streak.
    if len < 2 {
        return (len, false);
    }

    // SAFETY: len >= 2 was checked above, so indices 0 and 1 are in bounds.
    let descending = unsafe { is_less(v.get_unchecked(1), v.get_unchecked(0)) };

    // Walk forward while every adjacent pair orders the same way as the first pair.
    let mut end = 2;
    while end < len {
        // SAFETY: `end` starts at 2 and the loop condition keeps it < len, so both `end`
        // and `end - 1` are valid indices.
        let pair_descending = unsafe { is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) };

        if pair_descending != descending {
            break;
        }
        end += 1;
    }

    (end, descending)
}
0 commit comments