@@ -248,7 +248,8 @@ where
248248 }
249249 let size_estimate = mono_item. size_estimate ( cx. tcx ) ;
250250
251- cgu. items_mut ( ) . insert ( mono_item, MonoItemData { linkage, visibility, size_estimate } ) ;
251+ cgu. items_mut ( )
252+ . insert ( mono_item, MonoItemData { inlined : false , linkage, visibility, size_estimate } ) ;
252253
253254 // Get all inlined items that are reachable from `mono_item` without
254255 // going via another root item. This includes drop-glue, functions from
@@ -263,6 +264,7 @@ where
263264 for inlined_item in reachable_inlined_items {
264265 // This is a CGU-private copy.
265266 cgu. items_mut ( ) . entry ( inlined_item) . or_insert_with ( || MonoItemData {
267+ inlined : true ,
266268 linkage : Linkage :: Internal ,
267269 visibility : Visibility :: Default ,
268270 size_estimate : inlined_item. size_estimate ( cx. tcx ) ,
@@ -316,31 +318,83 @@ fn merge_codegen_units<'tcx>(
316318 let mut cgu_contents: FxHashMap < Symbol , Vec < Symbol > > =
317319 codegen_units. iter ( ) . map ( |cgu| ( cgu. name ( ) , vec ! [ cgu. name( ) ] ) ) . collect ( ) ;
318320
321+ // If N is the maximum number of CGUs, and the CGUs are sorted from largest
322+ // to smallest, we repeatedly find which CGU in codegen_units[N..] has the
323+ // greatest overlap of inlined items with codegen_units[N-1], merge that
324+ // CGU into codegen_units[N-1], then re-sort by size and repeat.
325+ //
326+ // We use inlined item overlap to guide this merging because it minimizes
327+ // duplication of inlined items, which makes LLVM run faster and generate
328+ // better and smaller machine code.
329+ //
330+ // Why merge into codegen_units[N-1]? We want CGUs to have similar sizes,
331+ // which means we don't want codegen_units[0..N] (the already big ones)
332+ // getting any bigger, if we can avoid it. When we have more than N CGUs
333+ // then at least one of the biggest N will have to grow. codegen_units[N-1]
334+ // is the smallest of those, and so has the most room to grow.
335+ let max_codegen_units = cx. tcx . sess . codegen_units ( ) . as_usize ( ) ;
336+ while codegen_units. len ( ) > max_codegen_units {
337+ // Sort small CGUs to the back.
338+ codegen_units. sort_by_key ( |cgu| cmp:: Reverse ( cgu. size_estimate ( ) ) ) ;
339+
340+ let cgu_dst = & codegen_units[ max_codegen_units - 1 ] ;
341+
342+ // Find the CGU that overlaps the most with `cgu_dst`. In the case of a
343+ // tie, favour the earlier (bigger) CGU.
344+ let mut max_overlap = 0 ;
345+ let mut max_overlap_i = max_codegen_units;
346+ for ( i, cgu_src) in codegen_units. iter ( ) . enumerate ( ) . skip ( max_codegen_units) {
347+ if cgu_src. size_estimate ( ) <= max_overlap {
348+ // None of the remaining overlaps can exceed `max_overlap`, so
349+ // stop looking.
350+ break ;
351+ }
352+
353+ let overlap = compute_inlined_overlap ( cgu_dst, cgu_src) ;
354+ if overlap > max_overlap {
355+ max_overlap = overlap;
356+ max_overlap_i = i;
357+ }
358+ }
359+
360+ let mut cgu_src = codegen_units. swap_remove ( max_overlap_i) ;
361+ let cgu_dst = & mut codegen_units[ max_codegen_units - 1 ] ;
362+
363+ // Move the items from `cgu_src` to `cgu_dst`. Some of them may be
364+ // duplicate inlined items, in which case the destination CGU is
365+ // unaffected. Recalculate size estimates afterwards.
366+ cgu_dst. items_mut ( ) . extend ( cgu_src. items_mut ( ) . drain ( ) ) ;
367+ cgu_dst. compute_size_estimate ( ) ;
368+
369+ // Record that `cgu_dst` now contains all the stuff that was in
370+ // `cgu_src` before.
371+ let mut consumed_cgu_names = cgu_contents. remove ( & cgu_src. name ( ) ) . unwrap ( ) ;
372+ cgu_contents. get_mut ( & cgu_dst. name ( ) ) . unwrap ( ) . append ( & mut consumed_cgu_names) ;
373+ }
374+
319375 // Having multiple CGUs can drastically speed up compilation. But for
320376 // non-incremental builds, tiny CGUs slow down compilation *and* result in
321377 // worse generated code. So we don't allow CGUs smaller than this (unless
322378 // there is just one CGU, of course). Note that CGU sizes of 100,000+ are
323379 // common in larger programs, so this isn't all that large.
324380 const NON_INCR_MIN_CGU_SIZE : usize = 1800 ;
325381
326- // Repeatedly merge the two smallest codegen units as long as:
327- // - we have more CGUs than the upper limit, or
328- // - (Non-incremental builds only) the user didn't specify a CGU count, and
329- // there are multiple CGUs, and some are below the minimum size.
382+ // Repeatedly merge the two smallest codegen units as long as: it's a
383+ // non-incremental build, and the user didn't specify a CGU count, and
384+ // there are multiple CGUs, and some are below the minimum size.
330385 //
331386 // The "didn't specify a CGU count" condition is because when an explicit
332387 // count is requested we observe it as closely as possible. For example,
333388 // the `compiler_builtins` crate sets `codegen-units = 10000` and it's
334389 // critical they aren't merged. Also, some tests use explicit small values
335390 // and likewise won't work if small CGUs are merged.
336- while codegen_units. len ( ) > cx. tcx . sess . codegen_units ( ) . as_usize ( )
337- || ( cx. tcx . sess . opts . incremental . is_none ( )
338- && matches ! ( cx. tcx. sess. codegen_units( ) , CodegenUnits :: Default ( _) )
339- && codegen_units. len ( ) > 1
340- && codegen_units. iter ( ) . any ( |cgu| cgu. size_estimate ( ) < NON_INCR_MIN_CGU_SIZE ) )
391+ while cx. tcx . sess . opts . incremental . is_none ( )
392+ && matches ! ( cx. tcx. sess. codegen_units( ) , CodegenUnits :: Default ( _) )
393+ && codegen_units. len ( ) > 1
394+ && codegen_units. iter ( ) . any ( |cgu| cgu. size_estimate ( ) < NON_INCR_MIN_CGU_SIZE )
341395 {
342396 // Sort small cgus to the back.
343- codegen_units. sort_by_key ( |cgu| cmp:: Reverse ( cgu. size_estimate ( ) ) ) ;
397+ codegen_units. sort_by_cached_key ( |cgu| cmp:: Reverse ( cgu. size_estimate ( ) ) ) ;
344398
345399 let mut smallest = codegen_units. pop ( ) . unwrap ( ) ;
346400 let second_smallest = codegen_units. last_mut ( ) . unwrap ( ) ;
@@ -351,16 +405,7 @@ fn merge_codegen_units<'tcx>(
351405 second_smallest. items_mut ( ) . extend ( smallest. items_mut ( ) . drain ( ) ) ;
352406 second_smallest. compute_size_estimate ( ) ;
353407
354- // Record that `second_smallest` now contains all the stuff that was
355- // in `smallest` before.
356- let mut consumed_cgu_names = cgu_contents. remove ( & smallest. name ( ) ) . unwrap ( ) ;
357- cgu_contents. get_mut ( & second_smallest. name ( ) ) . unwrap ( ) . append ( & mut consumed_cgu_names) ;
358-
359- debug ! (
360- "CodegenUnit {} merged into CodegenUnit {}" ,
361- smallest. name( ) ,
362- second_smallest. name( )
363- ) ;
408+ // Don't update `cgu_contents`; that's only for incremental builds.
364409 }
365410
366411 let cgu_name_builder = & mut CodegenUnitNameBuilder :: new ( cx. tcx ) ;
@@ -439,6 +484,25 @@ fn merge_codegen_units<'tcx>(
439484 }
440485}
441486
487+ /// Compute the combined size of all inlined items that appear in both `cgu1`
488+ /// and `cgu2`.
489+ fn compute_inlined_overlap < ' tcx > ( cgu1 : & CodegenUnit < ' tcx > , cgu2 : & CodegenUnit < ' tcx > ) -> usize {
490+ // Either order works. We pick the one that involves iterating over fewer
491+ // items.
492+ let ( src_cgu, dst_cgu) =
493+ if cgu1. items ( ) . len ( ) <= cgu2. items ( ) . len ( ) { ( cgu1, cgu2) } else { ( cgu2, cgu1) } ;
494+
495+ let mut overlap = 0 ;
496+ for ( item, data) in src_cgu. items ( ) . iter ( ) {
497+ if data. inlined {
498+ if dst_cgu. items ( ) . contains_key ( item) {
499+ overlap += data. size_estimate ;
500+ }
501+ }
502+ }
503+ overlap
504+ }
505+
442506fn internalize_symbols < ' tcx > (
443507 cx : & PartitioningCx < ' _ , ' tcx > ,
444508 codegen_units : & mut [ CodegenUnit < ' tcx > ] ,
@@ -870,19 +934,16 @@ fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<
870934 all_cgu_sizes. push ( cgu. size_estimate ( ) ) ;
871935
872936 for ( item, data) in cgu. items ( ) {
873- match item. instantiation_mode ( tcx) {
874- InstantiationMode :: GloballyShared { .. } => {
875- root_items += 1 ;
876- root_size += data. size_estimate ;
877- }
878- InstantiationMode :: LocalCopy => {
879- if inlined_items. insert ( item) {
880- unique_inlined_items += 1 ;
881- unique_inlined_size += data. size_estimate ;
882- }
883- placed_inlined_items += 1 ;
884- placed_inlined_size += data. size_estimate ;
937+ if !data. inlined {
938+ root_items += 1 ;
939+ root_size += data. size_estimate ;
940+ } else {
941+ if inlined_items. insert ( item) {
942+ unique_inlined_items += 1 ;
943+ unique_inlined_size += data. size_estimate ;
885944 }
945+ placed_inlined_items += 1 ;
946+ placed_inlined_size += data. size_estimate ;
886947 }
887948 }
888949 }
@@ -937,10 +998,7 @@ fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<
937998 let symbol_name = item. symbol_name ( tcx) . name ;
938999 let symbol_hash_start = symbol_name. rfind ( 'h' ) ;
9391000 let symbol_hash = symbol_hash_start. map_or ( "<no hash>" , |i| & symbol_name[ i..] ) ;
940- let kind = match item. instantiation_mode ( tcx) {
941- InstantiationMode :: GloballyShared { .. } => "root" ,
942- InstantiationMode :: LocalCopy => "inlined" ,
943- } ;
1001+ let kind = if !data. inlined { "root" } else { "inlined" } ;
9441002 let size = data. size_estimate ;
9451003 let _ = with_no_trimmed_paths ! ( writeln!(
9461004 s,
0 commit comments