Introduce a minimum CGU size in non-incremental builds.

This is because tiny CGUs make compilation less efficient *and* result in worse generated code.

We don't do this when the number of CGUs is explicitly given, because there are times when the requested number is very important, as described in some comments within the commit. So the commit also introduces a `CodegenUnits` type that distinguishes between default values and user-specified values.

This change has a roughly neutral effect on walltimes across the rustc-perf benchmarks; there are some speedups and some slowdowns. But it has significant wins for most other metrics on numerous benchmarks, including instruction counts, cycles, binary size, and max-rss. It also reduces parallelism, which is good for reducing jobserver competition when multiple rustc processes are running at the same time. It's smaller benchmarks that benefit the most; larger benchmarks already have CGUs that are all larger than the minimum size.

Here are some example before/after CGU sizes for opt builds. For each crate, the first entry is before this change and the second entry is after it.

- html5ever
  - CGUs: 16, mean size: 1196.1, sizes: [3908, 2992, 1706, 1652, 1572, 1136, 1045, 948, 946, 938, 579, 471, 443, 327, 286, 189]
  - CGUs: 4, mean size: 4396.0, sizes: [6706, 3908, 3490, 3480]
- libc
  - CGUs: 12, mean size: 35.3, sizes: [163, 93, 58, 53, 37, 8, 2 (x6)]
  - CGUs: 1, mean size: 424.0, sizes: [424]
- tt-muncher
  - CGUs: 5, mean size: 1819.4, sizes: [8508, 350, 198, 34, 7]
  - CGUs: 1, mean size: 9075.0, sizes: [9075]

Note that CGUs of size 100,000+ aren't unusual in larger programs.
tgross35 · Jun 14, 2023 · 7c3ce02 · 7c3ce02
1 parent 95d8589
commit 7c3ce02
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 18 deletions.
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
@@ -1385,7 +1385,7 @@ fn vcall_visibility_metadata<'ll, 'tcx>(
     let trait_def_id = trait_ref_self.def_id();
     let trait_vis = cx.tcx.visibility(trait_def_id);
 
-    let cgus = cx.sess().codegen_units();
+    let cgus = cx.sess().codegen_units().as_usize();
     let single_cgu = cgus == 1;
 
     let lto = cx.sess().lto();

diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs
@@ -646,10 +646,10 @@ fn produce_final_output_artifacts(
         // rlib.
         let needs_crate_object = crate_output.outputs.contains_key(&OutputType::Exe);
 
-        let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units() > 1;
+        let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units().as_usize() > 1;
 
         let keep_numbered_objects =
-            needs_crate_object || (user_wants_objects && sess.codegen_units() > 1);
+            needs_crate_object || (user_wants_objects && sess.codegen_units().as_usize() > 1);
 
         for module in compiled_modules.modules.iter() {
             if let Some(ref path) = module.object {
@@ -1923,7 +1923,7 @@ impl<B: ExtraBackendMethods> OngoingCodegen<B> {
 
         // FIXME: time_llvm_passes support - does this use a global context or
         // something?
-        if sess.codegen_units() == 1 && sess.opts.unstable_opts.time_llvm_passes {
+        if sess.codegen_units().as_usize() == 1 && sess.opts.unstable_opts.time_llvm_passes {
             self.backend.print_pass_timings()
         }
 

diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -113,6 +113,7 @@ use rustc_middle::query::Providers;
 use rustc_middle::ty::print::{characteristic_def_id_of_type, with_no_trimmed_paths};
 use rustc_middle::ty::{self, visit::TypeVisitableExt, InstanceDef, TyCtxt};
 use rustc_session::config::{DumpMonoStatsFormat, SwitchWithOptPath};
+use rustc_session::CodegenUnits;
 use rustc_span::symbol::Symbol;
 
 use crate::collector::UsageMap;
@@ -322,7 +323,7 @@ fn merge_codegen_units<'tcx>(
     cx: &PartitioningCx<'_, 'tcx>,
     codegen_units: &mut Vec<CodegenUnit<'tcx>>,
 ) {
-    assert!(cx.tcx.sess.codegen_units() >= 1);
+    assert!(cx.tcx.sess.codegen_units().as_usize() >= 1);
 
     // A sorted order here ensures merging is deterministic.
     assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
@@ -331,11 +332,32 @@ fn merge_codegen_units<'tcx>(
     let mut cgu_contents: FxHashMap<Symbol, Vec<Symbol>> =
         codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect();
 
-    // Merge the two smallest codegen units until the target size is
-    // reached.
-    while codegen_units.len() > cx.tcx.sess.codegen_units() {
-        // Sort small cgus to the back
+    // Having multiple CGUs can drastically speed up compilation. But for
+    // non-incremental builds, tiny CGUs slow down compilation *and* result in
+    // worse generated code. So we don't allow CGUs smaller than this (unless
+    // there is just one CGU, of course). Note that CGU sizes of 100,000+ are
+    // common in larger programs, so this isn't all that large.
+    const NON_INCR_MIN_CGU_SIZE: usize = 1000;
+
+    // Repeatedly merge the two smallest codegen units as long as:
+    // - we have more CGUs than the upper limit, or
+    // - (Non-incremental builds only) the user didn't specify a CGU count, and
+    //   there are multiple CGUs, and some are below the minimum size.
+    //
+    // The "didn't specify a CGU count" condition is because when an explicit
+    // count is requested we observe it as closely as possible. For example,
+    // the `compiler_builtins` crate sets `codegen-units = 10000` and it's
+    // critical they aren't merged. Also, some tests use explicit small values
+    // and likewise won't work if small CGUs are merged.
+    while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
+        || (cx.tcx.sess.opts.incremental.is_none()
+            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
+            && codegen_units.len() > 1
+            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
+    {
+        // Sort small cgus to the back.
         codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
+
         let mut smallest = codegen_units.pop().unwrap();
         let second_smallest = codegen_units.last_mut().unwrap();
 
@@ -918,9 +940,13 @@ fn debug_dump<'a, 'tcx: 'a>(
                 let symbol_hash_start = symbol_name.rfind('h');
                 let symbol_hash = symbol_hash_start.map_or("<no hash>", |i| &symbol_name[i..]);
                 let size = item.size_estimate(tcx);
+                let kind = match item.instantiation_mode(tcx) {
+                    InstantiationMode::GloballyShared { .. } => "root",
+                    InstantiationMode::LocalCopy => "inlined",
+                };
                 let _ = with_no_trimmed_paths!(writeln!(
                     s,
-                    "  - {item} [{linkage:?}] [{symbol_hash}] (size={size})"
+                    "  - {item} [{linkage:?}] [{symbol_hash}] ({kind}, size: {size})"
                 ));
             }
 

diff --git a/compiler/rustc_session/src/session.rs b/compiler/rustc_session/src/session.rs
@@ -234,6 +234,27 @@ pub enum MetadataKind {
     Compressed,
 }
 
+#[derive(Clone, Copy)]
+pub enum CodegenUnits {
+    /// Specified by the user. In this case we try fairly hard to produce the
+    /// number of CGUs requested.
+    User(usize),
+
+    /// A default value, i.e. not specified by the user. In this case we take
+    /// more liberties about CGU formation, e.g. avoid producing very small
+    /// CGUs.
+    Default(usize),
+}
+
+impl CodegenUnits {
+    pub fn as_usize(self) -> usize {
+        match self {
+            CodegenUnits::User(n) => n,
+            CodegenUnits::Default(n) => n,
+        }
+    }
+}
+
 impl Session {
     pub fn miri_unleashed_feature(&self, span: Span, feature_gate: Option<Symbol>) {
         self.miri_unleashed_features.lock().push((span, feature_gate));
@@ -1104,7 +1125,7 @@ impl Session {
 
         // If there's only one codegen unit and LTO isn't enabled then there's
         // no need for ThinLTO so just return false.
-        if self.codegen_units() == 1 {
+        if self.codegen_units().as_usize() == 1 {
             return config::Lto::No;
         }
 
@@ -1206,19 +1227,19 @@ impl Session {
 
     /// Returns the number of codegen units that should be used for this
     /// compilation
-    pub fn codegen_units(&self) -> usize {
+    pub fn codegen_units(&self) -> CodegenUnits {
         if let Some(n) = self.opts.cli_forced_codegen_units {
-            return n;
+            return CodegenUnits::User(n);
         }
         if let Some(n) = self.target.default_codegen_units {
-            return n as usize;
+            return CodegenUnits::Default(n as usize);
         }
 
         // If incremental compilation is turned on, we default to a high number
         // codegen units in order to reduce the "collateral damage" small
         // changes cause.
         if self.opts.incremental.is_some() {
-            return 256;
+            return CodegenUnits::Default(256);
         }
 
         // Why is 16 codegen units the default all the time?
@@ -1271,7 +1292,7 @@ impl Session {
         // As a result 16 was chosen here! Mostly because it was a power of 2
         // and most benchmarks agreed it was roughly a local optimum. Not very
         // scientific.
-        16
+        CodegenUnits::Default(16)
     }
 
     pub fn teach(&self, code: &DiagnosticId) -> bool {

diff --git a/src/doc/rustc/src/codegen-options/index.md b/src/doc/rustc/src/codegen-options/index.md
@@ -31,8 +31,8 @@ Supported values can also be discovered by running `rustc --print code-models`.
 
 ## codegen-units
 
-This flag controls how many code generation units the crate is split into. It
-takes an integer greater than 0.
+This flag controls the maximum number of code generation units the crate is
+split into. It takes an integer greater than 0.
 
 When a crate is split into multiple codegen units, LLVM is able to process
 them in parallel. Increasing parallelism may speed up compile times, but may