Auto merge of #590 - Mark-Simulacrum:improve-errs, r=Mark-Simulacrum

bors · bors · commit 64bc28fb41f5 · 2022-01-08T01:54:08.000Z
Work around the "docker is not running" bug

Currently, if an individual agent reports an error during execution (e.g., docker is not running, or one of its worker threads ended with an error), that job will be marked as failed in its entirety. Particularly as we currently have a transient agent (crater-gcp-1), which is sometimes down due to being a spot instance, this means that it can be hard to complete a Crater job if the GCP-1 instance is killing jobs midway through.

It should be noted that in theory these errors shouldn't happen in the first place. In practice it looks like "docker is not running" is the primary cause of failure -- which is relatively hard to investigate; logs for the relevant time period appear absent. This PR restructures the code which detects docker absence to instead spin until docker *is* up. It looks relatively more difficult to re-organize the crater code to deal well with worker failure, likely by re-assigning the jobs to a live worker, though that is likely a better long-term solution.
diff --git a/.github/workflows/bors.yml b/.github/workflows/bors.yml
@@ -12,8 +12,8 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      - name: Install Rust stable
-        run: rustup update --no-self-update stable && rustup default stable
+      - name: Install Rust nightly
+        run: rustup update nightly && rustup default nightly && rustup component add rustfmt clippy
 
       - name: Check the code formatting with rustfmt
         run: cargo fmt --all -- --check
@@ -31,7 +31,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest]
-        channel: [stable, beta, nightly]
+        channel: [nightly]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v2
@@ -61,8 +61,8 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      - name: Install Rust stable
-        run: rustup update --no-self-update stable && rustup default stable
+      - name: Install Rust nightly
+        run: rustup update --no-self-update nightly && rustup default nightly
 
       - name: Run minicrater
         shell: bash
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
@@ -8,8 +8,8 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      - name: Install Rust Stable
-        run: rustup update stable && rustup default stable
+      - name: Install Rust nightly
+        run: rustup update nightly && rustup default nightly && rustup component add rustfmt clippy
 
       - name: Check the code formatting with rustfmt
         run: cargo fmt --all -- --check
@@ -28,8 +28,8 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      - name: Install Rust Stable
-        run: rustup update stable && rustup default stable
+      - name: Install Rust nightly
+        run: rustup update nightly && rustup default nightly
 
       - name: Build Crater
         run: cargo build
diff --git a/Dockerfile b/Dockerfile
@@ -19,7 +19,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
 # Install the currently pinned toolchain with rustup
 RUN curl https://static.rust-lang.org/rustup/dist/x86_64-unknown-linux-gnu/rustup-init >/tmp/rustup-init && \
     chmod +x /tmp/rustup-init && \
-    /tmp/rustup-init -y --no-modify-path --default-toolchain stable --profile minimal
+    /tmp/rustup-init -y --no-modify-path --default-toolchain nightly --profile minimal
 ENV PATH=/root/.cargo/bin:$PATH
 
 # Build the dependencies in a separate step to avoid rebuilding all of them
diff --git a/rust-toolchain b/rust-toolchain
@@ -1 +1 @@
-stable
+nightly
diff --git a/src/crates/mod.rs b/src/crates/mod.rs
@@ -94,7 +94,7 @@ impl TryFrom<&'_ PackageId> for Crate {
             [_, _, "path", path] => Ok(Crate::Path(path.to_string())),
             [_, _, "git", repo] => {
                 if repo.starts_with("https://github.com") {
-                    Ok(Crate::GitHub(repo.replace("#", "/").parse()?))
+                    Ok(Crate::GitHub(repo.replace('#', "/").parse()?))
                 } else {
                     let mut parts = repo.split('#').rev().collect::<Vec<_>>();
                     let url = parts.pop();
diff --git a/src/db/mod.rs b/src/db/mod.rs
@@ -30,7 +30,7 @@ impl CustomizeConnection<Connection, ::rusqlite::Error> for ConnectionCustomizer
 pub struct Database {
     pool: Pool<SqliteConnectionManager>,
     // The tempfile is stored here to drop it after all the connections are closed
-    tempfile: Option<Arc<NamedTempFile>>,
+    _tempfile: Option<Arc<NamedTempFile>>,
 }
 
 impl Database {
@@ -76,7 +76,7 @@ impl Database {
 
         Ok(Database {
             pool,
-            tempfile: tempfile.map(Arc::new),
+            _tempfile: tempfile.map(Arc::new),
         })
     }
 
diff --git a/src/report/markdown.rs b/src/report/markdown.rs
@@ -66,7 +66,7 @@ fn write_crate(
     let prefix = if is_child { "  * " } else { "* " };
     let status_warning = krate
         .status
-        .map(|status| format!(" ({})", status.to_string()))
+        .map(|status| format!(" ({})", status))
         .unwrap_or_default();
 
     if let ReportConfig::Complete(toolchain) = comparison.report_config() {
@@ -82,7 +82,7 @@ fn write_crate(
             krate.name,
             status_warning,
             krate.url,
-            comparison.to_string(),
+            comparison,
             conj,
             runs[run],
             runs[1],
@@ -92,13 +92,7 @@ fn write_crate(
         writeln!(
             &mut rendered,
             "{}[{}{}]({}) {} [start]({}/log.txt) | [end]({}/log.txt)",
-            prefix,
-            krate.name,
-            status_warning,
-            krate.url,
-            comparison.to_string(),
-            runs[1],
-            runs[3]
+            prefix, krate.name, status_warning, krate.url, comparison, runs[1], runs[3]
         )?;
     };
 
@@ -112,7 +106,7 @@ fn render_markdown(context: &ResultsContext) -> Fallible<String> {
     writeln!(&mut rendered, "# Crater report for {}\n\n", context.ex.name)?;
 
     for (comparison, results) in context.categories.iter() {
-        writeln!(&mut rendered, "\n### {}", comparison.to_string())?;
+        writeln!(&mut rendered, "\n### {}", comparison)?;
         match results {
             ReportCratesMD::Plain(crates) => {
                 for krate in crates {
diff --git a/src/report/mod.rs b/src/report/mod.rs
@@ -235,7 +235,7 @@ pub fn generate_report<DB: ReadResults>(
                     log: crate_to_path_fragment(tc, krate, SanitizationContext::Url)
                         .to_str()
                         .unwrap()
-                        .replace(r"\", "/"), // Normalize paths in reports generated on Windows
+                        .replace('\\', "/"), // Normalize paths in reports generated on Windows
                 })
             });
             // Convert errors to Nones
             // Convert errors to Nones
@@ -290,7 +290,7 @@ fn write_logs<DB: ReadResults, W: ReportWriter>(
             let content = db
                 .load_log(ex, tc, krate)
                 .and_then(|c| c.ok_or_else(|| err_msg("missing logs")))
-                .with_context(|_| format!("failed to read log of {} on {}", krate, tc.to_string()));
+                .with_context(|_| format!("failed to read log of {} on {}", krate, tc));
             let content = match content {
                 Ok(c) => c,
                 Err(e) => {
@@ -991,12 +991,12 @@ mod tests {
             TestResult::BuildFail(FailureReason::Unknown)
         );
         assert_eq!(
-            (&gh_result.runs[0]).as_ref().unwrap().log.as_str(),
-            "stable/gh/brson.hello-rs"
+            Path::new((&gh_result.runs[0]).as_ref().unwrap().log.as_str()),
+            Path::new("stable/gh/brson.hello-rs")
         );
         assert_eq!(
-            (&gh_result.runs[1]).as_ref().unwrap().log.as_str(),
-            "beta/gh/brson.hello-rs"
+            Path::new((&gh_result.runs[1]).as_ref().unwrap().log.as_str()),
+            Path::new("beta/gh/brson.hello-rs")
         );
 
         assert_eq!(reg_result.name.as_str(), "syn-1.0.0");
@@ -1014,12 +1014,12 @@ mod tests {
             TestResult::BuildFail(FailureReason::Unknown)
         );
         assert_eq!(
-            (&reg_result.runs[0]).as_ref().unwrap().log.as_str(),
-            "stable/reg/syn-1.0.0"
+            Path::new((&reg_result.runs[0]).as_ref().unwrap().log.as_str()),
+            Path::new("stable/reg/syn-1.0.0")
         );
         assert_eq!(
-            (&reg_result.runs[1]).as_ref().unwrap().log.as_str(),
-            "beta/reg/syn-1.0.0"
+            Path::new((&reg_result.runs[1]).as_ref().unwrap().log.as_str()),
+            Path::new("beta/reg/syn-1.0.0")
         );
 
         assert_eq!(
diff --git a/src/runner/mod.rs b/src/runner/mod.rs
@@ -56,8 +56,31 @@ pub fn run_ex<DB: WriteResults + Sync>(
     threads_count: usize,
     config: &Config,
 ) -> Fallible<()> {
-    if !rustwide::cmd::docker_running(workspace) {
-        return Err(err_msg("docker is not running"));
+    // Attempt to spin indefinitely until docker is up. Ideally, we would
+    // decommission this agent until docker is up, instead of leaving the
+    // assigned crates to 'hang' until we get our act together. In practice, we
+    // expect workers to be around most of the time (just sometimes being
+    // restarted etc.) and so the assigned crates shouldn't hang for long.
+    //
+    // If we return an Err(...) from this function, then currently that is
+    // treated as a hard failure of the underlying experiment, but this error
+    // has nothing to do with the experiment, so shouldn't be reported as such.
+    //
+    // In the future we'll want to *alert* on this error so that a human can
+    // investigate, but the hope is that in practice docker is just being slow
+    // or similar and this will fix itself, which currently makes the most sense
+    // given low human resources. Additionally, it'll be indirectly alerted
+    // through the worker being "down" according to our progress metrics, since
+    // jobs won't be completed.
+    let mut i = 0;
+    while !rustwide::cmd::docker_running(workspace) {
+        i += 1;
+        log::error!(
+            "docker is not currently up, waiting for it to start (tried {} times)",
+            i
+        );
+        // Pause between probes so a down daemon doesn't flood the error log.
+        std::thread::sleep(std::time::Duration::from_secs(30));
     }
 
     info!("computing the tasks graph...");
diff --git a/src/runner/tasks.rs b/src/runner/tasks.rs
@@ -76,7 +76,7 @@ impl fmt::Debug for TaskStep {
 
         write!(f, "{}", name)?;
         if let Some(tc) = tc {
-            write!(f, " {}", tc.to_string())?;
+            write!(f, " {}", tc)?;
         }
         if quiet {
             write!(f, " (quiet)")?;
diff --git a/src/runner/test.rs b/src/runner/test.rs
@@ -17,9 +17,9 @@ fn failure_reason(err: &Error) -> FailureReason {
     for cause in err.iter_chain() {
         if let Some(&CommandError::SandboxOOM) = cause.downcast_ctx() {
             return FailureReason::OOM;
-        } else if let Some(&CommandError::NoOutputFor(_)) = cause.downcast_ctx() {
-            return FailureReason::Timeout;
-        } else if let Some(&CommandError::Timeout(_)) = cause.downcast_ctx() {
+        } else if let Some(&CommandError::NoOutputFor(_) | &CommandError::Timeout(_)) =
+            cause.downcast_ctx()
+        {
             return FailureReason::Timeout;
         } else if let Some(reason) = cause.downcast_ctx::<FailureReason>() {
             return reason.clone();
diff --git a/src/server/routes/agent.rs b/src/server/routes/agent.rs
@@ -244,9 +244,7 @@ fn handle_results(resp: Fallible<Response<Body>>) -> Response<Body> {
 fn handle_errors(err: Rejection) -> Result<Response<Body>, Rejection> {
     let error = if let Some(compat) = err.find_cause::<Compat<HttpError>>() {
         Some(*compat.get_ref())
-    } else if let StatusCode::NOT_FOUND = err.status() {
-        Some(HttpError::NotFound)
-    } else if let StatusCode::METHOD_NOT_ALLOWED = err.status() {
+    } else if let StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED = err.status() {
         Some(HttpError::NotFound)
     } else {
         None

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ impl CustomizeConnection<Connection, ::rusqlite::Error> for ConnectionCustomizer`
`30`	`30`	`pub struct Database {`
`31`	`31`	`pool: Pool<SqliteConnectionManager>,`
`32`	`32`	`// The tempfile is stored here to drop it after all the connections are closed`
`33`		`- tempfile: Option<Arc<NamedTempFile>>,`
	`33`	`+ _tempfile: Option<Arc<NamedTempFile>>,`
`34`	`34`	`}`
`35`	`35`
`36`	`36`	`impl Database {`
`@@ -76,7 +76,7 @@ impl Database {`
`76`	`76`
`77`	`77`	`Ok(Database {`
`78`	`78`	`pool,`
`79`		`- tempfile: tempfile.map(Arc::new),`
	`79`	`+ _tempfile: tempfile.map(Arc::new),`
`80`	`80`	`})`
`81`	`81`	`}`
`82`	`82`
Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ impl fmt::Debug for TaskStep {`
`76`	`76`
`77`	`77`	`write!(f, "{}", name)?;`
`78`	`78`	`if let Some(tc) = tc {`
`79`		`- write!(f, " {}", tc.to_string())?;`
	`79`	`+ write!(f, " {}", tc)?;`
`80`	`80`	`}`
`81`	`81`	`if quiet {`
`82`	`82`	`write!(f, " (quiet)")?;`