From c35ab255c74831305c7fde604be4311d3308062d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 4 Nov 2025 15:56:18 -0800 Subject: [PATCH 01/18] blerrghhh --- nexus/fm/Cargo.toml | 12 ++++++++++++ nexus/fm/src/lib.rs | 13 +++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 nexus/fm/Cargo.toml create mode 100644 nexus/fm/src/lib.rs diff --git a/nexus/fm/Cargo.toml b/nexus/fm/Cargo.toml new file mode 100644 index 00000000000..f2a575198e0 --- /dev/null +++ b/nexus/fm/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "nexus-fm" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +nexus-types.workspace = true + +omicron-workspace-hack.workspace = true diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs new file mode 100644 index 00000000000..18a986cdbc7 --- /dev/null +++ b/nexus/fm/src/lib.rs @@ -0,0 +1,13 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Fault management + +use nexus_types::fm; +use nexus_types::inventory; + +pub struct DiagnosisInput<'a> { + inventory: &'a inventory::Collection, + parent_sitrep: Option<&'a fm::Sitrep>, +} From d41781e0beec08311a08719d2995f58d7cedbee2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Nov 2025 09:20:46 -0800 Subject: [PATCH 02/18] well here's something --- Cargo.lock | 10 ++++++++++ Cargo.toml | 1 + nexus/fm/Cargo.toml | 2 ++ nexus/fm/src/lib.rs | 31 ++++++++++++++++++++++++++++++- 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index cd8cb6ce274..cf8d3c66b92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6871,6 +6871,16 @@ dependencies = [ "tufaceous-artifact", ] +[[package]] +name = "nexus-fm" +version = "0.1.0" +dependencies = [ + "chrono", + "nexus-types", + "omicron-uuid-kinds", + "omicron-workspace-hack", +] + [[package]] name = "nexus-internal-api" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 0aa02a2e81c..a78cd83afab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,6 +87,7 @@ members = [ "nexus/db-schema", "nexus/defaults", "nexus/external-api", + "nexus/fm", "nexus/internal-api", "nexus/inventory", "nexus/lockstep-api", diff --git a/nexus/fm/Cargo.toml b/nexus/fm/Cargo.toml index f2a575198e0..cd890597077 100644 --- a/nexus/fm/Cargo.toml +++ b/nexus/fm/Cargo.toml @@ -8,5 +8,7 @@ workspace = true [dependencies] nexus-types.workspace = true +chrono.workspace = true +omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index 18a986cdbc7..ee2c24345ae 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -6,8 +6,37 @@ use nexus_types::fm; use nexus_types::inventory; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SitrepUuid; -pub struct DiagnosisInput<'a> { +#[derive(Debug)] +pub struct SitrepBuilder<'a> { inventory: &'a inventory::Collection, parent_sitrep: Option<&'a fm::Sitrep>, + comment: String, +} 
+ +impl<'a> SitrepBuilder<'a> { + pub fn new( + inventory: &'a inventory::Collection, + parent_sitrep: Option<&'a fm::Sitrep>, + ) -> Self { + SitrepBuilder { inventory, parent_sitrep, comment: String::new() } + } + + + + pub fn build(self, creator_id: OmicronZoneUuid) -> fm::Sitrep { + fm::Sitrep { + metadata: fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + parent_sitrep_id: self.parent_sitrep.map(|s| s.metadata.id), + inv_collection_id: self.inventory.id, + creator_id, + comment: self.comment, + time_created: chrono::Utc::now(), + }, + // TODO(eliza): draw the rest of the owl... + } + } } From e07b186abb4d308e4b40e9dcb9d9de14f265274c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Nov 2025 10:56:01 -0800 Subject: [PATCH 03/18] start throwing together some kind of DE stuff --- Cargo.lock | 3 + dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 2 +- ereport/types/src/lib.rs | 4 +- nexus/db-queries/src/db/datastore/fm.rs | 7 +- nexus/fm/Cargo.toml | 5 +- nexus/fm/src/de.rs | 7 ++ nexus/fm/src/de/power_shelf.rs | 10 +++ nexus/fm/src/lib.rs | 93 ++++++++++++++++++++++-- nexus/types/src/fm.rs | 49 +++++++++++-- nexus/types/src/fm/alert.rs | 29 ++++++++ uuid-kinds/src/lib.rs | 1 + 11 files changed, 192 insertions(+), 18 deletions(-) create mode 100644 nexus/fm/src/de.rs create mode 100644 nexus/fm/src/de/power_shelf.rs create mode 100644 nexus/types/src/fm/alert.rs diff --git a/Cargo.lock b/Cargo.lock index cf8d3c66b92..47320e2cf79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6875,10 +6875,13 @@ dependencies = [ name = "nexus-fm" version = "0.1.0" dependencies = [ + "anyhow", "chrono", + "iddqd", "nexus-types", "omicron-uuid-kinds", "omicron-workspace-hack", + "slog", ] [[package]] diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index 439798d577b..1f517b04642 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -238,7 +238,7 @@ async fn cmd_db_sitrep_show( } }; - let 
fm::Sitrep { metadata } = sitrep; + let fm::Sitrep { metadata, cases, alerts_requested } = sitrep; let fm::SitrepMetadata { id, creator_id, diff --git a/ereport/types/src/lib.rs b/ereport/types/src/lib.rs index 9727684a6c8..63612b9caab 100644 --- a/ereport/types/src/lib.rs +++ b/ereport/types/src/lib.rs @@ -102,7 +102,9 @@ impl TryFrom for Ena { } /// Unique identifier for an ereport. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive( + Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Ord, +)] pub struct EreportId { pub restart_id: EreporterRestartUuid, pub ena: Ena, diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 04d44c72e2a..1b10ebcb0f5 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -142,7 +142,12 @@ impl DataStore { // TODO(eliza): this is where we would read all the other sitrep data, // if there was any. - Ok(Sitrep { metadata }) + Ok(Sitrep { + metadata, + // TODO(eliza) read these + alerts_requested: Default::default(), + cases: Default::default(), + }) } /// Insert the provided [`Sitrep`] into the database, and attempt to mark it diff --git a/nexus/fm/Cargo.toml b/nexus/fm/Cargo.toml index cd890597077..d530377bf83 100644 --- a/nexus/fm/Cargo.toml +++ b/nexus/fm/Cargo.toml @@ -7,8 +7,11 @@ edition = "2021" workspace = true [dependencies] -nexus-types.workspace = true +anyhow.workspace = true chrono.workspace = true +iddqd.workspace = true +nexus-types.workspace = true omicron-uuid-kinds.workspace = true +slog.workspace = true omicron-workspace-hack.workspace = true diff --git a/nexus/fm/src/de.rs b/nexus/fm/src/de.rs new file mode 100644 index 00000000000..0d1d2580785 --- /dev/null +++ b/nexus/fm/src/de.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Diagnosis engines + +pub mod power_shelf; diff --git a/nexus/fm/src/de/power_shelf.rs b/nexus/fm/src/de/power_shelf.rs new file mode 100644 index 00000000000..fef366785b5 --- /dev/null +++ b/nexus/fm/src/de/power_shelf.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Power shelf diagnosis +use crate::SitrepBuilder; + +pub fn diagnose(sitrep: &mut SitrepBuilder<'_>) -> anyhow::Result<()> { + Ok(()) +} diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index ee2c24345ae..62ead12a756 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -6,37 +6,116 @@ use nexus_types::fm; use nexus_types::inventory; +use omicron_uuid_kinds::CaseUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SitrepUuid; +use slog::Logger; +use std::fmt::Write; + +pub mod de; #[derive(Debug)] pub struct SitrepBuilder<'a> { - inventory: &'a inventory::Collection, - parent_sitrep: Option<&'a fm::Sitrep>, + pub log: Logger, + pub inventory: &'a inventory::Collection, + pub parent_sitrep: Option<&'a fm::Sitrep>, + pub sitrep_id: SitrepUuid, + pub cases: iddqd::IdOrdMap, comment: String, } impl<'a> SitrepBuilder<'a> { pub fn new( + log: &Logger, inventory: &'a inventory::Collection, parent_sitrep: Option<&'a fm::Sitrep>, ) -> Self { - SitrepBuilder { inventory, parent_sitrep, comment: String::new() } + let sitrep_id = SitrepUuid::new_v4(); + let log = log.new(slog::o!( + "sitrep_id" => format!("{sitrep_id:?}"), + "parent_sitrep_id" => format!("{:?}", parent_sitrep.as_ref().map(|s| s.id())), + "inv_collection_id" => format!("{:?}", inventory.id), + )); + SitrepBuilder { + log, + sitrep_id, + inventory, + parent_sitrep, + comment: String::new(), + cases: Default::default(), + 
} + } + + pub fn open_case(&mut self, case: fm::Case) -> anyhow::Result { + let case_id = case.id; + if self.cases.contains_key(&case_id) { + anyhow::bail!("case with ID {case_id:?} already exists"); + } + + slog::info!( + self.log, + "opened case {case_id:?}"; + "case_id" => ?case_id, + "de" => %case.de + ); + + writeln!(&mut self.comment, "* de {} opened case {case_id:?}", case.de) + .unwrap(); + + self.cases + .insert_unique(case) + .expect("we just checked that it doesn't exist"); + Ok(case_id) + } + + pub fn request_alert( + &mut self, + case_id: CaseUuid, + req: fm::AlertRequest, + ) -> anyhow::Result<()> { + let mut case = self.cases + .get_mut(&case_id) + .ok_or_else(|| anyhow::anyhow!( + "cannot create an alert request for non-existent case ID {case_id:?}", + ))?; + let alert_id = req.id; + let alert_class = req.class; + + case.alerts_requested.insert_unique(req).map_err(|_| { + anyhow::anyhow!("an alert with ID {alert_id:?} already exists") + })?; + + writeln!( + &mut self.comment, + "* de {} requested {alert_class:?} alert {alert_id:?} for case \ + {case_id:?}", + case.de + ) + .unwrap(); + + slog::info!( + self.log, + "requested an alert for case {case_id:?}"; + "case_id" => ?case_id, + "de" => %case.de, + "alert_id" => ?alert_id, + "alert_class" => ?alert_class, + ); + + Ok(()) } - - pub fn build(self, creator_id: OmicronZoneUuid) -> fm::Sitrep { fm::Sitrep { metadata: fm::SitrepMetadata { - id: SitrepUuid::new_v4(), + id: self.sitrep_id, parent_sitrep_id: self.parent_sitrep.map(|s| s.metadata.id), inv_collection_id: self.inventory.id, creator_id, comment: self.comment, time_created: chrono::Utc::now(), }, - // TODO(eliza): draw the rest of the owl... + cases: self.cases, } } } diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index 3f90379388c..cf6945b70e4 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -8,12 +8,18 @@ //! structure containing fault management state. 
pub mod ereport; -pub use ereport::Ereport; +pub use ereport::{Ereport, EreportId}; + +mod alert; +pub use alert::*; use chrono::{DateTime, Utc}; -use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid, SitrepUuid}; -use schemars::JsonSchema; +use iddqd::{IdOrdItem, IdOrdMap}; +use omicron_uuid_kinds::{ + CaseUuid, CollectionUuid, OmicronZoneUuid, SitrepUuid, +}; use serde::{Deserialize, Serialize}; +use std::collections::BTreeSet; /// A fault management situation report, or _sitrep_. /// @@ -30,12 +36,12 @@ use serde::{Deserialize, Serialize}; /// The sitrep, how it is represented in the database, and how the fault /// management subsystem creates and interacts with sitreps, is described in /// detail in [RFD 603](https://rfd.shared.oxide.computer/rfd/0603). -#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub struct Sitrep { /// Metadata describing this sitrep, when it was created, its parent sitrep /// ID, and which Nexus produced it. pub metadata: SitrepMetadata, - // TODO(eliza): draw the rest of the sitrep + pub cases: IdOrdMap, } impl Sitrep { @@ -46,12 +52,22 @@ impl Sitrep { pub fn parent_id(&self) -> Option { self.metadata.parent_sitrep_id } + + /// Iterate over all alerts requested by cases in this sitrep. + pub fn alerts_requested( + &self, + ) -> impl Iterator + '_ { + self.cases.iter().flat_map(|case| { + let case_id = case.id; + case.alerts_requested.iter().map(move |alert| (case_id, alert)) + }) + } } /// Metadata describing a sitrep. /// /// This corresponds to the records stored in the `fm_sitrep` database table. -#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub struct SitrepMetadata { /// The ID of this sitrep. pub id: SitrepUuid, @@ -91,9 +107,28 @@ pub struct SitrepMetadata { } /// An entry in the sitrep version history. 
-#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub struct SitrepVersion { pub id: SitrepUuid, pub version: u32, pub time_made_current: DateTime, } + +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct Case { + pub id: CaseUuid, + pub created_sitrep_id: SitrepUuid, + pub de: String, + pub ereports: BTreeSet, + // TODO(eliza) what else? + pub alerts_requested: IdOrdMap, // TODO(eliza): draw the rest of the sitrep +} + +impl IdOrdItem for Case { + type Key<'a> = &'a CaseUuid; + fn key(&self) -> Self::Key<'_> { + &self.id + } + + iddqd::id_upcast!(); +} diff --git a/nexus/types/src/fm/alert.rs b/nexus/types/src/fm/alert.rs new file mode 100644 index 00000000000..5eee875ac7e --- /dev/null +++ b/nexus/types/src/fm/alert.rs @@ -0,0 +1,29 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use omicron_uuid_kinds::AlertUuid; +use omicron_uuid_kinds::SitrepUuid; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct AlertRequest { + pub id: AlertUuid, + pub class: AlertClass, + pub payload: serde_json::Value, + pub requested_sitrep_id: SitrepUuid, +} + +impl iddqd::IdOrdItem for AlertRequest { + type Key<'a> = &'a AlertUuid; + fn key(&self) -> Self::Key<'_> { + &self.id + } + + iddqd::id_upcast!(); +} + +#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum AlertClass { + // TODO +} diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index beef2f61da1..f119a7edef8 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -45,6 +45,7 @@ impl_typed_uuid_kinds! 
{ AntiAffinityGroup = {}, Blueprint = {}, BuiltInUser = {}, + Case = {}, Collection = {}, ConsoleSession = {}, Dataset = {}, From 97643f11ec6c4538e7d378dd1a353e4408acee7e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Nov 2025 11:23:38 -0800 Subject: [PATCH 04/18] add alert classes --- nexus/db-model/src/alert_class.rs | 20 ++++++++++++++++++++ nexus/types/src/fm/alert.rs | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/nexus/db-model/src/alert_class.rs b/nexus/db-model/src/alert_class.rs index 5f0b2129707..f7f5356b247 100644 --- a/nexus/db-model/src/alert_class.rs +++ b/nexus/db-model/src/alert_class.rs @@ -30,6 +30,8 @@ impl_enum_type!( TestFooBaz => b"test.foo.baz" TestQuuxBar => b"test.quux.bar" TestQuuxBarBaz => b"test.quux.bar.baz" + PsuInserted => b"hw.insert.power.power_shelf.psu" + PsuRemoved => b"hw.remove.power.power_shelf.psu" ); impl AlertClass { @@ -44,6 +46,8 @@ impl AlertClass { Self::TestFooBaz => "test.foo.baz", Self::TestQuuxBar => "test.quux.bar", Self::TestQuuxBarBaz => "test.quux.bar.baz", + Self::PsuInserted => "hw.insert.power.power_shelf.psu", + Self::PsuRemoved => "hw.remove.power.power_shelf.psu", } } @@ -76,6 +80,12 @@ impl AlertClass { | Self::TestQuuxBarBaz => { "This is a test of the emergency alert system" } + Self::PsuInserted => { + "A power supply unit (PSU) has been inserted into the power shelf" + } + Self::PsuRemoved => { + "A power supply unit (PSU) has been removed from the power shelf" + } } } @@ -84,6 +94,16 @@ impl AlertClass { ::VARIANTS; } +impl From for AlertClass { + fn from(input: nexus_types::fm::AlertClass) -> Self { + use nexus_types::fm::AlertClass as In; + match input { + In::PsuRemoved => Self::PsuRemoved, + In::PsuInserted => Self::PsuInserted, + } + } +} + impl fmt::Display for AlertClass { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.as_str()) diff --git a/nexus/types/src/fm/alert.rs b/nexus/types/src/fm/alert.rs index 5eee875ac7e..05ad85073ee 
100644 --- a/nexus/types/src/fm/alert.rs +++ b/nexus/types/src/fm/alert.rs @@ -25,5 +25,6 @@ impl iddqd::IdOrdItem for AlertRequest { #[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub enum AlertClass { - // TODO + PsuInserted, + PsuRemoved, } From 90112abb890a542c37219c4aaf25912e85354268 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Nov 2025 13:10:31 -0800 Subject: [PATCH 05/18] tidiness --- dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 2 +- nexus/db-queries/src/db/datastore/fm.rs | 12 +++++++++++- nexus/src/app/background/tasks/fm_sitrep_gc.rs | 5 +++++ nexus/src/app/background/tasks/fm_sitrep_load.rs | 2 ++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index 1f517b04642..17d0e1c96ce 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -238,7 +238,7 @@ async fn cmd_db_sitrep_show( } }; - let fm::Sitrep { metadata, cases, alerts_requested } = sitrep; + let fm::Sitrep { metadata, cases } = sitrep; let fm::SitrepMetadata { id, creator_id, diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 1b10ebcb0f5..914b4355460 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -145,7 +145,6 @@ impl DataStore { Ok(Sitrep { metadata, // TODO(eliza) read these - alerts_requested: Default::default(), cases: Default::default(), }) } @@ -760,6 +759,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; datastore.fm_sitrep_insert(&opctx, &sitrep).await.unwrap(); @@ -806,6 +806,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); @@ -819,6 +820,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.id()), }, + cases: Default::default(), 
}; datastore.fm_sitrep_insert(&opctx, &sitrep2).await.expect( "inserting a sitrep whose parent is current should succeed", @@ -859,6 +861,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); @@ -873,6 +876,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(nonexistent_id), }, + cases: Default::default(), }; let result = datastore.fm_sitrep_insert(&opctx, &sitrep2).await; @@ -907,6 +911,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); @@ -920,6 +925,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.id()), }, + cases: Default::default(), }; datastore.fm_sitrep_insert(&opctx, &sitrep2).await.unwrap(); @@ -934,6 +940,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.id()), }, + cases: Default::default(), }; let result = datastore.fm_sitrep_insert(&opctx, &sitrep3).await; @@ -974,6 +981,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; datastore .fm_sitrep_insert(&opctx, &sitrep1) @@ -1014,6 +1022,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.metadata.id), }, + cases: Default::default(), }; datastore .fm_sitrep_insert(&opctx, &sitrep2) @@ -1077,6 +1086,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id, }, + cases: Default::default(), }; match datastore.fm_sitrep_insert(&opctx, &sitrep).await { Ok(_) => { diff --git a/nexus/src/app/background/tasks/fm_sitrep_gc.rs b/nexus/src/app/background/tasks/fm_sitrep_gc.rs index 92214faef4b..7295e3c2459 100644 --- a/nexus/src/app/background/tasks/fm_sitrep_gc.rs +++ b/nexus/src/app/background/tasks/fm_sitrep_gc.rs @@ -152,6 +152,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; datastore 
.fm_sitrep_insert(&opctx, &sitrep1) @@ -174,6 +175,7 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.metadata.id), }, + cases: Default::default(), }; datastore .fm_sitrep_insert(&opctx, &sitrep2) @@ -264,7 +266,10 @@ mod tests { comment: format!("test sitrep v{i}; orphan {i}"), time_created: Utc::now(), parent_sitrep_id, + // TODO(eliza): we should populate cases and assert they get + // cleaned up... }, + cases: Default::default(), }; match datastore.fm_sitrep_insert(&opctx, &sitrep).await { Ok(_) => { diff --git a/nexus/src/app/background/tasks/fm_sitrep_load.rs b/nexus/src/app/background/tasks/fm_sitrep_load.rs index 0a2c52f95b1..723a96bf3b7 100644 --- a/nexus/src/app/background/tasks/fm_sitrep_load.rs +++ b/nexus/src/app/background/tasks/fm_sitrep_load.rs @@ -224,6 +224,7 @@ mod test { comment: "test sitrep 1".to_string(), time_created: Utc::now(), }, + cases: Default::default(), }; datastore .fm_sitrep_insert(&opctx, &sitrep1) @@ -288,6 +289,7 @@ mod test { comment: "test sitrep 2".to_string(), time_created: Utc::now(), }, + cases: Default::default(), }; datastore .fm_sitrep_insert(&opctx, &sitrep2) From 30fc5dd8949824eec765621f650d3a3bfc94fa22 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Nov 2025 10:37:00 -0800 Subject: [PATCH 06/18] sketchy --- nexus/fm/src/de.rs | 1 + nexus/fm/src/de/power_shelf.rs | 25 ++++++++++++++++++++++++- nexus/types/src/fm.rs | 33 ++++++++++++++++++++++++++++----- nexus/types/src/fm/ereport.rs | 15 +++++++++++++++ schema/crdb/dbinit.sql | 12 ++++++++++++ 5 files changed, 80 insertions(+), 6 deletions(-) diff --git a/nexus/fm/src/de.rs b/nexus/fm/src/de.rs index 0d1d2580785..bccbf63069f 100644 --- a/nexus/fm/src/de.rs +++ b/nexus/fm/src/de.rs @@ -5,3 +5,4 @@ //! 
Diagnosis engines pub mod power_shelf; + diff --git a/nexus/fm/src/de/power_shelf.rs b/nexus/fm/src/de/power_shelf.rs index fef366785b5..1ffec2940d0 100644 --- a/nexus/fm/src/de/power_shelf.rs +++ b/nexus/fm/src/de/power_shelf.rs @@ -4,7 +4,30 @@ //! Power shelf diagnosis use crate::SitrepBuilder; +use nexus_types::fm::Ereport; +use nexus_types::fm::ereport; +use nexus_types::inventory::SpType; +use std::sync::Arc; + +pub fn diagnose( + log: &slog::Logger, + sitrep: &mut SitrepBuilder<'_>, + new_ereports: &[Arc], +) -> anyhow::Result<()> { + for ereport in new_ereports { + // Skip non-power shelf reports + if !matches!( + ereport.reporter, + ereport::Reporter::Sp { sp_type: SpType::Power, .. } + ) { + continue; + } + + // TODO: check for existing cases tracked for this power shelf and see + // if the ereport is related to them... + + let case = sitrep.open_case(todo!())?; + } -pub fn diagnose(sitrep: &mut SitrepBuilder<'_>) -> anyhow::Result<()> { Ok(()) } diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index cf6945b70e4..e004ca92169 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -19,7 +19,7 @@ use omicron_uuid_kinds::{ CaseUuid, CollectionUuid, OmicronZoneUuid, SitrepUuid, }; use serde::{Deserialize, Serialize}; -use std::collections::BTreeSet; +use std::sync::Arc; /// A fault management situation report, or _sitrep_. /// @@ -118,10 +118,16 @@ pub struct SitrepVersion { pub struct Case { pub id: CaseUuid, pub created_sitrep_id: SitrepUuid, - pub de: String, - pub ereports: BTreeSet, - // TODO(eliza) what else? 
- pub alerts_requested: IdOrdMap, // TODO(eliza): draw the rest of the sitrep + pub time_created: DateTime, + pub time_closed: Option>, + + pub de: DiagnosisEngine, + + pub ereports: IdOrdMap>, + + pub alerts_requested: IdOrdMap, + + pub comment: String, } impl IdOrdItem for Case { @@ -132,3 +138,20 @@ impl IdOrdItem for Case { iddqd::id_upcast!(); } + +#[derive( + Copy, + Clone, + Debug, + PartialEq, + Eq, + Hash, + serde::Serialize, + serde::Deserialize, + strum::Display, +)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum DiagnosisEngine { + PowerShelf, +} diff --git a/nexus/types/src/fm/ereport.rs b/nexus/types/src/fm/ereport.rs index 06012927bb6..e21e1b50d1f 100644 --- a/nexus/types/src/fm/ereport.rs +++ b/nexus/types/src/fm/ereport.rs @@ -22,6 +22,21 @@ pub struct Ereport { pub reporter: Reporter, } +impl Ereport { + pub fn id(&self) -> &EreportId { + &self.data.id + } +} + +impl iddqd::IdOrdItem for Ereport { + type Key<'a> = &'a EreportId; + fn key(&self) -> Self::Key<'_> { + self.id() + } + + iddqd::id_upcast!(); +} + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct EreportData { #[serde(flatten)] diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 3d640255673..bc5e3ba7acd 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6838,6 +6838,18 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_sitrep_version_by_id ON omicron.public.fm_sitrep_history (sitrep_id); +CREATE TABLE IF NOT EXISTS omicron.public.fm_case ( + -- Case UUID + id UUID NOT NULL, + -- UUID of the sitrep in which the case had this state. + sitrep_id UUID NOT NULL, + -- UUID of the sitrep in which the case was created. + created_sitrep_id UUID NOT NULL, + + time_created TIMESTAMPTZ NOT NULL, + time_closed TIMESTAMPTZ, +); + /* * List of datasets available to be sliced up and passed to VMMs for instance * local storage. 
From 16c901a23647c729f24bb8d5ff33f51ddec1dcd5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Nov 2025 19:12:27 -0800 Subject: [PATCH 07/18] stuff --- Cargo.lock | 3 + nexus/fm/Cargo.toml | 3 + nexus/fm/src/alert.rs | 22 +++ nexus/fm/src/alert/power_shelf.rs | 52 ++++++++ nexus/fm/src/de.rs | 1 - nexus/fm/src/de/power_shelf.rs | 57 +++++++- nexus/fm/src/lib.rs | 213 ++++++++++++++++++++++-------- nexus/types/src/fm.rs | 10 ++ nexus/types/src/fm/ereport.rs | 7 + 9 files changed, 305 insertions(+), 63 deletions(-) create mode 100644 nexus/fm/src/alert.rs create mode 100644 nexus/fm/src/alert/power_shelf.rs diff --git a/Cargo.lock b/Cargo.lock index 47320e2cf79..ca2c2d19105 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6881,6 +6881,9 @@ dependencies = [ "nexus-types", "omicron-uuid-kinds", "omicron-workspace-hack", + "schemars 0.8.22", + "serde", + "serde_json", "slog", ] diff --git a/nexus/fm/Cargo.toml b/nexus/fm/Cargo.toml index d530377bf83..71c731a49a6 100644 --- a/nexus/fm/Cargo.toml +++ b/nexus/fm/Cargo.toml @@ -12,6 +12,9 @@ chrono.workspace = true iddqd.workspace = true nexus-types.workspace = true omicron-uuid-kinds.workspace = true +schemars.workspace = true +serde.workspace = true +serde_json.workspace = true slog.workspace = true omicron-workspace-hack.workspace = true diff --git a/nexus/fm/src/alert.rs b/nexus/fm/src/alert.rs new file mode 100644 index 00000000000..8e0e067706e --- /dev/null +++ b/nexus/fm/src/alert.rs @@ -0,0 +1,22 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Alert messages. 
+ +use nexus_types::fm::AlertClass; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +pub mod power_shelf; + +pub trait Alert: Serialize + JsonSchema + std::fmt::Debug { + const CLASS: AlertClass; +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct VpdIdentity { + pub part_number: Option, + pub revision: Option, + pub serial_number: Option, +} diff --git a/nexus/fm/src/alert/power_shelf.rs b/nexus/fm/src/alert/power_shelf.rs new file mode 100644 index 00000000000..8dfcc25efb3 --- /dev/null +++ b/nexus/fm/src/alert/power_shelf.rs @@ -0,0 +1,52 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Power shelf alerts. + +use super::{Alert, VpdIdentity}; +use nexus_types::fm::AlertClass; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +#[serde(tag = "version", rename_all = "snake_case")] +pub enum PsuInserted { + V0 { + #[serde(flatten)] + psc_psu: PscPsu, + }, +} + +impl Alert for PsuInserted { + const CLASS: AlertClass = AlertClass::PsuInserted; +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +#[serde(tag = "version", rename_all = "snake_case")] +pub enum PsuRemoved { + V0 { + #[serde(flatten)] + psc_psu: PscPsu, + }, +} + +impl Alert for PsuRemoved { + const CLASS: AlertClass = AlertClass::PsuInserted; +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct PscPsu { + pub psc_id: VpdIdentity, + pub psc_slot: u16, + pub psu_id: PsuIdentity, + pub psu_slot: u16, +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct PsuIdentity { + pub manufacturer: Option, + pub part_number: Option, + pub firmware_revision: Option, + pub serial_number: Option, +} diff --git a/nexus/fm/src/de.rs b/nexus/fm/src/de.rs index bccbf63069f..0d1d2580785 100644 --- 
a/nexus/fm/src/de.rs +++ b/nexus/fm/src/de.rs @@ -5,4 +5,3 @@ //! Diagnosis engines pub mod power_shelf; - diff --git a/nexus/fm/src/de/power_shelf.rs b/nexus/fm/src/de/power_shelf.rs index 1ffec2940d0..b8f4b61ead1 100644 --- a/nexus/fm/src/de/power_shelf.rs +++ b/nexus/fm/src/de/power_shelf.rs @@ -3,30 +3,73 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Power shelf diagnosis + use crate::SitrepBuilder; +use crate::alert +use nexus_types::fm::AlertRequest; +use nexus_types::fm::DiagnosisEngine; use nexus_types::fm::Ereport; use nexus_types::fm::ereport; use nexus_types::inventory::SpType; use std::sync::Arc; pub fn diagnose( - log: &slog::Logger, sitrep: &mut SitrepBuilder<'_>, new_ereports: &[Arc], ) -> anyhow::Result<()> { for ereport in new_ereports { // Skip non-power shelf reports - if !matches!( - ereport.reporter, - ereport::Reporter::Sp { sp_type: SpType::Power, .. } - ) { + let ereport::Reporter::Sp { sp_type: SpType::Power, slot, } = ereport.reporter else { continue; - } + }; // TODO: check for existing cases tracked for this power shelf and see // if the ereport is related to them... 
- let case = sitrep.open_case(todo!())?; + match ereport.data.class.as_deref() { + // PSU inserted + Some("hw.insert.psu") => { + let mut case = sitrep.open_case(DiagnosisEngine::PowerShelf)?; + case.add_ereport(ereport); + case.comment = "PSU inserted".to_string(); + let psu_id = match ereport.get("fruid") { + Some(serde_json::Value::Object(fruid)) => { + todo!() + }, + None => { + todo!() + } + }; + case.request_alert(alert::power_shelf::PsuInserted::V0 { + psc_psu: alert::power_shelf::PscPsu { + psc_id: alert::VpdIdentity { + serial_number: ereport.serial_number.clone(), + revision: ereport.report.get("baseboard_rev").map(ToString::to_string), + part_number: ereport.part_number.clone(), + }, + psc_slot: slot, + psu_id, + psu_slot: ereport.report.get("slot").map(|s| todo!()), + } + }) + } + Some("hw.remove.psu") => {} + Some(unknown) => { + slog::warn!( + &sitrep.log, + "ignoring unhandled PSC ereport class"; + "ereport_class" => %unknown, + "ereport" => %ereport.id, + ); + } + None => { + slog::warn!( + &sitrep.log, + "ignoring PSC ereport with no class"; + "ereport" => %ereport.id, + ); + } + } } Ok(()) diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index 62ead12a756..8c5ef214130 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -6,12 +6,17 @@ use nexus_types::fm; use nexus_types::inventory; +use omicron_uuid_kinds::AlertUuid; use omicron_uuid_kinds::CaseUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SitrepUuid; use slog::Logger; -use std::fmt::Write; +// use std::fmt::Write; +use anyhow::Context; +use chrono::Utc; +use std::sync::Arc; +pub mod alert; pub mod de; #[derive(Debug)] @@ -20,7 +25,7 @@ pub struct SitrepBuilder<'a> { pub inventory: &'a inventory::Collection, pub parent_sitrep: Option<&'a fm::Sitrep>, pub sitrep_id: SitrepUuid, - pub cases: iddqd::IdOrdMap, + pub cases: iddqd::IdOrdMap, comment: String, } @@ -36,73 +41,64 @@ impl<'a> SitrepBuilder<'a> { "parent_sitrep_id" => format!("{:?}", 
parent_sitrep.as_ref().map(|s| s.id())), "inv_collection_id" => format!("{:?}", inventory.id), )); + + // Copy forward any open cases from the parent sitrep. + // If a case was closed in the parent sitrep, skip it. + let cases: iddqd::IdOrdMap<_> = parent_sitrep + .iter() + .flat_map(|s| s.open_cases()) + .map(|case| CaseBuilder::new(&log, sitrep_id, case.clone())) + .collect(); + + slog::info!( + &log, + "preparing sitrep {sitrep_id:?}"; + "existing_open_cases" => cases.len(), + ); + SitrepBuilder { log, sitrep_id, inventory, parent_sitrep, comment: String::new(), - cases: Default::default(), - } - } - - pub fn open_case(&mut self, case: fm::Case) -> anyhow::Result { - let case_id = case.id; - if self.cases.contains_key(&case_id) { - anyhow::bail!("case with ID {case_id:?} already exists"); + cases, } - - slog::info!( - self.log, - "opened case {case_id:?}"; - "case_id" => ?case_id, - "de" => %case.de - ); - - writeln!(&mut self.comment, "* de {} opened case {case_id:?}", case.de) - .unwrap(); - - self.cases - .insert_unique(case) - .expect("we just checked that it doesn't exist"); - Ok(case_id) } - pub fn request_alert( + pub fn open_case( &mut self, - case_id: CaseUuid, - req: fm::AlertRequest, - ) -> anyhow::Result<()> { - let mut case = self.cases - .get_mut(&case_id) - .ok_or_else(|| anyhow::anyhow!( - "cannot create an alert request for non-existent case ID {case_id:?}", - ))?; - let alert_id = req.id; - let alert_class = req.class; - - case.alerts_requested.insert_unique(req).map_err(|_| { - anyhow::anyhow!("an alert with ID {alert_id:?} already exists") - })?; - - writeln!( - &mut self.comment, - "* de {} requested {alert_class:?} alert {alert_id:?} for case \ - {case_id:?}", - case.de - ) - .unwrap(); + de: fm::DiagnosisEngine, + ) -> anyhow::Result> { + let id = CaseUuid::new_v4(); + let sitrep_id = self.sitrep_id; + let case = match self.cases.entry(&id) { + iddqd::id_ord_map::Entry::Occupied(_) => { + panic!("generated a colliding UUID!") + } + 
iddqd::id_ord_map::Entry::Vacant(entry) => { + let case = fm::Case { + id, + created_sitrep_id: self.sitrep_id, + time_created: chrono::Utc::now(), + time_closed: None, + de, + comment: String::new(), + ereports: Default::default(), + alerts_requested: Default::default(), + }; + entry.insert(CaseBuilder::new(&self.log, sitrep_id, case)) + } + }; slog::info!( self.log, - "requested an alert for case {case_id:?}"; - "case_id" => ?case_id, - "de" => %case.de, - "alert_id" => ?alert_id, - "alert_class" => ?alert_class, + "opened case {id:?}"; + "case_id" => ?id, + "de" => %de ); - Ok(()) + Ok(case) } pub fn build(self, creator_id: OmicronZoneUuid) -> fm::Sitrep { @@ -115,7 +111,114 @@ impl<'a> SitrepBuilder<'a> { comment: self.comment, time_created: chrono::Utc::now(), }, - cases: self.cases, + cases: self + .cases + .into_iter() + .map(|builder| fm::Case::from(builder)) + .collect(), + } + } +} + +#[derive(Debug)] +pub struct CaseBuilder { + pub log: slog::Logger, + pub case: fm::Case, + pub sitrep_id: SitrepUuid, +} + +impl CaseBuilder { + fn new(log: &slog::Logger, sitrep_id: SitrepUuid, case: fm::Case) -> Self { + let log = log.new(slog::o!( + "case_id" => format!("{:?}", case.id), + "de" => case.de.to_string(), + "created_sitrep_id" => format!("{:?}", case.created_sitrep_id), + )); + Self { log, case, sitrep_id } + } + + pub fn request_alert( + &mut self, + alert: &A, + ) -> anyhow::Result<()> { + let id = AlertUuid::new_v4(); + let class = A::CLASS; + let req = fm::AlertRequest { + id, + class, + requested_sitrep_id: self.sitrep_id, + payload: serde_json::to_value(&alert).with_context(|| { + format!( + "failed to serialize payload for {class:?} alert {alert:?}" + ) + })?, + }; + self.case.alerts_requested.insert_unique(req).map_err(|_| { + anyhow::anyhow!("an alert with ID {id:?} already exists") + })?; + + slog::info!( + &self.log, + "requested an alert"; + "alert_id" => ?id, + "alert_class" => ?class, + ); + + Ok(()) + } + + pub fn close(&mut self, log: 
&slog::Logger) { + self.case.time_closed = Some(Utc::now()); + + slog::info!(log, "case closed"); + } + + pub fn add_ereport(&mut self, report: &Arc) { + match self.case.ereports.insert_unique(report.clone()) { + Ok(_) => { + slog::info!( + self.log, + "assigned ereport {} to case", report.id(); + "ereport_id" => ?report.id(), + "ereport_class" => ?report.class, + ); + } + Err(_) => { + slog::warn!( + self.log, + "ereport {} already assigned to case", report.id(); + "ereport_id" => ?report.id(), + "ereport_class" => ?report.class, + ); + } } } } + +impl From for fm::Case { + fn from(CaseBuilder { case, .. }: CaseBuilder) -> Self { + case + } +} + +impl core::ops::Deref for CaseBuilder { + type Target = fm::Case; + fn deref(&self) -> &Self::Target { + &self.case + } +} + +impl core::ops::DerefMut for CaseBuilder { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.case + } +} + +impl iddqd::IdOrdItem for CaseBuilder { + type Key<'a> = &'a CaseUuid; + fn key(&self) -> Self::Key<'_> { + &self.case.id + } + + iddqd::id_upcast!(); +} diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index e004ca92169..b590ad5ba42 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -53,6 +53,10 @@ impl Sitrep { self.metadata.parent_sitrep_id } + pub fn open_cases(&self) -> impl Iterator + '_ { + self.cases.iter().filter(|c| c.is_open()) + } + /// Iterate over all alerts requested by cases in this sitrep. 
pub fn alerts_requested( &self, @@ -130,6 +134,12 @@ pub struct Case { pub comment: String, } +impl Case { + pub fn is_open(&self) -> bool { + self.time_closed.is_none() + } +} + impl IdOrdItem for Case { type Key<'a> = &'a CaseUuid; fn key(&self) -> Self::Key<'_> { diff --git a/nexus/types/src/fm/ereport.rs b/nexus/types/src/fm/ereport.rs index e21e1b50d1f..17426a70179 100644 --- a/nexus/types/src/fm/ereport.rs +++ b/nexus/types/src/fm/ereport.rs @@ -28,6 +28,13 @@ impl Ereport { } } +impl core::ops::Deref for Ereport { + type Target = EreportData; + fn deref(&self) -> &Self::Target { + &self.data + } +} + impl iddqd::IdOrdItem for Ereport { type Key<'a> = &'a EreportId; fn key(&self) -> Self::Key<'_> { From 9e3dda54794538fafabd5cef7461606d677d0ada Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 12 Nov 2025 10:27:08 -0800 Subject: [PATCH 08/18] woag --- nexus/fm/src/alert/power_shelf.rs | 2 +- nexus/fm/src/de/power_shelf.rs | 131 ++++++++++++++++++++++++------ nexus/fm/src/lib.rs | 4 +- 3 files changed, 109 insertions(+), 28 deletions(-) diff --git a/nexus/fm/src/alert/power_shelf.rs b/nexus/fm/src/alert/power_shelf.rs index 8dfcc25efb3..4b080d6b64a 100644 --- a/nexus/fm/src/alert/power_shelf.rs +++ b/nexus/fm/src/alert/power_shelf.rs @@ -40,7 +40,7 @@ pub struct PscPsu { pub psc_id: VpdIdentity, pub psc_slot: u16, pub psu_id: PsuIdentity, - pub psu_slot: u16, + pub psu_slot: Option, } #[derive(Debug, Serialize, Deserialize, JsonSchema)] diff --git a/nexus/fm/src/de/power_shelf.rs b/nexus/fm/src/de/power_shelf.rs index b8f4b61ead1..bd27e633b6e 100644 --- a/nexus/fm/src/de/power_shelf.rs +++ b/nexus/fm/src/de/power_shelf.rs @@ -5,12 +5,13 @@ //! 
Power shelf diagnosis use crate::SitrepBuilder; -use crate::alert -use nexus_types::fm::AlertRequest; +use crate::alert; use nexus_types::fm::DiagnosisEngine; use nexus_types::fm::Ereport; use nexus_types::fm::ereport; use nexus_types::inventory::SpType; +use serde::de::DeserializeOwned; +use serde_json::Value; use std::sync::Arc; pub fn diagnose( @@ -19,7 +20,9 @@ pub fn diagnose( ) -> anyhow::Result<()> { for ereport in new_ereports { // Skip non-power shelf reports - let ereport::Reporter::Sp { sp_type: SpType::Power, slot, } = ereport.reporter else { + let ereport::Reporter::Sp { sp_type: SpType::Power, slot } = + ereport.reporter + else { continue; }; @@ -29,31 +32,30 @@ pub fn diagnose( match ereport.data.class.as_deref() { // PSU inserted Some("hw.insert.psu") => { + let psc_psu = extract_psc_psu(&ereport, slot, &sitrep.log); let mut case = sitrep.open_case(DiagnosisEngine::PowerShelf)?; case.add_ereport(ereport); - case.comment = "PSU inserted".to_string(); - let psu_id = match ereport.get("fruid") { - Some(serde_json::Value::Object(fruid)) => { - todo!() - }, - None => { - todo!() - } - }; - case.request_alert(alert::power_shelf::PsuInserted::V0 { - psc_psu: alert::power_shelf::PscPsu { - psc_id: alert::VpdIdentity { - serial_number: ereport.serial_number.clone(), - revision: ereport.report.get("baseboard_rev").map(ToString::to_string), - part_number: ereport.part_number.clone(), - }, - psc_slot: slot, - psu_id, - psu_slot: ereport.report.get("slot").map(|s| todo!()), - } - }) + case.comment = + format!("PSC {slot} PSU {:?} inserted", psc_psu.psu_slot); + case.request_alert(&alert::power_shelf::PsuInserted::V0 { + psc_psu, + })?; + // Nothing else to do at this time. 
+ case.close(); + } + Some("hw.remove.psu") => { + let psc_psu = extract_psc_psu(&ereport, slot, &sitrep.log); + let mut case = sitrep.open_case(DiagnosisEngine::PowerShelf)?; + case.add_ereport(ereport); + case.comment = + format!("PSC {slot} PSU {:?} removed", psc_psu.psu_slot); + case.request_alert(&alert::power_shelf::PsuRemoved::V0 { + psc_psu, + })?; + + // Nothing else to do at this time. + case.close(); } - Some("hw.remove.psu") => {} Some(unknown) => { slog::warn!( &sitrep.log, @@ -74,3 +76,82 @@ pub fn diagnose( Ok(()) } + +fn extract_psc_psu( + ereport: &Ereport, + psc_slot: u16, + log: &slog::Logger, +) -> alert::power_shelf::PscPsu { + let psc_id = extract_psc_id(ereport, log); + let psu_id = extract_psu_id(ereport, log); + let psu_slot = grab_json_value(ereport, "slot", &ereport.report, log); + alert::power_shelf::PscPsu { psc_id, psc_slot, psu_id, psu_slot } +} + +fn extract_psc_id(ereport: &Ereport, log: &slog::Logger) -> alert::VpdIdentity { + let serial_number = ereport.serial_number.clone(); + let revision = + grab_json_value(ereport, "baseboard_rev", &ereport.report, log); + let part_number = ereport.part_number.clone(); + alert::VpdIdentity { serial_number, revision, part_number } +} + +fn extract_psu_id( + ereport: &Ereport, + log: &slog::Logger, +) -> alert::power_shelf::PsuIdentity { + // These are the same field names that Hubris uses in the ereport. 
See: + // https://github.com/oxidecomputer/hubris/blob/ec18e4f11aaa14600c61f67335c32b250ef38269/drv/psc-seq-server/src/main.rs#L1107-L1117 + #[derive(serde::Deserialize, Default)] + struct Fruid { + mfr: Option, + mpn: Option, + serial: Option, + fw_rev: Option, + } + + let Fruid { mfr, mpn, serial, fw_rev } = + grab_json_value(ereport, "fruid", &ereport.report, log) + .unwrap_or_default(); + + alert::power_shelf::PsuIdentity { + serial_number: serial, + part_number: mpn, + firmware_revision: fw_rev, + manufacturer: mfr, + } +} + +fn grab_json_value( + ereport: &Ereport, + key: &str, + obj: &Value, + log: &slog::Logger, +) -> Option { + let v = match obj.get(key) { + Some(v) => v, + None => { + slog::warn!( + log, + "expected ereport to contain a '{key}' field"; + "ereport_id" => %ereport.id, + "ereport_class" => ?ereport.class, + ); + return None; + } + }; + match serde_json::from_value(v.clone()) { + Ok(v) => Some(v), + Err(e) => { + slog::warn!( + log, + "expected ereport '{key}' field to deserialize as a {}", + std::any::type_name::(); + "ereport_id" => %ereport.id, + "ereport_class" => ?ereport.class, + "error" => %e, + ); + None + } + } +} diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index 8c5ef214130..5588003bedf 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -167,10 +167,10 @@ impl CaseBuilder { Ok(()) } - pub fn close(&mut self, log: &slog::Logger) { + pub fn close(&mut self) { self.case.time_closed = Some(Utc::now()); - slog::info!(log, "case closed"); + slog::info!(&self.log, "case closed"); } pub fn add_ereport(&mut self, report: &Arc) { From 0785c8c0c5cbd9aef161af6024ac40a01487d4e9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 12 Nov 2025 12:21:32 -0800 Subject: [PATCH 09/18] lots of case DB stuff --- ereport/types/src/lib.rs | 12 +- nexus/db-model/src/alert_class.rs | 17 ++ nexus/db-model/src/fm.rs | 7 + nexus/db-model/src/fm/alert_request.rs | 55 ++++++ nexus/db-model/src/fm/case.rs | 38 ++++ 
nexus/db-model/src/fm/diagnosis_engine.rs | 50 +++++ nexus/db-model/src/lib.rs | 4 +- nexus/db-queries/src/db/datastore/ereport.rs | 10 +- nexus/db-queries/src/db/datastore/fm.rs | 188 ++++++++++++++++++- nexus/db-schema/src/enums.rs | 1 + nexus/db-schema/src/schema.rs | 41 ++++ nexus/fm/src/lib.rs | 2 + nexus/types/src/fm.rs | 2 + schema/crdb/dbinit.sql | 69 ++++++- 14 files changed, 483 insertions(+), 13 deletions(-) create mode 100644 nexus/db-model/src/fm/alert_request.rs create mode 100644 nexus/db-model/src/fm/case.rs create mode 100644 nexus/db-model/src/fm/diagnosis_engine.rs diff --git a/ereport/types/src/lib.rs b/ereport/types/src/lib.rs index 63612b9caab..d06f5f2fc3d 100644 --- a/ereport/types/src/lib.rs +++ b/ereport/types/src/lib.rs @@ -32,6 +32,7 @@ pub struct Ereport { Serialize, Deserialize, JsonSchema, + Hash, )] #[repr(transparent)] #[serde(from = "u64", into = "u64")] @@ -103,7 +104,16 @@ impl TryFrom for Ena { /// Unique identifier for an ereport. #[derive( - Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Ord, + Debug, + Clone, + Copy, + PartialEq, + Eq, + Serialize, + Deserialize, + PartialOrd, + Ord, + Hash, )] pub struct EreportId { pub restart_id: EreporterRestartUuid, diff --git a/nexus/db-model/src/alert_class.rs b/nexus/db-model/src/alert_class.rs index f7f5356b247..39004961b9b 100644 --- a/nexus/db-model/src/alert_class.rs +++ b/nexus/db-model/src/alert_class.rs @@ -4,6 +4,7 @@ use super::impl_enum_type; use nexus_types::external_api::views; +use omicron_common::api::external::Error; use serde::de::{self, Deserialize, Deserializer}; use serde::ser::{Serialize, Serializer}; use std::fmt; @@ -104,6 +105,22 @@ impl From for AlertClass { } } +impl TryFrom for nexus_types::fm::AlertClass { + type Error = Error; + + fn try_from(input: AlertClass) -> Result { + use nexus_types::fm::AlertClass as Out; + match input { + AlertClass::PsuRemoved => Ok(Out::PsuRemoved), + AlertClass::PsuInserted => Ok(Out::PsuInserted), + 
class => Err(Error::invalid_value( + "alert_class", + format!("'{class}' is not a FM alert class"), + )), + } + } +} + impl fmt::Display for AlertClass { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.as_str()) diff --git a/nexus/db-model/src/fm.rs b/nexus/db-model/src/fm.rs index d9d7ac3c2dc..353a2174aac 100644 --- a/nexus/db-model/src/fm.rs +++ b/nexus/db-model/src/fm.rs @@ -19,6 +19,13 @@ use chrono::{DateTime, Utc}; use nexus_db_schema::schema::{fm_sitrep, fm_sitrep_history}; use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind, SitrepKind}; +mod alert_request; +pub use alert_request::*; +mod case; +pub use case::*; +mod diagnosis_engine; +pub use diagnosis_engine::*; + #[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = fm_sitrep)] pub struct SitrepMetadata { diff --git a/nexus/db-model/src/fm/alert_request.rs b/nexus/db-model/src/fm/alert_request.rs new file mode 100644 index 00000000000..551085aa065 --- /dev/null +++ b/nexus/db-model/src/fm/alert_request.rs @@ -0,0 +1,55 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Fault management alert requests. 
+ +use crate::AlertClass; +use crate::DbTypedUuid; +use nexus_db_schema::schema::fm_alert_request; +use nexus_types::fm; +use omicron_uuid_kinds::{ + AlertKind, CaseKind, CaseUuid, SitrepKind, SitrepUuid, +}; + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_alert_request)] +pub struct AlertRequest { + pub id: DbTypedUuid, + pub sitrep_id: DbTypedUuid, + pub requested_sitrep_id: DbTypedUuid, + pub case_id: DbTypedUuid, + #[diesel(column_name = "class")] + pub class: AlertClass, + pub payload: serde_json::Value, +} + +impl AlertRequest { + pub fn new( + current_sitrep_id: SitrepUuid, + case_id: CaseUuid, + req: fm::AlertRequest, + ) -> Self { + let fm::AlertRequest { id, requested_sitrep_id, payload, class } = req; + AlertRequest { + id: id.into(), + sitrep_id: current_sitrep_id.into(), + requested_sitrep_id: requested_sitrep_id.into(), + case_id: case_id.into(), + class: class.into(), + payload, + } + } +} + +impl TryFrom for fm::AlertRequest { + type Error = >::Error; + fn try_from(req: AlertRequest) -> Result { + Ok(fm::AlertRequest { + id: req.id.into(), + requested_sitrep_id: req.requested_sitrep_id.into(), + payload: req.payload, + class: req.class.try_into()?, + }) + } +} diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs new file mode 100644 index 00000000000..cd40b6d72c2 --- /dev/null +++ b/nexus/db-model/src/fm/case.rs @@ -0,0 +1,38 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Fault management cases. 
+ +use super::DiagnosisEngine; +use crate::DbTypedUuid; +use crate::ereport; +use chrono::{DateTime, Utc}; +use nexus_db_schema::schema::{fm_case, fm_ereport_in_case}; +use omicron_uuid_kinds::{CaseKind, EreporterRestartKind, SitrepKind}; + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_case)] +pub struct CaseMetadata { + pub id: DbTypedUuid, + pub sitrep_id: DbTypedUuid, + pub de: DiagnosisEngine, + + pub created_sitrep_id: DbTypedUuid, + pub time_created: DateTime, + + pub time_closed: Option>, + pub closed_sitrep_id: Option>, + + pub comment: String, +} + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_ereport_in_case)] +pub struct CaseEreport { + pub restart_id: DbTypedUuid, + pub ena: ereport::DbEna, + pub case_id: DbTypedUuid, + pub sitrep_id: DbTypedUuid, + pub assigned_sitrep_id: DbTypedUuid, +} diff --git a/nexus/db-model/src/fm/diagnosis_engine.rs b/nexus/db-model/src/fm/diagnosis_engine.rs new file mode 100644 index 00000000000..7d4523fa74e --- /dev/null +++ b/nexus/db-model/src/fm/diagnosis_engine.rs @@ -0,0 +1,50 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use crate::impl_enum_type; +use nexus_types::fm; +use serde::{Deserialize, Serialize}; +use std::fmt; + +impl_enum_type!( + DiagnosisEngineEnum: + + #[derive( + Copy, + Clone, + Debug, + PartialEq, + Serialize, + Deserialize, + AsExpression, + FromSqlRow, + )] + #[serde(rename_all = "snake_case")] + pub enum DiagnosisEngine; + + PowerShelf => b"power_shelf" + +); + +impl From for fm::DiagnosisEngine { + fn from(de: DiagnosisEngine) -> Self { + match de { + DiagnosisEngine::PowerShelf => fm::DiagnosisEngine::PowerShelf, + } + } +} + +impl From for DiagnosisEngine { + fn from(fm_de: fm::DiagnosisEngine) -> Self { + match fm_de { + fm::DiagnosisEngine::PowerShelf => DiagnosisEngine::PowerShelf, + } + } +} + +impl fmt::Display for DiagnosisEngine { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fm::DiagnosisEngine::from(*self).fmt(f) + } +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 91067f9ace9..f1e643bcb34 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -41,7 +41,7 @@ mod downstairs; pub mod ereport; mod ereporter_type; mod external_ip; -mod fm; +pub mod fm; mod generation; mod identity_provider; mod image; @@ -187,7 +187,7 @@ pub use downstairs::*; pub use ereport::Ereport; pub use ereporter_type::*; pub use external_ip::*; -pub use fm::*; +pub use fm::{SitrepMetadata, SitrepVersion}; pub use generation::*; pub use identity_provider::*; pub use image::*; diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 2fa7489d5aa..5ea3dbbb5ba 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -98,6 +98,14 @@ impl DataStore { ) -> LookupResult { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; + self.ereport_fetch_on_conn(&conn, id).await + } + + pub(crate) async fn ereport_fetch_on_conn( + &self, + conn: 
&async_bb8_diesel::Connection, + id: fm::EreportId, + ) -> LookupResult { let restart_id = id.restart_id.into_untyped_uuid(); let ena = DbEna::from(id.ena); @@ -106,7 +114,7 @@ impl DataStore { .filter(dsl::ena.eq(ena)) .filter(dsl::time_deleted.is_null()) .select(Ereport::as_select()) - .first_async(&*conn) + .first_async(conn) .await .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 914b4355460..7dbb8069bfb 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -12,9 +12,14 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; use crate::db::datastore::RunnableQuery; +use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::model; +use crate::db::model::DbTypedUuid; use crate::db::model::SqlU32; +use crate::db::model::ereport::DbEna; +use crate::db::pagination::Paginator; use crate::db::pagination::paginated; +use crate::db::pagination::paginated_multicolumn; use crate::db::raw_query_builder::QueryBuilder; use crate::db::raw_query_builder::TypedSqlQuery; use async_bb8_diesel::AsyncRunQueryDsl; @@ -26,6 +31,9 @@ use dropshot::PaginationOrder; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::DbConnection; +use nexus_db_schema::schema::fm_alert_request::dsl as alert_req_dsl; +use nexus_db_schema::schema::fm_case::dsl as case_dsl; +use nexus_db_schema::schema::fm_ereport_in_case::dsl as case_ereport_dsl; use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl; use nexus_types::fm; @@ -33,8 +41,10 @@ use nexus_types::fm::Sitrep; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; +use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::GenericUuid; use 
omicron_uuid_kinds::SitrepUuid; +use std::sync::Arc; use uuid::Uuid; impl DataStore { @@ -139,14 +149,176 @@ impl DataStore { let metadata = self.fm_sitrep_metadata_read_on_conn(id, &conn).await?.into(); - // TODO(eliza): this is where we would read all the other sitrep data, - // if there was any. + let mut all_ereports = iddqd::IdOrdMap::>::new(); + let cases = { + let mut cases = iddqd::IdOrdMap::new(); + let mut paginator = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = self + .fm_sitrep_cases_list_on_conn( + id, + &p.current_pagparams(), + &conn, + ) + .await + .map_err(|e| { + e.internal_context("failed to list sitrep cases") + })?; + paginator = p.found_batch(&batch, &|case| case.id); + + for case in batch { + // TODO(eliza): consider using a `ParallelTaskSet` to fetch the + // cases in parallel here. + let (ereport_assignments, alerts_requested) = + self.fm_case_read_on_conn(&case, conn).await?; + + // Fetch ereports assigned to this case. + let mut ereports = iddqd::IdOrdMap::with_capacity( + ereport_assignments.len(), + ); + for model::fm::CaseEreport { + restart_id, + ena: DbEna(ena), + .. + } in ereport_assignments + { + let ereport_id = fm::EreportId { + restart_id: restart_id.into(), + ena, + }; + let ereport = match all_ereports.entry(&ereport_id) { + iddqd::id_ord_map::Entry::Occupied(entry) => { + entry.get().clone() + } + iddqd::id_ord_map::Entry::Vacant(entry) => { + let ereport: fm::Ereport = self.ereport_fetch_on_conn(conn, ereport_id) + .await + .map_err(|e| e.internal_context(format!( + "failed to fetch ereport {ereport_id} for case {}", + case.id, + )))? 
+ .into(); + entry.insert(Arc::new(ereport)).clone() + } + }; + ereports.insert_unique(ereport).unwrap(); + } + + cases + .insert_unique(fm::Case { + id: case.id.into(), + created_sitrep_id: case.created_sitrep_id.into(), + time_created: case.time_created.into(), + time_closed: case.time_closed.map(Into::into), + closed_sitrep_id: case + .closed_sitrep_id + .map(Into::into), + de: case.de.into(), + comment: case.comment, + ereports, + alerts_requested, + }) + .expect("case UUIDs should be unique"); + } + } - Ok(Sitrep { - metadata, - // TODO(eliza) read these - cases: Default::default(), - }) + cases + }; + + Ok(Sitrep { metadata, cases }) + } + + async fn fm_sitrep_cases_list_on_conn( + &self, + sitrep_id: SitrepUuid, + pagparams: &DataPageParams<'_, DbTypedUuid>, + conn: &async_bb8_diesel::Connection, + ) -> ListResultVec { + paginated(case_dsl::fm_case, case_dsl::id, &pagparams) + .filter(case_dsl::sitrep_id.eq(sitrep_id.into_untyped_uuid())) + .select(model::fm::CaseMetadata::as_select()) + .load_async::(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + async fn fm_case_read_on_conn( + &self, + case: &model::fm::CaseMetadata, + conn: &async_bb8_diesel::Connection, + ) -> Result< + (Vec, iddqd::IdOrdMap), + Error, + > { + // Read ereports assigned to this case. 
+ let ereports = { + let mut ereports = Vec::new(); + let mut paginator = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = paginated_multicolumn( + case_ereport_dsl::fm_ereport_in_case, + (case_ereport_dsl::restart_id, case_ereport_dsl::ena), + &p.current_pagparams(), + ) + .filter(case_ereport_dsl::case_id.eq(case.id)) + .filter(case_ereport_dsl::sitrep_id.eq(case.sitrep_id)) + .select(model::fm::CaseEreport::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to list ereports assigned to case {}", + case.id + )) + })?; + + paginator = p.found_batch(&batch, &|ereport| { + (ereport.restart_id, ereport.ena) + }); + ereports.extend(batch); + } + ereports + }; + + // Read alerts requested for this case. + let alerts_requested = { + let mut alerts = iddqd::IdOrdMap::new(); + let mut paginator = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = paginated( + alert_req_dsl::fm_alert_request, + alert_req_dsl::id, + &p.current_pagparams(), + ) + .filter(alert_req_dsl::case_id.eq(case.id)) + .filter(alert_req_dsl::sitrep_id.eq(case.sitrep_id)) + .select(model::fm::AlertRequest::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to list alerts requested for case {}", + case.id + )) + })?; + + paginator = p.found_batch(&batch, &|req| req.id); + for alert in batch { + alerts + .insert_unique(alert.try_into()?) 
+ .expect("alert UUIDs should be unique"); + } + } + + alerts + }; + + Ok((ereports, alerts_requested)) } /// Insert the provided [`Sitrep`] into the database, and attempt to mark it @@ -1056,7 +1228,7 @@ mod tests { ) -> Result, Error> { let mut listed_orphans = BTreeSet::new(); let mut paginator = Paginator::new( - crate::db::datastore::SQL_BATCH_SIZE, + crate::db::datastore::SQL_BATCH_SIZE, dropshot::PaginationOrder::Descending, ); while let Some(p) = paginator.next() { diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index 5b966b38be4..684d656f308 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -39,6 +39,7 @@ define_enums! { ClickhouseModeEnum => "clickhouse_mode", DatasetKindEnum => "dataset_kind", DbMetadataNexusStateEnum => "db_metadata_nexus_state", + DiagnosisEngineEnum => "diagnosis_engine", DiskTypeEnum => "disk_type", DnsGroupEnum => "dns_group", DownstairsClientStopRequestReasonEnum => "downstairs_client_stop_request_reason_type", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index be483c399ea..4b8550c38d9 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2860,3 +2860,44 @@ table! { } allow_tables_to_appear_in_same_query!(fm_sitrep, fm_sitrep_history); + +table! { + fm_case (sitrep_id, id) { + id -> Uuid, + sitrep_id -> Uuid, + de -> crate::enums::DiagnosisEngineEnum, + + time_created -> Timestamptz, + created_sitrep_id -> Uuid, + + time_closed -> Nullable, + closed_sitrep_id -> Nullable, + + comment -> Text, + } +} + +table! { + fm_ereport_in_case (sitrep_id, restart_id, ena) { + restart_id -> Uuid, + ena -> Int8, + case_id -> Uuid, + sitrep_id -> Uuid, + assigned_sitrep_id -> Uuid, + } +} + +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_case); + +table! 
{ + fm_alert_request (sitrep_id, id) { + id -> Uuid, + sitrep_id -> Uuid, + requested_sitrep_id -> Uuid, + case_id -> Uuid, + class -> crate::enums::AlertClassEnum, + payload -> Jsonb, + } +} + +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_alert_request); diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index 5588003bedf..b3ffa4c8dac 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -81,6 +81,7 @@ impl<'a> SitrepBuilder<'a> { id, created_sitrep_id: self.sitrep_id, time_created: chrono::Utc::now(), + closed_sitrep_id: None, time_closed: None, de, comment: String::new(), @@ -169,6 +170,7 @@ impl CaseBuilder { pub fn close(&mut self) { self.case.time_closed = Some(Utc::now()); + self.case.closed_sitrep_id = Some(self.sitrep_id); slog::info!(&self.log, "case closed"); } diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index b590ad5ba42..bf2ede82d6c 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -123,6 +123,8 @@ pub struct Case { pub id: CaseUuid, pub created_sitrep_id: SitrepUuid, pub time_created: DateTime, + + pub closed_sitrep_id: Option, pub time_closed: Option>, pub de: DiagnosisEngine, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index bc5e3ba7acd..2ee6b4ce092 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6838,18 +6838,85 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_sitrep_version_by_id ON omicron.public.fm_sitrep_history (sitrep_id); +CREATE TYPE IF NOT EXISTS omicron.public.diagnosis_engine AS ENUM ( + 'power_shelf' +); + CREATE TABLE IF NOT EXISTS omicron.public.fm_case ( -- Case UUID id UUID NOT NULL, -- UUID of the sitrep in which the case had this state. sitrep_id UUID NOT NULL, + + de omicron.public.diagnosis_engine NOT NULL, + + time_created TIMESTAMPTZ NOT NULL, -- UUID of the sitrep in which the case was created. created_sitrep_id UUID NOT NULL, - time_created TIMESTAMPTZ NOT NULL, + -- Time when the case was closed (if not null). 
time_closed TIMESTAMPTZ, + -- UUID of the sitrep in which the case was closed. + closed_sitrep_id UUID, + + comment TEXT NOT NULL, + + CONSTRAINT closed_case_validity CHECK ( + (closed_sitrep_id IS NULL AND time_closed IS NULL) OR + (closed_sitrep_id IS NOT NULL AND time_closed IS NOT NULL) + ), + + PRIMARY KEY (sitrep_id, id) ); +CREATE INDEX IF NOT EXISTS + lookup_fm_cases_for_sitrep +ON omicron.public.fm_case (sitrep_id); + +CREATE TABLE IF NOT EXISTS omicron.public.fm_ereport_in_case ( + -- The ereport's identity. + restart_id UUID NOT NULL, + ena INT8 NOT NULL, + + -- UUID of the case the ereport is assigned to. + case_id UUID NOT NULL, + + -- UUID of the sitrep in which this assignment exists. + sitrep_id UUID NOT NULL, + -- UUID of the sitrep in which the ereport was initially assigned to this + -- case. + assigned_sitrep_id UUID NOT NULL, + + PRIMARY KEY (sitrep_id, restart_id, ena) +); + +CREATE INDEX IF NOT EXISTS + lookup_ereports_assigned_to_fm_case +ON omicron.public.fm_ereport_in_case (sitrep_id, case_id); + + +CREATE TABLE IF NOT EXISTS omicron.public.fm_alert_request ( + -- Requested alert UUID + id UUID NOT NULL, + -- UUID of the sitrep in which the alert is requested. + sitrep_id UUID NOT NULL, + -- UUID of the sitrep in which the alert request was created. + requested_sitrep_id UUID NOT NULL, + -- UUID of the case to which this alert request belongs. + case_id UUID NOT NULL, + + -- The class of alert that was requested + alert_class omicron.public.alert_class NOT NULL, + -- Actual alert data. The structure of this depends on the alert class. + payload JSONB NOT NULL, + + PRIMARY KEY (sitrep_id, id) +); + +CREATE INDEX IF NOT EXISTS + lookup_fm_alert_requests_for_case +ON omicron.public.fm_alert_request (sitrep_id, case_id); + /* * List of datasets available to be sliced up and passed to VMMs for instance * local storage. 
From 9a39748752ec9aa41752ce52f669f3113925b28a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 12 Nov 2025 16:01:36 -0800 Subject: [PATCH 10/18] start associating cases with SPs --- nexus/db-model/src/fm/case.rs | 17 ++++++++- nexus/db-schema/src/schema.rs | 11 ++++++ nexus/fm/src/case.rs | 0 nexus/fm/src/lib.rs | 2 ++ nexus/types/src/fm.rs | 39 +++----------------- nexus/types/src/fm/case.rs | 67 +++++++++++++++++++++++++++++++++++ schema/crdb/dbinit.sql | 13 +++++++ 7 files changed, 113 insertions(+), 36 deletions(-) create mode 100644 nexus/fm/src/case.rs create mode 100644 nexus/types/src/fm/case.rs diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index cd40b6d72c2..60349aa8bce 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -6,9 +6,13 @@ use super::DiagnosisEngine; use crate::DbTypedUuid; +use crate::SpMgsSlot; +use crate::SpType; use crate::ereport; use chrono::{DateTime, Utc}; -use nexus_db_schema::schema::{fm_case, fm_ereport_in_case}; +use nexus_db_schema::schema::{ + fm_case, fm_case_impacts_sp_slot, fm_ereport_in_case, +}; use omicron_uuid_kinds::{CaseKind, EreporterRestartKind, SitrepKind}; #[derive(Queryable, Insertable, Clone, Debug, Selectable)] @@ -36,3 +40,14 @@ pub struct CaseEreport { pub sitrep_id: DbTypedUuid, pub assigned_sitrep_id: DbTypedUuid, } + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_case_impacts_sp_slot)] +pub struct CaseImpactsSp { + pub sitrep_id: DbTypedUuid, + pub case_id: DbTypedUuid, + pub sp_type: SpType, + pub sp_slot: SpMgsSlot, + pub created_sitrep_id: DbTypedUuid, + pub comment: String, +} diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 4b8550c38d9..153241ba929 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2889,6 +2889,17 @@ table! { allow_tables_to_appear_in_same_query!(fm_sitrep, fm_case); +table! 
{ + fm_case_impacts_sp_slot (sitrep_id, case_id, sp_type, sp_slot) { + sitrep_id -> Uuid, + case_id -> Uuid, + sp_type -> crate::enums::SpTypeEnum, + sp_slot -> Int4, + created_sitrep_id -> Uuid, + comment -> Text, + } +} + table! { fm_alert_request (sitrep_id, id) { id -> Uuid, diff --git a/nexus/fm/src/case.rs b/nexus/fm/src/case.rs new file mode 100644 index 00000000000..e69de29bb2d diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index b3ffa4c8dac..ce9107b8d34 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -17,6 +17,7 @@ use chrono::Utc; use std::sync::Arc; pub mod alert; +pub mod case; pub mod de; #[derive(Debug)] @@ -87,6 +88,7 @@ impl<'a> SitrepBuilder<'a> { comment: String::new(), ereports: Default::default(), alerts_requested: Default::default(), + impacted_sp_slots: Default::default(), }; entry.insert(CaseBuilder::new(&self.log, sitrep_id, case)) } diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index bf2ede82d6c..8504c5fd8df 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -13,13 +13,15 @@ pub use ereport::{Ereport, EreportId}; mod alert; pub use alert::*; +pub mod case; +pub use case::Case; + use chrono::{DateTime, Utc}; -use iddqd::{IdOrdItem, IdOrdMap}; +use iddqd::IdOrdMap; use omicron_uuid_kinds::{ CaseUuid, CollectionUuid, OmicronZoneUuid, SitrepUuid, }; use serde::{Deserialize, Serialize}; -use std::sync::Arc; /// A fault management situation report, or _sitrep_. 
/// @@ -118,39 +120,6 @@ pub struct SitrepVersion { pub time_made_current: DateTime, } -#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] -pub struct Case { - pub id: CaseUuid, - pub created_sitrep_id: SitrepUuid, - pub time_created: DateTime, - - pub closed_sitrep_id: Option, - pub time_closed: Option>, - - pub de: DiagnosisEngine, - - pub ereports: IdOrdMap>, - - pub alerts_requested: IdOrdMap, - - pub comment: String, -} - -impl Case { - pub fn is_open(&self) -> bool { - self.time_closed.is_none() - } -} - -impl IdOrdItem for Case { - type Key<'a> = &'a CaseUuid; - fn key(&self) -> Self::Key<'_> { - &self.id - } - - iddqd::id_upcast!(); -} - #[derive( Copy, Clone, diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs new file mode 100644 index 00000000000..c1842426d4d --- /dev/null +++ b/nexus/types/src/fm/case.rs @@ -0,0 +1,67 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use crate::fm::AlertRequest; +use crate::fm::DiagnosisEngine; +use crate::fm::Ereport; +use crate::inventory::SpType; +use chrono::{DateTime, Utc}; +use iddqd::{IdOrdItem, IdOrdMap}; +use omicron_uuid_kinds::{ + CaseUuid, CollectionUuid, OmicronZoneUuid, SitrepUuid, +}; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; + +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct Case { + pub id: CaseUuid, + pub created_sitrep_id: SitrepUuid, + pub time_created: DateTime, + + pub closed_sitrep_id: Option, + pub time_closed: Option>, + + pub de: DiagnosisEngine, + + pub ereports: IdOrdMap>, + + pub alerts_requested: IdOrdMap, + + pub impacted_sp_slots: IdOrdMap, + + pub comment: String, +} + +impl Case { + pub fn is_open(&self) -> bool { + self.time_closed.is_none() + } +} + +impl IdOrdItem for Case { + type Key<'a> = &'a CaseUuid; + fn key(&self) -> Self::Key<'_> { + &self.id + } + + iddqd::id_upcast!(); +} + +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct ImpactedSpSlot { + pub sp_type: SpType, + pub slot: u8, + pub created_sitrep_id: SitrepUuid, + pub comment: String, +} + +impl IdOrdItem for ImpactedSpSlot { + type Key<'a> = (&'a SpType, &'a u8); + fn key(&self) -> Self::Key<'_> { + (&self.sp_type, &self.slot) + } + + iddqd::id_upcast!(); +} diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 2ee6b4ce092..99f25b2eb78 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6894,6 +6894,19 @@ CREATE INDEX IF NOT EXISTS lookup_ereports_assigned_to_fm_case ON omicron.public.fm_ereport_in_case (sitrep_id, case_id); +CREATE TABLE IF NOT EXISTS omicron.public.fm_case_impacts_sp_slot ( + sitrep_id UUID NOT NULL, + case_id UUID NOT NULL, + -- location of this device according to MGS + sp_type omicron.public.sp_type NOT NULL, + sp_slot INT4 NOT NULL, + + -- ID of the sitrep in which this SP was added to the case. 
+ created_sitrep_id UUID NOT NULL, + comment TEXT NOT NULL, + + PRIMARY KEY (sitrep_id, case_id, sp_type, sp_slot) +); CREATE TABLE IF NOT EXISTS omicron.public.fm_alert_request ( -- Requested alert UUID From 34cae77b7d578a3b5089c5c7aa40578ac714575e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 13 Nov 2025 12:14:41 -0800 Subject: [PATCH 11/18] quick case pretty printer --- dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 7 + nexus/types/src/fm/case.rs | 258 ++++++++++++++++++++++- 2 files changed, 259 insertions(+), 6 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index 17d0e1c96ce..6f1d70c34ad 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -345,5 +345,12 @@ async fn cmd_db_sitrep_show( } } + if !cases.is_empty() { + println!("\n{:-<80}\n", "== CASES"); + for case in cases { + println!("{}", case.display_indented(4)); + } + } + Ok(()) } diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index c1842426d4d..9b9e289ffe8 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -8,10 +8,9 @@ use crate::fm::Ereport; use crate::inventory::SpType; use chrono::{DateTime, Utc}; use iddqd::{IdOrdItem, IdOrdMap}; -use omicron_uuid_kinds::{ - CaseUuid, CollectionUuid, OmicronZoneUuid, SitrepUuid, -}; +use omicron_uuid_kinds::{CaseUuid, SitrepUuid}; use serde::{Deserialize, Serialize}; +use std::fmt; use std::sync::Arc; #[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] @@ -25,10 +24,8 @@ pub struct Case { pub de: DiagnosisEngine, - pub ereports: IdOrdMap>, - + pub ereports: IdOrdMap, pub alerts_requested: IdOrdMap, - pub impacted_sp_slots: IdOrdMap, pub comment: String, @@ -38,6 +35,16 @@ impl Case { pub fn is_open(&self) -> bool { self.time_closed.is_none() } + + pub fn display_indented(&self, indent: usize) -> impl fmt::Display + '_ { + DisplayCase { case: self, indent } + } +} + +impl fmt::Display for 
Case { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.display_indented(0).fmt(f) + } } impl IdOrdItem for Case { @@ -49,6 +56,22 @@ impl IdOrdItem for Case { iddqd::id_upcast!(); } +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct CaseEreport { + pub ereport: Arc, + pub assigned_sitrep_id: SitrepUuid, + pub comment: String, +} + +impl IdOrdItem for CaseEreport { + type Key<'a> = as IdOrdItem>::Key<'a>; + fn key(&self) -> Self::Key<'_> { + self.ereport.key() + } + + iddqd::id_upcast!(); +} + #[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub struct ImpactedSpSlot { pub sp_type: SpType, @@ -65,3 +88,226 @@ impl IdOrdItem for ImpactedSpSlot { iddqd::id_upcast!(); } + +struct DisplayCase<'a> { + case: &'a Case, + indent: usize, +} + +impl fmt::Display for DisplayCase<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + const BULLET: &str = "* "; + const LIST_INDENT: usize = 4; + + let &Self { + case: + Case { + ref id, + ref created_sitrep_id, + ref time_created, + ref closed_sitrep_id, + ref time_closed, + ref de, + ref ereports, + ref alerts_requested, + ref impacted_sp_slots, + ref comment, + }, + indent, + } = self; + writeln!( + f, + "{:>indent$}case: {id:?}", + if indent > 0 { BULLET } else { "" } + )?; + writeln!(f, "{:>indent$}comment: {comment}", "")?; + writeln!(f, "{:>indent$}diagnosis engine: {de}", "")?; + writeln!(f, "{:>indent$}created in sitrep: {created_sitrep_id}", "")?; + writeln!(f, "{:>indent$} at: {time_created}", "")?; + if let Some(closed_id) = closed_sitrep_id { + writeln!(f, "{:>indent$}closed in sitrep: {closed_id}", "")?; + if let Some(time_closed) = time_closed { + writeln!(f, "{:>indent$} at: {time_closed}", "")?; + } else { + writeln!(f, "{:>indent$} at: ", "")?; + } + } + + if !ereports.is_empty() { + writeln!(f, "\n{:>indent$}ereports:", "")?; + let indent = indent + LIST_INDENT; + for CaseEreport { ereport, assigned_sitrep_id, comment } in ereports + { + 
writeln!(f, "{BULLET:>indent$}{}", ereport.id())?; + writeln!(f, "{:>indent$}class: {:?}", "", ereport.class)?; + writeln!(f, "{:>indent$}reporter: {}", "", ereport.reporter)?; + writeln!( + f, + "{:>indent$}added in sitrep: {assigned_sitrep_id}", + "" + )?; + writeln!(f, "{:>indent$}comment: {comment}", "")?; + } + } + + if !impacted_sp_slots.is_empty() { + writeln!(f, "\n{:>indent$}SP slots impacted:", "")?; + let indent = indent + LIST_INDENT; + for ImpactedSpSlot { sp_type, slot, created_sitrep_id, comment } in + impacted_sp_slots + { + writeln!(f, "{BULLET:>indent$}{sp_type:<6} {slot:02}")?; + writeln!( + f, + "{:>indent$}added in sitrep: {created_sitrep_id}", + "" + )?; + writeln!(f, "{:>indent$}comment: {comment}", "")?; + } + } + + if !alerts_requested.is_empty() { + writeln!(f, "\n{:>indent$}alerts requested:", "")?; + let indent = indent + LIST_INDENT; + for AlertRequest { id, class, requested_sitrep_id, .. } in + alerts_requested + { + writeln!(f, "{BULLET:>indent$}{id:?}")?; + writeln!(f, "{:>indent$}class: {class:?}", "")?; + writeln!( + f, + "{:>indent$}requested in sitrep: {requested_sitrep_id}", + "" + )?; + } + } + + writeln!(f)?; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::fm::{AlertClass, AlertRequest, DiagnosisEngine}; + use chrono::Utc; + use ereport_types::{Ena, EreportId}; + use omicron_uuid_kinds::{ + AlertUuid, CaseUuid, EreporterRestartUuid, OmicronZoneUuid, SitrepUuid, + }; + use std::sync::Arc; + + #[test] + fn test_case_display() { + // Create UUIDs for the case + let case_id = CaseUuid::new_v4(); + let created_sitrep_id = SitrepUuid::new_v4(); + let closed_sitrep_id = SitrepUuid::new_v4(); + let time_created = Utc::now(); + let time_closed = Utc::now(); + + // Create some ereports + let mut ereports = IdOrdMap::new(); + + let ereport1 = CaseEreport { + ereport: Arc::new(Ereport { + data: crate::fm::ereport::EreportData { + id: EreportId { + restart_id: EreporterRestartUuid::new_v4(), + ena: 
Ena::from(2u64), + }, + time_collected: time_created, + collector_id: OmicronZoneUuid::new_v4(), + serial_number: Some("BRM6900420".to_string()), + part_number: Some("913-0000037".to_string()), + class: Some("hw.pwr.remove.psu".to_string()), + report: serde_json::json!({}), + }, + reporter: crate::fm::ereport::Reporter::Sp { + sp_type: SpType::Power, + slot: 0, + }, + }), + assigned_sitrep_id: created_sitrep_id, + comment: "PSU removed".to_string(), + }; + ereports.insert_unique(ereport1).unwrap(); + + let ereport2 = CaseEreport { + ereport: Arc::new(Ereport { + data: crate::fm::ereport::EreportData { + id: EreportId { + restart_id: EreporterRestartUuid::new_v4(), + ena: Ena::from(3u64), + }, + time_collected: time_created, + collector_id: OmicronZoneUuid::new_v4(), + serial_number: Some("BRM6900420".to_string()), + part_number: Some("913-0000037".to_string()), + class: Some("hw.pwr.insert.psu".to_string()), + report: serde_json::json!({"link": "eth0", "status": "down"}), + }, + reporter: crate::fm::ereport::Reporter::Sp { + sp_type: SpType::Power, + slot: 0, + }, + }), + assigned_sitrep_id: closed_sitrep_id, + comment: "PSU inserted, closing this case".to_string(), + }; + ereports.insert_unique(ereport2).unwrap(); + + // Create some alerts + let mut alerts_requested = IdOrdMap::new(); + + let alert1 = AlertRequest { + id: AlertUuid::new_v4(), + class: AlertClass::PsuRemoved, + payload: serde_json::json!({}), + requested_sitrep_id: created_sitrep_id, + }; + alerts_requested.insert_unique(alert1).unwrap(); + + let alert2 = AlertRequest { + id: AlertUuid::new_v4(), + class: AlertClass::PsuInserted, + payload: serde_json::json!({}), + requested_sitrep_id: closed_sitrep_id, + }; + alerts_requested.insert_unique(alert2).unwrap(); + + let mut impacted_sp_slots = IdOrdMap::new(); + let slot2 = ImpactedSpSlot { + sp_type: SpType::Power, + slot: 0, + created_sitrep_id, + comment: "Power shelf 0 reduced redundancy".to_string(), + }; + 
impacted_sp_slots.insert_unique(slot2).unwrap(); + + // Create the case + let case = Case { + id: case_id, + created_sitrep_id, + time_created, + closed_sitrep_id: Some(closed_sitrep_id), + time_closed: Some(time_closed), + de: DiagnosisEngine::PowerShelf, + ereports, + alerts_requested, + impacted_sp_slots, + comment: "Power shelf rectifier added and removed here :-)" + .to_string(), + }; + + eprintln!("example case display:"); + eprintln!("====================="); + eprintln!("{case}"); + + eprintln!("example case display (indented by 4):"); + eprintln!("======================================"); + eprintln!("{}", case.display_indented(4)); + } +} From a05ce9d94ac5e4aca6b63107555b26d92e511507 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 13 Nov 2025 12:48:54 -0800 Subject: [PATCH 12/18] more case db ops --- nexus/db-model/src/fm/case.rs | 1 + nexus/db-queries/src/db/datastore/fm.rs | 13 +++++++++++-- nexus/db-schema/src/schema.rs | 2 ++ nexus/fm/src/de/power_shelf.rs | 4 ++-- nexus/fm/src/lib.rs | 12 ++++++++++-- schema/crdb/dbinit.sql | 2 ++ 6 files changed, 28 insertions(+), 6 deletions(-) diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index 60349aa8bce..80ff153f15e 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -39,6 +39,7 @@ pub struct CaseEreport { pub case_id: DbTypedUuid, pub sitrep_id: DbTypedUuid, pub assigned_sitrep_id: DbTypedUuid, + pub comment: String, } #[derive(Queryable, Insertable, Clone, Debug, Selectable)] diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 7dbb8069bfb..aab3cb17191 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -180,6 +180,8 @@ impl DataStore { for model::fm::CaseEreport { restart_id, ena: DbEna(ena), + comment, + assigned_sitrep_id, .. 
} in ereport_assignments { @@ -202,7 +204,13 @@ impl DataStore { entry.insert(Arc::new(ereport)).clone() } }; - ereports.insert_unique(ereport).unwrap(); + ereports + .insert_unique(fm::case::CaseEreport { + ereport, + assigned_sitrep_id: assigned_sitrep_id.into(), + comment, + }) + .unwrap(); } cases @@ -218,6 +226,7 @@ impl DataStore { comment: case.comment, ereports, alerts_requested, + impacted_sp_slots: Default::default(), // TODO }) .expect("case UUIDs should be unique"); } @@ -1228,7 +1237,7 @@ mod tests { ) -> Result<BTreeSet<SitrepUuid>, Error> { let mut listed_orphans = BTreeSet::new(); let mut paginator = Paginator::new( - crate::db::datastore::SQL_BATCH_SIZE, + SQL_BATCH_SIZE, dropshot::PaginationOrder::Descending, ); while let Some(p) = paginator.next() { diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 153241ba929..151c1bc65b7 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2884,6 +2884,8 @@ table! { case_id -> Uuid, sitrep_id -> Uuid, assigned_sitrep_id -> Uuid, + + comment -> Text, } } diff --git a/nexus/fm/src/de/power_shelf.rs b/nexus/fm/src/de/power_shelf.rs index bd27e633b6e..888be8b48de 100644 --- a/nexus/fm/src/de/power_shelf.rs +++ b/nexus/fm/src/de/power_shelf.rs @@ -34,7 +34,7 @@ pub fn diagnose( Some("hw.insert.psu") => { let psc_psu = extract_psc_psu(&ereport, slot, &sitrep.log); let mut case = sitrep.open_case(DiagnosisEngine::PowerShelf)?; - case.add_ereport(ereport); + case.add_ereport(ereport, "PSU inserted ereport"); case.comment = format!("PSC {slot} PSU {:?} inserted", psc_psu.psu_slot); case.request_alert(&alert::power_shelf::PsuInserted::V0 { @@ -46,7 +46,7 @@ pub fn diagnose( Some("hw.remove.psu") => { let psc_psu = extract_psc_psu(&ereport, slot, &sitrep.log); let mut case = sitrep.open_case(DiagnosisEngine::PowerShelf)?; - case.add_ereport(ereport); + case.add_ereport(ereport, "PSU removed ereport"); case.comment = format!("PSC {slot} PSU {:?} removed", psc_psu.psu_slot); 
case.request_alert(&alert::power_shelf::PsuRemoved::V0 { diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index ce9107b8d34..59e6b38e281 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -177,8 +177,16 @@ impl CaseBuilder { slog::info!(&self.log, "case closed"); } - pub fn add_ereport(&mut self, report: &Arc) { - match self.case.ereports.insert_unique(report.clone()) { + pub fn add_ereport( + &mut self, + report: &Arc, + comment: impl std::fmt::Display, + ) { + match self.case.ereports.insert_unique(fm::case::CaseEreport { + ereport: report.clone(), + assigned_sitrep_id: self.sitrep_id, + comment: comment.to_string(), + }) { Ok(_) => { slog::info!( self.log, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 99f25b2eb78..ee1ad3e0573 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6887,6 +6887,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.fm_ereport_in_case ( -- case. assigned_sitrep_id UUID NOT NULL, + comment TEXT NOT NULL, + PRIMARY KEY (sitrep_id, restart_id, ena) ); From b8d6cc806d7af14c9659a6035a4ab921d87a6aba Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 13 Nov 2025 13:02:36 -0800 Subject: [PATCH 13/18] tweak case display a bit more --- nexus/types/src/fm/case.rs | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index 9b9e289ffe8..9b16867bbb0 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -117,10 +117,9 @@ impl fmt::Display for DisplayCase<'_> { } = self; writeln!( f, - "{:>indent$}case: {id:?}", + "{:>indent$}case: {id}", if indent > 0 { BULLET } else { "" } )?; - writeln!(f, "{:>indent$}comment: {comment}", "")?; writeln!(f, "{:>indent$}diagnosis engine: {de}", "")?; writeln!(f, "{:>indent$}created in sitrep: {created_sitrep_id}", "")?; writeln!(f, "{:>indent$} at: {time_created}", "")?; @@ -133,25 +132,41 @@ impl fmt::Display for DisplayCase<'_> { } } + 
writeln!(f, "\n{:>indent$}comment: {comment}", "")?; + if !ereports.is_empty() { - writeln!(f, "\n{:>indent$}ereports:", "")?; + writeln!(f, "\n{:>indent$}ereports:\n", "")?; let indent = indent + LIST_INDENT; for CaseEreport { ereport, assigned_sitrep_id, comment } in ereports { + let pn = + ereport.part_number.as_deref().unwrap_or(""); + let sn = ereport + .serial_number + .as_deref() + .unwrap_or(""); writeln!(f, "{BULLET:>indent$}{}", ereport.id())?; - writeln!(f, "{:>indent$}class: {:?}", "", ereport.class)?; - writeln!(f, "{:>indent$}reporter: {}", "", ereport.reporter)?; + writeln!( + f, + "{:>indent$}class: {}", + "", + ereport.class.as_deref().unwrap_or("") + )?; + writeln!(f, "{:>indent$}reported by:", "")?; + + writeln!(f, "{:>indent$} location: {}", "", ereport.reporter)?; + writeln!(f, "{:>indent$} identity: {pn}:{sn}", "")?; writeln!( f, "{:>indent$}added in sitrep: {assigned_sitrep_id}", "" )?; - writeln!(f, "{:>indent$}comment: {comment}", "")?; + writeln!(f, "{:>indent$}comment: {comment}\n", "")?; } } if !impacted_sp_slots.is_empty() { - writeln!(f, "\n{:>indent$}SP slots impacted:", "")?; + writeln!(f, "\n{:>indent$}SP slots impacted:\n", "")?; let indent = indent + LIST_INDENT; for ImpactedSpSlot { sp_type, slot, created_sitrep_id, comment } in impacted_sp_slots @@ -162,21 +177,21 @@ impl fmt::Display for DisplayCase<'_> { "{:>indent$}added in sitrep: {created_sitrep_id}", "" )?; - writeln!(f, "{:>indent$}comment: {comment}", "")?; + writeln!(f, "{:>indent$}comment: {comment}\n", "")?; } } if !alerts_requested.is_empty() { - writeln!(f, "\n{:>indent$}alerts requested:", "")?; + writeln!(f, "{:>indent$}alerts requested:\n", "")?; let indent = indent + LIST_INDENT; for AlertRequest { id, class, requested_sitrep_id, .. 
} in alerts_requested { - writeln!(f, "{BULLET:>indent$}{id:?}")?; + writeln!(f, "{BULLET:>indent$}{id}")?; writeln!(f, "{:>indent$}class: {class:?}", "")?; writeln!( f, - "{:>indent$}requested in sitrep: {requested_sitrep_id}", + "{:>indent$}requested in sitrep: {requested_sitrep_id}\n", "" )?; } From bdb742bcc4e43fa2fb4e80271b1c3e691f3d5568 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 13 Nov 2025 17:01:42 -0800 Subject: [PATCH 14/18] put thing into db --- nexus/db-model/src/fm/case.rs | 97 ++++++++++++++- nexus/db-queries/src/db/datastore/fm.rs | 116 +++++++++++++++--- .../src/app/background/tasks/fm_sitrep_gc.rs | 6 +- .../app/background/tasks/fm_sitrep_load.rs | 4 +- nexus/types/src/fm.rs | 1 + 5 files changed, 200 insertions(+), 24 deletions(-) diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index 80ff153f15e..4a99133a76e 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -4,6 +4,7 @@ //! Fault management cases. +use super::AlertRequest; use super::DiagnosisEngine; use crate::DbTypedUuid; use crate::SpMgsSlot; @@ -13,7 +14,10 @@ use chrono::{DateTime, Utc}; use nexus_db_schema::schema::{ fm_case, fm_case_impacts_sp_slot, fm_ereport_in_case, }; -use omicron_uuid_kinds::{CaseKind, EreporterRestartKind, SitrepKind}; +use nexus_types::fm; +use omicron_uuid_kinds::{ + CaseKind, EreporterRestartKind, SitrepKind, SitrepUuid, +}; #[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = fm_case)] @@ -52,3 +56,94 @@ pub struct CaseImpactsSp { pub created_sitrep_id: DbTypedUuid, pub comment: String, } + +#[derive(Clone, Debug)] +pub struct Case { + pub metadata: CaseMetadata, + pub ereports: Vec, + pub impacted_sp_slots: Vec, + pub alerts_requested: Vec, +} + +impl Case { + pub fn from_sitrep(sitrep_id: SitrepUuid, case: fm::Case) -> Self { + let sitrep_id = sitrep_id.into(); + let case_id = case.id.into(); + let ereports = case + .ereports + .into_iter() + .map( + 
|fm::case::CaseEreport { + ereport, + assigned_sitrep_id, + comment, + }| { + let restart_id = ereport.id().restart_id.into(); + let ena = ereport.id().ena.into(); + CaseEreport { + case_id, + restart_id, + ena, + comment, + sitrep_id, + assigned_sitrep_id: assigned_sitrep_id.into(), + } + }, + ) + .collect(); + let impacted_sp_slots = case + .impacted_sp_slots + .into_iter() + .map( + |fm::case::ImpactedSpSlot { + sp_type, + slot, + comment, + created_sitrep_id, + }| CaseImpactsSp { + sitrep_id, + case_id, + sp_type: sp_type.into(), + sp_slot: SpMgsSlot::from(slot as u16), + created_sitrep_id: created_sitrep_id.into(), + comment, + }, + ) + .collect(); + let alerts_requested = case + .alerts_requested + .into_iter() + .map( + |fm::AlertRequest { + id, + class, + payload, + requested_sitrep_id, + }| AlertRequest { + sitrep_id, + case_id, + class: class.into(), + id: id.into(), + payload, + requested_sitrep_id: requested_sitrep_id.into(), + }, + ) + .collect(); + + Self { + metadata: CaseMetadata { + id: case_id, + sitrep_id, + de: case.de.into(), + created_sitrep_id: case.created_sitrep_id.into(), + time_created: case.time_created.into(), + time_closed: case.time_closed.map(Into::into), + closed_sitrep_id: case.closed_sitrep_id.map(Into::into), + comment: case.comment, + }, + ereports, + impacted_sp_slots, + alerts_requested, + } + } +} diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index aab3cb17191..0f27458daf9 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -33,6 +33,7 @@ use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::DbConnection; use nexus_db_schema::schema::fm_alert_request::dsl as alert_req_dsl; use nexus_db_schema::schema::fm_case::dsl as case_dsl; +use nexus_db_schema::schema::fm_case_impacts_sp_slot::dsl as impacted_sp_dsl; use nexus_db_schema::schema::fm_ereport_in_case::dsl as case_ereport_dsl; use 
nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl; @@ -130,7 +131,8 @@ impl DataStore { Ok(Some((version, sitrep))) } - /// Reads the entire content of the sitrep with the provided ID, if one exists. + /// Reads the entire content of the sitrep with the provided ID, if one + /// exists. pub async fn fm_sitrep_read( &self, opctx: &OpContext, @@ -356,16 +358,27 @@ impl DataStore { pub async fn fm_sitrep_insert( &self, opctx: &OpContext, - sitrep: &Sitrep, + sitrep: Sitrep, ) -> Result<(), InsertSitrepError> { let conn = self.pool_connection_authorized(opctx).await?; // TODO(eliza): there should probably be an authz object for the fm sitrep? opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + let sitrep_id = sitrep.id(); + // Create the sitrep metadata record. + // + // NOTE: we must insert this record before anything else, because it's + // how orphaned sitreps are found when performing garbage collection. + // Were we to first insert some other records and insert the metadata + // record *last*, we could die when we have inserted some sitrep data + // but have yet to create the metadata record. If this occurs, those + // records could not be easily found by the garbage collection task. + // Those (unused) records would then be permanently leaked without + // manual human intervention to delete them. diesel::insert_into(sitrep_dsl::fm_sitrep) - .values(model::SitrepMetadata::from(sitrep.metadata.clone())) + .values(model::SitrepMetadata::from(sitrep.metadata)) .execute_async(&*conn) .await .map_err(|e| { @@ -373,10 +386,77 @@ impl DataStore { .internal_context("failed to insert sitrep metadata record") })?; - // TODO(eliza): other sitrep records would be inserted here... + // Create case records. 
+ let mut cases = Vec::with_capacity(sitrep.cases.len()); + for case in sitrep.cases { + // TODO(eliza): some of this could be done in parallel using a + // `ParallelTaskSet`, if the time it takes to insert a sitrep were + // to become important? + let model::fm::Case { + metadata, + ereports, + alerts_requested, + impacted_sp_slots, + } = model::fm::Case::from_sitrep(sitrep_id, case); + + if !ereports.is_empty() { + diesel::insert_into(case_ereport_dsl::fm_ereport_in_case) + .values(ereports) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to insert ereport records for case {}", + metadata.id + )) + })?; + } + + if !alerts_requested.is_empty() { + diesel::insert_into(alert_req_dsl::fm_alert_request) + .values(alerts_requested) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to insert ereport alert requests for case {}", + metadata.id + )) + })?; + } + + if !impacted_sp_slots.is_empty() { + diesel::insert_into(impacted_sp_dsl::fm_case_impacts_sp_slot) + .values(impacted_sp_slots) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to insert impacted SP slots for case {}", + metadata.id + )) + })?; + } + + cases.push(metadata); + } + + if !cases.is_empty() { + diesel::insert_into(case_dsl::fm_case) + .values(cases) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to insert case records") + })?; + } // Now, try to make the sitrep current. 
- let query = Self::insert_sitrep_version_query(sitrep.id()); + let query = Self::insert_sitrep_version_query(sitrep_id); query .execute_async(&*conn) .await @@ -387,7 +467,7 @@ impl DataStore { ) if info.message() == Self::PARENT_NOT_CURRENT_ERROR_MESSAGE => { - InsertSitrepError::ParentNotCurrent(sitrep.id()) + InsertSitrepError::ParentNotCurrent(sitrep_id) } err => { let err = @@ -943,7 +1023,7 @@ mod tests { cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep.clone()).await.unwrap(); let current = datastore .fm_sitrep_read_current(&opctx) @@ -962,7 +1042,7 @@ mod tests { // Trying to insert the same sitrep again should fail. let err = - datastore.fm_sitrep_insert(&opctx, &sitrep).await.unwrap_err(); + datastore.fm_sitrep_insert(&opctx, sitrep.clone()).await.unwrap_err(); assert!(err.to_string().contains("duplicate key")); // Clean up. @@ -989,7 +1069,7 @@ mod tests { }, cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep1.clone()).await.unwrap(); // Create a second sitrep with the first as parent let sitrep2 = nexus_types::fm::Sitrep { @@ -1003,7 +1083,7 @@ mod tests { }, cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep2).await.expect( + datastore.fm_sitrep_insert(&opctx, sitrep2.clone()).await.expect( "inserting a sitrep whose parent is current should succeed", ); @@ -1044,7 +1124,7 @@ mod tests { }, cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep1.clone()).await.unwrap(); // Try to insert a sitrep with a non-existent parent ID let nonexistent_id = SitrepUuid::new_v4(); @@ -1060,7 +1140,7 @@ mod tests { cases: Default::default(), }; - let result = datastore.fm_sitrep_insert(&opctx, &sitrep2).await; + let result = datastore.fm_sitrep_insert(&opctx, sitrep2).await; // Should fail 
with ParentNotCurrent error match result { @@ -1094,7 +1174,7 @@ mod tests { }, cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep1.clone()).await.unwrap(); // Create a second sitrep with the first as parent let sitrep2 = nexus_types::fm::Sitrep { @@ -1108,7 +1188,7 @@ mod tests { }, cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep2).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep2.clone()).await.unwrap(); // Try to create a third sitrep with sitrep1 (outdated) as parent. // This should fail, as sitrep2 is now the current sitrep. @@ -1123,7 +1203,7 @@ mod tests { }, cases: Default::default(), }; - let result = datastore.fm_sitrep_insert(&opctx, &sitrep3).await; + let result = datastore.fm_sitrep_insert(&opctx, sitrep3.clone()).await; // Should fail with ParentNotCurrent error match result { @@ -1165,7 +1245,7 @@ mod tests { cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep1) + .fm_sitrep_insert(&opctx, sitrep1.clone()) .await .expect("inserting initial sitrep should succeed"); @@ -1206,7 +1286,7 @@ mod tests { cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep2) + .fm_sitrep_insert(&opctx, sitrep2.clone()) .await .expect("inserting child sitrep should succeed"); @@ -1269,7 +1349,7 @@ mod tests { }, cases: Default::default(), }; - match datastore.fm_sitrep_insert(&opctx, &sitrep).await { + match datastore.fm_sitrep_insert(&opctx, sitrep).await { Ok(_) => { panic!("inserting sitrep v{v} orphan {i} should not succeed") } diff --git a/nexus/src/app/background/tasks/fm_sitrep_gc.rs b/nexus/src/app/background/tasks/fm_sitrep_gc.rs index 7295e3c2459..372ae80c6a7 100644 --- a/nexus/src/app/background/tasks/fm_sitrep_gc.rs +++ b/nexus/src/app/background/tasks/fm_sitrep_gc.rs @@ -155,7 +155,7 @@ mod tests { cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep1) + 
.fm_sitrep_insert(&opctx, sitrep1.clone()) .await .expect("inserting initial sitrep should succeed"); @@ -178,7 +178,7 @@ mod tests { cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep2) + .fm_sitrep_insert(&opctx, sitrep2.clone()) .await .expect("inserting child sitrep should succeed"); @@ -271,7 +271,7 @@ mod tests { }, cases: Default::default(), }; - match datastore.fm_sitrep_insert(&opctx, &sitrep).await { + match datastore.fm_sitrep_insert(&opctx, sitrep).await { Ok(_) => { panic!("inserting sitrep v{v} orphan {i} should not succeed") } diff --git a/nexus/src/app/background/tasks/fm_sitrep_load.rs b/nexus/src/app/background/tasks/fm_sitrep_load.rs index 723a96bf3b7..2e7efeec849 100644 --- a/nexus/src/app/background/tasks/fm_sitrep_load.rs +++ b/nexus/src/app/background/tasks/fm_sitrep_load.rs @@ -227,7 +227,7 @@ mod test { cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep1) + .fm_sitrep_insert(&opctx, sitrep1.clone()) .await .expect("sitrep should be inserted successfully"); @@ -292,7 +292,7 @@ mod test { cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep2) + .fm_sitrep_insert(&opctx, sitrep2.clone()) .await .expect("sitrep2 should be inserted successfully"); diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index 8504c5fd8df..52ce4842323 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -44,6 +44,7 @@ pub struct Sitrep { /// ID, and which Nexus produced it. 
pub metadata: SitrepMetadata, pub cases: IdOrdMap, + // pub cases_by_sp: } impl Sitrep { From 911816bf814c29f8eb3f6f3282d27952b7245004 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 13 Nov 2025 17:02:08 -0800 Subject: [PATCH 15/18] now with '<-- this sitrep' technology --- dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 2 +- ereport/types/src/lib.rs | 2 +- nexus/types/src/fm/case.rs | 49 ++++++++++++++++-------- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index 6f1d70c34ad..4fe302465de 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -348,7 +348,7 @@ async fn cmd_db_sitrep_show( if !cases.is_empty() { println!("\n{:-<80}\n", "== CASES"); for case in cases { - println!("{}", case.display_indented(4)); + println!("{}", case.display_indented(4, Some(id))); } } diff --git a/ereport/types/src/lib.rs b/ereport/types/src/lib.rs index d06f5f2fc3d..440a3dd78e5 100644 --- a/ereport/types/src/lib.rs +++ b/ereport/types/src/lib.rs @@ -122,7 +122,7 @@ pub struct EreportId { impl fmt::Display for EreportId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{:?}:{:x}", self.restart_id, self.ena) + write!(f, "{}:{:x}", self.restart_id, self.ena.0) } } diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index 9b16867bbb0..3ab5f37b23c 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -36,14 +36,14 @@ impl Case { self.time_closed.is_none() } - pub fn display_indented(&self, indent: usize) -> impl fmt::Display + '_ { - DisplayCase { case: self, indent } + pub fn display_indented(&self, indent: usize, sitrep_id: Option) -> impl fmt::Display + '_ { + DisplayCase { case: self, indent, sitrep_id } } } impl fmt::Display for Case { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.display_indented(0).fmt(f) + self.display_indented(0, None).fmt(f) } } 
@@ -92,6 +92,7 @@ impl IdOrdItem for ImpactedSpSlot { struct DisplayCase<'a> { case: &'a Case, indent: usize, + sitrep_id: Option, } impl fmt::Display for DisplayCase<'_> { @@ -114,17 +115,28 @@ impl fmt::Display for DisplayCase<'_> { ref comment, }, indent, + sitrep_id, } = self; + + let this_sitrep = move |s| { + if Some(s) == sitrep_id { + " <-- this sitrep" + } else { + "" + } + }; + writeln!( f, - "{:>indent$}case: {id}", + "{:>indent$}case {id}", if indent > 0 { BULLET } else { "" } )?; + writeln!(f, "{:>indent$}-----------------------------------------", "")?; writeln!(f, "{:>indent$}diagnosis engine: {de}", "")?; - writeln!(f, "{:>indent$}created in sitrep: {created_sitrep_id}", "")?; + writeln!(f, "{:>indent$}created in sitrep: {created_sitrep_id}{}", "", this_sitrep(*created_sitrep_id))?; writeln!(f, "{:>indent$} at: {time_created}", "")?; if let Some(closed_id) = closed_sitrep_id { - writeln!(f, "{:>indent$}closed in sitrep: {closed_id}", "")?; + writeln!(f, "{:>indent$}closed in sitrep: {closed_id}{}", "", this_sitrep(*closed_id))?; if let Some(time_closed) = time_closed { writeln!(f, "{:>indent$} at: {time_closed}", "")?; } else { @@ -145,7 +157,7 @@ impl fmt::Display for DisplayCase<'_> { .serial_number .as_deref() .unwrap_or(""); - writeln!(f, "{BULLET:>indent$}{}", ereport.id())?; + writeln!(f, "{BULLET:>indent$}ereport {}", ereport.id())?; writeln!( f, "{:>indent$}class: {}", @@ -158,8 +170,9 @@ impl fmt::Display for DisplayCase<'_> { writeln!(f, "{:>indent$} identity: {pn}:{sn}", "")?; writeln!( f, - "{:>indent$}added in sitrep: {assigned_sitrep_id}", - "" + "{:>indent$}added in sitrep: {assigned_sitrep_id}{}", + "", + this_sitrep(*assigned_sitrep_id) )?; writeln!(f, "{:>indent$}comment: {comment}\n", "")?; } @@ -174,8 +187,9 @@ impl fmt::Display for DisplayCase<'_> { writeln!(f, "{BULLET:>indent$}{sp_type:<6} {slot:02}")?; writeln!( f, - "{:>indent$}added in sitrep: {created_sitrep_id}", - "" + "{:>indent$}added in sitrep: {created_sitrep_id}{}", 
+ "", + this_sitrep(*created_sitrep_id) )?; writeln!(f, "{:>indent$}comment: {comment}\n", "")?; } @@ -187,12 +201,13 @@ impl fmt::Display for DisplayCase<'_> { for AlertRequest { id, class, requested_sitrep_id, .. } in alerts_requested { - writeln!(f, "{BULLET:>indent$}{id}")?; + writeln!(f, "{BULLET:>indent$}alert {id}")?; writeln!(f, "{:>indent$}class: {class:?}", "")?; writeln!( f, - "{:>indent$}requested in sitrep: {requested_sitrep_id}\n", - "" + "{:>indent$}requested in sitrep: {requested_sitrep_id}{}\n", + "", + this_sitrep(*requested_sitrep_id) )?; } } @@ -318,11 +333,11 @@ mod tests { }; eprintln!("example case display:"); - eprintln!("====================="); + eprintln!("=====================\n"); eprintln!("{case}"); eprintln!("example case display (indented by 4):"); - eprintln!("======================================"); - eprintln!("{}", case.display_indented(4)); + eprintln!("======================================\n"); + eprintln!("{}", case.display_indented(4, Some(closed_sitrep_id))); } } From d1a7a8db074145d203e143393583531d52ad210d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 14 Nov 2025 10:06:04 -0800 Subject: [PATCH 16/18] format tweaks etc --- nexus/db-queries/src/db/datastore/fm.rs | 6 ++-- nexus/types/src/fm/case.rs | 40 +++++++++++++++++-------- nexus/types/src/fm/ereport.rs | 17 +++++------ 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 0f27458daf9..b8984ac4235 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -1041,8 +1041,10 @@ mod tests { assert_eq!(sitrep.metadata.comment, current_sitrep.metadata.comment); // Trying to insert the same sitrep again should fail. 
- let err = - datastore.fm_sitrep_insert(&opctx, sitrep.clone()).await.unwrap_err(); + let err = datastore + .fm_sitrep_insert(&opctx, sitrep.clone()) + .await + .unwrap_err(); assert!(err.to_string().contains("duplicate key")); // Clean up. diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index 3ab5f37b23c..a616872a4d6 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -36,7 +36,11 @@ impl Case { self.time_closed.is_none() } - pub fn display_indented(&self, indent: usize, sitrep_id: Option) -> impl fmt::Display + '_ { + pub fn display_indented( + &self, + indent: usize, + sitrep_id: Option, + ) -> impl fmt::Display + '_ { DisplayCase { case: self, indent, sitrep_id } } } @@ -119,11 +123,7 @@ impl fmt::Display for DisplayCase<'_> { } = self; let this_sitrep = move |s| { - if Some(s) == sitrep_id { - " <-- this sitrep" - } else { - "" - } + if Some(s) == sitrep_id { " <-- this sitrep" } else { "" } }; writeln!( @@ -131,12 +131,26 @@ impl fmt::Display for DisplayCase<'_> { "{:>indent$}case {id}", if indent > 0 { BULLET } else { "" } )?; - writeln!(f, "{:>indent$}-----------------------------------------", "")?; + writeln!( + f, + "{:>indent$}-----------------------------------------", + "" + )?; writeln!(f, "{:>indent$}diagnosis engine: {de}", "")?; - writeln!(f, "{:>indent$}created in sitrep: {created_sitrep_id}{}", "", this_sitrep(*created_sitrep_id))?; + writeln!( + f, + "{:>indent$}created in sitrep: {created_sitrep_id}{}", + "", + this_sitrep(*created_sitrep_id) + )?; writeln!(f, "{:>indent$} at: {time_created}", "")?; if let Some(closed_id) = closed_sitrep_id { - writeln!(f, "{:>indent$}closed in sitrep: {closed_id}{}", "", this_sitrep(*closed_id))?; + writeln!( + f, + "{:>indent$}closed in sitrep: {closed_id}{}", + "", + this_sitrep(*closed_id) + )?; if let Some(time_closed) = time_closed { writeln!(f, "{:>indent$} at: {time_closed}", "")?; } else { @@ -172,7 +186,7 @@ impl fmt::Display for DisplayCase<'_> { f, 
"{:>indent$}added in sitrep: {assigned_sitrep_id}{}", "", - this_sitrep(*assigned_sitrep_id) + this_sitrep(*assigned_sitrep_id) )?; writeln!(f, "{:>indent$}comment: {comment}\n", "")?; } @@ -184,12 +198,12 @@ impl fmt::Display for DisplayCase<'_> { for ImpactedSpSlot { sp_type, slot, created_sitrep_id, comment } in impacted_sp_slots { - writeln!(f, "{BULLET:>indent$}{sp_type:<6} {slot:02}")?; + writeln!(f, "{BULLET:>indent$}{sp_type:<6} {slot}")?; writeln!( f, "{:>indent$}added in sitrep: {created_sitrep_id}{}", "", - this_sitrep(*created_sitrep_id) + this_sitrep(*created_sitrep_id) )?; writeln!(f, "{:>indent$}comment: {comment}\n", "")?; } @@ -207,7 +221,7 @@ impl fmt::Display for DisplayCase<'_> { f, "{:>indent$}requested in sitrep: {requested_sitrep_id}{}\n", "", - this_sitrep(*requested_sitrep_id) + this_sitrep(*requested_sitrep_id) )?; } } diff --git a/nexus/types/src/fm/ereport.rs b/nexus/types/src/fm/ereport.rs index 17426a70179..cc3273fcef9 100644 --- a/nexus/types/src/fm/ereport.rs +++ b/nexus/types/src/fm/ereport.rs @@ -69,18 +69,17 @@ pub enum Reporter { impl fmt::Display for Reporter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Display format based on: + // https://rfd.shared.oxide.computer/rfd/200#_labeling match self { - Self::Sp { sp_type: SpType::Sled, slot } => { - write!(f, "Sled (SP) {slot:02}") - } - Self::Sp { sp_type: SpType::Switch, slot } => { - write!(f, "Switch {slot}") - } - Self::Sp { sp_type: SpType::Power, slot } => { - write!(f, "PSC {slot}") + Self::Sp { sp_type: sp_type @ SpType::Sled, slot } => { + write!(f, "{sp_type} {slot:<2} (SP)") } Self::HostOs { sled } => { - write!(f, "Sled (OS) {sled:?}") + write!(f, "{} {sled:?} (OS)", SpType::Sled) + } + Self::Sp { sp_type, slot } => { + write!(f, "{sp_type} {slot}") } } } From fc3eb68f7834abc70cd58e007448e95003f4a8b0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 14 Nov 2025 11:46:29 -0800 Subject: [PATCH 17/18] try to make impact lists usable --- 
nexus/db-model/src/fm/diagnosis_engine.rs | 12 +- nexus/fm/src/case.rs | 270 ++++++++++++++++++++++ nexus/fm/src/de.rs | 12 + nexus/fm/src/de/power_shelf.rs | 68 ++++-- nexus/fm/src/lib.rs | 171 +------------- nexus/types/src/fm.rs | 2 +- nexus/types/src/fm/case.rs | 14 +- 7 files changed, 358 insertions(+), 191 deletions(-) diff --git a/nexus/db-model/src/fm/diagnosis_engine.rs b/nexus/db-model/src/fm/diagnosis_engine.rs index 7d4523fa74e..7d354142bbb 100644 --- a/nexus/db-model/src/fm/diagnosis_engine.rs +++ b/nexus/db-model/src/fm/diagnosis_engine.rs @@ -27,24 +27,24 @@ impl_enum_type!( ); -impl From for fm::DiagnosisEngine { +impl From for fm::DiagnosisEngineKind { fn from(de: DiagnosisEngine) -> Self { match de { - DiagnosisEngine::PowerShelf => fm::DiagnosisEngine::PowerShelf, + DiagnosisEngine::PowerShelf => fm::DiagnosisEngineKind::PowerShelf, } } } -impl From for DiagnosisEngine { - fn from(fm_de: fm::DiagnosisEngine) -> Self { +impl From for DiagnosisEngine { + fn from(fm_de: fm::DiagnosisEngineKind) -> Self { match fm_de { - fm::DiagnosisEngine::PowerShelf => DiagnosisEngine::PowerShelf, + fm::DiagnosisEngineKind::PowerShelf => DiagnosisEngine::PowerShelf, } } } impl fmt::Display for DiagnosisEngine { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fm::DiagnosisEngine::from(*self).fmt(f) + fm::DiagnosisEngineKind::from(*self).fmt(f) } } diff --git a/nexus/fm/src/case.rs b/nexus/fm/src/case.rs index e69de29bb2d..f9c23ae3136 100644 --- a/nexus/fm/src/case.rs +++ b/nexus/fm/src/case.rs @@ -0,0 +1,270 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use crate::alert; +use anyhow::Context; +use chrono::Utc; +use iddqd::id_ord_map::{self, IdOrdMap}; +use nexus_types::fm; +use nexus_types::inventory::SpType; +use omicron_uuid_kinds::AlertUuid; +use omicron_uuid_kinds::CaseUuid; +use omicron_uuid_kinds::SitrepUuid; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +#[derive(Debug)] +pub struct CaseBuilder { + pub log: slog::Logger, + pub case: fm::Case, + pub sitrep_id: SitrepUuid, +} + +#[derive(Debug)] +pub struct AllCases { + log: slog::Logger, + sitrep_id: SitrepUuid, + pub cases: IdOrdMap, +} + +impl AllCases { + pub fn new( + log: slog::Logger, + sitrep_id: SitrepUuid, + parent_sitrep: Option<&fm::Sitrep>, + ) -> (Self, ImpactLists) { + // Copy forward any open cases from the parent sitrep. + // If a case was closed in the parent sitrep, skip it. + let mut cases_by_sp: HashMap<_, HashSet> = HashMap::new(); + let cases: IdOrdMap<_> = parent_sitrep + .iter() + .flat_map(|s| s.open_cases()) + .map(|case| { + for sp in &case.impacted_sp_slots { + cases_by_sp + .entry((sp.sp_type, sp.slot)) + .or_default() + .insert(case.id.clone()); + } + CaseBuilder::new(&log, sitrep_id, case.clone()) + }) + .collect(); + + let cases = Self { log, sitrep_id, cases }; + let impact_lists = ImpactLists { cases_by_sp }; + (cases, impact_lists) + } + + pub fn open_case( + &mut self, + de: fm::DiagnosisEngineKind, + ) -> anyhow::Result> { + let id = CaseUuid::new_v4(); + let sitrep_id = self.sitrep_id; + let case = match self.cases.entry(&id) { + iddqd::id_ord_map::Entry::Occupied(_) => { + panic!("generated a colliding UUID!") + } + iddqd::id_ord_map::Entry::Vacant(entry) => { + let case = fm::Case { + id, + created_sitrep_id: self.sitrep_id, + time_created: chrono::Utc::now(), + closed_sitrep_id: None, + time_closed: None, + de, + comment: String::new(), + ereports: Default::default(), + alerts_requested: Default::default(), + impacted_sp_slots: Default::default(), + }; + entry.insert(CaseBuilder::new(&self.log, 
sitrep_id, case)) + } + }; + + slog::info!( + self.log, + "opened case {id:?}"; + "case_id" => ?id, + "de" => %de + ); + + Ok(case) + } + + pub fn case(&self, id: CaseUuid) -> Option<&CaseBuilder> { + self.cases.get(&id) + } + + pub fn case_mut( + &mut self, + id: CaseUuid, + ) -> Option> { + self.cases.get_mut(&id) + } +} + +impl CaseBuilder { + fn new(log: &slog::Logger, sitrep_id: SitrepUuid, case: fm::Case) -> Self { + let log = log.new(slog::o!( + "case_id" => format!("{:?}", case.id), + "de" => case.de.to_string(), + "created_sitrep_id" => format!("{:?}", case.created_sitrep_id), + )); + Self { log, case, sitrep_id } + } + + pub fn request_alert( + &mut self, + alert: &A, + ) -> anyhow::Result<()> { + let id = AlertUuid::new_v4(); + let class = A::CLASS; + let req = fm::AlertRequest { + id, + class, + requested_sitrep_id: self.sitrep_id, + payload: serde_json::to_value(&alert).with_context(|| { + format!( + "failed to serialize payload for {class:?} alert {alert:?}" + ) + })?, + }; + self.case.alerts_requested.insert_unique(req).map_err(|_| { + anyhow::anyhow!("an alert with ID {id:?} already exists") + })?; + + slog::info!( + &self.log, + "requested an alert"; + "alert_id" => ?id, + "alert_class" => ?class, + ); + + Ok(()) + } + + pub fn close(&mut self) { + self.case.time_closed = Some(Utc::now()); + self.case.closed_sitrep_id = Some(self.sitrep_id); + + slog::info!(&self.log, "case closed"); + } + + pub fn add_ereport( + &mut self, + report: &Arc, + comment: impl std::fmt::Display, + ) { + match self.case.ereports.insert_unique(fm::case::CaseEreport { + ereport: report.clone(), + assigned_sitrep_id: self.sitrep_id, + comment: comment.to_string(), + }) { + Ok(_) => { + slog::info!( + self.log, + "assigned ereport {} to case", report.id(); + "ereport_id" => ?report.id(), + "ereport_class" => ?report.class, + ); + } + Err(_) => { + slog::warn!( + self.log, + "ereport {} already assigned to case", report.id(); + "ereport_id" => ?report.id(), + "ereport_class" 
=> ?report.class, + ); + } + } + } + + pub fn impacts_sp( + &mut self, + impact_lists: &mut ImpactLists, + sp_type: SpType, + slot: u16, + comment: impl ToString, + ) -> anyhow::Result<()> { + if self.impacted_sp_slots.contains_key(&(sp_type, slot)) { + return Err(anyhow::anyhow!("case already impacts this SP")); + } + + impact_lists + .cases_by_sp + .entry((sp_type, slot)) + .or_default() + .insert(self.id); + + let comment = comment.to_string(); + slog::info!( + &self.log, + "case impacts SP"; + "sp_type" => %sp_type, + "slot" => %slot, + "comment" => %comment, + ); + let created_sitrep_id = self.sitrep_id; + self.impacted_sp_slots + .insert_unique(fm::case::ImpactedSpSlot { + sp_type, + slot, + created_sitrep_id, + comment: comment.to_string(), + }) + .expect( + "we just checked that there wasn't already an entry for this \ + SP slot", + ); + + Ok(()) + } +} + +impl From for fm::Case { + fn from(CaseBuilder { case, .. }: CaseBuilder) -> Self { + case + } +} + +impl core::ops::Deref for CaseBuilder { + type Target = fm::Case; + fn deref(&self) -> &Self::Target { + &self.case + } +} + +impl core::ops::DerefMut for CaseBuilder { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.case + } +} + +impl iddqd::IdOrdItem for CaseBuilder { + type Key<'a> = &'a CaseUuid; + fn key(&self) -> Self::Key<'_> { + &self.case.id + } + + iddqd::id_upcast!(); +} + +#[derive(Debug)] +pub struct ImpactLists { + cases_by_sp: HashMap<(SpType, u16), HashSet>, +} + +impl ImpactLists { + pub fn cases_impacting_sp( + &self, + sp_type: SpType, + slot: u16, + ) -> impl Iterator + '_ { + self.cases_by_sp + .get(&(sp_type, slot)) + .into_iter() + .flat_map(|ids| ids.iter().copied()) + } +} diff --git a/nexus/fm/src/de.rs b/nexus/fm/src/de.rs index 0d1d2580785..c006ec97d39 100644 --- a/nexus/fm/src/de.rs +++ b/nexus/fm/src/de.rs @@ -4,4 +4,16 @@ //! 
Diagnosis engines +use crate::SitrepBuilder; +use nexus_types::fm; pub mod power_shelf; +use std::sync::Arc; + +pub trait DiagnosisEngine { + fn kind(&self) -> fm::DiagnosisEngineKind; + fn analyze_ereport( + &mut self, + sitrep: &mut SitrepBuilder<'_>, + ereport: &Arc, + ) -> anyhow::Result<()>; +} diff --git a/nexus/fm/src/de/power_shelf.rs b/nexus/fm/src/de/power_shelf.rs index 888be8b48de..ca52948659e 100644 --- a/nexus/fm/src/de/power_shelf.rs +++ b/nexus/fm/src/de/power_shelf.rs @@ -4,9 +4,10 @@ //! Power shelf diagnosis +use super::DiagnosisEngine; use crate::SitrepBuilder; use crate::alert; -use nexus_types::fm::DiagnosisEngine; +use nexus_types::fm::DiagnosisEngineKind; use nexus_types::fm::Ereport; use nexus_types::fm::ereport; use nexus_types::inventory::SpType; @@ -14,39 +15,78 @@ use serde::de::DeserializeOwned; use serde_json::Value; use std::sync::Arc; -pub fn diagnose( - sitrep: &mut SitrepBuilder<'_>, - new_ereports: &[Arc], -) -> anyhow::Result<()> { - for ereport in new_ereports { +pub struct PowerShelfDiagnosis { + log: slog::Logger, + // TODO(eliza): does this need/want any internal state? +} + +impl PowerShelfDiagnosis { + pub fn new(log: &slog::Logger) -> Self { + Self { log: log.new(slog::o!("de" => "power_shelf")) } + } +} + +impl DiagnosisEngine for PowerShelfDiagnosis { + fn kind(&self) -> DiagnosisEngineKind { + DiagnosisEngineKind::PowerShelf + } + + fn analyze_ereport( + &mut self, + sitrep: &mut SitrepBuilder<'_>, + ereport: &Arc, + ) -> anyhow::Result<()> { // Skip non-power shelf reports let ereport::Reporter::Sp { sp_type: SpType::Power, slot } = ereport.reporter else { - continue; + slog::debug!( + self.log, + "skipping ereport that was not reported by a power shelf"; + "ereport_id" => %ereport.id, + "reporter" => %ereport.reporter, + ); + return Ok(()); }; - // TODO: check for existing cases tracked for this power shelf and see - // if the ereport is related to them... 
- match ereport.data.class.as_deref() { // PSU inserted Some("hw.insert.psu") => { + // TODO: Check for existing cases tracked for this power shelf + // and see if the ereport is related to them. + let psc_psu = extract_psc_psu(&ereport, slot, &sitrep.log); - let mut case = sitrep.open_case(DiagnosisEngine::PowerShelf)?; + let mut case = + sitrep.cases.open_case(DiagnosisEngineKind::PowerShelf)?; case.add_ereport(ereport, "PSU inserted ereport"); case.comment = format!("PSC {slot} PSU {:?} inserted", psc_psu.psu_slot); case.request_alert(&alert::power_shelf::PsuInserted::V0 { psc_psu, })?; + case.impacts_sp( + &mut sitrep.impact_lists, + SpType::Power, + slot, + "this is the PSC on the power shelf where the PSU was inserted", + )?; // Nothing else to do at this time. case.close(); } Some("hw.remove.psu") => { + // TODO: Check for existing cases tracked for this power shelf + // and see if the ereport is related to them. + let psc_psu = extract_psc_psu(&ereport, slot, &sitrep.log); - let mut case = sitrep.open_case(DiagnosisEngine::PowerShelf)?; + let mut case = + sitrep.cases.open_case(DiagnosisEngineKind::PowerShelf)?; case.add_ereport(ereport, "PSU removed ereport"); + case.impacts_sp( + &mut sitrep.impact_lists, + SpType::Power, + slot, + "this is the PSC on the power shelf where the PSU was inserted", + )?; case.comment = format!("PSC {slot} PSU {:?} removed", psc_psu.psu_slot); case.request_alert(&alert::power_shelf::PsuRemoved::V0 { @@ -72,9 +112,9 @@ pub fn diagnose( ); } } - } - Ok(()) + Ok(()) + } } fn extract_psc_psu( diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index 59e6b38e281..7e7ea8c879d 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -6,18 +6,14 @@ use nexus_types::fm; use nexus_types::inventory; -use omicron_uuid_kinds::AlertUuid; -use omicron_uuid_kinds::CaseUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SitrepUuid; use slog::Logger; // use std::fmt::Write; -use anyhow::Context; -use chrono::Utc; 
-use std::sync::Arc; pub mod alert; pub mod case; +pub use case::CaseBuilder; pub mod de; #[derive(Debug)] @@ -26,7 +22,8 @@ pub struct SitrepBuilder<'a> { pub inventory: &'a inventory::Collection, pub parent_sitrep: Option<&'a fm::Sitrep>, pub sitrep_id: SitrepUuid, - pub cases: iddqd::IdOrdMap, + pub cases: case::AllCases, + pub impact_lists: case::ImpactLists, comment: String, } @@ -43,18 +40,13 @@ impl<'a> SitrepBuilder<'a> { "inv_collection_id" => format!("{:?}", inventory.id), )); - // Copy forward any open cases from the parent sitrep. - // If a case was closed in the parent sitrep, skip it. - let cases: iddqd::IdOrdMap<_> = parent_sitrep - .iter() - .flat_map(|s| s.open_cases()) - .map(|case| CaseBuilder::new(&log, sitrep_id, case.clone())) - .collect(); + let (cases, impact_lists) = + case::AllCases::new(log.clone(), sitrep_id, parent_sitrep); slog::info!( &log, "preparing sitrep {sitrep_id:?}"; - "existing_open_cases" => cases.len(), + "existing_open_cases" => cases.cases.len(), ); SitrepBuilder { @@ -64,46 +56,10 @@ impl<'a> SitrepBuilder<'a> { parent_sitrep, comment: String::new(), cases, + impact_lists, } } - pub fn open_case( - &mut self, - de: fm::DiagnosisEngine, - ) -> anyhow::Result> { - let id = CaseUuid::new_v4(); - let sitrep_id = self.sitrep_id; - let case = match self.cases.entry(&id) { - iddqd::id_ord_map::Entry::Occupied(_) => { - panic!("generated a colliding UUID!") - } - iddqd::id_ord_map::Entry::Vacant(entry) => { - let case = fm::Case { - id, - created_sitrep_id: self.sitrep_id, - time_created: chrono::Utc::now(), - closed_sitrep_id: None, - time_closed: None, - de, - comment: String::new(), - ereports: Default::default(), - alerts_requested: Default::default(), - impacted_sp_slots: Default::default(), - }; - entry.insert(CaseBuilder::new(&self.log, sitrep_id, case)) - } - }; - - slog::info!( - self.log, - "opened case {id:?}"; - "case_id" => ?id, - "de" => %de - ); - - Ok(case) - } - pub fn build(self, creator_id: OmicronZoneUuid) -> 
fm::Sitrep { fm::Sitrep { metadata: fm::SitrepMetadata { @@ -115,6 +71,7 @@ impl<'a> SitrepBuilder<'a> { time_created: chrono::Utc::now(), }, cases: self + .cases .cases .into_iter() .map(|builder| fm::Case::from(builder)) @@ -122,115 +79,3 @@ impl<'a> SitrepBuilder<'a> { } } } - -#[derive(Debug)] -pub struct CaseBuilder { - pub log: slog::Logger, - pub case: fm::Case, - pub sitrep_id: SitrepUuid, -} - -impl CaseBuilder { - fn new(log: &slog::Logger, sitrep_id: SitrepUuid, case: fm::Case) -> Self { - let log = log.new(slog::o!( - "case_id" => format!("{:?}", case.id), - "de" => case.de.to_string(), - "created_sitrep_id" => format!("{:?}", case.created_sitrep_id), - )); - Self { log, case, sitrep_id } - } - - pub fn request_alert( - &mut self, - alert: &A, - ) -> anyhow::Result<()> { - let id = AlertUuid::new_v4(); - let class = A::CLASS; - let req = fm::AlertRequest { - id, - class, - requested_sitrep_id: self.sitrep_id, - payload: serde_json::to_value(&alert).with_context(|| { - format!( - "failed to serialize payload for {class:?} alert {alert:?}" - ) - })?, - }; - self.case.alerts_requested.insert_unique(req).map_err(|_| { - anyhow::anyhow!("an alert with ID {id:?} already exists") - })?; - - slog::info!( - &self.log, - "requested an alert"; - "alert_id" => ?id, - "alert_class" => ?class, - ); - - Ok(()) - } - - pub fn close(&mut self) { - self.case.time_closed = Some(Utc::now()); - self.case.closed_sitrep_id = Some(self.sitrep_id); - - slog::info!(&self.log, "case closed"); - } - - pub fn add_ereport( - &mut self, - report: &Arc, - comment: impl std::fmt::Display, - ) { - match self.case.ereports.insert_unique(fm::case::CaseEreport { - ereport: report.clone(), - assigned_sitrep_id: self.sitrep_id, - comment: comment.to_string(), - }) { - Ok(_) => { - slog::info!( - self.log, - "assigned ereport {} to case", report.id(); - "ereport_id" => ?report.id(), - "ereport_class" => ?report.class, - ); - } - Err(_) => { - slog::warn!( - self.log, - "ereport {} already 
assigned to case", report.id(); - "ereport_id" => ?report.id(), - "ereport_class" => ?report.class, - ); - } - } - } -} - -impl From for fm::Case { - fn from(CaseBuilder { case, .. }: CaseBuilder) -> Self { - case - } -} - -impl core::ops::Deref for CaseBuilder { - type Target = fm::Case; - fn deref(&self) -> &Self::Target { - &self.case - } -} - -impl core::ops::DerefMut for CaseBuilder { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.case - } -} - -impl iddqd::IdOrdItem for CaseBuilder { - type Key<'a> = &'a CaseUuid; - fn key(&self) -> Self::Key<'_> { - &self.case.id - } - - iddqd::id_upcast!(); -} diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index 52ce4842323..bedc71b301a 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -134,6 +134,6 @@ pub struct SitrepVersion { )] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] -pub enum DiagnosisEngine { +pub enum DiagnosisEngineKind { PowerShelf, } diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index a616872a4d6..40a7d023d9d 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -3,7 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use crate::fm::AlertRequest; -use crate::fm::DiagnosisEngine; +use crate::fm::DiagnosisEngineKind; use crate::fm::Ereport; use crate::inventory::SpType; use chrono::{DateTime, Utc}; @@ -22,7 +22,7 @@ pub struct Case { pub closed_sitrep_id: Option, pub time_closed: Option>, - pub de: DiagnosisEngine, + pub de: DiagnosisEngineKind, pub ereports: IdOrdMap, pub alerts_requested: IdOrdMap, @@ -79,15 +79,15 @@ impl IdOrdItem for CaseEreport { #[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub struct ImpactedSpSlot { pub sp_type: SpType, - pub slot: u8, + pub slot: u16, pub created_sitrep_id: SitrepUuid, pub comment: String, } impl IdOrdItem for ImpactedSpSlot { - type Key<'a> = (&'a SpType, &'a u8); + type Key<'a> = (SpType, u16); fn key(&self) -> Self::Key<'_> { - (&self.sp_type, &self.slot) + (self.sp_type, self.slot) } iddqd::id_upcast!(); @@ -235,7 +235,7 @@ impl fmt::Display for DisplayCase<'_> { #[cfg(test)] mod tests { use super::*; - use crate::fm::{AlertClass, AlertRequest, DiagnosisEngine}; + use crate::fm::{AlertClass, AlertRequest, DiagnosisEngineKind}; use chrono::Utc; use ereport_types::{Ena, EreportId}; use omicron_uuid_kinds::{ @@ -338,7 +338,7 @@ mod tests { time_created, closed_sitrep_id: Some(closed_sitrep_id), time_closed: Some(time_closed), - de: DiagnosisEngine::PowerShelf, + de: DiagnosisEngineKind::PowerShelf, ereports, alerts_requested, impacted_sp_slots, From bff107aa15072b1d7f4b5f4e3d22293d3b056a58 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 14 Nov 2025 13:04:07 -0800 Subject: [PATCH 18/18] BLURG --- nexus/fm/src/case.rs | 14 ++++ nexus/fm/src/de.rs | 6 ++ nexus/fm/src/de/power_shelf.rs | 102 +++++++++++++++++++++--- nexus/fm/src/ereport_analysis.rs | 128 +++++++++++++++++++++++++++++++ nexus/fm/src/lib.rs | 1 + 5 files changed, 240 insertions(+), 11 deletions(-) create mode 100644 nexus/fm/src/ereport_analysis.rs diff --git a/nexus/fm/src/case.rs b/nexus/fm/src/case.rs index f9c23ae3136..b8e313a7c1e 100644 --- 
a/nexus/fm/src/case.rs +++ b/nexus/fm/src/case.rs @@ -221,6 +221,20 @@ impl CaseBuilder { Ok(()) } + + /// Returns an iterator over all ereports that were assigned to this case in + /// the current sitrep. + pub fn new_ereports( + &self, + ) -> impl Iterator> + '_ { + self.ereports.iter().filter_map(|ereport| { + if ereport.assigned_sitrep_id == self.sitrep_id { + Some(&ereport.ereport) + } else { + None + } + }) + } } impl From for fm::Case { diff --git a/nexus/fm/src/de.rs b/nexus/fm/src/de.rs index c006ec97d39..d9f08569357 100644 --- a/nexus/fm/src/de.rs +++ b/nexus/fm/src/de.rs @@ -11,9 +11,15 @@ use std::sync::Arc; pub trait DiagnosisEngine { fn kind(&self) -> fm::DiagnosisEngineKind; + fn analyze_ereport( &mut self, sitrep: &mut SitrepBuilder<'_>, ereport: &Arc, ) -> anyhow::Result<()>; + + fn process_cases( + &mut self, + sitrep: &mut SitrepBuilder<'_>, + ) -> anyhow::Result<()>; } diff --git a/nexus/fm/src/de/power_shelf.rs b/nexus/fm/src/de/power_shelf.rs index ca52948659e..c01ceb7cecb 100644 --- a/nexus/fm/src/de/power_shelf.rs +++ b/nexus/fm/src/de/power_shelf.rs @@ -7,6 +7,7 @@ use super::DiagnosisEngine; use crate::SitrepBuilder; use crate::alert; +use crate::ereport_analysis; use nexus_types::fm::DiagnosisEngineKind; use nexus_types::fm::Ereport; use nexus_types::fm::ereport; @@ -115,6 +116,13 @@ impl DiagnosisEngine for PowerShelfDiagnosis { Ok(()) } + + fn process_cases( + &mut self, + sitrep: &mut SitrepBuilder<'_>, + ) -> anyhow::Result<()> { + todo!() + } } fn extract_psc_psu( @@ -140,17 +148,7 @@ fn extract_psu_id( ereport: &Ereport, log: &slog::Logger, ) -> alert::power_shelf::PsuIdentity { - // These are the same field names that Hubris uses in the ereport. 
See: - // https://github.com/oxidecomputer/hubris/blob/ec18e4f11aaa14600c61f67335c32b250ef38269/drv/psc-seq-server/src/main.rs#L1107-L1117 - #[derive(serde::Deserialize, Default)] - struct Fruid { - mfr: Option, - mpn: Option, - serial: Option, - fw_rev: Option, - } - - let Fruid { mfr, mpn, serial, fw_rev } = + let PsuFruid { mfr, mpn, serial, fw_rev } = grab_json_value(ereport, "fruid", &ereport.report, log) .unwrap_or_default(); @@ -195,3 +193,85 @@ fn grab_json_value( } } } + +#[derive(Debug, Eq, PartialEq, serde::Deserialize)] +struct PscEreport { + #[serde(flatten)] + metadata: ereport_analysis::HubrisMetadata, + #[serde(flatten)] + class: EreportClass, +} + +#[derive(Debug, Eq, PartialEq, serde::Deserialize)] +#[serde(tag = "k")] +enum EreportClass { + #[serde(rename = "hw.insert.psu")] + PsuInserted { + #[serde(flatten)] + ereport: PsuInsertedEreport, + }, + #[serde(rename = "hw.remove.psu")] + PsuRemoved { + #[serde(flatten)] + ereport: PsuInsertedEreport, + }, + #[serde(rename = "hw.pwr.pwr_good.bad")] + PwrBad { + #[serde(flatten)] + ereport: PwrGoodEreport, + }, +} + +#[derive(Debug, Eq, PartialEq, serde::Deserialize)] +struct PsuInsertedEreport { + refdes: String, + rail: String, + slot: u8, + fruid: PsuFruid, +} + +#[derive(Debug, Eq, PartialEq, serde::Deserialize)] +struct PwrGoodEreport { + refdes: String, + rail: String, + slot: u8, + fruid: PsuFruid, + pmbus_status: PmbusStatus, +} + +// These are the same field names that Hubris uses in the ereport. See: +// https://github.com/oxidecomputer/hubris/blob/ec18e4f11aaa14600c61f67335c32b250ef38269/drv/psc-seq-server/src/main.rs#L1107-L1117 +#[derive(serde::Deserialize, Debug, PartialEq, Eq, Default)] +struct PsuFruid { + mfr: Option, + mpn: Option, + serial: Option, + fw_rev: Option, +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq, serde::Deserialize)] +// TODO(eliza): bitflags types for these? 
+struct PmbusStatus { + word: Option, + input: Option, + iout: Option, + vout: Option, + temp: Option, + cml: Option, + mfr: Option, +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_pwr_bad_ereport() { + let json_value: serde_json::Value = + serde_json::from_str(ereport_analysis::test::PSU_PWR_BAD_JSON) + .expect("JSON should parse"); + let ereport: PscEreport = serde_json::from_value(dbg!(json_value)) + .expect("JSON value should be interpretable"); + eprintln!("{ereport:?}"); + } +} diff --git a/nexus/fm/src/ereport_analysis.rs b/nexus/fm/src/ereport_analysis.rs new file mode 100644 index 00000000000..64eef9b64a4 --- /dev/null +++ b/nexus/fm/src/ereport_analysis.rs @@ -0,0 +1,128 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Ereport analysis tools. + +/// Metadata that should be present in *all* hubris ereports. +#[derive(Clone, Debug, Eq, PartialEq, Hash, serde::Deserialize)] +pub(crate) struct HubrisMetadata { + pub hubris_archive_id: String, + pub hubris_task_gen: u16, + pub hubris_task_name: String, + pub hubris_uptime_ms: u64, + // Added by MGS + pub ereport_message_version: u8, +} + +#[cfg(test)] +pub(crate) mod test { + use super::*; + + // These are real life ereports I copied from the dogfood rack. 
+ pub(crate) const PSU_REMOVE_JSON: &str = r#"{ + "baseboard_part_number": "913-0000003", + "baseboard_rev": 8, + "baseboard_serial_number": "BRM45220004", + "ereport_message_version": 0, + "fruid": { + "fw_rev": "0701", + "mfr": "Murata-PS", + "mpn": "MWOCP68-3600-D-RM", + "serial": "LL2216RB003Z" + }, + "hubris_archive_id": "qSm4IUtvQe0", + "hubris_task_gen": 0, + "hubris_task_name": "sequencer", + "hubris_uptime_ms": 1197337481, + "k": "hw.remove.psu", + "rail": "V54_PSU4", + "refdes": "PSU4", + "slot": 4, + "v": 0 + }"#; + + pub(crate) const PSU_INSERT_JSON: &str = r#"{ + "baseboard_part_number": "913-0000003", + "baseboard_rev": 8, + "baseboard_serial_number": "BRM45220004", + "ereport_message_version": 0, + "fruid": { + "fw_rev": "0701", + "mfr": "Murata-PS", + "mpn": "MWOCP68-3600-D-RM", + "serial": "LL2216RB003Z" + }, + "hubris_archive_id": "qSm4IUtvQe0", + "hubris_task_gen": 0, + "hubris_task_name": "sequencer", + "hubris_uptime_ms": 1197337481, + "k": "hw.remove.psu", + "rail": "V54_PSU4", + "refdes": "PSU4", + "slot": 4, + "v": 0 + }"#; + + pub(crate) const PSU_PWR_BAD_JSON: &str = r#"{ + "baseboard_part_number": "913-0000003", + "baseboard_rev": 8, + "baseboard_serial_number": "BRM45220004", + "ereport_message_version": 0, + "fruid": { + "fw_rev": "0701", + "mfr": "Murata-PS", + "mpn": "MWOCP68-3600-D-RM", + "serial": "LL2216RB003Z" + }, + "hubris_archive_id": "qSm4IUtvQe0", + "hubris_task_gen": 0, + "hubris_task_name": "sequencer", + "hubris_uptime_ms": 1197408566, + "k": "hw.pwr.pwr_good.bad", + "pmbus_status": { + "cml": 0, + "input": 48, + "iout": 0, + "mfr": 0, + "temp": 0, + "vout": 0, + "word": 10312 + }, + "rail": "V54_PSU4", + "refdes": "PSU4", + "slot": 4, + "v": 0 + }"#; + + #[test] + fn test_hubris_metadata() { + let expected_metadata = HubrisMetadata { + hubris_archive_id: "qSm4IUtvQe0".to_string(), + hubris_task_gen: 0, + hubris_task_name: "sequencer".to_string(), + hubris_uptime_ms: 0, + ereport_message_version: 0, + }; + let ereports = [ 
+ (PSU_REMOVE_JSON, 1197337481), + (PSU_INSERT_JSON, 1197337481), + (PSU_PWR_BAD_JSON, 1197408566), + ]; + + for (json, hubris_uptime_ms) in ereports { + let json_value: serde_json::Value = + serde_json::from_str(json).expect("JSON should parse"); + let metadata: HubrisMetadata = + serde_json::from_value(dbg!(json_value)) + .expect("value should contain a HubrisMetadata"); + assert_eq!( + metadata, + HubrisMetadata { + hubris_uptime_ms, + ..expected_metadata.clone() + } + ); + } + } +} diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index 7e7ea8c879d..b25f351f2d5 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -15,6 +15,7 @@ pub mod alert; pub mod case; pub use case::CaseBuilder; pub mod de; +pub mod ereport_analysis; #[derive(Debug)] pub struct SitrepBuilder<'a> {