From 35641ca9878f53b91c4ed16cbfeba5cecb19f607 Mon Sep 17 00:00:00 2001
From: "Matt LaFayette (Kurjanowicz)"
Date: Mon, 1 Dec 2025 23:19:01 +0000
Subject: [PATCH 1/5] underhill_core: specify which CPUs have outstanding IO, not just if they have interrupts

---
 openhcl/openhcl_boot/src/host_params/dt/mod.rs      |  3 +++
 openhcl/underhill_core/src/dispatch/mod.rs          | 11 ++++-------
 .../underhill_core/src/loader/vtl2_config/mod.rs    |  2 ++
 openhcl/underhill_core/src/nvme_manager/mod.rs      |  1 +
 .../underhill_core/src/nvme_manager/save_restore.rs | 13 -------------
 vm/loader/loader_defs/src/shim.rs                   | 13 +++++++++++++
 6 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/openhcl/openhcl_boot/src/host_params/dt/mod.rs b/openhcl/openhcl_boot/src/host_params/dt/mod.rs
index 62aa2669e5..3449fd8be2 100644
--- a/openhcl/openhcl_boot/src/host_params/dt/mod.rs
+++ b/openhcl/openhcl_boot/src/host_params/dt/mod.rs
@@ -379,6 +379,7 @@ struct PartitionTopology {
 struct PersistedPartitionTopology {
     topology: PartitionTopology,
     cpus_with_mapped_interrupts: Vec<u32>,
+    cpus_with_outstanding_io: Vec<u32>,
 }
 
 // Calculate the default mmio size for VTL2 when not specified by the host.
@@ -615,6 +616,7 @@ fn topology_from_persisted_state(
         partition_memory,
         partition_mmio,
         cpus_with_mapped_interrupts,
+        cpus_with_outstanding_io,
     } = parsed_protobuf;
 
     // FUTURE: should memory allocation mode should persist in saved state and
@@ -764,6 +766,7 @@ fn topology_from_persisted_state(
             memory_allocation_mode,
         },
         cpus_with_mapped_interrupts,
+        cpus_with_outstanding_io,
     })
 }
 
diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs
index dd2ede7f25..64f21692f4 100644
--- a/openhcl/underhill_core/src/dispatch/mod.rs
+++ b/openhcl/underhill_core/src/dispatch/mod.rs
@@ -627,17 +627,14 @@ impl LoadedVm {
         };
 
         // Save the persisted state used by the next openhcl_boot.
- let cpus_with_mapped_interrupts = match state + let nvme_vp_interrupt_state = crate::nvme_manager::save_restore_helpers::cpus_with_interrupts(state .init_state - .nvme_state - .as_ref() { - Some(nvme_state) => crate::nvme_manager::save_restore::cpus_with_interrupts(&nvme_state.nvme_state), - None => vec![], - }; + .nvme_state.as_ref().map(|n| &n.nvme_state)); crate::loader::vtl2_config::write_persisted_info( self.runtime_params.parsed_openhcl_boot(), - cpus_with_mapped_interrupts, + nvme_vp_interrupt_state.vps_with_mapped_interrupts, + nvme_vp_interrupt_state.vps_with_outstanding_io ) .context("failed to write persisted info")?; diff --git a/openhcl/underhill_core/src/loader/vtl2_config/mod.rs b/openhcl/underhill_core/src/loader/vtl2_config/mod.rs index 1e0df0c048..ea75a93f4c 100644 --- a/openhcl/underhill_core/src/loader/vtl2_config/mod.rs +++ b/openhcl/underhill_core/src/loader/vtl2_config/mod.rs @@ -204,6 +204,7 @@ impl Drop for Vtl2ParamsMap<'_> { pub fn write_persisted_info( parsed: &ParsedBootDtInfo, cpus_with_mapped_interrupts: Vec, + cpus_with_outstanding_io: Vec, ) -> anyhow::Result<()> { use loader_defs::shim::PersistedStateHeader; use loader_defs::shim::save_restore::MemoryEntry; @@ -249,6 +250,7 @@ pub fn write_persisted_info( }) .collect(), cpus_with_mapped_interrupts, + cpus_with_outstanding_io, }; let protobuf = mesh_protobuf::encode(state); diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 83b3b9e49d..244f91fff1 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -58,6 +58,7 @@ use vmcore::vm_task::VmTaskDriverSource; pub mod device; pub mod manager; pub mod save_restore; +pub mod save_restore_helpers; #[derive(Debug, Error)] #[error("nvme device {pci_id} error")] diff --git a/openhcl/underhill_core/src/nvme_manager/save_restore.rs b/openhcl/underhill_core/src/nvme_manager/save_restore.rs index eb0a05d680..f66bbd3166 100644 --- a/openhcl/underhill_core/src/nvme_manager/save_restore.rs +++ b/openhcl/underhill_core/src/nvme_manager/save_restore.rs @@ -2,7 +2,6 @@ // Licensed under the MIT License. use mesh::payload::Protobuf; -use std::collections::BTreeSet; use vmcore::save_restore::SavedStateRoot; #[derive(Protobuf, SavedStateRoot)] @@ -22,15 +21,3 @@ pub struct NvmeSavedDiskConfig { #[mesh(2)] pub driver_state: nvme_driver::NvmeDriverSavedState, } - -/// Returns a sorted list of CPU IDs that have mapped device interrupts in the saved NVMe state. -/// -/// This information is used to make heuristic decisions during restore, such as whether to -/// disable sidecar for VMs with active device interrupts. -pub fn cpus_with_interrupts(state: &NvmeManagerSavedState) -> Vec { - let mut cpus_with_interrupts = BTreeSet::new(); - for disk in &state.nvme_disks { - cpus_with_interrupts.extend(disk.driver_state.worker_data.io.iter().map(|q| q.cpu)); - } - cpus_with_interrupts.into_iter().collect() -} diff --git a/vm/loader/loader_defs/src/shim.rs b/vm/loader/loader_defs/src/shim.rs index 158c686704..37f5fbe3b8 100644 --- a/vm/loader/loader_defs/src/shim.rs +++ b/vm/loader/loader_defs/src/shim.rs @@ -346,5 +346,18 @@ pub mod save_restore { /// still be functionally correct. #[mesh(3)] pub cpus_with_mapped_interrupts: Vec, + /// The list of CPUs with mapped device interrupts present at save time, + /// and that have outstanding IO on that CPU. 
+        ///
+        /// While this list is today used as a semaphore (either there are device
+        /// interrupts mapped or not), in the future it may be used to provide more
+        /// granular restore hints. E.g., only start the CPUs with active
+        /// interrupts right away and defer other CPU startup until later.
+        ///
+        /// DEFAULT: For save state from prior versions, this will be empty.
+        /// This is fine: the restore heuristics might be less optimal, but will
+        /// still be functionally correct.
+        #[mesh(4)]
+        pub cpus_with_outstanding_io: Vec<u32>,
     }
 }

From dba16abf51af1e965128f0ce74317a044a1e19c Mon Sep 17 00:00:00 2001
From: "Matt LaFayette (Kurjanowicz)"
Date: Mon, 1 Dec 2025 23:19:33 +0000
Subject: [PATCH 2/5] oops, don't forget this file

---
 .../src/nvme_manager/save_restore_helpers.rs | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs

diff --git a/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs
new file mode 100644
index 0000000000..7ff794b613
--- /dev/null
+++ b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs
@@ -0,0 +1,57 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+use crate::nvme_manager::save_restore::NvmeManagerSavedState;
+use std::collections::BTreeMap;
+use std::collections::btree_map::Entry;
+
+/// Useful state about how the VM's vCPUs interacted with NVMe device interrupts at the time of save.
+///
+/// This information is used to make heuristic decisions during restore, such as whether to
+/// disable sidecar for VMs with active device interrupts.
+pub struct VPInterruptState {
+    /// List of vCPUs with any mapped device interrupts, sorted by CPU ID.
+    pub vps_with_mapped_interrupts: Vec<u32>,
+
+    /// List of vCPUs with outstanding I/O at the time of save, sorted by CPU ID.
+    /// It is expected that this is a subset of `vps_with_mapped_interrupts`, since
+    /// only some queues will have in-flight I/O.
+    pub vps_with_outstanding_io: Vec<u32>,
+}
+
+/// Analyzes the saved NVMe manager state to determine which vCPUs had mapped device interrupts
+/// and which had outstanding I/O at the time of save.
+///
+/// See [`VPInterruptState`] for more details.
+pub fn cpus_with_interrupts(state: Option<&NvmeManagerSavedState>) -> VPInterruptState { + let mut vp_state = BTreeMap::new(); + + if let Some(state) = state { + for disk in &state.nvme_disks { + for q in &disk.driver_state.worker_data.io { + match vp_state.entry(q.cpu) { + Entry::Vacant(e) => { + e.insert(!q.queue_data.handler_data.pending_cmds.commands.is_empty()); + } + Entry::Occupied(mut e) => { + e.insert( + e.get() | !q.queue_data.handler_data.pending_cmds.commands.is_empty(), + ); + } + } + } + } + } + + VPInterruptState { + vps_with_mapped_interrupts: vp_state.keys().cloned().collect(), + vps_with_outstanding_io: vp_state + .iter() + .filter_map( + |(&vp, &has_outstanding_io)| { + if has_outstanding_io { Some(vp) } else { None } + }, + ) + .collect(), + } +} From fdee11fede877610b0aa3cb20cf7eba6de082f03 Mon Sep 17 00:00:00 2001 From: "Matt LaFayette (Kurjanowicz)" Date: Tue, 2 Dec 2025 17:58:56 +0000 Subject: [PATCH 3/5] copilot pr feedback + unit tests --- Cargo.lock | 1 + openhcl/underhill_core/Cargo.toml | 1 + openhcl/underhill_core/src/dispatch/mod.rs | 2 +- .../src/nvme_manager/save_restore_helpers.rs | 161 +++++++++++++++++- .../disk_nvme/nvme_driver/src/driver.rs | 7 + .../storage/disk_nvme/nvme_driver/src/lib.rs | 1 + 6 files changed, 169 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 496372549b..795cdce3ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7979,6 +7979,7 @@ dependencies = [ "netvsp", "nvme_driver", "nvme_resources", + "nvme_spec", "openhcl_attestation_protocol", "openhcl_dma_manager", "pal", diff --git a/openhcl/underhill_core/Cargo.toml b/openhcl/underhill_core/Cargo.toml index 2cd7cf94f5..c68b2f5b3c 100644 --- a/openhcl/underhill_core/Cargo.toml +++ b/openhcl/underhill_core/Cargo.toml @@ -179,6 +179,7 @@ firmware_pcat.workspace = true [dev-dependencies] guest_emulation_transport = { workspace = true, features = ["test_utilities"] } test_with_tracing.workspace = true +nvme_spec.workspace = true [build-dependencies] build_rs_guest_arch.workspace = true diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 64f21692f4..5cc58c4c33 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -634,7 +634,7 @@ impl LoadedVm { crate::loader::vtl2_config::write_persisted_info( self.runtime_params.parsed_openhcl_boot(), nvme_vp_interrupt_state.vps_with_mapped_interrupts, - nvme_vp_interrupt_state.vps_with_outstanding_io + nvme_vp_interrupt_state.vps_with_outstanding_io, ) .context("failed to write persisted info")?; diff --git a/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs index 7ff794b613..266746db4d 100644 --- a/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs +++ b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs @@ -34,9 +34,7 @@ pub fn cpus_with_interrupts(state: Option<&NvmeManagerSavedState>) -> VPInterrup e.insert(!q.queue_data.handler_data.pending_cmds.commands.is_empty()); } Entry::Occupied(mut e) => { - e.insert( - e.get() | !q.queue_data.handler_data.pending_cmds.commands.is_empty(), - ); + *e.get_mut() |= !q.queue_data.handler_data.pending_cmds.commands.is_empty(); } } } @@ -55,3 +53,160 @@ pub fn cpus_with_interrupts(state: Option<&NvmeManagerSavedState>) -> VPInterrup .collect(), } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::nvme_manager::save_restore::{NvmeManagerSavedState, NvmeSavedDiskConfig}; + 
use nvme_driver::NvmeDriverSavedState; + use nvme_driver::save_restore::{ + CompletionQueueSavedState, IoQueueSavedState, NvmeDriverWorkerSavedState, + PendingCommandSavedState, PendingCommandsSavedState, QueueHandlerSavedState, + QueuePairSavedState, SubmissionQueueSavedState, + }; + use nvme_spec as spec; + use zerocopy::FromZeros; + + #[test] + fn returns_empty_when_state_absent() { + let result = cpus_with_interrupts(None); + assert!(result.vps_with_mapped_interrupts.is_empty()); + assert!(result.vps_with_outstanding_io.is_empty()); + } + + #[test] + fn collects_unique_sorted_vps_and_outstanding_subset() { + let state = build_state(vec![ + vec![QueueSpec::new(2, false), QueueSpec::new(1, true)], + vec![QueueSpec::new(1, false), QueueSpec::new(3, true)], + ]); + + let result = cpus_with_interrupts(Some(&state)); + + assert_eq!(result.vps_with_mapped_interrupts, vec![1, 2, 3]); + assert_eq!(result.vps_with_outstanding_io, vec![1, 3]); + } + + #[test] + fn reports_outstanding_if_any_queue_pending_for_vp() { + let state = build_state(vec![vec![ + QueueSpec::new(4, false), + QueueSpec::new(4, true), + ]]); + + let result = cpus_with_interrupts(Some(&state)); + + assert_eq!(result.vps_with_mapped_interrupts, vec![4]); + assert_eq!(result.vps_with_outstanding_io, vec![4]); + } + + #[test] + fn handles_state_with_no_disks() { + let state = NvmeManagerSavedState { + cpu_count: 0, + nvme_disks: Vec::new(), + }; + + let result = cpus_with_interrupts(Some(&state)); + + assert!(result.vps_with_mapped_interrupts.is_empty()); + assert!(result.vps_with_outstanding_io.is_empty()); + } + + struct QueueSpec { + cpu: u32, + has_outstanding_io: bool, + } + + impl QueueSpec { + const fn new(cpu: u32, has_outstanding_io: bool) -> Self { + Self { + cpu, + has_outstanding_io, + } + } + } + + // Helper to fabricate NVMe manager save-state snapshots with specific CPU/IO mappings. + fn build_state(disk_queue_specs: Vec>) -> NvmeManagerSavedState { + NvmeManagerSavedState { + cpu_count: 0, // Not relevant for these tests. + nvme_disks: disk_queue_specs + .into_iter() + .enumerate() + .map(|(disk_index, queues)| NvmeSavedDiskConfig { + pci_id: format!("0000:{disk_index:02x}.0"), + driver_state: NvmeDriverSavedState { + identify_ctrl: spec::IdentifyController::new_zeroed(), + device_id: format!("disk{disk_index}"), + namespaces: Vec::new(), + worker_data: NvmeDriverWorkerSavedState { + admin: None, + io: queues + .into_iter() + .enumerate() + .map(|(queue_index, spec)| { + // Tests only care about per-disk affinity, so queue IDs can + // restart from zero for each disk without losing coverage. 
+ build_io_queue( + queue_index as u16, + spec.cpu, + spec.has_outstanding_io, + ) + }) + .collect(), + qsize: 0, + max_io_queues: 0, + }, + }, + }) + .collect(), + } + } + + fn build_io_queue(qid: u16, cpu: u32, outstanding: bool) -> IoQueueSavedState { + IoQueueSavedState { + cpu, + iv: qid as u32, + queue_data: QueuePairSavedState { + mem_len: 0, + base_pfn: 0, + qid, + sq_entries: 1, + cq_entries: 1, + handler_data: QueueHandlerSavedState { + sq_state: SubmissionQueueSavedState { + sqid: qid, + head: 0, + tail: 0, + committed_tail: 0, + len: 1, + }, + cq_state: CompletionQueueSavedState { + cqid: qid, + head: 0, + committed_head: 0, + len: 1, + phase: false, + }, + pending_cmds: build_pending_cmds(outstanding), + aer_handler: None, + }, + }, + } + } + + fn build_pending_cmds(outstanding: bool) -> PendingCommandsSavedState { + PendingCommandsSavedState { + commands: if outstanding { + vec![PendingCommandSavedState { + command: spec::Command::new_zeroed(), + }] + } else { + Vec::new() + }, + next_cid_high_bits: 0, + cid_key_bits: 0, + } + } +} diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index b11c092231..11bc7b99cd 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -1392,6 +1392,8 @@ impl InspectTask for DriverWorkerTask { } } +/// Save/restore data structures exposed by the NVMe driver. +#[allow(missing_docs)] pub mod save_restore { use super::*; @@ -1493,6 +1495,7 @@ pub mod save_restore { pub aer_handler: Option, } + /// Snapshot of submission queue metadata captured during save. #[derive(Protobuf, Clone, Debug)] #[mesh(package = "nvme_driver")] pub struct SubmissionQueueSavedState { @@ -1508,6 +1511,7 @@ pub mod save_restore { pub len: u32, } + /// Snapshot of completion queue metadata captured during save. #[derive(Protobuf, Clone, Debug)] #[mesh(package = "nvme_driver")] pub struct CompletionQueueSavedState { @@ -1524,6 +1528,7 @@ pub mod save_restore { pub phase: bool, } + /// Pending command entry captured from a queue handler. #[derive(Protobuf, Clone, Debug)] #[mesh(package = "nvme_driver")] pub struct PendingCommandSavedState { @@ -1531,6 +1536,7 @@ pub mod save_restore { pub command: spec::Command, } + /// Collection of pending commands indexed by CID. #[derive(Protobuf, Clone, Debug)] #[mesh(package = "nvme_driver")] pub struct PendingCommandsSavedState { @@ -1552,6 +1558,7 @@ pub mod save_restore { pub identify_ns: nvme_spec::nvm::IdentifyNamespace, } + /// Saved Async Event Request handler metadata. 
#[derive(Clone, Debug, Protobuf)] #[mesh(package = "nvme_driver")] pub struct AerHandlerSavedState { diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs index 3efc12dd39..6662417313 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs @@ -14,6 +14,7 @@ mod registers; mod tests; pub use self::driver::NvmeDriver; +pub use self::driver::save_restore; pub use self::driver::save_restore::Error; pub use self::driver::save_restore::NvmeDriverSavedState; pub use self::namespace::Namespace; From cf127f2709f89061af91ecfbe0af81b4ffc9d97e Mon Sep 17 00:00:00 2001 From: "Matt LaFayette (Kurjanowicz)" Date: Tue, 2 Dec 2025 18:37:59 +0000 Subject: [PATCH 4/5] pr feedback --- .../openhcl_boot/src/host_params/dt/mod.rs | 11 +++-- openhcl/underhill_core/src/dispatch/mod.rs | 6 +-- .../src/loader/vtl2_config/mod.rs | 7 ++- .../src/nvme_manager/save_restore_helpers.rs | 48 +++++++++++-------- .../disk_nvme/nvme_driver/src/driver.rs | 2 +- vm/loader/loader_defs/src/shim.rs | 17 ++----- 6 files changed, 47 insertions(+), 44 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dt/mod.rs b/openhcl/openhcl_boot/src/host_params/dt/mod.rs index 3449fd8be2..8a08b2b5bf 100644 --- a/openhcl/openhcl_boot/src/host_params/dt/mod.rs +++ b/openhcl/openhcl_boot/src/host_params/dt/mod.rs @@ -378,7 +378,7 @@ struct PartitionTopology { #[derive(Debug, PartialEq, Eq)] struct PersistedPartitionTopology { topology: PartitionTopology, - cpus_with_mapped_interrupts: Vec, + cpus_with_mapped_interrupts_no_io: Vec, cpus_with_outstanding_io: Vec, } @@ -615,7 +615,7 @@ fn topology_from_persisted_state( let loader_defs::shim::save_restore::SavedState { partition_memory, partition_mmio, - cpus_with_mapped_interrupts, + cpus_with_mapped_interrupts_no_io, cpus_with_outstanding_io, } = parsed_protobuf; @@ -765,7 +765,7 @@ fn topology_from_persisted_state( vtl2_mmio, memory_allocation_mode, }, - cpus_with_mapped_interrupts, + cpus_with_mapped_interrupts_no_io, cpus_with_outstanding_io, }) } @@ -857,7 +857,10 @@ impl PartitionInfo { ( persisted_topology.topology, - !persisted_topology.cpus_with_mapped_interrupts.is_empty(), + !(persisted_topology + .cpus_with_mapped_interrupts_no_io + .is_empty() + && persisted_topology.cpus_with_outstanding_io.is_empty()), ) } else { ( diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 5cc58c4c33..71367cfa08 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -627,14 +627,14 @@ impl LoadedVm { }; // Save the persisted state used by the next openhcl_boot. - let nvme_vp_interrupt_state = crate::nvme_manager::save_restore_helpers::cpus_with_interrupts(state + // In the future, this could be extended to include other devices (e.g. MANA). 
+ let nvme_vp_interrupt_state = crate::nvme_manager::save_restore_helpers::nvme_interrupt_state(state .init_state .nvme_state.as_ref().map(|n| &n.nvme_state)); crate::loader::vtl2_config::write_persisted_info( self.runtime_params.parsed_openhcl_boot(), - nvme_vp_interrupt_state.vps_with_mapped_interrupts, - nvme_vp_interrupt_state.vps_with_outstanding_io, + nvme_vp_interrupt_state, ) .context("failed to write persisted info")?; diff --git a/openhcl/underhill_core/src/loader/vtl2_config/mod.rs b/openhcl/underhill_core/src/loader/vtl2_config/mod.rs index ea75a93f4c..899c5ac888 100644 --- a/openhcl/underhill_core/src/loader/vtl2_config/mod.rs +++ b/openhcl/underhill_core/src/loader/vtl2_config/mod.rs @@ -203,8 +203,7 @@ impl Drop for Vtl2ParamsMap<'_> { /// Write persisted info into the bootshim described persisted region. pub fn write_persisted_info( parsed: &ParsedBootDtInfo, - cpus_with_mapped_interrupts: Vec, - cpus_with_outstanding_io: Vec, + interrupt_state: crate::nvme_manager::save_restore_helpers::VPInterruptState, ) -> anyhow::Result<()> { use loader_defs::shim::PersistedStateHeader; use loader_defs::shim::save_restore::MemoryEntry; @@ -249,8 +248,8 @@ pub fn write_persisted_info( bootloader_fdt_parser::AddressRange::Memory(_) => None, }) .collect(), - cpus_with_mapped_interrupts, - cpus_with_outstanding_io, + cpus_with_mapped_interrupts_no_io: interrupt_state.vps_with_mapped_interrupts_no_io, + cpus_with_outstanding_io: interrupt_state.vps_with_outstanding_io, }; let protobuf = mesh_protobuf::encode(state); diff --git a/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs index 266746db4d..6646cc8dbc 100644 --- a/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs +++ b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs @@ -11,11 +11,11 @@ use std::collections::btree_map::Entry; /// disable sidecar for VMs with active device interrupts. pub struct VPInterruptState { /// List of vCPUs with any mapped device interrupts, sorted by CPU ID. - pub vps_with_mapped_interrupts: Vec, + /// This excludes vCPUs that also had outstanding I/O at the time of save, + /// which are counted in `vps_with_outstanding_io`. + pub vps_with_mapped_interrupts_no_io: Vec, /// List of vCPUs with outstanding I/O at the time of save, sorted by CPU ID. - /// It is expected that this is a subset of `vps_with_mapped_interrupts`, since - /// only some queues will have in-flight I/O. pub vps_with_outstanding_io: Vec, } @@ -23,7 +23,7 @@ pub struct VPInterruptState { /// and which had outstanding I/O at the time of save. /// /// See [`VPInterruptState`] for more details. 
-pub fn cpus_with_interrupts(state: Option<&NvmeManagerSavedState>) -> VPInterruptState { +pub fn nvme_interrupt_state(state: Option<&NvmeManagerSavedState>) -> VPInterruptState { let mut vp_state = BTreeMap::new(); if let Some(state) = state { @@ -41,15 +41,19 @@ pub fn cpus_with_interrupts(state: Option<&NvmeManagerSavedState>) -> VPInterrup } } + let (vps_with_outstanding_io, vps_with_mapped_interrupts_no_io): (Vec<_>, Vec<_>) = vp_state + .iter() + .map(|(&vp, &has_outstanding_io)| (vp, has_outstanding_io)) + .partition(|&(_, has_outstanding_io)| has_outstanding_io); + VPInterruptState { - vps_with_mapped_interrupts: vp_state.keys().cloned().collect(), - vps_with_outstanding_io: vp_state - .iter() - .filter_map( - |(&vp, &has_outstanding_io)| { - if has_outstanding_io { Some(vp) } else { None } - }, - ) + vps_with_mapped_interrupts_no_io: vps_with_mapped_interrupts_no_io + .into_iter() + .map(|(vp, _)| vp) + .collect(), + vps_with_outstanding_io: vps_with_outstanding_io + .into_iter() + .map(|(vp, _)| vp) .collect(), } } @@ -69,8 +73,8 @@ mod tests { #[test] fn returns_empty_when_state_absent() { - let result = cpus_with_interrupts(None); - assert!(result.vps_with_mapped_interrupts.is_empty()); + let result = nvme_interrupt_state(None); + assert!(result.vps_with_mapped_interrupts_no_io.is_empty()); assert!(result.vps_with_outstanding_io.is_empty()); } @@ -79,11 +83,12 @@ mod tests { let state = build_state(vec![ vec![QueueSpec::new(2, false), QueueSpec::new(1, true)], vec![QueueSpec::new(1, false), QueueSpec::new(3, true)], + vec![QueueSpec::new(5, false), QueueSpec::new(2, false)], ]); - let result = cpus_with_interrupts(Some(&state)); + let result = nvme_interrupt_state(Some(&state)); - assert_eq!(result.vps_with_mapped_interrupts, vec![1, 2, 3]); + assert_eq!(result.vps_with_mapped_interrupts_no_io, vec![2, 5]); assert_eq!(result.vps_with_outstanding_io, vec![1, 3]); } @@ -94,9 +99,12 @@ mod tests { QueueSpec::new(4, true), ]]); - let result = cpus_with_interrupts(Some(&state)); + let result = nvme_interrupt_state(Some(&state)); - assert_eq!(result.vps_with_mapped_interrupts, vec![4]); + assert_eq!( + result.vps_with_mapped_interrupts_no_io, + Vec::::from_iter([]) + ); assert_eq!(result.vps_with_outstanding_io, vec![4]); } @@ -107,9 +115,9 @@ mod tests { nvme_disks: Vec::new(), }; - let result = cpus_with_interrupts(Some(&state)); + let result = nvme_interrupt_state(Some(&state)); - assert!(result.vps_with_mapped_interrupts.is_empty()); + assert!(result.vps_with_mapped_interrupts_no_io.is_empty()); assert!(result.vps_with_outstanding_io.is_empty()); } diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 11bc7b99cd..1adaec6b6a 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -1393,7 +1393,7 @@ impl InspectTask for DriverWorkerTask { } /// Save/restore data structures exposed by the NVMe driver. -#[allow(missing_docs)] +#[expect(missing_docs)] pub mod save_restore { use super::*; diff --git a/vm/loader/loader_defs/src/shim.rs b/vm/loader/loader_defs/src/shim.rs index 37f5fbe3b8..ae36ab8915 100644 --- a/vm/loader/loader_defs/src/shim.rs +++ b/vm/loader/loader_defs/src/shim.rs @@ -335,24 +335,17 @@ pub mod save_restore { /// The mmio entries describing mmio for the whole partition. #[mesh(2)] pub partition_mmio: Vec, - /// The list of CPUs with mapped device interrupts present at save time. 
- /// While this list is today used as a semaphore (either there are device - /// interrupts mapped or not), in the future it may be used to provide more - /// granular restore hints. E.g., only start the CPUs with active - /// interrupts right away and defer other CPU startup until later. + /// The list of CPUs with mapped device interrupts present at save time + /// that do not have outstanding IO (those CPUs are counted in + /// `cpus_with_outstanding_io`). /// /// DEFAULT: For save state from prior versions, this will be empty. /// This is fine: the restore heuristics might be less optimal, but will /// still be functionally correct. #[mesh(3)] - pub cpus_with_mapped_interrupts: Vec, + pub cpus_with_mapped_interrupts_no_io: Vec, /// The list of CPUs with mapped device interrupts present at save time, - /// and that have outstanding IO on that CPU. - /// - /// While this list is today used as a semaphore (either there are device - /// interrupts mapped or not), in the future it may be used to provide more - /// granular restore hints. E.g., only start the CPUs with active - /// interrupts right away and defer other CPU startup until later. + /// and that also have outstanding IO on that CPU. /// /// DEFAULT: For save state from prior versions, this will be empty. /// This is fine: the restore heuristics might be less optimal, but will From 5b214fb420e3a496719e57a58a1aeca3b576e4c0 Mon Sep 17 00:00:00 2001 From: "Matt LaFayette (Kurjanowicz)" Date: Wed, 3 Dec 2025 20:23:32 +0000 Subject: [PATCH 5/5] refactor: update NVMe driver state references to use save_restore module --- .../src/loader/vtl2_config/mod.rs | 12 +++++++--- .../underhill_core/src/nvme_manager/device.rs | 2 +- .../src/nvme_manager/manager.rs | 4 ++-- .../underhill_core/src/nvme_manager/mod.rs | 4 ++-- .../src/nvme_manager/save_restore.rs | 2 +- .../src/nvme_manager/save_restore_helpers.rs | 24 +++++++++---------- .../disk_nvme/nvme_driver/src/driver.rs | 2 +- .../storage/disk_nvme/nvme_driver/src/lib.rs | 2 -- 8 files changed, 27 insertions(+), 25 deletions(-) diff --git a/openhcl/underhill_core/src/loader/vtl2_config/mod.rs b/openhcl/underhill_core/src/loader/vtl2_config/mod.rs index 899c5ac888..e981688358 100644 --- a/openhcl/underhill_core/src/loader/vtl2_config/mod.rs +++ b/openhcl/underhill_core/src/loader/vtl2_config/mod.rs @@ -8,6 +8,7 @@ //! runtime, unlike measured config. Parameters provided by openhcl_boot are //! expected to be already validated by the bootloader. +use crate::nvme_manager::save_restore_helpers::VPInterruptState; use anyhow::Context; use bootloader_fdt_parser::IsolationType; use bootloader_fdt_parser::ParsedBootDtInfo; @@ -203,7 +204,7 @@ impl Drop for Vtl2ParamsMap<'_> { /// Write persisted info into the bootshim described persisted region. pub fn write_persisted_info( parsed: &ParsedBootDtInfo, - interrupt_state: crate::nvme_manager::save_restore_helpers::VPInterruptState, + interrupt_state: VPInterruptState, ) -> anyhow::Result<()> { use loader_defs::shim::PersistedStateHeader; use loader_defs::shim::save_restore::MemoryEntry; @@ -219,6 +220,11 @@ pub fn write_persisted_info( let mapping = Vtl2ParamsMap::new_writeable(&ranges).context("failed to map persisted protobuf region")?; + let VPInterruptState { + vps_with_mapped_interrupts_no_io: cpus_with_mapped_interrupts_no_io, + vps_with_outstanding_io: cpus_with_outstanding_io, + } = interrupt_state; + // Create the serialized data to write. 
let state = SavedState { partition_memory: parsed @@ -248,8 +254,8 @@ pub fn write_persisted_info( bootloader_fdt_parser::AddressRange::Memory(_) => None, }) .collect(), - cpus_with_mapped_interrupts_no_io: interrupt_state.vps_with_mapped_interrupts_no_io, - cpus_with_outstanding_io: interrupt_state.vps_with_outstanding_io, + cpus_with_mapped_interrupts_no_io, + cpus_with_outstanding_io, }; let protobuf = mesh_protobuf::encode(state); diff --git a/openhcl/underhill_core/src/nvme_manager/device.rs b/openhcl/underhill_core/src/nvme_manager/device.rs index b3cf60fb5a..bdb1c4ad56 100644 --- a/openhcl/underhill_core/src/nvme_manager/device.rs +++ b/openhcl/underhill_core/src/nvme_manager/device.rs @@ -13,7 +13,7 @@ use inspect::Inspect; use mesh::rpc::Rpc; use mesh::rpc::RpcError; use mesh::rpc::RpcSend; -use nvme_driver::NvmeDriverSavedState; +use nvme_driver::save_restore::NvmeDriverSavedState; use openhcl_dma_manager::AllocationVisibility; use openhcl_dma_manager::DmaClientParameters; use openhcl_dma_manager::DmaClientSpawner; diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 6a8cea13a7..ddd34125ac 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -540,7 +540,7 @@ pub mod save_restore { #[mesh(1)] pub pci_id: String, #[mesh(2)] - pub driver_state: nvme_driver::NvmeDriverSavedState, + pub driver_state: nvme_driver::save_restore::NvmeDriverSavedState, } } @@ -553,7 +553,7 @@ mod tests { use inspect::Inspect; use inspect::InspectionBuilder; use nvme_driver::Namespace; - use nvme_driver::NvmeDriverSavedState; + use nvme_driver::save_restore::NvmeDriverSavedState; use pal_async::DefaultDriver; use pal_async::async_test; use std::sync::atomic::AtomicU32; diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 244f91fff1..c16c34e139 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -96,7 +96,7 @@ pub trait NvmeDevice: Inspect + Send + Sync { &mut self, nsid: u32, ) -> Result, nvme_driver::NamespaceError>; - async fn save(&mut self) -> anyhow::Result; + async fn save(&mut self) -> anyhow::Result; async fn shutdown(mut self: Box); fn update_servicing_flags(&mut self, keep_alive: bool); } @@ -109,6 +109,6 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { pci_id: &str, vp_count: u32, save_restore_supported: bool, - saved_state: Option<&nvme_driver::NvmeDriverSavedState>, + saved_state: Option<&nvme_driver::save_restore::NvmeDriverSavedState>, ) -> Result, NvmeSpawnerError>; } diff --git a/openhcl/underhill_core/src/nvme_manager/save_restore.rs b/openhcl/underhill_core/src/nvme_manager/save_restore.rs index f66bbd3166..3f621638bc 100644 --- a/openhcl/underhill_core/src/nvme_manager/save_restore.rs +++ b/openhcl/underhill_core/src/nvme_manager/save_restore.rs @@ -19,5 +19,5 @@ pub struct NvmeSavedDiskConfig { #[mesh(1)] pub pci_id: String, #[mesh(2)] - pub driver_state: nvme_driver::NvmeDriverSavedState, + pub driver_state: nvme_driver::save_restore::NvmeDriverSavedState, } diff --git a/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs index 6646cc8dbc..ba6c3b2849 100644 --- a/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs +++ b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs @@ -41,20 +41,19 @@ pub fn 
nvme_interrupt_state(state: Option<&NvmeManagerSavedState>) -> VPInterrup } } - let (vps_with_outstanding_io, vps_with_mapped_interrupts_no_io): (Vec<_>, Vec<_>) = vp_state + let (vps_with_outstanding_io, vps_with_mapped_interrupts_no_io): ( + BTreeMap, + BTreeMap, + ) = vp_state .iter() - .map(|(&vp, &has_outstanding_io)| (vp, has_outstanding_io)) - .partition(|&(_, has_outstanding_io)| has_outstanding_io); + .partition(|&(_, has_outstanding_io)| *has_outstanding_io); VPInterruptState { vps_with_mapped_interrupts_no_io: vps_with_mapped_interrupts_no_io - .into_iter() - .map(|(vp, _)| vp) - .collect(), - vps_with_outstanding_io: vps_with_outstanding_io - .into_iter() - .map(|(vp, _)| vp) + .keys() + .cloned() .collect(), + vps_with_outstanding_io: vps_with_outstanding_io.keys().cloned().collect(), } } @@ -62,11 +61,10 @@ pub fn nvme_interrupt_state(state: Option<&NvmeManagerSavedState>) -> VPInterrup mod tests { use super::*; use crate::nvme_manager::save_restore::{NvmeManagerSavedState, NvmeSavedDiskConfig}; - use nvme_driver::NvmeDriverSavedState; use nvme_driver::save_restore::{ - CompletionQueueSavedState, IoQueueSavedState, NvmeDriverWorkerSavedState, - PendingCommandSavedState, PendingCommandsSavedState, QueueHandlerSavedState, - QueuePairSavedState, SubmissionQueueSavedState, + CompletionQueueSavedState, IoQueueSavedState, NvmeDriverSavedState, + NvmeDriverWorkerSavedState, PendingCommandSavedState, PendingCommandsSavedState, + QueueHandlerSavedState, QueuePairSavedState, SubmissionQueueSavedState, }; use nvme_spec as spec; use zerocopy::FromZeros; diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 5115958b74..eee78bbe13 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -7,7 +7,6 @@ use super::spec; use crate::NVME_PAGE_SHIFT; use crate::Namespace; use crate::NamespaceError; -use crate::NvmeDriverSavedState; use crate::RequestError; use crate::driver::save_restore::IoQueueSavedState; use crate::queue_pair::AdminAerHandler; @@ -19,6 +18,7 @@ use crate::queue_pair::QueuePair; use crate::queue_pair::admin_cmd; use crate::registers::Bar0; use crate::registers::DeviceRegisters; +use crate::save_restore::NvmeDriverSavedState; use anyhow::Context as _; use futures::StreamExt; use futures::future::join_all; diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs index 6662417313..b46d3f7055 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs @@ -15,8 +15,6 @@ mod tests; pub use self::driver::NvmeDriver; pub use self::driver::save_restore; -pub use self::driver::save_restore::Error; -pub use self::driver::save_restore::NvmeDriverSavedState; pub use self::namespace::Namespace; pub use self::namespace::NamespaceError; pub use self::queue_pair::RequestError;
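
Note for reviewers: below is a minimal, self-contained sketch of the per-CPU partitioning rule that `nvme_interrupt_state` implements by the end of this series. The types are simplified stand-ins (plain `(cpu, has_pending_io)` tuples rather than the real NVMe saved-state structures); only the aggregation and ordering behavior is meant to mirror the helper.

use std::collections::BTreeMap;

/// Each entry is one saved I/O queue: (cpu, queue_has_pending_commands).
fn partition_cpus(queues: &[(u32, bool)]) -> (Vec<u32>, Vec<u32>) {
    // Fold per-queue data into a per-CPU "has outstanding I/O" flag: a CPU has
    // outstanding I/O if any of its queues had pending commands at save time.
    let mut per_cpu = BTreeMap::new();
    for &(cpu, pending) in queues {
        *per_cpu.entry(cpu).or_insert(false) |= pending;
    }

    // Split into the two disjoint, sorted lists that get persisted for the next
    // boot: CPUs with outstanding I/O, and CPUs with mapped interrupts only.
    let (with_io, no_io): (Vec<_>, Vec<_>) =
        per_cpu.into_iter().partition(|&(_, pending)| pending);

    (
        with_io.into_iter().map(|(cpu, _)| cpu).collect(),
        no_io.into_iter().map(|(cpu, _)| cpu).collect(),
    )
}

fn main() {
    // CPU 1 has an idle queue and a queue with in-flight commands; CPU 2 is idle.
    let queues = [(2, false), (1, true), (1, false), (3, true)];
    let (with_io, no_io) = partition_cpus(&queues);
    assert_eq!(with_io, vec![1, 3]);
    assert_eq!(no_io, vec![2]);
}

As in the helper, a CPU lands in the outstanding-I/O list if any of its queues had pending commands, and the two output lists are disjoint and sorted because they are derived from a BTreeMap keyed by CPU ID.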