diff --git a/Cargo.lock b/Cargo.lock
index 496372549b..795cdce3ee 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7979,6 +7979,7 @@ dependencies = [
  "netvsp",
  "nvme_driver",
  "nvme_resources",
+ "nvme_spec",
  "openhcl_attestation_protocol",
  "openhcl_dma_manager",
  "pal",
diff --git a/openhcl/openhcl_boot/src/host_params/dt/mod.rs b/openhcl/openhcl_boot/src/host_params/dt/mod.rs
index 62aa2669e5..8a08b2b5bf 100644
--- a/openhcl/openhcl_boot/src/host_params/dt/mod.rs
+++ b/openhcl/openhcl_boot/src/host_params/dt/mod.rs
@@ -378,7 +378,8 @@ struct PartitionTopology {
 #[derive(Debug, PartialEq, Eq)]
 struct PersistedPartitionTopology {
     topology: PartitionTopology,
-    cpus_with_mapped_interrupts: Vec<u32>,
+    cpus_with_mapped_interrupts_no_io: Vec<u32>,
+    cpus_with_outstanding_io: Vec<u32>,
 }
 
 // Calculate the default mmio size for VTL2 when not specified by the host.
@@ -614,7 +615,8 @@ fn topology_from_persisted_state(
     let loader_defs::shim::save_restore::SavedState {
         partition_memory,
         partition_mmio,
-        cpus_with_mapped_interrupts,
+        cpus_with_mapped_interrupts_no_io,
+        cpus_with_outstanding_io,
     } = parsed_protobuf;
 
     // FUTURE: should memory allocation mode should persist in saved state and
@@ -763,7 +765,8 @@
             vtl2_mmio,
             memory_allocation_mode,
         },
-        cpus_with_mapped_interrupts,
+        cpus_with_mapped_interrupts_no_io,
+        cpus_with_outstanding_io,
     })
 }
 
@@ -854,7 +857,10 @@ impl PartitionInfo {
 
             (
                 persisted_topology.topology,
-                !persisted_topology.cpus_with_mapped_interrupts.is_empty(),
+                !(persisted_topology
+                    .cpus_with_mapped_interrupts_no_io
+                    .is_empty()
+                    && persisted_topology.cpus_with_outstanding_io.is_empty()),
             )
         } else {
             (
diff --git a/openhcl/underhill_core/Cargo.toml b/openhcl/underhill_core/Cargo.toml
index 2cd7cf94f5..c68b2f5b3c 100644
--- a/openhcl/underhill_core/Cargo.toml
+++ b/openhcl/underhill_core/Cargo.toml
@@ -179,6 +179,7 @@ firmware_pcat.workspace = true
 [dev-dependencies]
 guest_emulation_transport = { workspace = true, features = ["test_utilities"] }
 test_with_tracing.workspace = true
+nvme_spec.workspace = true
 
 [build-dependencies]
 build_rs_guest_arch.workspace = true
diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs
index 2f6c344497..75d9401ca6 100644
--- a/openhcl/underhill_core/src/dispatch/mod.rs
+++ b/openhcl/underhill_core/src/dispatch/mod.rs
@@ -650,16 +650,15 @@ impl LoadedVm {
         };
 
         // Save the persisted state used by the next openhcl_boot.
-        let cpus_with_mapped_interrupts = match state.init_state.nvme_state.as_ref() {
-            Some(nvme_state) => {
-                crate::nvme_manager::save_restore::cpus_with_interrupts(&nvme_state.nvme_state)
-            }
-            None => vec![],
-        };
+        // In the future, this could be extended to include other devices (e.g. MANA).
+        let nvme_vp_interrupt_state =
+            crate::nvme_manager::save_restore_helpers::nvme_interrupt_state(
+                state.init_state.nvme_state.as_ref().map(|n| &n.nvme_state),
+            );
 
         crate::loader::vtl2_config::write_persisted_info(
             self.runtime_params.parsed_openhcl_boot(),
-            cpus_with_mapped_interrupts,
+            nvme_vp_interrupt_state,
         )
         .context("failed to write persisted info")?;
 
diff --git a/openhcl/underhill_core/src/loader/vtl2_config/mod.rs b/openhcl/underhill_core/src/loader/vtl2_config/mod.rs
index 1e0df0c048..e981688358 100644
--- a/openhcl/underhill_core/src/loader/vtl2_config/mod.rs
+++ b/openhcl/underhill_core/src/loader/vtl2_config/mod.rs
@@ -8,6 +8,7 @@
 //! runtime, unlike measured config. Parameters provided by openhcl_boot are
 //! expected to be already validated by the bootloader.
 
+use crate::nvme_manager::save_restore_helpers::VPInterruptState;
 use anyhow::Context;
 use bootloader_fdt_parser::IsolationType;
 use bootloader_fdt_parser::ParsedBootDtInfo;
@@ -203,7 +204,7 @@ impl Drop for Vtl2ParamsMap<'_> {
 /// Write persisted info into the bootshim described persisted region.
 pub fn write_persisted_info(
     parsed: &ParsedBootDtInfo,
-    cpus_with_mapped_interrupts: Vec<u32>,
+    interrupt_state: VPInterruptState,
 ) -> anyhow::Result<()> {
     use loader_defs::shim::PersistedStateHeader;
     use loader_defs::shim::save_restore::MemoryEntry;
@@ -219,6 +220,11 @@ pub fn write_persisted_info(
     let mapping = Vtl2ParamsMap::new_writeable(&ranges)
         .context("failed to map persisted protobuf region")?;
 
+    let VPInterruptState {
+        vps_with_mapped_interrupts_no_io: cpus_with_mapped_interrupts_no_io,
+        vps_with_outstanding_io: cpus_with_outstanding_io,
+    } = interrupt_state;
+
     // Create the serialized data to write.
     let state = SavedState {
         partition_memory: parsed
@@ -248,7 +254,8 @@ pub fn write_persisted_info(
                 bootloader_fdt_parser::AddressRange::Memory(_) => None,
             })
             .collect(),
-        cpus_with_mapped_interrupts,
+        cpus_with_mapped_interrupts_no_io,
+        cpus_with_outstanding_io,
     };
 
     let protobuf = mesh_protobuf::encode(state);
diff --git a/openhcl/underhill_core/src/nvme_manager/device.rs b/openhcl/underhill_core/src/nvme_manager/device.rs
index b3cf60fb5a..bdb1c4ad56 100644
--- a/openhcl/underhill_core/src/nvme_manager/device.rs
+++ b/openhcl/underhill_core/src/nvme_manager/device.rs
@@ -13,7 +13,7 @@ use inspect::Inspect;
 use mesh::rpc::Rpc;
 use mesh::rpc::RpcError;
 use mesh::rpc::RpcSend;
-use nvme_driver::NvmeDriverSavedState;
+use nvme_driver::save_restore::NvmeDriverSavedState;
 use openhcl_dma_manager::AllocationVisibility;
 use openhcl_dma_manager::DmaClientParameters;
 use openhcl_dma_manager::DmaClientSpawner;
diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs
index 6a8cea13a7..ddd34125ac 100644
--- a/openhcl/underhill_core/src/nvme_manager/manager.rs
+++ b/openhcl/underhill_core/src/nvme_manager/manager.rs
@@ -540,7 +540,7 @@ pub mod save_restore {
         #[mesh(1)]
         pub pci_id: String,
         #[mesh(2)]
-        pub driver_state: nvme_driver::NvmeDriverSavedState,
+        pub driver_state: nvme_driver::save_restore::NvmeDriverSavedState,
     }
 }
 
@@ -553,7 +553,7 @@ mod tests {
     use inspect::Inspect;
     use inspect::InspectionBuilder;
     use nvme_driver::Namespace;
-    use nvme_driver::NvmeDriverSavedState;
+    use nvme_driver::save_restore::NvmeDriverSavedState;
     use pal_async::DefaultDriver;
     use pal_async::async_test;
     use std::sync::atomic::AtomicU32;
diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs
index 83b3b9e49d..c16c34e139 100644
--- a/openhcl/underhill_core/src/nvme_manager/mod.rs
+++ b/openhcl/underhill_core/src/nvme_manager/mod.rs
@@ -58,6 +58,7 @@ use vmcore::vm_task::VmTaskDriverSource;
 pub mod device;
 pub mod manager;
 pub mod save_restore;
+pub mod save_restore_helpers;
 
 #[derive(Debug, Error)]
 #[error("nvme device {pci_id} error")]
@@ -95,7 +96,7 @@ pub trait NvmeDevice: Inspect + Send + Sync {
         &mut self,
         nsid: u32,
     ) -> Result<Arc<Namespace>, nvme_driver::NamespaceError>;
-    async fn save(&mut self) -> anyhow::Result<nvme_driver::NvmeDriverSavedState>;
+    async fn save(&mut self) -> anyhow::Result<nvme_driver::save_restore::NvmeDriverSavedState>;
     async fn shutdown(mut self: Box<Self>);
     fn update_servicing_flags(&mut self, keep_alive: bool);
 }
@@ -108,6 +109,6 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync {
         pci_id: &str,
         vp_count: u32,
         save_restore_supported: bool,
-        saved_state: Option<&nvme_driver::NvmeDriverSavedState>,
+        saved_state: Option<&nvme_driver::save_restore::NvmeDriverSavedState>,
     ) -> Result<Box<dyn NvmeDevice>, NvmeSpawnerError>;
 }
diff --git a/openhcl/underhill_core/src/nvme_manager/save_restore.rs b/openhcl/underhill_core/src/nvme_manager/save_restore.rs
index eb0a05d680..3f621638bc 100644
--- a/openhcl/underhill_core/src/nvme_manager/save_restore.rs
+++ b/openhcl/underhill_core/src/nvme_manager/save_restore.rs
@@ -2,7 +2,6 @@
 // Licensed under the MIT License.
 
 use mesh::payload::Protobuf;
-use std::collections::BTreeSet;
 use vmcore::save_restore::SavedStateRoot;
 
 #[derive(Protobuf, SavedStateRoot)]
@@ -20,17 +19,5 @@ pub struct NvmeSavedDiskConfig {
     #[mesh(1)]
     pub pci_id: String,
     #[mesh(2)]
-    pub driver_state: nvme_driver::NvmeDriverSavedState,
-}
-
-/// Returns a sorted list of CPU IDs that have mapped device interrupts in the saved NVMe state.
-///
-/// This information is used to make heuristic decisions during restore, such as whether to
-/// disable sidecar for VMs with active device interrupts.
-pub fn cpus_with_interrupts(state: &NvmeManagerSavedState) -> Vec<u32> {
-    let mut cpus_with_interrupts = BTreeSet::new();
-    for disk in &state.nvme_disks {
-        cpus_with_interrupts.extend(disk.driver_state.worker_data.io.iter().map(|q| q.cpu));
-    }
-    cpus_with_interrupts.into_iter().collect()
+    pub driver_state: nvme_driver::save_restore::NvmeDriverSavedState,
 }
diff --git a/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs
new file mode 100644
index 0000000000..ba6c3b2849
--- /dev/null
+++ b/openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs
@@ -0,0 +1,218 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+use crate::nvme_manager::save_restore::NvmeManagerSavedState;
+use std::collections::BTreeMap;
+use std::collections::btree_map::Entry;
+
+/// Useful state about how the VM's vCPUs interacted with NVMe device interrupts at the time of save.
+///
+/// This information is used to make heuristic decisions during restore, such as whether to
+/// disable sidecar for VMs with active device interrupts.
+pub struct VPInterruptState {
+    /// List of vCPUs with any mapped device interrupts, sorted by CPU ID.
+    /// This excludes vCPUs that also had outstanding I/O at the time of save,
+    /// which are counted in `vps_with_outstanding_io`.
+    pub vps_with_mapped_interrupts_no_io: Vec<u32>,
+
+    /// List of vCPUs with outstanding I/O at the time of save, sorted by CPU ID.
+    pub vps_with_outstanding_io: Vec<u32>,
+}
+
+/// Analyzes the saved NVMe manager state to determine which vCPUs had mapped device interrupts
+/// and which had outstanding I/O at the time of save.
+///
+/// See [`VPInterruptState`] for more details.
+pub fn nvme_interrupt_state(state: Option<&NvmeManagerSavedState>) -> VPInterruptState {
+    let mut vp_state = BTreeMap::new();
+
+    if let Some(state) = state {
+        for disk in &state.nvme_disks {
+            for q in &disk.driver_state.worker_data.io {
+                match vp_state.entry(q.cpu) {
+                    Entry::Vacant(e) => {
+                        e.insert(!q.queue_data.handler_data.pending_cmds.commands.is_empty());
+                    }
+                    Entry::Occupied(mut e) => {
+                        *e.get_mut() |= !q.queue_data.handler_data.pending_cmds.commands.is_empty();
+                    }
+                }
+            }
+        }
+    }
+
+    let (vps_with_outstanding_io, vps_with_mapped_interrupts_no_io): (
+        BTreeMap<u32, bool>,
+        BTreeMap<u32, bool>,
+    ) = vp_state
+        .iter()
+        .partition(|&(_, has_outstanding_io)| *has_outstanding_io);
+
+    VPInterruptState {
+        vps_with_mapped_interrupts_no_io: vps_with_mapped_interrupts_no_io
+            .keys()
+            .cloned()
+            .collect(),
+        vps_with_outstanding_io: vps_with_outstanding_io.keys().cloned().collect(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::nvme_manager::save_restore::{NvmeManagerSavedState, NvmeSavedDiskConfig};
+    use nvme_driver::save_restore::{
+        CompletionQueueSavedState, IoQueueSavedState, NvmeDriverSavedState,
+        NvmeDriverWorkerSavedState, PendingCommandSavedState, PendingCommandsSavedState,
+        QueueHandlerSavedState, QueuePairSavedState, SubmissionQueueSavedState,
+    };
+    use nvme_spec as spec;
+    use zerocopy::FromZeros;
+
+    #[test]
+    fn returns_empty_when_state_absent() {
+        let result = nvme_interrupt_state(None);
+        assert!(result.vps_with_mapped_interrupts_no_io.is_empty());
+        assert!(result.vps_with_outstanding_io.is_empty());
+    }
+
+    #[test]
+    fn collects_unique_sorted_vps_and_outstanding_subset() {
+        let state = build_state(vec![
+            vec![QueueSpec::new(2, false), QueueSpec::new(1, true)],
+            vec![QueueSpec::new(1, false), QueueSpec::new(3, true)],
+            vec![QueueSpec::new(5, false), QueueSpec::new(2, false)],
+        ]);
+
+        let result = nvme_interrupt_state(Some(&state));
+
+        assert_eq!(result.vps_with_mapped_interrupts_no_io, vec![2, 5]);
+        assert_eq!(result.vps_with_outstanding_io, vec![1, 3]);
+    }
+
+    #[test]
+    fn reports_outstanding_if_any_queue_pending_for_vp() {
+        let state = build_state(vec![vec![
+            QueueSpec::new(4, false),
+            QueueSpec::new(4, true),
+        ]]);
+
+        let result = nvme_interrupt_state(Some(&state));
+
+        assert_eq!(
+            result.vps_with_mapped_interrupts_no_io,
+            Vec::<u32>::from_iter([])
+        );
+        assert_eq!(result.vps_with_outstanding_io, vec![4]);
+    }
+
+    #[test]
+    fn handles_state_with_no_disks() {
+        let state = NvmeManagerSavedState {
+            cpu_count: 0,
+            nvme_disks: Vec::new(),
+        };
+
+        let result = nvme_interrupt_state(Some(&state));
+
+        assert!(result.vps_with_mapped_interrupts_no_io.is_empty());
+        assert!(result.vps_with_outstanding_io.is_empty());
+    }
+
+    struct QueueSpec {
+        cpu: u32,
+        has_outstanding_io: bool,
+    }
+
+    impl QueueSpec {
+        const fn new(cpu: u32, has_outstanding_io: bool) -> Self {
+            Self {
+                cpu,
+                has_outstanding_io,
+            }
+        }
+    }
+
+    // Helper to fabricate NVMe manager save-state snapshots with specific CPU/IO mappings.
+    fn build_state(disk_queue_specs: Vec<Vec<QueueSpec>>) -> NvmeManagerSavedState {
+        NvmeManagerSavedState {
+            cpu_count: 0, // Not relevant for these tests.
+            nvme_disks: disk_queue_specs
+                .into_iter()
+                .enumerate()
+                .map(|(disk_index, queues)| NvmeSavedDiskConfig {
+                    pci_id: format!("0000:{disk_index:02x}.0"),
+                    driver_state: NvmeDriverSavedState {
+                        identify_ctrl: spec::IdentifyController::new_zeroed(),
+                        device_id: format!("disk{disk_index}"),
+                        namespaces: Vec::new(),
+                        worker_data: NvmeDriverWorkerSavedState {
+                            admin: None,
+                            io: queues
+                                .into_iter()
+                                .enumerate()
+                                .map(|(queue_index, spec)| {
+                                    // Tests only care about per-disk affinity, so queue IDs can
+                                    // restart from zero for each disk without losing coverage.
+                                    build_io_queue(
+                                        queue_index as u16,
+                                        spec.cpu,
+                                        spec.has_outstanding_io,
+                                    )
+                                })
+                                .collect(),
+                            qsize: 0,
+                            max_io_queues: 0,
+                        },
+                    },
+                })
+                .collect(),
+        }
+    }
+
+    fn build_io_queue(qid: u16, cpu: u32, outstanding: bool) -> IoQueueSavedState {
+        IoQueueSavedState {
+            cpu,
+            iv: qid as u32,
+            queue_data: QueuePairSavedState {
+                mem_len: 0,
+                base_pfn: 0,
+                qid,
+                sq_entries: 1,
+                cq_entries: 1,
+                handler_data: QueueHandlerSavedState {
+                    sq_state: SubmissionQueueSavedState {
+                        sqid: qid,
+                        head: 0,
+                        tail: 0,
+                        committed_tail: 0,
+                        len: 1,
+                    },
+                    cq_state: CompletionQueueSavedState {
+                        cqid: qid,
+                        head: 0,
+                        committed_head: 0,
+                        len: 1,
+                        phase: false,
+                    },
+                    pending_cmds: build_pending_cmds(outstanding),
+                    aer_handler: None,
+                },
+            },
+        }
+    }
+
+    fn build_pending_cmds(outstanding: bool) -> PendingCommandsSavedState {
+        PendingCommandsSavedState {
+            commands: if outstanding {
+                vec![PendingCommandSavedState {
+                    command: spec::Command::new_zeroed(),
+                }]
+            } else {
+                Vec::new()
+            },
+            next_cid_high_bits: 0,
+            cid_key_bits: 0,
+        }
+    }
+}
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index 1e4214e8d7..eee78bbe13 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -7,7 +7,6 @@ use super::spec;
 use crate::NVME_PAGE_SHIFT;
 use crate::Namespace;
 use crate::NamespaceError;
-use crate::NvmeDriverSavedState;
 use crate::RequestError;
 use crate::driver::save_restore::IoQueueSavedState;
 use crate::queue_pair::AdminAerHandler;
@@ -19,6 +18,7 @@ use crate::queue_pair::QueuePair;
 use crate::queue_pair::admin_cmd;
 use crate::registers::Bar0;
 use crate::registers::DeviceRegisters;
+use crate::save_restore::NvmeDriverSavedState;
 use anyhow::Context as _;
 use futures::StreamExt;
 use futures::future::join_all;
@@ -1409,6 +1409,8 @@ impl InspectTask for DriverWorkerTask {
     }
 }
 
+/// Save/restore data structures exposed by the NVMe driver.
+#[expect(missing_docs)]
 pub mod save_restore {
     use super::*;
 
@@ -1510,6 +1512,7 @@ pub mod save_restore {
         pub aer_handler: Option<AerHandlerSavedState>,
     }
 
+    /// Snapshot of submission queue metadata captured during save.
     #[derive(Protobuf, Clone, Debug)]
     #[mesh(package = "nvme_driver")]
     pub struct SubmissionQueueSavedState {
@@ -1525,6 +1528,7 @@ pub mod save_restore {
         pub len: u32,
     }
 
+    /// Snapshot of completion queue metadata captured during save.
     #[derive(Protobuf, Clone, Debug)]
     #[mesh(package = "nvme_driver")]
     pub struct CompletionQueueSavedState {
@@ -1541,6 +1545,7 @@ pub mod save_restore {
         pub phase: bool,
     }
 
+    /// Pending command entry captured from a queue handler.
     #[derive(Protobuf, Clone, Debug)]
     #[mesh(package = "nvme_driver")]
     pub struct PendingCommandSavedState {
@@ -1548,6 +1553,7 @@ pub mod save_restore {
         pub command: spec::Command,
     }
 
+    /// Collection of pending commands indexed by CID.
     #[derive(Protobuf, Clone, Debug)]
     #[mesh(package = "nvme_driver")]
     pub struct PendingCommandsSavedState {
@@ -1569,6 +1575,7 @@ pub mod save_restore {
         pub identify_ns: nvme_spec::nvm::IdentifyNamespace,
     }
 
+    /// Saved Async Event Request handler metadata.
     #[derive(Clone, Debug, Protobuf)]
     #[mesh(package = "nvme_driver")]
     pub struct AerHandlerSavedState {
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs
index 3efc12dd39..b46d3f7055 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs
@@ -14,8 +14,7 @@ mod registers;
 mod tests;
 
 pub use self::driver::NvmeDriver;
-pub use self::driver::save_restore::Error;
-pub use self::driver::save_restore::NvmeDriverSavedState;
+pub use self::driver::save_restore;
 pub use self::namespace::Namespace;
 pub use self::namespace::NamespaceError;
 pub use self::queue_pair::RequestError;
diff --git a/vm/loader/loader_defs/src/shim.rs b/vm/loader/loader_defs/src/shim.rs
index 158c686704..ae36ab8915 100644
--- a/vm/loader/loader_defs/src/shim.rs
+++ b/vm/loader/loader_defs/src/shim.rs
@@ -335,16 +335,22 @@ pub mod save_restore {
         /// The mmio entries describing mmio for the whole partition.
         #[mesh(2)]
         pub partition_mmio: Vec<MemoryRange>,
-        /// The list of CPUs with mapped device interrupts present at save time.
-        /// While this list is today used as a semaphore (either there are device
-        /// interrupts mapped or not), in the future it may be used to provide more
-        /// granular restore hints. E.g., only start the CPUs with active
-        /// interrupts right away and defer other CPU startup until later.
+        /// The list of CPUs with mapped device interrupts present at save time
+        /// that do not have outstanding IO (those CPUs are counted in
+        /// `cpus_with_outstanding_io`).
         ///
         /// DEFAULT: For save state from prior versions, this will be empty.
         /// This is fine: the restore heuristics might be less optimal, but will
         /// still be functionally correct.
         #[mesh(3)]
-        pub cpus_with_mapped_interrupts: Vec<u32>,
+        pub cpus_with_mapped_interrupts_no_io: Vec<u32>,
+        /// The list of CPUs with mapped device interrupts present at save time,
+        /// and that also have outstanding IO on that CPU.
+        ///
+        /// DEFAULT: For save state from prior versions, this will be empty.
+        /// This is fine: the restore heuristics might be less optimal, but will
+        /// still be functionally correct.
+        #[mesh(4)]
+        pub cpus_with_outstanding_io: Vec<u32>,
     }
 }
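
Reviewer note (not part of the patch): the classify-then-partition idiom in `nvme_interrupt_state` leans on `BTreeMap`'s standard `Extend<(&K, &V)>` impl for `Copy` key/value types, which is what lets `vp_state.iter().partition(...)` collect borrowed entries into two owned `BTreeMap<u32, bool>`s. A minimal standalone sketch of the same idiom follows; it substitutes `Entry::or_insert` for the patch's explicit `match` on `Entry`, which is equivalent here.

use std::collections::BTreeMap;

fn main() {
    // cpu -> "did any of this CPU's queues have pending commands at save time?"
    let mut vp_state: BTreeMap<u32, bool> = BTreeMap::new();
    for (cpu, pending) in [(2, false), (1, true), (1, false), (3, true), (5, false)] {
        // OR-accumulate so a CPU counts as busy if *any* of its queues was busy.
        *vp_state.entry(cpu).or_insert(false) |= pending;
    }

    // `iter()` yields (&u32, &bool); BTreeMap<u32, bool> can Extend from
    // (&K, &V) when K and V are Copy, so partition() produces owned maps.
    let (busy, idle): (BTreeMap<u32, bool>, BTreeMap<u32, bool>) =
        vp_state.iter().partition(|&(_, io)| *io);

    // BTreeMap iterates in key order, so both lists come out sorted.
    assert_eq!(busy.keys().cloned().collect::<Vec<u32>>(), vec![1, 3]);
    assert_eq!(idle.keys().cloned().collect::<Vec<u32>>(), vec![2, 5]);
}

Because `BTreeMap` iteration is key-ordered, the two key lists need no extra sort step, which is how the `sorted by CPU ID` contract documented on `VPInterruptState` is upheld.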