1 change: 1 addition & 0 deletions Cargo.lock
@@ -7979,6 +7979,7 @@ dependencies = [
  "netvsp",
  "nvme_driver",
  "nvme_resources",
+ "nvme_spec",
  "openhcl_attestation_protocol",
  "openhcl_dma_manager",
  "pal",
14 changes: 10 additions & 4 deletions openhcl/openhcl_boot/src/host_params/dt/mod.rs
@@ -378,7 +378,8 @@ struct PartitionTopology
 #[derive(Debug, PartialEq, Eq)]
 struct PersistedPartitionTopology {
     topology: PartitionTopology,
-    cpus_with_mapped_interrupts: Vec<u32>,
+    cpus_with_mapped_interrupts_no_io: Vec<u32>,
+    cpus_with_outstanding_io: Vec<u32>,
 }

 // Calculate the default mmio size for VTL2 when not specified by the host.
@@ -614,7 +615,8 @@ fn topology_from_persisted_state(
     let loader_defs::shim::save_restore::SavedState {
         partition_memory,
         partition_mmio,
-        cpus_with_mapped_interrupts,
+        cpus_with_mapped_interrupts_no_io,
+        cpus_with_outstanding_io,
     } = parsed_protobuf;

     // FUTURE: should memory allocation mode should persist in saved state and
@@ -763,7 +765,8 @@ fn topology_from_persisted_state(
             vtl2_mmio,
             memory_allocation_mode,
         },
-        cpus_with_mapped_interrupts,
+        cpus_with_mapped_interrupts_no_io,
+        cpus_with_outstanding_io,
     })
 }

@@ -854,7 +857,10 @@ impl PartitionInfo

         (
             persisted_topology.topology,
-            !persisted_topology.cpus_with_mapped_interrupts.is_empty(),
+            !(persisted_topology
+                .cpus_with_mapped_interrupts_no_io
+                .is_empty()
+                && persisted_topology.cpus_with_outstanding_io.is_empty()),
         )
     } else {
         (
1 change: 1 addition & 0 deletions openhcl/underhill_core/Cargo.toml
@@ -179,6 +179,7 @@ firmware_pcat.workspace = true
 [dev-dependencies]
 guest_emulation_transport = { workspace = true, features = ["test_utilities"] }
 test_with_tracing.workspace = true
+nvme_spec.workspace = true

 [build-dependencies]
 build_rs_guest_arch.workspace = true
11 changes: 4 additions & 7 deletions openhcl/underhill_core/src/dispatch/mod.rs
@@ -627,17 +627,14 @@ impl LoadedVm
         };

         // Save the persisted state used by the next openhcl_boot.
-        let cpus_with_mapped_interrupts = match state
+        // In the future, this could be extended to include other devices (e.g. MANA).
+        let nvme_vp_interrupt_state = crate::nvme_manager::save_restore_helpers::nvme_interrupt_state(state
             .init_state
-            .nvme_state
-            .as_ref() {
-            Some(nvme_state) => crate::nvme_manager::save_restore::cpus_with_interrupts(&nvme_state.nvme_state),
-            None => vec![],
-        };
+            .nvme_state.as_ref().map(|n| &n.nvme_state));

         crate::loader::vtl2_config::write_persisted_info(
             self.runtime_params.parsed_openhcl_boot(),
-            cpus_with_mapped_interrupts,
+            nvme_vp_interrupt_state,
         )
         .context("failed to write persisted info")?;

5 changes: 3 additions & 2 deletions openhcl/underhill_core/src/loader/vtl2_config/mod.rs
@@ -203,7 +203,7 @@ impl Drop for Vtl2ParamsMap<'_> {
 /// Write persisted info into the bootshim described persisted region.
 pub fn write_persisted_info(
     parsed: &ParsedBootDtInfo,
-    cpus_with_mapped_interrupts: Vec<u32>,
+    interrupt_state: crate::nvme_manager::save_restore_helpers::VPInterruptState,
Contributor:
nit: consider destructuring the interrupt state so that new fields won't get forgotten here?
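A minimal sketch of that suggestion (hypothetical, not part of the PR; the field names come from the VPInterruptState type this PR adds, so a field added later would fail to compile here instead of being silently dropped):

    // Hypothetical sketch of the nit above: destructure the state so that
    // every field of VPInterruptState must be handled explicitly.
    let crate::nvme_manager::save_restore_helpers::VPInterruptState {
        vps_with_mapped_interrupts_no_io,
        vps_with_outstanding_io,
    } = interrupt_state;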

 ) -> anyhow::Result<()> {
     use loader_defs::shim::PersistedStateHeader;
     use loader_defs::shim::save_restore::MemoryEntry;
@@ -248,7 +248,8 @@ pub fn write_persisted_info(
                 bootloader_fdt_parser::AddressRange::Memory(_) => None,
             })
             .collect(),
-        cpus_with_mapped_interrupts,
+        cpus_with_mapped_interrupts_no_io: interrupt_state.vps_with_mapped_interrupts_no_io,
+        cpus_with_outstanding_io: interrupt_state.vps_with_outstanding_io,
     };

     let protobuf = mesh_protobuf::encode(state);
1 change: 1 addition & 0 deletions openhcl/underhill_core/src/nvme_manager/mod.rs
@@ -58,6 +58,7 @@ use vmcore::vm_task::VmTaskDriverSource;
 pub mod device;
 pub mod manager;
 pub mod save_restore;
+pub mod save_restore_helpers;

 #[derive(Debug, Error)]
 #[error("nvme device {pci_id} error")]
13 changes: 0 additions & 13 deletions openhcl/underhill_core/src/nvme_manager/save_restore.rs
@@ -2,7 +2,6 @@
 // Licensed under the MIT License.

 use mesh::payload::Protobuf;
-use std::collections::BTreeSet;
 use vmcore::save_restore::SavedStateRoot;

 #[derive(Protobuf, SavedStateRoot)]
@@ -22,15 +21,3 @@ pub struct NvmeSavedDiskConfig {
     #[mesh(2)]
     pub driver_state: nvme_driver::NvmeDriverSavedState,
 }
-
-/// Returns a sorted list of CPU IDs that have mapped device interrupts in the saved NVMe state.
-///
-/// This information is used to make heuristic decisions during restore, such as whether to
-/// disable sidecar for VMs with active device interrupts.
-pub fn cpus_with_interrupts(state: &NvmeManagerSavedState) -> Vec<u32> {
-    let mut cpus_with_interrupts = BTreeSet::new();
-    for disk in &state.nvme_disks {
-        cpus_with_interrupts.extend(disk.driver_state.worker_data.io.iter().map(|q| q.cpu));
-    }
-    cpus_with_interrupts.into_iter().collect()
-}
220 changes: 220 additions & 0 deletions openhcl/underhill_core/src/nvme_manager/save_restore_helpers.rs
@@ -0,0 +1,220 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

use crate::nvme_manager::save_restore::NvmeManagerSavedState;
use std::collections::BTreeMap;
use std::collections::btree_map::Entry;

/// Useful state about how the VM's vCPUs interacted with NVMe device interrupts at the time of save.
///
/// This information is used to make heuristic decisions during restore, such as whether to
/// disable sidecar for VMs with active device interrupts.
pub struct VPInterruptState {
    /// List of vCPUs with any mapped device interrupts, sorted by CPU ID.
    /// This excludes vCPUs that also had outstanding I/O at the time of save,
    /// which are counted in `vps_with_outstanding_io`.
    pub vps_with_mapped_interrupts_no_io: Vec<u32>,

    /// List of vCPUs with outstanding I/O at the time of save, sorted by CPU ID.
    pub vps_with_outstanding_io: Vec<u32>,
}

/// Analyzes the saved NVMe manager state to determine which vCPUs had mapped device interrupts
/// and which had outstanding I/O at the time of save.
///
/// See [`VPInterruptState`] for more details.
pub fn nvme_interrupt_state(state: Option<&NvmeManagerSavedState>) -> VPInterruptState {
    let mut vp_state = BTreeMap::new();

    if let Some(state) = state {
        for disk in &state.nvme_disks {
            for q in &disk.driver_state.worker_data.io {
                match vp_state.entry(q.cpu) {
                    Entry::Vacant(e) => {
                        e.insert(!q.queue_data.handler_data.pending_cmds.commands.is_empty());
                    }
                    Entry::Occupied(mut e) => {
                        *e.get_mut() |= !q.queue_data.handler_data.pending_cmds.commands.is_empty();
                    }
                }
            }
        }
    }

    let (vps_with_outstanding_io, vps_with_mapped_interrupts_no_io): (Vec<_>, Vec<_>) = vp_state
        .iter()
        .map(|(&vp, &has_outstanding_io)| (vp, has_outstanding_io))
Contributor:
Suggested change:
-        .map(|(&vp, &has_outstanding_io)| (vp, has_outstanding_io))
+        .copied()
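A hedged side note on this suggestion (not from the PR): Iterator::copied() requires an iterator over &T, while BTreeMap::iter() yields (&K, &V) tuples, so .copied() would not type-check here as written. Since vp_state is not used afterwards, one sketch that drops the map step entirely is to consume the map:

    // Sketch (assumption, not the PR's code): into_iter() yields owned
    // (u32, bool) pairs directly, so no per-element copy or map is needed.
    let (vps_with_outstanding_io, vps_with_mapped_interrupts_no_io): (Vec<_>, Vec<_>) =
        vp_state
            .into_iter()
            .partition(|&(_, has_outstanding_io)| has_outstanding_io);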

        .partition(|&(_, has_outstanding_io)| has_outstanding_io);
Contributor:
Suggested change:
-        .partition(|&(_, has_outstanding_io)| has_outstanding_io);
+        .partition(|&(_, has_outstanding_io)| has_outstanding_io).map(|(vp, _)| vp);

You can dedup the mappings done below here, I think.
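Since partition() returns a tuple of two collections, the chained .map() in the suggestion above would not compile as written; combined with the into_iter() form sketched earlier, one hypothetical shape for the dedup (names are illustrative, not compile-checked against the repo) is a small local helper:

    // Hypothetical sketch of the dedup idea: extract the vp IDs in one place
    // instead of repeating .map(|(vp, _)| vp).collect() for each side.
    fn vps(pairs: Vec<(u32, bool)>) -> Vec<u32> {
        pairs.into_iter().map(|(vp, _)| vp).collect()
    }

    let (with_io, no_io): (Vec<_>, Vec<_>) = vp_state
        .into_iter()
        .partition(|&(_, has_outstanding_io)| has_outstanding_io);

    VPInterruptState {
        vps_with_mapped_interrupts_no_io: vps(no_io),
        vps_with_outstanding_io: vps(with_io),
    }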


    VPInterruptState {
        vps_with_mapped_interrupts_no_io: vps_with_mapped_interrupts_no_io
            .into_iter()
            .map(|(vp, _)| vp)
            .collect(),
        vps_with_outstanding_io: vps_with_outstanding_io
            .into_iter()
            .map(|(vp, _)| vp)
            .collect(),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::nvme_manager::save_restore::{NvmeManagerSavedState, NvmeSavedDiskConfig};
    use nvme_driver::NvmeDriverSavedState;
    use nvme_driver::save_restore::{
        CompletionQueueSavedState, IoQueueSavedState, NvmeDriverWorkerSavedState,
        PendingCommandSavedState, PendingCommandsSavedState, QueueHandlerSavedState,
        QueuePairSavedState, SubmissionQueueSavedState,
    };
    use nvme_spec as spec;
    use zerocopy::FromZeros;

    #[test]
    fn returns_empty_when_state_absent() {
        let result = nvme_interrupt_state(None);
        assert!(result.vps_with_mapped_interrupts_no_io.is_empty());
        assert!(result.vps_with_outstanding_io.is_empty());
    }

    #[test]
    fn collects_unique_sorted_vps_and_outstanding_subset() {
        let state = build_state(vec![
            vec![QueueSpec::new(2, false), QueueSpec::new(1, true)],
            vec![QueueSpec::new(1, false), QueueSpec::new(3, true)],
            vec![QueueSpec::new(5, false), QueueSpec::new(2, false)],
        ]);

        let result = nvme_interrupt_state(Some(&state));

        assert_eq!(result.vps_with_mapped_interrupts_no_io, vec![2, 5]);
        assert_eq!(result.vps_with_outstanding_io, vec![1, 3]);
    }

    #[test]
    fn reports_outstanding_if_any_queue_pending_for_vp() {
        let state = build_state(vec![vec![
            QueueSpec::new(4, false),
            QueueSpec::new(4, true),
        ]]);

        let result = nvme_interrupt_state(Some(&state));

        assert_eq!(
            result.vps_with_mapped_interrupts_no_io,
            Vec::<u32>::from_iter([])
        );
        assert_eq!(result.vps_with_outstanding_io, vec![4]);
    }

    #[test]
    fn handles_state_with_no_disks() {
        let state = NvmeManagerSavedState {
            cpu_count: 0,
            nvme_disks: Vec::new(),
        };

        let result = nvme_interrupt_state(Some(&state));

        assert!(result.vps_with_mapped_interrupts_no_io.is_empty());
        assert!(result.vps_with_outstanding_io.is_empty());
    }

    struct QueueSpec {
        cpu: u32,
        has_outstanding_io: bool,
    }

    impl QueueSpec {
        const fn new(cpu: u32, has_outstanding_io: bool) -> Self {
            Self {
                cpu,
                has_outstanding_io,
            }
        }
    }

    // Helper to fabricate NVMe manager save-state snapshots with specific CPU/IO mappings.
    fn build_state(disk_queue_specs: Vec<Vec<QueueSpec>>) -> NvmeManagerSavedState {
        NvmeManagerSavedState {
            cpu_count: 0, // Not relevant for these tests.
            nvme_disks: disk_queue_specs
                .into_iter()
                .enumerate()
                .map(|(disk_index, queues)| NvmeSavedDiskConfig {
                    pci_id: format!("0000:{disk_index:02x}.0"),
                    driver_state: NvmeDriverSavedState {
                        identify_ctrl: spec::IdentifyController::new_zeroed(),
                        device_id: format!("disk{disk_index}"),
                        namespaces: Vec::new(),
                        worker_data: NvmeDriverWorkerSavedState {
                            admin: None,
                            io: queues
                                .into_iter()
                                .enumerate()
                                .map(|(queue_index, spec)| {
                                    // Tests only care about per-disk affinity, so queue IDs can
                                    // restart from zero for each disk without losing coverage.
                                    build_io_queue(
                                        queue_index as u16,
                                        spec.cpu,
                                        spec.has_outstanding_io,
                                    )
                                })
                                .collect(),
                            qsize: 0,
                            max_io_queues: 0,
                        },
                    },
                })
                .collect(),
        }
    }

    fn build_io_queue(qid: u16, cpu: u32, outstanding: bool) -> IoQueueSavedState {
        IoQueueSavedState {
            cpu,
            iv: qid as u32,
            queue_data: QueuePairSavedState {
                mem_len: 0,
                base_pfn: 0,
                qid,
                sq_entries: 1,
                cq_entries: 1,
                handler_data: QueueHandlerSavedState {
                    sq_state: SubmissionQueueSavedState {
                        sqid: qid,
                        head: 0,
                        tail: 0,
                        committed_tail: 0,
                        len: 1,
                    },
                    cq_state: CompletionQueueSavedState {
                        cqid: qid,
                        head: 0,
                        committed_head: 0,
                        len: 1,
                        phase: false,
                    },
                    pending_cmds: build_pending_cmds(outstanding),
                    aer_handler: None,
                },
            },
        }
    }

    fn build_pending_cmds(outstanding: bool) -> PendingCommandsSavedState {
        PendingCommandsSavedState {
            commands: if outstanding {
                vec![PendingCommandSavedState {
                    command: spec::Command::new_zeroed(),
                }]
            } else {
                Vec::new()
            },
            next_cid_high_bits: 0,
            cid_key_bits: 0,
        }
    }
}