Skip to content
71 changes: 40 additions & 31 deletions openhcl/openhcl_boot/src/host_params/dt/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -849,32 +849,29 @@ impl PartitionInfo {
init_heap(params);

let persisted_state_header = read_persisted_region_header(params);
let (topology, has_devices_that_should_disable_sidecar) =
if let Some(header) = persisted_state_header {
log!("found persisted state header");
let persisted_topology =
topology_from_persisted_state(header, params, parsed, address_space)?;

(
persisted_topology.topology,
!(persisted_topology
.cpus_with_mapped_interrupts_no_io
.is_empty()
&& persisted_topology.cpus_with_outstanding_io.is_empty()),
)
} else {
(
topology_from_host_dt(params, parsed, &options, address_space)?,
false,
)
};
let (topology, cpus_with_mapped_interrupts) = if let Some(header) = persisted_state_header {
log!("found persisted state header");
let persisted_topology =
topology_from_persisted_state(header, params, parsed, address_space)?;

(
persisted_topology.topology,
persisted_topology.cpus_with_outstanding_io,
)
} else {
(
topology_from_host_dt(params, parsed, &options, address_space)?,
Vec::new(),
)
};

let Self {
vtl2_ram,
partition_ram,
isolation,
bsp_reg,
cpus,
sidecar_cpu_overrides,
vmbus_vtl0,
vmbus_vtl2,
cmdline: _,
Expand All @@ -888,20 +885,32 @@ impl PartitionInfo {
boot_options,
} = storage;

if let (SidecarOptions::Enabled { cpu_threshold, .. }, true) = (
if let (SidecarOptions::Enabled { .. }, true) = (
&boot_options.sidecar,
has_devices_that_should_disable_sidecar,
!cpus_with_mapped_interrupts.is_empty(),
) {
if cpu_threshold.is_none()
|| cpu_threshold
.and_then(|threshold| threshold.try_into().ok())
.is_some_and(|threshold| parsed.cpu_count() < threshold)
{
// If we are in the restore path, disable sidecar for small VMs, as the amortization
// benefits don't apply when devices are kept alive; the CPUs need to be powered on anyway
// to check for interrupts.
log!("disabling sidecar, as we are restoring from persisted state");
boot_options.sidecar = SidecarOptions::DisabledServicing;
let max_cpu_id = *cpus_with_mapped_interrupts
.iter()
.max()
.expect("non-empty vector") as usize;
if max_cpu_id < sidecar_cpu_overrides.sidecar_starts_cpu.len() {
sidecar_cpu_overrides.per_cpu_state_specified = true;
cpus_with_mapped_interrupts.iter().for_each(|&cpu_id| {
sidecar_cpu_overrides.sidecar_starts_cpu[cpu_id as usize] = false;
});
log!(
"disabling sidecar for CPUs {:?} due to mapped interrupts, per_cpu_state_specified={}, per_cpu_state={:?}",
cpus_with_mapped_interrupts,
sidecar_cpu_overrides.per_cpu_state_specified,
&sidecar_cpu_overrides.sidecar_starts_cpu[..max_cpu_id + 1],
);
} else {
// Degenerate case, and we'll need those VPs started by the time OpenHCL usermode restores anyways, so just
// disable sidecar.
log!(
"too many CPUs with mapped interrupts ({}), disabling sidecar",
cpus_with_mapped_interrupts.len()
);
options.sidecar = SidecarOptions::DisabledServicing;
}
}
Expand Down
6 changes: 6 additions & 0 deletions openhcl/openhcl_boot/src/host_params/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ pub struct PartitionInfo {
pub bsp_reg: u32,
/// Cpu info for enabled cpus.
pub cpus: ArrayVec<CpuEntry, MAX_CPU_COUNT>,
/// Per-CPU state to apply when starting the sidecar kernel.
pub sidecar_cpu_overrides: sidecar_defs::PerCpuState,
/// VMBUS info for VTL2.
pub vmbus_vtl2: VmbusInfo,
/// VMBUS info for VTL0.
Expand Down Expand Up @@ -90,6 +92,10 @@ impl PartitionInfo {
isolation: IsolationType::None,
bsp_reg: 0,
cpus: ArrayVec::new_const(),
sidecar_cpu_overrides: sidecar_defs::PerCpuState {
per_cpu_state_specified: false,
sidecar_starts_cpu: [true; sidecar_defs::NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE],
},
vmbus_vtl2: VmbusInfo {
mmio: ArrayVec::new_const(),
connection_id: 0,
Expand Down
5 changes: 5 additions & 0 deletions openhcl/openhcl_boot/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,7 @@ mod test {
use loader_defs::linux::e820entry;
use memory_range::MemoryRange;
use memory_range::subtract_ranges;
use sidecar_defs::PerCpuState;
use zerocopy::FromZeros;

const HIGH_MMIO_GAP_END: u64 = 0x1000000000; // 64 GiB
Expand All @@ -915,6 +916,10 @@ mod test {
isolation: IsolationType::None,
bsp_reg: cpus[0].reg as u32,
cpus,
sidecar_cpu_overrides: PerCpuState {
per_cpu_state_specified: false,
sidecar_starts_cpu: [true; sidecar_defs::NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE],
},
cmdline: ArrayString::new(),
vmbus_vtl2: VmbusInfo {
mmio,
Expand Down
38 changes: 34 additions & 4 deletions openhcl/openhcl_boot/src/sidecar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ const _: () = assert!(
);

pub struct SidecarConfig<'a> {
pub num_cpus: usize,
pub node_params: &'a [SidecarNodeParams],
pub per_cpu_state: &'a sidecar_defs::PerCpuState,
pub nodes: &'a [SidecarNodeOutput],
pub start_reftime: u64,
pub end_reftime: u64,
Expand All @@ -49,10 +51,27 @@ impl core::fmt::Display for SidecarKernelCommandLine<'_> {
// Linux boots with the base VP of each sidecar node. Other CPUs will
// be brought up by the sidecar kernel.
f.write_str("boot_cpus=")?;
let mut comma = "";
for node in self.0.node_params {
write!(f, "{}{}", comma, node.base_vp)?;
comma = ",";
if self.0.per_cpu_state.per_cpu_state_specified {
let mut comma = "";
for (cpu_index, &sidecar_starts) in self
.0
.per_cpu_state
.sidecar_starts_cpu
.iter()
.take(self.0.num_cpus)
.enumerate()
{
if !sidecar_starts {
write!(f, "{}{}", comma, cpu_index)?;
comma = ",";
}
}
} else {
let mut comma = "";
for node in self.0.node_params {
write!(f, "{}{}", comma, node.base_vp)?;
comma = ",";
}
}
Ok(())
}
Expand Down Expand Up @@ -137,6 +156,7 @@ pub fn start_sidecar<'a>(
enable_logging: _,
node_count,
nodes,
initial_state,
} = sidecar_params;

*hypercall_page = 0;
Expand Down Expand Up @@ -189,6 +209,14 @@ pub fn start_sidecar<'a>(
base_vp,
vp_count: cpus.len() as u32,
};
*initial_state = partition_info.sidecar_cpu_overrides.clone();
if initial_state.per_cpu_state_specified {
// If per-CPU state is specified, make sure to explicitly state that
// sidecar should not start the base vp of this node.
// The code that set per_cpu_state_specified should have already ensured that
// the array is large enough for any `base_vp` we might have here.
initial_state.sidecar_starts_cpu[base_vp as usize] = false;
}
base_vp += cpus.len() as u32;
*node_count += 1;
total_ram += required_ram;
Expand Down Expand Up @@ -217,9 +245,11 @@ pub fn start_sidecar<'a>(

let SidecarOutput { nodes, error: _ } = sidecar_output;
Some(SidecarConfig {
num_cpus: partition_info.cpus.len(),
start_reftime: boot_start_reftime,
end_reftime: boot_end_reftime,
node_params: &sidecar_params.nodes[..node_count],
nodes: &nodes[..node_count],
per_cpu_state: &sidecar_params.initial_state,
})
}
47 changes: 44 additions & 3 deletions openhcl/sidecar/src/arch/x86_64/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use core::hint::spin_loop;
use core::mem::MaybeUninit;
use core::ptr::addr_of;
use core::ptr::addr_of_mut;
use core::sync::atomic::AtomicU8;
use core::sync::atomic::AtomicU32;
use core::sync::atomic::Ordering::Acquire;
use core::sync::atomic::Ordering::Relaxed;
Expand Down Expand Up @@ -185,6 +186,7 @@ fn init(
enable_logging,
node_count,
ref nodes,
ref initial_state,
} = params;

ENABLE_LOG.store(enable_logging, Relaxed);
Expand Down Expand Up @@ -318,9 +320,31 @@ fn init(
*response_vector = 0.into();
*needs_attention = 0.into();
reserved.fill(0);

// Default: the base VP is REMOVED (meaning the kernel will start it), the
// next `vp_count - 1` VPs are RUN, and any remaining VPs are REMOVED.
cpu_status[0] = CpuStatus::REMOVED.0.into();
cpu_status[1..vp_count as usize].fill_with(|| CpuStatus::RUN.0.into());
cpu_status[vp_count as usize..].fill_with(|| CpuStatus::REMOVED.0.into());

// Override with the initial state from openhcl_boot.
if initial_state.per_cpu_state_specified {
for (i, &should_start) in initial_state
.sidecar_starts_cpu
.iter()
.skip(base_vp as usize)
.enumerate()
.take(vp_count as usize)
{
// Pragmatically, any CPU that has `RUN` should already match
// the default above, but set it anyway for clarity.
cpu_status[i] = if should_start {
CpuStatus::RUN.0.into()
} else {
CpuStatus::REMOVED.0.into()
};
}
}
}

node_init.push(NodeInit {
Expand All @@ -340,7 +364,8 @@ fn init(
//
// SAFETY: no concurrent mutators.
let node_init = unsafe { &*addr_of!(NODE_INIT) };
start_aps(node_init, mapper);
let cpu_status: [AtomicU8; 1] = [0.into()]; // TODO: get the cpu_status for the BSP node
start_aps(node_init, mapper, &cpu_status);

// Wait for all the APs to finish starting.
{
Expand All @@ -366,14 +391,28 @@ struct NodeInit {

static mut NODE_INIT: ArrayVec<NodeInit, { sidecar_defs::MAX_NODES }> = ArrayVec::new_const();

fn start_aps(node_init: &[NodeInit], mapper: &mut temporary_map::Mapper) {
fn start_aps(node_init: &[NodeInit], mapper: &mut temporary_map::Mapper, cpu_status: &[AtomicU8]) {
for node in node_init {
loop {
let node_cpu_index = node.next_vp.fetch_add(1, Relaxed);
assert!(node_cpu_index != u32::MAX);
if node_cpu_index >= node.node.vp_count {
break;
}

// See the init function for where we set the initial CPU status for why we can end up here with a REMOVED CPU.
// todo: map the control page PA for this node rather than relying on the passed in cpu_status slice (since that is only for
// the node that's actually starting something).
if cpu_status[node_cpu_index as usize].load(Relaxed) == CpuStatus::REMOVED.0.into() {
log!(
"skipping VP {} (node base {}, node idx {}) as REMOVED",
node.node.base_vp + node_cpu_index,
node.node.base_vp,
node_cpu_index
);
continue;
}

match node.node.start(mapper, node_cpu_index) {
Ok(()) => {}
Err(err) => {
Expand All @@ -392,12 +431,14 @@ fn start_aps(node_init: &[NodeInit], mapper: &mut temporary_map::Mapper) {
/// Must be called as an AP entry point.
unsafe fn ap_init() -> ! {
// Start any other pending APs.
// mattkur: I'm confused why this works ... start_aps will start at node 0, just like the BSP.
// what's stopping each node from trampling over each other?
{
// SAFETY: `NODE_INIT` is set before this routine is called.
let node_init = unsafe { &*addr_of!(NODE_INIT) };
// SAFETY: nothing else on this CPU is using the temporary map.
let mut mapper = unsafe { temporary_map::Mapper::new(0) };
start_aps(node_init, &mut mapper)
start_aps(node_init, &mut mapper, &super::vp::control().cpu_status); // see todo in start_aps for why cpu_status is wrong here.
}
// SAFETY: this is an entry point.
unsafe { super::vp::ap_entry() }
Expand Down
4 changes: 3 additions & 1 deletion openhcl/sidecar/src/arch/x86_64/vp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,9 @@ fn ap_run(globals: &mut VpGlobals) {
}
}

fn control() -> &'static ControlPage {
/// Returns a reference to the control page for this VP.
/// Only use this on AP startup paths.
pub fn control() -> &'static ControlPage {
// SAFETY: all mutable fields of the control page have interior mutability,
// so this is a valid dereference.
unsafe { &*addr_space::control_page() }
Expand Down
23 changes: 23 additions & 0 deletions openhcl/sidecar_defs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,23 @@ use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

/// Starting state for each CPU, supplied by `openhcl_boot`.
/// For various reasons, `openhcl_boot` may have decided to
/// instruct the linux kernel to start additional CPUs (notably,
/// in the event that we know we're about to spawn tasks on those
/// CPUs right away to handle device interrupts).
#[repr(C)]
#[derive(FromZeros, Immutable, KnownLayout, Debug, Clone)]
pub struct PerCpuState {
/// Whether the per-CPU state is specified, since `NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE`
/// is less than the maximum number of CPUs supported by OpenHCL and also by sidecar.
pub per_cpu_state_specified: bool,

/// Whether the CPU should be started by the sidecar kernel. If false,
/// the CPU will be started by (or remain with) the main kernel instead.
pub sidecar_starts_cpu: [bool; NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE],
}

/// Sidecar start input parameters.
#[repr(C, align(4096))]
#[derive(FromZeros, Immutable, KnownLayout)]
Expand All @@ -30,6 +47,8 @@ pub struct SidecarParams {
pub node_count: u32,
/// The node-specific input parameters.
pub nodes: [SidecarNodeParams; MAX_NODES],
/// The initial CPU state for each CPU.
pub initial_state: PerCpuState,
}

/// Node-specific input parameters.
Expand All @@ -49,6 +68,10 @@ pub struct SidecarNodeParams {

/// The maximum number of supported sidecar nodes.
pub const MAX_NODES: usize = 128;
/// The maximum number of supported sidecar CPUs.
/// This odd number is chosen so that `SidecarParams` still
/// fits within a single page.
pub const NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE: usize = 1000;

const _: () = assert!(size_of::<SidecarParams>() <= PAGE_SIZE);

Expand Down
Loading