|
| 1 | +/* |
| 2 | +Copyright 2025 The Hyperlight Authors. |
| 3 | +
|
| 4 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +you may not use this file except in compliance with the License. |
| 6 | +You may obtain a copy of the License at |
| 7 | +
|
| 8 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +
|
| 10 | +Unless required by applicable law or agreed to in writing, software |
| 11 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +See the License for the specific language governing permissions and |
| 14 | +limitations under the License. |
| 15 | +*/ |
| 16 | + |
| 17 | +//! Host process crash handler for generating sandbox dumps. |
| 18 | +//! |
| 19 | +//! This module provides crash detection and dump generation for sandboxes |
| 20 | +//! when the host process crashes due to unhandled signals (Linux) or |
| 21 | +//! exceptions (Windows). |
| 22 | +//! |
| 23 | +//! # Architecture |
| 24 | +//! |
| 25 | +//! - **Registry**: Global map of sandbox ID -> raw hypervisor pointer (one entry per registered sandbox) |
| 26 | +//! - **Linux**: Signal handlers via `sigaction()` for fatal signals |
| 27 | +//! - **Windows**: Vectored exception handler via `AddVectoredExceptionHandler()` |
| 28 | +//! - **Automatic**: Initialized on first sandbox registration |
| 29 | +//! - **Cleanup**: Entries removed on sandbox Drop |
| 30 | +//! |
| 31 | +//! # Usage |
| 32 | +//! |
| 33 | +//! The crash handler is automatically initialized when the first sandbox |
| 34 | +//! is created. No explicit setup is required. When the host process crashes, |
| 35 | +//! dumps are generated for all registered sandboxes that have `guest_core_dump` |
| 36 | +//! enabled in their runtime configuration. |
| 37 | +//! |
| 38 | +//! # Feature Flag |
| 39 | +//! |
| 40 | +//! This entire module requires the `crashdump` feature to be enabled. |
| 41 | +
|
| 42 | +use std::sync::Mutex as StdMutex; |
| 43 | +use std::sync::atomic::{AtomicBool, Ordering}; |
| 44 | + |
| 45 | +use dashmap::DashMap; |
| 46 | +use once_cell::sync::Lazy; |
| 47 | + |
| 48 | +use crate::hypervisor::Hypervisor; |
| 49 | +use crate::{Result, new_error}; |
| 50 | + |
/// Entry in the sandbox registry.
///
/// Holds nothing but a raw pointer to the hypervisor of a registered sandbox.
/// The design relies on the following assumptions to make dereferencing that
/// pointer tolerable during crash handling:
/// 1. The sandbox owns the hypervisor and won't drop it while registered
/// 2. During a crash, normal thread-safety doesn't matter
/// 3. We only access these pointers during crash (process is dying anyway)
///
/// NOTE(review): nothing in this type enforces assumption 1. It holds only
/// if every registration is paired with an unregistration before the
/// hypervisor is dropped; confirm against the sandbox's `Drop` implementation.
struct SandboxEntry {
    /// Raw pointer to the hypervisor (UNSAFE - only valid while sandbox is alive).
    /// Stored as a raw pointer so the registry does not borrow from, or extend
    /// the lifetime of, the sandbox that owns the hypervisor.
    hypervisor_ptr: *const dyn Hypervisor,
}
| 62 | + |
// SAFETY: Raw pointers are neither `Send` nor `Sync` by default, and the
// global registry needs both for its values. The stored pointer is only
// dereferenced in `generate_crash_dumps`, i.e. during crash handling, when the
// process is dying anyway and normal thread-safety rules are deliberately not
// upheld.
// NOTE(review): this is a pragmatic argument, not a proof of soundness. The
// pointee may be in use by another thread at the moment of the crash.
unsafe impl Send for SandboxEntry {}
// SAFETY: Same reasoning as for `Send` above. Shared access to an entry only
// copies or dereferences the pointer on the crash path.
unsafe impl Sync for SandboxEntry {}
| 67 | + |
/// Global registry of active sandboxes.
///
/// Maps sandbox ID to a [`SandboxEntry`] holding the hypervisor pointer.
/// `DashMap` is used so that sandboxes can be registered and unregistered
/// concurrently without a single global lock. Entries are added by
/// `register_sandbox` and removed by `unregister_sandbox`.
///
/// NOTE(review): `DashMap` is documented as a sharded map guarded by
/// per-shard locks, not as a lock-free structure. Confirm that iterating it
/// from a crash handler cannot block on a shard lock held by a thread that was
/// interrupted in the middle of an insert or remove.
static SANDBOX_REGISTRY: Lazy<DashMap<u64, SandboxEntry>> = Lazy::new(DashMap::new);
| 73 | + |
/// Fast check for whether crash handlers have been initialized.
///
/// This atomic bool allows `register_sandbox` to skip the initialization lock
/// once initialization has completed. It is written with `Release` after the
/// platform handlers are installed and read with `Acquire`, so a thread that
/// observes `true` also observes the completed initialization.
static INITIALIZED_FAST: AtomicBool = AtomicBool::new(false);

/// Tracks whether initialization failed.
///
/// In this file the flag is set only when the `INITIALIZED` mutex is found
/// poisoned. Once set, `register_sandbox` returns an error immediately instead
/// of retrying. Unregistration and dump generation do not consult it.
static INITIALIZATION_FAILED: AtomicBool = AtomicBool::new(false);

/// Mutex-protected initialization flag (only taken while `INITIALIZED_FAST`
/// is still false).
/// We use std::sync::Mutex here (not parking_lot) so we can detect poisoning,
/// e.g. when a previous initialization attempt panicked while holding the lock.
static INITIALIZED: Lazy<StdMutex<bool>> = Lazy::new(|| StdMutex::new(false));
| 89 | + |
| 90 | +/// Register a sandbox with the crash handler. |
| 91 | +/// |
| 92 | +/// This function: |
| 93 | +/// 1. Stores a raw pointer to the hypervisor (unsafe but controlled) |
| 94 | +/// 2. Initializes crash handlers on first call (lazy init) |
| 95 | +/// |
| 96 | +/// Only registers the sandbox if crash dumps are enabled. If disabled, |
| 97 | +/// this function returns immediately without doing anything. |
| 98 | +/// |
| 99 | +/// # Arguments |
| 100 | +/// |
| 101 | +/// * `sandbox_id` - Unique ID of the sandbox |
| 102 | +/// * `hypervisor` - Reference to the hypervisor (we store a raw pointer) |
| 103 | +/// |
| 104 | +/// # Safety |
| 105 | +/// |
| 106 | +/// The caller MUST ensure the sandbox is unregistered before the hypervisor is dropped! |
| 107 | +/// This is enforced by MultiUseSandbox::Drop. |
| 108 | +/// |
| 109 | +/// # Errors |
| 110 | +/// |
| 111 | +/// Returns an error if the mutex is poisoned (extremely rare, would indicate |
| 112 | +/// a serious issue elsewhere in the program). |
| 113 | +pub fn register_sandbox(sandbox_id: u64, hypervisor: &dyn Hypervisor) -> Result<()> { |
| 114 | + // Check if initialization previously failed - no point trying again |
| 115 | + if INITIALIZATION_FAILED.load(Ordering::Acquire) { |
| 116 | + return Err(new_error!( |
| 117 | + "Crash handler initialization previously failed, skipping registration" |
| 118 | + )); |
| 119 | + } |
| 120 | + |
| 121 | + // Fast path: check if already initialized (lock-free!) |
| 122 | + if !INITIALIZED_FAST.load(Ordering::Acquire) { |
| 123 | + // Slow path: need to initialize (only happens once) |
| 124 | + match INITIALIZED.lock() { |
| 125 | + Ok(mut initialized) => { |
| 126 | + // Double-check inside the lock (another thread might have initialized) |
| 127 | + if !*initialized { |
| 128 | + platform::init_crash_handlers(); |
| 129 | + *initialized = true; |
| 130 | + // Mark as initialized atomically (Release ensures all init is visible) |
| 131 | + INITIALIZED_FAST.store(true, Ordering::Release); |
| 132 | + } |
| 133 | + } |
| 134 | + Err(e) => { |
| 135 | + // Mutex is poisoned - mark as failed and return error |
| 136 | + INITIALIZATION_FAILED.store(true, Ordering::Release); |
| 137 | + return Err(new_error!( |
| 138 | + "INITIALIZED mutex poisoned during crash handler init: {}", |
| 139 | + e |
| 140 | + )); |
| 141 | + } |
| 142 | + } |
| 143 | + } |
| 144 | + |
| 145 | + // Add entry to registry (lock-free with DashMap!) |
| 146 | + let hypervisor_ptr = unsafe { |
| 147 | + std::mem::transmute::<*const dyn Hypervisor, *const dyn Hypervisor>( |
| 148 | + hypervisor as *const dyn Hypervisor, |
| 149 | + ) |
| 150 | + }; |
| 151 | + |
| 152 | + SANDBOX_REGISTRY.insert(sandbox_id, SandboxEntry { hypervisor_ptr }); |
| 153 | + |
| 154 | + Ok(()) |
| 155 | +} |
| 156 | + |
| 157 | +/// Unregister a sandbox from the crash handler. |
| 158 | +/// |
| 159 | +/// Called automatically by MultiUseSandbox::Drop. |
| 160 | +/// |
| 161 | +/// # Arguments |
| 162 | +/// |
| 163 | +/// * `sandbox_id` - Unique ID of the sandbox to unregister |
| 164 | +pub fn unregister_sandbox(sandbox_id: u64) { |
| 165 | + // Lock-free removal with DashMap |
| 166 | + SANDBOX_REGISTRY.remove(&sandbox_id); |
| 167 | +} |
| 168 | + |
| 169 | +/// Generate dumps for all registered sandboxes. |
| 170 | +/// |
| 171 | +/// Called by platform-specific crash handlers when a fatal signal/exception occurs. |
| 172 | +/// Iterates through the registry and generates dumps for all registered sandboxes. |
| 173 | +/// Only sandboxes with dumps enabled are registered, so all entries get dumped. |
| 174 | +/// |
| 175 | +/// # Safety |
| 176 | +/// |
| 177 | +/// This function is called during crash handling and: |
| 178 | +/// - Dereferences raw pointers (unsafe but acceptable during crash) |
| 179 | +/// - May violate async-signal-safety on Linux |
| 180 | +/// - Accesses hypervisor state without locks |
| 181 | +/// |
| 182 | +/// All of this is acceptable because the process is crashing anyway. |
| 183 | +/// |
| 184 | +/// # Returns |
| 185 | +/// |
| 186 | +/// Number of dumps successfully generated. |
| 187 | +pub(crate) fn generate_crash_dumps() -> usize { |
| 188 | + let mut dump_count = 0; |
| 189 | + |
| 190 | + // Iterate over the lock-free registry |
| 191 | + for entry_ref in SANDBOX_REGISTRY.iter() { |
| 192 | + let entry = entry_ref.value(); |
| 193 | + |
| 194 | + // SAFETY: This is unsafe! We're dereferencing a raw pointer. |
| 195 | + // This is acceptable because: |
| 196 | + // 1. The sandbox registers/unregisters properly via Drop |
| 197 | + // 2. During a crash, the process is dying anyway |
| 198 | + // 3. We're willing to accept potential UB during crash handling |
| 199 | + unsafe { |
| 200 | + let hypervisor = &*entry.hypervisor_ptr; |
| 201 | + |
| 202 | + // Try to generate the crash dump |
| 203 | + // This is NOT async-signal-safe (file I/O, allocations, etc.) |
| 204 | + // but we're crashing, so this is acceptable |
| 205 | + // |
| 206 | + // Catch panics: If generating one dump panics, it maybe indicates |
| 207 | + // a systemic issue so we short-circuit |
| 208 | + // rather than risk cascading failures |
| 209 | + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { |
| 210 | + crate::hypervisor::crashdump::generate_crashdump(hypervisor) |
| 211 | + })); |
| 212 | + |
| 213 | + match result { |
| 214 | + Ok(Ok(())) => { |
| 215 | + dump_count += 1; |
| 216 | + } |
| 217 | + Ok(Err(_)) => { |
| 218 | + // Silent failure - dump generation returned an error |
| 219 | + } |
| 220 | + Err(_) => { |
| 221 | + // Panic during dump generation - abort remaining dumps |
| 222 | + // This may indicate a systemic issue |
| 223 | + break; |
| 224 | + } |
| 225 | + } |
| 226 | + } |
| 227 | + } |
| 228 | + |
| 229 | + dump_count |
| 230 | +} |
| 231 | + |
// Platform-specific implementations.
//
// Exactly one of the two declarations below is compiled in, selected by the
// target OS. `register_sandbox` calls `platform::init_crash_handlers()`, so
// each backend must provide that function.
// NOTE(review): no `platform` module is declared for targets other than Linux
// and Windows. Confirm the module is only built for those two.

// Linux backend (per the module docs: signal handlers via `sigaction()`).
#[cfg(target_os = "linux")]
#[path = "crash_handler/linux.rs"]
mod platform;

// Windows backend (per the module docs: a vectored exception handler).
#[cfg(target_os = "windows")]
#[path = "crash_handler/windows.rs"]
mod platform;
0 commit comments