From 1852abf9da0971b00e5c370380657ee5e8f80a7a Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Tue, 28 Oct 2025 12:43:28 +0000 Subject: [PATCH] Create crashdumps for VMs if process is crashing and creating a dump Signed-off-by: Simon Davies --- Cargo.lock | 22 ++ src/hyperlight_host/Cargo.toml | 4 + src/hyperlight_host/src/crash_handler.rs | 239 ++++++++++++++ .../src/crash_handler/linux.rs | 301 ++++++++++++++++++ .../src/crash_handler/windows.rs | 267 ++++++++++++++++ .../src/hypervisor/hyperv_linux.rs | 5 + .../src/hypervisor/hyperv_windows.rs | 5 + src/hyperlight_host/src/hypervisor/kvm.rs | 5 + src/hyperlight_host/src/hypervisor/mod.rs | 3 + src/hyperlight_host/src/lib.rs | 3 + .../src/sandbox/initialized_multi_use.rs | 24 +- typos.toml | 1 + 12 files changed, 878 insertions(+), 1 deletion(-) create mode 100644 src/hyperlight_host/src/crash_handler.rs create mode 100644 src/hyperlight_host/src/crash_handler/linux.rs create mode 100644 src/hyperlight_host/src/crash_handler/windows.rs diff --git a/Cargo.lock b/Cargo.lock index a9526f543..9e24ee8ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -640,6 +640,20 @@ dependencies = [ "typenum", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "derive_arbitrary" version = "1.4.2" @@ -1233,6 +1247,12 @@ dependencies = [ "byteorder", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "hashbrown" version = "0.15.5" @@ -1474,6 +1494,7 @@ dependencies = [ "criterion", "crossbeam-channel", "crossbeam-queue", + "dashmap", "elfcore", "env_logger", "envy", @@ -1499,6 
+1520,7 @@ dependencies = [ "mshv-bindings 0.6.1", "mshv-ioctls 0.2.1", "mshv-ioctls 0.6.1", + "once_cell", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", diff --git a/src/hyperlight_host/Cargo.toml b/src/hyperlight_host/Cargo.toml index 0c7bf1372..c614dbce0 100644 --- a/src/hyperlight_host/Cargo.toml +++ b/src/hyperlight_host/Cargo.toml @@ -51,6 +51,8 @@ metrics = "0.24.2" serde_json = "1.0" elfcore = "2.0" uuid = { version = "1.18.1", features = ["v4"] } +once_cell = "1.20" +dashmap = "6.1" [target.'cfg(windows)'.dependencies] windows = { version = "0.62", features = [ @@ -66,6 +68,8 @@ windows = { version = "0.62", features = [ "Win32_System_Threading", "Win32_System_JobObjects", "Win32_System_SystemServices", + "Win32_System_Registry", + "Win32_System_Kernel", ] } windows-sys = { version = "0.61", features = ["Win32"] } windows-result = "0.4" diff --git a/src/hyperlight_host/src/crash_handler.rs b/src/hyperlight_host/src/crash_handler.rs new file mode 100644 index 000000000..3baacade4 --- /dev/null +++ b/src/hyperlight_host/src/crash_handler.rs @@ -0,0 +1,239 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Host process crash handler for generating sandbox dumps. +//! +//! This module provides crash detection and dump generation for sandboxes +//! when the host process crashes due to unhandled signals (Linux) or +//! exceptions (Windows). +//! +//! # Architecture +//! +//! 
- **Registry**: Global map of sandbox ID -> hypervisor raw pointer (dump-enabled sandboxes only)
+//! - **Linux**: Signal handlers via `sigaction()` for fatal signals
+//! - **Windows**: Vectored exception handler via `AddVectoredExceptionHandler()`
+//! - **Automatic**: Initialized on first sandbox registration
+//! - **Cleanup**: Entries removed on sandbox Drop
+//!
+//! # Usage
+//!
+//! The crash handler is automatically initialized when the first sandbox with
+//! `guest_core_dump` enabled in its runtime configuration is created. No explicit
+//! setup is required. When the host process crashes, dumps are generated for all
+//! sandboxes that are registered at that time.
+//!
+//! # Feature Flag
+//!
+//! This entire module requires the `crashdump` feature to be enabled.
+
+use std::sync::Mutex as StdMutex;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use dashmap::DashMap;
+use once_cell::sync::Lazy;
+
+use crate::hypervisor::Hypervisor;
+use crate::{Result, new_error};
+
+/// Entry in the sandbox registry.
+///
+/// Stores a raw pointer to the hypervisor (unsafe!).
+/// Dereferencing it is only considered acceptable during crash handling because:
+/// 1. The sandbox owns the hypervisor and unregisters itself before dropping it
+/// 2. During a crash, normal thread-safety doesn't matter
+/// 3. We only access these pointers during crash (process is dying anyway)
+struct SandboxEntry {
+    /// Raw pointer to the hypervisor (UNSAFE - only valid while sandbox is alive)
+    hypervisor_ptr: *const dyn Hypervisor,
+}
+
+// SAFETY: We only access these pointers during crash handling, when the process
+// is dying anyway and normal thread-safety rules don't apply
+unsafe impl Send for SandboxEntry {}
+unsafe impl Sync for SandboxEntry {}
+
+/// Global registry of active sandboxes.
+///
+/// Maps sandbox ID to hypervisor pointer. DashMap allows concurrent access (it locks per shard).
+/// Entries are removed when sandboxes are dropped.
+static SANDBOX_REGISTRY: Lazy<DashMap<u64, SandboxEntry>> = Lazy::new(DashMap::new);
+
+/// Fast check for whether crash handlers have been initialized.
+///
+/// This atomic bool allows us to skip the initialization lock on the fast path
+/// (after first initialization). We use Acquire/Release ordering to ensure
+/// proper synchronization with the initialization code.
+static INITIALIZED_FAST: AtomicBool = AtomicBool::new(false);
+
+/// Tracks if initialization failed (poisoned mutex or other error).
+///
+/// If true, every later registration attempt returns an error without retrying.
+static INITIALIZATION_FAILED: AtomicBool = AtomicBool::new(false);
+
+/// Mutex-protected initialization flag (only used during first initialization).
+/// We use std::sync::Mutex here (not parking_lot) so we can detect poisoning.
+static INITIALIZED: Lazy<StdMutex<bool>> = Lazy::new(|| StdMutex::new(false));
+
+/// Register a sandbox with the crash handler.
+///
+/// This function:
+/// 1. Stores a raw pointer to the hypervisor (unsafe but controlled)
+/// 2. Initializes crash handlers on first call (lazy init)
+///
+/// Only sandboxes with crash dumps enabled should be registered. This function does
+/// not check that itself; the caller decides whether to call it.
+///
+/// # Arguments
+///
+/// * `sandbox_id` - Unique ID of the sandbox
+/// * `hypervisor` - Reference to the hypervisor (we store a raw pointer)
+///
+/// # Safety
+///
+/// The caller MUST ensure the sandbox is unregistered before the hypervisor is dropped!
+/// This is enforced by MultiUseSandbox::Drop.
+///
+/// # Errors
+///
+/// Returns an error if the initialization mutex is poisoned, or if a previous
+/// call already failed for that reason.
+pub fn register_sandbox(sandbox_id: u64, hypervisor: &dyn Hypervisor) -> Result<()> {
+    // Check if initialization previously failed - no point trying again
+    if INITIALIZATION_FAILED.load(Ordering::Acquire) {
+        return Err(new_error!(
+            "Crash handler initialization previously failed, skipping registration"
+        ));
+    }
+
+    // Fast path: a single atomic load once the handlers have been installed
+    if !INITIALIZED_FAST.load(Ordering::Acquire) {
+        // Slow path: need to initialize (only happens once)
+        match INITIALIZED.lock() {
+            Ok(mut initialized) => {
+                // Double-check inside the lock (another thread might have initialized)
+                if !*initialized {
+                    platform::init_crash_handlers();
+                    *initialized = true;
+                    // Mark as initialized atomically (Release ensures all init is visible)
+                    INITIALIZED_FAST.store(true, Ordering::Release);
+                }
+            }
+            Err(e) => {
+                // Mutex is poisoned - mark as failed and return error
+                INITIALIZATION_FAILED.store(true, Ordering::Release);
+                return Err(new_error!(
+                    "INITIALIZED mutex poisoned during crash handler init: {}",
+                    e
+                ));
+            }
+        }
+    }
+
+    // NOTE(review): same-type transmute; presumably it only erases the borrow lifetime - confirm
+    let hypervisor_ptr = unsafe {
+        std::mem::transmute::<*const dyn Hypervisor, *const dyn Hypervisor>(
+            hypervisor as *const dyn Hypervisor,
+        )
+    };
+
+    SANDBOX_REGISTRY.insert(sandbox_id, SandboxEntry { hypervisor_ptr });
+
+    Ok(())
+}
+
+/// Unregister a sandbox from the crash handler.
+///
+/// Called automatically by MultiUseSandbox::Drop.
+///
+/// # Arguments
+///
+/// * `sandbox_id` - Unique ID of the sandbox to unregister
+pub fn unregister_sandbox(sandbox_id: u64) {
+    // Nothing happens if the ID was never registered (e.g. dumps were disabled for it)
+    SANDBOX_REGISTRY.remove(&sandbox_id);
+}
+
+/// Generate dumps for all registered sandboxes.
+///
+/// Called by platform-specific crash handlers when a fatal signal/exception occurs.
+/// Iterates through the registry and generates dumps for all registered sandboxes.
+/// Only sandboxes with dumps enabled are registered, so all entries get dumped.
+///
+/// # Safety
+///
+/// This function is called during crash handling and:
+/// - Dereferences raw pointers (unsafe but acceptable during crash)
+/// - May violate async-signal-safety on Linux
+/// - Accesses hypervisor state without locks
+///
+/// All of this is acceptable because the process is crashing anyway.
+///
+/// # Returns
+///
+/// Number of dumps successfully generated.
+pub(crate) fn generate_crash_dumps() -> usize {
+    let mut dump_count = 0;
+
+    // Iterate over every registered sandbox
+    for entry_ref in SANDBOX_REGISTRY.iter() {
+        let entry = entry_ref.value();
+
+        // SAFETY: This is unsafe! We're dereferencing a raw pointer.
+        // This is acceptable because:
+        // 1. The sandbox registers/unregisters properly via Drop
+        // 2. During a crash, the process is dying anyway
+        // 3. We're willing to accept potential UB during crash handling
+        unsafe {
+            let hypervisor = &*entry.hypervisor_ptr;
+
+            // Try to generate the crash dump
+            // This is NOT async-signal-safe (file I/O, allocations, etc.)
+            // but we're crashing, so this is acceptable
+            //
+            // Catch panics: if generating one dump panics, that may indicate
+            // a systemic issue, so we stop after the first panic
+            // rather than risk cascading failures
+            let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+                crate::hypervisor::crashdump::generate_crashdump(hypervisor)
+            }));
+
+            match result {
+                Ok(Ok(())) => {
+                    dump_count += 1;
+                }
+                Ok(Err(_)) => {
+                    // Silent failure - this dump is skipped, remaining sandboxes are still tried
+                }
+                Err(_) => {
+                    // Panic during dump generation - abort remaining dumps
+                    // This may indicate a systemic issue
+                    break;
+                }
+            }
+        }
+    }
+
+    dump_count
+}
+
+// Platform-specific implementations
+#[cfg(target_os = "linux")]
+#[path = "crash_handler/linux.rs"]
+mod platform;
+
+#[cfg(target_os = "windows")]
+#[path = "crash_handler/windows.rs"]
+mod platform;
diff --git a/src/hyperlight_host/src/crash_handler/linux.rs b/src/hyperlight_host/src/crash_handler/linux.rs
new file mode 100644
index 000000000..6c7b681dc
--- /dev/null
+++ b/src/hyperlight_host/src/crash_handler/linux.rs
@@ -0,0 +1,301 @@
+/*
+Copyright 2025 The Hyperlight Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+//! Linux-specific crash handler using signal handlers.
+//!
+//! This module installs signal handlers for fatal signals that would
+//! normally trigger a core dump. When such a signal is received:
+//!
+//! 1. Generate dumps for all registered sandboxes
+//! 2. Chain to the previous signal handler (if any)
+//! 3. 
Re-raise the signal to allow OS to generate host core dump
+//!
+//! # Signals Handled
+//!
+//! - SIGSEGV: Segmentation fault
+//! - SIGABRT: Abort signal
+//! - SIGBUS: Bus error
+//! - SIGFPE: Floating point exception
+//! - SIGILL: Illegal instruction
+//! - SIGQUIT: Quit signal
+//! - SIGTRAP: Trace/breakpoint trap
+//! - SIGSYS: Bad system call
+//!
+//! # Async-Signal-Safety
+//!
+//! Signal handlers should only call async-signal-safe functions.
+//! However, we intentionally violate this because:
+//! - The process is crashing anyway
+//! - We want to generate useful dumps
+//! - Silent failure is acceptable
+//!
+//! The handler is annotated with comments explaining which operations
+//! are NOT async-signal-safe.
+
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use libc::{SA_RESTART, SA_SIGINFO, c_int, sigaction, siginfo_t};
+
+/// Flag to prevent recursive crash handling.
+///
+/// If we crash while handling a crash, don't try to handle it again.
+static IN_CRASH_HANDLER: AtomicBool = AtomicBool::new(false);
+
+/// Storage for previous signal handlers.
+///
+/// One slot per handled signal; `None` means no custom handler was installed before ours.
+static mut PREV_SIGSEGV: Option<sigaction> = None;
+static mut PREV_SIGABRT: Option<sigaction> = None;
+static mut PREV_SIGBUS: Option<sigaction> = None;
+static mut PREV_SIGFPE: Option<sigaction> = None;
+static mut PREV_SIGILL: Option<sigaction> = None;
+static mut PREV_SIGQUIT: Option<sigaction> = None;
+static mut PREV_SIGTRAP: Option<sigaction> = None;
+static mut PREV_SIGSYS: Option<sigaction> = None;
+
+/// Check if core dumps are enabled on this system.
+///
+/// Returns true if the OS will generate a core dump for the process on crash.
+/// This checks multiple Linux configuration options that can disable core dumps:
+///
+/// 1. RLIMIT_CORE: Resource limit (ulimit -c)
+/// 2. /proc/sys/kernel/core_pattern: Must not be empty or "|/bin/false"
+/// 3. Process dumpable flag (prctl PR_GET_DUMPABLE)
+///
+/// All checks are defensive and handle permission errors gracefully.
+/// If we can't determine the state, we assume dumps are enabled (fail-open).
+///
+/// # Returns
+///
+/// true if core dumps are likely enabled, false if clearly disabled
+fn are_core_dumps_enabled() -> bool {
+    // Check 1: RLIMIT_CORE (ulimit -c)
+    // This should always work - no special permissions needed
+    unsafe {
+        let mut rlim: libc::rlimit = std::mem::zeroed();
+        if libc::getrlimit(libc::RLIMIT_CORE, &mut rlim) == 0 && rlim.rlim_cur == 0 {
+            // Soft limit is 0 - treated as disabled (NOTE(review): confirm for piped core_pattern)
+            return false;
+        }
+
+        // If getrlimit failed continue checking other options
+    }
+
+    // Check 2: /proc/sys/kernel/core_pattern
+    // This might fail in containers or with restricted permissions
+    // If we can't read it, we assume dumps are enabled (fail-open)
+    match std::fs::read_to_string("/proc/sys/kernel/core_pattern") {
+        Ok(pattern) => {
+            let pattern = pattern.trim();
+            if pattern.is_empty()
+                || pattern == "|/bin/false"
+                || pattern == "|/usr/bin/false"
+                || pattern.starts_with("|/bin/false ")
+                || pattern.starts_with("|/usr/bin/false ")
+            {
+                // Core pattern is empty or pipes the dump into `false`: explicitly disabled
+                return false;
+            }
+        }
+        Err(_) => {
+            // Can't read the file (permission denied, doesn't exist in container, etc.)
+            // Fail-open: assume dumps are enabled
+        }
+    }
+
+    // Check 3: Process dumpable flag
+    // This should always work - no special permissions needed
+    // Some security policies (like running setuid) disable dumping
+    unsafe {
+        let dumpable = libc::prctl(libc::PR_GET_DUMPABLE, 0, 0, 0, 0);
+        if dumpable == 0 {
+            // Process is marked non-dumpable (0 = non-dumpable)
+            return false;
+        }
+        // dumpable == 1 means normal dumpable
+        // dumpable == 2 means dumpable but with restrictions
+        // dumpable < 0 means error (very unlikely)
+        // In all these cases, continue
+    }
+
+    // No check reported dumps as disabled (or the state was unreadable) - fail-open
+
+    true
+}
+
+/// Initialize crash handlers for all fatal signals.
+///
+/// Called once on first sandbox registration.
+///
+/// Only installs handlers if the OS will actually generate core dumps.
+/// If core dumps are disabled, we don't install handlers since there's
+/// no point generating sandbox dumps when the host won't dump.
+pub(super) fn init_crash_handlers() {
+    // Check if core dumps are enabled
+    if !are_core_dumps_enabled() {
+        log::info!("Core dumps disabled on this system, skipping crash handler installation");
+        return;
+    }
+
+    unsafe {
+        install_handler(libc::SIGSEGV, &raw mut PREV_SIGSEGV);
+        install_handler(libc::SIGABRT, &raw mut PREV_SIGABRT);
+        install_handler(libc::SIGBUS, &raw mut PREV_SIGBUS);
+        install_handler(libc::SIGFPE, &raw mut PREV_SIGFPE);
+        install_handler(libc::SIGILL, &raw mut PREV_SIGILL);
+        install_handler(libc::SIGQUIT, &raw mut PREV_SIGQUIT);
+        install_handler(libc::SIGTRAP, &raw mut PREV_SIGTRAP);
+        install_handler(libc::SIGSYS, &raw mut PREV_SIGSYS);
+    }
+}
+
+/// Install the crash handler for `signal`, saving any previous handler (SIG_IGN is kept).
+///
+/// # Safety
+///
+/// Calls unsafe libc::sigaction. Caller must ensure `prev_handler`
+/// points to valid static storage.
+unsafe fn install_handler(signal: c_int, prev_handler: *mut Option<sigaction>) {
+    // SAFETY: `sa` and `old_sa` are zero-initialised sigaction values owned by this
+    // function, and the caller guarantees `prev_handler` points to valid static storage
+    unsafe {
+        let mut sa: sigaction = std::mem::zeroed();
+        sa.sa_sigaction = crash_signal_handler as usize;
+        // SA_ONSTACK: run on the alternate signal stack (if one is set up) so that a
+        // stack overflow can still be handled
+        sa.sa_flags = SA_SIGINFO | SA_RESTART | libc::SA_ONSTACK;
+        libc::sigemptyset(&mut sa.sa_mask);
+        let mut old_sa: sigaction = std::mem::zeroed();
+        if libc::sigaction(signal, &sa, &mut old_sa) == 0 {
+            if old_sa.sa_sigaction == libc::SIG_IGN {
+                // The signal was being ignored: put that back rather than making it fatal
+                libc::sigaction(signal, &old_sa, std::ptr::null_mut());
+            } else if old_sa.sa_sigaction != libc::SIG_DFL {
+                *prev_handler = Some(old_sa);
+            }
+        }
+    }
+    // If sigaction failed or the previous disposition was SIG_DFL/SIG_IGN, prev_handler stays None
+}
+
+/// Signal handler for fatal signals.
+///
+/// # Safety
+///
+/// This function is called by the OS as a signal handler.
+/// It violates async-signal-safety but this is acceptable during crash.
+///
+/// # Arguments
+///
+/// * `signal` - Signal number
+/// * `_info` - Signal info (unused)
+/// * `_context` - Signal context (unused)
+extern "C" fn crash_signal_handler(
+    signal: c_int,
+    _info: *mut siginfo_t,
+    _context: *mut libc::c_void,
+) {
+    // Prevent recursive crash handling (the flag is process-wide and is never cleared)
+    if IN_CRASH_HANDLER.swap(true, Ordering::SeqCst) {
+        // A crash is already being handled (nested crash or another thread) - skip the dumps
+        // Chain to previous handler or re-raise
+        chain_to_previous_handler(signal);
+        return;
+    }
+
+    // Try to write a message to stderr
+    // write() with a string literal is async-signal-safe
+    let msg = b"Hyperlight: Host process crashed, generating sandbox dump...\n";
+    unsafe {
+        libc::write(
+            libc::STDERR_FILENO,
+            msg.as_ptr() as *const libc::c_void,
+            msg.len(),
+        );
+    }
+
+    // Generate dumps for all registered sandboxes
+    // NOTE: This is NOT async-signal-safe! It:
+    // - Locks mutexes (SANDBOX_REGISTRY)
+    // - Performs file I/O (writing dumps)
+    // - May allocate memory
+    // - Calls complex Rust code
+    //
+    // BUT: We're crashing anyway, so this is acceptable.
+    // Worst case: We crash again and the recursive check above prevents infinite loop.
+    let dump_count = super::generate_crash_dumps();
+
+    // Try to report success to stderr (write is async-signal-safe)
+    if dump_count > 0 {
+        let success_msg = b"Hyperlight: Generated sandbox dumps\n";
+        unsafe {
+            libc::write(
+                libc::STDERR_FILENO,
+                success_msg.as_ptr() as *const libc::c_void,
+                success_msg.len(),
+            );
+        }
+    }
+
+    // Hand over to whatever was installed before us (or the OS default action)
+    chain_to_previous_handler(signal);
+}
+
+/// Chain to the previous signal handler, or re-raise the signal.
+///
+/// This ensures that:
+/// 1. Other crash handlers in the chain get invoked
+/// 2. 
The OS default handler runs (generating host core dump if configured)
+fn chain_to_previous_handler(signal: c_int) {
+    unsafe {
+        // Look up the disposition that was installed before ours (if any)
+        let prev_ptr: *const Option<sigaction> = match signal {
+            libc::SIGSEGV => &raw const PREV_SIGSEGV,
+            libc::SIGABRT => &raw const PREV_SIGABRT,
+            libc::SIGBUS => &raw const PREV_SIGBUS,
+            libc::SIGFPE => &raw const PREV_SIGFPE,
+            libc::SIGILL => &raw const PREV_SIGILL,
+            libc::SIGQUIT => &raw const PREV_SIGQUIT,
+            libc::SIGTRAP => &raw const PREV_SIGTRAP,
+            libc::SIGSYS => &raw const PREV_SIGSYS,
+            // Not a signal we installed a handler for: treat it as "no previous handler"
+            _ => std::ptr::null(),
+        };
+        let previous = if prev_ptr.is_null() { None } else { (&*prev_ptr).as_ref() };
+
+        match previous {
+            Some(old_sa) => {
+                // Re-install the previous handler instead of calling it directly:
+                // we do not have the original siginfo/ucontext here, and an
+                // SA_SIGINFO handler must not be invoked with null pointers.
+                // The kernel supplies valid arguments when the re-raised signal
+                // is delivered.
+                libc::sigaction(signal, old_sa, std::ptr::null_mut());
+            }
+            None => {
+                // No previous handler - restore the default action
+                let mut sa: sigaction = std::mem::zeroed();
+                sa.sa_sigaction = libc::SIG_DFL;
+                libc::sigemptyset(&mut sa.sa_mask);
+                libc::sigaction(signal, &sa, std::ptr::null_mut());
+            }
+        }
+
+        // When called from our handler the signal is blocked, so it stays pending and
+        // is delivered to the disposition installed above once our handler returns
+        libc::raise(signal);
+    }
+}
diff --git a/src/hyperlight_host/src/crash_handler/windows.rs b/src/hyperlight_host/src/crash_handler/windows.rs
new file mode 100644
index 000000000..48472aceb
--- /dev/null
+++ b/src/hyperlight_host/src/crash_handler/windows.rs
@@ -0,0 +1,267 @@
+/*
+Copyright 2025 The Hyperlight Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Windows-specific crash handler using vectored exception handling. +//! +//! This module installs a vectored exception handler that catches fatal +//! exceptions before they reach the default handler. When an exception occurs: +//! +//! 1. Generate dumps for all registered sandboxes +//! 2. Return EXCEPTION_CONTINUE_SEARCH to chain to other handlers +//! 3. Allow OS to generate host crash dump (if configured) +//! +//! # Exceptions Handled +//! +//! - EXCEPTION_ACCESS_VIOLATION (0xC0000005): Access violation +//! - EXCEPTION_ILLEGAL_INSTRUCTION (0xC000001D): Illegal instruction +//! - EXCEPTION_INT_DIVIDE_BY_ZERO (0xC0000094): Integer divide by zero +//! - EXCEPTION_STACK_OVERFLOW (0xC00000FD): Stack overflow +//! - EXCEPTION_ARRAY_BOUNDS_EXCEEDED (0xC000008C): Array bounds exceeded +//! - EXCEPTION_FLT_* (various): Floating point exceptions +//! +//! # Why AddVectoredExceptionHandler? +//! +//! We use `AddVectoredExceptionHandler` instead of `SetUnhandledExceptionFilter` +//! because: +//! - It's called BEFORE SEH unwinding starts (we see the exception first) +//! - It properly chains to other handlers +//! - It's more reliable with threads +//! - SetUnhandledExceptionFilter can be bypassed by SEH +//! +//! We don't use `WerRegisterRuntimeExceptionModule` because: +//! - It requires a separate DLL +//! - It's more complex to set up +//! 
- Our use case is simpler (just dump sandboxes)
+
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use windows::Win32::Foundation::{
+    EXCEPTION_ACCESS_VIOLATION, EXCEPTION_ARRAY_BOUNDS_EXCEEDED, EXCEPTION_FLT_DENORMAL_OPERAND,
+    EXCEPTION_FLT_DIVIDE_BY_ZERO, EXCEPTION_FLT_INEXACT_RESULT, EXCEPTION_FLT_INVALID_OPERATION,
+    EXCEPTION_FLT_OVERFLOW, EXCEPTION_FLT_STACK_CHECK, EXCEPTION_FLT_UNDERFLOW,
+    EXCEPTION_ILLEGAL_INSTRUCTION, EXCEPTION_INT_DIVIDE_BY_ZERO, EXCEPTION_INT_OVERFLOW,
+    EXCEPTION_PRIV_INSTRUCTION, EXCEPTION_STACK_OVERFLOW, NTSTATUS,
+};
+use windows::Win32::System::Diagnostics::Debug::{
+    AddVectoredExceptionHandler, EXCEPTION_CONTINUE_SEARCH, EXCEPTION_POINTERS, IsDebuggerPresent,
+};
+use windows::Win32::System::Registry::{
+    HKEY_LOCAL_MACHINE, KEY_READ, REG_DWORD, RegCloseKey, RegOpenKeyExW, RegQueryValueExW,
+};
+use windows::core::PCWSTR;
+
+/// Flag to prevent recursive crash handling.
+///
+/// Set by the exception handler and never cleared, so dumps are generated at most once.
+static IN_CRASH_HANDLER: AtomicBool = AtomicBool::new(false);
+
+/// Check if crash dumps are likely to be generated on Windows.
+///
+/// This checks:
+/// 1. If a debugger is attached (debugger will handle the crash)
+/// 2. Windows Error Reporting (WER) disabled state
+///
+/// All checks are defensive and handle permission/access errors gracefully.
+/// If we can't determine the state, we assume dumps are enabled (fail-open).
+///
+/// # Returns
+///
+/// true if crash dumps are likely enabled, false if clearly disabled
+fn are_crash_dumps_enabled() -> bool {
+    // Check 1: If a debugger is attached, it will handle crashes
+    // This is always safe to check
+    unsafe {
+        if IsDebuggerPresent().as_bool() {
+            // Debugger present - it will handle the crash
+            return true;
+        }
+    }
+
+    // Check 2: Check if WER is completely disabled
+    // Registry key: HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\Windows Error Reporting
+    // Value: Disabled (DWORD) - if 1, WER is disabled
+    //
+    // This check might fail due to permissions, which is fine (fail-open)
+    unsafe {
+        let key_path: Vec<u16> = "SOFTWARE\\Microsoft\\Windows\\Windows Error Reporting\0"
+            .encode_utf16()
+            .collect();
+        let mut hkey = std::mem::zeroed();
+
+        if RegOpenKeyExW(
+            HKEY_LOCAL_MACHINE,
+            PCWSTR(key_path.as_ptr()),
+            Some(0),
+            KEY_READ,
+            &mut hkey,
+        )
+        .is_ok()
+        {
+            let value_name: Vec<u16> = "Disabled\0".encode_utf16().collect();
+            let mut data: u32 = 0;
+            let mut data_size: u32 = std::mem::size_of::<u32>() as u32;
+            let mut value_type = std::mem::zeroed();
+
+            let query_result = RegQueryValueExW(
+                hkey,
+                PCWSTR(value_name.as_ptr()),
+                None,
+                Some(&mut value_type),
+                Some(&mut data as *mut u32 as *mut u8),
+                Some(&mut data_size),
+            );
+
+            // Close the key before checking results
+            let _ = RegCloseKey(hkey);
+
+            if query_result.is_ok() && value_type == REG_DWORD && data == 1 {
+                // WER is explicitly disabled
+                return false;
+            }
+        }
+        // If we couldn't read the registry (permission denied, key doesn't exist, etc.),
+        // fail-open: assume dumps are enabled
+    }
+
+    // All checks passed or couldn't determine - assume enabled (fail-open)
+    true
+}
+
+/// Initialize crash handlers using vectored exception handling.
+///
+/// Called once on first sandbox registration.
+///
+/// Only installs handlers if crash dumps are likely to be generated.
+/// This checks if WER is disabled or if a debugger is attached.
+pub(super) fn init_crash_handlers() {
+    // Skip installation when WER is explicitly disabled and no debugger is attached
+    if !are_crash_dumps_enabled() {
+        log::info!(
+            "Crash dumps disabled on this system (WER disabled), skipping crash handler installation"
+        );
+        return;
+    }
+
+    unsafe {
+        // Add vectored exception handler
+        // First argument: 1 = add to front of chain (we want to see exceptions first)
+        // Returns handle on success, null on failure
+        let handler = AddVectoredExceptionHandler(1, Some(vectored_exception_handler));
+
+        if handler.is_null() {
+            // Failed to install handler - logged only; sandboxes still run, just without dumps
+            log::error!("Failed to install Hyperlight crash handler on Windows");
+        }
+        // We never remove the handler - it stays installed for the life of the process
+    }
+}
+
+/// Vectored exception handler for fatal exceptions.
+///
+/// # Safety
+///
+/// This function is called by Windows as an exception handler.
+/// It's relatively safe to call Rust code here (unlike signal handlers on Linux),
+/// but we still need to be careful about locking and potential re-entrancy.
+///
+/// # Arguments
+///
+/// * `exception_info` - Pointer to EXCEPTION_POINTERS structure
+///
+/// # Returns
+///
+/// EXCEPTION_CONTINUE_SEARCH to allow other handlers to run
+unsafe extern "system" fn vectored_exception_handler(
+    exception_info: *mut EXCEPTION_POINTERS,
+) -> i32 {
+    if exception_info.is_null() {
+        return EXCEPTION_CONTINUE_SEARCH;
+    }
+
+    // SAFETY: We've checked that exception_info is not null
+    let exception_record = unsafe { (*exception_info).ExceptionRecord };
+    if exception_record.is_null() {
+        return EXCEPTION_CONTINUE_SEARCH;
+    }
+
+    // SAFETY: We've checked that exception_record is not null
+    let exception_code = unsafe { (*exception_record).ExceptionCode };
+
+    // Check if this is a fatal exception we care about
+    if !is_fatal_exception(exception_code) {
+        return EXCEPTION_CONTINUE_SEARCH;
+    }
+
+    // Prevent recursive crash handling. Claim the flag only for fatal exceptions so an
+    // unrelated non-fatal exception cannot disable dump generation for good
+    if IN_CRASH_HANDLER.swap(true, Ordering::SeqCst) {
+        return EXCEPTION_CONTINUE_SEARCH;
+    }
+
+    // Try to write a message to stderr
+    eprintln!(
+        "Hyperlight: Host process crashed (exception 0x{:X}), generating sandbox dumps...",
+        exception_code.0
+    );
+
+    // Generate dumps for all registered sandboxes
+    // On Windows, this is safer than on Linux because:
+    // - We're not in a signal handler (no async-signal-safety restrictions)
+    // - Exception handlers can safely allocate, lock mutexes, do I/O, etc.
+    // - Still need to be careful about re-entrancy (hence the IN_CRASH_HANDLER flag)
+    let dump_count = super::generate_crash_dumps();
+
+    if dump_count > 0 {
+        eprintln!("Hyperlight: Generated {} sandbox dump(s)", dump_count);
+    }
+
+    // Always return EXCEPTION_CONTINUE_SEARCH to chain to other handlers
+    // This allows:
+    // - Other vectored exception handlers to run
+    // - SEH handlers to run
+    // - Windows Error Reporting to generate a minidump
+    // - The OS default handler to terminate the process
+    EXCEPTION_CONTINUE_SEARCH
+}
+
+/// Check if an exception code represents a fatal crash.
+///
+/// # Arguments
+///
+/// * `code` - Windows exception code
+///
+/// # Returns
+///
+/// true if this exception should trigger dump generation
+fn is_fatal_exception(code: NTSTATUS) -> bool {
+    matches!(
+        code,
+        EXCEPTION_ACCESS_VIOLATION
+            | EXCEPTION_ILLEGAL_INSTRUCTION
+            | EXCEPTION_INT_DIVIDE_BY_ZERO
+            | EXCEPTION_STACK_OVERFLOW
+            | EXCEPTION_ARRAY_BOUNDS_EXCEEDED
+            | EXCEPTION_FLT_DENORMAL_OPERAND
+            | EXCEPTION_FLT_DIVIDE_BY_ZERO
+            | EXCEPTION_FLT_INEXACT_RESULT
+            | EXCEPTION_FLT_INVALID_OPERATION
+            | EXCEPTION_FLT_OVERFLOW
+            | EXCEPTION_FLT_STACK_CHECK
+            | EXCEPTION_FLT_UNDERFLOW
+            | EXCEPTION_INT_OVERFLOW
+            | EXCEPTION_PRIV_INSTRUCTION
+    )
+}
diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs
index 801f361a7..efe4fc850 100644
--- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs
+++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs
@@ -946,6 +946,11 @@ impl Hypervisor for HypervLinuxDriver {
         }
     }
 
+    #[cfg(crashdump)]
+    fn runtime_config(&self) -> &crate::sandbox::uninitialized::SandboxRuntimeConfig {
+        &self.rt_cfg
+    }
+
     #[cfg(gdb)]
     fn handle_debug(
         &mut self,
diff --git a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs
index 5c6c9db9c..a2301e9f8 100644
--- a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs
+++ 
b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs @@ -792,6 +792,11 @@ impl Hypervisor for HypervWindowsDriver { } } + #[cfg(crashdump)] + fn runtime_config(&self) -> &crate::sandbox::uninitialized::SandboxRuntimeConfig { + &self.rt_cfg + } + #[cfg(gdb)] fn handle_debug( &mut self, diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index 330a5f5b5..7d838ffd0 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -881,6 +881,11 @@ impl Hypervisor for KVMDriver { } } + #[cfg(crashdump)] + fn runtime_config(&self) -> &crate::sandbox::uninitialized::SandboxRuntimeConfig { + &self.rt_cfg + } + #[cfg(gdb)] fn handle_debug( &mut self, diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index e5592509a..9afb9aa71 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -303,6 +303,9 @@ pub(crate) trait Hypervisor: Debug + Send { #[cfg(crashdump)] fn crashdump_context(&self) -> Result>; + #[cfg(crashdump)] + fn runtime_config(&self) -> &crate::sandbox::uninitialized::SandboxRuntimeConfig; + #[cfg(gdb)] /// handles the cases when the vCPU stops due to a Debug event fn handle_debug( diff --git a/src/hyperlight_host/src/lib.rs b/src/hyperlight_host/src/lib.rs index 5f034dc79..5fd1d3ebd 100644 --- a/src/hyperlight_host/src/lib.rs +++ b/src/hyperlight_host/src/lib.rs @@ -50,6 +50,9 @@ use std::sync::Once; pub(crate) mod built_info { include!(concat!(env!("OUT_DIR"), "/built.rs")); } +/// Crash handler for generating sandbox dumps when host process crashes +#[cfg(feature = "crashdump")] +mod crash_handler; /// Dealing with errors, including errors across VM boundaries pub mod error; /// Wrappers for host and guest functions. 
diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 9a2256bf7..ff14402f2 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -121,8 +121,22 @@ impl MultiUseSandbox { dispatch_ptr: RawPtr, #[cfg(gdb)] dbg_mem_access_fn: Arc>>, ) -> MultiUseSandbox { + let id = SANDBOX_ID_COUNTER.fetch_add(1, Ordering::Relaxed); + + // Register with crash handler if dumps are enabled + #[cfg(feature = "crashdump")] + if vm.runtime_config().guest_core_dump + && let Err(e) = crate::crash_handler::register_sandbox(id, vm.as_ref()) + { + tracing::error!( + "Failed to register sandbox {} with crash handler: {}", + id, + e + ); + } + Self { - id: SANDBOX_ID_COUNTER.fetch_add(1, Ordering::Relaxed), + id, poisoned: false, _host_funcs: host_funcs, mem_mgr: mgr, @@ -759,6 +773,14 @@ impl Callable for MultiUseSandbox { } } +impl Drop for MultiUseSandbox { + fn drop(&mut self) { + // Unregister from crash handler if crashdump feature is enabled + #[cfg(feature = "crashdump")] + crate::crash_handler::unregister_sandbox(self.id); + } +} + impl std::fmt::Debug for MultiUseSandbox { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MultiUseSandbox") diff --git a/typos.toml b/typos.toml index 07ffa9dee..81ae47cf6 100644 --- a/typos.toml +++ b/typos.toml @@ -9,3 +9,4 @@ extend-exclude = ["**/*.patch", "src/hyperlight_guest_bin/third_party/**/*", "NO typ="typ" mmaped="mmapped" fpr="fpr" +SEH="SEH"