Skip to content

Commit 1852abf

Browse files
committed
Create crashdumps for VMs if process is crashing and creating a dump
Signed-off-by: Simon Davies <simongdavies@users.noreply.github.com>
1 parent f3acc2e commit 1852abf

File tree

12 files changed

+878
-1
lines changed

12 files changed

+878
-1
lines changed

Cargo.lock

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/hyperlight_host/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ metrics = "0.24.2"
5151
serde_json = "1.0"
5252
elfcore = "2.0"
5353
uuid = { version = "1.18.1", features = ["v4"] }
54+
once_cell = "1.20"
55+
dashmap = "6.1"
5456

5557
[target.'cfg(windows)'.dependencies]
5658
windows = { version = "0.62", features = [
@@ -66,6 +68,8 @@ windows = { version = "0.62", features = [
6668
"Win32_System_Threading",
6769
"Win32_System_JobObjects",
6870
"Win32_System_SystemServices",
71+
"Win32_System_Registry",
72+
"Win32_System_Kernel",
6973
] }
7074
windows-sys = { version = "0.61", features = ["Win32"] }
7175
windows-result = "0.4"
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
/*
2+
Copyright 2025 The Hyperlight Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
//! Host process crash handler for generating sandbox dumps.
18+
//!
19+
//! This module provides crash detection and dump generation for sandboxes
20+
//! when the host process crashes due to unhandled signals (Linux) or
21+
//! exceptions (Windows).
22+
//!
23+
//! # Architecture
24+
//!
25+
//! - **Registry**: Global map of sandbox ID -> raw hypervisor pointer (only sandboxes with dumps enabled are expected to be registered)
26+
//! - **Linux**: Signal handlers via `sigaction()` for fatal signals
27+
//! - **Windows**: Vectored exception handler via `AddVectoredExceptionHandler()`
28+
//! - **Automatic**: Initialized on first sandbox registration
29+
//! - **Cleanup**: Entries removed on sandbox Drop
30+
//!
31+
//! # Usage
32+
//!
33+
//! The crash handler is automatically initialized when the first sandbox
34+
//! is created. No explicit setup is required. When the host process crashes,
35+
//! dumps are generated for all registered sandboxes that have `guest_core_dump`
36+
//! enabled in their runtime configuration.
37+
//!
38+
//! # Feature Flag
39+
//!
40+
//! This entire module requires the `crashdump` feature to be enabled.
41+
42+
use std::sync::Mutex as StdMutex;
43+
use std::sync::atomic::{AtomicBool, Ordering};
44+
45+
use dashmap::DashMap;
46+
use once_cell::sync::Lazy;
47+
48+
use crate::hypervisor::Hypervisor;
49+
use crate::{Result, new_error};
50+
51+
/// Value type stored in the global sandbox registry.
52+
///
53+
/// Holds a raw, non-owning pointer to a sandbox's hypervisor. The pointer is
54+
/// created in `register_sandbox` from a borrowed `&dyn Hypervisor` and is only
55+
/// dereferenced in `generate_crash_dumps`.
56+
///
/// Nothing in this type keeps the hypervisor alive: validity depends entirely on
57+
/// `unregister_sandbox` being called for the entry before the hypervisor is dropped.
58+
struct SandboxEntry {
59+
/// Raw pointer to the hypervisor; valid only while the registering sandbox is alive.
60+
hypervisor_ptr: *const dyn Hypervisor,
61+
}
62+
63+
// SAFETY: `SandboxEntry` wraps a raw pointer, which is neither `Send` nor `Sync`
64+
// by default; these impls are what allow it to live in the `static` registry.
// In this file the pointer is only copied around, and dereferenced solely in
// `generate_crash_dumps`. That dereference is NOT synchronized with whatever
// thread owns the hypervisor.
// NOTE(review): the original justification was that this only runs while the
// process is crashing; confirm that trade-off is acceptable.
65+
unsafe impl Send for SandboxEntry {}
66+
unsafe impl Sync for SandboxEntry {}
67+
68+
/// Global registry of sandboxes that should be dumped if the host crashes.
69+
///
70+
/// Maps sandbox ID to a `SandboxEntry` (raw hypervisor pointer). Entries are
/// inserted by `register_sandbox` and removed by `unregister_sandbox`.
71+
///
/// NOTE(review): `DashMap` avoids a single global lock but is not lock-free;
/// per the dashmap documentation its operations take per-shard locks.
72+
static SANDBOX_REGISTRY: Lazy<DashMap<u64, SandboxEntry>> = Lazy::new(DashMap::new);
73+
74+
/// Set to `true` once the platform crash handlers have been installed.
75+
///
76+
/// `register_sandbox` loads this with `Acquire` before touching the
77+
/// `INITIALIZED` mutex, so calls made after initialization skip the lock.
78+
/// It is stored with `Release` right after `platform::init_crash_handlers()` returns.
79+
static INITIALIZED_FAST: AtomicBool = AtomicBool::new(false);
80+
81+
/// Set to `true` when `register_sandbox` finds the `INITIALIZED` mutex poisoned.
82+
///
83+
/// Once set, every later `register_sandbox` call returns an error immediately.
/// Only `register_sandbox` reads this flag; `unregister_sandbox` and
/// `generate_crash_dumps` do not consult it.
84+
static INITIALIZATION_FAILED: AtomicBool = AtomicBool::new(false);
85+
86+
/// Mutex-protected "handlers installed" flag, locked only while `INITIALIZED_FAST` is still false.
87+
/// A `std::sync::Mutex` is used (rather than parking_lot) so that poisoning can be detected.
88+
static INITIALIZED: Lazy<StdMutex<bool>> = Lazy::new(|| StdMutex::new(false));
89+
90+
/// Register a sandbox with the crash handler.
91+
///
92+
/// This function:
93+
/// 1. Stores a raw pointer to the hypervisor (unsafe but controlled)
94+
/// 2. Initializes crash handlers on first call (lazy init)
95+
///
96+
/// Only registers the sandbox if crash dumps are enabled. If disabled,
97+
/// this function returns immediately without doing anything.
98+
///
99+
/// # Arguments
100+
///
101+
/// * `sandbox_id` - Unique ID of the sandbox
102+
/// * `hypervisor` - Reference to the hypervisor (we store a raw pointer)
103+
///
104+
/// # Safety
105+
///
106+
/// The caller MUST ensure the sandbox is unregistered before the hypervisor is dropped!
107+
/// This is enforced by MultiUseSandbox::Drop.
108+
///
109+
/// # Errors
110+
///
111+
/// Returns an error if the mutex is poisoned (extremely rare, would indicate
112+
/// a serious issue elsewhere in the program).
113+
pub fn register_sandbox(sandbox_id: u64, hypervisor: &dyn Hypervisor) -> Result<()> {
114+
// Check if initialization previously failed - no point trying again
115+
if INITIALIZATION_FAILED.load(Ordering::Acquire) {
116+
return Err(new_error!(
117+
"Crash handler initialization previously failed, skipping registration"
118+
));
119+
}
120+
121+
// Fast path: check if already initialized (lock-free!)
122+
if !INITIALIZED_FAST.load(Ordering::Acquire) {
123+
// Slow path: need to initialize (only happens once)
124+
match INITIALIZED.lock() {
125+
Ok(mut initialized) => {
126+
// Double-check inside the lock (another thread might have initialized)
127+
if !*initialized {
128+
platform::init_crash_handlers();
129+
*initialized = true;
130+
// Mark as initialized atomically (Release ensures all init is visible)
131+
INITIALIZED_FAST.store(true, Ordering::Release);
132+
}
133+
}
134+
Err(e) => {
135+
// Mutex is poisoned - mark as failed and return error
136+
INITIALIZATION_FAILED.store(true, Ordering::Release);
137+
return Err(new_error!(
138+
"INITIALIZED mutex poisoned during crash handler init: {}",
139+
e
140+
));
141+
}
142+
}
143+
}
144+
145+
// Add entry to registry (lock-free with DashMap!)
146+
let hypervisor_ptr = unsafe {
147+
std::mem::transmute::<*const dyn Hypervisor, *const dyn Hypervisor>(
148+
hypervisor as *const dyn Hypervisor,
149+
)
150+
};
151+
152+
SANDBOX_REGISTRY.insert(sandbox_id, SandboxEntry { hypervisor_ptr });
153+
154+
Ok(())
155+
}
156+
157+
/// Unregister a sandbox from the crash handler.
158+
///
159+
/// Called automatically by MultiUseSandbox::Drop.
160+
///
161+
/// # Arguments
162+
///
163+
/// * `sandbox_id` - Unique ID of the sandbox to unregister
164+
pub fn unregister_sandbox(sandbox_id: u64) {
165+
// Lock-free removal with DashMap
166+
SANDBOX_REGISTRY.remove(&sandbox_id);
167+
}
168+
169+
/// Generate dumps for all registered sandboxes.
170+
///
171+
/// Called by platform-specific crash handlers when a fatal signal/exception occurs.
172+
/// Iterates through the registry and generates dumps for all registered sandboxes.
173+
/// Only sandboxes with dumps enabled are registered, so all entries get dumped.
174+
///
175+
/// # Safety
176+
///
177+
/// This function is called during crash handling and:
178+
/// - Dereferences raw pointers (unsafe but acceptable during crash)
179+
/// - May violate async-signal-safety on Linux
180+
/// - Accesses hypervisor state without locks
181+
///
182+
/// All of this is acceptable because the process is crashing anyway.
183+
///
184+
/// # Returns
185+
///
186+
/// Number of dumps successfully generated.
187+
pub(crate) fn generate_crash_dumps() -> usize {
188+
let mut dump_count = 0;
189+
190+
// Iterate over the lock-free registry
191+
for entry_ref in SANDBOX_REGISTRY.iter() {
192+
let entry = entry_ref.value();
193+
194+
// SAFETY: This is unsafe! We're dereferencing a raw pointer.
195+
// This is acceptable because:
196+
// 1. The sandbox registers/unregisters properly via Drop
197+
// 2. During a crash, the process is dying anyway
198+
// 3. We're willing to accept potential UB during crash handling
199+
unsafe {
200+
let hypervisor = &*entry.hypervisor_ptr;
201+
202+
// Try to generate the crash dump
203+
// This is NOT async-signal-safe (file I/O, allocations, etc.)
204+
// but we're crashing, so this is acceptable
205+
//
206+
// Catch panics: If generating one dump panics, it maybe indicates
207+
// a systemic issue so we short-circuit
208+
// rather than risk cascading failures
209+
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
210+
crate::hypervisor::crashdump::generate_crashdump(hypervisor)
211+
}));
212+
213+
match result {
214+
Ok(Ok(())) => {
215+
dump_count += 1;
216+
}
217+
Ok(Err(_)) => {
218+
// Silent failure - dump generation returned an error
219+
}
220+
Err(_) => {
221+
// Panic during dump generation - abort remaining dumps
222+
// This may indicate a systemic issue
223+
break;
224+
}
225+
}
226+
}
227+
}
228+
229+
dump_count
230+
}
231+
232+
// Platform-specific implementations.
//
// Exactly one of the two declarations below is compiled, selected by target OS;
// both bind the chosen file to the module name `platform`, which
// `register_sandbox` uses for `platform::init_crash_handlers()`.
// On any other target OS no `platform` module is declared here.
233+
#[cfg(target_os = "linux")]
234+
#[path = "crash_handler/linux.rs"]
235+
mod platform;
236+
237+
#[cfg(target_os = "windows")]
238+
#[path = "crash_handler/windows.rs"]
239+
mod platform;

0 commit comments

Comments
 (0)