Skip to content

Commit a169cc5

Browse files
committed
Fix calling kill in advance
Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
1 parent 353439e commit a169cc5

File tree

5 files changed

+693
-129
lines changed

5 files changed

+693
-129
lines changed

src/hyperlight_host/benches/benchmarks.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ fn guest_call_benchmark(c: &mut Criterion) {
144144
// Small delay to ensure the guest function is running in VM before interrupting
145145
thread::sleep(std::time::Duration::from_millis(1));
146146
let kill_start = Instant::now();
147-
interrupt_handle.kill();
147+
assert!(interrupt_handle.kill());
148148
kill_start
149149
});
150150

src/hyperlight_host/src/hypervisor/hyperlight_vm.rs

Lines changed: 42 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use std::convert::TryFrom;
1919
#[cfg(crashdump)]
2020
use std::path::Path;
2121
#[cfg(target_os = "windows")]
22-
use std::sync::atomic::AtomicBool;
22+
use std::sync::atomic::{AtomicBool, AtomicU64};
2323
#[cfg(any(kvm, mshv))]
2424
use std::sync::atomic::{AtomicBool, AtomicU64};
2525
use std::sync::{Arc, Mutex};
@@ -146,7 +146,6 @@ impl HyperlightVm {
146146
#[cfg(any(kvm, mshv))]
147147
let interrupt_handle: Arc<dyn InterruptHandleImpl> = Arc::new(LinuxInterruptHandle {
148148
running: AtomicU64::new(0),
149-
cancel_requested: AtomicBool::new(false),
150149
#[cfg(gdb)]
151150
debug_interrupt: AtomicBool::new(false),
152151
#[cfg(all(
@@ -170,8 +169,7 @@ impl HyperlightVm {
170169

171170
#[cfg(target_os = "windows")]
172171
let interrupt_handle: Arc<dyn InterruptHandleImpl> = Arc::new(WindowsInterruptHandle {
173-
running: AtomicBool::new(false),
174-
cancel_requested: AtomicBool::new(false),
172+
state: AtomicU64::new(0),
175173
#[cfg(gdb)]
176174
debug_interrupt: AtomicBool::new(false),
177175
partition_handle: vm.partition_handle(),
@@ -374,28 +372,42 @@ impl HyperlightVm {
374372
&mut self,
375373
#[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
376374
) -> Result<()> {
375+
// ===== KILL() TIMING POINT 1: Between guest function calls =====
376+
// Clear any stale cancellation from a previous guest function call or if kill() was called too early.
377+
// This ensures that kill() called BETWEEN different guest function calls doesn't affect the next call.
378+
//
379+
// If kill() was called and ran to completion BEFORE this line executes:
380+
// - kill() has NO effect on this guest function call because CANCEL_BIT is cleared here.
381+
// - NOTE: stale signals can still be delivered, but they will be ignored.
382+
self.interrupt_handle.clear_cancel();
383+
377384
// Keeps the trace context and open spans
378385
#[cfg(feature = "trace_guest")]
379386
let mut tc = crate::sandbox::trace::TraceContext::new();
380387

381388
let result = loop {
389+
// ===== KILL() TIMING POINT 2: Before set_tid() =====
390+
// If kill() is called and ran to completion BEFORE this line executes:
391+
// - CANCEL_BIT will be set and we will return an early VmExit::Cancelled()
382392
self.interrupt_handle.set_tid();
383-
// Note: if a `InterruptHandle::kill()` called while this thread is **here**
384-
// Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
385393
self.interrupt_handle.set_running();
386394

387-
// Don't run the vcpu if `cancel_requested` is true
388-
//
389-
// Note: if a `InterruptHandle::kill()` called while this thread is **here**
390-
// Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
391395
let exit_reason = if self.interrupt_handle.is_cancelled()
392396
|| self.interrupt_handle.is_debug_interrupted()
393397
{
394398
Ok(VmExit::Cancelled())
395399
} else {
396400
#[cfg(feature = "trace_guest")]
397401
tc.setup_guest_trace(Span::current().context());
402+
403+
// ===== KILL() TIMING POINT 3: Before calling run_vcpu() =====
404+
// If kill() is called and ran to completion BEFORE this line executes:
405+
// - CANCEL_BIT will be set, but it's too late to prevent entering the guest this iteration
406+
// - Signals will interrupt the guest (RUNNING_BIT=true), causing VmExit::Cancelled()
407+
// - If the guest completes before any signals arrive, kill() may have no effect
408+
// - If there are more iterations to do (IO/host func, etc.), the next iteration will be cancelled
398409
let exit_reason = self.vm.run_vcpu();
410+
399411
// End current host trace by closing the current span that captures traces
400412
// happening when a guest exits and re-enters.
401413
#[cfg(feature = "trace_guest")]
@@ -411,24 +423,28 @@ impl HyperlightVm {
411423
exit_reason
412424
};
413425

414-
// Note: if a `InterruptHandle::kill()` called while this thread is **here**
415-
// Then signals will be sent to this thread until `running` is set to false.
416-
// This is fine since the signal handler is a no-op.
426+
// ===== KILL() TIMING POINT 4: Before capturing cancel_requested =====
427+
// If kill() is called and ran to completion BEFORE this line executes:
428+
// - CANCEL_BIT will be set
429+
// - Signals may still be sent (RUNNING_BIT=true) but are harmless no-ops
430+
// - kill() will have no effect on this iteration, but CANCEL_BIT will persist
431+
// - If the loop continues (e.g., for a host call), the next iteration will be cancelled
432+
// - Stale signals from before clear_running() may arrive and kick future iterations,
433+
// but will be filtered out by the cancel_requested check below (and retried).
417434
let cancel_requested = self.interrupt_handle.is_cancelled();
418-
#[cfg(any(kvm, mshv))]
419435
let debug_interrupted = self.interrupt_handle.is_debug_interrupted();
420436

421-
// Note: if a `InterruptHandle::kill()` called while this thread is **here**
422-
// Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**.
423-
// Additionally signals will be sent to this thread until `running` is set to false.
424-
// This is fine since the signal handler is a no-op.
437+
// ===== KILL() TIMING POINT 5: Before calling clear_running() =====
438+
// Same as point 4.
425439
self.interrupt_handle.clear_running();
426-
self.interrupt_handle.clear_cancel();
427440

428-
// At this point, `running` is `false` so no more signals will be sent to this thread,
429-
// but we may still receive async signals that were sent before this point.
430-
// To prevent those signals from interrupting subsequent calls to `run()` (on other vms!),
431-
// we make sure to check `cancel_requested` before cancelling (see `libc::EINTR` match-arm below).
441+
// ===== KILL() TIMING POINT 6: After calling clear_running() =====
442+
// If kill() is called and ran to completion BEFORE this line executes:
443+
// - CANCEL_BIT will be set but won't affect this iteration, it is never read below this comment
444+
// and cleared at next run() start
445+
// - RUNNING_BIT=false, so no new signals will be sent
446+
// - Stale signals from before clear_running() may arrive and kick future iterations,
447+
// but will be filtered out by the cancel_requested check below (and retried).
432448
match exit_reason {
433449
#[cfg(gdb)]
434450
Ok(VmExit::Debug { dr6, exception }) => {
@@ -517,13 +533,10 @@ impl HyperlightVm {
517533
}
518534
}
519535
Ok(VmExit::Cancelled()) => {
520-
// On Linux (kvm/mshv), if cancellation was not requested for this specific vm,
521-
// the vcpu was interrupted because of debug interrupt or a stale signal that
522-
// meant to be delivered to a previous/other vcpu on this same thread, so let's ignore it
523-
// On Windows, WHvCancelRunVirtualProcessor is explicit, so we always break
524-
#[cfg(any(kvm, mshv))]
536+
// If cancellation was not requested for this specific guest function call,
537+
// the vcpu was interrupted by a stale cancellation from a previous call
525538
if !cancel_requested && !debug_interrupted {
526-
// treat this the same as a VmExit::Retry, the cancel was not meant for this vcpu
539+
// treat this the same as a VmExit::Retry, the cancel was not meant for this call
527540
continue;
528541
}
529542

@@ -538,10 +551,6 @@ impl HyperlightVm {
538551
}
539552
}
540553

541-
if cancel_requested {
542-
self.interrupt_handle.clear_cancel();
543-
}
544-
545554
metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
546555
break Err(ExecutionCanceledByHost());
547556
}

0 commit comments

Comments
 (0)