diff --git a/tests/fault_tolerance/hardware/fault-injection-service/examples/README.md b/tests/fault_tolerance/hardware/fault-injection-service/examples/README.md new file mode 100644 index 0000000000..215a45bbfc --- /dev/null +++ b/tests/fault_tolerance/hardware/fault-injection-service/examples/README.md @@ -0,0 +1,77 @@ +# Fault Tolerance Tests + +End-to-end tests for GPU failure scenarios with automated validation. + +## Quick Start + +```bash +# Set your deployment +export TEST_NAMESPACE="your-namespace" +export TARGET_DEPLOYMENT="your-deployment" + +# Run XID 79 test (GPU fell off bus) +pytest test_xid79_minimal.py::test_xid79_cordon_drain_only -v -s +``` + +## Available Fixtures + +### Test Fixtures + +- **`xid79_test`** - GPU fell off bus (XID 79) +- **`xid74_test`** - NVLink failure (XID 74) + +### Expectation Fixtures + +- **`expect_cordon_and_drain`** - Node cordoned + pods evicted (5-10 min) +- **`expect_full_automation`** - Full NVSentinel automation (cordon → drain → remediate → uncordon) +- **`expect_cordon_only`** - Only cordon, no drain + +## Example Test + +```python +@pytest.mark.xid79 +def test_xid79_cordon_drain_only(xid79_test, expect_cordon_and_drain): + """Test GPU failure with NVSentinel cordon and drain.""" + xid79_test(gpu_id=0, expect=expect_cordon_and_drain) +``` + +## What Gets Validated + +- Fault injection (pods crash on target node) +- NVSentinel response (cordon/drain) +- Pod recovery (reschedule to healthy nodes) +- Inference recovery (service restored) +- Latency impact analysis (baseline → degraded → recovered) + +## Requirements + +- NVSentinel deployed with `DeleteAfterTimeout` mode +- Multi-node cluster with GPU nodes +- Target deployment with 2+ worker pods + +## Troubleshooting + +**Test fails with "Pods did not recover" after 15+ minutes?** + +NVSentinel is likely in `AllowCompletion` mode, which never evicts crash-looping pods. 
+ +**Fix:** +```bash +kubectl edit configmap node-drainer-config -n nvsentinel +``` + +Change: +```toml +deleteAfterTimeoutMinutes = 5 # From 60 +[[userNamespaces]] +name = "*" +mode = "DeleteAfterTimeout" # From "AllowCompletion" +``` + +Restart: +```bash +kubectl rollout restart deployment nvsentinel-node-drainer -n nvsentinel +``` + +The test will now complete in ~5-10 minutes with proper pod eviction and recovery. + diff --git a/tests/fault_tolerance/hardware/fault-injection-service/examples/test_xid79_minimal.py b/tests/fault_tolerance/hardware/fault-injection-service/examples/test_xid79_minimal.py new file mode 100644 index 0000000000..e8899d5198 --- /dev/null +++ b/tests/fault_tolerance/hardware/fault-injection-service/examples/test_xid79_minimal.py @@ -0,0 +1,56 @@ +""" +XID 79 E2E Test - Persistent Hardware Failure Simulation + +This tests a realistic node-level GPU failure with a persistent hardware fault: + +Phase 0: Natural pod distribution across nodes (realistic!) +Phase 1: Fault injection on target node (persistent via hostPath) + - Fault marker written to /var/lib/cuda-fault-test on node (survives pod restarts) + - Pods on faulty node crash-loop indefinitely (realistic!) 
+ - Pods on other nodes stay healthy + - Inference partially degraded + +Phase 2: NVSentinel response + - Cordons faulty node + - Waits for drain/eviction (NVSentinel policy-dependent) + - Crashed pods reschedule to healthy nodes where fault marker absent + - Pods on healthy nodes recover automatically (no fault file there) + +Phase 3: Recovery + - All pods on healthy nodes (no fault marker) + - Inference fully recovered + - Test cleanup removes fault marker from node + +This simulates: Persistent GPU hardware failure requiring rescheduling +(not transient failures that restart-recovery can fix) + +""" + +import pytest + + +@pytest.mark.xid79 +@pytest.mark.nvsentinel +@pytest.mark.slow +def test_xid79_full_automation(xid79_test, expect_full_automation): + """XID 79 with full NVSentinel automation (detection → cordon → drain → remediate → uncordon).""" + xid79_test(gpu_id=0, expect=expect_full_automation) + + +@pytest.mark.xid79 +@pytest.mark.nvsentinel +def test_xid79_cordon_drain_only(xid79_test, expect_cordon_and_drain): + """XID 79 with cordon + drain (no auto-remediation).""" + xid79_test(gpu_id=0, expect=expect_cordon_and_drain) + + +@pytest.mark.xid79 +@pytest.mark.parametrize("gpu_id", [0, 1, 2, 3]) +def test_xid79_all_gpus(xid79_test, expect_cordon_and_drain, gpu_id): + """Test XID 79 on each GPU.""" + xid79_test(gpu_id=gpu_id, expect=expect_cordon_and_drain) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s", "-m", "xid79"]) +