@@ -6,13 +6,16 @@

## What This Does

Makes CUDA calls return error codes to simulate various GPU failures. Uses LD_PRELOAD to intercept CUDA library calls.
Intercepts CUDA calls to simulate GPU failures using LD_PRELOAD. Faults persist across pod restarts via hostPath volumes, enabling realistic hardware failure testing.

```
Pod calls cudaMalloc() → LD_PRELOAD intercepts → Returns error → Pod crashes
Pod calls cudaMalloc() → LD_PRELOAD intercepts → Checks /host-fault/cuda_fault_enabled → Returns error → Pod crashes
```

**Result**: Realistic GPU failure testing without hardware damage.
**Key Features**:
- **Persistent faults**: hostPath volume (`/var/lib/cuda-fault-test`) survives pod restarts on same node
- **Runtime toggle**: Enable/disable faults without pod restarts via `/host-fault/cuda_fault_enabled`
- **Node-specific**: Faults only on target node, healthy nodes unaffected

## Scope

@@ -35,13 +38,20 @@ This library simulates **software/orchestration-level failures** that occur when
| **43** | GPU stopped responding | `CUDA_ERROR_LAUNCH_TIMEOUT` | Hung kernel |
| **74** | NVLink error | `CUDA_ERROR_PEER_ACCESS_UNSUPPORTED` | Multi-GPU communication failure |
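
For orientation, the lookup in `cuda_intercept.c` pairs each XID with a CUDA runtime error roughly as sketched below. The struct and field names follow the `xid_mapping_t` usage visible later in this diff; the `cudaError_t` stand-in and the enum values are placeholders, not the real `cuda_runtime.h` definitions.

```c
/* Sketch of the XID → CUDA error lookup table (placeholder type and values;
 * the real definitions come from the CUDA runtime headers). */
typedef int cudaError_t;
enum { cudaErrorNoDevice = 100, cudaErrorLaunchTimeout = 702,
       cudaErrorPeerAccessUnsupported = 217 };

typedef struct {
    int xid;                 /* NVIDIA XID error number           */
    cudaError_t cuda_error;  /* error returned to the application */
    const char* description; /* human-readable description        */
} xid_mapping_t;

static const xid_mapping_t xid_mappings[] = {
    {79, cudaErrorNoDevice, "GPU fell off bus"},
    {43, cudaErrorLaunchTimeout, "GPU stopped responding"},
    {74, cudaErrorPeerAccessUnsupported, "NVLink error"},
    /* XIDs 48, 94, 95 omitted in this sketch */
};
```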

## How It Works

1. **Deployment patching**: Adds hostPath volume + init container to compile library
2. **LD_PRELOAD injection**: Environment variable loads library before CUDA
3. **Runtime control**: Toggle file (`/host-fault/cuda_fault_enabled`) controls fault state
4. **Node persistence**: hostPath ensures faults survive pod restarts on same node
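
The wrappers behind steps 2 and 3 are outside the hunks shown in this diff, so the following is only a minimal sketch of the LD_PRELOAD override pattern, with a placeholder `cudaError_t` and an inlined marker check; the real `cuda_intercept.c` routes this through `get_fault_config()` and the XID table.

```c
/* Minimal sketch of one intercepted CUDA runtime call (illustrative only). */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>

typedef int cudaError_t;       /* placeholder for the CUDA runtime type */
#define cudaErrorNoDevice 100  /* placeholder value (XID 79)            */

cudaError_t cudaGetDeviceCount(int* count)
{
    /* Re-read the node-persistent marker on every call so faults can be
     * toggled at runtime without restarting the pod. */
    int inject = 0;
    FILE* f = fopen("/host-fault/cuda_fault_enabled", "r");
    if (f) {
        inject = (fgetc(f) == '1');
        fclose(f);
    }

    if (inject) {
        fprintf(stderr, "[CUDA FAULT INJECTION] cudaGetDeviceCount -> simulated failure\n");
        return cudaErrorNoDevice;  /* caller sees a dead GPU */
    }

    /* No fault: forward to the real symbol in libcudart. */
    cudaError_t (*real_fn)(int*) =
        (cudaError_t (*)(int*))dlsym(RTLD_NEXT, "cudaGetDeviceCount");
    return real_fn ? real_fn(count) : cudaErrorNoDevice;
}
```

Because the library is listed in `LD_PRELOAD`, the dynamic linker resolves `cudaGetDeviceCount` here before `libcudart`, which is why no change to the application image is required.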

## Files in This Directory

| File | Purpose |
|------|---------|
| `cuda_intercept.c` | C library source that intercepts CUDA calls |
| `inject_into_pods.py` | Helper functions for patching Kubernetes deployments |
| `Makefile` | Builds the `.so` library locally (optional, for standalone testing) |
| `cuda_intercept.c` | C library that intercepts CUDA calls and checks fault markers |
| `inject_into_pods.py` | Kubernetes deployment patcher (adds hostPath volume + library) |
| `Makefile` | Local build (optional, for testing) |

## Prerequisites

@@ -59,19 +59,20 @@ static const xid_mapping_t xid_mappings[] = {
};

// Get XID type and corresponding CUDA error
// Supports runtime toggling via /host-fault/cuda_fault_enabled (falls back to /tmp/cuda_fault_enabled)
static void
get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
{
static int initialized = 0;
static int cached_inject = 0;
static int env_inject = 0; // From environment variable
static int cached_xid = 79; // Default to XID 79
static cudaError_t cached_error = cudaErrorNoDevice;

if (!initialized) {
// Check if injection is enabled
// Check if injection is enabled via environment
char* env = getenv("CUDA_FAULT_INJECTION_ENABLED");
if (env) {
cached_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
env_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
}

// Get XID type
@@ -85,8 +86,7 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
if (xid_mappings[i].xid == cached_xid) {
cached_error = xid_mappings[i].cuda_error;
fprintf(
stderr, "[CUDA FAULT INJECTION] ENABLED - Simulating XID %d (%s)\n", cached_xid,
xid_mappings[i].description);
stderr, "[CUDA FAULT INJECTION] Library loaded - XID %d (%s)\n", cached_xid, xid_mappings[i].description);
found = 1;
break;
}
@@ -97,16 +97,37 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
cached_xid = 79;
cached_error = cudaErrorNoDevice;
}
} else {
fprintf(
stderr, "[CUDA FAULT INJECTION] %s (default: XID 79 - GPU fell off bus)\n",
cached_inject ? "ENABLED" : "DISABLED");
}

initialized = 1;
}

*inject = cached_inject;
// Runtime toggle: Check node-persistent fault marker on EVERY call
// Use hostPath (/host-fault) so fault persists across pod restarts on same node
// Pod reschedules to different node → no file there → automatic recovery!
int runtime_inject = env_inject; // Default to env var

// Check hostPath first (persistent across restarts on same node)
FILE* toggle_file = fopen("/host-fault/cuda_fault_enabled", "r");
if (toggle_file) {
char toggle_value[4] = {0};
if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
runtime_inject = (toggle_value[0] == '1');
}
fclose(toggle_file);
} else {
// Fallback to ephemeral /tmp for backwards compatibility
toggle_file = fopen("/tmp/cuda_fault_enabled", "r");
if (toggle_file) {
char toggle_value[4] = {0};
if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
runtime_inject = (toggle_value[0] == '1');
}
fclose(toggle_file);
}
}

*inject = runtime_inject;
*xid_type = cached_xid;
*error_code = cached_error;
}
@@ -201,6 +201,18 @@ def _patch_service_for_injection(
{"name": "cuda-fault-lib", "emptyDir": {}}
)

# Add hostPath volume for persistent fault marker (survives pod restarts on same node)
# This simulates persistent hardware failure!
service["extraPodSpec"]["volumes"].append(
{
"name": "node-fault-marker",
"hostPath": {
"path": "/var/lib/cuda-fault-test",
"type": "DirectoryOrCreate",
},
}
)

# Add init container to decode base64
if "initContainers" not in service["extraPodSpec"]:
service["extraPodSpec"]["initContainers"] = []
@@ -247,7 +259,7 @@ def _patch_service_for_injection(
if vm.get("name") != "cuda-fault-lib"
]

# Add mount
# Add mount for compiled library
service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
{
"name": "cuda-fault-lib",
Expand All @@ -256,8 +268,18 @@ def _patch_service_for_injection(
}
)

# Add mount for persistent fault marker (hostPath)
service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
{
"name": "node-fault-marker",
"mountPath": "/host-fault",
"readOnly": False, # Need write access
}
)

print(" ✓ Added init container to compile library")
print(" ✓ Added ConfigMap volume mount")
print(" ✓ Added hostPath volume for persistent fault marker")

# Add node affinity to pin pods to target node (simulates real XID 79 behavior)
if target_node and enable:
@@ -287,14 +309,15 @@
service["extraPodSpec"]["volumes"] = [
v
for v in service["extraPodSpec"]["volumes"]
if v.get("name") not in ["cuda-fault-lib", "cuda-fault-lib-source"]
if v.get("name")
not in ["cuda-fault-lib", "cuda-fault-lib-source", "node-fault-marker"]
]

if "volumeMounts" in service["extraPodSpec"].get("mainContainer", {}):
service["extraPodSpec"]["mainContainer"]["volumeMounts"] = [
vm
for vm in service["extraPodSpec"]["mainContainer"]["volumeMounts"]
if vm.get("name") != "cuda-fault-lib"
if vm.get("name") not in ["cuda-fault-lib", "node-fault-marker"]
]

# Remove init container
@@ -323,6 +346,7 @@ def patch_deployment_env(
use_configmap=True,
target_node=None,
xid_type=79,
passthrough_mode=False,
):
"""Patch deployment to add/remove LD_PRELOAD environment variable.

@@ -334,6 +358,8 @@
target_node: If provided, adds node affinity to pin pods to this node
(simulates real XID where pods crash on the faulty node)
xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79
passthrough_mode: If True, set CUDA_FAULT_INJECTION_ENABLED=0 (library loaded but disabled)
Allows baseline testing before enabling faults via toggle
"""
custom_api = client.CustomObjectsApi()
apps_api = client.AppsV1Api()
@@ -385,9 +411,14 @@
# Prepare environment variables
new_envs = []
if enable:
# Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
fault_enabled_value = "0" if passthrough_mode else "1"
new_envs = [
{"name": "LD_PRELOAD", "value": lib_path},
{"name": "CUDA_FAULT_INJECTION_ENABLED", "value": "1"},
{
"name": "CUDA_FAULT_INJECTION_ENABLED",
"value": fault_enabled_value,
},
{"name": "CUDA_XID_TYPE", "value": str(xid_type)},
]

@@ -400,6 +431,28 @@
available_services = list(services.keys())
print(f" → Available services: {available_services}")

# Set aggressive update strategy when enabling (allow all pods to update at once)
# This ensures all pods get CUDA faults, not just the first few
if enable:
if "updateStrategy" not in spec:
spec["updateStrategy"] = {}
if "rollingUpdate" not in spec["updateStrategy"]:
spec["updateStrategy"]["rollingUpdate"] = {}

# Allow all pods to be unavailable during update
spec["updateStrategy"]["rollingUpdate"]["maxUnavailable"] = "100%"
# Don't create surge pods
spec["updateStrategy"]["rollingUpdate"]["maxSurge"] = 0
print(" → Set update strategy: maxUnavailable=100%, maxSurge=0")
print(" (All pods will update simultaneously)")
else:
# Restore default update strategy when disabling
if "updateStrategy" in spec:
spec["updateStrategy"] = {
"rollingUpdate": {"maxUnavailable": "25%", "maxSurge": "25%"}
}
print(" → Restored default update strategy (maxUnavailable=25%)")

for service_name in services_to_patch:
if service_name in services:
print(f" → Patching service: {service_name}")
@@ -465,6 +518,38 @@
print(f" Services patched: {', '.join(patched_services)}")
if use_configmap and enable:
print(f" Library mounted at: {lib_path}")

# Force restart all worker pods when enabling to apply changes immediately
if enable:
print(
" → Force-deleting all worker pods to apply changes immediately..."
)
core_api = client.CoreV1Api()
try:
worker_pods = core_api.list_namespaced_pod(
namespace=namespace,
label_selector=f"nvidia.com/dynamo-graph-deployment-name={deployment_name},nvidia.com/dynamo-component-type=worker",
)
deleted_count = 0
for pod in worker_pods.items:
try:
core_api.delete_namespaced_pod(
name=pod.metadata.name,
namespace=namespace,
grace_period_seconds=0,
)
deleted_count += 1
except Exception as e:
print(
f" ⚠ Could not delete pod {pod.metadata.name}: {e}"
)
print(
f" ✓ Deleted {deleted_count} pod(s) - they will restart with CUDA library"
)
except Exception as e:
print(f" ⚠ Could not list/delete pods: {e}")
print(" Pods will eventually restart, but may take longer")

return True

except ApiException as e:
@@ -505,11 +590,15 @@

if enable:
# Add new env vars
# Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
fault_enabled_value = "0" if passthrough_mode else "1"
container.env.append(
client.V1EnvVar(name="LD_PRELOAD", value="/tmp/cuda_intercept.so")
)
container.env.append(
client.V1EnvVar(name="CUDA_FAULT_INJECTION_ENABLED", value="1")
client.V1EnvVar(
name="CUDA_FAULT_INJECTION_ENABLED", value=fault_enabled_value
)
)
container.env.append(
client.V1EnvVar(name="CUDA_XID_TYPE", value=str(xid_type))