Skip to content

Commit 0c46640

Browse files
feat: enhance health check robustness and observability
Improve the device health check system to prevent blocking, enable graceful shutdown, and provide better error categorization. These changes address stability issues in production environments with multiple GPUs and bursty XID error scenarios.

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
1 parent 7644011 commit 0c46640

File tree

3 files changed: +888 additions, −48 deletions

internal/plugin/server.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ const (
4646
deviceListEnvVar = "NVIDIA_VISIBLE_DEVICES"
4747
deviceListAsVolumeMountsHostPath = "/dev/null"
4848
deviceListAsVolumeMountsContainerPathRoot = "/var/run/nvidia-container-devices"
49+
50+
// healthChannelBufferSize defines the buffer capacity for the health
51+
// channel. This is sized to handle bursts of unhealthy device reports
52+
// without blocking the health check goroutine. With 8 GPUs and
53+
// potential for multiple events per GPU (XID errors, ECC errors, etc.),
54+
// a buffer of 64 provides ample headroom while using a power-of-2 size
55+
// for cache-friendly alignment.
56+
healthChannelBufferSize = 64
4957
)
5058

5159
// nvidiaDevicePlugin implements the Kubernetes device plugin API
@@ -107,7 +115,7 @@ func getPluginSocketPath(resource spec.ResourceName) string {
107115

108116
func (plugin *nvidiaDevicePlugin) initialize() {
109117
plugin.server = grpc.NewServer([]grpc.ServerOption{}...)
110-
plugin.health = make(chan *rm.Device)
118+
plugin.health = make(chan *rm.Device, healthChannelBufferSize)
111119
plugin.stop = make(chan interface{})
112120
}
113121

internal/rm/health.go

Lines changed: 251 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,17 @@
1717
package rm
1818

1919
import (
20+
"context"
2021
"fmt"
2122
"os"
2223
"strconv"
2324
"strings"
25+
"sync"
26+
"time"
2427

2528
"github.com/NVIDIA/go-nvml/pkg/nvml"
2629
"k8s.io/klog/v2"
30+
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
2731
)
2832

2933
const (
@@ -40,8 +44,133 @@ const (
4044
envEnableHealthChecks = "DP_ENABLE_HEALTHCHECKS"
4145
)
4246

47+
// eventResult packages an NVML event with its return code for passing
48+
// between the event receiver goroutine and the main processing loop.
49+
type eventResult struct {
50+
event nvml.EventData
51+
ret nvml.Return
52+
}
53+
54+
// sendUnhealthyDevice sends a device to the unhealthy channel without
55+
// blocking. If the channel is full, it logs an error and updates the device
56+
// state directly. This prevents the health check goroutine from being blocked
57+
// indefinitely if ListAndWatch is stalled.
58+
func sendUnhealthyDevice(unhealthy chan<- *Device, d *Device) {
59+
select {
60+
case unhealthy <- d:
61+
klog.V(2).Infof("Device %s sent to unhealthy channel", d.ID)
62+
default:
63+
// Channel is full - this indicates ListAndWatch is not consuming
64+
// or the channel buffer is insufficient for the event rate
65+
klog.Errorf("Health channel full (capacity=%d)! "+
66+
"Unable to report device %s as unhealthy. "+
67+
"ListAndWatch may be stalled or event rate is too high.",
68+
cap(unhealthy), d.ID)
69+
// Update device state directly as fallback
70+
d.Health = pluginapi.Unhealthy
71+
}
72+
}
73+
74+
// healthCheckStats tracks statistics about health check operations for
75+
// observability and debugging.
76+
type healthCheckStats struct {
77+
startTime time.Time
78+
eventsProcessed uint64
79+
devicesMarkedUnhealthy uint64
80+
errorCount uint64
81+
xidByType map[uint64]uint64 // XID code -> count
82+
mu sync.Mutex
83+
}
84+
85+
// recordEvent increments the events processed counter and tracks XID
86+
// distribution.
87+
func (s *healthCheckStats) recordEvent(xid uint64) {
88+
s.mu.Lock()
89+
defer s.mu.Unlock()
90+
s.eventsProcessed++
91+
if s.xidByType == nil {
92+
s.xidByType = make(map[uint64]uint64)
93+
}
94+
s.xidByType[xid]++
95+
}
96+
97+
// recordUnhealthy increments the devices marked unhealthy counter.
98+
func (s *healthCheckStats) recordUnhealthy() {
99+
s.mu.Lock()
100+
defer s.mu.Unlock()
101+
s.devicesMarkedUnhealthy++
102+
}
103+
104+
// recordError increments the error counter.
105+
func (s *healthCheckStats) recordError() {
106+
s.mu.Lock()
107+
defer s.mu.Unlock()
108+
s.errorCount++
109+
}
110+
111+
// report logs a summary of health check statistics.
112+
func (s *healthCheckStats) report() {
113+
s.mu.Lock()
114+
defer s.mu.Unlock()
115+
116+
uptime := time.Since(s.startTime)
117+
klog.Infof("HealthCheck Stats: uptime=%v, events=%d, unhealthy=%d, errors=%d",
118+
uptime.Round(time.Second), s.eventsProcessed,
119+
s.devicesMarkedUnhealthy, s.errorCount)
120+
121+
if len(s.xidByType) > 0 {
122+
klog.Infof("HealthCheck XID distribution: %v", s.xidByType)
123+
}
124+
}
125+
126+
// handleEventWaitError categorizes NVML errors and determines the
127+
// appropriate action. Returns true if health checking should continue,
128+
// false if it should terminate.
129+
func (r *nvmlResourceManager) handleEventWaitError(
130+
ret nvml.Return,
131+
devices Devices,
132+
unhealthy chan<- *Device,
133+
) bool {
134+
klog.Errorf("Error waiting for NVML event: %v (code: %d)", ret, ret)
135+
136+
switch ret {
137+
case nvml.ERROR_GPU_IS_LOST:
138+
// Definitive hardware failure - mark all devices unhealthy
139+
klog.Error("GPU_IS_LOST error: Marking all devices as unhealthy")
140+
for _, d := range devices {
141+
sendUnhealthyDevice(unhealthy, d)
142+
}
143+
return true // Continue checking - devices may recover
144+
145+
case nvml.ERROR_UNINITIALIZED:
146+
// NVML state corrupted - this shouldn't happen in event loop
147+
klog.Error("NVML uninitialized error: This is unexpected, terminating health check")
148+
return false // Fatal, exit health check
149+
150+
case nvml.ERROR_UNKNOWN, nvml.ERROR_NOT_SUPPORTED:
151+
// Potentially transient or driver issue
152+
klog.Warningf("Transient NVML error (%v): Will retry on next iteration", ret)
153+
return true // Continue checking
154+
155+
default:
156+
// Unknown error - be conservative and mark devices unhealthy
157+
klog.Errorf("Unexpected NVML error %v: Marking all devices unhealthy conservatively", ret)
158+
for _, d := range devices {
159+
sendUnhealthyDevice(unhealthy, d)
160+
}
161+
return true // Continue checking
162+
}
163+
}
164+
43165
// checkHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
44166
func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error {
167+
// Initialize stats tracking
168+
stats := &healthCheckStats{
169+
startTime: time.Now(),
170+
xidByType: make(map[uint64]uint64),
171+
}
172+
defer stats.report() // Log stats summary on exit
173+
45174
xids := getDisabledHealthCheckXids()
46175
if xids.IsAllDisabled() {
47176
return nil
@@ -62,6 +191,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
62191
}()
63192

64193
klog.Infof("Ignoring the following XIDs for health checks: %v", xids)
194+
klog.V(2).Infof("CheckHealth: Starting for %d devices", len(devices))
65195

66196
eventSet, ret := r.nvml.EventSetCreate()
67197
if ret != nvml.SUCCESS {
@@ -80,7 +210,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
80210
uuid, gi, ci, err := r.getDevicePlacement(d)
81211
if err != nil {
82212
klog.Warningf("Could not determine device placement for %v: %v; Marking it unhealthy.", d.ID, err)
83-
unhealthy <- d
213+
sendUnhealthyDevice(unhealthy, d)
84214
continue
85215
}
86216
deviceIDToGiMap[d.ID] = gi
@@ -90,14 +220,14 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
90220
gpu, ret := r.nvml.DeviceGetHandleByUUID(uuid)
91221
if ret != nvml.SUCCESS {
92222
klog.Infof("unable to get device handle from UUID: %v; marking it as unhealthy", ret)
93-
unhealthy <- d
223+
sendUnhealthyDevice(unhealthy, d)
94224
continue
95225
}
96226

97227
supportedEvents, ret := gpu.GetSupportedEventTypes()
98228
if ret != nvml.SUCCESS {
99229
klog.Infof("unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret)
100-
unhealthy <- d
230+
sendUnhealthyDevice(unhealthy, d)
101231
continue
102232
}
103233

@@ -107,67 +237,141 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
107237
}
108238
if ret != nvml.SUCCESS {
109239
klog.Infof("Marking device %v as unhealthy: %v", d.ID, ret)
110-
unhealthy <- d
240+
sendUnhealthyDevice(unhealthy, d)
111241
}
112242
}
113243

244+
// Create context for coordinating shutdown
245+
ctx, cancel := context.WithCancel(context.Background())
246+
defer cancel()
247+
248+
// Goroutine to watch for stop signal and cancel context
249+
go func() {
250+
<-stop
251+
cancel()
252+
}()
253+
254+
// Start periodic stats reporting goroutine
255+
go func() {
256+
ticker := time.NewTicker(5 * time.Minute)
257+
defer ticker.Stop()
258+
for {
259+
select {
260+
case <-ctx.Done():
261+
return
262+
case <-ticker.C:
263+
stats.report()
264+
}
265+
}
266+
}()
267+
268+
// Event receive channel with small buffer
269+
eventChan := make(chan eventResult, 10)
270+
271+
// Start goroutine to receive NVML events
272+
go func() {
273+
defer close(eventChan)
274+
for {
275+
// Check if we should stop
276+
select {
277+
case <-ctx.Done():
278+
return
279+
default:
280+
}
281+
282+
// Wait for NVML event with timeout
283+
e, ret := eventSet.Wait(5000)
284+
285+
// Try to send event result, but respect context cancellation
286+
select {
287+
case <-ctx.Done():
288+
return
289+
case eventChan <- eventResult{event: e, ret: ret}:
290+
}
291+
}
292+
}()
293+
294+
// Main event processing loop
114295
for {
115296
select {
116-
case <-stop:
297+
case <-ctx.Done():
298+
klog.V(2).Info("Health check stopped cleanly")
117299
return nil
118-
default:
119-
}
120300

121-
e, ret := eventSet.Wait(5000)
122-
if ret == nvml.ERROR_TIMEOUT {
123-
continue
124-
}
125-
if ret != nvml.SUCCESS {
126-
klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", ret)
127-
for _, d := range devices {
128-
unhealthy <- d
301+
case result, ok := <-eventChan:
302+
if !ok {
303+
// Event channel closed, exit
304+
return nil
129305
}
130-
continue
131-
}
132306

133-
if e.EventType != nvml.EventTypeXidCriticalError {
134-
klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", e)
135-
continue
136-
}
307+
// Handle timeout - just continue
308+
if result.ret == nvml.ERROR_TIMEOUT {
309+
continue
310+
}
137311

138-
if xids.IsDisabled(e.EventData) {
139-
klog.Infof("Skipping event %+v", e)
140-
continue
141-
}
312+
// Handle NVML errors with granular error handling
313+
if result.ret != nvml.SUCCESS {
314+
stats.recordError()
315+
shouldContinue := r.handleEventWaitError(result.ret, devices, unhealthy)
316+
if !shouldContinue {
317+
return fmt.Errorf("fatal NVML error: %v", result.ret)
318+
}
319+
continue
320+
}
142321

143-
klog.Infof("Processing event %+v", e)
144-
eventUUID, ret := e.Device.GetUUID()
145-
if ret != nvml.SUCCESS {
146-
// If we cannot reliably determine the device UUID, we mark all devices as unhealthy.
147-
klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", e, ret)
148-
for _, d := range devices {
149-
unhealthy <- d
322+
e := result.event
323+
324+
// Filter non-critical events
325+
if e.EventType != nvml.EventTypeXidCriticalError {
326+
klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", e)
327+
continue
150328
}
151-
continue
152-
}
153329

154-
d, exists := parentToDeviceMap[eventUUID]
155-
if !exists {
156-
klog.Infof("Ignoring event for unexpected device: %v", eventUUID)
157-
continue
158-
}
330+
// Check if this XID is disabled
331+
if xids.IsDisabled(e.EventData) {
332+
klog.Infof("Skipping event %+v", e)
333+
continue
334+
}
159335

160-
if d.IsMigDevice() && e.GpuInstanceId != 0xFFFFFFFF && e.ComputeInstanceId != 0xFFFFFFFF {
161-
gi := deviceIDToGiMap[d.ID]
162-
ci := deviceIDToCiMap[d.ID]
163-
if gi != e.GpuInstanceId || ci != e.ComputeInstanceId {
336+
klog.Infof("Processing event %+v", e)
337+
338+
// Record event stats
339+
stats.recordEvent(e.EventData)
340+
341+
// Get device UUID from event
342+
eventUUID, ret := e.Device.GetUUID()
343+
if ret != nvml.SUCCESS {
344+
// If we cannot reliably determine the device UUID, we mark all devices as unhealthy.
345+
klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", e, ret)
346+
stats.recordError()
347+
for _, d := range devices {
348+
stats.recordUnhealthy()
349+
sendUnhealthyDevice(unhealthy, d)
350+
}
164351
continue
165352
}
166-
klog.Infof("Event for mig device %v (gi=%v, ci=%v)", d.ID, gi, ci)
167-
}
168353

169-
klog.Infof("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy.", e.EventData, d.ID)
170-
unhealthy <- d
354+
// Find the device that matches this event
355+
d, exists := parentToDeviceMap[eventUUID]
356+
if !exists {
357+
klog.Infof("Ignoring event for unexpected device: %v", eventUUID)
358+
continue
359+
}
360+
361+
// For MIG devices, verify the GI/CI matches
362+
if d.IsMigDevice() && e.GpuInstanceId != 0xFFFFFFFF && e.ComputeInstanceId != 0xFFFFFFFF {
363+
gi := deviceIDToGiMap[d.ID]
364+
ci := deviceIDToCiMap[d.ID]
365+
if gi != e.GpuInstanceId || ci != e.ComputeInstanceId {
366+
continue
367+
}
368+
klog.Infof("Event for mig device %v (gi=%v, ci=%v)", d.ID, gi, ci)
369+
}
370+
371+
klog.Infof("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy.", e.EventData, d.ID)
372+
stats.recordUnhealthy()
373+
sendUnhealthyDevice(unhealthy, d)
374+
}
171375
}
172376
}
173377

0 commit comments

Comments (0)