1717package rm
1818
1919import (
20+ "context"
2021 "fmt"
2122 "os"
2223 "strconv"
2324 "strings"
25+ "sync"
26+ "time"
2427
2528 "github.com/NVIDIA/go-nvml/pkg/nvml"
2629 "k8s.io/klog/v2"
30+ pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
2731)
2832
2933const (
@@ -40,8 +44,133 @@ const (
4044 envEnableHealthChecks = "DP_ENABLE_HEALTHCHECKS"
4145)
4246
47+ // eventResult packages an NVML event with its return code for passing
48+ // between the event receiver goroutine and the main processing loop.
49+ type eventResult struct {
50+ event nvml.EventData
51+ ret nvml.Return
52+ }
53+
54+ // sendUnhealthyDevice sends a device to the unhealthy channel without
55+ // blocking. If the channel is full, it logs an error and updates the device
56+ // state directly. This prevents the health check goroutine from being blocked
57+ // indefinitely if ListAndWatch is stalled.
58+ func sendUnhealthyDevice (unhealthy chan <- * Device , d * Device ) {
59+ select {
60+ case unhealthy <- d :
61+ klog .V (2 ).Infof ("Device %s sent to unhealthy channel" , d .ID )
62+ default :
63+ // Channel is full - this indicates ListAndWatch is not consuming
64+ // or the channel buffer is insufficient for the event rate
65+ klog .Errorf ("Health channel full (capacity=%d)! " +
66+ "Unable to report device %s as unhealthy. " +
67+ "ListAndWatch may be stalled or event rate is too high." ,
68+ cap (unhealthy ), d .ID )
69+ // Update device state directly as fallback
70+ d .Health = pluginapi .Unhealthy
71+ }
72+ }
73+
74+ // healthCheckStats tracks statistics about health check operations for
75+ // observability and debugging.
76+ type healthCheckStats struct {
77+ startTime time.Time
78+ eventsProcessed uint64
79+ devicesMarkedUnhealthy uint64
80+ errorCount uint64
81+ xidByType map [uint64 ]uint64 // XID code -> count
82+ mu sync.Mutex
83+ }
84+
85+ // recordEvent increments the events processed counter and tracks XID
86+ // distribution.
87+ func (s * healthCheckStats ) recordEvent (xid uint64 ) {
88+ s .mu .Lock ()
89+ defer s .mu .Unlock ()
90+ s .eventsProcessed ++
91+ if s .xidByType == nil {
92+ s .xidByType = make (map [uint64 ]uint64 )
93+ }
94+ s .xidByType [xid ]++
95+ }
96+
97+ // recordUnhealthy increments the devices marked unhealthy counter.
98+ func (s * healthCheckStats ) recordUnhealthy () {
99+ s .mu .Lock ()
100+ defer s .mu .Unlock ()
101+ s .devicesMarkedUnhealthy ++
102+ }
103+
104+ // recordError increments the error counter.
105+ func (s * healthCheckStats ) recordError () {
106+ s .mu .Lock ()
107+ defer s .mu .Unlock ()
108+ s .errorCount ++
109+ }
110+
111+ // report logs a summary of health check statistics.
112+ func (s * healthCheckStats ) report () {
113+ s .mu .Lock ()
114+ defer s .mu .Unlock ()
115+
116+ uptime := time .Since (s .startTime )
117+ klog .Infof ("HealthCheck Stats: uptime=%v, events=%d, unhealthy=%d, errors=%d" ,
118+ uptime .Round (time .Second ), s .eventsProcessed ,
119+ s .devicesMarkedUnhealthy , s .errorCount )
120+
121+ if len (s .xidByType ) > 0 {
122+ klog .Infof ("HealthCheck XID distribution: %v" , s .xidByType )
123+ }
124+ }
125+
126+ // handleEventWaitError categorizes NVML errors and determines the
127+ // appropriate action. Returns true if health checking should continue,
128+ // false if it should terminate.
129+ func (r * nvmlResourceManager ) handleEventWaitError (
130+ ret nvml.Return ,
131+ devices Devices ,
132+ unhealthy chan <- * Device ,
133+ ) bool {
134+ klog .Errorf ("Error waiting for NVML event: %v (code: %d)" , ret , ret )
135+
136+ switch ret {
137+ case nvml .ERROR_GPU_IS_LOST :
138+ // Definitive hardware failure - mark all devices unhealthy
139+ klog .Error ("GPU_IS_LOST error: Marking all devices as unhealthy" )
140+ for _ , d := range devices {
141+ sendUnhealthyDevice (unhealthy , d )
142+ }
143+ return true // Continue checking - devices may recover
144+
145+ case nvml .ERROR_UNINITIALIZED :
146+ // NVML state corrupted - this shouldn't happen in event loop
147+ klog .Error ("NVML uninitialized error: This is unexpected, terminating health check" )
148+ return false // Fatal, exit health check
149+
150+ case nvml .ERROR_UNKNOWN , nvml .ERROR_NOT_SUPPORTED :
151+ // Potentially transient or driver issue
152+ klog .Warningf ("Transient NVML error (%v): Will retry on next iteration" , ret )
153+ return true // Continue checking
154+
155+ default :
156+ // Unknown error - be conservative and mark devices unhealthy
157+ klog .Errorf ("Unexpected NVML error %v: Marking all devices unhealthy conservatively" , ret )
158+ for _ , d := range devices {
159+ sendUnhealthyDevice (unhealthy , d )
160+ }
161+ return true // Continue checking
162+ }
163+ }
164+
43165// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
44166func (r * nvmlResourceManager ) checkHealth (stop <- chan interface {}, devices Devices , unhealthy chan <- * Device ) error {
167+ // Initialize stats tracking
168+ stats := & healthCheckStats {
169+ startTime : time .Now (),
170+ xidByType : make (map [uint64 ]uint64 ),
171+ }
172+ defer stats .report () // Log stats summary on exit
173+
45174 xids := getDisabledHealthCheckXids ()
46175 if xids .IsAllDisabled () {
47176 return nil
@@ -62,6 +191,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
62191 }()
63192
64193 klog .Infof ("Ignoring the following XIDs for health checks: %v" , xids )
194+ klog .V (2 ).Infof ("CheckHealth: Starting for %d devices" , len (devices ))
65195
66196 eventSet , ret := r .nvml .EventSetCreate ()
67197 if ret != nvml .SUCCESS {
@@ -80,7 +210,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
80210 uuid , gi , ci , err := r .getDevicePlacement (d )
81211 if err != nil {
82212 klog .Warningf ("Could not determine device placement for %v: %v; Marking it unhealthy." , d .ID , err )
83- unhealthy <- d
213+ sendUnhealthyDevice ( unhealthy , d )
84214 continue
85215 }
86216 deviceIDToGiMap [d .ID ] = gi
@@ -90,14 +220,14 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
90220 gpu , ret := r .nvml .DeviceGetHandleByUUID (uuid )
91221 if ret != nvml .SUCCESS {
92222 klog .Infof ("unable to get device handle from UUID: %v; marking it as unhealthy" , ret )
93- unhealthy <- d
223+ sendUnhealthyDevice ( unhealthy , d )
94224 continue
95225 }
96226
97227 supportedEvents , ret := gpu .GetSupportedEventTypes ()
98228 if ret != nvml .SUCCESS {
99229 klog .Infof ("unable to determine the supported events for %v: %v; marking it as unhealthy" , d .ID , ret )
100- unhealthy <- d
230+ sendUnhealthyDevice ( unhealthy , d )
101231 continue
102232 }
103233
@@ -107,67 +237,141 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
107237 }
108238 if ret != nvml .SUCCESS {
109239 klog .Infof ("Marking device %v as unhealthy: %v" , d .ID , ret )
110- unhealthy <- d
240+ sendUnhealthyDevice ( unhealthy , d )
111241 }
112242 }
113243
244+ // Create context for coordinating shutdown
245+ ctx , cancel := context .WithCancel (context .Background ())
246+ defer cancel ()
247+
248+ // Goroutine to watch for stop signal and cancel context
249+ go func () {
250+ <- stop
251+ cancel ()
252+ }()
253+
254+ // Start periodic stats reporting goroutine
255+ go func () {
256+ ticker := time .NewTicker (5 * time .Minute )
257+ defer ticker .Stop ()
258+ for {
259+ select {
260+ case <- ctx .Done ():
261+ return
262+ case <- ticker .C :
263+ stats .report ()
264+ }
265+ }
266+ }()
267+
268+ // Event receive channel with small buffer
269+ eventChan := make (chan eventResult , 10 )
270+
271+ // Start goroutine to receive NVML events
272+ go func () {
273+ defer close (eventChan )
274+ for {
275+ // Check if we should stop
276+ select {
277+ case <- ctx .Done ():
278+ return
279+ default :
280+ }
281+
282+ // Wait for NVML event with timeout
283+ e , ret := eventSet .Wait (5000 )
284+
285+ // Try to send event result, but respect context cancellation
286+ select {
287+ case <- ctx .Done ():
288+ return
289+ case eventChan <- eventResult {event : e , ret : ret }:
290+ }
291+ }
292+ }()
293+
294+ // Main event processing loop
114295 for {
115296 select {
116- case <- stop :
297+ case <- ctx .Done ():
298+ klog .V (2 ).Info ("Health check stopped cleanly" )
117299 return nil
118- default :
119- }
120300
121- e , ret := eventSet .Wait (5000 )
122- if ret == nvml .ERROR_TIMEOUT {
123- continue
124- }
125- if ret != nvml .SUCCESS {
126- klog .Infof ("Error waiting for event: %v; Marking all devices as unhealthy" , ret )
127- for _ , d := range devices {
128- unhealthy <- d
301+ case result , ok := <- eventChan :
302+ if ! ok {
303+ // Event channel closed, exit
304+ return nil
129305 }
130- continue
131- }
132306
133- if e . EventType != nvml . EventTypeXidCriticalError {
134- klog . Infof ( "Skipping non-nvmlEventTypeXidCriticalError event: %+v" , e )
135- continue
136- }
307+ // Handle timeout - just continue
308+ if result . ret == nvml . ERROR_TIMEOUT {
309+ continue
310+ }
137311
138- if xids .IsDisabled (e .EventData ) {
139- klog .Infof ("Skipping event %+v" , e )
140- continue
141- }
312+ // Handle NVML errors with granular error handling
313+ if result .ret != nvml .SUCCESS {
314+ stats .recordError ()
315+ shouldContinue := r .handleEventWaitError (result .ret , devices , unhealthy )
316+ if ! shouldContinue {
317+ return fmt .Errorf ("fatal NVML error: %v" , result .ret )
318+ }
319+ continue
320+ }
142321
143- klog .Infof ("Processing event %+v" , e )
144- eventUUID , ret := e .Device .GetUUID ()
145- if ret != nvml .SUCCESS {
146- // If we cannot reliably determine the device UUID, we mark all devices as unhealthy.
147- klog .Infof ("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy." , e , ret )
148- for _ , d := range devices {
149- unhealthy <- d
322+ e := result .event
323+
324+ // Filter non-critical events
325+ if e .EventType != nvml .EventTypeXidCriticalError {
326+ klog .Infof ("Skipping non-nvmlEventTypeXidCriticalError event: %+v" , e )
327+ continue
150328 }
151- continue
152- }
153329
154- d , exists := parentToDeviceMap [ eventUUID ]
155- if ! exists {
156- klog .Infof ("Ignoring event for unexpected device: % v" , eventUUID )
157- continue
158- }
330+ // Check if this XID is disabled
331+ if xids . IsDisabled ( e . EventData ) {
332+ klog .Infof ("Skipping event %+ v" , e )
333+ continue
334+ }
159335
160- if d .IsMigDevice () && e .GpuInstanceId != 0xFFFFFFFF && e .ComputeInstanceId != 0xFFFFFFFF {
161- gi := deviceIDToGiMap [d .ID ]
162- ci := deviceIDToCiMap [d .ID ]
163- if gi != e .GpuInstanceId || ci != e .ComputeInstanceId {
336+ klog .Infof ("Processing event %+v" , e )
337+
338+ // Record event stats
339+ stats .recordEvent (e .EventData )
340+
341+ // Get device UUID from event
342+ eventUUID , ret := e .Device .GetUUID ()
343+ if ret != nvml .SUCCESS {
344+ // If we cannot reliably determine the device UUID, we mark all devices as unhealthy.
345+ klog .Infof ("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy." , e , ret )
346+ stats .recordError ()
347+ for _ , d := range devices {
348+ stats .recordUnhealthy ()
349+ sendUnhealthyDevice (unhealthy , d )
350+ }
164351 continue
165352 }
166- klog .Infof ("Event for mig device %v (gi=%v, ci=%v)" , d .ID , gi , ci )
167- }
168353
169- klog .Infof ("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy." , e .EventData , d .ID )
170- unhealthy <- d
354+ // Find the device that matches this event
355+ d , exists := parentToDeviceMap [eventUUID ]
356+ if ! exists {
357+ klog .Infof ("Ignoring event for unexpected device: %v" , eventUUID )
358+ continue
359+ }
360+
361+ // For MIG devices, verify the GI/CI matches
362+ if d .IsMigDevice () && e .GpuInstanceId != 0xFFFFFFFF && e .ComputeInstanceId != 0xFFFFFFFF {
363+ gi := deviceIDToGiMap [d .ID ]
364+ ci := deviceIDToCiMap [d .ID ]
365+ if gi != e .GpuInstanceId || ci != e .ComputeInstanceId {
366+ continue
367+ }
368+ klog .Infof ("Event for mig device %v (gi=%v, ci=%v)" , d .ID , gi , ci )
369+ }
370+
371+ klog .Infof ("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy." , e .EventData , d .ID )
372+ stats .recordUnhealthy ()
373+ sendUnhealthyDevice (unhealthy , d )
374+ }
171375 }
172376}
173377
0 commit comments