Skip to content

Commit 1461168

Browse files
feat(gfd): enhance device health check with compute capability probe
Improve GFD's device health check so that it also catches devices in a degraded state (e.g., after XID errors). Previously, a device could pass the GetName() check but still fail during actual labeling. The health check now uses two probes: (1) GetName(), which catches completely dead devices, and (2) GetCudaComputeCapability(), which catches degraded devices. This prevents scenarios where an XID error leaves a device in a state where basic queries succeed but more complex queries fail, which would otherwise cause partial label generation and unnecessary warnings. Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
1 parent 8e66a4a commit 1461168

File tree

7 files changed

+436
-21
lines changed

7 files changed

+436
-21
lines changed

cmd/gpu-feature-discovery/main.go

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -254,10 +254,29 @@ func (d *gfd) run(sigs chan os.Signal) (bool, error) {
254254
}()
255255

256256
timestampLabeler := lm.NewTimestampLabeler(d.config)
257+
258+
// Stats for tracking labeling health
259+
totalRuns := 0
260+
failedRuns := 0
261+
lastSuccessTime := time.Now()
262+
257263
rerun:
264+
totalRuns++
258265
loopLabelers, err := lm.NewLabelers(d.manager, d.vgpu, d.config)
259266
if err != nil {
260-
return false, err
267+
failedRuns++
268+
klog.Errorf("GFD: Error creating labelers (run %d, failures %d): %v",
269+
totalRuns, failedRuns, err)
270+
271+
// In oneshot mode, fail immediately (for testing and CI)
272+
if *d.config.Flags.GFD.Oneshot {
273+
return false, err
274+
}
275+
276+
// Otherwise, sleep and retry
277+
klog.Info("Sleeping for ", *d.config.Flags.GFD.SleepInterval)
278+
time.Sleep(time.Duration(*d.config.Flags.GFD.SleepInterval))
279+
goto rerun
261280
}
262281

263282
labelers := lm.Merge(
@@ -267,18 +286,52 @@ rerun:
267286

268287
labels, err := labelers.Labels()
269288
if err != nil {
270-
return false, fmt.Errorf("error generating labels: %v", err)
289+
failedRuns++
290+
klog.Errorf("GFD: Error generating labels (run %d, failures %d): %v",
291+
totalRuns, failedRuns, err)
292+
293+
// In oneshot mode, fail immediately (for testing and CI)
294+
if *d.config.Flags.GFD.Oneshot {
295+
return false, err
296+
}
297+
298+
// Otherwise, sleep and retry
299+
klog.Info("Sleeping for ", *d.config.Flags.GFD.SleepInterval)
300+
time.Sleep(time.Duration(*d.config.Flags.GFD.SleepInterval))
301+
goto rerun
271302
}
272303

273304
if len(labels) <= 1 {
274305
klog.Warning("No labels generated from any source")
275306
}
276307

308+
// Log stats periodically (every 10 successful runs)
309+
if totalRuns%10 == 0 {
310+
klog.Infof("GFD Stats: runs=%d, failures=%d, success_rate=%.1f%%, last_success=%v",
311+
totalRuns, failedRuns, float64(totalRuns-failedRuns)/float64(totalRuns)*100,
312+
time.Since(lastSuccessTime))
313+
}
314+
277315
klog.Info("Creating Labels")
278316
if err := d.labelOutputer.Output(labels); err != nil {
279-
return false, err
317+
failedRuns++
318+
klog.Errorf("GFD: Error outputting labels (run %d, failures %d): %v",
319+
totalRuns, failedRuns, err)
320+
321+
// In oneshot mode, fail immediately
322+
if *d.config.Flags.GFD.Oneshot {
323+
return false, err
324+
}
325+
326+
// Otherwise, sleep and retry
327+
klog.Info("Sleeping for ", *d.config.Flags.GFD.SleepInterval)
328+
time.Sleep(time.Duration(*d.config.Flags.GFD.SleepInterval))
329+
goto rerun
280330
}
281331

332+
// Update last success time
333+
lastSuccessTime = time.Now()
334+
282335
if *d.config.Flags.GFD.Oneshot {
283336
return false, nil
284337
}

internal/lm/imex.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
package lm
1818

1919
import (
20-
"fmt"
2120
"strings"
2221

2322
"k8s.io/klog/v2"
@@ -48,16 +47,19 @@ func getFabricIDs(devices []resource.Device) (string, string, error) {
4847
for i, device := range devices {
4948
isFabricAttached, err := device.IsFabricAttached()
5049
if err != nil {
51-
return "", "", fmt.Errorf("error checking imex capability: %v", err)
50+
// Skip unhealthy devices when checking fabric attachment
51+
klog.V(2).Infof("Skipping device %d for IMEX capability check: %v", i, err)
52+
continue
5253
}
5354
if !isFabricAttached {
5455
continue
5556
}
5657

5758
clusterUUID, cliqueID, err := device.GetFabricIDs()
5859
if err != nil {
59-
60-
return "", "", fmt.Errorf("error getting fabric IDs: %w", err)
60+
// Skip devices that fail fabric ID retrieval
61+
klog.V(2).Infof("Skipping device %d, error getting fabric IDs: %v", i, err)
62+
continue
6163
}
6264

6365
uniqueClusterUUIDs[clusterUUID] = append(uniqueClusterUUIDs[clusterUUID], i)

internal/lm/nvml.go

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,23 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
5050
return empty{}, nil
5151
}
5252

53+
// Filter to only healthy devices - skip those that fail basic health
54+
// checks. This prevents GFD from crashing when a device goes unhealthy.
55+
totalDevices := len(devices)
56+
healthyDevices := resource.FilterHealthyDevices(devices)
57+
if len(healthyDevices) < totalDevices {
58+
klog.Warningf("GFD: %d/%d devices failed health check, continuing with healthy devices",
59+
totalDevices-len(healthyDevices), totalDevices)
60+
}
61+
62+
if len(healthyDevices) == 0 {
63+
klog.Warning("GFD: No healthy devices available for labeling")
64+
return empty{}, nil
65+
}
66+
67+
// Use healthyDevices for all subsequent labeling
68+
devices = healthyDevices
69+
5370
machineTypeLabeler, err := newMachineTypeLabeler(*config.Flags.GFD.MachineTypeFile)
5471
if err != nil {
5572
return nil, fmt.Errorf("failed to construct machine type labeler: %v", err)
@@ -160,7 +177,9 @@ func newMigCapabilityLabeler(manager resource.Manager) (Labeler, error) {
160177
for _, d := range devices {
161178
isMigCapable, err = d.IsMigCapable()
162179
if err != nil {
163-
return nil, fmt.Errorf("error getting mig capability: %v", err)
180+
// Skip unhealthy devices, continue checking others
181+
klog.V(2).Infof("Skipping device for MIG capability check: %v", err)
182+
continue
164183
}
165184
if isMigCapable {
166185
break
@@ -201,7 +220,9 @@ func isMPSCapable(manager resource.Manager) (bool, error) {
201220
for _, d := range devices {
202221
isMigEnabled, err := d.IsMigEnabled()
203222
if err != nil {
204-
return false, fmt.Errorf("failed to check if device is MIG-enabled: %w", err)
223+
// Skip unhealthy devices when checking MPS capability
224+
klog.V(2).Infof("Skipping device for MPS capability check: %v", err)
225+
continue
205226
}
206227
if isMigEnabled {
207228
return false, fmt.Errorf("%w for mig devices", errMPSSharingNotSupported)
@@ -247,10 +268,12 @@ func getModeForClasses(classes []uint32) string {
247268

248269
func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
249270
seenClasses := make(map[uint32]bool)
250-
for _, d := range devices {
271+
for i, d := range devices {
251272
class, err := d.GetPCIClass()
252273
if err != nil {
253-
return nil, err
274+
// Skip unhealthy devices when determining GPU mode
275+
klog.V(2).Infof("Skipping device %d for PCI class check: %v", i, err)
276+
continue
254277
}
255278
seenClasses[class] = true
256279
}

internal/lm/resource.go

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,17 @@ func NewGPUResourceLabeler(config *spec.Config, device resource.Device, count in
4141
return empty{}, nil
4242
}
4343

44+
// Check if device is healthy before querying
45+
if err := resource.CheckDeviceHealth(device); err != nil {
46+
klog.Warningf("Skipping unhealthy device in GPU resource labeler: %v", err)
47+
return empty{}, nil
48+
}
49+
4450
model, err := device.GetName()
4551
if err != nil {
46-
return nil, fmt.Errorf("failed to get device model: %v", err)
52+
// If health check passed but GetName fails, log and skip
53+
klog.Warningf("Device health check passed but GetName failed: %v", err)
54+
return empty{}, nil
4755
}
4856

4957
totalMemoryMiB, err := device.GetTotalMemoryMiB()
@@ -55,7 +63,9 @@ func NewGPUResourceLabeler(config *spec.Config, device resource.Device, count in
5563

5664
architectureLabels, err := newArchitectureLabels(resourceLabeler, device)
5765
if err != nil {
58-
return nil, fmt.Errorf("failed to create architecture labels: %v", err)
66+
// Don't crash on architecture label failure, log and continue
67+
klog.Warningf("Failed to create architecture labels: %v", err)
68+
architectureLabels = make(Labels)
5969
}
6070

6171
memoryLabeler := (Labeler)(&empty{})
@@ -78,25 +88,35 @@ func NewMIGResourceLabeler(resourceName spec.ResourceName, config *spec.Config,
7888
return empty{}, nil
7989
}
8090

91+
// Check if device is healthy before querying
92+
if err := resource.CheckDeviceHealth(device); err != nil {
93+
klog.Warningf("Skipping unhealthy MIG device in resource labeler: %v", err)
94+
return empty{}, nil
95+
}
96+
8197
parent, err := device.GetDeviceHandleFromMigDeviceHandle()
8298
if err != nil {
83-
return nil, fmt.Errorf("failed to get parent of MIG device: %v", err)
99+
klog.Warningf("Failed to get parent of MIG device, skipping: %v", err)
100+
return empty{}, nil
84101
}
85102
model, err := parent.GetName()
86103
if err != nil {
87-
return nil, fmt.Errorf("failed to get device model: %v", err)
104+
klog.Warningf("Failed to get device model, skipping: %v", err)
105+
return empty{}, nil
88106
}
89107

90108
migProfile, err := device.GetName()
91109
if err != nil {
92-
return nil, fmt.Errorf("failed to get MIG profile name: %v", err)
110+
klog.Warningf("Failed to get MIG profile name, skipping: %v", err)
111+
return empty{}, nil
93112
}
94113

95114
resourceLabeler := newResourceLabeler(resourceName, config)
96115

97116
attributeLabels, err := newMigAttributeLabels(resourceLabeler, device)
98117
if err != nil {
99-
return nil, fmt.Errorf("faled to get MIG attribute labels: %v", err)
118+
klog.Warningf("Failed to get MIG attribute labels: %v", err)
119+
attributeLabels = make(Labels)
100120
}
101121

102122
labelers := Merge(
@@ -252,7 +272,9 @@ func (rl resourceLabeler) replicationInfo() *spec.ReplicatedResource {
252272
func newMigAttributeLabels(rl resourceLabeler, device resource.Device) (Labels, error) {
253273
attributes, err := device.GetAttributes()
254274
if err != nil {
255-
return nil, fmt.Errorf("unable to get attributes of MIG device: %v", err)
275+
// Return empty labels instead of crashing
276+
klog.Warningf("Unable to get attributes of MIG device, skipping: %v", err)
277+
return make(Labels), nil
256278
}
257279

258280
labels := rl.labels(attributes)
@@ -263,7 +285,9 @@ func newMigAttributeLabels(rl resourceLabeler, device resource.Device) (Labels,
263285
func newArchitectureLabels(rl resourceLabeler, device resource.Device) (Labels, error) {
264286
computeMajor, computeMinor, err := device.GetCudaComputeCapability()
265287
if err != nil {
266-
return nil, fmt.Errorf("failed to determine CUDA compute capability: %v", err)
288+
// Return empty labels instead of error - allows labeling to continue
289+
klog.Warningf("Failed to determine CUDA compute capability, skipping architecture labels: %v", err)
290+
return make(Labels), nil
267291
}
268292

269293
if computeMajor == 0 {

internal/resource/device_health.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/**
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package resource
18+
19+
import (
	"errors"
	"fmt"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
	"k8s.io/klog/v2"
)
25+
26+
// DeviceUnhealthyError indicates that a device is unhealthy and cannot be
27+
// queried. This error type allows GFD to skip unhealthy devices gracefully
28+
// rather than crashing.
29+
type DeviceUnhealthyError struct {
30+
DeviceID string
31+
Reason string
32+
NVMLErr nvml.Return
33+
}
34+
35+
func (e *DeviceUnhealthyError) Error() string {
36+
if e.NVMLErr != nvml.SUCCESS {
37+
return fmt.Sprintf("device %s unhealthy: %v (NVML: %v)",
38+
e.DeviceID, e.Reason, e.NVMLErr)
39+
}
40+
return fmt.Sprintf("device %s unhealthy: %v", e.DeviceID, e.Reason)
41+
}
42+
43+
// IsUnhealthyNVMLError categorizes NVML errors to determine if they
44+
// indicate an unhealthy device vs a transient or fatal error.
45+
func IsUnhealthyNVMLError(ret nvml.Return) bool {
46+
switch ret {
47+
case nvml.SUCCESS:
48+
return false
49+
case nvml.ERROR_GPU_IS_LOST,
50+
nvml.ERROR_UNINITIALIZED,
51+
nvml.ERROR_INVALID_ARGUMENT,
52+
nvml.ERROR_NOT_FOUND,
53+
nvml.ERROR_NO_PERMISSION:
54+
return true
55+
default:
56+
// For unknown errors, be conservative and treat as unhealthy
57+
// This prevents GFD from crashing on unexpected errors
58+
return true
59+
}
60+
}
61+
62+
// CheckDeviceHealth performs a health check on a device to verify it's
63+
// queryable. This uses two probes to catch both completely dead devices
64+
// and devices in degraded states (e.g., after XID errors). This prevents
65+
// GFD from attempting to label devices that will fail during query.
66+
func CheckDeviceHealth(d Device) error {
67+
// Probe 1: Basic identity check
68+
_, err := d.GetName()
69+
if err != nil {
70+
return &DeviceUnhealthyError{
71+
Reason: "GetName failed",
72+
NVMLErr: toNVMLReturn(err),
73+
}
74+
}
75+
76+
// Probe 2: Compute capability check (catches devices in degraded state)
77+
// Many XID errors leave the device in a state where GetName() works but
78+
// more complex queries fail. This probe catches those scenarios.
79+
_, _, err = d.GetCudaComputeCapability()
80+
if err != nil {
81+
return &DeviceUnhealthyError{
82+
Reason: "GetCudaComputeCapability failed",
83+
NVMLErr: toNVMLReturn(err),
84+
}
85+
}
86+
87+
return nil
88+
}
89+
90+
// toNVMLReturn extracts the NVML return code from an error
91+
func toNVMLReturn(err error) nvml.Return {
92+
if nvmlErr, ok := err.(nvml.Return); ok {
93+
return nvmlErr
94+
}
95+
return nvml.ERROR_UNKNOWN
96+
}
97+
98+
// FilterHealthyDevices filters a list of devices to only include those
99+
// that pass basic health checks. Unhealthy devices are logged as warnings
100+
// but don't cause GFD to crash.
101+
func FilterHealthyDevices(devices []Device) []Device {
102+
var healthy []Device
103+
for i, d := range devices {
104+
if err := CheckDeviceHealth(d); err != nil {
105+
// Log warning but continue
106+
klog.Warningf("Skipping unhealthy device at index %d: %v", i, err)
107+
continue
108+
}
109+
healthy = append(healthy, d)
110+
}
111+
return healthy
112+
}

0 commit comments

Comments
 (0)