Skip to content

Commit bf82a3c

Browse files
Revert "feat(gfd): enhance device health check with compute capability probe"
This reverts commit 525ea47.
1 parent 525ea47 commit bf82a3c

File tree

7 files changed: +21 additions, -436 deletions (lines changed)

cmd/gpu-feature-discovery/main.go

Lines changed: 3 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -254,29 +254,10 @@ func (d *gfd) run(sigs chan os.Signal) (bool, error) {
254254
}()
255255

256256
timestampLabeler := lm.NewTimestampLabeler(d.config)
257-
258-
// Stats for tracking labeling health
259-
totalRuns := 0
260-
failedRuns := 0
261-
lastSuccessTime := time.Now()
262-
263257
rerun:
264-
totalRuns++
265258
loopLabelers, err := lm.NewLabelers(d.manager, d.vgpu, d.config)
266259
if err != nil {
267-
failedRuns++
268-
klog.Errorf("GFD: Error creating labelers (run %d, failures %d): %v",
269-
totalRuns, failedRuns, err)
270-
271-
// In oneshot mode, fail immediately (for testing and CI)
272-
if *d.config.Flags.GFD.Oneshot {
273-
return false, err
274-
}
275-
276-
// Otherwise, sleep and retry
277-
klog.Info("Sleeping for ", *d.config.Flags.GFD.SleepInterval)
278-
time.Sleep(time.Duration(*d.config.Flags.GFD.SleepInterval))
279-
goto rerun
260+
return false, err
280261
}
281262

282263
labelers := lm.Merge(
@@ -286,52 +267,18 @@ rerun:
286267

287268
labels, err := labelers.Labels()
288269
if err != nil {
289-
failedRuns++
290-
klog.Errorf("GFD: Error generating labels (run %d, failures %d): %v",
291-
totalRuns, failedRuns, err)
292-
293-
// In oneshot mode, fail immediately (for testing and CI)
294-
if *d.config.Flags.GFD.Oneshot {
295-
return false, err
296-
}
297-
298-
// Otherwise, sleep and retry
299-
klog.Info("Sleeping for ", *d.config.Flags.GFD.SleepInterval)
300-
time.Sleep(time.Duration(*d.config.Flags.GFD.SleepInterval))
301-
goto rerun
270+
return false, fmt.Errorf("error generating labels: %v", err)
302271
}
303272

304273
if len(labels) <= 1 {
305274
klog.Warning("No labels generated from any source")
306275
}
307276

308-
// Log stats periodically (every 10 successful runs)
309-
if totalRuns%10 == 0 {
310-
klog.Infof("GFD Stats: runs=%d, failures=%d, success_rate=%.1f%%, last_success=%v",
311-
totalRuns, failedRuns, float64(totalRuns-failedRuns)/float64(totalRuns)*100,
312-
time.Since(lastSuccessTime))
313-
}
314-
315277
klog.Info("Creating Labels")
316278
if err := d.labelOutputer.Output(labels); err != nil {
317-
failedRuns++
318-
klog.Errorf("GFD: Error outputting labels (run %d, failures %d): %v",
319-
totalRuns, failedRuns, err)
320-
321-
// In oneshot mode, fail immediately
322-
if *d.config.Flags.GFD.Oneshot {
323-
return false, err
324-
}
325-
326-
// Otherwise, sleep and retry
327-
klog.Info("Sleeping for ", *d.config.Flags.GFD.SleepInterval)
328-
time.Sleep(time.Duration(*d.config.Flags.GFD.SleepInterval))
329-
goto rerun
279+
return false, err
330280
}
331281

332-
// Update last success time
333-
lastSuccessTime = time.Now()
334-
335282
if *d.config.Flags.GFD.Oneshot {
336283
return false, nil
337284
}

internal/lm/imex.go

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
package lm
1818

1919
import (
20+
"fmt"
2021
"strings"
2122

2223
"k8s.io/klog/v2"
@@ -47,19 +48,16 @@ func getFabricIDs(devices []resource.Device) (string, string, error) {
4748
for i, device := range devices {
4849
isFabricAttached, err := device.IsFabricAttached()
4950
if err != nil {
50-
// Skip unhealthy devices when checking fabric attachment
51-
klog.V(2).Infof("Skipping device %d for IMEX capability check: %v", i, err)
52-
continue
51+
return "", "", fmt.Errorf("error checking imex capability: %v", err)
5352
}
5453
if !isFabricAttached {
5554
continue
5655
}
5756

5857
clusterUUID, cliqueID, err := device.GetFabricIDs()
5958
if err != nil {
60-
// Skip devices that fail fabric ID retrieval
61-
klog.V(2).Infof("Skipping device %d, error getting fabric IDs: %v", i, err)
62-
continue
59+
60+
return "", "", fmt.Errorf("error getting fabric IDs: %w", err)
6361
}
6462

6563
uniqueClusterUUIDs[clusterUUID] = append(uniqueClusterUUIDs[clusterUUID], i)

internal/lm/nvml.go

Lines changed: 4 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -50,23 +50,6 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
5050
return empty{}, nil
5151
}
5252

53-
// Filter to only healthy devices - skip those that fail basic health
54-
// checks. This prevents GFD from crashing when a device goes unhealthy.
55-
totalDevices := len(devices)
56-
healthyDevices := resource.FilterHealthyDevices(devices)
57-
if len(healthyDevices) < totalDevices {
58-
klog.Warningf("GFD: %d/%d devices failed health check, continuing with healthy devices",
59-
totalDevices-len(healthyDevices), totalDevices)
60-
}
61-
62-
if len(healthyDevices) == 0 {
63-
klog.Warning("GFD: No healthy devices available for labeling")
64-
return empty{}, nil
65-
}
66-
67-
// Use healthyDevices for all subsequent labeling
68-
devices = healthyDevices
69-
7053
machineTypeLabeler, err := newMachineTypeLabeler(*config.Flags.GFD.MachineTypeFile)
7154
if err != nil {
7255
return nil, fmt.Errorf("failed to construct machine type labeler: %v", err)
@@ -177,9 +160,7 @@ func newMigCapabilityLabeler(manager resource.Manager) (Labeler, error) {
177160
for _, d := range devices {
178161
isMigCapable, err = d.IsMigCapable()
179162
if err != nil {
180-
// Skip unhealthy devices, continue checking others
181-
klog.V(2).Infof("Skipping device for MIG capability check: %v", err)
182-
continue
163+
return nil, fmt.Errorf("error getting mig capability: %v", err)
183164
}
184165
if isMigCapable {
185166
break
@@ -220,9 +201,7 @@ func isMPSCapable(manager resource.Manager) (bool, error) {
220201
for _, d := range devices {
221202
isMigEnabled, err := d.IsMigEnabled()
222203
if err != nil {
223-
// Skip unhealthy devices when checking MPS capability
224-
klog.V(2).Infof("Skipping device for MPS capability check: %v", err)
225-
continue
204+
return false, fmt.Errorf("failed to check if device is MIG-enabled: %w", err)
226205
}
227206
if isMigEnabled {
228207
return false, fmt.Errorf("%w for mig devices", errMPSSharingNotSupported)
@@ -268,12 +247,10 @@ func getModeForClasses(classes []uint32) string {
268247

269248
func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
270249
seenClasses := make(map[uint32]bool)
271-
for i, d := range devices {
250+
for _, d := range devices {
272251
class, err := d.GetPCIClass()
273252
if err != nil {
274-
// Skip unhealthy devices when determining GPU mode
275-
klog.V(2).Infof("Skipping device %d for PCI class check: %v", i, err)
276-
continue
253+
return nil, err
277254
}
278255
seenClasses[class] = true
279256
}

internal/lm/resource.go

Lines changed: 8 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -41,17 +41,9 @@ func NewGPUResourceLabeler(config *spec.Config, device resource.Device, count in
4141
return empty{}, nil
4242
}
4343

44-
// Check if device is healthy before querying
45-
if err := resource.CheckDeviceHealth(device); err != nil {
46-
klog.Warningf("Skipping unhealthy device in GPU resource labeler: %v", err)
47-
return empty{}, nil
48-
}
49-
5044
model, err := device.GetName()
5145
if err != nil {
52-
// If health check passed but GetName fails, log and skip
53-
klog.Warningf("Device health check passed but GetName failed: %v", err)
54-
return empty{}, nil
46+
return nil, fmt.Errorf("failed to get device model: %v", err)
5547
}
5648

5749
totalMemoryMiB, err := device.GetTotalMemoryMiB()
@@ -63,9 +55,7 @@ func NewGPUResourceLabeler(config *spec.Config, device resource.Device, count in
6355

6456
architectureLabels, err := newArchitectureLabels(resourceLabeler, device)
6557
if err != nil {
66-
// Don't crash on architecture label failure, log and continue
67-
klog.Warningf("Failed to create architecture labels: %v", err)
68-
architectureLabels = make(Labels)
58+
return nil, fmt.Errorf("failed to create architecture labels: %v", err)
6959
}
7060

7161
memoryLabeler := (Labeler)(&empty{})
@@ -88,35 +78,25 @@ func NewMIGResourceLabeler(resourceName spec.ResourceName, config *spec.Config,
8878
return empty{}, nil
8979
}
9080

91-
// Check if device is healthy before querying
92-
if err := resource.CheckDeviceHealth(device); err != nil {
93-
klog.Warningf("Skipping unhealthy MIG device in resource labeler: %v", err)
94-
return empty{}, nil
95-
}
96-
9781
parent, err := device.GetDeviceHandleFromMigDeviceHandle()
9882
if err != nil {
99-
klog.Warningf("Failed to get parent of MIG device, skipping: %v", err)
100-
return empty{}, nil
83+
return nil, fmt.Errorf("failed to get parent of MIG device: %v", err)
10184
}
10285
model, err := parent.GetName()
10386
if err != nil {
104-
klog.Warningf("Failed to get device model, skipping: %v", err)
105-
return empty{}, nil
87+
return nil, fmt.Errorf("failed to get device model: %v", err)
10688
}
10789

10890
migProfile, err := device.GetName()
10991
if err != nil {
110-
klog.Warningf("Failed to get MIG profile name, skipping: %v", err)
111-
return empty{}, nil
92+
return nil, fmt.Errorf("failed to get MIG profile name: %v", err)
11293
}
11394

11495
resourceLabeler := newResourceLabeler(resourceName, config)
11596

11697
attributeLabels, err := newMigAttributeLabels(resourceLabeler, device)
11798
if err != nil {
118-
klog.Warningf("Failed to get MIG attribute labels: %v", err)
119-
attributeLabels = make(Labels)
99+
return nil, fmt.Errorf("faled to get MIG attribute labels: %v", err)
120100
}
121101

122102
labelers := Merge(
@@ -272,9 +252,7 @@ func (rl resourceLabeler) replicationInfo() *spec.ReplicatedResource {
272252
func newMigAttributeLabels(rl resourceLabeler, device resource.Device) (Labels, error) {
273253
attributes, err := device.GetAttributes()
274254
if err != nil {
275-
// Return empty labels instead of crashing
276-
klog.Warningf("Unable to get attributes of MIG device, skipping: %v", err)
277-
return make(Labels), nil
255+
return nil, fmt.Errorf("unable to get attributes of MIG device: %v", err)
278256
}
279257

280258
labels := rl.labels(attributes)
@@ -285,9 +263,7 @@ func newMigAttributeLabels(rl resourceLabeler, device resource.Device) (Labels,
285263
func newArchitectureLabels(rl resourceLabeler, device resource.Device) (Labels, error) {
286264
computeMajor, computeMinor, err := device.GetCudaComputeCapability()
287265
if err != nil {
288-
// Return empty labels instead of error - allows labeling to continue
289-
klog.Warningf("Failed to determine CUDA compute capability, skipping architecture labels: %v", err)
290-
return make(Labels), nil
266+
return nil, fmt.Errorf("failed to determine CUDA compute capability: %v", err)
291267
}
292268

293269
if computeMajor == 0 {

internal/resource/device_health.go

Lines changed: 0 additions & 112 deletions
This file was deleted.

0 commit comments

Comments
 (0)