Skip to content

Commit d03a060

Browse files
ArangoGutierrezelezar
authored andcommitted
BUGFIX: modifier: respect GPU volume-mount device requests
The gated modifiers used to add support for GDS, Mofed, and CUDA Forward Comatibility only check the NVIDIA_VISIBLE_DEVICES envvar to determine whether GPUs are requested and modifications should be made. This means that use cases where volume mounts are used to request devices are not supported. This change ensures that device extraction is consistent for all use cases. Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com> Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent f4f7da6 commit d03a060

File tree

6 files changed

+113
-37
lines changed

6 files changed

+113
-37
lines changed

internal/config/image/cuda_image.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ func (i CUDA) VisibleDevices() []string {
270270
}
271271

272272
// Get the Fallback to reading from the environment variable if privileges are correct
273-
envVarDeviceRequests := i.VisibleDevicesFromEnvVar()
273+
envVarDeviceRequests := i.visibleDevicesFromEnvVar()
274274
if len(envVarDeviceRequests) == 0 {
275275
return nil
276276
}
@@ -322,11 +322,11 @@ func (i CUDA) cdiDeviceRequestsFromAnnotations() []string {
322322
return devices
323323
}
324324

325-
// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
325+
// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
326326
// If any of the preferredVisibleDeviceEnvVars are present in the image, they
327327
// are used to determine the visible devices. If this is not the case, the
328328
// NVIDIA_VISIBLE_DEVICES environment variable is used.
329-
func (i CUDA) VisibleDevicesFromEnvVar() []string {
329+
func (i CUDA) visibleDevicesFromEnvVar() []string {
330330
envVars := i.visibleEnvVars()
331331
return i.DevicesFromEnvvars(envVars...).List()
332332
}

internal/config/image/cuda_image_test.go

Lines changed: 88 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
429429
)
430430

431431
require.NoError(t, err)
432-
devices := image.VisibleDevicesFromEnvVar()
432+
devices := image.visibleDevicesFromEnvVar()
433433
require.EqualValues(t, tc.expectedDevices, devices)
434434
})
435435
}
@@ -508,13 +508,15 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) {
508508

509509
func TestVisibleDevices(t *testing.T) {
510510
var tests = []struct {
511-
description string
512-
mountDevices []specs.Mount
513-
envvarDevices string
514-
privileged bool
515-
acceptUnprivileged bool
516-
acceptMounts bool
517-
expectedDevices []string
511+
description string
512+
mountDevices []specs.Mount
513+
envvarDevices string
514+
privileged bool
515+
acceptUnprivileged bool
516+
acceptMounts bool
517+
preferredVisibleDeviceEnvVars []string
518+
env map[string]string
519+
expectedDevices []string
518520
}{
519521
{
520522
description: "Mount devices, unprivileged, no accept unprivileged",
@@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) {
597599
acceptMounts: false,
598600
expectedDevices: nil,
599601
},
602+
// New test cases for visibleEnvVars functionality
603+
{
604+
description: "preferred env var set and present in env, privileged",
605+
mountDevices: nil,
606+
envvarDevices: "",
607+
privileged: true,
608+
acceptUnprivileged: false,
609+
acceptMounts: true,
610+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
611+
env: map[string]string{
612+
"DOCKER_RESOURCE_GPUS": "GPU-12345",
613+
},
614+
expectedDevices: []string{"GPU-12345"},
615+
},
616+
{
617+
description: "preferred env var set and present in env, unprivileged but accepted",
618+
mountDevices: nil,
619+
envvarDevices: "",
620+
privileged: false,
621+
acceptUnprivileged: true,
622+
acceptMounts: true,
623+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
624+
env: map[string]string{
625+
"DOCKER_RESOURCE_GPUS": "GPU-12345",
626+
},
627+
expectedDevices: []string{"GPU-12345"},
628+
},
629+
{
630+
description: "preferred env var set and present in env, unprivileged and not accepted",
631+
mountDevices: nil,
632+
envvarDevices: "",
633+
privileged: false,
634+
acceptUnprivileged: false,
635+
acceptMounts: true,
636+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
637+
env: map[string]string{
638+
"DOCKER_RESOURCE_GPUS": "GPU-12345",
639+
},
640+
expectedDevices: nil,
641+
},
642+
{
643+
description: "multiple preferred env vars, both present, privileged",
644+
mountDevices: nil,
645+
envvarDevices: "",
646+
privileged: true,
647+
acceptUnprivileged: false,
648+
acceptMounts: true,
649+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
650+
env: map[string]string{
651+
"DOCKER_RESOURCE_GPUS": "GPU-12345",
652+
"DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890",
653+
},
654+
expectedDevices: []string{"GPU-12345", "GPU-67890"},
655+
},
656+
{
657+
description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged",
658+
mountDevices: nil,
659+
envvarDevices: "GPU-12345",
660+
privileged: true,
661+
acceptUnprivileged: false,
662+
acceptMounts: true,
663+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
664+
env: map[string]string{
665+
EnvVarNvidiaVisibleDevices: "GPU-12345",
666+
},
667+
expectedDevices: []string{"GPU-12345"},
668+
},
600669
}
601670
for _, tc := range tests {
602671
t.Run(tc.description, func(t *testing.T) {
603-
// Wrap the call to getDevices() in a closure.
672+
// Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars
673+
env := make(map[string]string)
674+
if tc.envvarDevices != "" {
675+
env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices
676+
}
677+
for k, v := range tc.env {
678+
env[k] = v
679+
}
680+
604681
image, err := New(
605-
WithEnvMap(
606-
map[string]string{
607-
EnvVarNvidiaVisibleDevices: tc.envvarDevices,
608-
},
609-
),
682+
WithEnvMap(env),
610683
WithMounts(tc.mountDevices),
611684
WithPrivileged(tc.privileged),
612685
WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts),
613686
WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged),
687+
WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...),
614688
)
615689
require.NoError(t, err)
616690
require.Equal(t, tc.expectedDevices, image.VisibleDevices())

internal/modifier/csv.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ import (
3333
// NewCSVModifier creates a modifier that applies modications to an OCI spec if required by the runtime wrapper.
3434
// The modifications are defined by CSV MountSpecs.
3535
func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) {
36-
if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 {
36+
if devices := container.VisibleDevices(); len(devices) == 0 {
3737
logger.Infof("No modification required; no devices requested")
3838
return nil, nil
3939
}

internal/modifier/gated.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ import (
3737
//
3838
// If not devices are selected, no changes are made.
3939
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
40-
if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
40+
if devices := image.VisibleDevices(); len(devices) == 0 {
4141
logger.Infof("No modification required; no devices requested")
4242
return nil, nil
4343
}

internal/modifier/graphics.go

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,10 @@ import (
2929

3030
// NewGraphicsModifier constructs a modifier that injects graphics-related modifications into an OCI runtime specification.
3131
// The value of the NVIDIA_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made.
32-
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerImage image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
33-
if required, reason := requiresGraphicsModifier(containerImage); !required {
34-
logger.Infof("No graphics modifier required: %v", reason)
32+
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, container image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
33+
devices, reason := requiresGraphicsModifier(container)
34+
if len(devices) == 0 {
35+
logger.Infof("No graphics modifier required; %v", reason)
3536
return nil, nil
3637
}
3738

@@ -48,7 +49,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI
4849
devRoot := driver.Root
4950
drmNodes, err := discover.NewDRMNodesDiscoverer(
5051
logger,
51-
containerImage.DevicesFromEnvvars(image.EnvVarNvidiaVisibleDevices),
52+
image.NewVisibleDevices(devices...),
5253
devRoot,
5354
hookCreator,
5455
)
@@ -64,14 +65,15 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI
6465
}
6566

6667
// requiresGraphicsModifier determines whether a graphics modifier is required.
67-
func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) {
68-
if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 {
69-
return false, "no devices requested"
68+
func requiresGraphicsModifier(cudaImage image.CUDA) ([]string, string) {
69+
devices := cudaImage.VisibleDevices()
70+
if len(devices) == 0 {
71+
return nil, "no devices requested"
7072
}
7173

7274
if !cudaImage.GetDriverCapabilities().Any(image.DriverCapabilityGraphics, image.DriverCapabilityDisplay) {
73-
return false, "no required capabilities requested"
75+
return nil, "no required capabilities requested"
7476
}
7577

76-
return true, ""
78+
return devices, ""
7779
}

internal/modifier/graphics_test.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ import (
2626

2727
func TestGraphicsModifier(t *testing.T) {
2828
testCases := []struct {
29-
description string
30-
envmap map[string]string
31-
expectedRequired bool
29+
description string
30+
envmap map[string]string
31+
expectedDevices []string
3232
}{
3333
{
3434
description: "empty image does not create modifier",
@@ -52,39 +52,39 @@ func TestGraphicsModifier(t *testing.T) {
5252
"NVIDIA_VISIBLE_DEVICES": "all",
5353
"NVIDIA_DRIVER_CAPABILITIES": "all",
5454
},
55-
expectedRequired: true,
55+
expectedDevices: []string{"all"},
5656
},
5757
{
5858
description: "devices with graphics capability creates modifier",
5959
envmap: map[string]string{
6060
"NVIDIA_VISIBLE_DEVICES": "all",
6161
"NVIDIA_DRIVER_CAPABILITIES": "graphics",
6262
},
63-
expectedRequired: true,
63+
expectedDevices: []string{"all"},
6464
},
6565
{
6666
description: "devices with compute,graphics capability creates modifier",
6767
envmap: map[string]string{
6868
"NVIDIA_VISIBLE_DEVICES": "all",
6969
"NVIDIA_DRIVER_CAPABILITIES": "compute,graphics",
7070
},
71-
expectedRequired: true,
71+
expectedDevices: []string{"all"},
7272
},
7373
{
7474
description: "devices with display capability creates modifier",
7575
envmap: map[string]string{
7676
"NVIDIA_VISIBLE_DEVICES": "all",
7777
"NVIDIA_DRIVER_CAPABILITIES": "display",
7878
},
79-
expectedRequired: true,
79+
expectedDevices: []string{"all"},
8080
},
8181
{
8282
description: "devices with display,graphics capability creates modifier",
8383
envmap: map[string]string{
8484
"NVIDIA_VISIBLE_DEVICES": "all",
8585
"NVIDIA_DRIVER_CAPABILITIES": "display,graphics",
8686
},
87-
expectedRequired: true,
87+
expectedDevices: []string{"all"},
8888
},
8989
}
9090

@@ -94,7 +94,7 @@ func TestGraphicsModifier(t *testing.T) {
9494
image.WithEnvMap(tc.envmap),
9595
)
9696
required, _ := requiresGraphicsModifier(image)
97-
require.EqualValues(t, tc.expectedRequired, required)
97+
require.EqualValues(t, tc.expectedDevices, required)
9898
})
9999
}
100100
}

0 commit comments

Comments
 (0)