Skip to content

Commit 6149592

Browse files
committed
Default to jit-cdi mode in the nvidia runtime
Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent d3ece78 commit 6149592

File tree

6 files changed

+72
-78
lines changed

6 files changed

+72
-78
lines changed

cmd/nvidia-container-runtime/main_test.go

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,10 @@ func TestGoodInput(t *testing.T) {
122122
err = cmdCreate.Run()
123123
require.NoError(t, err, "runtime should not return an error")
124124

125-
// Check config.json for NVIDIA prestart hook
125+
// Check config.json to ensure that the NVIDIA prestart hook was not inserted.
126126
spec, err = cfg.getRuntimeSpec()
127127
require.NoError(t, err, "should be no errors when reading and parsing spec from config.json")
128-
require.NotEmpty(t, spec.Hooks, "there should be hooks in config.json")
129-
require.Equal(t, 1, nvidiaHookCount(spec.Hooks), "exactly one nvidia prestart hook should be inserted correctly into config.json")
128+
require.Empty(t, spec.Hooks, "there should be no hooks in config.json")
130129
}
131130

132131
// NVIDIA prestart hook already present in config file
@@ -168,11 +167,10 @@ func TestDuplicateHook(t *testing.T) {
168167
output, err := cmdCreate.CombinedOutput()
169168
require.NoErrorf(t, err, "runtime should not return an error", "output=%v", string(output))
170169

171-
// Check config.json for NVIDIA prestart hook
170+
// Check config.json to ensure that the NVIDIA prestart hook was removed.
172171
spec, err = cfg.getRuntimeSpec()
173172
require.NoError(t, err, "should be no errors when reading and parsing spec from config.json")
174-
require.NotEmpty(t, spec.Hooks, "there should be hooks in config.json")
175-
require.Equal(t, 1, nvidiaHookCount(spec.Hooks), "exactly one nvidia prestart hook should be inserted correctly into config.json")
173+
require.Empty(t, spec.Hooks, "there should be no hooks in config.json")
176174
}
177175

178176
// addNVIDIAHook is a basic wrapper for an addHookModifier that is used for
@@ -240,18 +238,3 @@ func (c testConfig) generateNewRuntimeSpec() error {
240238
}
241239
return nil
242240
}
243-
244-
// Return number of valid NVIDIA prestart hooks in runtime spec
245-
func nvidiaHookCount(hooks *specs.Hooks) int {
246-
if hooks == nil {
247-
return 0
248-
}
249-
250-
count := 0
251-
for _, hook := range hooks.Prestart {
252-
if strings.Contains(hook.Path, nvidiaHook) {
253-
count++
254-
}
255-
}
256-
return count
257-
}

internal/info/auto.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ const (
4141
// to the container config required for the requested CDI devices in the
4242
// same way that other CDI clients would.
4343
CDIRuntimeMode = RuntimeMode("cdi")
44+
// In JitCDIRuntimeMode the nvidia-container-runtime generates in-memory CDI
45+
// specifications for requested NVIDIA devices.
46+
JitCDIRuntimeMode = RuntimeMode("jit-cdi")
4447
)
4548

4649
type RuntimeModeResolver interface {
@@ -116,9 +119,9 @@ func (m *modeResolver) ResolveRuntimeMode(mode string) (rmode RuntimeMode) {
116119

117120
switch nvinfo.ResolvePlatform() {
118121
case info.PlatformNVML, info.PlatformWSL:
119-
return LegacyRuntimeMode
122+
return JitCDIRuntimeMode
120123
case info.PlatformTegra:
121124
return CSVRuntimeMode
122125
}
123-
return LegacyRuntimeMode
126+
return JitCDIRuntimeMode
124127
}

internal/info/auto_test.go

Lines changed: 14 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,16 @@ func TestResolveAutoMode(t *testing.T) {
4343
mode: "not-auto",
4444
expectedMode: "not-auto",
4545
},
46+
{
47+
description: "legacy resolves to legacy",
48+
mode: "legacy",
49+
expectedMode: "legacy",
50+
},
4651
{
4752
description: "no info defaults to legacy",
4853
mode: "auto",
4954
info: map[string]bool{},
50-
expectedMode: "legacy",
55+
expectedMode: "jit-cdi",
5156
},
5257
{
5358
description: "non-nvml, non-tegra, nvgpu resolves to csv",
@@ -80,14 +85,14 @@ func TestResolveAutoMode(t *testing.T) {
8085
expectedMode: "csv",
8186
},
8287
{
83-
description: "nvml, non-tegra, non-nvgpu resolves to legacy",
88+
description: "nvml, non-tegra, non-nvgpu resolves to jit-cdi",
8489
mode: "auto",
8590
info: map[string]bool{
8691
"nvml": true,
8792
"tegra": false,
8893
"nvgpu": false,
8994
},
90-
expectedMode: "legacy",
95+
expectedMode: "jit-cdi",
9196
},
9297
{
9398
description: "nvml, non-tegra, nvgpu resolves to csv",
@@ -100,14 +105,14 @@ func TestResolveAutoMode(t *testing.T) {
100105
expectedMode: "csv",
101106
},
102107
{
103-
description: "nvml, tegra, non-nvgpu resolves to legacy",
108+
description: "nvml, tegra, non-nvgpu resolves to jit-cdi",
104109
mode: "auto",
105110
info: map[string]bool{
106111
"nvml": true,
107112
"tegra": true,
108113
"nvgpu": false,
109114
},
110-
expectedMode: "legacy",
115+
expectedMode: "jit-cdi",
111116
},
112117
{
113118
description: "nvml, tegra, nvgpu resolves to csv",
@@ -136,7 +141,7 @@ func TestResolveAutoMode(t *testing.T) {
136141
},
137142
},
138143
{
139-
description: "at least one non-cdi device resolves to legacy",
144+
description: "at least one non-cdi device resolves to jit-cdi",
140145
mode: "auto",
141146
envmap: map[string]string{
142147
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
@@ -146,7 +151,7 @@ func TestResolveAutoMode(t *testing.T) {
146151
"tegra": false,
147152
"nvgpu": false,
148153
},
149-
expectedMode: "legacy",
154+
expectedMode: "jit-cdi",
150155
},
151156
{
152157
description: "at least one non-cdi device resolves to csv",
@@ -170,7 +175,7 @@ func TestResolveAutoMode(t *testing.T) {
170175
expectedMode: "cdi",
171176
},
172177
{
173-
description: "cdi mount and non-CDI devices resolves to legacy",
178+
description: "cdi mount and non-CDI devices resolves to jit-cdi",
174179
mode: "auto",
175180
mounts: []string{
176181
"/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/0",
@@ -181,7 +186,7 @@ func TestResolveAutoMode(t *testing.T) {
181186
"tegra": false,
182187
"nvgpu": false,
183188
},
184-
expectedMode: "legacy",
189+
expectedMode: "jit-cdi",
185190
},
186191
{
187192
description: "cdi mount and non-CDI envvar resolves to cdi",
@@ -199,22 +204,6 @@ func TestResolveAutoMode(t *testing.T) {
199204
},
200205
expectedMode: "cdi",
201206
},
202-
{
203-
description: "non-cdi mount and CDI envvar resolves to legacy",
204-
mode: "auto",
205-
envmap: map[string]string{
206-
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0",
207-
},
208-
mounts: []string{
209-
"/var/run/nvidia-container-devices/0",
210-
},
211-
info: map[string]bool{
212-
"nvml": true,
213-
"tegra": false,
214-
"nvgpu": false,
215-
},
216-
expectedMode: "legacy",
217-
},
218207
}
219208

220209
for _, tc := range testCases {

internal/modifier/cdi.go

Lines changed: 34 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package modifier
1818

1919
import (
2020
"fmt"
21+
"strings"
2122

2223
"tags.cncf.io/container-device-interface/pkg/parser"
2324

@@ -27,17 +28,27 @@ import (
2728
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
2829
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
2930
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
30-
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
31+
)
32+
33+
const (
34+
automaticDeviceVendor = "runtime.nvidia.com"
35+
automaticDeviceClass = "gpu"
36+
automaticDeviceKind = automaticDeviceVendor + "/" + automaticDeviceClass
37+
automaticDevicePrefix = automaticDeviceKind + "="
3138
)
3239

3340
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
3441
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
3542
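// used to select the devices to include. If isJitCDI is true, the default device kind is runtime.nvidia.com/gpu instead of the configured CDI default kind.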
36-
func NewCDIModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
43+
func NewCDIModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, isJitCDI bool) (oci.SpecModifier, error) {
44+
defaultKind := cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind
45+
if isJitCDI {
46+
defaultKind = automaticDeviceKind
47+
}
3748
deviceRequestor := newCDIDeviceRequestor(
3849
logger,
3950
image,
40-
cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind,
51+
defaultKind,
4152
)
4253
devices := deviceRequestor.DeviceRequests()
4354
if len(devices) == 0 {
@@ -107,50 +118,46 @@ func (c *cdiDeviceRequestor) DeviceRequests() []string {
107118
func filterAutomaticDevices(devices []string) []string {
108119
var automatic []string
109120
for _, device := range devices {
110-
vendor, class, _ := parser.ParseDevice(device)
111-
if vendor == "runtime.nvidia.com" && class == "gpu" {
112-
automatic = append(automatic, device)
121+
if !strings.HasPrefix(device, automaticDevicePrefix) {
122+
continue
113123
}
124+
automatic = append(automatic, device)
114125
}
115126
return automatic
116127
}
117128

118129
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
119130
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
120-
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
121-
if err != nil {
122-
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
123-
}
124-
cdiDeviceRequestor, err := cdi.New(
125-
cdi.WithLogger(logger),
126-
cdi.WithSpec(spec.Raw()),
127-
)
128-
if err != nil {
129-
return nil, fmt.Errorf("failed to construct CDI modifier: %w", err)
130-
}
131131

132-
return cdiDeviceRequestor, nil
133-
}
132+
var identifiers []string
133+
for _, device := range devices {
134+
identifiers = append(identifiers, strings.TrimPrefix(device, automaticDevicePrefix))
135+
}
134136

135-
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
136137
cdilib, err := nvcdi.New(
137138
nvcdi.WithLogger(logger),
138139
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
139140
nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root),
140-
nvcdi.WithVendor("runtime.nvidia.com"),
141-
nvcdi.WithClass("gpu"),
141+
nvcdi.WithVendor(automaticDeviceVendor),
142+
nvcdi.WithClass(automaticDeviceClass),
142143
)
143144
if err != nil {
144145
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
145146
}
146147

147-
var identifiers []string
148-
for _, device := range devices {
149-
_, _, id := parser.ParseDevice(device)
150-
identifiers = append(identifiers, id)
148+
spec, err := cdilib.GetSpec(identifiers...)
149+
if err != nil {
150+
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
151+
}
152+
cdiDeviceRequestor, err := cdi.New(
153+
cdi.WithLogger(logger),
154+
cdi.WithSpec(spec.Raw()),
155+
)
156+
if err != nil {
157+
return nil, fmt.Errorf("failed to construct CDI modifier: %w", err)
151158
}
152159

153-
return cdilib.GetSpec(identifiers...)
160+
return cdiDeviceRequestor, nil
154161
}
155162

156163
type deduplicatedDeviceRequestor struct {

internal/modifier/cdi_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,18 @@ func TestDeviceRequests(t *testing.T) {
7070
},
7171
expectedDevices: []string{"nvidia.com/gpu=0", "example.com/class=device"},
7272
},
73+
{
74+
description: "cdi devices from envvar with default kind",
75+
input: cdiDeviceRequestor{
76+
defaultKind: "runtime.nvidia.com/gpu",
77+
},
78+
spec: &specs.Spec{
79+
Process: &specs.Process{
80+
Env: []string{"NVIDIA_VISIBLE_DEVICES=all"},
81+
},
82+
},
83+
expectedDevices: []string{"runtime.nvidia.com/gpu=all"},
84+
},
7385
{
7486
description: "no matching annotations",
7587
prefixes: []string{"not-prefix/"},

internal/runtime/runtime_factory.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,8 @@ func newModeModifier(logger logger.Interface, mode info.RuntimeMode, cfg *config
107107
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
108108
case info.CSVRuntimeMode:
109109
return modifier.NewCSVModifier(logger, cfg, image)
110-
case info.CDIRuntimeMode:
111-
return modifier.NewCDIModifier(logger, cfg, image)
110+
case info.CDIRuntimeMode, info.JitCDIRuntimeMode:
111+
return modifier.NewCDIModifier(logger, cfg, image, mode == info.JitCDIRuntimeMode)
112112
}
113113

114114
return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)
@@ -160,7 +160,7 @@ func initRuntimeModeAndImage(logger logger.Interface, cfg *config.Config, ociSpe
160160
// supportedModifierTypes returns the modifiers supported for a specific runtime mode.
161161
func supportedModifierTypes(mode info.RuntimeMode) []string {
162162
switch mode {
163-
case info.CDIRuntimeMode:
163+
case info.CDIRuntimeMode, info.JitCDIRuntimeMode:
164164
// For the CDI and JIT-CDI modes we make no additional modifications.
165165
return []string{"nvidia-hook-remover", "mode"}
166166
case info.CSVRuntimeMode:

0 commit comments

Comments
 (0)