Skip to content

Commit 0f9036c

Browse files
committed
[nvidia-ctk-installer] do not revert cri-o config on shutdown
This commit updates the behavior of the nvidia-ctk-installer for cri-o. On shutdown, we no longer delete the drop-in config file as long as none of the nvidia runtime handlers are set as the default runtime. This change was made to work around an issue observed when uninstalling the gpu-operator -- management containers launched with the nvidia runtime handler would get stuck in the terminating state with the below error message: ``` failed to find runtime handler nvidia from runtime list map[crun:... runc:...], failed to "KillPodSandbox" for ... ``` There appears to be a race condition where the nvidia-ctk-installer removes the drop-in file and restarts cri-o. After the cri-o restart, if there are still pods / containers to terminate that were started with the nvidia runtime, then cri-o fails to terminate them. The behavior of cri-o, and its in-memory runtime handler cache, appears to differ from that of containerd as we have never encountered such an issue with containerd. This commit can be considered a stop-gap solution until a more robust solution is developed. Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
1 parent 4fd94d7 commit 0f9036c

File tree

2 files changed

+4
-5
lines changed

2 files changed

+4
-5
lines changed

cmd/nvidia-ctk-installer/container/runtime/crio/config_test.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ func TestCrioConfigLifecycle(t *testing.T) {
8888
},
8989
assertCleanupPostConditions: func(t *testing.T, co *container.Options, _ *Options) error {
9090
require.NoFileExists(t, co.TopLevelConfigPath)
91-
require.NoFileExists(t, co.DropInConfig)
9291
return nil
9392
},
9493
},
@@ -182,8 +181,6 @@ signature_policy = "/etc/crio/policy.json"
182181
assertCleanupPostConditions: func(t *testing.T, co *container.Options, o *Options) error {
183182
require.FileExists(t, co.TopLevelConfigPath)
184183

185-
require.NoFileExists(t, co.DropInConfig)
186-
187184
actualTopLevel, err := os.ReadFile(co.TopLevelConfigPath)
188185
require.NoError(t, err)
189186

@@ -480,8 +477,6 @@ plugin_dirs = [
480477
`
481478
require.Equal(t, expected, string(actual))
482479

483-
require.NoFileExists(t, co.DropInConfig)
484-
485480
return nil
486481
},
487482
},

cmd/nvidia-ctk-installer/container/runtime/crio/crio.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,10 @@ func cleanupHook(co *Options) error {
180180

181181
// cleanupConfig removes the NVIDIA container runtime from the cri-o config
182182
func cleanupConfig(o *container.Options) error {
183+
if !o.SetAsDefault {
184+
return nil
185+
}
186+
183187
log.Infof("Reverting config file modifications")
184188

185189
cfg, err := getRuntimeConfig(o)

0 commit comments

Comments
 (0)