Skip to content

Commit 63f4fe1

Browse files
committed
[no-relnote] Refactor CDI version extraction
Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 1172f27 commit 63f4fe1

File tree

5 files changed

+51
-49
lines changed

5 files changed

+51
-49
lines changed

pkg/nvcdi/common-nvml.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import (
2424

2525
// newCommonNVMLDiscoverer returns a discoverer for entities that are not associated with a specific CDI device.
2626
// This includes driver libraries and meta devices, for example.
27-
func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) {
27+
func (l *nvmllib) newCommonNVMLDiscoverer(version string) (discover.Discover, error) {
2828
metaDevices := discover.NewCharDeviceDiscoverer(
2929
l.logger,
3030
l.devRoot,
@@ -41,7 +41,7 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) {
4141
l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err)
4242
}
4343

44-
driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, l.nvmllib)
44+
driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, version)
4545
if err != nil {
4646
return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err)
4747
}

pkg/nvcdi/driver-nvml.go

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import (
2222
"path/filepath"
2323
"strings"
2424

25-
"github.com/NVIDIA/go-nvml/pkg/nvml"
2625
"golang.org/x/sys/unix"
2726

2827
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
@@ -34,21 +33,7 @@ import (
3433

3534
// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation.
3635
// The supplied version string is used as the expected driver version.
37-
func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, nvmllib nvml.Interface) (discover.Discover, error) {
38-
if r := nvmllib.Init(); r != nvml.SUCCESS {
39-
return nil, fmt.Errorf("failed to initialize NVML: %v", r)
40-
}
41-
defer func() {
42-
if r := nvmllib.Shutdown(); r != nvml.SUCCESS {
43-
logger.Warningf("failed to shutdown NVML: %v", r)
44-
}
45-
}()
46-
47-
version, r := nvmllib.SystemGetDriverVersion()
48-
if r != nvml.SUCCESS {
49-
return nil, fmt.Errorf("failed to determine driver version: %v", r)
50-
}
51-
36+
func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, version string) (discover.Discover, error) {
5237
return newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version)
5338
}
5439

pkg/nvcdi/lib-nvml.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,25 @@ func (l *nvmllib) GetAllDeviceSpecs() ([]specs.Device, error) {
8383

8484
// GetCommonEdits generates a CDI specification that can be used for ANY devices
8585
func (l *nvmllib) GetCommonEdits() (*cdi.ContainerEdits, error) {
86-
common, err := l.newCommonNVMLDiscoverer()
86+
if l.nvsandboxutilslib != nil {
87+
if r := l.nvsandboxutilslib.Init(l.driverRoot); r != nvsandboxutils.SUCCESS {
88+
l.logger.Warningf("Failed to init nvsandboxutils: %v; ignoring", r)
89+
l.nvsandboxutilslib = nil
90+
}
91+
defer func() {
92+
if l.nvsandboxutilslib == nil {
93+
return
94+
}
95+
_ = l.nvsandboxutilslib.Shutdown()
96+
}()
97+
}
98+
99+
version, err := (*nvcdilib)(l).getDriverVersion()
100+
if err != nil {
101+
return nil, fmt.Errorf("failed to get driver version: %v", err)
102+
}
103+
104+
common, err := l.newCommonNVMLDiscoverer(version)
87105
if err != nil {
88106
return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err)
89107
}

pkg/nvcdi/lib.go

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,15 @@ package nvcdi
1818

1919
import (
2020
"fmt"
21+
"path/filepath"
22+
"strings"
2123

2224
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
2325
"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
2426
"github.com/NVIDIA/go-nvml/pkg/nvml"
2527

2628
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
29+
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda"
2730
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
2831
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils"
2932
"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv"
@@ -184,18 +187,36 @@ func (l *nvcdilib) resolveMode() (rmode string) {
184187
return ModeNvml
185188
}
186189

187-
// getCudaVersion returns the CUDA version of the current system.
188-
func (l *nvcdilib) getCudaVersion() (string, error) {
189-
version, err := l.getCudaVersionNvsandboxutils()
190-
if err == nil {
190+
// getDriverVersion returns the driver version of the current system.
191+
func (l *nvcdilib) getDriverVersion() (string, error) {
192+
if version, err := l.getDriverVersionNvsandboxutils(); err == nil && version != "" {
191193
return version, err
192194
}
193195

194196
// Fallback to NVML
195-
return l.getCudaVersionNvml()
197+
if version, err := l.getDriverVersionNvml(); err == nil && version != "" {
198+
return version, err
199+
}
200+
201+
// Fallback to getting the version from the libcuda.so suffix.
202+
return l.getDriverVersionLibcudaSo()
203+
}
204+
205+
func (l *nvcdilib) getDriverVersionLibcudaSo() (string, error) {
206+
libCudaPaths, err := cuda.New(
207+
l.driver.Libraries(),
208+
).Locate(".*.*")
209+
if err != nil {
210+
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
211+
}
212+
libCudaPath := libCudaPaths[0]
213+
214+
version := strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")
215+
216+
return version, nil
196217
}
197218

198-
func (l *nvcdilib) getCudaVersionNvml() (string, error) {
219+
func (l *nvcdilib) getDriverVersionNvml() (string, error) {
199220
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
200221
return "", fmt.Errorf("nvml not detected: %v", reason)
201222
}
@@ -219,7 +240,7 @@ func (l *nvcdilib) getCudaVersionNvml() (string, error) {
219240
return version, nil
220241
}
221242

222-
func (l *nvcdilib) getCudaVersionNvsandboxutils() (string, error) {
243+
func (l *nvcdilib) getDriverVersionNvsandboxutils() (string, error) {
223244
if l.nvsandboxutilslib == nil {
224245
return "", fmt.Errorf("libnvsandboxutils is not available")
225246
}

pkg/nvcdi/management.go

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ import (
2727

2828
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
2929
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
30-
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda"
3130
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils"
3231
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
3332
)
@@ -75,7 +74,7 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
7574
}()
7675
}
7776

78-
version, err := m.getCudaVersion()
77+
version, err := (*nvcdilib)(m).getDriverVersion()
7978
if err != nil {
8079
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
8180
}
@@ -93,27 +92,6 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
9392
return edits, nil
9493
}
9594

96-
// getCudaVersion returns the CUDA version for use in managementlib containers.
97-
func (m *managementlib) getCudaVersion() (string, error) {
98-
version, err := (*nvcdilib)(m).getCudaVersion()
99-
if err == nil {
100-
return version, nil
101-
}
102-
103-
libCudaPaths, err := cuda.New(
104-
m.driver.Libraries(),
105-
).Locate(".*.*")
106-
if err != nil {
107-
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
108-
}
109-
110-
libCudaPath := libCudaPaths[0]
111-
112-
version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")
113-
114-
return version, nil
115-
}
116-
11795
type managementDiscoverer struct {
11896
discover.Discover
11997
}

0 commit comments

Comments
 (0)