@@ -21,11 +21,69 @@ import (
2121 "fmt"
2222 "os"
2323 "os/exec"
24+ "strings"
25+ "text/template"
2426 "time"
2527
2628 "golang.org/x/crypto/ssh"
2729)
2830
// Shell script templates used to set up a nested (docker-in-docker) test container.
31+ const (
// startTestContainerTemplate is a text/template that renders a detached
// `docker run` command. It starts a privileged `ubuntu` container named
// .ContainerName with the nvidia runtime, running `sleep infinity`; every
// entry of .AdditionalArguments is emitted as an extra command-line argument.
32+ startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
33+ -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
34+ -e NVIDIA_DRIVER_CAPABILITIES=all \
35+ {{ range $i, $a := .AdditionalArguments -}}
36+ {{ $a }} \
37+ {{ end -}}
38+ ubuntu sleep infinity`
39+
// installDockerTemplate is a shell script that installs docker-ce from
// Docker's apt repository, starts dockerd in the background and waits up to
// 30 seconds for `docker info` to succeed, exiting with status 1 otherwise.
// Despite its name it contains no template actions; NewNestedContainerRunner
// passes it verbatim inside `bash -c '...'`, so it must not contain single
// quotes.
40+ installDockerTemplate = `
41+ export DEBIAN_FRONTEND=noninteractive
42+
43+ # Add Docker official GPG key:
44+ apt-get update
45+ apt-get install -y ca-certificates curl apt-utils gnupg2
46+ install -m 0755 -d /etc/apt/keyrings
47+ curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
48+ chmod a+r /etc/apt/keyrings/docker.asc
49+
50+ # Add the repository to Apt sources:
51+ echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
52+ apt-get update
53+
54+ apt-get install -y docker-ce docker-ce-cli containerd.io
55+
56+ # start dockerd in the background
57+ dockerd &
58+
59+ # wait for dockerd to be ready with timeout
60+ timeout=30
61+ elapsed=0
62+ while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do
63+ echo "Waiting for dockerd to be ready..."
64+ sleep 1
65+ elapsed=$((elapsed + 1))
66+ done
67+ if [ $elapsed -ge $timeout ]; then
68+ echo "Docker failed to start within $timeout seconds"
69+ exit 1
70+ fi
71+ `
72+
// installCTKTemplate is a text/template that renders a shell script. The
// script copies /artifacts out of the "{{.ToolkitImage}}-packaging" image into
// a temporary directory, installs the ubuntu18.04/amd64 .deb packages for
// libnvidia-container1, nvidia-container-toolkit-base and
// libnvidia-container-tools with dpkg, and finally runs `nvidia-ctk --version`.
73+ installCTKTemplate = `
74+ # Create a temporary directory and rootfs path
75+ TMPDIR="$(mktemp -d)"
76+
77+ # Expose TMPDIR for the child namespace
78+ export TMPDIR
79+
80+ docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/"
81+ dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb
82+
83+ nvidia-ctk --version
84+ `
85+ )
86+
// localRunner runs scripts on the local machine; its Run method hands the
// script to `bash -c`. It carries no state.
2987type localRunner struct {}
3088type remoteRunner struct {
3189 sshKey string
@@ -34,7 +92,13 @@ type remoteRunner struct {
3492 port string
3593}
3694
// nestedContainerRunner runs scripts inside a named Docker container by
// issuing `docker exec` commands through another Runner.
95+ type nestedContainerRunner struct {
// runner is the underlying Runner that executes the `docker exec` command.
96+ runner Runner
// containerName is the name of the container the scripts are executed in.
97+ containerName string
98+ }
99+
// runnerOption is a functional option that mutates a remoteRunner.
37100type runnerOption func (* remoteRunner )
// nestedContainerOption is a functional option that mutates a nestedContainerRunner.
101+ type nestedContainerOption func (* nestedContainerRunner )
38102
39103type Runner interface {
40104 Run (script string ) (string , string , error )
@@ -79,6 +143,112 @@ func NewRunner(opts ...runnerOption) Runner {
79143 return r
80144}
81145
146+ // NewNestedContainerRunner creates a new nested container runner.
147+ // A nested container runs a container inside another container based on a
148+ // given runner (remote or local).
149+ func NewNestedContainerRunner (runner Runner , installCTK bool , image string , containerName string , opts ... nestedContainerOption ) (Runner , error ) {
150+ additionalContainerArguments := []string {}
151+
152+ // If a container with the same name exists from a previous test run, remove it first.
153+ // Ignore errors as container might not exist
154+ _ , _ , err := runner .Run (fmt .Sprintf ("docker rm -f %s 2>/dev/null || true" , containerName )) //nolint:errcheck
155+ if err != nil {
156+ return nil , fmt .Errorf ("failed to remove container: %w" , err )
157+ }
158+
159+ if installCTK {
160+ installer , err := NewToolkitInstaller (
161+ WithRunner (runner ),
162+ WithImage (image ),
163+ WithTemplate (dockerInstallTemplate ),
164+ )
165+ if err != nil {
166+ return nil , fmt .Errorf ("failed to create toolkit installer: %w" , err )
167+ }
168+
169+ err = installer .Install ()
170+ if err != nil {
171+ return nil , fmt .Errorf ("failed to install toolkit: %w" , err )
172+ }
173+ } else {
174+ // If installCTK is false, we use the preinstalled toolkit.
175+ // TODO: This should be updated for other distributions and other components of the toolkit.
176+ output , _ , err := runner .Run ("ls /lib/**/libnvidia-container*.so.*.*" )
177+ if err != nil {
178+ return nil , fmt .Errorf ("failed to list toolkit libraries: %w" , err )
179+ }
180+
181+ output = strings .TrimSpace (output )
182+ if output == "" {
183+ return nil , fmt .Errorf ("no toolkit libraries found" ) //nolint:goerr113
184+ }
185+
186+ for _ , lib := range strings .Split (output , "\n " ) {
187+ additionalContainerArguments = append (additionalContainerArguments , "-v " + lib + ":" + lib )
188+ }
189+ additionalContainerArguments = append (additionalContainerArguments , "-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli" )
190+ }
191+
192+ // Launch the container in detached mode.
193+ var startContainerScriptBuilder strings.Builder
194+ startContainerTemplate , err := template .New ("startContainer" ).Parse (startTestContainerTemplate )
195+ if err != nil {
196+ return nil , fmt .Errorf ("failed to parse start container template: %w" , err )
197+ }
198+ err = startContainerTemplate .Execute (& startContainerScriptBuilder , struct {
199+ ContainerName string
200+ AdditionalArguments []string
201+ }{
202+ ContainerName : containerName ,
203+ AdditionalArguments : additionalContainerArguments ,
204+ })
205+ if err != nil {
206+ return nil , fmt .Errorf ("failed to execute start container template: %w" , err )
207+ }
208+
209+ startContainerScript := startContainerScriptBuilder .String ()
210+ _ , _ , err = runner .Run (startContainerScript )
211+ if err != nil {
212+ return nil , fmt .Errorf ("failed to run start container script: %w" , err )
213+ }
214+
215+ // install docker in the nested container
216+ _ , _ , err = runner .Run (fmt .Sprintf ("docker exec -u root " + containerName + " bash -c '%s'" , installDockerTemplate ))
217+ if err != nil {
218+ return nil , fmt .Errorf ("failed to install docker: %w" , err )
219+ }
220+
221+ if installCTK {
222+ // Install nvidia-container-cli in the container.
223+ tmpl , err := template .New ("toolkitInstall" ).Parse (installCTKTemplate )
224+ if err != nil {
225+ return nil , fmt .Errorf ("failed to parse installCTK template: %w" , err )
226+ }
227+
228+ var toolkitInstall strings.Builder
229+ err = tmpl .Execute (& toolkitInstall , struct {
230+ ToolkitImage string
231+ }{
232+ ToolkitImage : image ,
233+ })
234+ if err != nil {
235+ return nil , fmt .Errorf ("failed to execute installCTK template: %w" , err )
236+ }
237+
238+ _ , _ , err = runner .Run (fmt .Sprintf ("docker exec -u root " + containerName + " bash -c '%s'" , toolkitInstall .String ()))
239+ if err != nil {
240+ return nil , fmt .Errorf ("failed to install nvidia-container-cli: %w" , err )
241+ }
242+ }
243+
244+ nc := & nestedContainerRunner {
245+ runner : runner ,
246+ containerName : containerName ,
247+ }
248+
249+ return nc , nil
250+ }
251+
82252func (l localRunner ) Run (script string ) (string , string , error ) {
83253 // Create a command to run the script using bash
84254 cmd := exec .Command ("bash" , "-c" , script )
@@ -131,6 +301,10 @@ func (r remoteRunner) Run(script string) (string, string, error) {
131301 return stdout .String (), "" , nil
132302}
133303
304+ func (r nestedContainerRunner ) Run (script string ) (string , string , error ) {
305+ return r .runner .Run (fmt .Sprintf ("docker exec -u root " + r .containerName + " bash -c '%s'" , script ))
306+ }
307+
134308// connectOrDie creates an SSH client, and retries if it fails to connect
135309func connectOrDie (sshKey , sshUser , host , port string ) (* ssh.Client , error ) {
136310 var client * ssh.Client
0 commit comments