
Commit bb0b341

[no-relnote] implement a nestedContainerRunner for E2E test suite
Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
1 parent 8d6e5cf commit bb0b341

File tree
2 files changed: +182, -129 lines changed
  tests/e2e/nvidia-container-cli_test.go
  tests/e2e/runner.go

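Note: the commit replaces the inline docker bootstrap in the E2E spec with a NewNestedContainerRunner helper that hides the nested container behind the existing Runner interface. Before the diff, a minimal sketch of the intended call pattern; it uses only identifiers that appear in this commit, and assumes the suite-level variables (sshUser, installCTK, imageName, imageTag) plus whatever other NewRunner options the suite already passes:

// Sketch only, not code from this commit: expected wiring of the new helper.
// sshUser, installCTK, imageName and imageTag are suite-level variables assumed
// to exist as in the test file below.
runner := NewRunner(WithSshUser(sshUser)) // plus the host/key options the suite already uses

nested, err := NewNestedContainerRunner(runner, installCTK, imageName+":"+imageTag, "node-container-e2e")
Expect(err).ToNot(HaveOccurred())

// Scripts now execute inside the nested container via docker exec.
output, _, err := nested.Run("nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(strings.TrimSpace(output)).ToNot(BeEmpty())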

tests/e2e/nvidia-container-cli_test.go

Lines changed: 8 additions & 129 deletions
@@ -20,58 +20,12 @@ import (
     "context"
     "fmt"
     "strings"
-    "text/template"
 
     . "github.com/onsi/ginkgo/v2"
     . "github.com/onsi/gomega"
 )
 
 const (
-    installDockerTemplate = `
-export DEBIAN_FRONTEND=noninteractive
-
-# Add Docker official GPG key:
-apt-get update
-apt-get install -y ca-certificates curl apt-utils gnupg2
-install -m 0755 -d /etc/apt/keyrings
-curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-chmod a+r /etc/apt/keyrings/docker.asc
-
-# Add the repository to Apt sources:
-echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
-apt-get update
-
-apt-get install -y docker-ce docker-ce-cli containerd.io
-
-# start dockerd in the background
-dockerd &
-
-# wait for dockerd to be ready with timeout
-timeout=30
-elapsed=0
-while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do
-echo "Waiting for dockerd to be ready..."
-sleep 1
-elapsed=$((elapsed + 1))
-done
-if [ $elapsed -ge $timeout ]; then
-echo "Docker failed to start within $timeout seconds"
-exit 1
-fi
-`
-    installCTKTemplate = `
-# Create a temporary directory and rootfs path
-TMPDIR="$(mktemp -d)"
-
-# Expose TMPDIR for the child namespace
-export TMPDIR
-
-docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/"
-dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb
-
-nvidia-container-cli --version
-`
-
     libnvidiaContainerCliTestTemplate = `
 # Create a temporary directory and rootfs path
 TMPDIR="$(mktemp -d)"
@@ -113,22 +67,14 @@ unshare --mount --pid --fork --propagation private -- sh -eux <<'\''IN_NS'\''
 
 IN_NS
 `
-
-    startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
--e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
--e NVIDIA_DRIVER_CAPABILITIES=all \
-{{ range $i, $a := .AdditionalArguments -}}
-{{ $a }} \
-{{ end -}}
-ubuntu sleep infinity`
 )
 
 var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libnvidia-container"), func() {
     var (
-        runner                       Runner
-        containerName                = "node-container-e2e"
-        hostOutput                   string
-        additionalContainerArguments []string
+        runner                Runner
+        nestedContainerRunner Runner
+        containerName         = "node-container-e2e"
+        hostOutput            string
     )
 
     BeforeAll(func(ctx context.Context) {
@@ -139,44 +85,16 @@ var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libn
             WithSshUser(sshUser),
         )
 
-        if installCTK {
-            installer, err := NewToolkitInstaller(
-                WithRunner(runner),
-                WithImage(imageName+":"+imageTag),
-                WithTemplate(dockerInstallTemplate),
-            )
-            Expect(err).ToNot(HaveOccurred())
-
-            err = installer.Install()
-            Expect(err).ToNot(HaveOccurred())
-        } else {
-            // If installCTK is false, we use the preinstalled toolkit.
-            // TODO: This should be updated for other distributions and other components of the toolkit.
-            output, _, err := runner.Run("ls /lib/**/libnvidia-container*.so.*.*")
-            Expect(err).ToNot(HaveOccurred())
-
-            output = strings.TrimSpace(output)
-            Expect(output).ToNot(BeEmpty())
-
-            for _, lib := range strings.Split(output, "\n") {
-                additionalContainerArguments = append(additionalContainerArguments, "-v "+lib+":"+lib)
-            }
-            additionalContainerArguments = append(additionalContainerArguments,
-                "-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli",
-            )
-        }
+        var err error
+        nestedContainerRunner, err = NewNestedContainerRunner(runner, installCTK, imageName+":"+imageTag, containerName)
+        Expect(err).ToNot(HaveOccurred())
 
         // Capture the host GPU list.
-        var err error
         hostOutput, _, err = runner.Run("nvidia-smi -L")
         Expect(err).ToNot(HaveOccurred())
 
         // Normalize the output once
         hostOutput = strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", ""))
-
-        // If a container with the same name exists from a previous test run, remove it first.
-        // Ignore errors as container might not exist
-        runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck
     })
 
     AfterAll(func(ctx context.Context) {
@@ -186,47 +104,8 @@ var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libn
     })
 
     It("should report the same GPUs inside the container as on the host", func(ctx context.Context) {
-        // Launch the container in detached mode.
-        var startContainerScriptBuilder strings.Builder
-        startContainerTemplate, err := template.New("startContainer").Parse(startTestContainerTemplate)
-        Expect(err).ToNot(HaveOccurred())
-        err = startContainerTemplate.Execute(&startContainerScriptBuilder, struct {
-            ContainerName       string
-            AdditionalArguments []string
-        }{
-            ContainerName:       containerName,
-            AdditionalArguments: additionalContainerArguments,
-        })
-        Expect(err).ToNot(HaveOccurred())
-
-        startContainerScript := startContainerScriptBuilder.String()
-        GinkgoLogr.Info("Starting test container", "script", startContainerScript)
-        _, _, err = runner.Run(startContainerScript)
-        Expect(err).ToNot(HaveOccurred())
-
-        // Install docker in the container.
-        _, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", installDockerTemplate))
-        Expect(err).ToNot(HaveOccurred())
-
-        if installCTK {
-            // Install nvidia-container-cli in the container.
-            tmpl, err := template.New("toolkitInstall").Parse(installCTKTemplate)
-            Expect(err).ToNot(HaveOccurred())
-
-            var toolkitInstall strings.Builder
-            err = tmpl.Execute(&toolkitInstall, struct {
-                ToolkitImage string
-            }{
-                ToolkitImage: imageName + ":" + imageTag,
-            })
-            Expect(err).ToNot(HaveOccurred())
-
-            _, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", toolkitInstall.String()))
-            Expect(err).ToNot(HaveOccurred())
-        }
-
         // Run the test script in the container.
-        output, _, err := runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", libnvidiaContainerCliTestTemplate))
+        output, _, err := nestedContainerRunner.Run(libnvidiaContainerCliTestTemplate)
         Expect(err).ToNot(HaveOccurred())
         Expect(strings.TrimSpace(output)).ToNot(BeEmpty())
         Expect(hostOutput).To(ContainSubstring(strings.TrimSpace(output)))
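Since the container bootstrap now happens once in BeforeAll and everything implements Runner, later specs can reuse nestedContainerRunner directly. A hypothetical follow-up spec (not part of this commit) illustrating that reuse:

// Hypothetical additional spec, not in this commit: reuses the nested runner
// created in BeforeAll, so no docker plumbing is repeated here.
It("reports a usable nvidia-container-cli inside the nested container", func(ctx context.Context) {
    output, _, err := nestedContainerRunner.Run("nvidia-container-cli --version")
    Expect(err).ToNot(HaveOccurred())
    Expect(strings.TrimSpace(output)).ToNot(BeEmpty())
})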

tests/e2e/runner.go

Lines changed: 174 additions & 0 deletions
@@ -21,11 +21,69 @@ import (
     "fmt"
     "os"
     "os/exec"
+    "strings"
+    "text/template"
     "time"
 
     "golang.org/x/crypto/ssh"
 )
 
+const (
+    startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
+-e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
+-e NVIDIA_DRIVER_CAPABILITIES=all \
+{{ range $i, $a := .AdditionalArguments -}}
+{{ $a }} \
+{{ end -}}
+ubuntu sleep infinity`
+
+    installDockerTemplate = `
+export DEBIAN_FRONTEND=noninteractive
+
+# Add Docker official GPG key:
+apt-get update
+apt-get install -y ca-certificates curl apt-utils gnupg2
+install -m 0755 -d /etc/apt/keyrings
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+chmod a+r /etc/apt/keyrings/docker.asc
+
+# Add the repository to Apt sources:
+echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
+apt-get update
+
+apt-get install -y docker-ce docker-ce-cli containerd.io
+
+# start dockerd in the background
+dockerd &
+
+# wait for dockerd to be ready with timeout
+timeout=30
+elapsed=0
+while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do
+echo "Waiting for dockerd to be ready..."
+sleep 1
+elapsed=$((elapsed + 1))
+done
+if [ $elapsed -ge $timeout ]; then
+echo "Docker failed to start within $timeout seconds"
+exit 1
+fi
+`
+
+    installCTKTemplate = `
+# Create a temporary directory and rootfs path
+TMPDIR="$(mktemp -d)"
+
+# Expose TMPDIR for the child namespace
+export TMPDIR
+
+docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/"
+dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb
+
+nvidia-ctk --version
+`
+)
+
 type localRunner struct{}
 type remoteRunner struct {
     sshKey string
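Side note on the const block above: startTestContainerTemplate is rendered with text/template, where the range over AdditionalArguments emits one flag per line and the trailing "-}}" trims the newline so the backslash continuations stay valid. A standalone sketch (not from the commit; it copies a trimmed version of the template for illustration):

package main // standalone illustration; the real template lives in the e2e test package

import (
    "os"
    "text/template"
)

// A trimmed copy of startTestContainerTemplate from this diff.
const startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
{{ range $i, $a := .AdditionalArguments -}}
{{ $a }} \
{{ end -}}
ubuntu sleep infinity`

func main() {
    tmpl := template.Must(template.New("startContainer").Parse(startTestContainerTemplate))
    // Renders a multi-line docker run command with one extra flag per argument.
    _ = tmpl.Execute(os.Stdout, struct {
        ContainerName       string
        AdditionalArguments []string
    }{
        ContainerName:       "node-container-e2e",
        AdditionalArguments: []string{"-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli"},
    })
}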
@@ -34,7 +92,13 @@ type remoteRunner struct {
     port string
 }
 
+type nestedContainerRunner struct {
+    runner        Runner
+    containerName string
+}
+
 type runnerOption func(*remoteRunner)
+type nestedContainerOption func(*nestedContainerRunner)
 
 type Runner interface {
     Run(script string) (string, string, error)
@@ -79,6 +143,112 @@ func NewRunner(opts ...runnerOption) Runner {
     return r
 }
 
+// NewNestedContainerRunner creates a new nested container runner.
+// A nested container runs a container inside another container based on a
+// given runner (remote or local).
+func NewNestedContainerRunner(runner Runner, installCTK bool, image string, containerName string, opts ...nestedContainerOption) (Runner, error) {
+    additionalContainerArguments := []string{}
+
+    // If a container with the same name exists from a previous test run, remove it first.
+    // Ignore errors as container might not exist
+    _, _, err := runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck
+    if err != nil {
+        return nil, fmt.Errorf("failed to remove container: %w", err)
+    }
+
+    if installCTK {
+        installer, err := NewToolkitInstaller(
+            WithRunner(runner),
+            WithImage(image),
+            WithTemplate(dockerInstallTemplate),
+        )
+        if err != nil {
+            return nil, fmt.Errorf("failed to create toolkit installer: %w", err)
+        }
+
+        err = installer.Install()
+        if err != nil {
+            return nil, fmt.Errorf("failed to install toolkit: %w", err)
+        }
+    } else {
+        // If installCTK is false, we use the preinstalled toolkit.
+        // TODO: This should be updated for other distributions and other components of the toolkit.
+        output, _, err := runner.Run("ls /lib/**/libnvidia-container*.so.*.*")
+        if err != nil {
+            return nil, fmt.Errorf("failed to list toolkit libraries: %w", err)
+        }
+
+        output = strings.TrimSpace(output)
+        if output == "" {
+            return nil, fmt.Errorf("no toolkit libraries found") //nolint:goerr113
+        }
+
+        for _, lib := range strings.Split(output, "\n") {
+            additionalContainerArguments = append(additionalContainerArguments, "-v "+lib+":"+lib)
+        }
+        additionalContainerArguments = append(additionalContainerArguments, "-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli")
+    }
+
+    // Launch the container in detached mode.
+    var startContainerScriptBuilder strings.Builder
+    startContainerTemplate, err := template.New("startContainer").Parse(startTestContainerTemplate)
+    if err != nil {
+        return nil, fmt.Errorf("failed to parse start container template: %w", err)
+    }
+    err = startContainerTemplate.Execute(&startContainerScriptBuilder, struct {
+        ContainerName       string
+        AdditionalArguments []string
+    }{
+        ContainerName:       containerName,
+        AdditionalArguments: additionalContainerArguments,
+    })
+    if err != nil {
+        return nil, fmt.Errorf("failed to execute start container template: %w", err)
+    }
+
+    startContainerScript := startContainerScriptBuilder.String()
+    _, _, err = runner.Run(startContainerScript)
+    if err != nil {
+        return nil, fmt.Errorf("failed to run start container script: %w", err)
+    }
+
+    // install docker in the nested container
+    _, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", installDockerTemplate))
+    if err != nil {
+        return nil, fmt.Errorf("failed to install docker: %w", err)
+    }
+
+    if installCTK {
+        // Install nvidia-container-cli in the container.
+        tmpl, err := template.New("toolkitInstall").Parse(installCTKTemplate)
+        if err != nil {
+            return nil, fmt.Errorf("failed to parse installCTK template: %w", err)
+        }
+
+        var toolkitInstall strings.Builder
+        err = tmpl.Execute(&toolkitInstall, struct {
+            ToolkitImage string
+        }{
+            ToolkitImage: image,
+        })
+        if err != nil {
+            return nil, fmt.Errorf("failed to execute installCTK template: %w", err)
+        }
+
+        _, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", toolkitInstall.String()))
+        if err != nil {
+            return nil, fmt.Errorf("failed to install nvidia-container-cli: %w", err)
+        }
+    }
+
+    nc := &nestedContainerRunner{
+        runner:        runner,
+        containerName: containerName,
+    }
+
+    return nc, nil
+}
+
 func (l localRunner) Run(script string) (string, string, error) {
     // Create a command to run the script using bash
     cmd := exec.Command("bash", "-c", script)
@@ -131,6 +301,10 @@ func (r remoteRunner) Run(script string) (string, string, error) {
     return stdout.String(), "", nil
 }
 
+func (r nestedContainerRunner) Run(script string) (string, string, error) {
+    return r.runner.Run(fmt.Sprintf("docker exec -u root "+r.containerName+" bash -c '%s'", script))
+}
+
 // createSshClient creates a ssh client, and retries if it fails to connect
 func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {
     var client *ssh.Client
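One behavioral note on nestedContainerRunner.Run: it wraps the script in single quotes for bash -c, so any script that itself contains single quotes must escape them as '\'' (the existing libnvidiaContainerCliTestTemplate already does this around its IN_NS heredoc). A small sketch (not part of the commit) that makes the wrapping visible by pairing the runner with a hypothetical recording fake; the package name e2e is assumed from the tests/e2e directory:

package e2e // assumed package name; relies on nestedContainerRunner and Runner from this diff

import "fmt"

// recordingRunner is a hypothetical fake Runner that records the last command
// it was asked to execute instead of running anything.
type recordingRunner struct{ last string }

func (r *recordingRunner) Run(script string) (string, string, error) {
    r.last = script
    return "", "", nil
}

// demoNestedWrapping shows the command that nestedContainerRunner.Run hands to
// the underlying runner.
func demoNestedWrapping() {
    rec := &recordingRunner{}
    nested := nestedContainerRunner{runner: rec, containerName: "node-container-e2e"}

    _, _, _ = nested.Run("nvidia-smi -L")

    // Prints: docker exec -u root node-container-e2e bash -c 'nvidia-smi -L'
    fmt.Println(rec.last)
}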
