Skip to content

Commit cca0fe8

Browse files
authored
Expose GPUs using device driver (#1366)
1 parent 56433dc commit cca0fe8

File tree

3 files changed

+66
-54
lines changed

3 files changed

+66
-54
lines changed

cli/local/docker_spec.go

Lines changed: 63 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ func deployPythonContainer(api *spec.API, awsClient *aws.Client) error {
124124
portBinding.HostPort = s.Int(*api.Networking.LocalPort)
125125
}
126126

127-
runtime := ""
128127
resources := container.Resources{}
129128
if api.Compute != nil {
130129
if api.Compute.CPU != nil {
@@ -134,15 +133,19 @@ func deployPythonContainer(api *spec.API, awsClient *aws.Client) error {
134133
resources.Memory = api.Compute.Mem.Quantity.Value()
135134
}
136135
if api.Compute.GPU > 0 {
137-
runtime = "nvidia"
136+
resources.DeviceRequests = []container.DeviceRequest{{
137+
Count: -1,
138+
Capabilities: [][]string{
139+
{"gpu"},
140+
},
141+
}}
138142
}
139143
}
140144

141145
hostConfig := &container.HostConfig{
142146
PortBindings: nat.PortMap{
143147
_defaultPortStr + "/tcp": []nat.PortBinding{portBinding},
144148
},
145-
Runtime: runtime,
146149
Resources: resources,
147150
Mounts: []mount.Mount{
148151
{
@@ -183,7 +186,13 @@ func deployPythonContainer(api *spec.API, awsClient *aws.Client) error {
183186

184187
err = docker.MustDockerClient().ContainerStart(context.Background(), containerInfo.ID, dockertypes.ContainerStartOptions{})
185188
if err != nil {
186-
return errors.Wrap(err, api.Identify())
189+
if api.Compute.GPU == 0 {
190+
return errors.Wrap(err, api.Identify())
191+
}
192+
err := retryWithNvidiaRuntime(err, containerConfig, hostConfig)
193+
if err != nil {
194+
return errors.Wrap(err, api.Identify())
195+
}
187196
}
188197

189198
return nil
@@ -195,7 +204,6 @@ func deployONNXContainer(api *spec.API, awsClient *aws.Client) error {
195204
portBinding.HostPort = s.Int(*api.Networking.LocalPort)
196205
}
197206

198-
runtime := ""
199207
resources := container.Resources{}
200208
if api.Compute != nil {
201209
if api.Compute.CPU != nil {
@@ -205,7 +213,12 @@ func deployONNXContainer(api *spec.API, awsClient *aws.Client) error {
205213
resources.Memory = api.Compute.Mem.Quantity.Value()
206214
}
207215
if api.Compute.GPU > 0 {
208-
runtime = "nvidia"
216+
resources.DeviceRequests = []container.DeviceRequest{{
217+
Count: -1,
218+
Capabilities: [][]string{
219+
{"gpu"},
220+
},
221+
}}
209222
}
210223
}
211224

@@ -233,7 +246,6 @@ func deployONNXContainer(api *spec.API, awsClient *aws.Client) error {
233246
PortBindings: nat.PortMap{
234247
_defaultPortStr + "/tcp": []nat.PortBinding{portBinding},
235248
},
236-
Runtime: runtime,
237249
Resources: resources,
238250
Mounts: mounts,
239251
}
@@ -264,14 +276,19 @@ func deployONNXContainer(api *spec.API, awsClient *aws.Client) error {
264276

265277
err = docker.MustDockerClient().ContainerStart(context.Background(), containerInfo.ID, dockertypes.ContainerStartOptions{})
266278
if err != nil {
267-
return errors.Wrap(err, api.Identify())
279+
if api.Compute.GPU == 0 {
280+
return errors.Wrap(err, api.Identify())
281+
}
282+
err := retryWithNvidiaRuntime(err, containerConfig, hostConfig)
283+
if err != nil {
284+
return errors.Wrap(err, api.Identify())
285+
}
268286
}
269287

270288
return nil
271289
}
272290

273291
func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
274-
serveRuntime := ""
275292
serveResources := container.Resources{}
276293
apiResources := container.Resources{}
277294

@@ -287,7 +304,12 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
287304
serveResources.Memory = totalMemory - apiResources.Memory
288305
}
289306
if api.Compute.GPU > 0 {
290-
serveRuntime = "nvidia"
307+
serveResources.DeviceRequests = append(serveResources.DeviceRequests, container.DeviceRequest{
308+
Count: -1,
309+
Capabilities: [][]string{
310+
{"gpu"},
311+
},
312+
})
291313
}
292314
}
293315

@@ -301,7 +323,6 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
301323
}
302324

303325
serveHostConfig := &container.HostConfig{
304-
Runtime: serveRuntime,
305326
Resources: serveResources,
306327
Mounts: mounts,
307328
}
@@ -349,7 +370,13 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
349370

350371
err = docker.MustDockerClient().ContainerStart(context.Background(), containerCreateRequest.ID, dockertypes.ContainerStartOptions{})
351372
if err != nil {
352-
return errors.Wrap(err, api.Identify())
373+
if api.Compute.GPU == 0 {
374+
return errors.Wrap(err, api.Identify())
375+
}
376+
err := retryWithNvidiaRuntime(err, serveContainerConfig, serveHostConfig)
377+
if err != nil {
378+
return errors.Wrap(err, api.Identify())
379+
}
353380
}
354381

355382
containerInfo, err := docker.MustDockerClient().ContainerInspect(context.Background(), containerCreateRequest.ID)
@@ -416,6 +443,30 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
416443
return nil
417444
}
418445

446+
// Retries deploying a container requiring GPU using nvidia runtime, returns original error if isn't relevant, nil if successful and new error if a retry was attempted but failed
447+
func retryWithNvidiaRuntime(err error, containerConfig *container.Config, hostConfig *container.HostConfig) error {
448+
// error message if device driver may look like 'could not select device driver "" with capabilities: [[gpu]]'
449+
if !(strings.Contains(err.Error(), "could not select device driver") && strings.Contains(err.Error(), "gpu")) {
450+
return err
451+
}
452+
453+
if _, ok := docker.MustDockerClient().Info.Runtimes["nvidia"]; ok {
454+
fmt.Println("retrying API deployment using nvidia runtime because device driver for GPU was not found")
455+
hostConfig.Runtime = "nvidia"
456+
hostConfig.Resources.DeviceRequests = nil
457+
containerCreateRequest, err := docker.MustDockerClient().ContainerCreate(context.Background(), containerConfig, hostConfig, nil, "")
458+
if err != nil {
459+
return errors.Wrap(err, "failed to request a GPU")
460+
}
461+
err = docker.MustDockerClient().ContainerStart(context.Background(), containerCreateRequest.ID, dockertypes.ContainerStartOptions{})
462+
if err != nil {
463+
return errors.Wrap(err, "failed to run a container using nvidia runtime; it is recommended to use the latest Docker Engine (https://docs.docker.com/engine/install/) with nvidia-container-runtime or nvidia-container-toolkit (https://docs.docker.com/config/containers/resource_constraints/#gpu)")
464+
}
465+
return nil
466+
}
467+
return errors.Append(errors.Wrap(err, "failed to allocate GPU"), "\n\n* only NVIDIA gpus are supported\n* please make sure that you've set up nvidia-container-runtime or nvidia-container-toolkit for your Docker Engine correctly (https://docs.docker.com/config/containers/resource_constraints/#gpu)\n\nAlternatively, try deploying the API without requesting a GPU by updating `compute.gpu` in your API configuration yaml")
468+
}
469+
419470
func GetContainersByAPI(apiName string) ([]dockertypes.Container, error) {
420471
dargs := filters.NewArgs()
421472
dargs.Add("label", "cortex=true")

cli/local/get.go

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@ package local
1818

1919
import (
2020
"encoding/json"
21+
"fmt"
2122
"path/filepath"
2223
"strings"
2324

2425
"github.com/cortexlabs/cortex/pkg/consts"
2526
"github.com/cortexlabs/cortex/pkg/lib/docker"
2627
"github.com/cortexlabs/cortex/pkg/lib/errors"
2728
"github.com/cortexlabs/cortex/pkg/lib/files"
28-
s "github.com/cortexlabs/cortex/pkg/lib/strings"
2929
"github.com/cortexlabs/cortex/pkg/operator/schema"
3030
"github.com/cortexlabs/cortex/pkg/types/spec"
3131
)
@@ -161,19 +161,14 @@ func GetAPI(apiName string) (schema.GetAPIResponse, error) {
161161
apiContainer = containers[1]
162162
}
163163

164-
apiPort := ""
165-
for _, port := range apiContainer.Ports {
166-
if port.PrivatePort == 8888 {
167-
apiPort = s.Uint16(port.PublicPort)
168-
}
169-
}
164+
apiPort := apiSpec.Networking.LocalPort
170165

171166
return schema.GetAPIResponse{
172167
RealtimeAPI: &schema.RealtimeAPI{
173168
Spec: *apiSpec,
174169
Status: apiStatus,
175170
Metrics: apiMetrics,
176-
Endpoint: "http://localhost:" + apiPort,
171+
Endpoint: fmt.Sprintf("http://localhost:%d", *apiPort),
177172
},
178173
}, nil
179174
}

cli/local/validations.go

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,8 @@ import (
2121
"math"
2222
"net"
2323
"path/filepath"
24-
"runtime"
2524
"strings"
2625

27-
"github.com/cortexlabs/cortex/pkg/consts"
2826
"github.com/cortexlabs/cortex/pkg/lib/aws"
2927
"github.com/cortexlabs/cortex/pkg/lib/docker"
3028
"github.com/cortexlabs/cortex/pkg/lib/errors"
@@ -117,7 +115,6 @@ func ValidateLocalAPIs(apis []userconfig.API, projectFiles ProjectFiles, awsClie
117115
return err
118116
}
119117

120-
apisRequiringGPU := strset.New()
121118
for i := range apis {
122119
api := &apis[i]
123120

@@ -128,37 +125,6 @@ func ValidateLocalAPIs(apis []userconfig.API, projectFiles ProjectFiles, awsClie
128125
if api.Compute.CPU != nil && (api.Compute.CPU.MilliValue() > int64(dockerClient.Info.NCPU)*1000) {
129126
api.Compute.CPU = k8s.NewQuantity(int64(dockerClient.Info.NCPU))
130127
}
131-
132-
if api.Compute.GPU > 0 {
133-
apisRequiringGPU.Add(api.Name)
134-
}
135-
}
136-
137-
if len(apisRequiringGPU) > 0 {
138-
if _, ok := dockerClient.Info.Runtimes["nvidia"]; !ok {
139-
if !strings.HasPrefix(strings.ToLower(runtime.GOOS), "linux") {
140-
fmt.Printf("warning: %s will run without gpu access because the nvidia container runtime is not supported on your operating system; see https://docs.cortex.dev/troubleshooting/nvidia-container-runtime-not-found for more information\n\n", s.StrsAnd(apisRequiringGPU.SliceSorted()))
141-
} else {
142-
fmt.Printf("warning: %s will run without gpu access because your local machine doesn't have a gpu or the nvidia container runtime is not configured properly; see https://docs.cortex.dev/troubleshooting/nvidia-container-runtime-not-found for more information\n\n", s.StrsAnd(apisRequiringGPU.SliceSorted()))
143-
}
144-
145-
for i := range apis {
146-
api := &apis[i]
147-
if apisRequiringGPU.Has(api.Name) {
148-
api.Compute.GPU = 0
149-
}
150-
switch api.Predictor.Image {
151-
case consts.DefaultImageONNXPredictorGPU:
152-
api.Predictor.Image = consts.DefaultImageONNXPredictorCPU
153-
case consts.DefaultImagePythonPredictorGPU:
154-
api.Predictor.Image = consts.DefaultImagePythonPredictorCPU
155-
}
156-
157-
if api.Predictor.Type == userconfig.TensorFlowPredictorType && api.Predictor.TensorFlowServingImage == consts.DefaultImageTensorFlowServingGPU {
158-
api.Predictor.TensorFlowServingImage = consts.DefaultImageTensorFlowServingCPU
159-
}
160-
}
161-
}
162128
}
163129

164130
dups := spec.FindDuplicateNames(apis)

0 commit comments

Comments (0)