Skip to content

Commit b99e767

Browse files
authored
Add GPU support (#197)
1 parent e81ffb7 commit b99e767

File tree

13 files changed

+75
-211
lines changed

13 files changed

+75
-211
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ ci-build-images:
136136
@./build/build-image.sh images/argo-executor argo-executor
137137
@./build/build-image.sh images/python-packager python-packager
138138
@./build/build-image.sh images/cluster-autoscaler cluster-autoscaler
139+
@./build/build-image.sh images/nvidia nvidia
139140
@./build/build-image.sh images/metrics-server metrics-server
140141

141142
ci-push-images:
@@ -155,6 +156,7 @@ ci-push-images:
155156
@./build/push-image.sh argo-executor
156157
@./build/push-image.sh python-packager
157158
@./build/push-image.sh cluster-autoscaler
159+
@./build/push-image.sh nvidia
158160
@./build/push-image.sh metrics-server
159161

160162

cortex.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/
131131
export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
132132
export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"
133133
export CORTEX_IMAGE_CLUSTER_AUTOSCALER="${CORTEX_IMAGE_CLUSTER_AUTOSCALER:-cortexlabs/cluster-autoscaler:$CORTEX_VERSION_STABLE}"
134+
export CORTEX_IMAGE_NVIDIA="${CORTEX_IMAGE_NVIDIA:-cortexlabs/nvidia:$CORTEX_VERSION_STABLE}"
134135
export CORTEX_IMAGE_METRICS_SERVER="${CORTEX_IMAGE_METRICS_SERVER:-cortexlabs/metrics-server:$CORTEX_VERSION_STABLE}"
135136

136137
export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}"
@@ -182,6 +183,7 @@ function install_cortex() {
182183
-e CORTEX_IMAGE_TF_SERVE_GPU=$CORTEX_IMAGE_TF_SERVE_GPU \
183184
-e CORTEX_IMAGE_TF_TRAIN_GPU=$CORTEX_IMAGE_TF_TRAIN_GPU \
184185
-e CORTEX_IMAGE_CLUSTER_AUTOSCALER=$CORTEX_IMAGE_CLUSTER_AUTOSCALER \
186+
-e CORTEX_IMAGE_NVIDIA=$CORTEX_IMAGE_NVIDIA \
185187
-e CORTEX_IMAGE_METRICS_SERVER=$CORTEX_IMAGE_METRICS_SERVER \
186188
-e CORTEX_ENABLE_TELEMETRY=$CORTEX_ENABLE_TELEMETRY \
187189
$CORTEX_IMAGE_MANAGER

dev/eks.sh

Lines changed: 0 additions & 45 deletions
This file was deleted.

dev/kops.sh

Lines changed: 0 additions & 163 deletions
This file was deleted.

dev/registry.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ function create_registry() {
5151
aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
5252
aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
5353
aws ecr create-repository --repository-name=cortexlabs/cluster-autoscaler --region=$REGISTRY_REGION || true
54+
aws ecr create-repository --repository-name=cortexlabs/nvidia --region=$REGISTRY_REGION || true
5455
aws ecr create-repository --repository-name=cortexlabs/metrics-server --region=$REGISTRY_REGION || true
5556
}
5657

@@ -139,6 +140,7 @@ elif [ "$cmd" = "update" ]; then
139140
build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
140141
build_and_push $ROOT/images/python-packager python-packager latest
141142
build_and_push $ROOT/images/cluster-autoscaler cluster-autoscaler latest
143+
build_and_push $ROOT/images/nvidia nvidia latest
142144
build_and_push $ROOT/images/metrics-server metrics-server latest
143145
fi
144146

docs/apis/compute.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,6 @@ One unit of memory is one byte. Memory can be expressed as an integer or by usin
2525

2626
## GPU
2727

28-
One unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed. Here's some information on [adding GPU enabled nodes on EKS](https://docs.aws.amazon.com/en_ca/eks/latest/userguide/gpu-ami.html).
28+
1. Make sure your AWS account is subscribed to the [EKS-optimized AMI with GPU Support](https://aws.amazon.com/marketplace/pp/B07GRHFXGM) on the AWS Marketplace.
29+
2. Set `CORTEX_NODE_TYPE` to an AWS GPU instance type (e.g. `p2.xlarge`) before installing Cortex.
30+
3. Note that one unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed.

docs/cluster/config.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="cortexlabs/tf-train-gpu:master"
5252
export CORTEX_IMAGE_TF_SERVE_GPU="cortexlabs/tf-serve-gpu:master"
5353
export CORTEX_IMAGE_PYTHON_PACKAGER="cortexlabs/python-packager:master"
5454
export CORTEX_IMAGE_CLUSTER_AUTOSCALER="cortexlabs/cluster-autoscaler:master"
55+
export CORTEX_IMAGE_NVIDIA="cortexlabs/nvidia:master"
5556
export CORTEX_IMAGE_METRICS_SERVER="cortexlabs/metrics-server:master"
5657

5758
# Flag to enable collecting error reports and usage stats. If flag is not set to either "true" or "false", you will be prompted.

docs/cluster/development.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/corte
8686
export CORTEX_IMAGE_TF_TRANSFORM="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/tf-transform:latest"
8787
export CORTEX_IMAGE_PYTHON_PACKAGER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/python-packager:latest"
8888
export CORTEX_IMAGE_CLUSTER_AUTOSCALER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/cluster-autoscaler:latest"
89+
export CORTEX_IMAGE_NVIDIA="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/nvidia:latest"
8990
export CORTEX_IMAGE_METRICS_SERVER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/metrics-server:latest"
9091

9192
export AWS_ACCESS_KEY_ID="XXXXXX"

images/nvidia/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
FROM nvidia/k8s-device-plugin:1.0.0-beta

manager/install_cortex.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,5 +169,6 @@ envsubst < manifests/fluentd.yaml | kubectl apply -f - >/dev/null
169169
envsubst < manifests/operator.yaml | kubectl apply -f - >/dev/null
170170
envsubst < manifests/cluster-autoscaler.yaml | kubectl apply -f - >/dev/null
171171
envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null
172+
envsubst < manifests/nvidia.yaml | kubectl apply -f - >/dev/null
172173

173174
validate_cortex

0 commit comments

Comments
 (0)