File tree Expand file tree Collapse file tree 13 files changed +75
-211
lines changed Expand file tree Collapse file tree 13 files changed +75
-211
lines changed Original file line number Diff line number Diff line change @@ -136,6 +136,7 @@ ci-build-images:
136136 @./build/build-image.sh images/argo-executor argo-executor
137137 @./build/build-image.sh images/python-packager python-packager
138138 @./build/build-image.sh images/cluster-autoscaler cluster-autoscaler
139+ @./build/build-image.sh images/nvidia nvidia
139140 @./build/build-image.sh images/metrics-server metrics-server
140141
141142ci-push-images:
@@ -155,6 +156,7 @@ ci-push-images:
155156 @./build/push-image.sh argo-executor
156157 @./build/push-image.sh python-packager
157158 @./build/push-image.sh cluster-autoscaler
159+ @./build/push-image.sh nvidia
158160 @./build/push-image.sh metrics-server
159161
160162
Original file line number Diff line number Diff line change @@ -131,6 +131,7 @@ export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/
131131export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
132132export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"
133133export CORTEX_IMAGE_CLUSTER_AUTOSCALER="${CORTEX_IMAGE_CLUSTER_AUTOSCALER:-cortexlabs/cluster-autoscaler:$CORTEX_VERSION_STABLE}"
134+ export CORTEX_IMAGE_NVIDIA="${CORTEX_IMAGE_NVIDIA:-cortexlabs/nvidia:$CORTEX_VERSION_STABLE}"
134135export CORTEX_IMAGE_METRICS_SERVER="${CORTEX_IMAGE_METRICS_SERVER:-cortexlabs/metrics-server:$CORTEX_VERSION_STABLE}"
135136
136137export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}"
@@ -182,6 +183,7 @@ function install_cortex() {
182183 -e CORTEX_IMAGE_TF_SERVE_GPU=$CORTEX_IMAGE_TF_SERVE_GPU \
183184 -e CORTEX_IMAGE_TF_TRAIN_GPU=$CORTEX_IMAGE_TF_TRAIN_GPU \
184185 -e CORTEX_IMAGE_CLUSTER_AUTOSCALER=$CORTEX_IMAGE_CLUSTER_AUTOSCALER \
186+ -e CORTEX_IMAGE_NVIDIA=$CORTEX_IMAGE_NVIDIA \
185187 -e CORTEX_IMAGE_METRICS_SERVER=$CORTEX_IMAGE_METRICS_SERVER \
186188 -e CORTEX_ENABLE_TELEMETRY=$CORTEX_ENABLE_TELEMETRY \
187189 $CORTEX_IMAGE_MANAGER
Load Diff This file was deleted.
Load Diff This file was deleted.
Original file line number Diff line number Diff line change @@ -51,6 +51,7 @@ function create_registry() {
5151 aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
5252 aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
5353 aws ecr create-repository --repository-name=cortexlabs/cluster-autoscaler --region=$REGISTRY_REGION || true
54+ aws ecr create-repository --repository-name=cortexlabs/nvidia --region=$REGISTRY_REGION || true
5455 aws ecr create-repository --repository-name=cortexlabs/metrics-server --region=$REGISTRY_REGION || true
5556}
5657
@@ -139,6 +140,7 @@ elif [ "$cmd" = "update" ]; then
139140 build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
140141 build_and_push $ROOT/images/python-packager python-packager latest
141142 build_and_push $ROOT/images/cluster-autoscaler cluster-autoscaler latest
143+ build_and_push $ROOT/images/nvidia nvidia latest
142144 build_and_push $ROOT/images/metrics-server metrics-server latest
143145 fi
144146
Original file line number Diff line number Diff line change @@ -25,4 +25,6 @@ One unit of memory is one byte. Memory can be expressed as an integer or by usin
2525
2626## GPU
2727
28- One unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed. Here's some information on [adding GPU enabled nodes on EKS](https://docs.aws.amazon.com/en_ca/eks/latest/userguide/gpu-ami.html).
28+ 1. Please make sure your AWS account is subscribed to the [EKS-optimized AMI with GPU Support](https://aws.amazon.com/marketplace/pp/B07GRHFXGM).
29+ 2. Set CORTEX_NODE_TYPE to an AWS GPU instance type (e.g. p2.xlarge) before installing Cortex.
30+ 3. Note that one unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed.
Original file line number Diff line number Diff line change @@ -52,6 +52,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="cortexlabs/tf-train-gpu:master"
5252export CORTEX_IMAGE_TF_SERVE_GPU="cortexlabs/tf-serve-gpu:master"
5353export CORTEX_IMAGE_PYTHON_PACKAGER="cortexlabs/python-packager:master"
5454export CORTEX_IMAGE_CLUSTER_AUTOSCALER="cortexlabs/cluster-autoscaler:master"
55+ export CORTEX_IMAGE_NVIDIA="cortexlabs/nvidia:master"
5556export CORTEX_IMAGE_METRICS_SERVER="cortexlabs/metrics-server:master"
5657
5758# Flag to enable collecting error reports and usage stats. If flag is not set to either "true" or "false", you will be prompted.
Original file line number Diff line number Diff line change @@ -86,6 +86,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/corte
8686export CORTEX_IMAGE_TF_TRANSFORM="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/tf-transform:latest"
8787export CORTEX_IMAGE_PYTHON_PACKAGER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/python-packager:latest"
8888export CORTEX_IMAGE_CLUSTER_AUTOSCALER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/cluster-autoscaler:latest"
89+ export CORTEX_IMAGE_NVIDIA="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/nvidia:latest"
8990export CORTEX_IMAGE_METRICS_SERVER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/metrics-server:latest"
9091
9192export AWS_ACCESS_KEY_ID="XXXXXX"
Original file line number Diff line number Diff line change 1+ FROM nvidia/k8s-device-plugin:1.0.0-beta
Original file line number Diff line number Diff line change @@ -169,5 +169,6 @@ envsubst < manifests/fluentd.yaml | kubectl apply -f - >/dev/null
169169envsubst < manifests/operator.yaml | kubectl apply -f - > /dev/null
170170envsubst < manifests/cluster-autoscaler.yaml | kubectl apply -f - > /dev/null
171171envsubst < manifests/metrics-server.yaml | kubectl apply -f - > /dev/null
172+ envsubst < manifests/nvidia.yaml | kubectl apply -f - > /dev/null
172173
173174validate_cortex
You can’t perform that action at this time.
0 commit comments