Commit c61e568

Update versions of Cortex dependencies (#1886)

Parent: bb1e7d1

23 files changed: +174 −147 lines

build/build-image.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -30,8 +30,8 @@ fi
 build_args=""

 if [ "${image}" == "python-predictor-gpu" ]; then
-  cuda=("10.0" "10.1" "10.1" "10.2" "10.2" "11.0" "11.1")
-  cudnn=("7" "7" "8" "7" "8" "8" "8")
+  cuda=("10.0" "10.1" "10.1" "10.2" "10.2" "11.0" "11.1" "11.2")
+  cudnn=("7" "7" "8" "7" "8" "8" "8" "8")
   for i in ${!cudnn[@]}; do
     build_args="${build_args} --build-arg CUDA_VERSION=${cuda[$i]} --build-arg CUDNN=${cudnn[$i]}"
     docker build "$ROOT" -f $ROOT/images/$image/Dockerfile $build_args -t quay.io/cortexlabs/${image}:${CORTEX_VERSION}-cuda${cuda[$i]}-cudnn${cudnn[$i]}
```
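For context, the paired `cuda`/`cudnn` arrays in this script (and in `build/push-image.sh` below) expand into one image tag per CUDA/cuDNN combination. A minimal sketch of that expansion, with `CORTEX_VERSION` as an illustrative placeholder (the real scripts take it from the build environment):

```bash
#!/usr/bin/env bash
# Sketch: print the image tags the build/push loops generate.
image="python-predictor-gpu"
CORTEX_VERSION="x.y.z"  # placeholder for this example

cuda=("10.0" "10.1" "10.1" "10.2" "10.2" "11.0" "11.1" "11.2")
cudnn=("7" "7" "8" "7" "8" "8" "8" "8")

# Each index pairs cuda[i] with cudnn[i], e.g. CUDA 10.1 ships with both cuDNN 7 and 8.
for i in "${!cudnn[@]}"; do
  echo "quay.io/cortexlabs/${image}:${CORTEX_VERSION}-cuda${cuda[$i]}-cudnn${cudnn[$i]}"
done
```

This prints eight tags, ending with `quay.io/cortexlabs/python-predictor-gpu:x.y.z-cuda11.2-cudnn8`, the combination added by this commit.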

build/push-image.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -24,8 +24,8 @@ image=$1
 echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin

 if [ "$image" == "python-predictor-gpu" ]; then
-  cuda=("10.0" "10.1" "10.1" "10.2" "10.2" "11.0" "11.1")
-  cudnn=("7" "7" "8" "7" "8" "8" "8")
+  cuda=("10.0" "10.1" "10.1" "10.2" "10.2" "11.0" "11.1" "11.2")
+  cudnn=("7" "7" "8" "7" "8" "8" "8" "8")
   for i in ${!cudnn[@]}; do
     docker push quay.io/cortexlabs/${image}:${CORTEX_VERSION}-cuda${cuda[$i]}-cudnn${cudnn[$i]}
   done
```

dev/versions.md

Lines changed: 15 additions & 20 deletions
````diff
@@ -143,18 +143,21 @@ python versions in our pip dependencies (e.g. [tensorflow](https://pypi.org/proj
 ## TensorFlow / TensorFlow Serving

 1. Find the latest release on [GitHub](https://github.com/tensorflow/tensorflow/releases)
-1. Search the codebase for the current minor TensorFlow version (e.g. `2.3`) and update versions as appropriate
+1. Search the codebase for the current minor TensorFlow version (e.g. `2.4`) and update versions as appropriate
+1. Update the libnvinfer version in `images/tensorflow-serving-gpu/Dockerfile` as appropriate (https://www.tensorflow.org/install/gpu)

 Note: it's ok if example training notebooks aren't upgraded, as long as the exported model still works

 ## CUDA/cuDNN

-1. Search the codebase for the previous CUDA version and `cudnn`
+1. Search the codebase for the previous CUDA version and `cudnn`. Prefer the CUDA version that does not require a special pip command when installing PyTorch.

 ## ONNX runtime

 1. Update the version in `images/onnx-predictor-cpu/Dockerfile`
    and `images/onnx-predictor-gpu/Dockerfile` ([releases](https://github.com/microsoft/onnxruntime/releases))
+   * Use the appropriate CUDA/cuDNN version in `images/onnx-predictor-gpu/Dockerfile` ([docs](https://github.com/microsoft/onnxruntime/blob/master/BUILD.md#CUDA))
+   * Search the codebase for the previous version
 1. Search the codebase for the previous ONNX runtime version

 ## Nvidia device plugin
@@ -163,10 +166,11 @@ Note: it's ok if example training notebooks aren't upgraded, as long as the expo
    , [Dockerhub](https://hub.docker.com/r/nvidia/k8s-device-plugin))
 1. In the [GitHub Repo](https://github.com/NVIDIA/k8s-device-plugin), find the latest release and go to this file (
    replacing the version number): <https://github.com/NVIDIA/k8s-device-plugin/blob/v0.6.0/nvidia-device-plugin.yml>
-1. Copy the contents to `manager/manifests/nvidia.yaml`
+1. Copy the contents to `manager/manifests/nvidia_aws.yaml`
 1. Update the link at the top of the file to the URL you copied from
 1. Check that your diff is reasonable (and put back any of our modifications, e.g. the image path, rolling update
    strategy, resource requests, tolerations, node selector, priority class, etc)
+1. For `manager/manifests/nvidia_gcp.yaml`, follow the instructions [here](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers)
 1. Confirm GPUs work for PyTorch, TensorFlow, and ONNX models

 ## Inferentia device plugin
@@ -188,10 +192,10 @@ Note: it's ok if example training notebooks aren't upgraded, as long as the expo

 1. `docker run --rm -it amazonlinux:2`
 1. Run the `echo $'[neuron] ...' > /etc/yum.repos.d/neuron.repo` command
-   from [Dockerfile.neuron-rtd](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/docker-example/Dockerfile.neuron-rtd) (
-   it needs to be updated to work properly with the new lines)
-1. Run `yum info aws-neuron-tools` and `yum info aws-neuron-runtime` to check the versions that were installed, and use
-   those versions in `images/neuron-rtd/Dockerfile`
+   from [Dockerfile.neuron-rtd](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/docker-example/Dockerfile.neuron-rtd) (it needs to be updated to work properly with the new lines)
+   * e.g. `echo $'[neuron] \nname=Neuron YUM Repository \nbaseurl=https://yum.repos.neuron.amazonaws.com \nenabled=1' > /etc/yum.repos.d/neuron.repo`
+1. Run `yum info aws-neuron-tools`, `yum info aws-neuron-runtime`, and `yum info procps-ng` to check the versions
+   that were installed, and use those versions in `images/neuron-rtd/Dockerfile`
 1. Check if there are any updates
    to [Dockerfile.neuron-rtd](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/docker-example/Dockerfile.neuron-rtd)
    which should be brought in to `images/neuron-rtd/Dockerfile`
@@ -268,19 +272,10 @@ Note: it's ok if example training notebooks aren't upgraded, as long as the expo
 1. Find the latest release on [GitHub](https://github.com/kubernetes-incubator/metrics-server/releases) and check the
    changelog
 1. Update the version in `images/metrics-server/Dockerfile`
-1. In the [GitHub Repo](https://github.com/kubernetes-incubator/metrics-server), find the latest release and go to this
-   directory (replacing the version
-   number): <https://github.com/kubernetes-incubator/metrics-server/tree/v0.3.7/deploy/1.8+>
-1. Copy the contents of all of the files in that directory into `manager/manifests/metrics-server.yaml`
-1. Update this line of config:
-
-   ```yaml
-   image: $CORTEX_IMAGE_METRICS_SERVER
-   ```
-
-1. Update the link at the top of the file to the URL you copied from
-1. Check that your diff is reasonable (there may have been other modifications to the file which should be
-   preserved, like resource requests)
+1. Download the manifest referenced in the latest release's changelog
+1. Copy the contents of the manifest into `manager/manifests/metrics-server.yaml`
+1. Update accordingly (e.g. image, pull policy, resource requests)
+1. Check that your diff is reasonable
 1. You can confirm the metrics server is running by showing the logs of the metrics-server pod, or
    via `kubectl get deployment metrics-server -n kube-system`
    and `kubectl get apiservice v1beta1.metrics.k8s.io -o yaml`
````
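The verification commands at the end of the metrics-server section can be run together; a minimal sketch (resource names assume the default `kube-system` install, as in the manifest):

```bash
# Confirm the metrics-server deployment is up and the APIService is registered.
kubectl get deployment metrics-server -n kube-system
kubectl get apiservice v1beta1.metrics.k8s.io -o yaml

# Inspect the pod logs for errors (pod names vary; targeting the deployment avoids that).
kubectl logs -n kube-system deployment/metrics-server
```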

docs/workloads/batch/predictors.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -90,7 +90,7 @@ class PythonPredictor:

 ## TensorFlow Predictor

-**Uses TensorFlow version 2.3.0 by default**
+**Uses TensorFlow version 2.4.1 by default**

 ### Interface

@@ -151,7 +151,7 @@ If you need to share files between your predictor implementation and the TensorF

 ## ONNX Predictor

-**Uses ONNX Runtime version 1.4.0 by default**
+**Uses ONNX Runtime version 1.6.0 by default**

 ### Interface
```

docs/workloads/dependencies/images.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -20,6 +20,7 @@ Cortex's base Docker images are listed below. Depending on the Cortex Predictor
 * `quay.io/cortexlabs/python-predictor-gpu:master-cuda10.2-cudnn8`
 * `quay.io/cortexlabs/python-predictor-gpu:master-cuda11.0-cudnn8`
 * `quay.io/cortexlabs/python-predictor-gpu:master-cuda11.1-cudnn8`
+* `quay.io/cortexlabs/python-predictor-gpu:master-cuda11.2-cudnn8`
 * Python Predictor (Inferentia): `quay.io/cortexlabs/python-predictor-inf:master`
 * TensorFlow Predictor (CPU, GPU, Inferentia): `quay.io/cortexlabs/tensorflow-predictor:master`
 * ONNX Predictor (CPU): `quay.io/cortexlabs/onnx-predictor-cpu:master`
```

docs/workloads/realtime/predictors.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -131,7 +131,7 @@ Your `predictor` method can return different types of objects such as `JSON`-par

 ## TensorFlow Predictor

-**Uses TensorFlow version 2.3.0 by default**
+**Uses TensorFlow version 2.4.1 by default**

 ### Interface

@@ -203,7 +203,7 @@ If you need to share files between your predictor implementation and the TensorF

 ## ONNX Predictor

-**Uses ONNX Runtime version 1.4.0 by default**
+**Uses ONNX Runtime version 1.6.0 by default**

 ### Interface
```

images/inferentia/Dockerfile

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-FROM 790709498068.dkr.ecr.us-west-2.amazonaws.com/neuron-device-plugin:1.0.11000.0
+FROM 790709498068.dkr.ecr.us-west-2.amazonaws.com/neuron-device-plugin:1.4.1.0
```

images/manager/Dockerfile

Lines changed: 2 additions & 2 deletions
```diff
@@ -18,11 +18,11 @@ RUN apk add --no-cache bash curl gettext jq openssl
 RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.36.2/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && \
     mv /tmp/eksctl /usr/local/bin

-RUN curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.17.9/2020-08-04/bin/linux/amd64/aws-iam-authenticator && \
+RUN curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.18.9/2020-11-02/bin/linux/amd64/aws-iam-authenticator && \
     chmod +x ./aws-iam-authenticator && \
     mv ./aws-iam-authenticator /usr/local/bin/aws-iam-authenticator

-RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.19.0/bin/linux/amd64/kubectl && \
+RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.20.2/bin/linux/amd64/kubectl && \
     chmod +x ./kubectl && \
     mv ./kubectl /usr/local/bin/kubectl
```
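The version bumps above can be sanity-checked from inside the built image. A sketch, assuming the image is tagged `cortexlabs/manager` locally (the tag is an assumption for this example):

```bash
# Run the pinned CLIs inside the manager image and eyeball their versions.
# Expected: kubectl v1.20.2, aws-iam-authenticator from the 1.18.9 EKS release, eksctl 0.36.2.
docker run --rm --entrypoint /bin/bash cortexlabs/manager -c '
  kubectl version --client
  aws-iam-authenticator version
  eksctl version
'
```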

images/metrics-server/Dockerfile

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-FROM k8s.gcr.io/metrics-server/metrics-server:v0.3.7
+FROM k8s.gcr.io/metrics-server/metrics-server:v0.4.2
```

images/neuron-rtd/Dockerfile

Lines changed: 2 additions & 2 deletions
```diff
@@ -9,8 +9,8 @@ enabled=1' > /etc/yum.repos.d/neuron.repo
 RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB

 RUN yum install -y \
-    aws-neuron-tools-1.0.11054.0 \
-    aws-neuron-runtime-1.0.9592.0 \
+    aws-neuron-tools-1.4.2.0 \
+    aws-neuron-runtime-1.4.3.0 \
     procps-ng-3.3.10-26.amzn2.x86_64 \
     gzip \
     tar \
```
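For reference, the `echo` command quoted in `dev/versions.md` above produces the repo file that this Dockerfile's `yum install` relies on; a reconstruction of that step and its output:

```bash
# From the dev/versions.md instructions: write the Neuron YUM repo definition.
echo $'[neuron] \nname=Neuron YUM Repository \nbaseurl=https://yum.repos.neuron.amazonaws.com \nenabled=1' > /etc/yum.repos.d/neuron.repo

# Resulting /etc/yum.repos.d/neuron.repo (the $'...' string leaves trailing spaces):
# [neuron]
# name=Neuron YUM Repository
# baseurl=https://yum.repos.neuron.amazonaws.com
# enabled=1

# Then `yum info aws-neuron-tools` etc. report the versions to pin in this Dockerfile.
```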
