Skip to content

Commit 3f488e5

Browse files
authored
Fix downloader daemonsets (#2013)
1 parent e61cdcd commit 3f488e5

File tree

4 files changed: 48 additions and 37 deletions

manager/install.sh

Lines changed: 30 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -352,30 +352,48 @@ function start_pre_download_images() {
352352
export CORTEX_IMAGE_TENSORFLOW_SERVING_INF="${registry}/tensorflow-serving-inf:${CORTEX_VERSION}"
353353
export CORTEX_IMAGE_TENSORFLOW_PREDICTOR="${registry}/tensorflow-predictor:${CORTEX_VERSION}"
354354

355-
if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]] || [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then
355+
envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null
356+
357+
has_gpu="false"
358+
has_inf="false"
359+
360+
cluster_config_len=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups | yq -r length)
361+
for idx in $(seq 0 $(($cluster_config_len-1))); do
362+
ng_instance_type=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups[$idx].instance_type)
363+
if [[ "$ng_instance_type" == p* || "$ng_instance_type" == g* ]]; then
364+
has_gpu="true"
365+
fi
366+
if [[ "$ng_instance_type" == inf* ]]; then
367+
has_inf="true"
368+
fi
369+
done
370+
371+
if [ "$has_gpu" == "true" ]; then
356372
envsubst < manifests/image-downloader-gpu.yaml | kubectl apply -f - &>/dev/null
357-
elif [[ "$CORTEX_INSTANCE_TYPE" == inf* ]]; then
373+
fi
374+
375+
if [ "$has_inf" == "true" ]; then
358376
envsubst < manifests/image-downloader-inf.yaml | kubectl apply -f - &>/dev/null
359-
else
360-
envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null
361377
fi
362378
}
363379

364380
function await_pre_download_images() {
365-
if kubectl get daemonset image-downloader -n=default &>/dev/null; then
366-
echo -n "○ downloading docker images "
367-
printed_dot="false"
381+
echo -n "○ downloading docker images ."
382+
for ds_name in image-downloader-cpu image-downloader-gpu image-downloader-inf; do
383+
if ! kubectl get daemonset $ds_name > /dev/null 2>&1; then
384+
continue
385+
fi
368386
i=0
369-
until [ "$(kubectl get daemonset image-downloader -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset image-downloader -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do
387+
until [ "$(kubectl get daemonset $ds_name -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset $ds_name -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do
370388
if [ $i -eq 120 ]; then break; fi # give up after 6 minutes
371389
echo -n "."
372-
printed_dot="true"
373390
((i=i+1))
374391
sleep 3
375392
done
376-
kubectl -n=default delete --ignore-not-found=true daemonset image-downloader &>/dev/null
377-
if [ "$printed_dot" == "true" ]; then echo ""; else echo ""; fi
378-
fi
393+
kubectl -n=default delete --ignore-not-found=true daemonset $ds_name &>/dev/null
394+
done
395+
396+
echo ""
379397
}
380398

381399
function validate_cortex() {

manager/manifests/image-downloader-cpu.yaml

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,16 +15,16 @@
1515
apiVersion: apps/v1
1616
kind: DaemonSet
1717
metadata:
18-
name: image-downloader
18+
name: image-downloader-cpu
1919
namespace: default
2020
spec:
2121
selector:
2222
matchLabels:
23-
name: image-downloader
23+
name: image-downloader-cpu
2424
template:
2525
metadata:
2626
labels:
27-
name: image-downloader
27+
name: image-downloader-cpu
2828
spec:
2929
nodeSelector:
3030
workload: "true"
@@ -33,6 +33,13 @@ spec:
3333
value: "true"
3434
operator: Equal
3535
effect: NoSchedule
36+
- key: nvidia.com/gpu
37+
operator: Exists
38+
effect: NoSchedule
39+
- key: aws.amazon.com/neuron
40+
value: "true"
41+
operator: Equal
42+
effect: NoSchedule
3643
terminationGracePeriodSeconds: 0
3744
containers:
3845
- name: python-predictor-cpu

manager/manifests/image-downloader-gpu.yaml

Lines changed: 4 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -15,19 +15,20 @@
1515
apiVersion: apps/v1
1616
kind: DaemonSet
1717
metadata:
18-
name: image-downloader
18+
name: image-downloader-gpu
1919
namespace: default
2020
spec:
2121
selector:
2222
matchLabels:
23-
name: image-downloader
23+
name: image-downloader-gpu
2424
template:
2525
metadata:
2626
labels:
27-
name: image-downloader
27+
name: image-downloader-gpu
2828
spec:
2929
nodeSelector:
3030
workload: "true"
31+
nvidia.com/gpu: "true"
3132
tolerations:
3233
- key: workload
3334
value: "true"
@@ -46,11 +47,3 @@ spec:
4647
image: $CORTEX_IMAGE_TENSORFLOW_SERVING_GPU
4748
command: ["/bin/sh"]
4849
args: ["-c", "sleep 1000000"]
49-
- name: tensorflow-predictor
50-
image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR
51-
command: ["/bin/sh"]
52-
args: ["-c", "sleep 1000000"]
53-
- name: downloader
54-
image: $CORTEX_IMAGE_DOWNLOADER
55-
command: ["/bin/sh"]
56-
args: ["-c", "sleep 1000000"]

manager/manifests/image-downloader-inf.yaml

Lines changed: 4 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -15,19 +15,20 @@
1515
apiVersion: apps/v1
1616
kind: DaemonSet
1717
metadata:
18-
name: image-downloader
18+
name: image-downloader-inf
1919
namespace: default
2020
spec:
2121
selector:
2222
matchLabels:
23-
name: image-downloader
23+
name: image-downloader-inf
2424
template:
2525
metadata:
2626
labels:
27-
name: image-downloader
27+
name: image-downloader-inf
2828
spec:
2929
nodeSelector:
3030
workload: "true"
31+
aws.amazon.com/neuron: "true"
3132
tolerations:
3233
- key: workload
3334
value: "true"
@@ -47,14 +48,6 @@ spec:
4748
image: $CORTEX_IMAGE_TENSORFLOW_SERVING_INF
4849
command: ["/bin/sh"]
4950
args: ["-c", "sleep 1000000"]
50-
- name: tensorflow-predictor
51-
image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR
52-
command: ["/bin/sh"]
53-
args: ["-c", "sleep 1000000"]
54-
- name: downloader
55-
image: $CORTEX_IMAGE_DOWNLOADER
56-
command: ["/bin/sh"]
57-
args: ["-c", "sleep 1000000"]
5851
- name: neuron-rtd
5952
image: $CORTEX_IMAGE_NEURON_RTD
6053
command: ["/bin/sh"]

0 commit comments

Comments
 (0)