Skip to content

Commit f79afa3

Browse files
authored
Replica pod autoscaling (#196)
1 parent d974347 commit f79afa3

File tree

45 files changed

+849
-273
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

45 files changed

+849
-273
lines changed

Makefile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ test-examples:
117117
###############
118118

119119
ci-build-images:
120+
@./build/build-image.sh images/manager manager
120121
@./build/build-image.sh images/spark-base spark-base
121122
@./build/build-image.sh images/tf-base tf-base
122123
@./build/build-image.sh images/tf-base-gpu tf-base-gpu
@@ -135,9 +136,10 @@ ci-build-images:
135136
@./build/build-image.sh images/argo-executor argo-executor
136137
@./build/build-image.sh images/python-packager python-packager
137138
@./build/build-image.sh images/cluster-autoscaler cluster-autoscaler
138-
@./build/build-image.sh images/manager manager
139+
@./build/build-image.sh images/metrics-server metrics-server
139140

140141
ci-push-images:
142+
@./build/push-image.sh manager
141143
@./build/push-image.sh spark
142144
@./build/push-image.sh spark-operator
143145
@./build/push-image.sh tf-train
@@ -153,7 +155,8 @@ ci-push-images:
153155
@./build/push-image.sh argo-executor
154156
@./build/push-image.sh python-packager
155157
@./build/push-image.sh cluster-autoscaler
156-
@./build/push-image.sh manager
158+
@./build/push-image.sh metrics-server
159+
157160

158161
ci-build-cli:
159162
@./build/cli.sh

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ Cortex is actively maintained by Cortex Labs. We're a venture-backed team of inf
2626
model: s3://my-bucket/my-model.zip
2727
request_handler: handler.py
2828
compute:
29-
replicas: 4
30-
gpu: 2
29+
min_replicas: 5
30+
max_replicas: 20
3131
```
3232
3333
**Customize** request handling (optional):

cli/cmd/get.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -443,16 +443,16 @@ func describeAPI(name string, resourcesRes *schema.GetResourcesResponse) (string
443443
}
444444

445445
out := titleStr("Summary")
446-
out += "Status: " + groupStatus.Message() + "\n"
446+
out += "Status: " + groupStatus.Message() + "\n"
447447
if ctxAPIStatus != nil {
448-
out += fmt.Sprintf("Updated replicas: %d/%d ready\n", ctxAPIStatus.ReadyUpdated, ctxAPIStatus.RequestedReplicas)
448+
out += fmt.Sprintf("Up-to-date replicas: %d ready\n", ctxAPIStatus.ReadyUpdated)
449449
}
450450
if staleReplicas != 0 {
451-
out += fmt.Sprintf("Stale replicas: %d ready\n", staleReplicas)
451+
out += fmt.Sprintf("Stale replicas: %d ready\n", staleReplicas)
452452
}
453-
out += "Created at: " + libtime.LocalTimestamp(groupStatus.Start) + "\n"
453+
out += "Created at: " + libtime.LocalTimestamp(groupStatus.Start) + "\n"
454454
if groupStatus.ActiveStatus != nil && groupStatus.ActiveStatus.Start != nil {
455-
out += "Refreshed at: " + libtime.LocalTimestamp(groupStatus.ActiveStatus.Start) + "\n"
455+
out += "Refreshed at: " + libtime.LocalTimestamp(groupStatus.ActiveStatus.Start) + "\n"
456456
}
457457

458458
out += titleStr("Endpoint")

cortex.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/
131131
export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
132132
export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"
133133
export CORTEX_IMAGE_CLUSTER_AUTOSCALER="${CORTEX_IMAGE_CLUSTER_AUTOSCALER:-cortexlabs/cluster-autoscaler:$CORTEX_VERSION_STABLE}"
134+
export CORTEX_IMAGE_METRICS_SERVER="${CORTEX_IMAGE_METRICS_SERVER:-cortexlabs/metrics-server:$CORTEX_VERSION_STABLE}"
134135

135136
export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}"
136137

@@ -181,6 +182,7 @@ function install_cortex() {
181182
-e CORTEX_IMAGE_TF_SERVE_GPU=$CORTEX_IMAGE_TF_SERVE_GPU \
182183
-e CORTEX_IMAGE_TF_TRAIN_GPU=$CORTEX_IMAGE_TF_TRAIN_GPU \
183184
-e CORTEX_IMAGE_CLUSTER_AUTOSCALER=$CORTEX_IMAGE_CLUSTER_AUTOSCALER \
185+
-e CORTEX_IMAGE_METRICS_SERVER=$CORTEX_IMAGE_METRICS_SERVER \
184186
-e CORTEX_ENABLE_TELEMETRY=$CORTEX_ENABLE_TELEMETRY \
185187
$CORTEX_IMAGE_MANAGER
186188
}

dev/load.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
2+
3+
# Copyright 2019 Cortex Labs, Inc.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
18+
# Pause, in seconds, handed to `sleep` between consecutive requests by the
# loop at the bottom of this script.
SLEEP="0.01"
19+
20+
# Endpoint that make_request POSTs to, and the JSON body it sends.
URL="https://ac47cdf2e99cb11e9ade10693263c378-1155928797.us-west-2.elb.amazonaws.com/iris/iris-type"
21+
DATA='{ "samples": [ { "sepal_length": 5.2, "sepal_width": 3.6, "petal_length": 1.4, "petal_width": 0.3 } ] }'
22+
23+
# Alternative endpoint/payload pair (insurance/cost); uncomment both lines to
# use it instead of the iris-type pair above.
# URL="https://ac47cdf2e99cb11e9ade10693263c378-1155928797.us-west-2.elb.amazonaws.com/insurance/cost"
24+
# DATA='{ "samples": [ { "age": 22, "bmi": 25, "children": 0, "region": "northeast", "sex": "female", "smoker": "no" } ] }'
25+
26+
# Run ctrl_c when the script receives SIGINT (Ctrl-C). The function is defined
# below; the trap only looks it up when the signal actually arrives.
trap ctrl_c INT
27+
# SIGINT handler: emits a single newline (so the shell prompt does not land
# on the same line as the progress dots) and ends the script with status 0.
ctrl_c() {
  printf '\n'
  exit 0
}
31+
32+
# Issues one HTTP POST carrying ${DATA} as a JSON body to ${URL}.
# Globals:  DATA (read), URL (read)
# Outputs:  whatever curl writes: the response body on stdout, and error
#           messages on stderr (progress output is silenced).
# Returns:  curl's exit status.
make_request() {
  # -k turns off TLS certificate verification for this request.
  local -a curl_args=(
    --silent
    --show-error
    -k
    -X POST
    -H "Content-Type: application/json"
    -d "${DATA}"
    "${URL}"
  )

  curl "${curl_args[@]}"
}
35+
36+
# Load-generation loop: fire a request, print one progress dot, pause for
# ${SLEEP} seconds, repeat. It runs until the script is interrupted (see the
# INT trap above) or until `sleep` itself fails.
#
# The response body is discarded by redirecting stdout; curl's error output
# on stderr is left visible. `sleep` is invoked directly rather than through
# `eval`, so the contents of SLEEP are never re-parsed as shell code.
while true; do
  make_request > /dev/null
  printf '.'
  sleep "${SLEEP}" || break
done

dev/registry.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ function ecr_login() {
3535
}
3636

3737
function create_registry() {
38+
aws ecr create-repository --repository-name=cortexlabs/manager --region=$REGISTRY_REGION || true
3839
aws ecr create-repository --repository-name=cortexlabs/argo-controller --region=$REGISTRY_REGION || true
3940
aws ecr create-repository --repository-name=cortexlabs/argo-executor --region=$REGISTRY_REGION || true
4041
aws ecr create-repository --repository-name=cortexlabs/fluentd --region=$REGISTRY_REGION || true
@@ -50,7 +51,7 @@ function create_registry() {
5051
aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
5152
aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
5253
aws ecr create-repository --repository-name=cortexlabs/cluster-autoscaler --region=$REGISTRY_REGION || true
53-
aws ecr create-repository --repository-name=cortexlabs/manager --region=$REGISTRY_REGION || true
54+
aws ecr create-repository --repository-name=cortexlabs/metrics-server --region=$REGISTRY_REGION || true
5455
}
5556

5657
### HELPERS ###
@@ -116,6 +117,8 @@ if [ "$cmd" = "create" ]; then
116117

117118
elif [ "$cmd" = "update" ]; then
118119
if [ "$env" != "dev" ]; then
120+
build_and_push $ROOT/images/manager manager latest
121+
119122
cache_builder $ROOT/images/spark-base spark-base
120123
build_base $ROOT/images/spark-base spark-base
121124
build_base $ROOT/images/tf-base tf-base
@@ -136,7 +139,7 @@ elif [ "$cmd" = "update" ]; then
136139
build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
137140
build_and_push $ROOT/images/python-packager python-packager latest
138141
build_and_push $ROOT/images/cluster-autoscaler cluster-autoscaler latest
139-
build_and_push $ROOT/images/manager manager latest
142+
build_and_push $ROOT/images/metrics-server metrics-server latest
140143
fi
141144

142145
build_and_push $ROOT/images/spark spark latest

docs/apis/apis.md

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,12 @@ Serve models at scale and use them to build smarter applications.
99
name: <string> # API name (required)
1010
model: <string> # path to a zipped model dir (e.g. s3://my-bucket/model.zip)
1111
compute:
12-
replicas: <int> # number of replicas to launch (default: 1)
13-
cpu: <string> # CPU request per replica (default: Null)
14-
gpu: <string> # gpu request per replica (default: Null)
12+
min_replicas: <int> # minimum number of replicas (default: 1)
13+
max_replicas: <int> # maximum number of replicas (default: 100)
14+
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
15+
target_cpu_utilization: <int> # CPU utilization threshold (as a percentage) to trigger scaling (default: 80)
16+
cpu: <string> # CPU request per replica (default: 200m)
17+
gpu: <string> # GPU request per replica (default: 0)
1518
mem: <string> # memory request per replica (default: Null)
1619
```
1720
@@ -24,8 +27,9 @@ See [packaging models](packaging-models.md) for how to create the zipped model.
2427
name: my-api
2528
model: s3://my-bucket/my-model.zip
2629
compute:
27-
replicas: 3
28-
gpu: 2
30+
min_replicas: 5
31+
max_replicas: 20
32+
cpu: "1"
2933
```
3034
3135
## Integration

docs/apis/tutorial.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ Add to `cortex.yaml`:
3333
- kind: api
3434
name: iris-type
3535
model: s3://cortex-examples/iris-tensorflow.zip
36-
compute:
37-
replicas: 3
3836
```
3937

4038
### Deploy the API

docs/cluster/config.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="cortexlabs/tf-train-gpu:master"
5252
export CORTEX_IMAGE_TF_SERVE_GPU="cortexlabs/tf-serve-gpu:master"
5353
export CORTEX_IMAGE_PYTHON_PACKAGER="cortexlabs/python-packager:master"
5454
export CORTEX_IMAGE_CLUSTER_AUTOSCALER="cortexlabs/cluster-autoscaler:master"
55+
export CORTEX_IMAGE_METRICS_SERVER="cortexlabs/metrics-server:master"
5556

5657
# Flag to enable collecting error reports and usage stats. If flag is not set to either "true" or "false", you will be prompted.
5758
export CORTEX_ENABLE_TELEMETRY=""

docs/cluster/development.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/corte
8686
export CORTEX_IMAGE_TF_TRANSFORM="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/tf-transform:latest"
8787
export CORTEX_IMAGE_PYTHON_PACKAGER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/python-packager:latest"
8888
export CORTEX_IMAGE_CLUSTER_AUTOSCALER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/cluster-autoscaler:latest"
89+
export CORTEX_IMAGE_METRICS_SERVER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/metrics-server:latest"
8990

9091
export AWS_ACCESS_KEY_ID="XXXXXX"
9192
export AWS_SECRET_ACCESS_KEY="XXXXXX"

0 commit comments

Comments
 (0)