Skip to content

Commit 6aee81b

Browse files
committed
Update realtime api configuration docs
1 parent c834752 commit 6aee81b

File tree

1 file changed: +21 -21 lines changed

1 file changed: +21 -21 lines changed

docs/workloads/realtime/configuration.md

Lines changed: 21 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -28,18 +28,18 @@
2828
image: <string> # docker image to use for the Predictor (default: quay.io/cortexlabs/python-predictor-cpu:master or quay.io/cortexlabs/python-predictor-gpu:master based on compute)
2929
env: <string: string> # dictionary of environment variables
3030
networking:
31-
endpoint: <string> # the endpoint for the API (aws and gcp only) (default: <api_name>)
31+
endpoint: <string> # the endpoint for the API (default: <api_name>)
3232
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
3333
compute:
3434
cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
3535
gpu: <int> # GPU request per replica (default: 0)
3636
inf: <int> # Inferentia ASIC request per replica (default: 0) (aws only)
3737
mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
38-
autoscaling: # (aws and gcp only)
39-
min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only)
40-
max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only)
41-
init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only)
42-
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws and gcp only)
38+
autoscaling:
39+
min_replicas: <int> # minimum number of replicas (default: 1)
40+
max_replicas: <int> # maximum number of replicas (default: 100)
41+
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
42+
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
4343
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
4444
window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only)
4545
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
@@ -48,7 +48,7 @@
4848
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
4949
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
5050
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
51-
update_strategy: # (aws and gcp only)
51+
update_strategy:
5252
max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
5353
max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
5454
```
@@ -84,18 +84,18 @@
8484
tensorflow_serving_image: <string> # docker image to use for the TensorFlow Serving container (default: quay.io/cortexlabs/tensorflow-serving-gpu:master or quay.io/cortexlabs/tensorflow-serving-cpu:master based on compute)
8585
env: <string: string> # dictionary of environment variables
8686
networking:
87-
endpoint: <string> # the endpoint for the API (aws and gcp only) (default: <api_name>)
87+
endpoint: <string> # the endpoint for the API (default: <api_name>)
8888
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
8989
compute:
9090
cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
9191
gpu: <int> # GPU request per replica (default: 0)
9292
inf: <int> # Inferentia ASIC request per replica (default: 0) (aws only)
9393
mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
94-
autoscaling: # (aws and gcp only)
95-
min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only)
96-
max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only)
97-
init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only)
98-
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws and gcp only)
94+
autoscaling:
95+
min_replicas: <int> # minimum number of replicas (default: 1)
96+
max_replicas: <int> # maximum number of replicas (default: 100)
97+
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
98+
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
9999
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
100100
window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only)
101101
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
@@ -104,7 +104,7 @@
104104
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
105105
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
106106
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
107-
update_strategy: # (aws and gcp only)
107+
update_strategy:
108108
max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
109109
max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
110110
```
@@ -134,17 +134,17 @@
134134
image: <string> # docker image to use for the Predictor (default: quay.io/cortexlabs/onnx-predictor-gpu:master or quay.io/cortexlabs/onnx-predictor-cpu:master based on compute)
135135
env: <string: string> # dictionary of environment variables
136136
networking:
137-
endpoint: <string> # the endpoint for the API (aws and gcp only) (default: <api_name>)
137+
endpoint: <string> # the endpoint for the API (default: <api_name>)
138138
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
139139
compute:
140140
cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
141141
gpu: <int> # GPU request per replica (default: 0)
142142
mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
143-
autoscaling: # (aws and gcp only)
144-
min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only)
145-
max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only)
146-
init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only)
147-
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws and gcp only)
143+
autoscaling:
144+
min_replicas: <int> # minimum number of replicas (default: 1)
145+
max_replicas: <int> # maximum number of replicas (default: 100)
146+
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
147+
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
148148
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
149149
window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only)
150150
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
@@ -153,7 +153,7 @@
153153
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
154154
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
155155
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
156-
update_strategy: # (aws and gcp only)
156+
update_strategy:
157157
max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
158158
max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
159159
```

0 commit comments

Comments (0)