Skip to content

Commit 8c834bd

Browse files
committed
Update docs
1 parent 3763505 commit 8c834bd

File tree

2 files changed

+49
-47
lines changed

2 files changed

+49
-47
lines changed

docs/deployments/batch-api/api-configuration.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ Once your model is [exported](../../guides/exporting.md) and you've implemented
66

77
Reference the section below which corresponds to your Predictor type: [Python](#python-predictor), [TensorFlow](#tensorflow-predictor), or [ONNX](#onnx-predictor).
88

9+
**Batch APIs are only supported on a Cortex cluster (in AWS).**
10+
911
## Python Predictor
1012

1113
<!-- CORTEX_VERSION_BRANCH_STABLE x2 -->

docs/deployments/realtime-api/api-configuration.md

Lines changed: 47 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -33,29 +33,29 @@ Reference the section below which corresponds to your Predictor type: [Python](#
3333
networking:
3434
endpoint: <string> # the endpoint for the API (aws only) (default: <api_name>)
3535
local_port: <int> # specify the port for the API (local only) (default: 8888)
36-
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
36+
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
3737
compute:
3838
cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
3939
gpu: <int> # GPU request per replica (default: 0)
40-
inf: <int> # Inferentia ASIC request per replica (default: 0)
40+
inf: <int> # Inferentia ASIC request per replica (default: 0) (aws only)
4141
mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
4242
monitoring: # (aws only)
4343
model_type: <string> # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required)
4444
key: <string> # the JSON key in the response payload of the value to monitor (required if the response payload is a JSON object)
45-
autoscaling: # (aws only)
46-
min_replicas: <int> # minimum number of replicas (default: 1)
47-
max_replicas: <int> # maximum number of replicas (default: 100)
48-
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
49-
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
50-
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
51-
window: <duration> # the time over which to average the API's concurrency (default: 60s)
52-
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m)
53-
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m)
54-
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
55-
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
56-
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
57-
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)
58-
update_strategy: # (aws only)
45+
autoscaling: # (aws and gcp only)
46+
min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only)
47+
max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only)
48+
init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only)
49+
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
50+
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws only)
51+
window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only)
52+
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
53+
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m) (aws only)
54+
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) (aws only)
55+
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
56+
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
57+
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
58+
update_strategy: # (aws and gcp only)
5959
max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
6060
max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
6161
```
@@ -96,29 +96,29 @@ See additional documentation for [models](models.md), [parallelism](parallelism.
9696
networking:
9797
endpoint: <string> # the endpoint for the API (aws only) (default: <api_name>)
9898
local_port: <int> # specify the port for the API (local only) (default: 8888)
99-
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
99+
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
100100
compute:
101101
cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
102102
gpu: <int> # GPU request per replica (default: 0)
103-
inf: <int> # Inferentia ASIC request per replica (default: 0)
103+
inf: <int> # Inferentia ASIC request per replica (default: 0) (aws only)
104104
mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
105105
monitoring: # (aws only)
106106
model_type: <string> # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required)
107107
key: <string> # the JSON key in the response payload of the value to monitor (required if the response payload is a JSON object)
108-
autoscaling: # (aws only)
109-
min_replicas: <int> # minimum number of replicas (default: 1)
110-
max_replicas: <int> # maximum number of replicas (default: 100)
111-
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
112-
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
113-
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
114-
window: <duration> # the time over which to average the API's concurrency (default: 60s)
115-
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m)
116-
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m)
117-
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
118-
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
119-
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
120-
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)
121-
update_strategy: # (aws only)
108+
autoscaling: # (aws and gcp only)
109+
min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only)
110+
max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only)
111+
init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only)
112+
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
113+
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws only)
114+
window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only)
115+
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
116+
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m) (aws only)
117+
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) (aws only)
118+
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
119+
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
120+
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
121+
update_strategy: # (aws and gcp only)
122122
max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
123123
max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
124124
```
@@ -153,28 +153,28 @@ See additional documentation for [models](models.md), [parallelism](parallelism.
153153
networking:
154154
endpoint: <string> # the endpoint for the API (aws only) (default: <api_name>)
155155
local_port: <int> # specify the port for the API (local only) (default: 8888)
156-
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
156+
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
157157
compute:
158158
cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
159159
gpu: <int> # GPU request per replica (default: 0)
160160
mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
161161
monitoring: # (aws only)
162162
model_type: <string> # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required)
163163
key: <string> # the JSON key in the response payload of the value to monitor (required if the response payload is a JSON object)
164-
autoscaling: # (aws only)
165-
min_replicas: <int> # minimum number of replicas (default: 1)
166-
max_replicas: <int> # maximum number of replicas (default: 100)
167-
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
168-
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
169-
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
170-
window: <duration> # the time over which to average the API's concurrency (default: 60s)
171-
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m)
172-
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m)
173-
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
174-
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
175-
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
176-
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)
177-
update_strategy: # (aws only)
164+
autoscaling: # (aws and gcp only)
165+
min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only)
166+
max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only)
167+
init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only)
168+
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
169+
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws only)
170+
window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only)
171+
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
172+
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m) (aws only)
173+
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) (aws only)
174+
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
175+
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
176+
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
177+
update_strategy: # (aws and gcp only)
178178
max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
179179
max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
180180
```

0 commit comments

Comments
 (0)