You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardExpand all lines: docs/deployments/batch-api/api-configuration.md
+2Lines changed: 2 additions & 0 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -6,6 +6,8 @@ Once your model is [exported](../../guides/exporting.md) and you've implemented
6
6
7
7
Reference the section below which corresponds to your Predictor type: [Python](#python-predictor), [TensorFlow](#tensorflow-predictor), or [ONNX](#onnx-predictor).
8
8
9
+
**Batch APIs are only supported on a Cortex cluster (in AWS).**
Copy file name to clipboardExpand all lines: docs/deployments/realtime-api/api-configuration.md
+47-47Lines changed: 47 additions & 47 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -33,29 +33,29 @@ Reference the section below which corresponds to your Predictor type: [Python](#
33
33
networking:
34
34
endpoint: <string> # the endpoint for the API (aws only) (default: <api_name>)
35
35
local_port: <int> # specify the port for the API (local only) (default: 8888)
36
-
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
36
+
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
37
37
compute:
38
38
cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
39
39
gpu: <int> # GPU request per replica (default: 0)
40
-
inf: <int> # Inferentia ASIC request per replica (default: 0)
mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
42
42
monitoring: # (aws only)
43
43
model_type: <string> # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required)
44
44
key: <string> # the JSON key in the response payload of the value to monitor (required if the response payload is a JSON object)
45
-
autoscaling: # (aws only)
46
-
min_replicas: <int> # minimum number of replicas (default: 1)
47
-
max_replicas: <int> # maximum number of replicas (default: 100)
48
-
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
49
-
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
50
-
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
51
-
window: <duration> # the time over which to average the API's concurrency (default: 60s)
52
-
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m)
53
-
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m)
54
-
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
55
-
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
56
-
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
57
-
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)
58
-
update_strategy: # (aws only)
45
+
autoscaling: # (aws and gcp only)
46
+
min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only)
47
+
max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only)
48
+
init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only)
49
+
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
50
+
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws only)
51
+
window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only)
52
+
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
53
+
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m) (aws only)
54
+
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) (aws only)
55
+
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
56
+
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
57
+
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
58
+
update_strategy: # (aws and gcp only)
59
59
max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
60
60
max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
61
61
```
@@ -96,29 +96,29 @@ See additional documentation for [models](models.md), [parallelism](parallelism.
96
96
networking:
97
97
endpoint: <string> # the endpoint for the API (aws only) (default: <api_name>)
98
98
local_port: <int> # specify the port for the API (local only) (default: 8888)
99
-
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
99
+
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
100
100
compute:
101
101
cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
102
102
gpu: <int> # GPU request per replica (default: 0)
103
-
inf: <int> # Inferentia ASIC request per replica (default: 0)
mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
105
105
monitoring: # (aws only)
106
106
model_type: <string> # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required)
107
107
key: <string> # the JSON key in the response payload of the value to monitor (required if the response payload is a JSON object)
108
-
autoscaling: # (aws only)
109
-
min_replicas: <int> # minimum number of replicas (default: 1)
110
-
max_replicas: <int> # maximum number of replicas (default: 100)
111
-
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
112
-
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
113
-
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
114
-
window: <duration> # the time over which to average the API's concurrency (default: 60s)
115
-
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m)
116
-
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m)
117
-
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
118
-
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
119
-
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
120
-
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)
121
-
update_strategy: # (aws only)
108
+
autoscaling: # (aws and gcp only)
109
+
min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only)
110
+
max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only)
111
+
init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only)
112
+
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
113
+
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws only)
114
+
window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only)
115
+
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
116
+
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m) (aws only)
117
+
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) (aws only)
118
+
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
119
+
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
120
+
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
121
+
update_strategy: # (aws and gcp only)
122
122
max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
123
123
max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
124
124
```
@@ -153,28 +153,28 @@ See additional documentation for [models](models.md), [parallelism](parallelism.
153
153
networking:
154
154
endpoint: <string> # the endpoint for the API (aws only) (default: <api_name>)
155
155
local_port: <int> # specify the port for the API (local only) (default: 8888)
156
-
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
156
+
api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
157
157
compute:
158
158
cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
159
159
gpu: <int> # GPU request per replica (default: 0)
160
160
mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
161
161
monitoring: # (aws only)
162
162
model_type: <string> # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required)
163
163
key: <string> # the JSON key in the response payload of the value to monitor (required if the response payload is a JSON object)
164
-
autoscaling: # (aws only)
165
-
min_replicas: <int> # minimum number of replicas (default: 1)
166
-
max_replicas: <int> # maximum number of replicas (default: 100)
167
-
init_replicas: <int> # initial number of replicas (default: <min_replicas>)
168
-
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
169
-
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
170
-
window: <duration> # the time over which to average the API's concurrency (default: 60s)
171
-
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m)
172
-
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m)
173
-
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
174
-
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
175
-
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
176
-
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)
177
-
update_strategy: # (aws only)
164
+
autoscaling: # (aws and gcp only)
165
+
min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only)
166
+
max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only)
167
+
init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only)
168
+
target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
169
+
max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws only)
170
+
window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only)
171
+
downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
172
+
upscale_stabilization_period: <duration> # the API will not scale above the lowest recommendation made during this period (default: 1m) (aws only)
173
+
max_downscale_factor: <float> # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) (aws only)
174
+
max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
175
+
downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
176
+
upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
177
+
update_strategy: # (aws and gcp only)
178
178
max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
179
179
max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
0 commit comments