|
28 | 28 | image: <string> # docker image to use for the Predictor (default: quay.io/cortexlabs/python-predictor-cpu:master or quay.io/cortexlabs/python-predictor-gpu:master based on compute) |
29 | 29 | env: <string: string> # dictionary of environment variables |
30 | 30 | networking: |
31 | | - endpoint: <string> # the endpoint for the API (aws and gcp only) (default: <api_name>) |
| 31 | + endpoint: <string> # the endpoint for the API (default: <api_name>) |
32 | 32 | api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only) |
33 | 33 | compute: |
34 | 34 | cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m) |
35 | 35 | gpu: <int> # GPU request per replica (default: 0) |
36 | 36 | inf: <int> # Inferentia ASIC request per replica (default: 0) (aws only) |
37 | 37 | mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: null) |
38 | | - autoscaling: # (aws and gcp only) |
39 | | - min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only) |
40 | | - max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only) |
41 | | - init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only) |
42 | | - max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws and gcp only) |
| 38 | + autoscaling: |
| 39 | + min_replicas: <int> # minimum number of replicas (default: 1) |
| 40 | + max_replicas: <int> # maximum number of replicas (default: 100) |
| 41 | + init_replicas: <int> # initial number of replicas (default: <min_replicas>) |
| 42 | + max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) |
43 | 43 | target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only) |
44 | 44 | window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only) |
45 | 45 | downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only) |
|
48 | 48 | max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only) |
49 | 49 | downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale-down event (default: 0.05) (aws only) |
50 | 50 | upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale-up event (default: 0.05) (aws only) |
51 | | - update_strategy: # (aws and gcp only) |
| 51 | + update_strategy: |
52 | 52 | max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates) |
53 | 53 | max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) |
54 | 54 | ``` |
|
84 | 84 | tensorflow_serving_image: <string> # docker image to use for the TensorFlow Serving container (default: quay.io/cortexlabs/tensorflow-serving-gpu:master or quay.io/cortexlabs/tensorflow-serving-cpu:master based on compute) |
85 | 85 | env: <string: string> # dictionary of environment variables |
86 | 86 | networking: |
87 | | - endpoint: <string> # the endpoint for the API (aws and gcp only) (default: <api_name>) |
| 87 | + endpoint: <string> # the endpoint for the API (default: <api_name>) |
88 | 88 | api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only) |
89 | 89 | compute: |
90 | 90 | cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m) |
91 | 91 | gpu: <int> # GPU request per replica (default: 0) |
92 | 92 | inf: <int> # Inferentia ASIC request per replica (default: 0) (aws only) |
93 | 93 | mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: null) |
94 | | - autoscaling: # (aws and gcp only) |
95 | | - min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only) |
96 | | - max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only) |
97 | | - init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only) |
98 | | - max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws and gcp only) |
| 94 | + autoscaling: |
| 95 | + min_replicas: <int> # minimum number of replicas (default: 1) |
| 96 | + max_replicas: <int> # maximum number of replicas (default: 100) |
| 97 | + init_replicas: <int> # initial number of replicas (default: <min_replicas>) |
| 98 | + max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) |
99 | 99 | target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only) |
100 | 100 | window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only) |
101 | 101 | downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only) |
|
104 | 104 | max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only) |
105 | 105 | downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale-down event (default: 0.05) (aws only) |
106 | 106 | upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale-up event (default: 0.05) (aws only) |
107 | | - update_strategy: # (aws and gcp only) |
| 107 | + update_strategy: |
108 | 108 | max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates) |
109 | 109 | max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) |
110 | 110 | ``` |
|
134 | 134 | image: <string> # docker image to use for the Predictor (default: quay.io/cortexlabs/onnx-predictor-gpu:master or quay.io/cortexlabs/onnx-predictor-cpu:master based on compute) |
135 | 135 | env: <string: string> # dictionary of environment variables |
136 | 136 | networking: |
137 | | - endpoint: <string> # the endpoint for the API (aws and gcp only) (default: <api_name>) |
| 137 | + endpoint: <string> # the endpoint for the API (default: <api_name>) |
138 | 138 | api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only) |
139 | 139 | compute: |
140 | 140 | cpu: <string | int | float> # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m) |
141 | 141 | gpu: <int> # GPU request per replica (default: 0) |
142 | 142 | mem: <string> # memory request per replica, e.g. 200Mi or 1Gi (default: null) |
143 | | - autoscaling: # (aws and gcp only) |
144 | | - min_replicas: <int> # minimum number of replicas (default: 1) (aws and gcp only) |
145 | | - max_replicas: <int> # maximum number of replicas (default: 100) (aws and gcp only) |
146 | | - init_replicas: <int> # initial number of replicas (default: <min_replicas>) (aws and gcp only) |
147 | | - max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws and gcp only) |
| 143 | + autoscaling: |
| 144 | + min_replicas: <int> # minimum number of replicas (default: 1) |
| 145 | + max_replicas: <int> # maximum number of replicas (default: 100) |
| 146 | + init_replicas: <int> # initial number of replicas (default: <min_replicas>) |
| 147 | + max_replica_concurrency: <int> # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) |
148 | 148 | target_replica_concurrency: <float> # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only) |
149 | 149 | window: <duration> # the time over which to average the API's concurrency (default: 60s) (aws only) |
150 | 150 | downscale_stabilization_period: <duration> # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only) |
|
153 | 153 | max_upscale_factor: <float> # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only) |
154 | 154 | downscale_tolerance: <float> # any recommendation falling within this factor below the current number of replicas will not trigger a scale-down event (default: 0.05) (aws only) |
155 | 155 | upscale_tolerance: <float> # any recommendation falling within this factor above the current number of replicas will not trigger a scale-up event (default: 0.05) (aws only) |
156 | | - update_strategy: # (aws and gcp only) |
| 156 | + update_strategy: |
157 | 157 | max_surge: <string | int> # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates) |
158 | 158 | max_unavailable: <string | int> # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) |
159 | 159 | ``` |
0 commit comments