|
````diff
@@ -18,8 +18,8 @@
 endpoint: <string> # the endpoint for the API (default: <api_name>)
 api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
 compute:
-  cpu: <string | int | float> # CPU request per worker, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
-  gpu: <int> # GPU request per worker (default: 0)
-  inf: <int> # Inferentia ASIC request per worker (default: 0)
-  mem: <string> # memory request per worker, e.g. 200Mi or 1Gi (default: Null)
+  cpu: <string | int | float> # CPU request per worker. One unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m)
+  gpu: <int> # GPU request per worker. One unit of GPU corresponds to one virtual GPU (default: 0)
+  inf: <int> # Inferentia request per worker. One unit corresponds to one Inferentia ASIC with 4 NeuronCores and 8GB of cache memory. Each process will have one NeuronCore Group with (4 * inf / processes_per_replica) NeuronCores, so your model should be compiled to run on (4 * inf / processes_per_replica) NeuronCores. (default: 0) (aws only)
+  mem: <string> # memory request per worker. One unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of-two counterparts: Ki, Mi, Gi, Ti) (default: Null)
 ```
````
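To make the updated units concrete, here is a minimal sketch of a compute block for this predictor type; the values are illustrative, not defaults. Assuming `inf: 1` and `processes_per_replica: 4` (a field referenced in the comments above), each process would get a NeuronCore Group with 4 * 1 / 4 = 1 NeuronCore:

```yaml
# illustrative compute block; values are examples, not defaults
compute:
  cpu: 500m  # 500 millicpus, i.e. half of one virtual CPU (equivalent to 0.5)
  gpu: 1     # one virtual GPU per worker
  inf: 1     # one Inferentia ASIC (4 NeuronCores, 8GB cache) per worker (aws only)
  mem: 1Gi   # 1 gibibyte (2^30 bytes); 1G (10^9 bytes) would also be accepted
```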
|
## TensorFlow Predictor
|
````diff
@@ -54,8 +54,8 @@
 endpoint: <string> # the endpoint for the API (default: <api_name>)
 api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
 compute:
-  cpu: <string | int | float> # CPU request per worker, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
-  gpu: <int> # GPU request per worker (default: 0)
-  inf: <int> # Inferentia ASIC request per worker (default: 0)
-  mem: <string> # memory request per worker, e.g. 200Mi or 1Gi (default: Null)
+  cpu: <string | int | float> # CPU request per worker. One unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m)
+  gpu: <int> # GPU request per worker. One unit of GPU corresponds to one virtual GPU (default: 0)
+  inf: <int> # Inferentia request per worker. One unit corresponds to one Inferentia ASIC with 4 NeuronCores and 8GB of cache memory. Each process will have one NeuronCore Group with (4 * inf / processes_per_replica) NeuronCores, so your model should be compiled to run on (4 * inf / processes_per_replica) NeuronCores. (default: 0) (aws only)
+  mem: <string> # memory request per worker. One unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of-two counterparts: Ki, Mi, Gi, Ti) (default: Null)
 ```
````
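As a quick sanity check on the NeuronCore arithmetic for this predictor type: assuming `inf: 1` and `processes_per_replica: 2` (illustrative values, not defaults), each process would be assigned a NeuronCore Group with 4 * 1 / 2 = 2 NeuronCores, so the model should be compiled for 2 NeuronCores. A sketch:

```yaml
# illustrative compute block; values are examples, not defaults
compute:
  cpu: 1     # one full virtual CPU (could also be written as 1000m or 1.0)
  inf: 1     # with processes_per_replica: 2, each process gets 4 * 1 / 2 = 2 NeuronCores
  mem: 4G    # 4 * 10^9 bytes (4Gi, i.e. 4 * 2^30 bytes, would be slightly larger)
```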
|
## ONNX Predictor
|
````diff
@@ -84,7 +84,7 @@
 endpoint: <string> # the endpoint for the API (default: <api_name>)
 api_gateway: public | none # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
 compute:
-  cpu: <string | int | float> # CPU request per worker, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
-  gpu: <int> # GPU request per worker (default: 0)
-  mem: <string> # memory request per worker, e.g. 200Mi or 1Gi (default: Null)
+  cpu: <string | int | float> # CPU request per worker. One unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m)
+  gpu: <int> # GPU request per worker. One unit of GPU corresponds to one virtual GPU (default: 0)
+  mem: <string> # memory request per worker. One unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of-two counterparts: Ki, Mi, Gi, Ti) (default: Null)
 ```
````
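Note that the ONNX Predictor's compute schema has no `inf` field; only `cpu`, `gpu`, and `mem` are available. A minimal sketch with illustrative values:

```yaml
# illustrative compute block; values are examples, not defaults
compute:
  cpu: 200m  # the default: 200 millicpus, i.e. 0.2 of a virtual CPU
  gpu: 1     # one virtual GPU per worker
  mem: 2Gi   # 2 gibibytes per worker
```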