Update docs

deliahu · deliahu · commit 8c834bd53caf · 2020-12-08T15:05:53.000-08:00
diff --git a/docs/deployments/batch-api/api-configuration.md b/docs/deployments/batch-api/api-configuration.md
@@ -6,6 +6,8 @@ Once your model is [exported](../../guides/exporting.md) and you've implemented
 
 Reference the section below which corresponds to your Predictor type: [Python](#python-predictor), [TensorFlow](#tensorflow-predictor), or [ONNX](#onnx-predictor).
 
+**Batch APIs are only supported on Cortex clusters running in AWS.**
+
 ## Python Predictor
 
 <!-- CORTEX_VERSION_BRANCH_STABLE x2 -->
diff --git a/docs/deployments/realtime-api/api-configuration.md b/docs/deployments/realtime-api/api-configuration.md
@@ -33,29 +33,29 @@ Reference the section below which corresponds to your Predictor type: [Python](#
   networking:
     endpoint: <string>  # the endpoint for the API (aws only) (default: <api_name>)
     local_port: <int>  # specify the port for API (local only) (default: 8888)
-    api_gateway: public | none  # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
+    api_gateway: public | none  # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
   compute:
     cpu: <string | int | float>  # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
     gpu: <int>  # GPU request per replica (default: 0)
-    inf: <int>  # Inferentia ASIC request per replica (default: 0)
+    inf: <int>  # Inferentia ASIC request per replica (default: 0) (aws only)
     mem: <string>  # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
   monitoring:  # (aws only)
     model_type: <string>  # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required)
     key: <string>  # the JSON key in the response payload of the value to monitor (required if the response payload is a JSON object)
-  autoscaling:  # (aws only)
-    min_replicas: <int>  # minimum number of replicas (default: 1)
-    max_replicas: <int>  # maximum number of replicas (default: 100)
-    init_replicas: <int>  # initial number of replicas (default: <min_replicas>)
-    target_replica_concurrency: <float>  # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
-    max_replica_concurrency: <int>  # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
-    window: <duration>  # the time over which to average the API's concurrency (default: 60s)
-    downscale_stabilization_period: <duration>  # the API will not scale below the highest recommendation made during this period (default: 5m)
-    upscale_stabilization_period: <duration>  # the API will not scale above the lowest recommendation made during this period (default: 1m)
-    max_downscale_factor: <float>  # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
-    max_upscale_factor: <float>  # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
-    downscale_tolerance: <float>  # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
-    upscale_tolerance: <float>  # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)
-  update_strategy:  # (aws only)
+  autoscaling:  # (aws and gcp only)
+    min_replicas: <int>  # minimum number of replicas (default: 1) (aws and gcp only)
+    max_replicas: <int>  # maximum number of replicas (default: 100) (aws and gcp only)
+    init_replicas: <int>  # initial number of replicas (default: <min_replicas>) (aws and gcp only)
+    target_replica_concurrency: <float>  # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
+    max_replica_concurrency: <int>  # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws only)
+    window: <duration>  # the time over which to average the API's concurrency (default: 60s) (aws only)
+    downscale_stabilization_period: <duration>  # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
+    upscale_stabilization_period: <duration>  # the API will not scale above the lowest recommendation made during this period (default: 1m) (aws only)
+    max_downscale_factor: <float>  # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) (aws only)
+    max_upscale_factor: <float>  # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
+    downscale_tolerance: <float>  # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
+    upscale_tolerance: <float>  # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
+  update_strategy:  # (aws and gcp only)
     max_surge: <string | int>  # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
     max_unavailable: <string | int>  # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
 ```
@@ -96,29 +96,29 @@ See additional documentation for [models](models.md), [parallelism](parallelism.
   networking:
     endpoint: <string>  # the endpoint for the API (aws only) (default: <api_name>)
     local_port: <int>  # specify the port for API (local only) (default: 8888)
-    api_gateway: public | none  # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
+    api_gateway: public | none  # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
   compute:
     cpu: <string | int | float>  # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
     gpu: <int>  # GPU request per replica (default: 0)
-    inf: <int>  # Inferentia ASIC request per replica (default: 0)
+    inf: <int>  # Inferentia ASIC request per replica (default: 0) (aws only)
     mem: <string>  # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
   monitoring:  # (aws only)
     model_type: <string>  # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required)
     key: <string>  # the JSON key in the response payload of the value to monitor (required if the response payload is a JSON object)
-  autoscaling:  # (aws only)
-    min_replicas: <int>  # minimum number of replicas (default: 1)
-    max_replicas: <int>  # maximum number of replicas (default: 100)
-    init_replicas: <int>  # initial number of replicas (default: <min_replicas>)
-    target_replica_concurrency: <float>  # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
-    max_replica_concurrency: <int>  # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
-    window: <duration>  # the time over which to average the API's concurrency (default: 60s)
-    downscale_stabilization_period: <duration>  # the API will not scale below the highest recommendation made during this period (default: 5m)
-    upscale_stabilization_period: <duration>  # the API will not scale above the lowest recommendation made during this period (default: 1m)
-    max_downscale_factor: <float>  # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
-    max_upscale_factor: <float>  # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
-    downscale_tolerance: <float>  # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
-    upscale_tolerance: <float>  # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)
-  update_strategy:  # (aws only)
+  autoscaling:  # (aws and gcp only)
+    min_replicas: <int>  # minimum number of replicas (default: 1) (aws and gcp only)
+    max_replicas: <int>  # maximum number of replicas (default: 100) (aws and gcp only)
+    init_replicas: <int>  # initial number of replicas (default: <min_replicas>) (aws and gcp only)
+    target_replica_concurrency: <float>  # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
+    max_replica_concurrency: <int>  # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws only)
+    window: <duration>  # the time over which to average the API's concurrency (default: 60s) (aws only)
+    downscale_stabilization_period: <duration>  # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
+    upscale_stabilization_period: <duration>  # the API will not scale above the lowest recommendation made during this period (default: 1m) (aws only)
+    max_downscale_factor: <float>  # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) (aws only)
+    max_upscale_factor: <float>  # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
+    downscale_tolerance: <float>  # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
+    upscale_tolerance: <float>  # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
+  update_strategy:  # (aws and gcp only)
     max_surge: <string | int>  # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
     max_unavailable: <string | int>  # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
 ```
@@ -153,28 +153,28 @@ See additional documentation for [models](models.md), [parallelism](parallelism.
   networking:
     endpoint: <string>  # the endpoint for the API (aws only) (default: <api_name>)
     local_port: <int>  # specify the port for API (local only) (default: 8888)
-    api_gateway: public | none  # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide)
+    api_gateway: public | none  # whether to create a public API Gateway endpoint for this API (if not, the API will still be accessible via the load balancer) (default: public, unless disabled cluster-wide) (aws only)
   compute:
     cpu: <string | int | float>  # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m)
     gpu: <int>  # GPU request per replica (default: 0)
     mem: <string>  # memory request per replica, e.g. 200Mi or 1Gi (default: Null)
   monitoring:  # (aws only)
     model_type: <string>  # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required)
     key: <string>  # the JSON key in the response payload of the value to monitor (required if the response payload is a JSON object)
-  autoscaling:  # (aws only)
-    min_replicas: <int>  # minimum number of replicas (default: 1)
-    max_replicas: <int>  # maximum number of replicas (default: 100)
-    init_replicas: <int>  # initial number of replicas (default: <min_replicas>)
-    target_replica_concurrency: <float>  # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
-    max_replica_concurrency: <int>  # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
-    window: <duration>  # the time over which to average the API's concurrency (default: 60s)
-    downscale_stabilization_period: <duration>  # the API will not scale below the highest recommendation made during this period (default: 5m)
-    upscale_stabilization_period: <duration>  # the API will not scale above the lowest recommendation made during this period (default: 1m)
-    max_downscale_factor: <float>  # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
-    max_upscale_factor: <float>  # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
-    downscale_tolerance: <float>  # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
-    upscale_tolerance: <float>  # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)
-  update_strategy:  # (aws only)
+  autoscaling:  # (aws and gcp only)
+    min_replicas: <int>  # minimum number of replicas (default: 1) (aws and gcp only)
+    max_replicas: <int>  # maximum number of replicas (default: 100) (aws and gcp only)
+    init_replicas: <int>  # initial number of replicas (default: <min_replicas>) (aws and gcp only)
+    target_replica_concurrency: <float>  # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) (aws only)
+    max_replica_concurrency: <int>  # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) (aws only)
+    window: <duration>  # the time over which to average the API's concurrency (default: 60s) (aws only)
+    downscale_stabilization_period: <duration>  # the API will not scale below the highest recommendation made during this period (default: 5m) (aws only)
+    upscale_stabilization_period: <duration>  # the API will not scale above the lowest recommendation made during this period (default: 1m) (aws only)
+    max_downscale_factor: <float>  # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) (aws only)
+    max_upscale_factor: <float>  # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) (aws only)
+    downscale_tolerance: <float>  # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) (aws only)
+    upscale_tolerance: <float>  # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) (aws only)
+  update_strategy:  # (aws and gcp only)
     max_surge: <string | int>  # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
     max_unavailable: <string | int>  # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
 ```