Schedule post predict in same threadpool as predict (#1367)

vishalbollu · web-flow · commit 06f6d7dd2f05 · 2020-09-23T15:07:56.000-04:00
diff --git a/docs/deployments/realtime-api/predictors.md b/docs/deployments/realtime-api/predictors.md
@@ -86,6 +86,10 @@ class PythonPredictor:
         Useful for tasks that the client doesn't need to wait on before
         receiving a response such as recording metrics or storing results.
 
+        Note: post_predict() and predict() run in the same thread pool, so
+        long-running post_predict() calls can delay predictions. Increase the
+        pool size via `threads_per_process` in the API configuration YAML.
+
         Args:
             response (optional): The response as returned by the predict method.
             payload (optional): The request payload (see below for the possible
@@ -245,6 +249,10 @@ class TensorFlowPredictor:
         Useful for tasks that the client doesn't need to wait on before
         receiving a response such as recording metrics or storing results.
 
+        Note: post_predict() and predict() run in the same thread pool, so
+        long-running post_predict() calls can delay predictions. Increase the
+        pool size via `threads_per_process` in the API configuration YAML.
+
         Args:
             response (optional): The response as returned by the predict method.
             payload (optional): The request payload (see below for the possible
@@ -353,6 +361,10 @@ class ONNXPredictor:
         Useful for tasks that the client doesn't need to wait on before
         receiving a response such as recording metrics or storing results.
 
+        Note: post_predict() and predict() run in the same thread pool, so
+        long-running post_predict() calls can delay predictions. Increase the
+        pool size via `threads_per_process` in the API configuration YAML.
+
         Args:
             response (optional): The response as returned by the predict method.
             payload (optional): The request payload (see below for the possible
diff --git a/pkg/workloads/cortex/serve/serve.py b/pkg/workloads/cortex/serve/serve.py
@@ -214,7 +214,7 @@ def predict(request: Request):
 
     if util.has_method(predictor_impl, "post_predict"):
         kwargs = build_post_predict_kwargs(prediction, request)
-        tasks.add_task(predictor_impl.post_predict, **kwargs)
+        request_thread_pool.submit(predictor_impl.post_predict, **kwargs)
 
     if len(tasks.tasks) > 0:
         response.background = tasks