Commit bfe81f1

feat: implement parallel streaming output rails execution (#1263)
* feat: implement parallel streaming output rails execution

  - Add _run_output_rails_in_parallel_streaming method to run output rails concurrently
  - Use asyncio tasks to execute multiple rails simultaneously during streaming
  - Implement early termination when any rail blocks content to optimize performance
  - Register the new action in the runtime dispatcher
  - Add proper error handling and cancellation for robust parallel execution
  - Avoid full flow state management issues that can occur with hide_prev_turn logic during streaming
  - Add comprehensive tests for parallel streaming functionality

* rename result to is_blocked
1 parent 0d6fa42 commit bfe81f1

File tree: 5 files changed, +1457 −41 lines changed

nemoguardrails/colang/runtime.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -37,6 +37,12 @@ def __init__(self, config: RailsConfig, verbose: bool = False):
             import_paths=list(config.imported_paths.values()),
         )
 
+        if hasattr(self, "_run_output_rails_in_parallel_streaming"):
+            self.action_dispatcher.register_action(
+                self._run_output_rails_in_parallel_streaming,
+                name="run_output_rails_in_parallel_streaming",
+            )
+
         # The list of additional parameters that can be passed to the actions.
         self.registered_action_params: dict = {}
```
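
The `hasattr` guard keeps the base `Runtime` agnostic: only subclasses that actually implement `_run_output_rails_in_parallel_streaming` (the Colang 1.0 runtime in this commit) get the action registered. A minimal sketch of the pattern, using a stripped-down stand-in for the real `ActionDispatcher`:

```python
# Stand-in dispatcher for illustration only; not the actual
# nemoguardrails ActionDispatcher implementation.
class MiniDispatcher:
    def __init__(self):
        self._registry = {}

    def register_action(self, fn, name=None):
        # Actions are resolved by name at execution time.
        self._registry[name or fn.__name__] = fn


class MiniRuntime:
    def __init__(self):
        self.action_dispatcher = MiniDispatcher()
        # Register only when this runtime provides the implementation;
        # runtimes without the method simply skip registration.
        if hasattr(self, "_run_output_rails_in_parallel_streaming"):
            self.action_dispatcher.register_action(
                self._run_output_rails_in_parallel_streaming,
                name="run_output_rails_in_parallel_streaming",
            )
```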

nemoguardrails/colang/v1_0/runtime/runtime.py

Lines changed: 90 additions & 4 deletions
```diff
@@ -12,10 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import asyncio
 import inspect
 import logging
-import uuid
 from textwrap import indent
 from time import time
 from typing import Any, Dict, List, Optional, Tuple
@@ -25,10 +24,13 @@
 from langchain.chains.base import Chain
 
 from nemoguardrails.actions.actions import ActionResult
+from nemoguardrails.actions.output_mapping import is_output_blocked
 from nemoguardrails.colang import parse_colang_file
 from nemoguardrails.colang.runtime import Runtime
 from nemoguardrails.colang.v1_0.runtime.flows import (
     FlowConfig,
+    _get_flow_params,
+    _normalize_flow_id,
     compute_context,
     compute_next_steps,
 )
```
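
The newly imported `is_output_blocked` translates an action's raw return value into a block/allow decision via the action's registered output mapping. A hedged sketch of the convention (the `output_mapping` argument mirrors the `@action` decorator's mapping mechanism; the rail itself and its policy are purely illustrative):

```python
from typing import Optional

from nemoguardrails.actions import action


# Illustrative rail: output_mapping receives the action's return value and
# yields True when the content should be treated as BLOCKED. A self-check
# style action that returns True for "allowed" therefore maps via negation.
@action(is_system_action=True, output_mapping=lambda result: result is False)
async def illustrative_check_output(context: Optional[dict] = None) -> bool:
    bot_message = (context or {}).get("bot_message", "")
    # Toy policy for the sketch: block if a placeholder marker appears.
    return "forbidden" not in bot_message.lower()
```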
```diff
@@ -259,6 +261,89 @@ def _internal_error_action_result(message: str):
             ]
         )
 
+    async def _run_output_rails_in_parallel_streaming(
+        self, flows_with_params: Dict[str, dict], events: List[dict]
+    ) -> ActionResult:
+        """Run the output rails in parallel for streaming chunks.
+
+        This is a streamlined version that avoids the full flow state management
+        which can cause issues with hide_prev_turn logic during streaming.
+
+        Args:
+            flows_with_params: Dictionary mapping flow_id to {"action_name": str, "params": dict}
+            events: The events list for context
+        """
+        tasks = []
+
+        async def run_single_rail(flow_id: str, action_info: dict) -> tuple:
+            """Run a single rail flow and return (flow_id, result)"""
+
+            try:
+                action_name = action_info["action_name"]
+                params = action_info["params"]
+
+                result_tuple = await self.action_dispatcher.execute_action(
+                    action_name, params
+                )
+                result, status = result_tuple
+
+                if status != "success":
+                    log.error(f"Action {action_name} failed with status: {status}")
+                    return flow_id, False  # Allow on failure
+
+                action_func = self.action_dispatcher.get_action(action_name)
+
+                # use the mapping to decide if the result indicates blocked content.
+                # True means blocked, False means allowed
+                result = is_output_blocked(result, action_func)
+
+                return flow_id, result
+
+            except Exception as e:
+                log.error(f"Error executing rail {flow_id}: {e}")
+                return flow_id, False  # Allow on error
+
+        # create tasks for all flows
+        for flow_id, action_info in flows_with_params.items():
+            task = asyncio.create_task(run_single_rail(flow_id, action_info))
+            tasks.append(task)
+
+        stopped_events = []
+
+        try:
+            for future in asyncio.as_completed(tasks):
+                try:
+                    flow_id, is_blocked = await future
+
+                    # check if this rail blocked the content
+                    if is_blocked:
+                        # create stop events
+                        stopped_events = [
+                            {
+                                "type": "BotIntent",
+                                "intent": "stop",
+                                "flow_id": flow_id,
+                            }
+                        ]
+
+                        # cancel remaining tasks
+                        for pending_task in tasks:
+                            if not pending_task.done():
+                                pending_task.cancel()
+                        break
+
+                except asyncio.CancelledError:
+                    pass
+                except Exception as e:
+                    log.error(f"Error in parallel rail task: {e}")
+                    continue
+
+        except Exception as e:
+            log.error(f"Error in parallel rail execution: {e}")
+            return ActionResult(events=[])
+
+        return ActionResult(events=stopped_events)
+
     async def _process_start_action(self, events: List[dict]) -> List[dict]:
         """
         Start the specified action, wait for it to finish, and post back the result.
```
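
The concurrency core of the method is a fan-out / first-block-wins pattern: every rail becomes a task, results are consumed in completion order, and the first blocking verdict cancels the stragglers. A self-contained sketch of the same pattern with dummy rails (names and timings are illustrative, not actual guardrails flows):

```python
import asyncio
from typing import List, Optional, Tuple


async def dummy_rail(flow_id: str, delay: float, blocks: bool) -> Tuple[str, bool]:
    await asyncio.sleep(delay)  # stand-in for an LLM or moderation API call
    return flow_id, blocks


async def first_block_wins(rails: List[tuple]) -> Optional[str]:
    tasks = [asyncio.create_task(dummy_rail(*rail)) for rail in rails]
    for future in asyncio.as_completed(tasks):
        flow_id, is_blocked = await future
        if is_blocked:
            # First blocking verdict wins: cancel the rails still running.
            for task in tasks:
                if not task.done():
                    task.cancel()
            return flow_id
    return None


blocked = asyncio.run(
    first_block_wins(
        [("self check output", 0.3, False), ("content safety", 0.1, True)]
    )
)
print(blocked)  # "content safety", after ~0.1s instead of the ~0.3s worst case
```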
```diff
@@ -458,8 +543,9 @@ async def _get_action_resp(
                     )
 
                     resp = await resp.json()
-                    result, status = resp.get("result", result), resp.get(
-                        "status", status
+                    result, status = (
+                        resp.get("result", result),
+                        resp.get("status", status),
                     )
             except Exception as e:
                 log.info(f"Exception {e} while making request to {action_name}")
```

nemoguardrails/rails/llm/config.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -455,6 +455,11 @@ class OutputRailsStreamingConfig(BaseModel):
 class OutputRails(BaseModel):
     """Configuration of output rails."""
 
+    parallel: Optional[bool] = Field(
+        default=False,
+        description="If True, the output rails are executed in parallel.",
+    )
+
     flows: List[str] = Field(
         default_factory=list,
         description="The names of all the flows that implement output rails.",
```

nemoguardrails/rails/llm/llmrails.py

Lines changed: 135 additions & 37 deletions
```diff
@@ -66,7 +66,7 @@
 from nemoguardrails.logging.verbose import set_verbose
 from nemoguardrails.patch_asyncio import check_sync_call_from_async_loop
 from nemoguardrails.rails.llm.buffer import get_buffer_strategy
-from nemoguardrails.rails.llm.config import EmbeddingSearchProvider, Model, RailsConfig
+from nemoguardrails.rails.llm.config import EmbeddingSearchProvider, RailsConfig
 from nemoguardrails.rails.llm.options import (
     GenerationLog,
     GenerationOptions,
```
```diff
@@ -1351,6 +1351,32 @@ def _get_latest_user_message(
                 return message
         return {}
 
+        def _prepare_context_for_parallel_rails(
+            chunk_str: str,
+            prompt: Optional[str] = None,
+            messages: Optional[List[dict]] = None,
+        ) -> dict:
+            """Prepare context for parallel rails execution."""
+            context_message = _get_last_context_message(messages)
+            user_message = prompt or _get_latest_user_message(messages)
+
+            context = {
+                "user_message": user_message,
+                "bot_message": chunk_str,
+            }
+
+            if context_message:
+                context.update(context_message["content"])
+
+            return context
+
+        def _create_events_for_chunk(chunk_str: str, context: dict) -> List[dict]:
+            """Create events for running output rails on a chunk."""
+            return [
+                {"type": "ContextUpdate", "data": context},
+                {"type": "BotMessage", "text": chunk_str},
+            ]
+
         def _prepare_params(
             flow_id: str,
             action_name: str,
```
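
For a single buffered chunk, the two helpers above produce a small context dict plus the two events the rails run against. Roughly (values are hypothetical; the exact keys depend on the conversation state):

```python
# Illustrative output of the helpers for one chunk.
context = {
    "user_message": "Tell me about your services.",  # prompt or last user msg
    "bot_message": "We offer a range of consulting",  # the current chunk
    # ...plus any keys merged in from a trailing "context" message
}
events = [
    {"type": "ContextUpdate", "data": context},
    {"type": "BotMessage", "text": "We offer a range of consulting"},
]
```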
```diff
@@ -1404,6 +1430,8 @@ def _prepare_params(
             _get_action_details_from_flow_id, flows=self.config.flows
         )
 
+        parallel_mode = getattr(self.config.rails.output, "parallel", False)
+
         async for chunk_batch in buffer_strategy(streaming_handler):
             user_output_chunks = chunk_batch.user_output_chunks
             # format processing_context for output rails processing (needs full context)
```
```diff
@@ -1427,48 +1455,118 @@ def _prepare_params(
             for chunk in user_output_chunks:
                 yield chunk
 
-            for flow_id in output_rails_flows_id:
-                action_name, action_params = get_action_details(flow_id)
+            if parallel_mode:
+                try:
+                    context = _prepare_context_for_parallel_rails(
+                        bot_response_chunk, prompt, messages
+                    )
+                    events = _create_events_for_chunk(bot_response_chunk, context)
+
+                    flows_with_params = {}
+                    for flow_id in output_rails_flows_id:
+                        action_name, action_params = get_action_details(flow_id)
+                        params = _prepare_params(
+                            flow_id=flow_id,
+                            action_name=action_name,
+                            bot_response_chunk=bot_response_chunk,
+                            prompt=prompt,
+                            messages=messages,
+                            action_params=action_params,
+                        )
+                        flows_with_params[flow_id] = {
+                            "action_name": action_name,
+                            "params": params,
+                        }
+
+                    result_tuple = await self.runtime.action_dispatcher.execute_action(
+                        "run_output_rails_in_parallel_streaming",
+                        {
+                            "flows_with_params": flows_with_params,
+                            "events": events,
+                        },
+                    )
 
-                params = _prepare_params(
-                    flow_id=flow_id,
-                    action_name=action_name,
-                    bot_response_chunk=bot_response_chunk,
-                    prompt=prompt,
-                    messages=messages,
-                    action_params=action_params,
-                )
+                    # ActionDispatcher.execute_action always returns (result, status)
+                    result, status = result_tuple
 
-                result = await self.runtime.action_dispatcher.execute_action(
-                    action_name, params
-                )
+                    if status != "success":
+                        log.error(
+                            f"Parallel rails execution failed with status: {status}"
+                        )
+                        # continue processing the chunk even if rails fail
+                        pass
+                    else:
+                        # if there are any stop events, content was blocked
+                        if result.events:
+                            # extract the blocked flow from the first stop event
+                            blocked_flow = result.events[0].get(
+                                "flow_id", "output rails"
+                            )
+
+                            reason = f"Blocked by {blocked_flow} rails."
+                            error_data = {
+                                "error": {
+                                    "message": reason,
+                                    "type": "guardrails_violation",
+                                    "param": blocked_flow,
+                                    "code": "content_blocked",
+                                }
+                            }
+                            yield json.dumps(error_data)
+                            return
+
+                except Exception as e:
+                    log.error(f"Error in parallel rail execution: {e}")
+                    # don't block the stream for rail execution errors
+                    # continue processing the chunk
+                    pass
+
+                # update explain info for parallel mode
                 self.explain_info = self._ensure_explain_info()
 
-                action_func = self.runtime.action_dispatcher.get_action(action_name)
-
-                # Use the mapping to decide if the result indicates blocked content.
-                if is_output_blocked(result, action_func):
-                    reason = f"Blocked by {flow_id} rails."
-
-                    # return the error as a plain JSON string (not in SSE format)
-                    # NOTE: When integrating with the OpenAI Python client, the server code should:
-                    # 1. detect this JSON error object in the stream
-                    # 2. terminate the stream
-                    # 3. format the error following OpenAI's SSE format
-                    # the OpenAI client will then properly raise an APIError with this error message
-
-                    error_data = {
-                        "error": {
-                            "message": reason,
-                            "type": "guardrails_violation",
-                            "param": flow_id,
-                            "code": "content_blocked",
-                        }
-                    }
-
-                    # return as plain JSON: the server should detect this JSON and convert it to an HTTP error
-                    yield json.dumps(error_data)
-                    return
+            else:
+                for flow_id in output_rails_flows_id:
+                    action_name, action_params = get_action_details(flow_id)
+
+                    params = _prepare_params(
+                        flow_id=flow_id,
+                        action_name=action_name,
+                        bot_response_chunk=bot_response_chunk,
+                        prompt=prompt,
+                        messages=messages,
+                        action_params=action_params,
+                    )
+
+                    result = await self.runtime.action_dispatcher.execute_action(
+                        action_name, params
+                    )
+                    self.explain_info = self._ensure_explain_info()
+
+                    action_func = self.runtime.action_dispatcher.get_action(action_name)
+
+                    # Use the mapping to decide if the result indicates blocked content.
+                    if is_output_blocked(result, action_func):
+                        reason = f"Blocked by {flow_id} rails."
+
+                        # return the error as a plain JSON string (not in SSE format)
+                        # NOTE: When integrating with the OpenAI Python client, the server code should:
+                        # 1. detect this JSON error object in the stream
+                        # 2. terminate the stream
+                        # 3. format the error following OpenAI's SSE format
+                        # the OpenAI client will then properly raise an APIError with this error message
+
+                        error_data = {
+                            "error": {
+                                "message": reason,
+                                "type": "guardrails_violation",
+                                "param": flow_id,
+                                "code": "content_blocked",
+                            }
+                        }
+
+                        # return as plain JSON: the server should detect this JSON and convert it to an HTTP error
+                        yield json.dumps(error_data)
+                        return
 
             if not stream_first:
                 # yield the individual chunks directly from the buffer strategy
```
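
As the NOTE comments indicate, a blocked stream ends with a plain-JSON error object rather than an SSE frame, so the serving layer is expected to detect it. A hedged sketch of that detection on the consumer side (`stream` is any async iterator of chunk strings, e.g. what `stream_async` yields; how the error is surfaced is up to the server):

```python
import json


async def forward_stream(stream):
    async for chunk in stream:
        try:
            maybe_error = json.loads(chunk)
        except (json.JSONDecodeError, TypeError):
            maybe_error = None

        if isinstance(maybe_error, dict) and "error" in maybe_error:
            # Terminate the stream and surface the error in the format the
            # client expects (e.g. an OpenAI-style SSE error frame).
            raise RuntimeError(maybe_error["error"]["message"])

        yield chunk
```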
