Commit 908f066

Extend existing OpenAI types and add support for streaming chat completion
1 parent 59c1644 commit 908f066

File tree

6 files changed: +484 / -113 lines changed


nemoguardrails/server/api.py

Lines changed: 121 additions & 6 deletions
@@ -24,10 +24,12 @@
 import uuid
 import warnings
 from contextlib import asynccontextmanager
-from typing import Any, Callable, List, Optional
+from typing import Any, AsyncIterator, Callable, List, Optional, Union

 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
+from openai.types.chat.chat_completion import ChatCompletion, Choice
+from openai.types.model import Model
 from pydantic import Field, root_validator, validator
 from starlette.responses import StreamingResponse
 from starlette.staticfiles import StaticFiles
@@ -36,10 +38,7 @@
 from nemoguardrails.rails.llm.options import GenerationOptions, GenerationResponse
 from nemoguardrails.server.datastore.datastore import DataStore
 from nemoguardrails.server.schemas.openai import (
-    Choice,
-    Model,
     ModelsResponse,
-    OpenAIRequestFields,
     ResponseBody,
 )
 from nemoguardrails.streaming import StreamingHandler
@@ -195,7 +194,7 @@ async def root_handler():
 app.single_config_id = None


-class RequestBody(OpenAIRequestFields):
+class RequestBody(ChatCompletion):
     config_id: Optional[str] = Field(
         default=os.getenv("DEFAULT_CONFIG_ID", None),
         description="The id of the configuration to be used. If not set, the default configuration will be used.",
@@ -213,6 +212,50 @@ class RequestBody(OpenAIRequestFields):
         max_length=255,
         description="The id of an existing thread to which the messages should be added.",
     )
+    model: Optional[str] = Field(
+        default=None,
+        description="The model used for the chat completion.",
+    )
+    id: Optional[str] = Field(
+        default=None,
+        description="The id of the chat completion.",
+    )
+    object: Optional[str] = Field(
+        default="chat.completion",
+        description="The object type, which is always chat.completion",
+    )
+    created: Optional[int] = Field(
+        default=None,
+        description="The Unix timestamp (in seconds) of when the chat completion was created.",
+    )
+    choices: Optional[List[Choice]] = Field(
+        default=None,
+        description="The list of choices for the chat completion.",
+    )
+    max_tokens: Optional[int] = Field(
+        default=None,
+        description="The maximum number of tokens to generate.",
+    )
+    temperature: Optional[float] = Field(
+        default=None,
+        description="The temperature to use for the chat completion.",
+    )
+    top_p: Optional[float] = Field(
+        default=None,
+        description="The top p to use for the chat completion.",
+    )
+    stop: Optional[Union[str, List[str]]] = Field(
+        default=None,
+        description="The stop sequences to use for the chat completion.",
+    )
+    presence_penalty: Optional[float] = Field(
+        default=None,
+        description="The presence penalty to use for the chat completion.",
+    )
+    frequency_penalty: Optional[float] = Field(
+        default=None,
+        description="The frequency penalty to use for the chat completion.",
+    )
     messages: Optional[List[dict]] = Field(
         default=None, description="The list of messages in the current conversation."
     )
@@ -392,6 +435,73 @@ def _get_rails(config_ids: List[str]) -> LLMRails:
     return llm_rails


+async def _format_streaming_response(
+    streaming_handler: StreamingHandler, model_name: Optional[str]
+) -> AsyncIterator[str]:
+    while True:
+        try:
+            chunk = await streaming_handler.__anext__()
+        except StopAsyncIteration:
+            # When the stream ends, yield the [DONE] message
+            yield "data: [DONE]\n\n"
+            break
+
+        # Determine the payload format based on chunk type
+        if isinstance(chunk, dict):
+            # If chunk is a dict, wrap it in OpenAI chunk format with delta
+            payload = {
+                "id": None,
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "model": model_name,
+                "choices": [
+                    {
+                        "delta": chunk,
+                        "index": None,
+                        "finish_reason": None,
+                    }
+                ],
+            }
+        elif isinstance(chunk, str):
+            try:
+                # Try parsing as JSON - if it parses, it might be a pre-formed payload
+                payload = json.loads(chunk)
+            except Exception:
+                # treat as plain text content token
+                payload = {
+                    "id": None,
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": model_name,
+                    "choices": [
+                        {
+                            "delta": {"content": chunk},
+                            "index": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+        else:
+            # For any other type, treat as plain content
+            payload = {
+                "id": None,
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "model": model_name,
+                "choices": [
+                    {
+                        "delta": {"content": str(chunk)},
+                        "index": None,
+                        "finish_reason": None,
+                    }
+                ],
+            }
+
+        # Send the payload as JSON
+        data = json.dumps(payload, ensure_ascii=False)
+        yield f"data: {data}\n\n"
+
+
 @app.post(
     "/v1/chat/completions",
     response_model=ResponseBody,
@@ -523,7 +633,12 @@ async def chat_completion(body: RequestBody, request: Request):
                 )
             )

-            return StreamingResponse(streaming_handler)
+            return StreamingResponse(
+                _format_streaming_response(
+                    streaming_handler, model_name=config_ids[0] if config_ids else None
+                ),
+                media_type="text/event-stream",
+            )
         else:
             res = await llm_rails.generate_async(
                 messages=messages, options=body.options, state=body.state
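
For reference, a minimal client-side sketch of consuming the new SSE stream exposed above. This is not part of the commit: the base URL and config id are placeholders, the stream flag is handled by existing server code outside this diff, and httpx is just one possible HTTP client.

import asyncio
import json

import httpx  # assumed client; any library that can read SSE lines works


async def stream_chat() -> None:
    # Request body mirrors the extended RequestBody: config_id, messages, stream.
    payload = {
        "config_id": "my_config",  # hypothetical config id
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": True,
    }
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        async with client.stream("POST", "/v1/chat/completions", json=payload) as resp:
            async for line in resp.aiter_lines():
                # Each event is a "data: <json>" line followed by a blank line.
                if not line.startswith("data: "):
                    continue
                data = line[len("data: "):]
                if data == "[DONE]":
                    break
                chunk = json.loads(data)
                delta = chunk["choices"][0]["delta"]
                print(delta.get("content", ""), end="", flush=True)


asyncio.run(stream_chat())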

nemoguardrails/server/schemas/openai.py

Lines changed: 7 additions & 101 deletions
@@ -15,96 +15,19 @@

 """OpenAI API schema definitions for the NeMo Guardrails server."""

-from typing import List, Optional, Union
+from typing import List, Optional

+from openai.types.chat.chat_completion import (
+    ChatCompletion,
+    Choice
+)
+from openai.types.model import Model
 from pydantic import BaseModel, Field


-class OpenAIRequestFields(BaseModel):
-    """OpenAI API request fields that can be mixed into other request schemas."""
-
-    # Standard OpenAI completion parameters
-    model: Optional[str] = Field(
-        default=None,
-        description="The model to use for chat completion. Maps to config_id for backward compatibility.",
-    )
-    max_tokens: Optional[int] = Field(
-        default=None,
-        description="The maximum number of tokens to generate.",
-    )
-    temperature: Optional[float] = Field(
-        default=None,
-        description="Sampling temperature to use.",
-    )
-    top_p: Optional[float] = Field(
-        default=None,
-        description="Top-p sampling parameter.",
-    )
-    stop: Optional[Union[str, List[str]]] = Field(
-        default=None,
-        description="Stop sequences.",
-    )
-    presence_penalty: Optional[float] = Field(
-        default=None,
-        description="Presence penalty parameter.",
-    )
-    frequency_penalty: Optional[float] = Field(
-        default=None,
-        description="Frequency penalty parameter.",
-    )
-    function_call: Optional[dict] = Field(
-        default=None,
-        description="Function call parameter.",
-    )
-    logit_bias: Optional[dict] = Field(
-        default=None,
-        description="Logit bias parameter.",
-    )
-    log_probs: Optional[bool] = Field(
-        default=None,
-        description="Log probabilities parameter.",
-    )
-
-
-class Choice(BaseModel):
-    """OpenAI API choice structure in chat completion responses."""
-
-    index: Optional[int] = Field(
-        default=None, description="The index of the choice in the list of choices."
-    )
-    message: Optional[dict] = Field(
-        default=None, description="The message of the choice"
-    )
-    logprobs: Optional[dict] = Field(
-        default=None, description="The log probabilities of the choice"
-    )
-    finish_reason: Optional[str] = Field(
-        default=None, description="The reason the model stopped generating tokens."
-    )
-
-
-class ResponseBody(BaseModel):
+class ResponseBody(ChatCompletion):
     """OpenAI API response body with NeMo-Guardrails extensions."""

-    # OpenAI API fields
-    id: Optional[str] = Field(
-        default=None, description="A unique identifier for the chat completion."
-    )
-    object: str = Field(
-        default="chat.completion",
-        description="The object type, which is always chat.completion",
-    )
-    created: Optional[int] = Field(
-        default=None,
-        description="The Unix timestamp (in seconds) of when the chat completion was created.",
-    )
-    model: Optional[str] = Field(
-        default=None, description="The model used for the chat completion."
-    )
-    choices: Optional[List[Choice]] = Field(
-        default=None, description="A list of chat completion choices."
-    )
-    # NeMo-Guardrails specific fields for backward compatibility
     state: Optional[dict] = Field(
         default=None, description="State object for continuing the conversation."
     )
@@ -117,23 +40,6 @@ class ResponseBody(BaseModel):
     log: Optional[dict] = Field(default=None, description="Generation log data.")


-class Model(BaseModel):
-    """OpenAI API model representation."""
-
-    id: str = Field(
-        description="The model identifier, which can be referenced in the API endpoints."
-    )
-    object: str = Field(
-        default="model", description="The object type, which is always 'model'."
-    )
-    created: int = Field(
-        description="The Unix timestamp (in seconds) of when the model was created."
-    )
-    owned_by: str = Field(
-        default="nemo-guardrails", description="The organization that owns the model."
-    )
-
-
 class ModelsResponse(BaseModel):
     """OpenAI API models list response."""
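
Since ResponseBody now subclasses the official ChatCompletion model, responses can be built and serialized with the openai-python types directly. A minimal sketch, assuming the openai package's ChatCompletion/Choice/ChatCompletionMessage signatures; the id, model, and content values are placeholders, and the guardrails-specific fields (state, llm_output, output_data, log) remain optional extensions:

import time

from openai.types.chat import ChatCompletionMessage
from openai.types.chat.chat_completion import Choice

from nemoguardrails.server.schemas.openai import ResponseBody

response = ResponseBody(
    id="chatcmpl-123",  # placeholder id
    object="chat.completion",
    created=int(time.time()),
    model="my_config",  # config id used as the model name, mirroring _format_streaming_response
    choices=[
        Choice(
            index=0,
            message=ChatCompletionMessage(role="assistant", content="Hello!"),
            finish_reason="stop",
        )
    ],
    log=None,  # guardrails extension fields stay optional
)
print(response.model_dump_json(exclude_none=True))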

nemoguardrails/streaming.py

Lines changed: 51 additions & 4 deletions
@@ -174,18 +174,39 @@ async def __anext__(self):

     async def _process(
         self,
-        chunk: Union[str, object],
+        chunk: Union[str, dict, object],
         generation_info: Optional[Dict[str, Any]] = None,
     ):
-        """Process a chunk of text.
+        """Process a chunk of text or dict.

         If we're in buffering mode, record the text.
         Otherwise, update the full completion, check for stop tokens, and enqueue the chunk.
+        Dict chunks bypass completion tracking and go directly to the queue.
         """

         if self.include_generation_metadata and generation_info:
             self.current_generation_info = generation_info

+        # Dict chunks bypass buffering and completion tracking
+        if isinstance(chunk, dict):
+            if self.pipe_to:
+                asyncio.create_task(self.pipe_to.push_chunk(chunk))
+            else:
+                if self.include_generation_metadata:
+                    await self.queue.put(
+                        {
+                            "text": chunk,
+                            "generation_info": (
+                                self.current_generation_info.copy()
+                                if self.current_generation_info
+                                else {}
+                            ),
+                        }
+                    )
+                else:
+                    await self.queue.put(chunk)
+            return
+
         if self.enable_buffer:
             if chunk is not END_OF_STREAM:
                 self.buffer += chunk if chunk is not None else ""
@@ -259,10 +280,28 @@ async def _process(

     async def push_chunk(
         self,
-        chunk: Union[str, GenerationChunk, AIMessageChunk, ChatGenerationChunk, None],
+        chunk: Union[
+            str,
+            dict,
+            GenerationChunk,
+            AIMessageChunk,
+            ChatGenerationChunk,
+            None,
+            object,
+        ],
         generation_info: Optional[Dict[str, Any]] = None,
     ):
-        """Push a new chunk to the stream."""
+        """Push a new chunk to the stream.
+
+        Args:
+            chunk: The chunk to push. Can be:
+                - str: Plain text content
+                - dict: Dictionary with fields like role, content, etc.
+                - GenerationChunk/AIMessageChunk/ChatGenerationChunk: LangChain chunk types
+                - None: Signals end of stream (converted to END_OF_STREAM)
+                - object: END_OF_STREAM sentinel
+            generation_info: Optional metadata about the generation
+        """

         # if generation_info is not explicitly passed,
         # try to get it from the chunk itself if it's a GenerationChunk or ChatGenerationChunk
@@ -288,6 +327,9 @@ async def push_chunk(
         elif isinstance(chunk, str):
             # empty string is a valid chunk and should be processed normally
             pass
+        elif isinstance(chunk, dict):
+            # plain dict chunks are allowed (e.g., for OpenAI-compatible streaming)
+            pass
         else:
             raise Exception(f"Unsupported chunk type: {chunk.__class__.__name__}")

@@ -298,6 +340,11 @@ async def push_chunk(
         if self.include_generation_metadata and generation_info:
             self.current_generation_info = generation_info

+        # Dict chunks bypass prefix/suffix processing and go directly to _process
+        if isinstance(chunk, dict):
+            await self._process(chunk, generation_info)
+            return
+
         # Process prefix: accumulate until the expected prefix is received, then remove it.
         if self.prefix:
             if chunk is not None and chunk is not END_OF_STREAM:
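
A minimal sketch of exercising the new dict support on StreamingHandler directly, outside the server. It assumes the handler can be constructed with default arguments and iterated with async for, and that pushing None terminates the stream as described in the docstring above:

import asyncio

from nemoguardrails.streaming import StreamingHandler


async def main() -> None:
    handler = StreamingHandler()

    async def produce() -> None:
        # Dict chunks skip prefix/suffix handling and completion tracking.
        await handler.push_chunk({"role": "assistant", "content": "Hello"})
        await handler.push_chunk({"content": ", world!"})
        await handler.push_chunk(None)  # converted to END_OF_STREAM

    async def consume() -> None:
        # Dict chunks come back from the queue unchanged.
        async for chunk in handler:
            print(chunk)

    await asyncio.gather(produce(), consume())


asyncio.run(main())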
