From a0d0c763ff9d1aae92be0ebdf7b011e5a88f2497 Mon Sep 17 00:00:00 2001
From: Dmitrii Cherkasov
Date: Tue, 20 May 2025 09:21:36 -0700
Subject: [PATCH] Update AQUA client documentation to Support
 predictWithResponseStream Endpoint

---
 .../large_language_model/aqua_client.rst    | 36 ++++++++++++++-----
 .../llamaindex_integration.rst              | 15 +++++---
 2 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/docs/source/user_guide/large_language_model/aqua_client.rst b/docs/source/user_guide/large_language_model/aqua_client.rst
index 4eec3eeb4..f91b48500 100644
--- a/docs/source/user_guide/large_language_model/aqua_client.rst
+++ b/docs/source/user_guide/large_language_model/aqua_client.rst
@@ -46,8 +46,7 @@ Sync Usage
     client = Client(endpoint="https:///predict")
     response = client.chat(
         messages=[{"role": "user", "content": "Tell me a joke."}],
-        payload={"model": "odsc-llm"},
-        stream=False,
+        payload={"model": "odsc-llm"}
     )
     print(response)
 
@@ -58,7 +57,7 @@ Sync Usage
     from ads.aqua import Client
     ads.set_auth(auth="security_token", profile="")
 
-    client = Client(endpoint="https:///predict")
+    client = Client(endpoint="https:///predictWithResponseStream")
     response = client.chat(
         messages=[{"role": "user", "content": "Tell me a joke."}],
         payload={"model": "odsc-llm"},
@@ -97,8 +96,7 @@ The following examples demonstrate how to perform the same operations using the
     client = AsyncClient(endpoint="https:///predict")
     response = await client.generate(
         prompt="Tell me a joke",
-        payload={"model": "odsc-llm"},
-        stream=False,
+        payload={"model": "odsc-llm"}
     )
     print(response)
 
@@ -109,7 +107,7 @@ The following examples demonstrate how to perform the same operations using the
     from ads.aqua import AsyncClient
     ads.set_auth(auth="security_token", profile="")
 
-    client = AsyncClient(endpoint="https:///predict")
+    client = AsyncClient(endpoint="https:///predictWithResponseStream")
     async for chunk in await client.generate(
         prompt="Tell me a joke",
         payload={"model": "odsc-llm"},
@@ -225,11 +223,33 @@ The synchronous client, ``OpenAI``, extends the OpenAI client. If no HTTP client
                 "content": "Tell me a joke.",
             }
         ],
-        # stream=True, # enable for streaming
     )
 
     print(response)
 
+**Streaming**
+
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
+
+.. code-block:: python
+
+    client = OpenAI(
+        base_url="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predictWithResponseStream/v1",
+    )
+
+    response = client.chat.completions.create(
+        model="odsc-llm",
+        messages=[
+            {
+                "role": "user",
+                "content": "Tell me a joke.",
+            }
+        ],
+        stream=True
+    )
+
+    for chunk in response:
+        print(chunk)
 
 **Asynchronous Client**
 
@@ -246,7 +266,7 @@ The asynchronous client, ``AsynOpenAI``, extends the AsyncOpenAI client. If no a
 
     async def test_async() -> None:
         client_async = AsyncOpenAI(
-            base_url="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predict/v1",
+            base_url="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predictWithResponseStream/v1",
         )
         response = await client_async.chat.completions.create(
             model="odsc-llm",
diff --git a/docs/source/user_guide/large_language_model/llamaindex_integration.rst b/docs/source/user_guide/large_language_model/llamaindex_integration.rst
index 9e7f6223d..2eaa48a86 100644
--- a/docs/source/user_guide/large_language_model/llamaindex_integration.rst
+++ b/docs/source/user_guide/large_language_model/llamaindex_integration.rst
@@ -82,6 +82,7 @@ Streaming
 
 Using ``stream_complete`` endpoint
 -------------------------------
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
 
 .. code-block:: python3
 
@@ -92,7 +93,7 @@ Using ``stream_complete`` endpoint
 
     llm = OCIDataScience(
         model="odsc-llm",
-        endpoint="https:///predict",
+        endpoint="https:///predictWithResponseStream",
     )
 
     for chunk in llm.stream_complete("Tell me a joke"):
@@ -101,6 +102,8 @@ Using ``stream_chat`` endpoint
 ----------------------------
 
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
+
 .. code-block:: python3
 
     import ads
     from llama_index.core.llms import ChatMessage
@@ -111,7 +114,7 @@ Using ``stream_chat`` endpoint
 
     llm = OCIDataScience(
         model="odsc-llm",
-        endpoint="https:///predict",
+        endpoint="https:///predictWithResponseStream",
     )
     response = llm.stream_chat(
         [
@@ -176,6 +179,8 @@ Async Streaming
 Using ``astream_complete`` endpoint
 ---------------------------------
 
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
+
 .. code-block:: python3
 
     import ads
@@ -185,7 +190,7 @@ Using ``astream_complete`` endpoint
 
     llm = OCIDataScience(
         model="odsc-llm",
-        endpoint="https:///predict",
+        endpoint="https:///predictWithResponseStream",
     )
 
     async for chunk in await llm.astream_complete("Tell me a joke"):
@@ -194,6 +199,8 @@ Using ``astream_chat`` endpoint
 -----------------------------
 
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
+
 .. code-block:: python3
 
     import ads
     from llama_index.core.llms import ChatMessage
@@ -204,7 +211,7 @@ Using ``astream_chat`` endpoint
 
     llm = OCIDataScience(
         model="odsc-llm",
-        endpoint="https:///predict",
+        endpoint="https:///predictWithResponseStream",
     )
     response = await llm.stream_chat(
         [