From a0d0c763ff9d1aae92be0ebdf7b011e5a88f2497 Mon Sep 17 00:00:00 2001
From: Dmitrii Cherkasov
Date: Tue, 20 May 2025 09:21:36 -0700
Subject: [PATCH] Update AQUA client documentation to Support
 predictWithResponseStream Endpoint

---
 .../large_language_model/aqua_client.rst    | 36 ++++++++++++++-----
 .../llamaindex_integration.rst              | 15 +++++---
 2 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/docs/source/user_guide/large_language_model/aqua_client.rst b/docs/source/user_guide/large_language_model/aqua_client.rst
index 4eec3eeb4..f91b48500 100644
--- a/docs/source/user_guide/large_language_model/aqua_client.rst
+++ b/docs/source/user_guide/large_language_model/aqua_client.rst
@@ -46,8 +46,7 @@ Sync Usage
     client = Client(endpoint="https:///predict")
     response = client.chat(
         messages=[{"role": "user", "content": "Tell me a joke."}],
-        payload={"model": "odsc-llm"},
-        stream=False,
+        payload={"model": "odsc-llm"}
     )
     print(response)
 
@@ -58,7 +57,7 @@ Sync Usage
     from ads.aqua import Client
     ads.set_auth(auth="security_token", profile="")
 
-    client = Client(endpoint="https:///predict")
+    client = Client(endpoint="https:///predictWithResponseStream")
     response = client.chat(
         messages=[{"role": "user", "content": "Tell me a joke."}],
         payload={"model": "odsc-llm"},
@@ -97,8 +96,7 @@ The following examples demonstrate how to perform the same operations using the
     client = AsyncClient(endpoint="https:///predict")
     response = await client.generate(
         prompt="Tell me a joke",
-        payload={"model": "odsc-llm"},
-        stream=False,
+        payload={"model": "odsc-llm"}
     )
     print(response)
 
@@ -109,7 +107,7 @@ The following examples demonstrate how to perform the same operations using the
     from ads.aqua import AsyncClient
     ads.set_auth(auth="security_token", profile="")
 
-    client = AsyncClient(endpoint="https:///predict")
+    client = AsyncClient(endpoint="https:///predictWithResponseStream")
     async for chunk in await client.generate(
         prompt="Tell me a joke",
         payload={"model": "odsc-llm"},
@@ -225,11 +223,33 @@ The synchronous client, ``OpenAI``, extends the OpenAI client. If no HTTP client
                 "content": "Tell me a joke.",
             }
         ],
-        # stream=True, # enable for streaming
     )
 
     print(response)
 
+**Streaming**
+
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
+
+.. code-block:: python
+
+    client = OpenAI(
+        base_url="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predictWithResponseStream/v1",
+    )
+
+    response = client.chat.completions.create(
+        model="odsc-llm",
+        messages=[
+            {
+                "role": "user",
+                "content": "Tell me a joke.",
+            }
+        ],
+        stream=True
+    )
+
+    for chunk in response:
+        print(chunk)
 
 **Asynchronous Client**
 
@@ -246,7 +266,7 @@ The asynchronous client, ``AsynOpenAI``, extends the AsyncOpenAI client. If no a
 
     async def test_async() -> None:
         client_async = AsyncOpenAI(
-            base_url="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predict/v1",
+            base_url="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predictWithResponseStream/v1",
         )
         response = await client_async.chat.completions.create(
             model="odsc-llm",
diff --git a/docs/source/user_guide/large_language_model/llamaindex_integration.rst b/docs/source/user_guide/large_language_model/llamaindex_integration.rst
index 9e7f6223d..2eaa48a86 100644
--- a/docs/source/user_guide/large_language_model/llamaindex_integration.rst
+++ b/docs/source/user_guide/large_language_model/llamaindex_integration.rst
@@ -82,6 +82,7 @@ Streaming
 
 Using ``stream_complete`` endpoint
 -------------------------------
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
 
 .. code-block:: python3
 
@@ -92,7 +93,7 @@ Using ``stream_complete`` endpoint
 
     llm = OCIDataScience(
         model="odsc-llm",
-        endpoint="https:///predict",
+        endpoint="https:///predictWithResponseStream",
     )
 
     for chunk in llm.stream_complete("Tell me a joke"):
@@ -101,6 +102,8 @@ Using ``stream_chat`` endpoint
 ----------------------------
 
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
+
 .. code-block:: python3
 
     import ads
     from llama_index.core.llms import ChatMessage
@@ -111,7 +114,7 @@ Using ``stream_chat`` endpoint
 
     llm = OCIDataScience(
         model="odsc-llm",
-        endpoint="https:///predict",
+        endpoint="https:///predictWithResponseStream",
     )
     response = llm.stream_chat(
         [
@@ -176,6 +179,8 @@ Async Streaming
 Using ``astream_complete`` endpoint
 ---------------------------------
 
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
+
 .. code-block:: python3
 
     import ads
@@ -185,7 +190,7 @@ Using ``astream_complete`` endpoint
 
     llm = OCIDataScience(
         model="odsc-llm",
-        endpoint="https:///predict",
+        endpoint="https:///predictWithResponseStream",
     )
 
     async for chunk in await llm.astream_complete("Tell me a joke"):
@@ -194,6 +199,8 @@ Using ``astream_chat`` endpoint
 -----------------------------
 
+For streaming, a dedicated endpoint must be used: ``/predictWithResponseStream``.
+
 .. code-block:: python3
 
     import ads
     from llama_index.core.llms import ChatMessage
@@ -204,7 +211,7 @@ Using ``astream_chat`` endpoint
 
     llm = OCIDataScience(
         model="odsc-llm",
-        endpoint="https:///predict",
+        endpoint="https:///predictWithResponseStream",
     )
     response = await llm.stream_chat(
         [