From 4cdebea3bc23956cf6130b67fad5f20e31f52682 Mon Sep 17 00:00:00 2001
From: Bill Chen
Date: Sun, 20 Apr 2025 18:59:19 -0700
Subject: [PATCH 1/2] Change create_response to use the Responses API from the
 OpenAI SDK

---
 .gitignore         |  3 +-
 agent/agent.py     | 71 ++++++++++++++++++++++++++++--------
 requirements.txt   |  1 +
 simple_cua_loop.py | 47 +++++++++++++++++++-----
 utils.py           | 89 +++++++++++++++++++++++++++++++++++-----------
 5 files changed, 165 insertions(+), 46 deletions(-)

diff --git a/.gitignore b/.gitignore
index e58b6e1..80b2f0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 __pycache__/
 .env
-.venv/
\ No newline at end of file
+.venv/
+cua_doc.pdf
\ No newline at end of file
diff --git a/agent/agent.py b/agent/agent.py
index 2514c24..4457e5e 100644
--- a/agent/agent.py
+++ b/agent/agent.py
@@ -116,26 +116,67 @@ def run_full_turn(
         self.print_steps = print_steps
         self.debug = debug
         self.show_images = show_images
-        new_items = []
-
-        # keep looping until we get a final response
-        while new_items[-1].get("role") != "assistant" if new_items else True:
-            self.debug_print([sanitize_message(msg) for msg in input_items + new_items])
-
-            response = create_response(
+        transcript: list[dict] = []  # keep for debugging/return
+        pending: list[dict] = input_items.copy()  # start with user/system messages
+        previous_response_id = None
+
+        while True:
+            self.debug_print([sanitize_message(msg) for msg in transcript + pending])
+
+            # Deduplicate by id within this batch (the API rejects duplicates).
+            seen_ids: set[str] = set()
+            payload: list[dict] = []
+            for m in pending:
+                mid = m.get("id") if isinstance(m, dict) else None
+                if mid and mid in seen_ids:
+                    continue
+                if mid:
+                    seen_ids.add(mid)
+                payload.append(m)
+
+            req = dict(
                 model=self.model,
-                input=input_items + new_items,
+                input=payload,
                 tools=self.tools,
                 truncation="auto",
             )
+            if previous_response_id:
+                req["previous_response_id"] = previous_response_id
+
+            response = create_response(**req)
             self.debug_print(response)
 
-            if "output" not in response and self.debug:
-                print(response)
+            previous_response_id = response.get("id")
+
+            if "output" not in response:
                 raise ValueError("No output from model")
-            else:
-                new_items += response["output"]
-                for item in response["output"]:
-                    new_items += self.handle_item(item)
 
-        return new_items
+            # Prepare the batch to send on the next iteration.
+            new_pending: list[dict] = []
+
+            for item in response["output"]:
+                t_type = item.get("type")
+
+                if t_type == "computer_call":
+                    # Execute the call and collect observations to send next.
+                    obs = self.handle_item(item)
+                    transcript.append(item)  # keep for the local log
+                    transcript.extend(obs)
+                    new_pending.extend(obs)
+                    # After a computer_call, break immediately so the observations go out first.
+                    break
+                else:
+                    transcript.append(item)
+                    # Assistant messages are not re-sent; previous_response_id carries them.
+
+            # Loop exit: the turn is done once the last transcript item is an assistant reply.
+            if transcript and transcript[-1].get("role") == "assistant":
+                break
+
+            pending = new_pending if new_pending else []
+
+            # Safeguard: if there is nothing new to send, exit to avoid an infinite loop.
+            if not pending:
+                break
+
+        return transcript
diff --git a/requirements.txt b/requirements.txt
index 13769fb..5c1068d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,4 @@ scrapybara>=2.3.6
 sniffio==1.3.1
 typing_extensions==4.12.2
 urllib3==2.3.0
+openai==1.75.0
diff --git a/simple_cua_loop.py b/simple_cua_loop.py
index 1bbb0fc..975e485 100644
--- a/simple_cua_loop.py
+++ b/simple_cua_loop.py
@@ -65,31 +65,60 @@ def main():
         }
     ]
 
-    items = []
+    transcript: list[dict] = []
 
     while True:  # get user input forever
        user_input = input("> ")
-        items.append({"role": "user", "content": user_input})
-
-        while True:  # keep looping until we get a final response
-            response = create_response(
+        pending: list[dict] = [{"role": "user", "content": user_input}]
+        previous_response_id = None
+
+        while True:
+            seen_ids: set[str] = set()
+            payload: list[dict] = []
+            for m in pending:
+                mid = m.get("id") if isinstance(m, dict) else None
+                if mid and mid in seen_ids:
+                    continue
+                if mid:
+                    seen_ids.add(mid)
+                payload.append(m)
+
+            req = dict(
                 model="computer-use-preview",
-                input=items,
+                input=payload,
                 tools=tools,
                 truncation="auto",
             )
+            if previous_response_id:
+                req["previous_response_id"] = previous_response_id
+
+            response = create_response(**req)
 
             if "output" not in response:
                 print(response)
                 raise ValueError("No output from model")
 
-            items += response["output"]
+            previous_response_id = response.get("id")
+
+            new_pending: list[dict] = []
 
             for item in response["output"]:
-                items += handle_item(item, computer)
+                if item.get("type") == "computer_call":
+                    obs = handle_item(item, computer)
+                    transcript.append(item)
+                    transcript.extend(obs)
+                    new_pending.extend(obs)
+                    break
+                else:
+                    transcript.append(item)
+
+            if transcript and transcript[-1].get("role") == "assistant":
+                break
 
-            if items[-1].get("role") == "assistant":
+            if not new_pending:
                 break
 
+            pending = new_pending
+
 
 if __name__ == "__main__":
     main()
diff --git a/utils.py b/utils.py
index b17ee81..85083b6 100644
--- a/utils.py
+++ b/utils.py
@@ -1,15 +1,45 @@
+"""Utility helpers used by the CUA demo code.
+
+This file previously issued raw HTTP requests to the OpenAI *Responses* API
+using the ``requests`` library. The endpoint is now available directly in the
+official ``openai`` Python SDK, so we route all calls through
+``openai.responses.create`` instead. This removes the manual URL / header
+handling and makes it easier to take advantage of new SDK features such as
+``previous_response_id``.
+"""
+
+from __future__ import annotations
+
 import os
-import requests
-from dotenv import load_dotenv
 import json
 import base64
-from PIL import Image
-from io import BytesIO
 import io
+from io import BytesIO
+from typing import Any, Dict
 from urllib.parse import urlparse
 
+from dotenv import load_dotenv
+from PIL import Image
+
+# Official OpenAI SDK; the Responses API needs a recent 1.x release.
+import openai
+
+# ---------------------------------------------------------------------------
+# Environment / SDK initialisation
+# ---------------------------------------------------------------------------
+
+# Load environment variables from a local ``.env`` if present. This is a
+# no-op when already running in an environment with the variables exported.
 load_dotenv(override=True)
 
+# Configure the OpenAI SDK from env vars. We intentionally *do not* raise if
+# the key is missing at import time; downstream scripts may patch
+# ``openai.api_key`` dynamically (e.g. through a CLI flag) before making the
+# first request.
+openai.api_key = os.getenv("OPENAI_API_KEY", openai.api_key)
+if org := os.getenv("OPENAI_ORG"):
+    openai.organization = org
+
 BLOCKED_DOMAINS = [
     "maliciousbook.com",
     "evilvideos.com",
@@ -47,23 +77,40 @@ def sanitize_message(msg: dict) -> dict:
     return msg
 
 
-def create_response(**kwargs):
-    url = "https://api.openai.com/v1/responses"
-    headers = {
-        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
-        "Content-Type": "application/json"
-    }
-
-    openai_org = os.getenv("OPENAI_ORG")
-    if openai_org:
-        headers["Openai-Organization"] = openai_org
-
-    response = requests.post(url, headers=headers, json=kwargs)
-
-    if response.status_code != 200:
-        print(f"Error: {response.status_code} {response.text}")
-
-    return response.json()
+def create_response(**kwargs: Any) -> Dict[str, Any]:
+    """Wrapper around ``openai.responses.create`` that returns a plain dict.
+
+    The original implementation issued a raw POST and returned ``response.json()``,
+    so most call sites expect a plain dict with keys like ``"output"`` and
+    ``"id"``. The OpenAI SDK, however, returns a pydantic model.
+
+    To stay backward-compatible we convert the SDK object back to ``dict`` via
+    its ``to_dict()``/``model_dump()`` helper before returning. This way none
+    of the downstream code needs to change its access pattern.
+    """
+
+    # Callers may or may not supply ``previous_response_id``. The SDK expects
+    # ``openai.NOT_GIVEN`` for omitted optional parameters rather than a plain
+    # ``None``, so convert to the sentinel so that an accidental None does not
+    # override the continuation logic.
+    if "previous_response_id" in kwargs and kwargs["previous_response_id"] is None:
+        kwargs["previous_response_id"] = openai.NOT_GIVEN
+
+    try:
+        sdk_response = openai.responses.create(**kwargs)
+    except openai.OpenAIError as e:  # pragma: no cover; network/credential errors only surface at runtime.
+        # Mirror the old behaviour of printing the error for debugging.
+        print("Error while calling Responses API:", e)
+        raise
+
+    # Convert to dict for compatibility.
+    if hasattr(sdk_response, "to_dict"):
+        return sdk_response.to_dict()
+    # Fallback: pydantic v2 models also expose ``model_dump``.
+    if hasattr(sdk_response, "model_dump"):
+        return sdk_response.model_dump()
+    # As a last resort, expose the underlying ``__dict__``.
+    return dict(sdk_response.__dict__)
 
 
 def check_blocklisted_url(url: str) -> None:

From 9c07787549cfe074a04d6bd250ad7f2defe87050 Mon Sep 17 00:00:00 2001
From: Bill Chen
Date: Sun, 20 Apr 2025 19:13:08 -0700
Subject: [PATCH 2/2] Remove extraneous comments

---
 utils.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/utils.py b/utils.py
index 85083b6..d91fa45 100644
--- a/utils.py
+++ b/utils.py
@@ -1,13 +1,3 @@
-"""Utility helpers used by the CUA demo code.
-
-This file previously issued raw HTTP requests to the OpenAI *Responses* API
-using the ``requests`` library. The endpoint is now available directly in the
-official ``openai`` Python SDK, so we route all calls through
-``openai.responses.create`` instead. This removes the manual URL / header
-handling and makes it easier to take advantage of new SDK features such as
-``previous_response_id``.
-"""
-
 from __future__ import annotations
 
 import os
@@ -21,21 +11,9 @@
 from dotenv import load_dotenv
 from PIL import Image
 
-# Official OpenAI SDK; the Responses API needs a recent 1.x release.
 import openai
 
-# ---------------------------------------------------------------------------
-# Environment / SDK initialisation
-# ---------------------------------------------------------------------------
-
-# Load environment variables from a local ``.env`` if present. This is a
-# no-op when already running in an environment with the variables exported.
 load_dotenv(override=True)
 
-# Configure the OpenAI SDK from env vars. We intentionally *do not* raise if
-# the key is missing at import time; downstream scripts may patch
-# ``openai.api_key`` dynamically (e.g. through a CLI flag) before making the
-# first request.
 openai.api_key = os.getenv("OPENAI_API_KEY", openai.api_key)
 if org := os.getenv("OPENAI_ORG"):
     openai.organization = org
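
--
Review note, not part of the patches: a minimal sketch of how the new
create_response wrapper chains a turn via previous_response_id. The names
`tools` (the Responses API tool definitions from the demo setup) and
`observations` (computer_call_output items produced by handle_item) are
assumed to exist in the caller's scope.

    from utils import create_response

    # First request of a turn: send the user message in full.
    first = create_response(
        model="computer-use-preview",
        input=[{"role": "user", "content": "Open the docs page"}],
        tools=tools,
        truncation="auto",
    )

    # Follow-up request: send only the new computer_call observations and
    # chain onto the prior response instead of re-sending the transcript.
    follow_up = create_response(
        model="computer-use-preview",
        input=observations,
        tools=tools,
        truncation="auto",
        previous_response_id=first["id"],
    )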