From 4cdebea3bc23956cf6130b67fad5f20e31f52682 Mon Sep 17 00:00:00 2001
From: Bill Chen
Date: Sun, 20 Apr 2025 18:59:19 -0700
Subject: [PATCH 1/2] Change create_response to use the Responses API from the
 OpenAI SDK

---
 .gitignore         |  3 +-
 agent/agent.py     | 71 ++++++++++++++++++++++++++++--------
 requirements.txt   |  1 +
 simple_cua_loop.py | 47 +++++++++++++++++++-----
 utils.py           | 89 +++++++++++++++++++++++++++++++++++-----------
 5 files changed, 165 insertions(+), 46 deletions(-)

diff --git a/.gitignore b/.gitignore
index e58b6e1..80b2f0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 __pycache__/
 .env
-.venv/
\ No newline at end of file
+.venv/
+cua_doc.pdf
\ No newline at end of file
diff --git a/agent/agent.py b/agent/agent.py
index 2514c24..4457e5e 100644
--- a/agent/agent.py
+++ b/agent/agent.py
@@ -116,26 +116,67 @@ def run_full_turn(
         self.print_steps = print_steps
         self.debug = debug
         self.show_images = show_images
-        new_items = []
-
-        # keep looping until we get a final response
-        while new_items[-1].get("role") != "assistant" if new_items else True:
-            self.debug_print([sanitize_message(msg) for msg in input_items + new_items])
-
-            response = create_response(
+        transcript: list[dict] = []  # keep for debugging/return
+        pending: list[dict] = input_items.copy()  # start with user/system messages
+        previous_response_id = None
+
+        while True:
+            self.debug_print([sanitize_message(msg) for msg in transcript + pending])
+
+            # Deduplicate by id within this batch (the API rejects duplicates).
+            seen_ids: set[str] = set()
+            payload: list[dict] = []
+            for m in pending:
+                mid = m.get("id") if isinstance(m, dict) else None
+                if mid and mid in seen_ids:
+                    continue
+                if mid:
+                    seen_ids.add(mid)
+                payload.append(m)
+
+            req = dict(
                 model=self.model,
-                input=input_items + new_items,
+                input=payload,
                 tools=self.tools,
                 truncation="auto",
             )
+            if previous_response_id:
+                req["previous_response_id"] = previous_response_id
+
+            response = create_response(**req)
             self.debug_print(response)
 
-            if "output" not in response and self.debug:
-                print(response)
+            previous_response_id = response.get("id")
+
+            if "output" not in response:
                 raise ValueError("No output from model")
-            else:
-                new_items += response["output"]
-                for item in response["output"]:
-                    new_items += self.handle_item(item)
 
-        return new_items
+            # Prepare the batch to send on the next iteration.
+            new_pending: list[dict] = []
+
+            for item in response["output"]:
+                t_type = item.get("type")
+
+                if t_type == "computer_call":
+                    # Execute the call and collect observations to send next.
+                    obs = self.handle_item(item)
+                    transcript.append(item)  # keep for the local log
+                    transcript.extend(obs)
+                    new_pending.extend(obs)
+                    # After a computer_call, break immediately so the observations go out first.
+                    break
+                else:
+                    transcript.append(item)
+                    # Assistant messages are not re-sent; previous_response_id carries them.
+
+            # Loop exit: the turn is done once the last transcript item is an assistant reply.
+            if transcript and transcript[-1].get("role") == "assistant":
+                break
+
+            pending = new_pending if new_pending else []
+
+            # Safeguard: if there is nothing new to send, exit to avoid an infinite loop.
+            if not pending:
+                break
+
+        return transcript
diff --git a/requirements.txt b/requirements.txt
index 13769fb..5c1068d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,4 @@ scrapybara>=2.3.6
 sniffio==1.3.1
 typing_extensions==4.12.2
 urllib3==2.3.0
+openai==1.75.0
diff --git a/simple_cua_loop.py b/simple_cua_loop.py
index 1bbb0fc..975e485 100644
--- a/simple_cua_loop.py
+++ b/simple_cua_loop.py
@@ -65,31 +65,60 @@ def main():
         }
     ]
 
-    items = []
+    transcript: list[dict] = []
 
     while True:  # get user input forever
        user_input = input("> ")
-        items.append({"role": "user", "content": user_input})
-
-        while True:  # keep looping until we get a final response
-            response = create_response(
+        pending: list[dict] = [{"role": "user", "content": user_input}]
+        previous_response_id = None
+
+        while True:
+            seen_ids: set[str] = set()
+            payload: list[dict] = []
+            for m in pending:
+                mid = m.get("id") if isinstance(m, dict) else None
+                if mid and mid in seen_ids:
+                    continue
+                if mid:
+                    seen_ids.add(mid)
+                payload.append(m)
+
+            req = dict(
                 model="computer-use-preview",
-                input=items,
+                input=payload,
                 tools=tools,
                 truncation="auto",
             )
+            if previous_response_id:
+                req["previous_response_id"] = previous_response_id
+
+            response = create_response(**req)
 
             if "output" not in response:
                 print(response)
                 raise ValueError("No output from model")
 
-            items += response["output"]
+            previous_response_id = response.get("id")
+
+            new_pending: list[dict] = []
 
             for item in response["output"]:
-                items += handle_item(item, computer)
+                if item.get("type") == "computer_call":
+                    obs = handle_item(item, computer)
+                    transcript.append(item)
+                    transcript.extend(obs)
+                    new_pending.extend(obs)
+                    break
+                else:
+                    transcript.append(item)
+
+            if transcript and transcript[-1].get("role") == "assistant":
+                break
 
-            if items[-1].get("role") == "assistant":
+            if not new_pending:
                 break
 
+            pending = new_pending
+
 
 if __name__ == "__main__":
     main()
diff --git a/utils.py b/utils.py
index b17ee81..85083b6 100644
--- a/utils.py
+++ b/utils.py
@@ -1,15 +1,45 @@
+"""Utility helpers used by the CUA demo code.
+
+This file previously issued raw HTTP requests to the OpenAI *Responses* API
+using the ``requests`` library. The endpoint is now available directly in the
+official ``openai`` Python SDK, so we route all calls through
+``openai.responses.create`` instead. This removes the manual URL / header
+handling and makes it easier to take advantage of new SDK features such as
+``previous_response_id``.
+"""
+
+from __future__ import annotations
+
 import os
-import requests
-from dotenv import load_dotenv
 import json
 import base64
-from PIL import Image
-from io import BytesIO
 import io
+from io import BytesIO
+from typing import Any, Dict
 from urllib.parse import urlparse
 
+from dotenv import load_dotenv
+from PIL import Image
+
+# Official OpenAI SDK; the Responses API needs a recent 1.x release.
+import openai
+
+# ---------------------------------------------------------------------------
+# Environment / SDK initialisation
+# ---------------------------------------------------------------------------
+
+# Load environment variables from a local ``.env`` if present. This is a
+# no-op when already running in an environment with the variables exported.
 load_dotenv(override=True)
 
+# Configure the OpenAI SDK from env vars. We intentionally *do not* raise if
+# the key is missing at import time; downstream scripts may patch
+# ``openai.api_key`` dynamically (e.g. through a CLI flag) before making the
+# first request.
+openai.api_key = os.getenv("OPENAI_API_KEY", openai.api_key)
+if org := os.getenv("OPENAI_ORG"):
+    openai.organization = org
+
 BLOCKED_DOMAINS = [
     "maliciousbook.com",
     "evilvideos.com",
@@ -47,23 +77,40 @@ def sanitize_message(msg: dict) -> dict:
     return msg
 
 
-def create_response(**kwargs):
-    url = "https://api.openai.com/v1/responses"
-    headers = {
-        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
-        "Content-Type": "application/json"
-    }
-
-    openai_org = os.getenv("OPENAI_ORG")
-    if openai_org:
-        headers["Openai-Organization"] = openai_org
-
-    response = requests.post(url, headers=headers, json=kwargs)
-
-    if response.status_code != 200:
-        print(f"Error: {response.status_code} {response.text}")
-
-    return response.json()
+def create_response(**kwargs: Any) -> Dict[str, Any]:
+    """Wrapper around ``openai.responses.create`` that returns a plain dict.
+
+    The original implementation issued a raw POST and returned ``response.json()``,
+    so most call sites expect a plain dict with keys like ``"output"`` and
+    ``"id"``. The OpenAI SDK, however, returns a pydantic model.
+
+    To stay backward-compatible we convert the SDK object back to ``dict`` via
+    its ``to_dict()``/``model_dump()`` helper before returning. This way none
+    of the downstream code needs to change its access pattern.
+    """
+
+    # Callers may or may not supply ``previous_response_id``. The SDK expects
+    # ``openai.NOT_GIVEN`` for omitted optional parameters rather than a plain
+    # ``None``, so convert to the sentinel so that an accidental None does not
+    # override the continuation logic.
+    if "previous_response_id" in kwargs and kwargs["previous_response_id"] is None:
+        kwargs["previous_response_id"] = openai.NOT_GIVEN
+
+    try:
+        sdk_response = openai.responses.create(**kwargs)
+    except openai.OpenAIError as e:  # pragma: no cover; network/credential errors only surface at runtime.
+        # Mirror the old behaviour of printing the error for debugging.
+        print("Error while calling Responses API:", e)
+        raise
+
+    # Convert to dict for compatibility.
+    if hasattr(sdk_response, "to_dict"):
+        return sdk_response.to_dict()
+    # Fallback: pydantic v2 models also expose ``model_dump``.
+    if hasattr(sdk_response, "model_dump"):
+        return sdk_response.model_dump()
+    # As a last resort, expose the underlying ``__dict__``.
+    return dict(sdk_response.__dict__)
 
 
 def check_blocklisted_url(url: str) -> None:

From 9c07787549cfe074a04d6bd250ad7f2defe87050 Mon Sep 17 00:00:00 2001
From: Bill Chen
Date: Sun, 20 Apr 2025 19:13:08 -0700
Subject: [PATCH 2/2] Remove extraneous comments

---
 utils.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/utils.py b/utils.py
index 85083b6..d91fa45 100644
--- a/utils.py
+++ b/utils.py
@@ -1,13 +1,3 @@
-"""Utility helpers used by the CUA demo code.
-
-This file previously issued raw HTTP requests to the OpenAI *Responses* API
-using the ``requests`` library. The endpoint is now available directly in the
-official ``openai`` Python SDK, so we route all calls through
-``openai.responses.create`` instead. This removes the manual URL / header
-handling and makes it easier to take advantage of new SDK features such as
-``previous_response_id``.
-"""
-
 from __future__ import annotations
 
 import os
@@ -21,21 +11,9 @@
 from dotenv import load_dotenv
 from PIL import Image
 
-# Official OpenAI SDK; the Responses API needs a recent 1.x release.
 import openai
 
-# ---------------------------------------------------------------------------
-# Environment / SDK initialisation
-# ---------------------------------------------------------------------------
-
-# Load environment variables from a local ``.env`` if present. This is a
-# no-op when already running in an environment with the variables exported.
 load_dotenv(override=True)
 
-# Configure the OpenAI SDK from env vars. We intentionally *do not* raise if
-# the key is missing at import time; downstream scripts may patch
-# ``openai.api_key`` dynamically (e.g. through a CLI flag) before making the
-# first request.
 openai.api_key = os.getenv("OPENAI_API_KEY", openai.api_key)
 if org := os.getenv("OPENAI_ORG"):
     openai.organization = org
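
--
Review note, not part of the patches: a minimal sketch of how the new
create_response wrapper chains a turn via previous_response_id. The names
`tools` (the Responses API tool definitions from the demo setup) and
`observations` (computer_call_output items produced by handle_item) are
assumed to exist in the caller's scope.

    from utils import create_response

    # First request of a turn: send the user message in full.
    first = create_response(
        model="computer-use-preview",
        input=[{"role": "user", "content": "Open the docs page"}],
        tools=tools,
        truncation="auto",
    )

    # Follow-up request: send only the new computer_call observations and
    # chain onto the prior response instead of re-sending the transcript.
    follow_up = create_response(
        model="computer-use-preview",
        input=observations,
        tools=tools,
        truncation="auto",
        previous_response_id=first["id"],
    )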