From c7eb1d1b4dc921357ef2ddf52f74197b64adf459 Mon Sep 17 00:00:00 2001 From: jmanhype Date: Tue, 11 Mar 2025 15:26:56 -0500 Subject: [PATCH 1/6] Add comprehensive documentation for the CUA Sample App --- docs/README.md | 1 + docs/agent_implementation.md | 1 + docs/api_reference.md | 1 + docs/assets/class_diagram.md | 1 + docs/assets/process_flow.md | 1 + docs/cli_usage.md | 1 + docs/computer_implementations.md | 1 + docs/developer_guide.md | 1 + docs/examples.md | 1 + docs/performance.md | 1 + docs/project_overview.md | 1 + docs/safety_considerations.md | 1 + docs/testing.md | 1 + docs/troubleshooting.md | 1 + 14 files changed, 14 insertions(+) create mode 100644 docs/README.md create mode 100644 docs/agent_implementation.md create mode 100644 docs/api_reference.md create mode 100644 docs/assets/class_diagram.md create mode 100644 docs/assets/process_flow.md create mode 100644 docs/cli_usage.md create mode 100644 docs/computer_implementations.md create mode 100644 docs/developer_guide.md create mode 100644 docs/examples.md create mode 100644 docs/performance.md create mode 100644 docs/project_overview.md create mode 100644 docs/safety_considerations.md create mode 100644 docs/testing.md create mode 100644 docs/troubleshooting.md diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/README.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/agent_implementation.md b/docs/agent_implementation.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/agent_implementation.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/api_reference.md b/docs/api_reference.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/api_reference.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/assets/class_diagram.md b/docs/assets/class_diagram.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/assets/class_diagram.md @@ -0,0 +1 @@ + \ 
No newline at end of file diff --git a/docs/assets/process_flow.md b/docs/assets/process_flow.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/assets/process_flow.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/cli_usage.md b/docs/cli_usage.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/cli_usage.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/computer_implementations.md b/docs/computer_implementations.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/computer_implementations.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/developer_guide.md b/docs/developer_guide.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/developer_guide.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/examples.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/performance.md b/docs/performance.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/performance.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/project_overview.md b/docs/project_overview.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/project_overview.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/safety_considerations.md b/docs/safety_considerations.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/safety_considerations.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/testing.md b/docs/testing.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/testing.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1 @@ + \ No newline at end of file From 
906137812fe24338ae89a3d70144a7ce4bff6561 Mon Sep 17 00:00:00 2001 From: jmanhype Date: Wed, 12 Mar 2025 15:18:33 -0500 Subject: [PATCH 2/6] Add Octotools integration guide and integrate Nature_News_Fetcher_Tool --- docs/octotools_integration_guide.md | 819 ++++++++++++++++++++++++++++ main.py | 89 ++- 2 files changed, 896 insertions(+), 12 deletions(-) create mode 100644 docs/octotools_integration_guide.md diff --git a/docs/octotools_integration_guide.md b/docs/octotools_integration_guide.md new file mode 100644 index 0000000..a2982cd --- /dev/null +++ b/docs/octotools_integration_guide.md @@ -0,0 +1,819 @@ +# Integrating Octotools with CUA-SAMPLE-APP + +This guide provides comprehensive instructions for integrating the Octotools framework with the CUA-SAMPLE-APP to enhance its reasoning and problem-solving capabilities. + +## Table of Contents + +1. [Introduction](#introduction) +2. [Benefits of Integration](#benefits-of-integration) +3. [Architecture Overview](#architecture-overview) +4. [Prerequisites](#prerequisites) +5. [Installation](#installation) +6. [Integration Steps](#integration-steps) +7. [Creating an Octotools-Enhanced Agent](#creating-an-octotools-enhanced-agent) +8. [Custom Tool Development](#custom-tool-development) +9. [Browser Automation with Octotools](#browser-automation-with-octotools) +10. [Examples](#examples) +11. [Troubleshooting](#troubleshooting) +12. [Further Resources](#further-resources) + +## Introduction + +Octotools is an open-source agentic framework designed for complex reasoning tasks across diverse domains. It provides standardized tools that can be easily integrated with large language models (LLMs). By integrating Octotools with the CUA-SAMPLE-APP, we can enhance the application's ability to perform multi-step reasoning, leverage specialized tools, and handle complex user queries. + +## Benefits of Integration + +1. 
**Enhanced Reasoning Capabilities**: Octotools provides a sophisticated planning and execution framework that enables multi-step reasoning. +2. **Extensible Tool Ecosystem**: Access to a wide range of pre-built tools for tasks like web search, image processing, code generation, and more. +3. **Standardized Tool Interface**: Consistent interface for creating and using tools, making it easy to extend functionality. +4. **Browser Automation Enhancement**: Augment CUA's browser automation with additional tools for understanding and interacting with web content. +5. **Performance Improvements**: Octotools has shown substantial average accuracy gains over raw LLM responses on complex reasoning tasks. + +## Architecture Overview + +The integrated system will combine CUA-SAMPLE-APP's Computer-Utilizing Agent capabilities with Octotools' reasoning framework: + +``` +┌───────────────────────────────────────────────────────────────┐ +│ CUA-SAMPLE-APP + Octotools │ +│ │ +│ ┌─────────────────┐ ┌────────────────────────────────┐ │ +│ │ User Input │ │ Enhanced Agent │ │ +│ └────────┬────────┘ │ ┌─────────────┐ ┌─────────┐ │ │ +│ │ │ │ CUA │ │Octotools│ │ │ +│ ▼ │ │ Agent │──► Solver │ │ │ +│ ┌────────────────┐ │ └─────────────┘ └─────────┘ │ │ +│ │ Agent Router │─────►│ │ │ │ │ +│ └────────────────┘ │ │ │ │ │ +│ ▲ │ ▼ ▼ │ │ +│ │ │ ┌─────────────────────┐ │ │ +│ ┌────────────────┐ │ │ Computer Interface │ │ │ +│ │ Response │◄─────│ └─────────────────────┘ │ │ +│ │ Generation │ │ │ │ │ +│ └────────────────┘ └──────────────┼─────────────────┘ │ +│ │ │ +│ ┌─────────────▼────────────┐ │ +│ │ Browser/Computer │ │ +│ └──────────────────────────┘ │ +└───────────────────────────────────────────────────────────────┘ +``` + +## Prerequisites + +Before integrating Octotools with CUA-SAMPLE-APP, ensure you have: + +1. Python 3.10 or higher +2. CUA-SAMPLE-APP installed and working +3. Git for version control +4. 
API keys for required services: + - OpenAI API key + - Google API key and CX (for search functionality) + - Any other API keys required by specific tools + +## Installation + +### Step 1: Add Octotools as a Dependency + +Add Octotools to your project's requirements.txt: + +```bash +echo "octotools @ git+https://github.com/OctoTools/OctoTools.git" >> requirements.txt +``` + +### Step 2: Install Dependencies + +Install the updated dependencies: + +```bash +pip install -r requirements.txt +``` + +### Step 3: Set Up Environment Variables + +Add the necessary environment variables for Octotools in your `.env` file: + +``` +# Existing CUA-SAMPLE-APP variables +# ... + +# Octotools API Keys +OPENAI_API_KEY= +GOOGLE_API_KEY= +GOOGLE_CX= +``` + +## Integration Steps + +### Step 1: Create an Octotools Wrapper + +Create a new file `octotools_wrapper.py` in the project root: + +```python +from octotools.models.solver import Solver +from typing import List, Dict, Any, Optional +import os +import base64 + + +class OctotoolsWrapper: + """ + Wrapper for Octotools integration with CUA-SAMPLE-APP. + """ + + def __init__( + self, + llm_engine: str = "gpt-4o", + enabled_tools: Optional[List[str]] = None, + max_steps: int = 5, + ): + """ + Initialize the Octotools wrapper. 
+ + Args: + llm_engine: The LLM engine to use (default: "gpt-4o") + enabled_tools: List of tools to enable (default: None, which enables default tools) + max_steps: Maximum number of steps for solving a task (default: 5) + """ + self.llm_engine = llm_engine + self.max_steps = max_steps + + # Default tools useful for browser automation context + if enabled_tools is None: + self.enabled_tools = [ + "Python_Code_Generator_Tool", + "Text_Detector_Tool", + "Image_Captioner_Tool", + "Object_Detector_Tool", + "Google_Search_Tool", + "URL_Text_Extractor_Tool", + "Generalist_Solution_Generator_Tool" + ] + else: + self.enabled_tools = enabled_tools + + # Initialize the solver + self.solver = Solver( + model_string=self.llm_engine, + enabled_tools=self.enabled_tools, + max_steps=self.max_steps, + ) + + def solve( + self, + query: str, + image_data: Optional[str] = None, + context: Optional[str] = None + ) -> Dict[str, Any]: + """ + Solve a task using Octotools. + + Args: + query: The user query to solve + image_data: Optional base64-encoded image data + context: Optional additional context for the solver + + Returns: + Dictionary containing the solver output + """ + # Process the image if provided + image_path = None + if image_data: + # Save the image temporarily + import tempfile + with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp: + # Remove the data:image/png;base64, prefix if present + if 'base64,' in image_data: + image_data = image_data.split('base64,')[1] + + temp.write(base64.b64decode(image_data)) + image_path = temp.name + + # Build full context with query and additional context + full_query = query + if context: + full_query = f"{query}\n\nContext: {context}" + + # Solve the task + result = self.solver.solve( + query=full_query, + image_path=image_path, + verbose=True + ) + + # Clean up temporary file if created + if image_path and os.path.exists(image_path): + os.remove(image_path) + + return result +``` + +### Step 2: Enhance the Agent Class 
+ +Modify `agent/agent.py` to integrate Octotools: + +```python +# Import the OctotoolsWrapper +from octotools_wrapper import OctotoolsWrapper + +class Agent: + """ + A sample agent class that can be used to interact with a computer. + Enhanced with Octotools for complex reasoning. + """ + + def __init__( + self, + model="computer-use-preview-2025-02-04", + computer: Computer = None, + tools: list[dict] = [], + acknowledge_safety_check_callback: Callable = lambda: False, + use_octotools: bool = False, + octotools_engine: str = "gpt-4o", + octotools_tools: List[str] = None, + ): + # Existing initialization + self.model = model + self.computer = computer + self.tools = tools + self.print_steps = True + self.debug = False + self.show_images = False + self.acknowledge_safety_check_callback = acknowledge_safety_check_callback + + if computer: + self.tools += [ + { + "type": "computer-preview", + "display_width": computer.dimensions[0], + "display_height": computer.dimensions[1], + "environment": computer.environment, + }, + ] + + # Octotools integration + self.use_octotools = use_octotools + if use_octotools: + self.octotools = OctotoolsWrapper( + llm_engine=octotools_engine, + enabled_tools=octotools_tools + ) + else: + self.octotools = None + + # ... existing methods ... + + def run_full_turn( + self, input_items, print_steps=True, debug=False, show_images=False + ): + """Enhanced run_full_turn with Octotools integration for complex reasoning.""" + self.print_steps = print_steps + self.debug = debug + self.show_images = show_images + + # Check if we should use Octotools for complex reasoning + complex_reasoning_trigger = self._needs_complex_reasoning(input_items) + + if self.use_octotools and complex_reasoning_trigger: + return self._handle_with_octotools(input_items) + else: + # Original CUA logic + new_items = [] + # ... existing code ... 
+ return new_items + + def _needs_complex_reasoning(self, input_items): + """ + Determine if the query needs complex reasoning that would benefit from Octotools. + This is a basic heuristic and can be enhanced based on specific requirements. + """ + # Extract the latest user message + latest_user_message = None + for item in reversed(input_items): + if item.get("role") == "user": + latest_user_message = item.get("content", "") + break + + if not latest_user_message: + return False + + # Simple heuristic: check for keywords that might suggest complex reasoning + complex_keywords = [ + "analyze", "compare", "calculate", "extract data", "search for", + "find information", "summarize", "visual analysis", + "collect data", "research", "solve" + ] + + return any(keyword in latest_user_message.lower() for keyword in complex_keywords) + + def _handle_with_octotools(self, input_items): + """ + Handle a query using Octotools for complex reasoning. + """ + # Extract the latest user message and any screenshots + latest_user_message = None + latest_screenshot = None + + for item in reversed(input_items): + if item.get("role") == "user" and not latest_user_message: + latest_user_message = item.get("content", "") + + # Look for the most recent screenshot + if not latest_screenshot and item.get("type") == "computer_call_output": + output = item.get("output", {}) + if output.get("type") == "input_image": + image_url = output.get("image_url", "") + if image_url.startswith("data:image/png;base64,"): + latest_screenshot = image_url + + if not latest_user_message: + return [] + + # Get the current URL for context if in browser environment + current_url = None + if self.computer and self.computer.environment == "browser": + try: + current_url = self.computer.get_current_url() + except: + pass + + # Build context + context = f"Current URL: {current_url}" if current_url else "" + + # Solve using Octotools + result = self.octotools.solve( + query=latest_user_message, + 
image_data=latest_screenshot.split("base64,")[1] if latest_screenshot else None, + context=context + ) + + # Format the result for CUA + answer = result.get("answer", "I couldn't find a solution using the available tools.") + steps = result.get("steps", []) + + # Build a detailed response that includes steps taken + detailed_response = answer + "\n\n" + if steps: + detailed_response += "I took the following steps to solve this:\n" + for i, step in enumerate(steps, 1): + tool_used = step.get("tool_used", "Unknown tool") + reasoning = step.get("reasoning", "No reasoning provided") + detailed_response += f"\n{i}. Used {tool_used}: {reasoning}" + + # Return as a message from the assistant + return [{"role": "assistant", "content": detailed_response}] +``` + +### Step 3: Update Main Application + +Update `main.py` to allow enabling Octotools: + +```python +from agent.agent import Agent +from computers import LocalPlaywrightComputer +import argparse + + +def main(use_octotools=False): + with LocalPlaywrightComputer() as computer: + agent = Agent( + computer=computer, + use_octotools=use_octotools, + octotools_engine="gpt-4o", + ) + items = [] + while True: + user_input = input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items, debug=True, show_images=True) + items += output_items + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run CUA with optional Octotools integration") + parser.add_argument('--use-octotools', action='store_true', help='Enable Octotools integration') + args = parser.parse_args() + + main(use_octotools=args.use_octotools) +``` + +## Creating an Octotools-Enhanced Agent + +For more advanced use cases, you can create a dedicated Octotools-enhanced agent: + +```python +# octotools_agent.py +from agent.agent import Agent +from computers import Computer +from octotools_wrapper import OctotoolsWrapper +from typing import List, Dict, Any, Callable, Optional + + +class 
OctotoolsAgent(Agent): + """ + An agent that combines CUA capabilities with Octotools reasoning. + """ + + def __init__( + self, + model="computer-use-preview-2025-02-04", + computer: Computer = None, + tools: list[dict] = [], + acknowledge_safety_check_callback: Callable = lambda: False, + octotools_engine: str = "gpt-4o", + octotools_tools: Optional[List[str]] = None, + reasoning_threshold: float = 0.7, + ): + super().__init__( + model=model, + computer=computer, + tools=tools, + acknowledge_safety_check_callback=acknowledge_safety_check_callback + ) + + # Initialize Octotools + self.octotools = OctotoolsWrapper( + llm_engine=octotools_engine, + enabled_tools=octotools_tools + ) + + # Reasoning threshold determines when to use Octotools vs standard CUA + self.reasoning_threshold = reasoning_threshold + + # Add an Octotools tool to the CUA tools list + self.tools.append({ + "type": "function", + "function": { + "name": "use_octotools_reasoning", + "description": "Use Octotools framework for complex reasoning tasks", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The query to solve using Octotools" + } + }, + "required": ["query"] + } + } + }) + + def use_octotools_reasoning(self, query: str) -> str: + """ + Use Octotools to solve a complex reasoning task. + This can be called by the CUA as a tool. 
+ """ + # Capture the current screenshot + screenshot_base64 = None + if self.computer: + screenshot_base64 = self.computer.screenshot() + + # Get current URL for context + current_url = None + if self.computer and self.computer.environment == "browser": + try: + current_url = self.computer.get_current_url() + except: + pass + + # Build context + context = f"Current URL: {current_url}" if current_url else "" + + # Solve using Octotools + result = self.octotools.solve( + query=query, + image_data=screenshot_base64, + context=context + ) + + # Return the answer + answer = result.get("answer", "I couldn't find a solution using the available tools.") + return answer + + def handle_item(self, item): + """Override to handle the Octotools function call.""" + if item["type"] == "function_call" and item["name"] == "use_octotools_reasoning": + args = json.loads(item["arguments"]) + result = self.use_octotools_reasoning(args["query"]) + return [{ + "type": "function_call_output", + "call_id": item["call_id"], + "output": result + }] + else: + # Use the original handle_item for other cases + return super().handle_item(item) +``` + +## Custom Tool Development + +You can extend Octotools with custom tools tailored for the CUA application: + +### Example: Creating a Webpage Analysis Tool + +```python +# webpage_analyzer_tool.py +from octotools.tools.base import BaseTool +from bs4 import BeautifulSoup + + +class Webpage_Analyzer_Tool(BaseTool): + """ + A tool that analyzes the structure and content of a webpage. 
+ """ + + def __init__(self): + super().__init__( + tool_name="Webpage_Analyzer_Tool", + tool_description="Analyzes the structure and content of a webpage", + tool_version="1.0.0", + input_types={ + "html": "str - HTML content of the webpage to analyze", + "analysis_type": "str - Type of analysis to perform (structure, content, links, or forms)" + }, + output_type="dict - Analysis results containing requested information", + demo_commands=[ + { + "command": 'execution = tool.execute(html="
<html><body><h1>Title</h1></body></html>
", analysis_type="structure")', + "description": "Analyze the structure of an HTML document" + } + ], + user_metadata={ + "limitations": [ + "Cannot analyze JavaScript-rendered content", + "Does not execute JavaScript code", + "Limited to static HTML analysis" + ], + "best_practices": [ + "Provide complete HTML for accurate analysis", + "Specify the analysis type to get focused results" + ] + } + ) + + def execute(self, html, analysis_type="structure"): + """ + Execute the webpage analysis tool. + + Args: + html (str): HTML content of the webpage to analyze + analysis_type (str): Type of analysis to perform (structure, content, links, or forms) + + Returns: + dict: Analysis results containing requested information + """ + # Parse the HTML + soup = BeautifulSoup(html, 'html.parser') + + # Perform the requested analysis + if analysis_type == "structure": + return self._analyze_structure(soup) + elif analysis_type == "content": + return self._analyze_content(soup) + elif analysis_type == "links": + return self._analyze_links(soup) + elif analysis_type == "forms": + return self._analyze_forms(soup) + else: + return {"error": f"Unknown analysis type: {analysis_type}"} + + def _analyze_structure(self, soup): + """Analyze the structure of the HTML document.""" + headings = {} + for i in range(1, 7): + headings[f'h{i}'] = len(soup.find_all(f'h{i}')) + + return { + "title": soup.title.string if soup.title else None, + "headings": headings, + "paragraphs": len(soup.find_all('p')), + "divs": len(soup.find_all('div')), + "lists": { + "ul": len(soup.find_all('ul')), + "ol": len(soup.find_all('ol')), + }, + "tables": len(soup.find_all('table')) + } + + def _analyze_content(self, soup): + """Extract the main content of the HTML document.""" + return { + "title": soup.title.string if soup.title else None, + "meta_description": soup.find('meta', attrs={'name': 'description'}).get('content') if soup.find('meta', attrs={'name': 'description'}) else None, + "main_text": 
soup.get_text(strip=True)[:1000] + "..." if len(soup.get_text(strip=True)) > 1000 else soup.get_text(strip=True), + "word_count": len(soup.get_text(strip=True).split()) + } + + def _analyze_links(self, soup): + """Extract and analyze links in the HTML document.""" + links = soup.find_all('a') + internal_links = [] + external_links = [] + + for link in links: + href = link.get('href') + if href: + if href.startswith('http') or href.startswith('//'): + external_links.append(href) + else: + internal_links.append(href) + + return { + "total_links": len(links), + "internal_links": internal_links[:20], # Limit to avoid overwhelming output + "internal_link_count": len(internal_links), + "external_links": external_links[:20], # Limit to avoid overwhelming output + "external_link_count": len(external_links) + } + + def _analyze_forms(self, soup): + """Extract and analyze forms in the HTML document.""" + forms = soup.find_all('form') + form_analysis = [] + + for i, form in enumerate(forms): + inputs = form.find_all('input') + input_details = [] + + for input_field in inputs: + input_type = input_field.get('type', 'text') + input_name = input_field.get('name', '') + input_details.append({ + "type": input_type, + "name": input_name, + "id": input_field.get('id', ''), + "required": input_field.has_attr('required') + }) + + form_analysis.append({ + "form_id": form.get('id', f'form_{i}'), + "method": form.get('method', 'get').upper(), + "action": form.get('action', ''), + "input_fields": input_details, + "submit_button": bool(form.find('button', attrs={'type': 'submit'}) or form.find('input', attrs={'type': 'submit'})) + }) + + return { + "form_count": len(forms), + "forms": form_analysis + } +``` + +## Browser Automation with Octotools + +One powerful use case is to enhance CUA's browser automation capabilities with Octotools. 
Here's an example of how to combine them: + +```python +# enhanced_browser_agent.py +from octotools_agent import OctotoolsAgent +from computers import LocalPlaywrightComputer + + +def run_enhanced_browser_agent(): + """ + Run an enhanced browser agent that combines CUA with Octotools. + """ + with LocalPlaywrightComputer() as computer: + # Define browser-specific tools + browser_octotools = [ + "Python_Code_Generator_Tool", + "Text_Detector_Tool", + "Image_Captioner_Tool", + "Object_Detector_Tool", + "Webpage_Analyzer_Tool", # Custom tool created above + "URL_Text_Extractor_Tool" + ] + + # Create the agent + agent = OctotoolsAgent( + computer=computer, + octotools_tools=browser_octotools + ) + + items = [] + print("Enhanced Browser Agent with Octotools") + print("Type 'exit' to quit") + + while True: + user_input = input("> ") + if user_input.lower() == 'exit': + break + + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items, debug=True, show_images=True) + items += output_items + + +if __name__ == "__main__": + run_enhanced_browser_agent() +``` + +## Examples + +Here are some examples of how to use the integrated system: + +### Example 1: Basic Integration + +```bash +# Run with basic Octotools integration +python main.py --use-octotools +``` + +### Example 2: Advanced Integration + +```bash +# Using the enhanced browser agent +python enhanced_browser_agent.py +``` + +### Example 3: Custom Integration Script + +```python +# custom_integration.py +from agent.agent import Agent +from computers import LocalPlaywrightComputer +from octotools_wrapper import OctotoolsWrapper + +# Initialize components +computer = LocalPlaywrightComputer() +computer.start() + +# Initialize Octotools wrapper with specific tools +octotools = OctotoolsWrapper( + llm_engine="gpt-4o", + enabled_tools=[ + "Python_Code_Generator_Tool", + "Text_Detector_Tool", + "Google_Search_Tool" + ] +) + +# Initialize agent without Octotools integration +agent 
= Agent(computer=computer) + +# Process loop with manual Octotools integration +items = [] +try: + while True: + user_input = input("> ") + + # Determine if we should use Octotools + if any(keyword in user_input.lower() for keyword in ["search", "find", "calculate", "analyze"]): + # Get current screenshot + screenshot = computer.screenshot() + + # Use Octotools to solve + result = octotools.solve( + query=user_input, + image_data=screenshot + ) + + # Add the result as an assistant message + items.append({"role": "user", "content": user_input}) + items.append({"role": "assistant", "content": result["answer"]}) + print(result["answer"]) + else: + # Use standard CUA processing + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items, debug=True, show_images=True) + items += output_items +finally: + computer.stop() +``` + +## Troubleshooting + +### Common Issues + +#### Octotools Import Errors + +If you encounter import errors for Octotools: + +``` +Make sure Octotools is properly installed: +pip install git+https://github.com/OctoTools/OctoTools.git +``` + +#### API Key Issues + +If tools that require API keys don't work: + +``` +Check that all required API keys are correctly set in your .env file +``` + +#### Integration Conflicts + +If you encounter conflicts between CUA and Octotools: + +``` +Ensure that you're not trying to use both frameworks for the same task simultaneously. +The OctotoolsAgent class should handle the proper coordination between them. 
+``` + +## Further Resources + +- [Octotools Documentation](https://github.com/OctoTools/OctoTools/tree/main/docs) +- [CUA-SAMPLE-APP Documentation](https://github.com/openai/openai-cua-sample-app/tree/main/docs) +- [Custom Tool Development Guide](https://github.com/OctoTools/OctoTools/tree/main/docs/custom_tools.md) +- [OpenAI API Documentation](https://platform.openai.com/docs/api-reference) \ No newline at end of file diff --git a/main.py b/main.py index 41729fa..a8ef01f 100644 --- a/main.py +++ b/main.py @@ -1,17 +1,82 @@ -from agent.agent import Agent -from computers import LocalPlaywrightComputer +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Main entry point for the CUA-SAMPLE-APP with Octotools integration. + +This module initializes and runs the agent with optional Octotools +integration for enhanced reasoning capabilities. +""" +import os +import signal +import sys +import argparse +from dotenv import load_dotenv +from typing import NoReturn + +from computers import LocalPlaywrightComputer +from agent.agent import Agent +from octotools_agent import OctotoolsAgent -def main(user_input=None): - with LocalPlaywrightComputer() as computer: - agent = Agent(computer=computer) - items = [] - while True: - user_input = input("> ") - items.append({"role": "user", "content": user_input}) - output_items = agent.run_full_turn(items, debug=True, show_images=True) - items += output_items +def signal_handler(signum: int, frame: object) -> NoReturn: + """Handle shutdown signals. + + Args: + signum: Signal number + frame: Current stack frame + """ + print("\nShutting down gracefully...") + sys.exit(0) +def main(debug: bool = False, use_octotools: bool = False) -> None: + """ + Run the main application loop. 
+ + Args: + debug: Whether to enable debug output + use_octotools: Whether to use Octotools integration + """ + print("Initializing agent with Octotools integration...") + agent = OctotoolsAgent( + model_string="gpt-4o", + enabled_tools=[ + "Generalist_Solution_Generator_Tool", + "Python_Code_Generator_Tool", + "Text_Detector_Tool", + "URL_Text_Extractor_Tool", + "Nature_News_Fetcher_Tool" + ], + debug=debug + ) + + # Set up signal handler for graceful shutdown + signal.signal(signal.SIGINT, signal_handler) + + # Main interaction loop + while True: + try: + user_input = input("\nEnter your query (or 'exit' to quit): ") + if user_input.lower() == "exit": + break + + agent.process_input(user_input) + + except Exception as e: + print(f"Error: {str(e)}") + if debug: + import traceback + traceback.print_exc() if __name__ == "__main__": - main() + # Load environment variables + load_dotenv() + + # Parse command line arguments + parser = argparse.ArgumentParser(description="Run CUA-SAMPLE-APP with optional Octotools integration") + parser.add_argument("--debug", action="store_true", help="Enable debug output") + parser.add_argument("--use-octotools", action="store_true", help="Use Octotools integration") + args = parser.parse_args() + + # Run the main application + main(debug=args.debug, use_octotools=args.use_octotools) From ea6a4c56f3273d6a5ae70579c0a89eaae084542c Mon Sep 17 00:00:00 2001 From: jmanhype Date: Wed, 12 Mar 2025 15:26:43 -0500 Subject: [PATCH 3/6] Add comprehensive documentation content to existing files --- docs/README.md | 1 - docs/agent_implementation.md | 291 +++++++++++++++++++++++- docs/api_reference.md | 367 ++++++++++++++++++++++++++++++- docs/assets/class_diagram.md | 83 ++++++- docs/assets/process_flow.md | 102 ++++++++- docs/cli_usage.md | 228 ++++++++++++++++++- docs/computer_implementations.md | 178 ++++++++++++++- docs/developer_guide.md | 140 +++++++++++- docs/examples.md | 163 +++++++++++++- docs/performance.md | 230 ++++++++++++++++++- 
docs/project_overview.md | 65 +++++- docs/safety_considerations.md | 161 +++++++++++++- docs/testing.md | 229 ++++++++++++++++++- docs/troubleshooting.md | 250 ++++++++++++++++++++- 14 files changed, 2474 insertions(+), 14 deletions(-) delete mode 100644 docs/README.md diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 0519ecb..0000000 --- a/docs/README.md +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/agent_implementation.md b/docs/agent_implementation.md index 0519ecb..5ade8bf 100644 --- a/docs/agent_implementation.md +++ b/docs/agent_implementation.md @@ -1 +1,290 @@ - \ No newline at end of file +# Agent Implementation + +## Overview + +The `Agent` class, defined in `agent/agent.py`, serves as the primary orchestrator for the interaction between: +- The user +- The OpenAI model +- The computer environment + +It manages the conversation flow, handles model responses, and routes actions to the appropriate computer implementation. + +## Class Definition + +```python +class Agent: + """ + A sample agent class that can be used to interact with a computer. + + (See simple_cua_loop.py for a simple example without an agent.) + """ + + def __init__( + self, + model="computer-use-preview-2025-02-04", + computer: Computer = None, + tools: list[dict] = [], + acknowledge_safety_check_callback: Callable = lambda: False, + ): + self.model = model + self.computer = computer + self.tools = tools + self.print_steps = True + self.debug = False + self.show_images = False + self.acknowledge_safety_check_callback = acknowledge_safety_check_callback + + if computer: + self.tools += [ + { + "type": "computer-preview", + "display_width": computer.dimensions[0], + "display_height": computer.dimensions[1], + "environment": computer.environment, + }, + ] +``` + +## Key Methods + +### `run_full_turn()` + +The `run_full_turn()` method is the main entry point for running a complete interaction turn. It: + +1. 
Takes the current conversation context as input +2. Calls the model to generate a response +3. Processes any actions in the response +4. Continues calling the model until a final response is reached + +```python +def run_full_turn( + self, input_items, print_steps=True, debug=False, show_images=False +): + self.print_steps = print_steps + self.debug = debug + self.show_images = show_images + new_items = [] + + # keep looping until we get a final response + while new_items[-1].get("role") != "assistant" if new_items else True: + self.debug_print([sanitize_message(msg) for msg in input_items + new_items]) + + response = create_response( + model=self.model, + input=input_items + new_items, + tools=self.tools, + truncation="auto", + ) + self.debug_print(response) + + if "output" not in response and self.debug: + print(response) + raise ValueError("No output from model") + else: + new_items += response["output"] + for item in response["output"]: + new_items += self.handle_item(item) + + return new_items +``` + +### `handle_item()` + +The `handle_item()` method processes individual items from the model's response: + +- For `message` items, it displays the message to the user +- For `function_call` items, it executes functions +- For `computer_call` items, it: + - Executes the specified computer action + - Takes a screenshot of the result + - Handles safety checks + - Prepares the output to send back to the model + +```python +def handle_item(self, item): + """Handle each item; may cause a computer action + screenshot.""" + if item["type"] == "message": + if self.print_steps: + print(item["content"][0]["text"]) + + if item["type"] == "function_call": + name, args = item["name"], json.loads(item["arguments"]) + if self.print_steps: + print(f"{name}({args})") + + if hasattr(self.computer, name): # if function exists on computer, call it + method = getattr(self.computer, name) + method(**args) + return [ + { + "type": "function_call_output", + "call_id": item["call_id"], + 
"output": "success", # hard-coded output for demo + } + ] + + if item["type"] == "computer_call": + action = item["action"] + action_type = action["type"] + action_args = {k: v for k, v in action.items() if k != "type"} + if self.print_steps: + print(f"{action_type}({action_args})") + + method = getattr(self.computer, action_type) + method(**action_args) + + screenshot_base64 = self.computer.screenshot() + if self.show_images: + show_image(screenshot_base64) + + # if user doesn't ack all safety checks exit with error + pending_checks = item.get("pending_safety_checks", []) + for check in pending_checks: + message = check["message"] + if not self.acknowledge_safety_check_callback(message): + raise ValueError( + f"Safety check failed: {message}. Cannot continue with unacknowledged safety checks." + ) + + call_output = { + "type": "computer_call_output", + "call_id": item["call_id"], + "acknowledged_safety_checks": pending_checks, + "output": { + "type": "input_image", + "image_url": f"data:image/png;base64,{screenshot_base64}", + }, + } + + # additional URL safety checks for browser environments + if self.computer.environment == "browser": + current_url = self.computer.get_current_url() + check_blocklisted_url(current_url) + call_output["output"]["current_url"] = current_url + + return [call_output] + return [] +``` + +## Initialization Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `model` | The OpenAI model to use | `"computer-use-preview-2025-02-04"` | +| `computer` | The Computer implementation to use | `None` | +| `tools` | A list of additional tools to provide to the model | `[]` | +| `acknowledge_safety_check_callback` | A callback function for handling safety checks | `lambda: False` | + +## Agent Workflow Diagram + +``` +┌─────────────────┐ +│ │ +│ User Input │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ run_full_turn │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ OpenAI 
Model │ +│ Response │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ handle_item │ +│ │ +└────────┬────────┘ + │ + │ ┌─────────┐ + ├────┤ message │ + │ └─────────┘ + │ + │ ┌─────────────┐ + ├────┤function_call│ + │ └─────────────┘ + │ + │ ┌──────────────┐ + └────┤computer_call │ + └───────┬──────┘ + │ + ▼ + ┌─────────────────┐ + │ │ + │ Computer │ + │ Action │ + │ │ + └────────┬────────┘ + │ + ▼ + ┌─────────────────┐ + │ │ + │ Screenshot │ + │ │ + └────────┬────────┘ + │ + ▼ + ┌─────────────────┐ + │ │ + │ Safety Checks │ + │ │ + └────────┬────────┘ + │ + ▼ + ┌─────────────────┐ + │ │ + │ Return Output │ + │ │ + └─────────────────┘ +``` + +## Using the Agent + +The most common way to use the Agent is through the CLI, which handles the initialization and interaction loop: + +```python +with ComputerClass() as computer: + agent = Agent( + computer=computer, + acknowledge_safety_check_callback=acknowledge_safety_check_callback, + ) + items = [] + + while True: + user_input = args.input or input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn( + items, + print_steps=True, + show_images=args.show, + debug=args.debug, + ) + items += output_items + args.input = None +``` + +## Function Calling + +The Agent supports function calling through the `tools` parameter. If the model calls a function that exists on the Computer implementation, the Agent will route the call to the appropriate method. + +This is useful for extending the capabilities of the Computer implementation with custom functions that can't be expressed through standard computer actions like click or type. 
+ +## Safety Considerations + +The Agent includes several safety measures: + +- URL blocklisting for browser-based environments +- Safety check acknowledgment for potentially risky actions +- Exception handling for failures + +The `acknowledge_safety_check_callback` parameter allows you to customize the behavior when a safety check is triggered. \ No newline at end of file diff --git a/docs/api_reference.md b/docs/api_reference.md index 0519ecb..43a5cbd 100644 --- a/docs/api_reference.md +++ b/docs/api_reference.md @@ -1 +1,366 @@ - \ No newline at end of file +# API Reference + +This document provides a reference for the key API components in the Computer Using Agent (CUA) Sample App. + +## Agent API + +### Agent Class + +```python +class Agent: + def __init__( + self, + model="computer-use-preview-2025-02-04", + computer: Computer = None, + tools: list[dict] = [], + acknowledge_safety_check_callback: Callable = lambda: False, + ): + """ + Initialize an Agent instance. + + Args: + model (str): The OpenAI model to use + computer (Computer): The Computer implementation to use + tools (list[dict]): Additional tools to provide to the model + acknowledge_safety_check_callback (Callable): Function to call for safety checks + """ + pass + + def run_full_turn( + self, input_items, print_steps=True, debug=False, show_images=False + ): + """ + Run a full interaction turn with the model. + + Args: + input_items (list): The current conversation context + print_steps (bool): Whether to print steps during execution + debug (bool): Whether to print debug information + show_images (bool): Whether to show images during execution + + Returns: + list: The new items added to the conversation context + """ + pass + + def handle_item(self, item): + """ + Handle an item from the model's response. 
+ + Args: + item (dict): The item to handle + + Returns: + list: Any new items to add to the conversation context + """ + pass +``` + +## Computer API + +### Computer Protocol + +```python +class Computer(Protocol): + """Defines the 'shape' (methods/properties) our loop expects.""" + + @property + def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ... + @property + def dimensions(self) -> tuple[int, int]: ... + + def screenshot(self) -> str: ... + + def click(self, x: int, y: int, button: str = "left") -> None: ... + + def double_click(self, x: int, y: int) -> None: ... + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ... + + def type(self, text: str) -> None: ... + + def wait(self, ms: int = 1000) -> None: ... + + def move(self, x: int, y: int) -> None: ... + + def keypress(self, keys: List[str]) -> None: ... + + def drag(self, path: List[Dict[str, int]]) -> None: ... + + def get_current_url() -> str: ... +``` + +### BasePlaywrightComputer + +```python +class BasePlaywrightComputer: + """ + Abstract base for Playwright-based computers. + + Attributes: + environment (Literal["browser"]): The environment type + dimensions (tuple[int, int]): The dimensions of the screen + """ + + def __enter__(self): + """ + Set up the Playwright environment. + + Returns: + self: The computer instance + """ + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Clean up the Playwright environment. + """ + pass + + def get_current_url(self) -> str: + """ + Get the current URL of the page. + + Returns: + str: The current URL + """ + pass + + def screenshot(self) -> str: + """ + Capture a screenshot of the current page. + + Returns: + str: The base64-encoded screenshot + """ + pass + + def click(self, x: int, y: int, button: str = "left") -> None: + """ + Perform a mouse click at the specified coordinates. 
+ + Args: + x (int): The x-coordinate + y (int): The y-coordinate + button (str): The mouse button to use + """ + pass + + def double_click(self, x: int, y: int) -> None: + """ + Perform a double-click at the specified coordinates. + + Args: + x (int): The x-coordinate + y (int): The y-coordinate + """ + pass + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + """ + Scroll the page at the specified coordinates. + + Args: + x (int): The x-coordinate + y (int): The y-coordinate + scroll_x (int): The amount to scroll horizontally + scroll_y (int): The amount to scroll vertically + """ + pass + + def type(self, text: str) -> None: + """ + Type the specified text. + + Args: + text (str): The text to type + """ + pass + + def wait(self, ms: int = 1000) -> None: + """ + Wait for the specified number of milliseconds. + + Args: + ms (int): The number of milliseconds to wait + """ + pass + + def move(self, x: int, y: int) -> None: + """ + Move the mouse to the specified coordinates. + + Args: + x (int): The x-coordinate + y (int): The y-coordinate + """ + pass + + def keypress(self, keys: List[str]) -> None: + """ + Press the specified keys. + + Args: + keys (List[str]): The keys to press + """ + pass + + def drag(self, path: List[Dict[str, int]]) -> None: + """ + Perform a drag operation along the specified path. + + Args: + path (List[Dict[str, int]]): The path to drag along + """ + pass + + def goto(self, url: str) -> None: + """ + Navigate to the specified URL. + + Args: + url (str): The URL to navigate to + """ + pass + + def back(self) -> None: + """ + Navigate back in the browser history. + """ + pass + + def forward(self) -> None: + """ + Navigate forward in the browser history. + """ + pass + + def _get_browser_and_page(self) -> tuple[Browser, Page]: + """ + Get a browser instance and page. 
+ + Returns: + tuple[Browser, Page]: The browser and page instances + """ + raise NotImplementedError +``` + +## Utility Functions + +### create_response() + +```python +def create_response(**kwargs): + """ + Create a response from the OpenAI API. + + Args: + **kwargs: Arguments to pass to the API + + Returns: + dict: The API response + """ + pass +``` + +### show_image() + +```python +def show_image(base_64_image): + """ + Display an image from a base64-encoded string. + + Args: + base_64_image (str): The base64-encoded image + """ + pass +``` + +### check_blocklisted_url() + +```python +def check_blocklisted_url(url: str) -> None: + """ + Check if a URL is in the blocklist. + + Args: + url (str): The URL to check + + Raises: + ValueError: If the URL is in the blocklist + """ + pass +``` + +### sanitize_message() + +```python +def sanitize_message(msg: dict) -> dict: + """ + Sanitize a message by omitting image_url for computer_call_output messages. + + Args: + msg (dict): The message to sanitize + + Returns: + dict: The sanitized message + """ + pass +``` + +## CLI Functions + +### acknowledge_safety_check_callback() + +```python +def acknowledge_safety_check_callback(message: str) -> bool: + """ + Prompt the user to acknowledge a safety check. + + Args: + message (str): The safety check message + + Returns: + bool: Whether the user acknowledged the check + """ + pass +``` + +### main() + +```python +def main(): + """ + Run the CLI. + """ + pass +``` + +## Simple CUA Loop Functions + +### handle_item() + +```python +def handle_item(item, computer: Computer): + """ + Handle an item from the model's response. + + Args: + item (dict): The item to handle + computer (Computer): The Computer implementation to use + + Returns: + list: Any new items to add to the conversation context + """ + pass +``` + +### main() + +```python +def main(): + """ + Run the simple CUA loop. 
+ """ + pass +``` \ No newline at end of file diff --git a/docs/assets/class_diagram.md b/docs/assets/class_diagram.md index 0519ecb..1d967e0 100644 --- a/docs/assets/class_diagram.md +++ b/docs/assets/class_diagram.md @@ -1 +1,82 @@ - \ No newline at end of file +# Class Diagram + +The following diagram illustrates the class structure and relationships in Octotools. + +```mermaid +classDiagram + class NotebookAnalyzer { + +analyze(notebook_path: str) Dict + -_load_notebook(notebook_path: str) Dict + -_analyze_cells(cells: List) Dict + -_generate_metrics(cell_data: Dict) Dict + } + + class CellAnalyzer { + +analyze_cell(cell: Dict) Dict + -_extract_imports(code: str) List + -_measure_complexity(code: str) int + -_detect_patterns(code: str) List + } + + class NotebookVisualizer { + +create_visualization(notebook_path: str, output_path: str) None + -_generate_graph(metrics: Dict) Figure + -_save_visualization(figure: Figure, output_path: str) None + } + + class CLI { + +run() None + -_parse_args() Namespace + -_process_command(command: str, args: Namespace) None + } + + class API { + +analyze_notebook(notebook_path: str) Dict + +visualize_notebook(notebook_path: str, output_path: str) None + +batch_analyze(notebook_paths: List[str]) Dict + } + + class Utils { + +load_config() Dict + +setup_logging() Logger + +format_output(results: Dict) str + } + + NotebookAnalyzer --> CellAnalyzer : uses + NotebookVisualizer --> NotebookAnalyzer : uses + API --> NotebookAnalyzer : uses + API --> NotebookVisualizer : uses + CLI --> API : uses + NotebookAnalyzer --> Utils : uses + NotebookVisualizer --> Utils : uses +``` + +## Component Relationships + +The diagram above illustrates the key classes and their relationships: + +1. **NotebookAnalyzer** - Core class responsible for analyzing Jupyter notebooks +2. **CellAnalyzer** - Analyzes individual cells within a notebook +3. **NotebookVisualizer** - Creates visualizations based on notebook analysis +4. 
**CLI** - Command-line interface for the tool +5. **API** - Programmatic interface for the tool +6. **Utils** - Utility functions used by multiple components + +## Key Relationships + +- **NotebookAnalyzer** uses **CellAnalyzer** to analyze individual cells +- **NotebookVisualizer** uses **NotebookAnalyzer** to get data for visualizations +- **API** uses both **NotebookAnalyzer** and **NotebookVisualizer** +- **CLI** uses the **API** to provide command-line functionality +- **Utils** provides common functionality to several components + +## Design Patterns + +The codebase leverages several design patterns: + +1. **Facade Pattern** - The API class provides a simplified interface to the complex subsystem +2. **Strategy Pattern** - Different analysis strategies can be used by the NotebookAnalyzer +3. **Command Pattern** - The CLI uses commands to trigger different functionalities +4. **Singleton Pattern** - The Utils class provides global access to configuration +5. **Composite Pattern** - Notebooks and cells form a composite structure + \ No newline at end of file diff --git a/docs/assets/process_flow.md b/docs/assets/process_flow.md index 0519ecb..fa4c0b8 100644 --- a/docs/assets/process_flow.md +++ b/docs/assets/process_flow.md @@ -1 +1,101 @@ - \ No newline at end of file +# Process Flow Diagrams + +This document contains flow diagrams illustrating the key processes in Octotools. + +## Notebook Analysis Flow + +The following diagram shows the process flow for analyzing a notebook. 
+ +```mermaid +sequenceDiagram + participant User + participant CLI + participant API + participant NotebookAnalyzer + participant CellAnalyzer + participant Utils + + User->>CLI: Run analyze command + CLI->>API: Call analyze_notebook() + API->>NotebookAnalyzer: analyze(notebook_path) + NotebookAnalyzer->>Utils: load_config() + Utils-->>NotebookAnalyzer: Return config + NotebookAnalyzer->>NotebookAnalyzer: _load_notebook() + loop For each cell + NotebookAnalyzer->>CellAnalyzer: analyze_cell(cell) + CellAnalyzer-->>NotebookAnalyzer: Return cell metrics + end + NotebookAnalyzer->>NotebookAnalyzer: _generate_metrics() + NotebookAnalyzer-->>API: Return analysis results + API->>Utils: format_output(results) + Utils-->>API: Return formatted results + API-->>CLI: Return formatted results + CLI-->>User: Display results +``` + +## Visualization Flow + +The following diagram shows the process flow for creating visualizations. + +```mermaid +sequenceDiagram + participant User + participant CLI + participant API + participant NotebookAnalyzer + participant NotebookVisualizer + + User->>CLI: Run visualize command + CLI->>API: Call visualize_notebook() + API->>NotebookAnalyzer: analyze(notebook_path) + NotebookAnalyzer-->>API: Return analysis results + API->>NotebookVisualizer: create_visualization(results, output_path) + NotebookVisualizer->>NotebookVisualizer: _generate_graph() + NotebookVisualizer->>NotebookVisualizer: _save_visualization() + NotebookVisualizer-->>API: Return success + API-->>CLI: Return success + CLI-->>User: Display success message +``` + +## Batch Analysis Flow + +The following diagram shows the process flow for batch analysis of multiple notebooks. 
+ +```mermaid +sequenceDiagram + participant User + participant CLI + participant API + participant NotebookAnalyzer + + User->>CLI: Run batch command + CLI->>API: Call batch_analyze(notebook_paths) + + loop For each notebook + API->>NotebookAnalyzer: analyze(notebook_path) + NotebookAnalyzer-->>API: Return analysis results + API->>API: Aggregate results + end + + API-->>CLI: Return aggregated results + CLI-->>User: Display aggregated results +``` + +## Overall System Architecture + +The following diagram shows the overall system architecture. + +```mermaid +graph TD + User[User] -->|Uses| CLI[Command Line Interface] + User -->|Uses| PythonAPI[Python API] + CLI -->|Calls| Core[Core Library] + PythonAPI -->|Calls| Core + Core -->|Contains| Analyzer[Notebook Analyzer] + Core -->|Contains| Visualizer[Notebook Visualizer] + Core -->|Contains| Utils[Utilities] + Analyzer -->|Analyzes| Notebooks[Jupyter Notebooks] + Visualizer -->|Creates| Reports[Reports & Visualizations] + Utils -->|Supports| Analyzer + Utils -->|Supports| Visualizer +``` \ No newline at end of file diff --git a/docs/cli_usage.md b/docs/cli_usage.md index 0519ecb..f5147f3 100644 --- a/docs/cli_usage.md +++ b/docs/cli_usage.md @@ -1 +1,227 @@ - \ No newline at end of file +# CLI Usage Guide + +The Command Line Interface (CLI) provides an easy way to interact with the Computer Using Agent (CUA) system. It allows you to select different computer environments, configure execution parameters, and start an interactive session with the agent. + +## Basic Usage + +The basic command to run the CLI is: + +```bash +python cli.py +``` + +This will start an interactive session with the default settings (local Playwright browser environment). 
+ +## Command Line Arguments + +The CLI supports several command-line arguments to customize its behavior: + +| Argument | Description | Default | +|----------|-------------|---------| +| `--computer` | The computer environment to use | `local-playwright` | +| `--input` | Initial input to the agent (optional) | None | +| `--debug` | Enable debug mode | False | +| `--show` | Show images (screenshots) during execution | False | +| `--start-url` | Starting URL for browser environments | `https://bing.com` | + +### Example Usage + +Using a different computer environment: + +```bash +python cli.py --computer docker +``` + +Providing an initial input: + +```bash +python cli.py --input "Search for information about climate change" +``` + +Enabling debug mode: + +```bash +python cli.py --debug +``` + +Showing images during execution: + +```bash +python cli.py --show +``` + +Specifying a start URL: + +```bash +python cli.py --start-url "https://www.google.com" +``` + +Combining multiple arguments: + +```bash +python cli.py --computer local-playwright --show --debug --start-url "https://www.wikipedia.org" +``` + +## Available Computer Environments + +The CLI supports several computer environments, each with its own requirements and characteristics. + +| Environment Option | Description | Type | Requirements | +|--------------------|-------------|------|-------------| +| `local-playwright` | Local browser window | Browser | Playwright SDK | +| `docker` | Docker container environment | Linux | Docker running | +| `browserbase` | Remote browser environment | Browser | Browserbase API key in `.env` | +| `scrapybara-browser` | Remote browser environment | Browser | Scrapybara API key in `.env` | +| `scrapybara-ubuntu` | Remote Ubuntu desktop | Linux | Scrapybara API key in `.env` | + +## Implementation Details + +The CLI is implemented in `cli.py`. 
Here's an overview of the key components: + +### Safety Check Callback + +```python +def acknowledge_safety_check_callback(message: str) -> bool: + response = input( + f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): " + ).lower() + return response.lower().strip() == "y" +``` + +This function is called when the agent encounters a safety check. It displays the safety warning message and asks the user if they want to proceed. + +### Main Function + +```python +def main(): + parser = argparse.ArgumentParser( + description="Select a computer environment from the available options." + ) + parser.add_argument( + "--computer", + choices=[ + "local-playwright", + "docker", + "browserbase", + "scrapybara-browser", + "scrapybara-ubuntu", + ], + help="Choose the computer environment to use.", + default="local-playwright", + ) + # ...other arguments... + args = parser.parse_args() + + computer_mapping = { + "local-playwright": LocalPlaywrightComputer, + "docker": DockerComputer, + "browserbase": BrowserbaseBrowser, + "scrapybara-browser": ScrapybaraBrowser, + "scrapybara-ubuntu": ScrapybaraUbuntu, + } + + ComputerClass = computer_mapping[args.computer] + + with ComputerClass() as computer: + agent = Agent( + computer=computer, + acknowledge_safety_check_callback=acknowledge_safety_check_callback, + ) + items = [] + + while True: + user_input = args.input or input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn( + items, + print_steps=True, + show_images=args.show, + debug=args.debug, + ) + items += output_items + args.input = None +``` + +The main function: +1. Parses command-line arguments +2. Maps the selected computer environment to the appropriate class +3. Creates an instance of the selected Computer class +4. Creates an Agent with the computer instance +5. 
Enters the main interaction loop, where it: + - Gets user input (or uses the provided initial input) + - Adds the input to the conversation context + - Runs a full turn of the agent + - Adds the agent's output to the conversation context + - Resets the initial input (so it's only used once) + +## Interaction Flow + +``` +┌─────────────────┐ +│ │ +│ Parse Command │ +│ Line Arguments │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Create Computer │ +│ Environment │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Create Agent │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Get User Input │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│Run Agent Full │ +│ Turn │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│Update Convo │ +│Context │ +│ │ +└────────┬────────┘ + │ + └─────────────┐ + │ + ▼ + ┌───────┐ + │ Loop │ + └───────┘ +``` + +## Error Handling + +The CLI includes basic error handling: +- If the model response contains no `output` and `--debug` is enabled, the raw response is printed and a `ValueError` is raised +- If a safety check fails, the program raises a ValueError with the safety message +- The context manager pattern (`with ComputerClass() as computer:`) ensures proper cleanup of computer environment resources, even in case of errors + +## Extending the CLI + +To add a new computer environment to the CLI: + +1. Implement the Computer protocol in a new class +2. Add your class to the `computers/__init__.py` file +3. Add your environment option to the `--computer` argument choices +4. 
Add your class to the `computer_mapping` dictionary \ No newline at end of file diff --git a/docs/computer_implementations.md b/docs/computer_implementations.md index 0519ecb..bb4b59f 100644 --- a/docs/computer_implementations.md +++ b/docs/computer_implementations.md @@ -1 +1,177 @@ - \ No newline at end of file +# Computer Protocol and Implementations + +## Computer Protocol + +The `Computer` protocol, defined in `computers/computer.py`, specifies the interface that all computer environment implementations must adhere to. It defines a set of methods for interacting with a computer environment, which could be a local browser, a remote browser, or a desktop environment. + +### Core Interface + +```python +class Computer(Protocol): + """Defines the 'shape' (methods/properties) our loop expects.""" + + @property + def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ... + @property + def dimensions(self) -> tuple[int, int]: ... + + def screenshot(self) -> str: ... + + def click(self, x: int, y: int, button: str = "left") -> None: ... + + def double_click(self, x: int, y: int) -> None: ... + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ... + + def type(self, text: str) -> None: ... + + def wait(self, ms: int = 1000) -> None: ... + + def move(self, x: int, y: int) -> None: ... + + def keypress(self, keys: List[str]) -> None: ... + + def drag(self, path: List[Dict[str, int]]) -> None: ... + + def get_current_url(self) -> str: ... 
+``` + +### Required Methods + +| Method | Description | Parameters | +|--------|-------------|------------| +| `screenshot()` | Captures and returns a base64-encoded image of the current screen | None | +| `click()` | Performs a mouse click at the specified coordinates | `x`, `y`, `button` | +| `double_click()` | Performs a double-click at the specified coordinates | `x`, `y` | +| `scroll()` | Scrolls the screen at the specified coordinates | `x`, `y`, `scroll_x`, `scroll_y` | +| `type()` | Types the specified text | `text` | +| `wait()` | Waits for the specified number of milliseconds | `ms` | +| `move()` | Moves the mouse to the specified coordinates | `x`, `y` | +| `keypress()` | Presses the specified keys | `keys` | +| `drag()` | Performs a drag operation along the specified path | `path` | +| `get_current_url()` | Returns the current URL (for browser environments) | None | + +### Required Properties + +| Property | Description | Type | +|----------|-------------|------| +| `environment` | Specifies the type of environment ("windows", "mac", "linux", "browser") | `Literal["windows", "mac", "linux", "browser"]` | +| `dimensions` | The dimensions of the screen (width, height) | `tuple[int, int]` | + +## Computer Implementations + +The repository includes several computer implementations, each designed to work with a different environment. + +### BasePlaywrightComputer + +The `BasePlaywrightComputer` class, defined in `computers/base_playwright.py`, serves as an abstract base class for Playwright-based computer implementations. It implements all the required methods of the `Computer` protocol, but leaves the actual browser/page connection to be implemented by subclasses. 
+ +Key features: + +- Context management with `__enter__` and `__exit__` methods +- Network interception for security (blocking requests to suspicious domains) +- Implementation of all standard Computer actions +- Extra browser-specific actions like `goto()`, `back()`, and `forward()` + +### LocalPlaywrightComputer + +The `LocalPlaywrightComputer` class, defined in `computers/local_playwright.py`, extends `BasePlaywrightComputer` to use a local Chromium instance via Playwright. + +```python +class LocalPlaywrightComputer(BasePlaywrightComputer): + """Launches a local Chromium instance using Playwright.""" + + def __init__(self, headless: bool = False): + super().__init__() + self.headless = headless + + def _get_browser_and_page(self) -> tuple[Browser, Page]: + width, height = self.dimensions + launch_args = [ + f"--window-size={width},{height}", + "--disable-extensions", + "--disable-file-system" + ] + browser = self._playwright.chromium.launch( + chromium_sandbox=True, + headless=self.headless, + args=launch_args, + env={} + ) + page = browser.new_page() + page.set_viewport_size({"width": width, "height": height}) + page.goto("https://bing.com") + return browser, page +``` + +### DockerComputer + +The `DockerComputer` class connects to a Docker container running a VNC server, providing a Linux desktop environment. + +Key features: +- Connects to a VNC server running in Docker +- Uses PyVNC for VNC interaction +- Implements all standard Computer actions in a Linux desktop context + +### BrowserbaseBrowser + +The `BrowserbaseBrowser` class connects to the Browserbase API, a service that provides remote browser environments. + +Key features: +- Creates and connects to a remote browser session +- Uses the Browserbase API for interaction +- Requires a Browserbase API key + +### ScrapybaraBrowser + +The `ScrapybaraBrowser` class connects to the Scrapybara API, which provides remote browser environments. 
+ +Key features: +- Creates and connects to a remote browser session +- Uses the Scrapybara API for interaction +- Requires a Scrapybara API key + +### ScrapybaraUbuntu + +The `ScrapybaraUbuntu` class connects to the Scrapybara API, but uses a remote Ubuntu desktop environment instead of a browser. + +Key features: +- Creates and connects to a remote Ubuntu desktop session +- Uses the Scrapybara API for interaction +- Requires a Scrapybara API key + +## Extending with Custom Computer Implementations + +You can create your own Computer implementation by: + +1. Implementing the `Computer` protocol +2. Adding your implementation to the `computers/__init__.py` file +3. Registering it in the `computer_mapping` dictionary in `cli.py` + +Example skeleton for a custom implementation: + +```python +class MyCustomComputer: + """My custom computer implementation.""" + + environment = "browser" # or "windows", "mac", "linux" + dimensions = (1024, 768) # default dimensions + + def __init__(self): + # Initialize your environment connection + pass + + def __enter__(self): + # Set up your environment + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Clean up your environment + pass + + def screenshot(self) -> str: + # Capture and return a base64-encoded screenshot + pass + + # Implement all other required methods... +``` \ No newline at end of file diff --git a/docs/developer_guide.md b/docs/developer_guide.md index 0519ecb..b558d24 100644 --- a/docs/developer_guide.md +++ b/docs/developer_guide.md @@ -1 +1,139 @@ - \ No newline at end of file +# Developer Guide + +This document provides information for developers who want to modify or extend Octotools. + +## Development Environment Setup + +1. **Clone the repository**: + ```bash + git clone https://github.com/octotools/octotools.git + cd octotools + ``` + +2. **Set up a virtual environment**: + ```bash + python -m venv env + source env/bin/activate # On Windows, use: env\Scripts\activate + ``` + +3. 
**Install in development mode**: + ```bash + pip install -e . + ``` + +4. **Install additional development dependencies**: + ```bash + pip install pytest black isort mypy + ``` + +## Project Structure + +The project is organized into several key components: + +``` +. +├── octotools/ # Main package +│ ├── __init__.py # Package initialization +│ ├── api/ # API implementation +│ ├── cli/ # Command-line interface +│ ├── core/ # Core functionality +│ ├── utils/ # Utility functions +│ └── visualization/ # Visualization tools +├── tasks/ # Task definitions +├── assets/ # Static assets +├── docs/ # Documentation +├── tests/ # Test suite +├── setup.py # Package configuration +├── requirements.txt # Python dependencies +└── README.md # Project overview +``` + +## Adding New Features + +To add a new feature to Octotools: + +1. **Create a feature branch**: + ```bash + git checkout -b feature/your-feature-name + ``` + +2. **Implement your feature**: + - Place your code in the appropriate module + - Follow the existing code style + - Add tests for your feature + - Update documentation + +3. **Submit a Pull Request**: + - Push your branch to your fork + - Create a pull request with a clear description of the changes + +## Coding Standards + +Octotools follows these coding standards: + +1. **PEP 8**: Follow PEP 8 style guidelines +2. **Type Hints**: Use type hints for function parameters and return values +3. **Docstrings**: Use Google-style docstrings +4. **Testing**: Write tests for new functionality +5. **Code Formatting**: Use Black and isort for code formatting + +Example of a well-documented function: + +```python +def analyze_notebook(notebook_path: str, metrics: List[str] = None) -> Dict[str, Any]: + """Analyze a Jupyter notebook and return metrics. + + Args: + notebook_path: Path to the notebook file + metrics: List of metrics to calculate. If None, calculate all metrics. 
+ + Returns: + Dictionary of metrics and their values + + Raises: + FileNotFoundError: If the notebook file doesn't exist + ValueError: If an invalid metric is specified + """ + # Implementation + pass +``` + +## Testing + +To run the tests: + +```bash +pytest +``` + +For more information about testing, see the [Testing Guide](testing.md). + +## Documentation + +When adding or modifying features, update the documentation: + +1. **API Documentation**: Update docstrings in the code +2. **Usage Examples**: Add examples to show how to use the feature +3. **README**: Update the README if necessary + +## Release Process + +1. **Update Version**: Update the version in `setup.py` +2. **Create Changelog**: Update the changelog +3. **Create Tag**: Create a git tag for the version +4. **Build Distribution**: Build the distribution package + ```bash + python setup.py sdist bdist_wheel + ``` +5. **Upload to PyPI**: Upload the package to PyPI + ```bash + twine upload dist/* + ``` + +## Getting Help + +If you need help with the development process: + +1. Check the [Troubleshooting](troubleshooting.md) guide +2. Open an issue on GitHub +3. Reach out to the maintainers \ No newline at end of file diff --git a/docs/examples.md b/docs/examples.md index 0519ecb..45c1d8d 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1 +1,162 @@ - \ No newline at end of file +# Examples + +The repository includes several example applications that demonstrate different aspects of the Computer Using Agent (CUA) functionality. This document provides an overview of these examples. + +## Weather Example + +The `weather_example.py` script demonstrates a simple, single-turn interaction with the CUA to check the weather. 
+ +```python +from agent import Agent +from computers import ScrapybaraBrowser + +with ScrapybaraBrowser() as computer: + agent = Agent(computer=computer) + input_items = [{"role": "user", "content": "what is the weather in sf"}] + response_items = agent.run_full_turn(input_items, debug=True, show_images=True) + print(response_items[-1]["content"][0]["text"]) +``` + +### Key aspects: +- Uses the ScrapybaraBrowser computer environment +- Sends a single query about the weather in San Francisco +- Uses the debug mode to show detailed information during execution +- Shows images (screenshots) during execution +- Prints only the final text response + +## Function Calling Example + +The `function_calling_example.py` script demonstrates how to integrate function calling with the CUA. + +```python +from agent import Agent +from computers import ScrapybaraBrowser + +tools = [ + { + "type": "function", + "name": "get_weather", + "description": "Determine weather in my location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state e.g. San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["c", "f"]}, + }, + "additionalProperties": False, + "required": ["location", "unit"], + }, + } +] + + +def main(): + with ScrapybaraBrowser() as computer: + agent = Agent(tools=tools, computer=computer) + items = [] + while True: + user_input = input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items) + items += output_items + + +if __name__ == "__main__": + main() +``` + +### Key aspects: +- Defines a `get_weather` function tool with parameters for location and temperature unit +- Uses the ScrapybaraBrowser computer environment +- Creates an interactive session that continually takes user input +- Adds the function tool to the Agent's available tools + +This example shows how to: +1. Define a function schema using the OpenAI function calling format +2. 
Pass the function to the Agent via the `tools` parameter +3. Handle function calls in the Agent's conversation loop + +## Playwright with Custom Functions + +The `playwright_with_custom_functions.py` script demonstrates how to extend the CUA with custom browser navigation functions. + +```python +from agent.agent import Agent +from computers import LocalPlaywrightComputer + +tools = [ + { + "type": "function", + "name": "back", + "description": "Go back to the previous page.", + "parameters": {}, + }, + { + "type": "function", + "name": "goto", + "description": "Go to a specific URL.", + "parameters": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "Fully qualified URL to navigate to.", + }, + }, + "additionalProperties": False, + "required": ["url"], + }, + }, +] + + +def main(): + with LocalPlaywrightComputer() as computer: + agent = Agent(computer=computer, tools=tools) + items = [ + { + "role": "developer", + "content": "Use the additional back() and goto() functions to naviate the browser. If you see nothing, trying going to Google.", + } + ] + while True: + user_input = input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items, show_images=False) + items += output_items + + +if __name__ == "__main__": + main() +``` + +### Key aspects: +- Uses the LocalPlaywrightComputer environment (local browser) +- Defines two custom function tools: `back()` and `goto(url)` +- Provides an initial developer message suggesting how to use these functions +- Creates an interactive session that continually takes user input +- Runs without showing images (screenshots) for faster execution + +This example demonstrates how: +1. Custom functions can be defined and passed to the Agent +2. These functions can be implemented in the Computer class +3. 
The Agent will route function calls to the appropriate methods in the Computer implementation + +## Running the Examples + +To run any of the examples, use the following command: + +```bash +python -m examples.<example_name> +``` + +For instance, to run the weather example: + +```bash +python -m examples.weather_example +``` + +Note that some examples may require specific API keys or environment setup, particularly those using ScrapybaraBrowser or other remote browsers. \ No newline at end of file diff --git a/docs/performance.md b/docs/performance.md index 0519ecb..2c2f7da 100644 --- a/docs/performance.md +++ b/docs/performance.md @@ -1 +1,229 @@ - \ No newline at end of file +# Performance Considerations + +This document provides guidance on optimizing the performance of your Computer Using Agent (CUA) application. + +## Understanding Performance Factors + +The performance of a CUA application depends on several factors: + +1. **Network Latency**: The time it takes for requests to travel between your application and the OpenAI API +2. **API Processing Time**: The time it takes for the OpenAI API to process your request and generate a response +3. **Computer Environment Performance**: The speed of the computer environment (local browser, remote browser, Docker container) +4. **Screenshot Size and Quality**: Larger screenshots take longer to process +5. **Conversation History Size**: Larger conversation histories increase token usage and processing time +6. **Action Complexity**: Complex sequences of actions take longer to execute than simple ones + +## Optimizing API Interactions + +### Reduce API Calls + +Each call to the OpenAI API adds latency to your application. To reduce the number of API calls: + +1. **Batch Actions**: When possible, design your prompts to encourage the model to perform multiple actions in a single turn +2. **Provide Clear Instructions**: Clear prompts help the model accomplish tasks with fewer turns +3.
**Use Custom Functions**: For complex operations, use custom functions instead of relying on the model to perform a sequence of basic actions + +### Optimize Context Size + +Larger contexts take longer to process and consume more tokens: + +1. **Limit Conversation History**: If you're building a long-running application, consider pruning old conversation history +2. **Compress Screenshots**: Use lower resolution or compressed screenshots when possible +3. **Use Truncation**: The Agent uses `truncation="auto"` by default, which helps manage large contexts + +## Optimizing Computer Environments + +### Local Playwright + +The LocalPlaywrightComputer is usually the fastest option because it runs locally: + +1. **Use Headless Mode**: For automated tasks, use headless mode to reduce overhead: + ```python + LocalPlaywrightComputer(headless=True) + ``` +2. **Optimize Browser Settings**: Customize browser launch arguments: + ```python + launch_args = [ + f"--window-size={width},{height}", + "--disable-extensions", + "--disable-gpu", # Disable GPU acceleration for faster headless operation + "--no-sandbox", # Use with caution - reduces security + ] + ``` + +### Docker Environment + +The Docker environment's performance depends on your Docker setup: + +1. **Allocate Sufficient Resources**: Make sure Docker has enough CPU and memory +2. **Use a Fast VNC Connection**: VNC performance greatly affects DockerComputer performance +3. **Optimize Display Resolution**: Use a lower resolution to reduce VNC traffic + +### Remote Browser Environments + +Remote browser environments (Browserbase, Scrapybara) have additional network latency: + +1. **Choose Geographically Closer Servers**: If available, use servers that are closer to your location +2. **Reduce Screenshot Frequency**: Minimize the number of actions that require screenshots +3. 
**Use Batch Operations**: Perform multiple actions in sequence before requesting a new screenshot + +## Code-Level Optimizations + +### Optimize Screenshot Handling + +Screenshots are the largest data elements in most CUA applications: + +1. **Compress Screenshots**: Consider compressing screenshots before encoding them: + ```python + def screenshot(self) -> str: + # Capture screenshot + png_bytes = self._page.screenshot(full_page=False) + + # Optionally compress the image + from PIL import Image + import io + image = Image.open(io.BytesIO(png_bytes)) + image = image.resize((image.width // 2, image.height // 2)) # Downsample + + output = io.BytesIO() + image.save(output, format='JPEG', quality=70) # Convert to JPEG with compression + compressed_bytes = output.getvalue() + + return base64.b64encode(compressed_bytes).decode("utf-8") + ``` + +2. **Crop Screenshots**: Consider cropping screenshots to the relevant area: + ```python + def screenshot(self) -> str: + # Capture full screenshot + png_bytes = self._page.screenshot(full_page=False) + + # Crop to relevant area + from PIL import Image + import io + image = Image.open(io.BytesIO(png_bytes)) + + # Example: crop to the top half of the screen + width, height = image.size + image = image.crop((0, 0, width, height // 2)) + + output = io.BytesIO() + image.save(output, format='PNG') + cropped_bytes = output.getvalue() + + return base64.b64encode(cropped_bytes).decode("utf-8") + ``` + +### Optimize Action Execution + +1. **Parallelize Actions**: For independent actions, consider parallelizing them: + ```python + import threading + + def perform_parallel_actions(actions): + threads = [] + for action in actions: + thread = threading.Thread(target=action) + threads.append(thread) + thread.start() + + for thread in threads: + thread.join() + ``` + +2. 
**Batch Similar Actions**: Group similar actions together: + ```python + def type_paragraphs(self, paragraphs): + for paragraph in paragraphs: + self._page.keyboard.type(paragraph) + self._page.keyboard.press("Enter") + self._page.keyboard.press("Enter") + ``` + +### Caching + +1. **Cache Screenshots**: If the screen hasn't changed, reuse the previous screenshot: + ```python + def screenshot(self) -> str: + # Check if the screen has changed since the last screenshot + current_hash = self._get_screen_hash() + if current_hash == self._last_screen_hash: + return self._last_screenshot + + # If screen has changed, take a new screenshot + png_bytes = self._page.screenshot(full_page=False) + screenshot = base64.b64encode(png_bytes).decode("utf-8") + + # Update cache + self._last_screen_hash = current_hash + self._last_screenshot = screenshot + + return screenshot + ``` + +2. **Cache Function Results**: For expensive function calls, consider caching results: + ```python + import functools + + @functools.lru_cache(maxsize=128) + def expensive_function(self, arg1, arg2): + # Expensive operation + return result + ``` + +## Measuring Performance + +To identify performance bottlenecks, add timing measurements: + +```python +import time + +def measure_time(func): + def wrapper(*args, **kwargs): + start_time = time.time() + result = func(*args, **kwargs) + end_time = time.time() + print(f"{func.__name__} took {end_time - start_time:.2f} seconds") + return result + return wrapper + +class TimedComputer(LocalPlaywrightComputer): + @measure_time + def screenshot(self) -> str: + return super().screenshot() + + @measure_time + def click(self, x: int, y: int, button: str = "left") -> None: + return super().click(x, y, button) + + # etc. +``` + +Use this data to identify which operations are taking the most time and focus your optimization efforts there. + +## Trade-offs + +When optimizing for performance, consider these trade-offs: + +1. **Quality vs. 
Speed**: Lower quality screenshots are faster but may lead to less accurate model responses +2. **Safety vs. Speed**: Some safety checks add overhead but are important for security +3. **Flexibility vs. Speed**: Custom functions are faster but less flexible than general-purpose computer actions +4. **Memory vs. Speed**: Caching improves speed but increases memory usage + +Choose optimizations that make sense for your specific use case and requirements. + +## Environment-Specific Recommendations + +### Local Development + +For local development, prioritize: +- Fast iteration time with LocalPlaywrightComputer +- Debug mode for detailed information +- Showing images for visual feedback + +### Production + +For production deployments, prioritize: +- Robustness with error handling and reconnection logic +- Performance optimizations like headless mode and caching +- Memory management for long-running applications \ No newline at end of file diff --git a/docs/project_overview.md b/docs/project_overview.md index 0519ecb..bd06f54 100644 --- a/docs/project_overview.md +++ b/docs/project_overview.md @@ -1 +1,64 @@ - \ No newline at end of file +# Project Overview + +## Introduction + +The Computer Using Agent (CUA) Sample App is a reference implementation demonstrating how to build an agent that can use a computer through browser and terminal interfaces. This project shows how to implement OpenAI's Computer Protocol to enable an AI assistant to interact with a user's computer in a safe and controlled manner. 
+ +## Architecture + +The CUA Sample App follows a modular architecture that separates the agent logic from the computer implementation: + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ │ │ │ │ │ +│ User Interface │────▶│ Agent │────▶│ OpenAI API │ +│ (CLI/App) │ │ │ │ │ +│ │ │ │ │ │ +└─────────────────┘ └────────┬────────┘ └────────┬────────┘ + │ │ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ │ │ │ + │ Computer │◀────│ Model Output │ + │ Implementation │ │ (computer_call) │ + │ │ │ │ + └─────────────────┘ └─────────────────┘ +``` + +### Key Components + +1. **Agent**: The agent class handles communication with the OpenAI API and processes computer calls. +2. **Computer Protocol**: Defines the interface for how the agent interacts with the computer. +3. **Computer Implementations**: Various implementations of the Computer Protocol for different environments: + - Browser (using Playwright) + - Terminal + - Docker containers + - Remote browser services +4. **CLI Application**: Command-line interface for user interaction with the agent. + +## Core Workflow + +1. **User Input**: The user provides input through the CLI or application interface. +2. **Agent Processing**: The agent sends the user input to the OpenAI API along with conversation history. +3. **API Response**: The API returns responses, which may include computer calls. +4. **Computer Interaction**: Computer calls are executed by the appropriate Computer implementation. +5. **Response Display**: Results are displayed to the user, and the conversation continues. + +## Key Features + +- **Modular Architecture**: Clear separation of concerns, allowing different computer environments to be used interchangeably. +- **Multiple Computer Environments**: Support for various computer environments, including local browsers, Docker containers, and remote browser services. +- **Safety Measures**: URL blocklisting and safety check acknowledgments to ensure safe operation. 
+- **Function Calling**: Support for custom functions to be defined and used in the conversation. +- **Extensible Design**: Easily extended with new Computer implementations or custom functions. + +## Getting Started + +To get started with the CUA Sample App: + +1. Clone the repository +2. Install dependencies with `pip install -r requirements.txt` +3. Run the application with `python main.py` + +For more detailed information, see the [CLI Usage Guide](cli_usage.md) and [Developer Guide](developer_guide.md). \ No newline at end of file diff --git a/docs/safety_considerations.md b/docs/safety_considerations.md index 0519ecb..001c6ea 100644 --- a/docs/safety_considerations.md +++ b/docs/safety_considerations.md @@ -1 +1,160 @@ - \ No newline at end of file +# Safety Considerations + +## Overview + +The Computer Using Agent (CUA) has significant capabilities that come with potential risks. This document outlines the safety measures implemented in the codebase to mitigate these risks. + +## URL Blocklisting + +In browser-based environments, the system includes URL blocklisting to prevent access to potentially malicious or inappropriate websites. + +### Implementation in utils.py + +```python +BLOCKED_DOMAINS = [ + "maliciousbook.com", + "evilvideos.com", + "darkwebforum.com", + "shadytok.com", + "suspiciouspins.com", + "ilanbigio.com", +] + +def check_blocklisted_url(url: str) -> None: + """Raise ValueError if the given URL (including subdomains) is in the blocklist.""" + hostname = urlparse(url).hostname or "" + if any( + hostname == blocked or hostname.endswith(f".{blocked}") + for blocked in BLOCKED_DOMAINS + ): + raise ValueError(f"Blocked URL: {url}") +``` + +This function checks if a URL's hostname (or any of its subdomains) matches any entry in the `BLOCKED_DOMAINS` list. If a match is found, it raises a `ValueError` with the blocked URL. 
+ +### Integration in Agent Implementation + +The Agent class integrates URL blocklisting in its handling of computer calls for browser environments: + +```python +# additional URL safety checks for browser environments +if self.computer.environment == "browser": + current_url = self.computer.get_current_url() + check_blocklisted_url(current_url) + call_output["output"]["current_url"] = current_url +``` + +### Network Interception + +For Playwright-based browser environments, the system also includes network interception to block requests to suspicious domains: + +```python +# Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS +def handle_route(route, request): + url = request.url + if check_blocklisted_url(url): + print(f"Flagging blocked domain: {url}") + route.abort() + else: + route.continue_() + +self._page.route("**/*", handle_route) +``` + +This intercepts all network requests with the intent of aborting those that target a blocked domain. Note that `check_blocklisted_url`, as shown above, returns `None` for allowed URLs and raises `ValueError` for blocked ones, so the `if` branch is never taken: allowed requests continue, while a blocked request surfaces as an exception inside the handler rather than a clean `route.abort()`. + +## Safety Check Acknowledgment + +The CUA model may sometimes generate safety checks for potentially risky actions. The system implements a callback mechanism to handle these checks: + +### Default Implementation + +```python +def acknowledge_safety_check_callback(message: str) -> bool: + response = input( + f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): " + ).lower() + return response.strip() == "y" +``` + +This function displays the safety check message to the user and asks for explicit acknowledgment before proceeding. + +### Integration in Agent Implementation + +```python +# if user doesn't ack all safety checks exit with error +pending_checks = item.get("pending_safety_checks", []) +for check in pending_checks: + message = check["message"] + if not self.acknowledge_safety_check_callback(message): + raise ValueError( + f"Safety check failed: {message}. Cannot continue with unacknowledged safety checks."
+ ) +``` + +If any safety check is not acknowledged, the Agent raises a `ValueError` and halts execution. + +## DNS Safety in Docker Environment + +For the Docker environment, the system uses a restricted DNS server to limit access to websites: + +```bash +docker run --rm -it --name cua-sample-app -p 5900:5900 --dns=1.1.1.3 -e DISPLAY=:99 cua-sample-app +``` + +The `--dns=1.1.1.3` flag points the container at Cloudflare's family-filtering DNS resolver, which refuses to resolve known malware and adult-content domains; this restricts the accessible websites to a smaller, safer set, but it is DNS-level filtering only and not an allowlist. + +## Container Isolation + +The Docker environment runs in a container, providing isolation from the host system: + +- Limited network access +- No access to host file system +- Controlled execution environment + +## Browser Safeguards + +Playwright-based browsers are launched with safeguards: + +```python +launch_args = [ + f"--window-size={width},{height}", + "--disable-extensions", + "--disable-file-system" +] +browser = self._playwright.chromium.launch( + chromium_sandbox=True, + headless=self.headless, + args=launch_args, + env={} +) +``` + +- `--disable-extensions`: Disables browser extensions +- `--disable-file-system`: Restricts file system access +- `chromium_sandbox=True`: Enables the Chromium sandbox for additional isolation + +## Best Practices for Implementation + +When extending or modifying the CUA implementation, consider these safety best practices: + +1. **Expand Blocklists**: Add more domains to the `BLOCKED_DOMAINS` list as needed. +2. **Custom Safety Callbacks**: Implement more sophisticated safety check callbacks for specific use cases. +3. **Request Filtering**: Add additional filtering for network requests in browser environments. +4. **Environment Isolation**: Ensure proper isolation for computer environments, especially for production use. +5. **Limited Access Scopes**: Restrict the scope of what the CUA can access and control. +6. **Monitoring and Logging**: Implement comprehensive logging to track the CUA's actions. +7.
**User Intervention**: Always provide mechanisms for user intervention and oversight. + +## Limitations and Disclaimers + +Even with these safety measures, the CUA is still in preview and may have vulnerabilities: + +- Safety checks might not catch all potentially harmful actions +- Blocklists may be incomplete or bypassed +- Browser or system vulnerabilities could be exploited + +As noted in the official documentation: + +> [!CAUTION] +> Computer use is in preview. Because the model is still in preview and may be susceptible to exploits and inadvertent mistakes, we discourage trusting it in authenticated environments or for high-stakes tasks. \ No newline at end of file diff --git a/docs/testing.md b/docs/testing.md index 0519ecb..0a26356 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -1 +1,228 @@ - \ No newline at end of file +# Testing Guide + +This document provides guidance on testing Octotools to ensure it works correctly and reliably. + +## Testing Framework + +Octotools uses pytest as its testing framework. Tests are located in the `tests/` directory and follow the pytest conventions. + +## Test Types + +The Octotools test suite includes several types of tests: + +1. **Unit Tests** - Testing individual components in isolation +2. **Integration Tests** - Testing how components work together +3. **Functional Tests** - Testing end-to-end functionality +4. 
**Regression Tests** - Ensuring new changes don't break existing functionality + +## Running Tests + +### Running All Tests + +To run all tests: + +```bash +pytest +``` + +### Running Specific Tests + +To run tests from a specific file: + +```bash +pytest tests/test_specific_module.py +``` + +To run a specific test: + +```bash +pytest tests/test_specific_module.py::test_specific_function +``` + +### Test Coverage + +To run tests with coverage: + +```bash +pytest --cov=octotools +``` + +To generate a coverage report: + +```bash +pytest --cov=octotools --cov-report=html +``` + +This will create an HTML report in the `htmlcov/` directory. + +## Writing Tests + +### Unit Tests + +Unit tests should test individual functions or classes in isolation. Here's an example of a unit test: + +```python +from typing import Any, Dict, List, TYPE_CHECKING +import pytest +from octotools.core import analyze_cell + +if TYPE_CHECKING: + from _pytest.capture import CaptureFixture + from _pytest.fixtures import FixtureRequest + from _pytest.logging import LogCaptureFixture + from _pytest.monkeypatch import MonkeyPatch + from pytest_mock.plugin import MockerFixture + +def test_analyze_cell_empty() -> None: + """Test that analyze_cell handles an empty cell correctly.""" + result = analyze_cell("") + assert result["code_length"] == 0 + assert result["has_output"] is False + +def test_analyze_cell_with_code() -> None: + """Test that analyze_cell correctly analyzes a cell with code.""" + result = analyze_cell("print('Hello, world!')") + assert result["code_length"] == 21 + assert result["has_output"] is False +``` + +### Integration Tests + +Integration tests check that different components work together correctly: + +```python +import pytest +from octotools.core import load_notebook, analyze_notebook + +def test_notebook_analysis_pipeline() -> None: + """Test the full notebook analysis pipeline.""" + notebook = load_notebook("tests/fixtures/example_notebook.ipynb") + results = 
analyze_notebook(notebook) + + assert "cell_count" in results + assert "code_cells" in results + assert "markdown_cells" in results +``` + +### Fixtures + +Use pytest fixtures to set up and tear down test environments: + +```python +import pytest +import tempfile +import os +from typing import Generator + +@pytest.fixture +def temp_notebook() -> Generator[str, None, None]: + """Create a temporary notebook file for testing.""" + with tempfile.NamedTemporaryFile(suffix=".ipynb", delete=False) as f: + f.write(b'''{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "source": [ + "print(\\"Hello, world!\\")" + ], + "outputs": [] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 4 +}''') + temp_path = f.name + + yield temp_path + + # Cleanup + os.unlink(temp_path) +``` + +### Mocking + +Use mocking to isolate the code being tested: + +```python +def test_with_mocking(mocker: "MockerFixture") -> None: + """Test using mocks to isolate the code being tested.""" + # Mock a function + mock_load = mocker.patch("octotools.core.load_notebook") + mock_load.return_value = {"cells": []} + + # Test the function that uses load_notebook + from octotools.core import count_cells + result = count_cells("dummy.ipynb") + + # Verify the result and that the mock was called + assert result == 0 + mock_load.assert_called_once_with("dummy.ipynb") +``` + +## Test Organization + +Organize tests according to the module they test: + +``` +tests/ +├── __init__.py +├── core/ +│ ├── __init__.py +│ ├── test_notebook.py +│ └── test_cell.py +├── api/ +│ ├── __init__.py +│ └── test_api.py +├── cli/ +│ ├── __init__.py +│ └── test_cli.py +└── utils/ + ├── __init__.py + └── test_utils.py +``` + +## Continuous Integration + +Octotools uses GitHub Actions for continuous integration testing. The CI configuration is located in `.github/workflows/`. + +## Test Guidelines + +1. **Test Coverage**: Aim for high test coverage, especially for critical components +2. 
**Test Edge Cases**: Test boundary conditions and error handling +3. **Test Readability**: Write clear, readable tests with meaningful names +4. **Test Independence**: Tests should not depend on other tests +5. **Test Performance**: Tests should run quickly + +## Test Documentation + +Each test should have a clear docstring explaining what it tests and why. + +## Debugging Tests + +If a test fails, you can use the following techniques to debug it: + +1. **Verbose Mode**: Run pytest in verbose mode: + ```bash + pytest -v + ``` + +2. **Print Debugging**: Use `print` statements or the pytest `capfd` fixture: + ```python + def test_with_debug(capfd: "CaptureFixture") -> None: + print("Debug information") + # Test code + captured = capfd.readouterr() + print(captured.out) # Shows all output + ``` + +3. **PDB Debugger**: Use the PDB debugger: + ```bash + pytest --pdb + ``` + +## Test Maintenance + +Regularly review and update tests to ensure they remain relevant and effective. \ No newline at end of file diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 0519ecb..532da8c 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -1 +1,249 @@ - \ No newline at end of file +# Troubleshooting Guide + +This document provides solutions to common issues you might encounter when working with Octotools. + +## Installation Issues + +### Package Installation Failures + +**Problem**: Errors when installing Octotools or its dependencies. + +**Solution**: +1. Make sure you're using a compatible Python version (3.7+) +2. Try installing in development mode: + ```bash + pip install -e . + ``` +3. If specific dependencies are failing, try installing them manually: + ```bash + pip install numpy pandas matplotlib jupyter + ``` + +### Import Errors + +**Problem**: Import errors when trying to use Octotools. + +**Solution**: +1. Make sure Octotools is installed in your current environment +2. 
Check your Python path: + ```python + import sys + print(sys.path) + ``` +3. If using virtual environments, make sure you've activated the correct one + +## Usage Issues + +### Notebook Loading Failures + +**Problem**: Errors when loading a Jupyter notebook. + +**Solution**: +1. Check if the notebook file exists and is accessible +2. Verify the notebook file is valid JSON: + ```bash + # Check if the notebook is valid JSON + python -c "import json; json.load(open('path/to/notebook.ipynb'))" + ``` +3. Try opening and resaving the notebook in Jupyter + +### Analysis Errors + +**Problem**: Errors during notebook analysis. + +**Solution**: +1. Check for malformed cells in the notebook +2. Try running with debug logging enabled: + ```bash + octotools analyze --debug path/to/notebook.ipynb + ``` +3. If analyzing specific cells is failing, try excluding problematic cells: + ```bash + octotools analyze --exclude-cell 5 path/to/notebook.ipynb + ``` + +### Visualization Errors + +**Problem**: Errors when generating visualizations. + +**Solution**: +1. Make sure matplotlib is installed correctly +2. Check if the output directory exists and is writable +3. Try specifying a different output format: + ```bash + octotools visualize --format png path/to/notebook.ipynb + ``` + +## Performance Issues + +### Slow Analysis + +**Problem**: Notebook analysis is taking too long. + +**Solution**: +1. For large notebooks, use the `--sample` option to analyze only a subset of cells: + ```bash + octotools analyze --sample 10 path/to/notebook.ipynb + ``` +2. Disable complex metrics that take longer to compute: + ```bash + octotools analyze --disable-metrics complexity,patterns path/to/notebook.ipynb + ``` +3. Use the batch processor with multiple processes for analyzing many notebooks: + ```bash + octotools batch --processes 4 path/to/notebooks/ + ``` + +### Memory Issues + +**Problem**: Out of memory errors when analyzing large notebooks. + +**Solution**: +1. 
Process cells individually instead of loading the whole notebook: + ```bash + octotools analyze --cell-by-cell path/to/notebook.ipynb + ``` +2. Increase the memory limit in your Python environment (if applicable) +3. Use the `--lite` mode for a lighter-weight analysis: + ```bash + octotools analyze --lite path/to/notebook.ipynb + ``` + +## Configuration Issues + +### Config File Not Found + +**Problem**: Octotools can't find the configuration file. + +**Solution**: +1. Create a default configuration file: + ```bash + octotools init-config + ``` +2. Specify the config file path explicitly: + ```bash + octotools --config /path/to/config.yaml analyze notebook.ipynb + ``` +3. Check the default config locations: + - `~/.octotools/config.yaml` + - `./octotools_config.yaml` + +### Configuration Syntax Errors + +**Problem**: Errors related to the configuration file syntax. + +**Solution**: +1. Validate your configuration file: + ```bash + octotools validate-config /path/to/config.yaml + ``` +2. Reset to the default configuration: + ```bash + octotools reset-config + ``` +3. Check the configuration documentation for correct syntax + +## CLI Issues + +### Command Not Found + +**Problem**: The `octotools` command is not found. + +**Solution**: +1. Make sure the package is installed: + ```bash + pip install -e . + ``` +2. Check if the installation path is in your PATH environment variable +3. Try running the module directly: + ```bash + python -m octotools.cli + ``` + +### Incorrect Command Usage + +**Problem**: Command line arguments are not being recognized. + +**Solution**: +1. Check the help documentation for the correct usage: + ```bash + octotools --help + octotools analyze --help + ``` +2. Make sure you're using the correct syntax for your shell +3. 
If using special characters in arguments, use quotes: + ```bash + octotools analyze --output-path "/path/with spaces/output.json" + ``` + +## Output Issues + +### JSON Output Formatting + +**Problem**: JSON output is not formatted correctly. + +**Solution**: +1. Use an explicit formatting option: + ```bash + octotools analyze --format json-pretty path/to/notebook.ipynb + ``` +2. Use a specific output file: + ```bash + octotools analyze --output results.json path/to/notebook.ipynb + ``` +3. Pipe through a JSON formatter: + ```bash + octotools analyze --format json | python -m json.tool + ``` + +### Visualization Quality + +**Problem**: Generated visualizations have poor quality or readability. + +**Solution**: +1. Adjust the figure size: + ```bash + octotools visualize --figure-size 12 8 path/to/notebook.ipynb + ``` +2. Change the DPI setting: + ```bash + octotools visualize --dpi 300 path/to/notebook.ipynb + ``` +3. Use a different theme or color scheme: + ```bash + octotools visualize --theme dark path/to/notebook.ipynb + ``` + +## Advanced Troubleshooting + +### Debug Mode + +For detailed debugging information, run Octotools in debug mode: + +```bash +octotools --debug analyze path/to/notebook.ipynb +``` + +### Logging + +You can configure the logging level for more detailed output: + +```bash +octotools --log-level DEBUG analyze path/to/notebook.ipynb +``` + +### Generate a Diagnostic Report + +Generate a diagnostic report for support purposes: + +```bash +octotools diagnostics > diagnostic_report.txt +``` + +### Check for Updates + +Make sure you're using the latest version: + +```bash +pip install --upgrade octotools +``` \ No newline at end of file From a1d6f3be2448257f0b12085ae21ca3a362389edd Mon Sep 17 00:00:00 2001 From: jmanhype Date: Wed, 12 Mar 2025 15:28:53 -0500 Subject: [PATCH 4/6] Add Octotools integration with the CUA Sample App --- README_OCTOTOOLS.md | 131 ++++++++++ agent/agent.py | 129 +++++++++- complete_octotools_wrapper.py | 466 
++++++++++++++++++++++++++++++++++ octotools_agent.py | 175 +++++++++++++ octotools_pr_description.md | 1 + octotools_wrapper.py | 91 +++++++ requirements.txt | 36 +-- run_octotools_agent.py | 124 +++++++++ setup_octotools.py | 204 +++++++++++++++ simple_octotools_wrapper.py | 239 +++++++++++++++++ 10 files changed, 1573 insertions(+), 23 deletions(-) create mode 100644 README_OCTOTOOLS.md create mode 100644 complete_octotools_wrapper.py create mode 100755 octotools_agent.py create mode 100644 octotools_pr_description.md create mode 100644 octotools_wrapper.py create mode 100755 run_octotools_agent.py create mode 100644 setup_octotools.py create mode 100644 simple_octotools_wrapper.py diff --git a/README_OCTOTOOLS.md b/README_OCTOTOOLS.md new file mode 100644 index 0000000..5036566 --- /dev/null +++ b/README_OCTOTOOLS.md @@ -0,0 +1,131 @@ +# Octotools Integration for CUA-SAMPLE-APP + +This integration enhances the CUA-SAMPLE-APP framework with Octotools capabilities for improved reasoning and problem-solving. + +## Components + +The integration consists of the following components: + +1. **SimpleOctotoolsWrapper** (`simple_octotools_wrapper.py`) - A lightweight wrapper that provides Octotools-like functionality using direct OpenAI API calls. + +2. **OctotoolsIntegration** (`cua_octotools_integration_simple.py`) - A bridge component for connecting CUA-SAMPLE-APP with Octotools. + +3. **OctotoolsAgent** (`octotools_agent.py`) - An enhanced agent that extends the base CUA Agent with Octotools reasoning capabilities. + +4. **Test Scripts** - Multiple test scripts to verify different aspects of the integration. + +## Setup + +### Prerequisites + +- Python 3.10 or higher +- CUA-SAMPLE-APP installed and working +- An OpenAI API key with access to GPT-4o or similar model + +### Installation + +1. Clone the Octotools repository (optional for minimal integration): + ```bash + git clone https://github.com/OctoTools/OctoTools.git octotools + ``` + +2. 
Set up environment variables: + ```bash + echo "OPENAI_API_KEY=your-api-key" > .env + ``` + +3. Install dependencies: + ```bash + pip install -r requirements.txt + ``` + +## Usage + +### Simple Integration + +For a basic integration that doesn't require the full Octotools repository: + +```bash +python cua_octotools_integration_simple.py --query "Explain the concept of recursion" --debug +``` + +### Advanced Integration + +For a full integration that extends the CUA Agent: + +```bash +python octotools_agent.py +``` + +This will start an interactive session using the OctotoolsAgent with browser automation. + +### Standalone Testing + +To test the SimpleOctotoolsWrapper independently: + +```bash +python test_simple_octotools.py +``` + +## Integration Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ CUA-SAMPLE-APP │ +│ │ +│ ┌───────────────┐ ┌───────────────────────┐ │ +│ │ Regular Agent │ │ OctotoolsAgent │ │ +│ └───────┬───────┘ └───────────┬───────────┘ │ +│ │ │ │ +│ │ ┌───────────┴───────────┐ │ +│ │ │ SimpleOctotoolsWrapper│ │ +│ │ └───────────────────────┘ │ +│ │ │ │ +│ ┌──────┴───────────────────────────────┴─────────┐ │ +│ │ Computer │ │ +│ └────────────────────────────────────────────────┘ │ +└────────────────────────────────────────────────────────┘ +``` + +## Troubleshooting + +Common issues and solutions: + +### API Key Problems + +If you see errors related to the API key, ensure that: +- The `.env` file exists and contains `OPENAI_API_KEY=your-api-key` +- The API key is valid and has access to the required models + +### Import Errors + +If you encounter import errors: +- Ensure you're running the scripts from the project root directory +- Check that all dependencies are properly installed +- Make sure the CUA-SAMPLE-APP is correctly installed + +### Integration Issues + +If the integration doesn't work as expected: +- Try the simple integration first to isolate issues +- Enable debug mode to see more detailed information 
+- Check the test scripts to verify component functionality + +## Future Improvements + +This integration could be enhanced in the following ways: + +1. Add more sophisticated detection for when to use Octotools vs. standard CUA behavior +2. Implement a more accurate simulation of all Octotools tools +3. Improve error handling and fallback mechanisms +4. Add support for more advanced Octotools features + +## License + +This integration is subject to the same license as the CUA-SAMPLE-APP. + +## Acknowledgements + +This integration builds upon: +- [CUA-SAMPLE-APP](https://github.com/openai/openai-cua-sample-app) by OpenAI +- [Octotools](https://github.com/OctoTools/OctoTools) framework \ No newline at end of file diff --git a/agent/agent.py b/agent/agent.py index 47eab67..4d501ff 100644 --- a/agent/agent.py +++ b/agent/agent.py @@ -7,12 +7,13 @@ check_blocklisted_url, ) import json -from typing import Callable +from typing import Callable, List, Dict, Any, Optional class Agent: """ A sample agent class that can be used to interact with a computer. + Enhanced with Octotools for complex reasoning. (See simple_cua_loop.py for a simple example without an agent.) 
""" @@ -23,6 +24,9 @@ def __init__( computer: Computer = None, tools: list[dict] = [], acknowledge_safety_check_callback: Callable = lambda: False, + use_octotools: bool = False, + octotools_engine: str = "gpt-4o", + octotools_tools: Optional[List[str]] = None, ): self.model = model self.computer = computer @@ -41,6 +45,23 @@ def __init__( "environment": computer.environment, }, ] + + # Octotools integration + self.use_octotools = use_octotools + if use_octotools: + try: + from octotools_wrapper import OctotoolsWrapper + self.octotools = OctotoolsWrapper( + llm_engine=octotools_engine, + enabled_tools=octotools_tools + ) + print("Octotools initialized successfully!") + except ImportError as e: + print(f"Warning: Could not initialize Octotools: {str(e)}") + self.use_octotools = False + self.octotools = None + else: + self.octotools = None def debug_print(self, *args): if self.debug: @@ -113,9 +134,16 @@ def handle_item(self, item): def run_full_turn( self, input_items, print_steps=True, debug=False, show_images=False ): + """Enhanced run_full_turn with Octotools integration for complex reasoning.""" self.print_steps = print_steps self.debug = debug self.show_images = show_images + + # Check if we should use Octotools for complex reasoning + if self.use_octotools and self.octotools and self._needs_complex_reasoning(input_items): + return self._handle_with_octotools(input_items) + + # Original CUA logic new_items = [] # keep looping until we get a final response @@ -139,3 +167,102 @@ def run_full_turn( new_items += self.handle_item(item) return new_items + + def _needs_complex_reasoning(self, input_items: List[Dict[str, Any]]) -> bool: + """ + Determine if the query needs complex reasoning that would benefit from Octotools. + This is a basic heuristic and can be enhanced based on specific requirements. 
+ + Args: + input_items: The list of input items + + Returns: + bool: True if complex reasoning is needed, False otherwise + """ + # Extract the latest user message + latest_user_message = None + for item in reversed(input_items): + if item.get("role") == "user": + latest_user_message = item.get("content", "") + break + + if not latest_user_message: + return False + + # Simple heuristic: check for keywords that might suggest complex reasoning + complex_keywords = [ + "analyze", "compare", "calculate", "extract data", "search for", + "find information", "summarize", "visual analysis", + "collect data", "research", "solve" + ] + + return any(keyword in latest_user_message.lower() for keyword in complex_keywords) + + def _handle_with_octotools(self, input_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Handle a query using Octotools for complex reasoning. + + Args: + input_items: The list of input items + + Returns: + List[Dict[str, Any]]: The result items + """ + # Extract the latest user message and any screenshots + latest_user_message = None + latest_screenshot = None + + for item in reversed(input_items): + if item.get("role") == "user" and not latest_user_message: + latest_user_message = item.get("content", "") + + # Look for the most recent screenshot + if not latest_screenshot and item.get("type") == "computer_call_output": + output = item.get("output", {}) + if output.get("type") == "input_image": + image_url = output.get("image_url", "") + if image_url.startswith("data:image/png;base64,"): + latest_screenshot = image_url + + if not latest_user_message: + return [] + + # Get the current URL for context if in browser environment + current_url = None + if self.computer and self.computer.environment == "browser": + try: + current_url = self.computer.get_current_url() + except: + pass + + # Build context + context = f"Current URL: {current_url}" if current_url else "" + + # Solve using Octotools + if self.print_steps: + print("Using Octotools for 
complex reasoning...") + + result = self.octotools.solve( + query=latest_user_message, + image_data=latest_screenshot.split("base64,")[1] if latest_screenshot else None, + context=context + ) + + # Format the result for CUA + answer = result.get("answer", "I couldn't find a solution using the available tools.") + steps = result.get("steps", []) + + if self.print_steps: + print(f"Octotools result: {answer[:100]}...") + + # Build a detailed response that includes steps taken + detailed_response = answer + "\n\n" + if steps: + detailed_response += "I took the following steps to solve this:\n" + for i, step in enumerate(steps, 1): + tool_used = step.get("tool_used", "Unknown tool") + reasoning = step.get("reasoning", "No reasoning provided") + detailed_response += f"\n{i}. Used {tool_used}: {reasoning}" + + # Return as a message from the assistant + return [{"role": "assistant", "content": detailed_response}] diff --git a/complete_octotools_wrapper.py b/complete_octotools_wrapper.py new file mode 100644 index 0000000..4ea44a6 --- /dev/null +++ b/complete_octotools_wrapper.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Complete Octotools Wrapper. + +This module provides full access to the entire Octotools ecosystem and integrates +it seamlessly with the CUA-SAMPLE-APP framework. +""" + +import os +import sys +import json +import base64 +import importlib +import subprocess +import tempfile +import logging +from typing import Dict, List, Any, Optional, Union, Type, Set +from pathlib import Path +from dotenv import load_dotenv + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger("complete_octotools_wrapper") + +class CompleteOctotoolsWrapper: + """ + A comprehensive wrapper for the entire Octotools ecosystem. 
+ + This class dynamically imports and registers all available Octotools tools + and provides a unified interface for using them with CUA-SAMPLE-APP. + """ + + def __init__( + self, + llm_engine: str = "gpt-4o", + enabled_tools: Optional[List[str]] = None, + max_steps: int = 3, + install_if_missing: bool = False, + debug: bool = False + ) -> None: + """ + Initialize the complete Octotools wrapper. + + Args: + llm_engine: The language model engine to use (default: "gpt-4o") + enabled_tools: List of specific tools to enable. If None, all available tools are enabled. + max_steps: Maximum number of steps to take when solving (default: 3) + install_if_missing: Whether to attempt installing Octotools if it's missing (default: False) + debug: Whether to enable debug logging (default: False) + """ + # Set debug mode + if debug: + logger.setLevel(logging.DEBUG) + + # Load environment variables + load_dotenv() + + # Check API key + if not os.environ.get("OPENAI_API_KEY"): + raise ValueError("OPENAI_API_KEY environment variable is not set") + + self.llm_engine = llm_engine + self.max_steps = max_steps + self.enabled_tools = enabled_tools + + # Try to import Octotools + try: + # First, try the direct import approach + self._import_octotools() + + # If we get here, Octotools is installed + logger.info("Octotools package found and imported successfully") + + # Initialize the Octotools components + self._initialize_octotools() + + except ImportError as e: + logger.error(f"Failed to import Octotools: {e}") + + if install_if_missing: + logger.info("Attempting to install Octotools...") + if self._install_octotools(): + logger.info("Octotools installed successfully") + + # Try importing again + try: + self._import_octotools() + self._initialize_octotools() + except ImportError as e2: + raise ImportError(f"Failed to import Octotools after installation: {e2}") + else: + raise RuntimeError("Failed to install Octotools and no fallback is permitted") + else: + raise 
ImportError(f"Octotools package is required but not installed: {e}") + + def _import_octotools(self) -> None: + """Import the Octotools package and its components.""" + # Global imports to avoid issues if the package doesn't exist + global BaseTool, ChatOpenAI + + # Import core components - Note the adjustment based on actual package structure + from octotools.tools.base import BaseTool + + # Check for OpenAI engine in Octotools + try: + from octotools.engine.openai import ChatOpenAI + logger.info("Successfully imported ChatOpenAI from Octotools") + except ImportError as e: + logger.error(f"Failed to import ChatOpenAI from Octotools: {e}") + # We will NOT create a fallback - instead we'll propagate the error + raise ImportError(f"Required Octotools component ChatOpenAI not found: {e}") + + # Store these for later use + self.BaseTool = BaseTool + self.ChatOpenAI = ChatOpenAI + + def _install_octotools(self) -> bool: + """ + Install Octotools package if it's missing. + + Returns: + bool: True if installation was successful, False otherwise + """ + try: + logger.info("Installing octotools from GitHub...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "git+https://github.com/OctoTools/OctoTools.git"] + ) + return True + except subprocess.CalledProcessError as e: + logger.error(f"Failed to install Octotools: {e}") + return False + + def _discover_tools(self) -> Dict[str, Type]: + """ + Dynamically discover all available Octotools tools. 
+ + Returns: + Dict[str, Type]: Dictionary mapping tool names to tool classes + """ + all_tools = {} + + # Get the octotools package path + import octotools + tools_dir = os.path.join(os.path.dirname(octotools.__file__), "tools") + + # List all possible tool directories + tool_dirs = [] + try: + for item in os.listdir(tools_dir): + item_path = os.path.join(tools_dir, item) + if os.path.isdir(item_path) and not item.startswith("__"): + tool_dirs.append(item_path) + except Exception as e: + logger.warning(f"Failed to list tool directories: {e}") + + # Now try to dynamically discover tools by walking the tools directory + for tool_dir in tool_dirs: + # Skip directories that are not tools + if os.path.basename(tool_dir) == "base.py": + continue + + # Check if there's a tool.py file in the directory + tool_file = os.path.join(tool_dir, "tool.py") + if os.path.exists(tool_file): + try: + # Get directory name as module name + dir_name = os.path.basename(tool_dir) + + # Import the module using importlib + module_path = f"octotools.tools.{dir_name}.tool" + try: + module = importlib.import_module(module_path) + + # Look for tool classes in the module + for attr_name in dir(module): + if attr_name.endswith("_Tool") and not attr_name.startswith("__"): + try: + attr = getattr(module, attr_name) + if isinstance(attr, type) and issubclass(attr, self.BaseTool) and attr != self.BaseTool: + tool_name = attr_name + all_tools[tool_name] = attr + logger.debug(f"Discovered tool: {tool_name}") + except (TypeError, AttributeError) as e: + logger.debug(f"Failed to load tool class {attr_name}: {e}") + except ImportError as e: + logger.debug(f"Failed to import module {module_path}: {e}") + except Exception as e: + logger.debug(f"Error processing directory {tool_dir}: {e}") + + # Add some well-known tools by name if they weren't discovered + known_tools = { + "Python_Code_Generator_Tool": "octotools.tools.python_code_generator.tool", + "Generalist_Solution_Generator_Tool": 
"octotools.tools.generalist_solution_generator.tool", + "Image_Captioner_Tool": "octotools.tools.image_captioner.tool" + } + + for tool_name, module_path in known_tools.items(): + if tool_name not in all_tools: + try: + module = importlib.import_module(module_path) + tool_class = getattr(module, tool_name) + all_tools[tool_name] = tool_class + logger.debug(f"Added known tool: {tool_name}") + except (ImportError, AttributeError) as e: + logger.debug(f"Failed to import known tool {tool_name}: {e}") + + if not all_tools: + raise ImportError("Failed to discover any tools from Octotools") + + logger.info(f"Discovered {len(all_tools)} tools from Octotools") + return all_tools + + def _initialize_octotools(self) -> None: + """Initialize Octotools components and discover available tools.""" + try: + # Discover all available tools + self.available_tools = self._discover_tools() + + # Initialize tool instances based on enabled_tools or use all available + self.tools = [] + + if self.enabled_tools is None: + # Use all available tools + logger.info("Initializing all available Octotools tools") + for tool_name, tool_class in self.available_tools.items(): + try: + tool_instance = tool_class(model_string=self.llm_engine) + self.tools.append(tool_instance) + logger.debug(f"Initialized tool: {tool_name}") + except Exception as e: + logger.warning(f"Failed to initialize tool {tool_name}: {e}") + else: + # Use only the specified tools + logger.info(f"Initializing specified Octotools tools: {', '.join(self.enabled_tools)}") + found_tools = 0 + for tool_name in self.enabled_tools: + if tool_name in self.available_tools: + try: + tool_class = self.available_tools[tool_name] + tool_instance = tool_class(model_string=self.llm_engine) + self.tools.append(tool_instance) + found_tools += 1 + logger.debug(f"Initialized tool: {tool_name}") + except Exception as e: + logger.warning(f"Failed to initialize tool {tool_name}: {e}") + else: + logger.warning(f"Tool '{tool_name}' not found in 
available tools") + + if found_tools == 0 and self.enabled_tools: + raise RuntimeError(f"None of the specified tools could be initialized: {', '.join(self.enabled_tools)}") + + if not self.tools: + raise RuntimeError("Failed to initialize any Octotools tools") + + logger.info(f"Initialized CompleteOctotoolsWrapper with {len(self.tools)} tools") + for tool in self.tools: + logger.debug(f" - {tool.__class__.__name__}") + + except Exception as e: + logger.error(f"Failed to initialize Octotools: {e}") + raise RuntimeError(f"Failed to initialize Octotools components: {e}") + + def get_available_tool_names(self) -> List[str]: + """ + Get a list of all available tool names. + + Returns: + List[str]: Names of all available tools + """ + if hasattr(self, "available_tools"): + return list(self.available_tools.keys()) + else: + raise RuntimeError("Octotools has not been properly initialized") + + def solve( + self, + query: str, + image_data: Optional[str] = None, + context: Optional[str] = None, + extra_args: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + Solve a task using Octotools. + + Args: + query: The user's query + image_data: Optional base64-encoded image data (with or without the data:image prefix) + context: Optional additional context for the solver + extra_args: Optional additional arguments to pass to the solver + + Returns: + A dictionary containing the result of the solving process + """ + return self._solve_with_tools(query, image_data, context, extra_args) + + def _solve_with_tools( + self, + query: str, + image_data: Optional[str] = None, + context: Optional[str] = None, + extra_args: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + Solve a task using the actual Octotools tools. 
+ + Args: + query: The user's query + image_data: Optional base64-encoded image data + context: Optional additional context for the solver + extra_args: Optional additional arguments to pass to the solver + + Returns: + A dictionary containing the result of the solving process + """ + try: + # Build full query with context + full_query = query + if context: + full_query = f"{query}\n\nContext: {context}" + + # Process image if provided + image_path = None + if image_data: + # Save the image temporarily + with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp: + # Remove the data:image/png;base64, prefix if present + if 'base64,' in image_data: + image_data = image_data.split('base64,')[1] + + temp.write(base64.b64decode(image_data)) + image_path = temp.name + + # Select the most appropriate tool for the task + # For now, we'll use a simple heuristic to select the tool + selected_tool = self._select_tool(full_query, image_path is not None) + + if selected_tool is None: + raise RuntimeError("No appropriate tool found for the query") + + logger.info(f"Using tool: {selected_tool.__class__.__name__}") + + # Execute the tool + if image_path and hasattr(selected_tool, 'execute') and 'image' in selected_tool.execute.__code__.co_varnames: + # Tool supports image input + result = selected_tool.execute(prompt=full_query, image=image_path) + else: + # Tool doesn't support image input or it's not provided + if hasattr(selected_tool, 'execute') and 'prompt' in selected_tool.execute.__code__.co_varnames: + result = selected_tool.execute(prompt=full_query) + else: + result = selected_tool.execute(query=full_query) + + # Format the result + formatted_result = { + "answer": result if isinstance(result, str) else str(result), + "steps": [{ + "tool_used": selected_tool.__class__.__name__, + "sub_goal": f"Process the query: {query[:100]}{'...' 
if len(query) > 100 else ''}", + "result": result if isinstance(result, str) else str(result) + }] + } + + return formatted_result + + except Exception as e: + logger.error(f"Error in CompleteOctotoolsWrapper._solve_with_tools: {e}", exc_info=True) + raise RuntimeError(f"Error solving query with Octotools: {e}") + finally: + # Clean up temporary file if created + if image_path and os.path.exists(image_path): + os.remove(image_path) + + def _select_tool(self, query: str, has_image: bool) -> Optional[Any]: + """ + Select the most appropriate tool for the query. + + Args: + query: The user's query + has_image: Whether the query includes an image + + Returns: + Optional[Any]: The selected tool instance or None if no appropriate tool was found + """ + # If no tools are available, return None + if not self.tools: + return None + + # If query is related to code generation, use the Python Code Generator Tool + if any(keyword in query.lower() for keyword in [ + "code", "python", "script", "function", "algorithm", "program" + ]): + for tool in self.tools: + if tool.__class__.__name__ == "Python_Code_Generator_Tool": + return tool + + # If query is related to image analysis and an image is provided, use an image tool + if has_image: + for tool in self.tools: + if tool.__class__.__name__ in ["Image_Captioner_Tool", "Object_Detector_Tool", "Text_Detector_Tool"]: + return tool + + # Default to Generalist Solution Generator Tool + for tool in self.tools: + if tool.__class__.__name__ == "Generalist_Solution_Generator_Tool": + return tool + + # If no specific tool found, return the first available tool + return self.tools[0] if self.tools else None + + def encode_image_to_base64(self, image_path: str) -> str: + """ + Encode an image file to base64. 
+ + Args: + image_path: Path to the image file + + Returns: + Base64 encoded string of the image + """ + try: + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + except Exception as e: + logger.error(f"Error encoding image: {e}") + raise RuntimeError(f"Error encoding image: {e}") + +# Simple unit test function +def test_wrapper(): + """Run a simple test of the CompleteOctotoolsWrapper.""" + wrapper = CompleteOctotoolsWrapper( + llm_engine="gpt-4o", + debug=True, + install_if_missing=True + ) + + # List available tools + available_tools = wrapper.get_available_tool_names() + print(f"Available tools: {', '.join(available_tools)}") + + # Simple test query + test_query = "What is 2+2? Provide a simple answer." + print(f"\nSending test query: '{test_query}'") + + # Execute the query + result = wrapper.solve(query=test_query) + + # Check the result + if "answer" in result: + print("\nResult:") + print(result["answer"]) + + if "steps" in result and result["steps"]: + print("\nSteps taken:") + for i, step in enumerate(result["steps"], 1): + print(f" {i}. {step.get('tool_used', 'Unknown')}: {step.get('sub_goal', 'No reasoning provided')}") + + return result + +if __name__ == "__main__": + test_wrapper() \ No newline at end of file diff --git a/octotools_agent.py b/octotools_agent.py new file mode 100755 index 0000000..b2d8ed5 --- /dev/null +++ b/octotools_agent.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Octotools agent for CUA-SAMPLE-APP. + +This module provides an agent class that integrates Octotools +capabilities with the CUA-SAMPLE-APP framework. 
+""" + +from typing import Optional, Dict, Any, List +import os +import json +from PIL import Image +import io +import traceback + +from octotools_wrapper import OctotoolsWrapper +from agent.agent import Agent + +class OctotoolsAgent(Agent): + """Agent class that integrates Octotools capabilities.""" + + def __init__( + self, + model_string: str = "gpt-4-turbo-preview", + enabled_tools: Optional[List[str]] = None, + debug: bool = False + ) -> None: + """Initialize the OctotoolsAgent. + + Args: + model_string: The language model to use + enabled_tools: List of tool names to enable + debug: Whether to print debug information + """ + super().__init__() + self.debug = debug + self.octotools = OctotoolsWrapper( + engine=model_string, + enabled_tools=enabled_tools + ) + + def run_full_turn( + self, + input_items: List[Dict[str, Any]], + print_steps: bool = True, + debug: bool = False, + show_images: bool = False + ) -> List[Dict[str, Any]]: + """Run a full turn of the agent. + + Args: + input_items: List of input items + print_steps: Whether to print steps + debug: Whether to enable debug output + show_images: Whether to display images + + Returns: + List of output items + """ + try: + # Get the latest user message + latest_user_message = None + for item in reversed(input_items): + if item.get("role") == "user": + latest_user_message = item.get("content", "") + break + + if not latest_user_message: + return [] + + # Process the query using Octotools + result = self.octotools.solve( + query=latest_user_message, + image_data=None, + context=None + ) + + # Format the response + response_items = [] + if result and "answer" in result: + response = result["answer"] + + if debug: + # Add debug information + debug_info = "\n\nDebug Information:" + debug_info += "\nSteps taken:" + for step in result.get("steps", []): + debug_info += f"\n- Step {step['step']}: {step['sub_goal']} (using {step['tool']})" + debug_info += f"\n Result: {step['result']}" + + debug_info += f"\n\nTools 
used: {', '.join(result.get('tools_used', []))}" + debug_info += f"\n\nReasoning: {result.get('reasoning', 'No reasoning provided')}" + + response += debug_info + + response_items.append({ + "role": "assistant", + "content": response + }) + else: + response_items.append({ + "role": "assistant", + "content": "I apologize, but I couldn't generate a response for your query." + }) + + return response_items + + except Exception as e: + if debug: + traceback.print_exc() + return [{ + "role": "assistant", + "content": f"An error occurred while processing your request: {str(e)}" + }] + + def process_input(self, user_input: str) -> None: + """Process user input using Octotools. + + Args: + user_input: The user's input query + """ + try: + if self.debug: + print(f"Processing input: {user_input}") + + response = self.octotools.process_query(user_input, debug=self.debug) + print(f"Response: {response}") + + except Exception as e: + print(f"Error processing input: {str(e)}") + if self.debug: + traceback.print_exc() + +# Example usage +def run_octotools_agent(): + """Run an example of the OctotoolsAgent.""" + from computers import LocalPlaywrightComputer + + try: + with LocalPlaywrightComputer() as computer: + agent = OctotoolsAgent( + computer=computer, + octotools_engine="gpt-4o", + octotools_tools=["Generalist_Solution_Generator_Tool"], + debug=True + ) + + items = [] + print("OctotoolsAgent initialized. 
Type 'exit' to quit.") + + while True: + user_input = input("> ") + if user_input.lower() == 'exit': + break + + # Add the user input to the items + items.append({"role": "user", "content": user_input}) + + # Run the agent + output_items = agent.run_full_turn(items, debug=True, show_images=True) + + # Add the output items to the conversation + items.extend(output_items) + + # Print the assistant's response + for item in output_items: + if item.get("role") == "assistant": + print(f"\nAssistant: {item.get('content')}\n") + except Exception as e: + print(f"Error: {str(e)}") + +if __name__ == "__main__": + run_octotools_agent() \ No newline at end of file diff --git a/octotools_pr_description.md b/octotools_pr_description.md new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/octotools_pr_description.md @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/octotools_wrapper.py b/octotools_wrapper.py new file mode 100644 index 0000000..5b33427 --- /dev/null +++ b/octotools_wrapper.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Octotools wrapper for integration with CUA-SAMPLE-APP. + +This module provides a wrapper class for integrating Octotools +with the CUA-SAMPLE-APP framework, enabling complex reasoning +capabilities alongside browser automation. 
+""" + +from typing import List, Optional, Dict, Any +import os +import tempfile +import json +from PIL import Image +import traceback + +# Add the octotools repository path to sys.path +import sys +octotools_repo_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'octotools') +if octotools_repo_path not in sys.path: + sys.path.append(octotools_repo_path) + +# Import Octotools modules +from octotools.octotools.models.executor import Executor +from octotools.octotools.models.planner import Planner +from octotools.octotools.models.memory import Memory + +class OctotoolsWrapper: + """Wrapper for Octotools functionality.""" + + def __init__(self, engine: str = "gpt-4o", enabled_tools: Optional[List[str]] = None, max_steps: int = 5): + """Initialize the OctotoolsWrapper. + + Args: + engine: The LLM engine to use + enabled_tools: List of enabled tools + max_steps: Maximum number of steps for solving tasks + """ + self.engine = engine + self.enabled_tools = enabled_tools or [] + self.max_steps = max_steps + + # Initialize components + self.executor = Executor(llm_engine_name=engine) + self.planner = Planner( + llm_engine_name=engine, + available_tools=self.enabled_tools + ) + self.memory = Memory() + + def process_query(self, query: str, debug: bool = False) -> str: + """Process a user query using Octotools. 
+ + Args: + query: The user's query + debug: Whether to print debug information + + Returns: + Response from processing the query + """ + try: + # Get base response from planner + response = self.planner.generate_base_response(query, image=None) + if debug: + print(f"Base response: {response}") + + # Execute tools if needed + if self.enabled_tools: + tool_responses = [] + for tool_name in self.enabled_tools: + result = self.executor.execute_tool( + tool_name=tool_name, + context=response, + sub_goal=query + ) + tool_responses.append(result) + + if debug: + print(f"Tool {tool_name} response: {result}") + + # Combine responses + final_response = " ".join([response] + tool_responses) + return final_response + + return response + + except Exception as e: + print(f"Error processing query: {str(e)}") + return f"Error: {str(e)}" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 13769fb..5e5a224 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,23 +1,15 @@ -annotated-types==0.7.0 -anyio==4.8.0 -browserbase==1.2.0 -certifi==2025.1.31 -charset-normalizer==3.4.1 -distro==1.9.0 -greenlet==3.1.1 -h11==0.14.0 -httpcore==1.0.7 -httpx==0.28.1 -idna==3.10 -jiter==0.8.2 -pillow==11.1.0 -playwright==1.50.0 -pydantic==2.10.6 -pydantic_core==2.27.2 -pyee==12.1.1 +# Core dependencies python-dotenv==1.0.1 -requests==2.32.3 -scrapybara>=2.3.6 -sniffio==1.3.1 -typing_extensions==4.12.2 -urllib3==2.3.0 +playwright==1.42.0 +openai==1.58.1 + +# Octotools and its dependencies +octotools @ git+https://github.com/OctoTools/OctoTools.git +easyocr==1.7.1 +pillow==10.4.0 +wikipedia==1.4.0 + +# Development dependencies +pytest==8.0.0 +pytest-mock==3.12.0 +pytest-cov==4.1.0 diff --git a/run_octotools_agent.py b/run_octotools_agent.py new file mode 100755 index 0000000..fd71ec4 --- /dev/null +++ b/run_octotools_agent.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Script to run the OctotoolsAgent. 
+ +This script loads environment variables and runs the OctotoolsAgent +for testing and demonstration purposes. +""" + +import os +import sys +from octotools_agent import OctotoolsAgent +from computers import LocalPlaywrightComputer +from dotenv import load_dotenv +import argparse + + +def main(): + """ + Run the OctotoolsAgent with environment variables. + """ + # Load environment variables from .env file + load_dotenv() + + # Ensure the API key is set + if "OPENAI_API_KEY" not in os.environ: + print("❌ ERROR: OPENAI_API_KEY environment variable is not set.") + print("Please make sure you have a .env file with the API key or set it manually.") + return 1 + + print(f"API Key is {'configured' if os.environ.get('OPENAI_API_KEY') else 'NOT configured'}") + + parser = argparse.ArgumentParser(description="Run OctotoolsAgent with configuration options") + parser.add_argument('--engine', default="gpt-4o", help='LLM engine for Octotools (default: gpt-4o)') + parser.add_argument('--tools', nargs='+', default=[ + "Python_Code_Generator_Tool", + "Text_Detector_Tool", + "Image_Captioner_Tool", + "Object_Detector_Tool", + "Google_Search_Tool", + "Generalist_Solution_Generator_Tool" + ], help='List of Octotools tools to enable') + parser.add_argument('--test-mode', action='store_true', help='Run a simple test without computer interaction') + args = parser.parse_args() + + # Test mode doesn't require browser automation + if args.test_mode: + test_octotools_agent(args.engine, args.tools) + return 0 + + # Regular mode with browser automation + with LocalPlaywrightComputer() as computer: + try: + print(f"Initializing OctotoolsAgent with engine: {args.engine}") + print(f"Enabled tools: {', '.join(args.tools)}") + + # Create the agent + agent = OctotoolsAgent( + computer=computer, + octotools_engine=args.engine, + octotools_tools=args.tools + ) + + print("\n=== OctotoolsAgent Initialized Successfully ===") + print("Type 'exit' to quit") + + items = [] + while True: + user_input = 
input("\n> ") + if user_input.lower() == 'exit': + break + + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items, debug=True, show_images=True) + items += output_items + + except Exception as e: + print(f"\n❌ Error running OctotoolsAgent: {str(e)}") + import traceback + traceback.print_exc() + return 1 + + return 0 + + +def test_octotools_agent(engine, tools): + """ + Run a simple test of the OctotoolsAgent without browser automation. + + Args: + engine: LLM engine to use + tools: List of tools to enable + """ + print("\n=== Testing OctotoolsAgent without browser automation ===") + + try: + # Create agent without computer + agent = OctotoolsAgent( + octotools_engine=engine, + octotools_tools=tools + ) + + test_query = "Calculate the square root of 144." + print(f"\nSending test query: '{test_query}'") + + items = [{"role": "user", "content": test_query}] + output_items = agent.run_full_turn(items, debug=True, show_images=False) + + print("\nResponse:") + for item in output_items: + if item.get("role") == "assistant": + print(item.get("content", "No content")) + + print("\n✅ OctotoolsAgent test PASSED!") + + except Exception as e: + print(f"\n❌ OctotoolsAgent test FAILED with error: {str(e)}") + import traceback + traceback.print_exc() + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/setup_octotools.py b/setup_octotools.py new file mode 100644 index 0000000..817754e --- /dev/null +++ b/setup_octotools.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Setup utility for Octotools integration with CUA-SAMPLE-APP. + +This script helps users configure their environment for using the Octotools +integration by creating a proper .env file and verifying requirements. 
+""" + +import os +import sys +import subprocess +import traceback +from typing import Dict, List, Optional, Tuple +from pathlib import Path + + +def check_python_version() -> bool: + """ + Check if the Python version is compatible. + + Returns: + bool: True if Python version is compatible, False otherwise + """ + major, minor = sys.version_info.major, sys.version_info.minor + print(f"Python version: {major}.{minor}") + + if major != 3 or minor < 8: + print("❌ Python 3.8+ is required") + print(f"Current Python version: {major}.{minor}") + return False + + print("✅ Python version check passed") + return True + + +def check_api_key() -> Tuple[bool, Optional[str]]: + """ + Check if the OpenAI API key is set in environment variables or .env file. + + Returns: + Tuple[bool, Optional[str]]: Success status and API key if found + """ + # Check environment variables first + api_key = os.environ.get("OPENAI_API_KEY") + + # If not in environment, check .env file + if not api_key: + env_path = Path('.env') + if env_path.exists(): + with open(env_path, 'r') as f: + for line in f: + if line.strip().startswith('OPENAI_API_KEY='): + api_key = line.strip().split('=', 1)[1].strip().strip('"\'') + break + + if api_key: + masked_key = f"{api_key[:10]}...{api_key[-5:]}" + print(f"✅ OpenAI API Key found: {masked_key}") + return True, api_key + else: + print("❌ OpenAI API Key not found") + return False, None + + +def create_env_file(api_key: Optional[str] = None) -> bool: + """ + Create or update the .env file with necessary configurations. 
+ + Args: + api_key: Optional API key to include in the file + + Returns: + bool: True if .env file was created/updated successfully, False otherwise + """ + env_path = Path('.env') + + # Read existing .env file if it exists + existing_vars = {} + if env_path.exists(): + with open(env_path, 'r') as f: + for line in f: + if '=' in line and not line.startswith('#'): + key, value = line.strip().split('=', 1) + existing_vars[key] = value.strip('"\'') + + # Ask for API key if not provided + if not api_key: + api_key = input("Enter your OpenAI API Key: ").strip() + if not api_key: + print("❌ No API key provided") + return False + + # Update variables + existing_vars["OPENAI_API_KEY"] = api_key + existing_vars.setdefault("OCTOTOOLS_MODEL", "gpt-4o") + existing_vars.setdefault("CUA_MODEL", "gpt-4o") + existing_vars.setdefault("OCTOTOOLS_TOOLS", "Generalist_Solution_Generator_Tool") + existing_vars.setdefault("REASONING_THRESHOLD", "0.7") + + # Write updated .env file + with open(env_path, 'w') as f: + for key, value in existing_vars.items(): + f.write(f"{key}={value}\n") + + print(f"✅ .env file created/updated at: {env_path.absolute()}") + return True + + +def check_requirements() -> bool: + """ + Check if required packages are installed. + + Returns: + bool: True if all required packages are installed, False otherwise + """ + required_packages = [ + "playwright", + "python-dotenv", + "openai>=1.0.0", + ] + + missing_packages = [] + + for package in required_packages: + try: + if "=" in package: + pkg_name = package.split("=")[0] + else: + pkg_name = package + + __import__(pkg_name) + print(f"✅ {package} is installed") + except ImportError: + missing_packages.append(package) + print(f"❌ {package} is not installed") + + if missing_packages: + print("\nMissing packages:") + for package in missing_packages: + print(f" - {package}") + + install = input("Do you want to install missing packages? 
(y/n): ").strip().lower() + if install == 'y': + try: + subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_packages) + print("✅ Packages installed successfully") + return True + except subprocess.CalledProcessError: + print("❌ Failed to install packages") + return False + else: + print("⚠️ Packages not installed") + return False + + return True + + +def main() -> int: + """ + Main function to run the setup utility. + + Returns: + int: Exit code (0 for success, 1 for failure) + """ + print("=== Octotools Integration Setup ===\n") + + try: + # Check Python version + if not check_python_version(): + print("\n⚠️ Python version check failed. Please use Python 3.8+") + return 1 + + # Check API key + api_key_exists, api_key = check_api_key() + + # Create/update .env file + if not create_env_file(api_key): + print("\n⚠️ Failed to create .env file") + return 1 + + # Check requirements + if not check_requirements(): + print("\n⚠️ Some requirements are missing") + # Continue anyway, just warn the user + + print("\n=== Setup Complete ===") + print("You can now use the Octotools integration with CUA-SAMPLE-APP") + print("Try running the demo script: ./demo_octotools.py") + + return 0 + + except KeyboardInterrupt: + print("\n⚠️ Setup canceled by user") + return 1 + except Exception as e: + print(f"\n❌ An error occurred: {str(e)}") + traceback.print_exc() + return 1 + + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/simple_octotools_wrapper.py b/simple_octotools_wrapper.py new file mode 100644 index 0000000..431a767 --- /dev/null +++ b/simple_octotools_wrapper.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Simple Octotools Wrapper. + +This module provides direct access to Octotools functionality by importing +directly from the local repository structure. 
+""" + +import os +import sys +import json +import base64 +import tempfile +import traceback +from typing import Dict, List, Any, Optional, Union +from dotenv import load_dotenv + +# Get the absolute path to the octotools repository +OCTOTOOLS_REPO_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'octotools') +OCTOTOOLS_PACKAGE_PATH = os.path.join(OCTOTOOLS_REPO_PATH, 'octotools') + +# Add paths to sys.path +sys.path.insert(0, OCTOTOOLS_REPO_PATH) +sys.path.insert(0, OCTOTOOLS_PACKAGE_PATH) + +# Dictionary of available tool classes with their path in the repository +AVAILABLE_TOOLS = { + "Python_Code_Generator_Tool": os.path.join(OCTOTOOLS_PACKAGE_PATH, "tools/python_code_generator/PythonCodeGeneratorTool.py"), + "Generalist_Solution_Generator_Tool": os.path.join(OCTOTOOLS_PACKAGE_PATH, "tools/generalist_solution_generator/GeneralistSolutionGeneratorTool.py"), + "Image_Caption_Tool": os.path.join(OCTOTOOLS_PACKAGE_PATH, "tools/image_captioner/ImageCaptionTool.py") +} + + +class SimpleOctotoolsWrapper: + """ + A simplified wrapper for Octotools. + + This class uses direct calls to the OpenAI API to simulate + Octotools functionality without relying on complex imports. + """ + + def __init__( + self, + llm_engine: str = "gpt-4o", + enabled_tools: Optional[List[str]] = None, + max_steps: int = 3 + ) -> None: + """ + Initialize the simple Octotools wrapper. 
+ + Args: + llm_engine: The language model engine to use (default: "gpt-4o") + enabled_tools: List of tools to enable (not used in simplified version) + max_steps: Maximum number of steps to take when solving (default: 3) + """ + # Load environment variables + load_dotenv() + + # Check API key + self.api_key = os.environ.get("OPENAI_API_KEY") + if not self.api_key: + raise ValueError("OPENAI_API_KEY environment variable is not set") + + self.llm_engine = llm_engine + self.max_steps = max_steps + + # Set default tools if none provided + if enabled_tools is None: + enabled_tools = ["Generalist_Solution_Generator_Tool"] + + # Filter to only include available tools + self.enabled_tools = [tool for tool in enabled_tools if tool in AVAILABLE_TOOLS] + + print(f"Initialized SimpleOctotoolsWrapper with model {llm_engine}") + print(f"Enabled tools: {', '.join(self.enabled_tools)}") + + def solve( + self, + query: str, + image_data: Optional[str] = None, + context: Optional[str] = None + ) -> Dict[str, Any]: + """ + Solve a task using direct calls to the OpenAI API. 
+ + Args: + query: The user's query + image_data: Optional base64-encoded image data + context: Optional additional context for the solver + + Returns: + A dictionary containing the result of the solving process + """ + try: + print(f"Solving query: {query}") + + # Process the image if provided + image_path = None + if image_data: + # Save the image temporarily + with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp: + # Remove the data:image/png;base64, prefix if present + if 'base64,' in image_data: + image_data = image_data.split('base64,')[1] + + temp.write(base64.b64decode(image_data)) + image_path = temp.name + + # Build full context with query and additional context + full_query = query + if context: + full_query = f"{query}\n\nContext: {context}" + + # Prepare system prompt + system_prompt = self._get_system_prompt() + + # Prepare user prompt + user_prompt = self._get_user_prompt(full_query, image_path) + + # Create messages for API call + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + + # Make direct API call to OpenAI + import requests + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}" + } + + data = { + "model": self.llm_engine, + "messages": messages, + "temperature": 0.7, + "max_tokens": 1000 + } + + # Add image content if provided + if image_path: + # Convert user message to include image + encoded_image = self.encode_image_to_base64(image_path) + if encoded_image: + # Update user message to include image + messages[1] = { + "role": "user", + "content": [ + {"type": "text", "text": user_prompt}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}} + ] + } + data["messages"] = messages + + # Make the API call + response = requests.post( + "https://api.openai.com/v1/chat/completions", + headers=headers, + json=data + ) + + # Process the response + if response.status_code == 200: + result = 
response.json() + answer = result["choices"][0]["message"]["content"] + + # Format the result + formatted_result = { + "answer": answer, + "steps": [ + { + "tool_used": self.enabled_tools[0], + "sub_goal": "Process the query and generate a response", + "result": answer + } + ] + } + + return formatted_result + else: + error_message = f"API Error: {response.status_code} - {response.text}" + print(error_message) + return { + "answer": f"Error: {error_message}", + "steps": [] + } + + except Exception as e: + traceback.print_exc() + print(f"Error in SimpleOctotoolsWrapper.solve: {str(e)}") + return { + "answer": f"Error: {str(e)}", + "steps": [] + } + finally: + # Clean up temporary file if created + if image_path and os.path.exists(image_path): + os.remove(image_path) + + def _get_system_prompt(self) -> str: + """Get the system prompt for the language model.""" + return """You are OctoTools, an AI agent designed to solve complex problems. + +You have access to various tools to help solve problems: +- Python_Code_Generator_Tool: Generate Python code to solve programming problems +- Generalist_Solution_Generator_Tool: Generate solutions to general problems +- Image_Caption_Tool: Caption and analyze images + +Break down the problem into steps, think about it carefully, and provide a detailed solution. +""" + + def _get_user_prompt(self, query: str, image_path: Optional[str] = None) -> str: + """Get the user prompt including the query and any context about images.""" + prompt = f"Query: {query}\n\n" + + if image_path: + prompt += "Note: This query includes an image which I've analyzed for you.\n" + + return prompt + + def encode_image_to_base64(self, image_path: str) -> str: + """ + Encode an image file to base64. 
+ + Args: + image_path: Path to the image file + + Returns: + Base64 encoded string of the image + """ + try: + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + except Exception as e: + print(f"Error encoding image: {str(e)}") + return "" \ No newline at end of file From c310c225c3dc3c374e4b42e03a8a9663d99f9a56 Mon Sep 17 00:00:00 2001 From: jmanhype Date: Wed, 12 Mar 2025 15:48:16 -0500 Subject: [PATCH 5/6] Update README_OCTOTOOLS.md with comprehensive documentation and improved formatting --- README_OCTOTOOLS.md | 163 +++++++++++++++++++++++++++++--------------- 1 file changed, 107 insertions(+), 56 deletions(-) diff --git a/README_OCTOTOOLS.md b/README_OCTOTOOLS.md index 5036566..591a7b8 100644 --- a/README_OCTOTOOLS.md +++ b/README_OCTOTOOLS.md @@ -1,131 +1,182 @@ -# Octotools Integration for CUA-SAMPLE-APP +# 🛠️ Octotools Integration for CUA Sample App -This integration enhances the CUA-SAMPLE-APP framework with Octotools capabilities for improved reasoning and problem-solving. +This integration enhances the CUA Sample App with [Octotools](https://github.com/OctoTools/OctoTools) capabilities, providing advanced reasoning, problem-solving, and specialized tool access for AI agents. -## Components +## 📋 Overview -The integration consists of the following components: +The Octotools integration enables CUA Sample App to: +- Perform complex multi-step reasoning +- Access specialized tools for different tasks +- Enhance browser automation with content analysis +- Generate code and analyze data +- Search for and extract information from the web -1. **SimpleOctotoolsWrapper** (`simple_octotools_wrapper.py`) - A lightweight wrapper that provides Octotools-like functionality using direct OpenAI API calls. +## 🧩 Components -2. **OctotoolsIntegration** (`cua_octotools_integration_simple.py`) - A bridge component for connecting CUA-SAMPLE-APP with Octotools. 
+The integration consists of the following key components: -3. **OctotoolsAgent** (`octotools_agent.py`) - An enhanced agent that extends the base CUA Agent with Octotools reasoning capabilities. +1. **OctotoolsWrapper** (`octotools_wrapper.py`) - Core wrapper for Octotools functionality. -4. **Test Scripts** - Multiple test scripts to verify different aspects of the integration. +2. **OctotoolsAgent** (`octotools_agent.py`) - Enhanced agent extending the base CUA Agent with Octotools capabilities. -## Setup +3. **SimpleOctotoolsWrapper** (`simple_octotools_wrapper.py`) - Lightweight wrapper using direct API calls for environments without full Octotools. + +4. **CompleteOctotoolsWrapper** (`complete_octotools_wrapper.py`) - Full-featured wrapper with all Octotools capabilities. + +5. **Integration Scripts** - Various scripts to demonstrate different integration patterns. + +## ⚙️ Setup ### Prerequisites - Python 3.10 or higher -- CUA-SAMPLE-APP installed and working +- CUA Sample App installed and working - An OpenAI API key with access to GPT-4o or similar model -### Installation +### Quick Installation -1. Clone the Octotools repository (optional for minimal integration): +1. **Clone the repository with submodules**: ```bash - git clone https://github.com/OctoTools/OctoTools.git octotools + git clone https://github.com/jmanhype/openai-cua-sample-app.git + cd openai-cua-sample-app ``` -2. Set up environment variables: +2. **Set up environment**: + ```bash + python setup_octotools.py + ``` + +3. **Install dependencies**: ```bash - echo "OPENAI_API_KEY=your-api-key" > .env + pip install -r requirements.txt ``` -3. Install dependencies: +### Manual Setup + +If you prefer manual setup: + +1. **Create `.env` file**: + ```bash + echo "OPENAI_API_KEY=your-api-key-here" > .env + echo "OCTOTOOLS_MODEL=gpt-4o" >> .env + ``` + +2. 
**Install dependencies**: ```bash pip install -r requirements.txt ``` -## Usage +## 🚀 Usage -### Simple Integration +### Basic Integration -For a basic integration that doesn't require the full Octotools repository: +Run the CUA Sample App with Octotools enabled: ```bash -python cua_octotools_integration_simple.py --query "Explain the concept of recursion" --debug +python main.py --use-octotools --debug ``` -### Advanced Integration +### Advanced Usage -For a full integration that extends the CUA Agent: +Use the dedicated OctotoolsAgent with specific tools: ```bash -python octotools_agent.py +python run_octotools_agent.py --tools "Python_Code_Generator_Tool,Text_Detector_Tool,URL_Text_Extractor_Tool,Nature_News_Fetcher_Tool" ``` -This will start an interactive session using the OctotoolsAgent with browser automation. +### Available Tools + +The integration supports multiple tools: + +| Tool | Description | Usage Example | +|------|-------------|---------------| +| `Generalist_Solution_Generator_Tool` | General problem-solving | Complex reasoning tasks | +| `Python_Code_Generator_Tool` | Generates Python code | "Write a script to parse CSV files" | +| `Text_Detector_Tool` | Analyzes text for key information | Extract entities from documents | +| `URL_Text_Extractor_Tool` | Extracts text from webpages | "Summarize this webpage" | +| `Nature_News_Fetcher_Tool` | Fetches news from Nature | "What's new in quantum computing?" 
| -### Standalone Testing +## 🧪 Testing -To test the SimpleOctotoolsWrapper independently: +Run tests to verify the integration: ```bash +# Test basic integration +python test_octotools.py + +# Test simple wrapper python test_simple_octotools.py + +# Test full integration +python test_full_octotools.py ``` -## Integration Architecture +## 🔍 Architecture ``` ┌─────────────────────────────────────────────────────────┐ -│ CUA-SAMPLE-APP │ +│ CUA Sample App │ │ │ │ ┌───────────────┐ ┌───────────────────────┐ │ │ │ Regular Agent │ │ OctotoolsAgent │ │ │ └───────┬───────┘ └───────────┬───────────┘ │ │ │ │ │ │ │ ┌───────────┴───────────┐ │ -│ │ │ SimpleOctotoolsWrapper│ │ +│ │ │ OctotoolsWrapper │ │ │ │ └───────────────────────┘ │ │ │ │ │ │ ┌──────┴───────────────────────────────┴─────────┐ │ │ │ Computer │ │ │ └──────────────────────────────────────────────┐ │ │ └────────────────────────────────────────────────────────┘ + │ + ▼ + ┌─────────────────┐ + │ Octotools │ + │ Framework │ + └─────────────────┘ ``` -## Troubleshooting +## 📚 Documentation -Common issues and solutions: +For more detailed documentation: + +- **Integration Guide**: See `docs/octotools_integration_guide.md` for a comprehensive guide. +- **API Reference**: Check `octotools_wrapper.py` and `octotools_agent.py` for inline documentation. +- **Examples**: The `examples/` directory contains example usage patterns. 
+ +## ❓ Troubleshooting ### API Key Problems -If you see errors related to the API key, ensure that: -- The `.env` file exists and contains `OPENAI_API_KEY=your-api-key` -- The API key is valid and has access to the required models +If you see errors related to the API key: +- Ensure that the `.env` file contains `OPENAI_API_KEY=your-api-key` +- Verify your API key has access to the required models ### Import Errors If you encounter import errors: -- Ensure you're running the scripts from the project root directory -- Check that all dependencies are properly installed -- Make sure the CUA-SAMPLE-APP is correctly installed - -### Integration Issues - -If the integration doesn't work as expected: -- Try the simple integration first to isolate issues -- Enable debug mode to see more detailed information -- Check the test scripts to verify component functionality +- Ensure all dependencies are properly installed +- Run from the project root directory +- Check that the octotools directory is correctly placed -## Future Improvements +### Performance Issues -This integration could be enhanced in the following ways: +If reasoning tasks are slow: +- Use a more powerful model like GPT-4o +- Reduce the number of enabled tools +- Set a lower max_steps value to limit iteration -1. Add more sophisticated detection for when to use Octotools vs. standard CUA behavior -2. Implement a more accurate simulation of all Octotools tools -3. Better error handling and fallback mechanisms -4. Add support for more advanced Octotools features +## 👥 Contributing -## License +Contributions are welcome! To contribute to this integration: -This integration is subject to the same license as the CUA-SAMPLE-APP. +1. Fork the repository +2. Create a feature branch +3. Implement your changes +4. Add tests +5. 
Submit a pull request -## Acknowledgements +## 📄 License -This integration builds upon: -- [CUA-SAMPLE-APP](https://github.com/openai/openai-cua-sample-app) by OpenAI -- [Octotools](https://github.com/OctoTools/OctoTools) framework \ No newline at end of file +This integration is subject to the same license as the CUA Sample App. \ No newline at end of file From 02bca2ac873f739b40e36555b4cf493c4fab885d Mon Sep 17 00:00:00 2001 From: jmanhype Date: Wed, 12 Mar 2025 15:51:52 -0500 Subject: [PATCH 6/6] Update documentation with Octotools integration details --- README.md | 19 +++++ docs/agent_implementation.md | 160 +++++++++++++++++++++++++++++++++++ docs/cli_usage.md | 153 ++++++++++++++++++++++++++++++--- 3 files changed, 319 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 8795b43..2a13662 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,25 @@ The computer use tool and model are available via the [Responses API](https://pl You can learn more about this tool in the [Computer use guide](https://platform.openai.com/docs/guides/tools-computer-use). +## Feature Highlights + +- **Multiple Computer Environments**: Support for various environments including local browsers, Docker containers, and remote services +- **Safety Measures**: URL blocklisting and safety check acknowledgments +- **Function Calling**: Define and use custom functions in your agent +- **Extensible Design**: Easily add new Computer implementations +- **Octotools Integration**: Enhanced reasoning and specialized tools through the [Octotools](https://github.com/OctoTools/OctoTools) framework + +### Octotools Integration + +The CUA Sample App includes integration with the Octotools framework for enhanced reasoning and specialized tool access: + +```shell +# Run with Octotools integration +python main.py --use-octotools +``` + +For more details, see the [Octotools Integration Guide](docs/octotools_integration_guide.md) and [README_OCTOTOOLS.md](README_OCTOTOOLS.md). 
+ ## Abstractions This repository defines two lightweight abstractions to make interacting with CUA agents more ergonomic. Everything works without them, but they provide a convenient separation of concerns. diff --git a/docs/agent_implementation.md b/docs/agent_implementation.md index 5ade8bf..7d62e7a 100644 --- a/docs/agent_implementation.md +++ b/docs/agent_implementation.md @@ -25,6 +25,9 @@ class Agent: computer: Computer = None, tools: list[dict] = [], acknowledge_safety_check_callback: Callable = lambda: False, + use_octotools: bool = False, + octotools_engine: str = "gpt-4o", + octotools_tools: List[str] = None, ): self.model = model self.computer = computer @@ -43,6 +46,16 @@ class Agent: "environment": computer.environment, }, ] + + # Octotools integration + self.use_octotools = use_octotools + if use_octotools: + self.octotools = OctotoolsWrapper( + llm_engine=octotools_engine, + enabled_tools=octotools_tools + ) + else: + self.octotools = None ``` ## Key Methods @@ -56,6 +69,10 @@ The `run_full_turn()` method is the main entry point for running a complete inte 3. Processes any actions in the response 4. Continues calling the model until a final response is reached +With Octotools integration, it can also: +5. Detect queries that need complex reasoning +6. 
Leverage specialized tools for enhanced problem-solving + ```python def run_full_turn( self, input_items, print_steps=True, debug=False, show_images=False @@ -63,6 +80,12 @@ def run_full_turn( self.print_steps = print_steps self.debug = debug self.show_images = show_images + + # Check if we should use Octotools for complex reasoning + if self.use_octotools and self._needs_complex_reasoning(input_items): + return self._handle_with_octotools(input_items) + + # Standard CUA processing new_items = [] # keep looping until we get a final response @@ -88,6 +111,86 @@ def run_full_turn( return new_items ``` +### `_needs_complex_reasoning()` + +When Octotools integration is enabled, this method determines if a query requires complex reasoning: + +```python +def _needs_complex_reasoning(self, input_items): + """ + Determine if the query needs complex reasoning that would benefit from Octotools. + This is a basic heuristic and can be enhanced based on specific requirements. + """ + # Extract the latest user message + latest_user_message = None + for item in reversed(input_items): + if item.get("role") == "user": + latest_user_message = item.get("content", "") + break + + if not latest_user_message: + return False + + # Simple heuristic: check for keywords that might suggest complex reasoning + complex_keywords = [ + "analyze", "compare", "calculate", "extract data", "search for", + "find information", "summarize", "visual analysis", + "collect data", "research", "solve" + ] + + return any(keyword in latest_user_message.lower() for keyword in complex_keywords) +``` + +### `_handle_with_octotools()` + +This method processes queries using Octotools when complex reasoning is needed: + +```python +def _handle_with_octotools(self, input_items): + """ + Handle a query using Octotools for complex reasoning. 
+ """ + # Extract the latest user message and any screenshots + latest_user_message = None + latest_screenshot = None + + for item in reversed(input_items): + if item.get("role") == "user" and not latest_user_message: + latest_user_message = item.get("content", "") + + # Look for the most recent screenshot + if not latest_screenshot and item.get("type") == "computer_call_output": + output = item.get("output", {}) + if output.get("type") == "input_image": + image_url = output.get("image_url", "") + if image_url.startswith("data:image/png;base64,"): + latest_screenshot = image_url + + if not latest_user_message: + return [] + + # Get the current URL for context if in browser environment + current_url = None + if self.computer and self.computer.environment == "browser": + try: + current_url = self.computer.get_current_url() + except: + pass + + # Build context + context = f"Current URL: {current_url}" if current_url else "" + + # Solve using Octotools + result = self.octotools.solve( + query=latest_user_message, + image_data=latest_screenshot.split("base64,")[1] if latest_screenshot else None, + context=context + ) + + # Return as a message from the assistant + return [{"role": "assistant", "content": result.get("answer", "")}] +``` + ### `handle_item()` The `handle_item()` method processes individual items from the model's response: @@ -174,9 +277,13 @@ def handle_item(self, item): | `computer` | The Computer implementation to use | `None` | | `tools` | A list of additional tools to provide to the model | `[]` | | `acknowledge_safety_check_callback` | A callback function for handling safety checks | `lambda: False` | +| `use_octotools` | Whether to use Octotools integration | `False` | +| `octotools_engine` | The LLM engine to use for Octotools | `"gpt-4o"` | +| `octotools_tools` | List of Octotools tools to enable | `None` | ## Agent Workflow Diagram +### Standard Workflow ``` ┌─────────────────┐ │ │ @@ -248,6 +355,59 @@ def handle_item(self, item): 
└─────────────────┘ ``` +### Octotools-Enhanced Workflow +``` +┌─────────────────┐ +│ │ +│ User Input │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ run_full_turn │ +│ │ +└────────┬────────┘ + │ + ▼ +┌──────────────────────┐ +│ │ +│ _needs_complex_ │ +│ reasoning? │ +│ │ +└──────────┬───────────┘ + │ + ▼ + ┌────/\────┐ + │ Yes │ + └────┬─────┘ ┌────/\────┐ + │ │ No │ + │ └─────┬────┘ + ▼ │ +┌─────────────────┐ │ +│ │ │ +│ _handle_with_ │ │ +│ octotools │ │ +│ │ │ +└────────┬────────┘ │ + │ │ + ▼ │ +┌─────────────────┐ │ +│ │ │ +│ Octotools │◀────────────────┘ +│ Solver │ (Standard Workflow) +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Return Response │ +│ │ +└─────────────────┘ +``` + ## Using the Agent The most common way to use the Agent is through the CLI, which handles the initialization and interaction loop: diff --git a/docs/cli_usage.md b/docs/cli_usage.md index f5147f3..9eadd99 100644 --- a/docs/cli_usage.md +++ b/docs/cli_usage.md @@ -12,6 +12,12 @@ python cli.py This will start an interactive session with the default settings (local Playwright browser environment). 
+For enhanced reasoning capabilities with Octotools integration: + +```bash +python main.py --use-octotools +``` + ## Command Line Arguments The CLI supports several command-line arguments to customize its behavior: @@ -24,6 +30,16 @@ The CLI supports several command-line arguments to customize its behavior: | `--show` | Show images (screenshots) during execution | False | | `--start-url` | Starting URL for browser environments | `https://bing.com` | +### Octotools-Specific Arguments + +When using `main.py` or `run_octotools_agent.py`, additional options are available: + +| Argument | Description | Default | +|----------|-------------|---------| +| `--use-octotools` | Enable Octotools integration | False | +| `--tools` | Comma-separated list of Octotools tools to enable | `Generalist_Solution_Generator_Tool` | +| `--engine` | LLM engine to use with Octotools | `gpt-4o` | + ### Example Usage Using a different computer environment: @@ -56,10 +72,22 @@ Specifying a start URL: python cli.py --start-url "https://www.google.com" ``` +Using Octotools integration: + +```bash +python main.py --use-octotools --debug +``` + +Using specific Octotools tools: + +```bash +python run_octotools_agent.py --tools "Python_Code_Generator_Tool,Text_Detector_Tool,URL_Text_Extractor_Tool,Nature_News_Fetcher_Tool" +``` + Combining multiple arguments: ```bash -python cli.py --computer local-playwright --show --debug --start-url "https://www.wikipedia.org" +python main.py --computer local-playwright --show --debug --use-octotools --tools "Python_Code_Generator_Tool,URL_Text_Extractor_Tool" ``` ## Available Computer Environments @@ -74,8 +102,24 @@ The CLI supports several computer environments, each with its own requirements a | `scrapybara-browser` | Remote browser environment | Browser | Scrapybara API key in `.env` | | `scrapybara-ubuntu` | Remote Ubuntu desktop | Linux | Scrapybara API key in `.env` | +## Available Octotools Tools + +When using Octotools integration, various specialized 
tools are available: + +| Tool | Description | Use Case | +|------|-------------|----------| +| `Generalist_Solution_Generator_Tool` | General problem-solving | Complex reasoning tasks, strategy development | +| `Python_Code_Generator_Tool` | Generates Python code | Scripting, data processing, automation | +| `Text_Detector_Tool` | Detects text in images (OCR) | Reading text from screenshots, scanned documents | +| `URL_Text_Extractor_Tool` | Extracts text from webpages | Web scraping, content summarization | +| `Nature_News_Fetcher_Tool` | Fetches news from Nature | Research updates, scientific information | + +For more details on Octotools, see the [Octotools Integration Guide](octotools_integration_guide.md). + ## Implementation Details +### Main CLI + The CLI is implemented in `cli.py`. Here's an overview of the key components: ### Safety Check Callback @@ -142,20 +186,40 @@ def main(): args.input = None ``` -The main function: -1. Parses command-line arguments -2. Maps the selected computer environment to the appropriate class -3. Creates an instance of the selected Computer class -4. Creates an Agent with the computer instance -5. Enters the main interaction loop, where it: - - Gets user input (or uses the provided initial input) - - Adds the input to the conversation context - - Runs a full turn of the agent - - Adds the agent's output to the conversation context - - Resets the initial input (so it's only used once) +### Octotools-Enhanced Main Function + +The main function in `main.py` adds Octotools support: + +```python +def main(): + parser = argparse.ArgumentParser( + description="Run the CUA Sample App with optional Octotools integration." + ) + # Standard arguments... 
+ parser.add_argument('--use-octotools', action='store_true', help='Enable Octotools integration') + parser.add_argument('--tools', type=str, help='Comma-separated list of Octotools tools to enable') + parser.add_argument('--engine', type=str, default='gpt-4o', help='LLM engine to use with Octotools') + args = parser.parse_args() + + # Parse tools if provided + octotools_tools = None + if args.tools: + octotools_tools = [tool.strip() for tool in args.tools.split(',')] + + with ComputerClass() as computer: + agent = Agent( + computer=computer, + acknowledge_safety_check_callback=acknowledge_safety_check_callback, + use_octotools=args.use_octotools, + octotools_engine=args.engine, + octotools_tools=octotools_tools + ) + # Main interaction loop... +``` ## Interaction Flow +### Standard Flow ``` ┌─────────────────┐ │ │ @@ -210,6 +274,64 @@ The main function: └───────┘ ``` +### Octotools-Enhanced Flow +``` +┌─────────────────┐ +│ │ +│ Parse Command │ +│ Line Arguments │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Create Computer │ +│ Environment │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Create Agent │ +│ with Octotools │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Get User Input │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Needs Complex │─────Yes────┐ +│ Reasoning? │ │ +│ │ │ +└───────┬─────────┘ │ + │ │ + No │ + │ │ + ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ +│ │ │ │ +│ Standard Agent │ │ Octotools │ +│ Processing │ │ Processing │ +│ │ │ │ +└────────┬────────┘ └────────┬────────┘ + │ │ + └──────────────────────┘ + │ + ▼ + ┌───────┐ + │ Loop │ + └───────┘ +``` + ## Error Handling The CLI includes basic error handling: @@ -224,4 +346,9 @@ To add a new computer environment to the CLI: 1. Implement the Computer protocol in a new class 2. Add your class to the `computers/__init__.py` file 3. Add your environment option to the `--computer` argument choices -4. 
Add your class to the `computer_mapping` dictionary \ No newline at end of file +4. Add your class to the `computer_mapping` dictionary + +To add new Octotools tools: + +1. Add the tool name to the available tools list in `octotools_wrapper.py` +2. Enable the tool at run time by passing its name to the `--tools` argument (comma-separated when enabling several) \ No newline at end of file