
Commit e5dade8

Merge branch 'main' into upstream-main

2 parents 776a7f8 + 0a22173

File tree

8 files changed: +419 −246 lines changed

README.md

Lines changed: 12 additions & 23 deletions
@@ -12,14 +12,14 @@
 </div>

 <!--
-:rotating_light: **OUTAGE NOTIFICATION: gpt-4-vision-preview**
+:rotating_light: **OUTAGE NOTIFICATION: gpt-4o**
 **This model is currently experiencing an outage so the self-operating computer may not work as expected.**
 -->


 ## Key Features
 - **Compatibility**: Designed for various multimodal models.
-- **Integration**: Currently integrated with **GPT-4v, Gemini Pro Vision, and LLaVa.**
+- **Integration**: Currently integrated with **GPT-4o, Gemini Pro Vision, Claude 3 and LLaVa.**
 - **Future Plans**: Support for additional models.

 ## Ongoing Development
@@ -45,7 +45,7 @@ pip install self-operating-computer
 ```
 operate
 ```
-3. **Enter your OpenAI Key**: If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)
+3. **Enter your OpenAI Key**: If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys). If you need to change your key at a later point, run `vim .env` to open the `.env` file and replace the old key.

 <div align="center">
   <img src="https://github.com/OthersideAI/self-operating-computer/blob/main/readme/key.png" width="300" style="margin: 10px;"/>
@@ -58,24 +58,6 @@ operate
   <img src="https://github.com/OthersideAI/self-operating-computer/blob/main/readme/terminal-access-2.png" width="300" style="margin: 10px;"/>
 </div>

-### Alternatively installation with `.sh`
-
-1. **Clone the repo** to a directory on your computer:
-```
-git clone https://github.com/OthersideAI/self-operating-computer.git
-```
-2. **Cd into directory**:
-
-```
-cd self-operating-computer
-```
-
-3. **Run the installation script**:
-
-```
-./run.sh
-```
-
 ## Using `operate` Modes

 ### Multimodal Models `-m`
@@ -88,7 +70,14 @@ operate -m gemini-pro-vision

 **Enter your Google AI Studio API key when the terminal prompts you for it.** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need to [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working; if anyone knows a simpler way, please make a PR.

-### Locally Hosted LLaVA Through Ollama
+#### Try Claude `-m claude-3`
+Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Claude dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it.
+
+```
+operate -m claude-3
+```
+
+#### Try LLaVa Hosted Through Ollama `-m llava`
 If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama!
 *Note: Ollama currently only supports MacOS and Linux*

@@ -187,5 +176,5 @@ Stay updated with the latest developments:
 - This project is compatible with Mac OS, Windows, and Linux (with X server installed).

 ## OpenAI Rate Limiting Note
-The ```gpt-4-vision-preview``` model is required. To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5.
+The ```gpt-4o``` model is required. To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5.
 Learn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)**
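The README change above points users at `.env` for key rotation. As a quick illustration (not part of the commit), here is a minimal sketch of how a key saved in `.env` reaches the framework, mirroring the `load_dotenv()`/`os.getenv()` calls visible in evaluate.py and operate/config.py below; the key value is a placeholder:

```python
import os

from dotenv import load_dotenv

# Assumes a local .env containing a line like: OPENAI_API_KEY=sk-placeholder
load_dotenv()  # reads the .env in the current directory into the environment
print("OpenAI key loaded:", bool(os.getenv("OPENAI_API_KEY")))
```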

evaluate.py

Lines changed: 42 additions & 32 deletions
@@ -25,7 +25,8 @@
 Guideline: {guideline}
 """

-SCREENSHOT_PATH = os.path.join('screenshots', 'screenshot.png')
+SCREENSHOT_PATH = os.path.join("screenshots", "screenshot.png")
+

 # Check if on a windows terminal that supports ANSI escape codes
 def supports_ansi():
@@ -37,6 +38,7 @@ def supports_ansi():
     is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
     return supported_platform and is_a_tty

+
 if supports_ansi():
     # Standard green text
     ANSI_GREEN = "\033[32m"
@@ -62,8 +64,8 @@ def supports_ansi():
     ANSI_YELLOW = ""
     ANSI_RED = ""
     ANSI_BRIGHT_MAGENTA = ""
-
-
+
+
 def format_evaluation_prompt(guideline):
     prompt = EVALUATION_PROMPT.format(guideline=guideline)
     return prompt
@@ -72,88 +74,95 @@ def format_evaluation_prompt(guideline):
 def parse_eval_content(content):
     try:
         res = json.loads(content)
-
+
         print(res["reason"])
-
+
         return res["guideline_met"]
     except:
-        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
+        print(
+            "The model gave a bad evaluation response and it couldn't be parsed. Exiting..."
+        )
         exit(1)


 def evaluate_final_screenshot(guideline):
-    '''Load the final screenshot and return True or False if it meets the given guideline.'''
+    """Load the final screenshot and return True or False if it meets the given guideline."""
     with open(SCREENSHOT_PATH, "rb") as img_file:
         img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

-    eval_message = [{
-        "role": "user",
-        "content": [
-            {"type": "text", "text": format_evaluation_prompt(guideline)},
-            {
-                "type": "image_url",
-                "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
-            },
-        ],
-    }]
-
+    eval_message = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": format_evaluation_prompt(guideline)},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+    ]
+
     response = openai.chat.completions.create(
-        model="gpt-4-vision-preview",
+        model="gpt-4o",
         messages=eval_message,
         presence_penalty=1,
         frequency_penalty=1,
         temperature=0.7,
-        max_tokens=300,
     )

     eval_content = response.choices[0].message.content
-
+
     return parse_eval_content(eval_content)


 def run_test_case(objective, guideline, model):
-    '''Returns True if the result of the test with the given prompt meets the given guideline for the given model.'''
+    """Returns True if the result of the test with the given prompt meets the given guideline for the given model."""
     # Run `operate` with the model to evaluate and the test case prompt
-    subprocess.run(['operate', '-m', model, '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
-
+    subprocess.run(
+        ["operate", "-m", model, "--prompt", f'"{objective}"'],
+        stdout=subprocess.DEVNULL,
+    )
+
     try:
         result = evaluate_final_screenshot(guideline)
-    except(OSError):
+    except OSError:
         print("[Error] Couldn't open the screenshot for evaluation")
         return False
-
+
     return result


 def get_test_model():
     parser = argparse.ArgumentParser(
         description="Run the self-operating-computer with a specified model."
     )
-
+
     parser.add_argument(
         "-m",
         "--model",
         help="Specify the model to evaluate.",
         required=False,
         default="gpt-4-with-ocr",
     )
-
+
     return parser.parse_args().model


 def main():
     load_dotenv()
     openai.api_key = os.getenv("OPENAI_API_KEY")
-
+
     model = get_test_model()
-
+
     print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
     print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

-    passed = 0; failed = 0
+    passed = 0
+    failed = 0
     for objective, guideline in TEST_CASES.items():
         print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
-
+
         result = run_test_case(objective, guideline, model)
         if result:
             print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
@@ -166,5 +175,6 @@ def main():
         f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed"
     )

+
 if __name__ == "__main__":
     main()
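For context (not part of the diff): each test case shells out to the CLI, equivalent to running `operate -m gpt-4-with-ocr --prompt "<objective>"` by hand, and `parse_eval_content` above assumes the evaluation model answers with a JSON object carrying a `reason` string and a `guideline_met` boolean. A minimal sketch with an invented payload shows the contract:

```python
import json

# Hypothetical well-formed reply; in evaluate.py this string comes from the
# gpt-4o response in evaluate_final_screenshot.
content = '{"reason": "The final screenshot shows the requested page.", "guideline_met": true}'

res = json.loads(content)
print(res["reason"])         # the model's justification
print(res["guideline_met"])  # True -> the test case counts as passed
```

Any reply that fails `json.loads` or omits these keys lands in the `except` branch and aborts the run.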

operate/config.py

Lines changed: 19 additions & 1 deletion
@@ -5,6 +5,7 @@
 from dotenv import load_dotenv
 from ollama import Client
 from openai import OpenAI
+import anthropic
 from prompt_toolkit.shortcuts import input_dialog


@@ -38,6 +39,10 @@ def __init__(self):
         )
         self.ollama_host = (
             None  # instance variables are backups in case saving to a `.env` fails
+        )
+        self.anthropic_api_key = (
+            None  # instance variables are backups in case saving to a `.env` fails
         )

     def initialize_openai(self):
@@ -91,6 +96,13 @@ def initialize_ollama(self):
         model = Client(host=self.ollama_host)
         return model

+    def initialize_anthropic(self):
+        if self.anthropic_api_key:
+            api_key = self.anthropic_api_key
+        else:
+            api_key = os.getenv("ANTHROPIC_API_KEY")
+        return anthropic.Anthropic(api_key=api_key)
+
     def validation(self, model, voice_mode):
         """
         Validate the input parameters for the dialog operation.
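As a usage sketch (not from the commit), the new `initialize_anthropic` helper could be exercised like this; it assumes the class in operate/config.py is named `Config`, and the model id and prompt are illustrative placeholders:

```python
from operate.config import Config  # assumed import path for the class above

config = Config()
client = config.initialize_anthropic()  # falls back to ANTHROPIC_API_KEY from the env

# Hypothetical call; whichever Claude 3 model `-m claude-3` maps to would go here.
response = client.messages.create(
    model="claude-3-opus-20240229",
    max_tokens=1024,
    messages=[{"role": "user", "content": "Describe the next UI action to take."}],
)
print(response.content[0].text)
```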
@@ -101,11 +113,15 @@ def validation(self, model, voice_mode):
             model == "gpt-4"
             or voice_mode
             or model == "gpt-4-with-som"
-            or model == "gpt-4-with-ocr",
+            or model == "gpt-4-with-ocr"
+            or model == "o1-with-ocr",
         )
         self.require_api_key(
             "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision"
         )
+        self.require_api_key(
+            "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3"
+        )

     def require_api_key(self, key_name, key_description, is_required):
         key_exists = bool(os.environ.get(key_name))
@@ -130,6 +146,8 @@ def prompt_and_save_api_key(self, key_name, key_description):
             self.openai_api_key = key_value
         elif key_name == "GOOGLE_API_KEY":
             self.google_api_key = key_value
+        elif key_name == "ANTHROPIC_API_KEY":
+            self.anthropic_api_key = key_value
         self.save_api_key_to_env(key_name, key_value)
         load_dotenv()  # Reload environment variables
         # Update the instance attribute with the new key
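Taken together, the validation changes mean that selecting `-m claude-3` marks the Anthropic key as required. A hedged sketch of the resulting flow, again assuming the class is named `Config`:

```python
from operate.config import Config  # assumed import path

config = Config()
# For claude-3, the validation above reduces to:
#   require_api_key("ANTHROPIC_API_KEY", "Anthropic API key", True)
# which prompts for the key, stores it on the instance, and saves it to `.env`
# only when ANTHROPIC_API_KEY is absent from the environment.
config.validation(model="claude-3", voice_mode=False)
```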
