Skip to content

Commit 5ee9bdf

Browse files
authored
Merge pull request #163 from michaelhhogue/evaluator-update
Add `-m` argument to `evaluate.py`
2 parents 7cc9ab7 + f781cfe commit 5ee9bdf

File tree

1 file changed

+26
-6
lines changed

1 file changed

+26
-6
lines changed

evaluate.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@
55
import base64
66
import json
77
import openai
8+
import argparse
89

910
from dotenv import load_dotenv
1011

1112
# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
1213
TEST_CASES = {
13-
"Go to Github.com": "The Github home page is visible.",
14+
"Go to Github.com": "A Github page is visible.",
1415
"Go to Youtube.com and play a video": "The YouTube video player is visible.",
1516
}
1617

@@ -110,10 +111,10 @@ def evaluate_final_screenshot(guideline):
110111
return parse_eval_content(eval_content)
111112

112113

113-
def run_test_case(objective, guideline):
114-
'''Returns True if the result of the test with the given prompt meets the given guideline.'''
115-
# Run `operate` with the test case prompt
116-
subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
114+
def run_test_case(objective, guideline, model):
115+
'''Returns True if the result of the test with the given prompt meets the given guideline for the given model.'''
116+
# Run `operate` with the model to evaluate and the test case prompt
117+
subprocess.run(['operate', '-m', model, '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
117118

118119
try:
119120
result = evaluate_final_screenshot(guideline)
@@ -124,17 +125,36 @@ def run_test_case(objective, guideline):
124125
return result
125126

126127

128+
def get_test_model():
    """Parse command-line arguments and return the name of the model to evaluate.

    Returns:
        str: the value of the ``-m``/``--model`` flag, or the default
        ``"gpt-4-with-ocr"`` when the flag is not supplied.
    """
    # NOTE: description previously said "Run the self-operating-computer ..."
    # (copied from the operate CLI); this script evaluates a model, matching
    # the --model help text below.
    parser = argparse.ArgumentParser(
        description="Evaluate the self-operating-computer with a specified model."
    )

    parser.add_argument(
        "-m",
        "--model",
        help="Specify the model to evaluate.",
        required=False,
        default="gpt-4-with-ocr",
    )

    # parse_args() reads sys.argv; only the model name is needed by callers.
    return parser.parse_args().model
142+
143+
127144
def main():
128145
load_dotenv()
129146
openai.api_key = os.getenv("OPENAI_API_KEY")
130147

148+
model = get_test_model()
149+
150+
print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
131151
print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")
132152

133153
passed = 0; failed = 0
134154
for objective, guideline in TEST_CASES.items():
135155
print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
136156

137-
result = run_test_case(objective, guideline)
157+
result = run_test_case(objective, guideline, model)
138158
if result:
139159
print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
140160
passed += 1

0 commit comments

Comments
 (0)