55import base64
66import json
77import openai
8+ import argparse
89
910from dotenv import load_dotenv
1011
# Maps each objective passed to `operate` to the guideline given to GPT-4v
# for deciding whether that test case passed.
TEST_CASES = {
    # Any Github page counts as success, not only the home page.
    "Go to Github.com": "A Github page is visible.",
    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
}
1617
@@ -110,10 +111,10 @@ def evaluate_final_screenshot(guideline):
110111 return parse_eval_content (eval_content )
111112
112113
113- def run_test_case (objective , guideline ):
114- '''Returns True if the result of the test with the given prompt meets the given guideline.'''
115- # Run `operate` with the test case prompt
116- subprocess .run (['operate' , '--prompt' , f'"{ objective } "' ], stdout = subprocess .DEVNULL )
114+ def run_test_case (objective , guideline , model ):
115+ '''Returns True if the result of the test with the given prompt meets the given guideline for the given model .'''
116+ # Run `operate` with the model to evaluate and the test case prompt
117+ subprocess .run (['operate' , '-m' , model , '- -prompt' , f'"{ objective } "' ], stdout = subprocess .DEVNULL )
117118
118119 try :
119120 result = evaluate_final_screenshot (guideline )
@@ -124,17 +125,36 @@ def run_test_case(objective, guideline):
124125 return result
125126
126127
def get_test_model(argv=None):
    '''Returns the name of the model to evaluate, taken from the command line.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default), argparse reads the process arguments, which matches
            the behaviour of calling this function with no arguments.

    Returns:
        The value supplied via `-m` / `--model`, or "gpt-4-with-ocr" when the
        flag is not given.
    '''
    parser = argparse.ArgumentParser(
        description="Run the self-operating-computer with a specified model."
    )

    parser.add_argument(
        "-m",
        "--model",
        help="Specify the model to evaluate.",
        required=False,
        default="gpt-4-with-ocr",
    )

    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    return parser.parse_args(argv).model
142+
143+
127144def main ():
128145 load_dotenv ()
129146 openai .api_key = os .getenv ("OPENAI_API_KEY" )
130147
148+ model = get_test_model ()
149+
150+ print (f"{ ANSI_BLUE } [EVALUATING MODEL `{ model } `]{ ANSI_RESET } " )
131151 print (f"{ ANSI_BRIGHT_MAGENTA } [STARTING EVALUATION]{ ANSI_RESET } " )
132152
133153 passed = 0 ; failed = 0
134154 for objective , guideline in TEST_CASES .items ():
135155 print (f"{ ANSI_BLUE } [EVALUATING]{ ANSI_RESET } '{ objective } '" )
136156
137- result = run_test_case (objective , guideline )
157+ result = run_test_case (objective , guideline , model )
138158 if result :
139159 print (f"{ ANSI_GREEN } [PASSED]{ ANSI_RESET } '{ objective } '" )
140160 passed += 1
0 commit comments