@@ -4,6 +4,7 @@
 import base64
 import traceback
 import io
+import easyocr


 from PIL import Image
@@ -19,6 +20,7 @@
     get_user_prompt,
     get_system_prompt,
 )
+from operate.utils.ocr import get_text_element, get_text_coordinates


 from operate.utils.label import (
@@ -48,6 +50,9 @@ async def get_next_action(model, messages, objective, session_id):
     if model == "gpt-4-with-som":
         operation = await call_gpt_4_vision_preview_labeled(messages, objective)
         return operation, None
+    if model == "gpt-4-with-ocr":
+        operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
+        return operation, None
     elif model == "agent-1":
         return "coming soon"
     elif model == "gemini-pro-vision":
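
With this branch in place, callers opt into the OCR pipeline purely by model name. A hypothetical call site (the loop itself lives elsewhere in the repo; names here are illustrative):

```python
# Hypothetical sketch of an operating loop invoking the new branch;
# "gpt-4-with-ocr" routes to call_gpt_4_vision_preview_ocr above.
async def operating_loop(messages, objective, session_id):
    operations, session_id = await get_next_action(
        "gpt-4-with-ocr", messages, objective, session_id
    )
    return operations, session_id
```
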
@@ -58,7 +63,7 @@ async def get_next_action(model, messages, objective, session_id):

def call_gpt_4_vision_preview(messages):
    if VERBOSE:
-        print("[Self Operating Computer][get_next_action][call_gpt_4_v]")
+        print("[call_gpt_4_v]")
    time.sleep(1)
    client = config.initialize_openai()
    try:
@@ -80,7 +85,7 @@ def call_gpt_4_vision_preview(messages):

         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt",
+                "[call_gpt_4_v] user_prompt",
                 user_prompt,
             )

@@ -115,7 +120,7 @@ def call_gpt_4_vision_preview(messages):
         assistant_message = {"role": "assistant", "content": content}
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_v] content",
+                "[call_gpt_4_v] content",
                 content,
             )
         content = json.loads(content)
@@ -157,25 +162,23 @@ def call_gemini_pro_vision(messages, objective):
         capture_screen_with_cursor(screenshot_filename)
         # sleep for a second
         time.sleep(1)
-        prompt = get_system_prompt(objective)
+        prompt = get_system_prompt("gemini-pro-vision", objective)

         model = config.initialize_google()
         if VERBOSE:
-            print("[Self Operating Computer][call_gemini_pro_vision] model", model)
+            print("[call_gemini_pro_vision] model", model)

         response = model.generate_content([prompt, Image.open(screenshot_filename)])

         content = response.text[1:]
         if VERBOSE:
-            print(
-                "[Self Operating Computer][call_gemini_pro_vision] response", response
-            )
-            print("[Self Operating Computer][call_gemini_pro_vision] content", content)
+            print("[call_gemini_pro_vision] response", response)
+            print("[call_gemini_pro_vision] content", content)

         content = json.loads(content)
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gemini_pro_vision] content",
+                "[get_next_action][call_gemini_pro_vision] content",
                 content,
             )

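
Note the signature change above: `get_system_prompt` now takes the model name as its first argument, so each model family can receive a tailored system prompt. The real implementation and prompt texts live in the repo's prompts module and are not part of this diff; a minimal sketch of the dispatch shape, with invented placeholder templates, might look like:

```python
# Minimal sketch, assuming prompt templates keyed by model name.
# The actual templates in the repository are much longer.
SYSTEM_PROMPT_STANDARD = "You operate a computer. Objective: {objective}"
SYSTEM_PROMPT_OCR = (
    "You operate a computer. For clicks, return the visible text of the "
    "element to click. Objective: {objective}"
)


def get_system_prompt(model, objective):
    # Choose a model-specific template; fall back to the standard one.
    if model == "gpt-4-with-ocr":
        template = SYSTEM_PROMPT_OCR
    else:
        template = SYSTEM_PROMPT_STANDARD
    return template.format(objective=objective)
```
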
@@ -189,6 +192,132 @@ def call_gemini_pro_vision(messages, objective):
         return call_gpt_4_vision_preview(messages)


+async def call_gpt_4_vision_preview_ocr(messages, objective, model):
+    if VERBOSE:
+        print("[call_gpt_4_vision_preview_ocr]")
+
+    # Construct the path to the file within the package
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if VERBOSE:
+            print(
+                "[call_gpt_4_vision_preview_ocr] user_prompt",
+                user_prompt,
+            )
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-4-vision-preview",
+            messages=messages,
+            presence_penalty=1,
+            frequency_penalty=1,
+            temperature=0.7,
+            max_tokens=1000,
+        )
+
+        content = response.choices[0].message.content
+
+        if content.startswith("```json"):
+            content = content[len("```json") :]  # Remove starting ```json
+        if content.endswith("```"):
+            content = content[: -len("```")]  # Remove ending ```
+
+        content_str = content
+
+        content = json.loads(content)
+        if VERBOSE:
+            print("[call_gpt_4_vision_preview_ocr] content", content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if VERBOSE:
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Add `coordinates` to the operation
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if VERBOSE:
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Append the assistant message only after processing succeeds, so a
+        # failure in the `processed_content` step doesn't corrupt the history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
+            e,
+        )
+        traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
 async def call_gpt_4_vision_preview_labeled(messages, objective):
     time.sleep(1)
     client = config.initialize_openai()
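
The `get_text_element` and `get_text_coordinates` helpers imported at the top are defined in `operate/utils/ocr.py` and are not shown in this diff. EasyOCR's `readtext` returns a list of `(bounding_box, text, confidence)` tuples, where the bounding box is four `[x, y]` corner points in pixels. A minimal, assumed implementation that matches how the helpers are called here (returning normalized coordinates, like the labeled path) might look like:

```python
from PIL import Image


# Sketch only: the real helpers live in operate/utils/ocr.py and may differ.
def get_text_element(result, search_text, image_path):
    # Return the index of the first OCR hit whose text contains the target.
    for index, (_bbox, text, _confidence) in enumerate(result):
        if search_text in text:
            return index
    raise Exception("The text element was not found in the image")


def get_text_coordinates(result, index, image_path):
    # Center the click on the matched bounding box, normalized to 0..1
    # so it can be scaled to the actual screen size downstream.
    bounding_box = result[index][0]  # four [x, y] corner points in pixels
    xs = [point[0] for point in bounding_box]
    ys = [point[1] for point in bounding_box]
    center_x = (min(xs) + max(xs)) / 2
    center_y = (min(ys) + max(ys)) / 2
    width, height = Image.open(image_path).size
    return {"x": round(center_x / width, 3), "y": round(center_y / height, 3)}
```

One design note: `easyocr.Reader(["en"])` loads the detection and recognition models, so constructing it inside the click loop trades startup cost on every click for implementation simplicity.
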
@@ -217,7 +346,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):

         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt",
+                "[call_gpt_4_vision_preview_labeled] user_prompt",
                 user_prompt,
             )

@@ -254,7 +383,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
         assistant_message = {"role": "assistant", "content": content}
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content",
+                "[call_gpt_4_vision_preview_labeled] content",
                 content,
             )
         messages.append(assistant_message)
@@ -268,14 +397,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
                 label = operation.get("label")
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] label",
                         label,
                     )

                 coordinates = get_label_coordinates(label, label_coordinates)
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates",
                         coordinates,
                     )
                 image = Image.open(
@@ -287,7 +416,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
                 )
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent",
                         click_position_percent,
                     )
                 if not click_position_percent:
@@ -302,7 +431,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
                 operation["y"] = y_percent
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation",
                         operation,
                     )
                 processed_content.append(operation)
@@ -311,7 +440,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):

         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content",
+                "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content",
                 processed_content,
             )
         return processed_content
@@ -321,6 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
321450 f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_RED } [Error] Something went wrong. Trying another method { ANSI_RESET } " ,
322451 e ,
323452 )
453+ traceback .print_exc ()
324454 return call_gpt_4_vision_preview (messages )
325455
326456
@@ -336,3 +466,39 @@ def get_last_assistant_message(messages):
         else:
             return messages[index]
     return None  # Return None if no assistant message is found
+
+
+def gpt_4_fallback(messages, objective, model):
+    if VERBOSE:
+        print("[gpt_4_fallback]")
+    system_prompt = get_system_prompt("gpt-4-vision-preview", objective)
+    new_system_message = {"role": "system", "content": system_prompt}
+    # Remove and replace the first message in `messages` with `new_system_message`
+    messages[0] = new_system_message
+    if VERBOSE:
+        print("[gpt_4_fallback] new messages", messages)
+
+    if VERBOSE:
+        print("[gpt_4_fallback][updated]")
+        print("[gpt_4_fallback][updated] len(messages)", len(messages))
+
+    return call_gpt_4_vision_preview(messages)
+
+
+def confirm_system_prompt(messages, objective, model):
+    """
+    On `Exception` we fall back to `call_gpt_4_vision_preview`, so this function
+    reassigns the system prompt in case a previous attempt failed.
+    """
+    if VERBOSE:
+        print("[confirm_system_prompt]")
+
+    system_prompt = get_system_prompt(model, objective)
+    new_system_message = {"role": "system", "content": system_prompt}
+    # Remove and replace the first message in `messages` with `new_system_message`
+    messages[0] = new_system_message
+
+    if VERBOSE:
+        print("[confirm_system_prompt][updated]")
+        print("[confirm_system_prompt][updated] len(messages)", len(messages))
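
For reference, the OCR path expects the model to return a JSON array of operations in which click operations name on-screen text rather than coordinates; `call_gpt_4_vision_preview_ocr` then resolves that text into `x`/`y` via EasyOCR. A hypothetical round trip (field names taken from the code above, values invented for illustration):

```python
# What the model might return for a click, before OCR processing:
model_output = [{"operation": "click", "text": "Submit"}]

# What call_gpt_4_vision_preview_ocr produces after resolving the text
# against the screenshot (coordinates here are illustrative):
processed = [{"operation": "click", "text": "Submit", "x": 0.42, "y": 0.87}]
```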