@@ -37,6 +37,9 @@ async def get_next_action(model, messages, objective, session_id):
3737 print ("[Self-Operating Computer][get_next_action] model" , model )
3838 if model == "gpt-4" :
3939 return call_gpt_4o (messages ), None
40+ if model == "qwen-vl" :
41+ operation = await call_qwen_vl_with_ocr (messages , objective , model )
42+ return operation , None
4043 if model == "gpt-4-with-som" :
4144 operation = await call_gpt_4o_labeled (messages , objective , model )
4245 return operation , None
@@ -136,6 +139,118 @@ def call_gpt_4o(messages):
136139 return call_gpt_4o (messages )
137140
138141
async def call_qwen_vl_with_ocr(messages, objective, model):
    """
    Get the next action(s) for Self-Operating Computer using Qwen-VL plus OCR.

    Captures a screenshot, sends it with the conversation so far to the Qwen
    vision model, parses the JSON reply into a list of operations and, for
    every "click" operation, resolves the requested text to screen
    coordinates by running EasyOCR on that same screenshot.

    Args:
        messages: Conversation history as a list of chat message dicts.
            Mutated in place: the user vision message is appended before the
            request, and the assistant reply is appended only after every
            operation has been processed successfully.
        objective: The user's objective; forwarded to the system prompt check
            and to the fallback.
        model: Model name; forwarded to the system prompt check and to the
            fallback, and shown in the failure message.

    Returns:
        The list of operation dicts parsed from the model reply, where each
        "click" operation additionally carries "x" and "y". If any step
        raises, the result of `gpt_4_fallback(messages, objective, model)`
        is returned instead.
    """
    # Function-scope import so this block does not depend on the module's
    # import list, which is not visible from here.
    import asyncio

    if config.verbose:
        print("[call_qwen_vl_with_ocr]")

    try:
        # Non-blocking pause: `time.sleep` would stall the event loop while
        # this coroutine waits.
        await asyncio.sleep(1)
        client = config.initialize_qwen()

        confirm_system_prompt(messages, objective, model)

        screenshots_dir = "screenshots"
        # `exist_ok=True` avoids the check-then-create race of
        # `os.path.exists` followed by `os.makedirs`.
        os.makedirs(screenshots_dir, exist_ok=True)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Capture the screen, including the cursor, into the screenshot file
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        # The first turn (only the system message present) gets its own prompt
        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        vision_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    # The screenshot is written to a `.png` file, so declare
                    # PNG rather than JPEG in the data URL.
                    # NOTE(review): assumes capture_screen_with_cursor really
                    # writes PNG data for a `.png` path -- confirm.
                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                },
            ],
        }
        # NOTE(review): this message stays in `messages` even if a later step
        # fails and the fallback runs; only the assistant reply is deferred.
        messages.append(vision_message)

        response = client.chat.completions.create(
            model="qwen2.5-vl-72b-instruct",
            messages=messages,
        )

        content = response.choices[0].message.content

        content = clean_json(content)

        # Keep the raw JSON text; it becomes the assistant message below
        content_str = content

        content = json.loads(content)

        processed_content = []

        # OCR output for the screenshot. Computed at most once, on the first
        # click: this function does not recapture the screenshot inside the
        # loop, so every click would otherwise repeat identical OCR work
        # (including building a new EasyOCR reader each time).
        ocr_result = None

        for operation in content:
            if operation.get("operation") == "click":
                text_to_click = operation.get("text")
                if config.verbose:
                    print(
                        "[call_qwen_vl_with_ocr][click] text_to_click",
                        text_to_click,
                    )

                if ocr_result is None:
                    # Initialize EasyOCR Reader and read the screenshot
                    reader = easyocr.Reader(["en"])
                    ocr_result = reader.readtext(screenshot_filename)

                text_element_index = get_text_element(
                    ocr_result, text_to_click, screenshot_filename
                )
                coordinates = get_text_coordinates(
                    ocr_result, text_element_index, screenshot_filename
                )

                # add `coordinates` to the operation
                operation["x"] = coordinates["x"]
                operation["y"] = coordinates["y"]

                if config.verbose:
                    print(
                        "[call_qwen_vl_with_ocr][click] text_element_index",
                        text_element_index,
                    )
                    print(
                        "[call_qwen_vl_with_ocr][click] coordinates",
                        coordinates,
                    )
                    print(
                        "[call_qwen_vl_with_ocr][click] final operation",
                        operation,
                    )

            # Click operations (now with coordinates) and all other
            # operations are passed through in their original order.
            processed_content.append(operation)

        # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history
        assistant_message = {"role": "assistant", "content": content_str}
        messages.append(assistant_message)

        return processed_content

    except Exception as e:
        # Deliberately broad: any failure (API, JSON parsing, OCR lookup)
        # hands control to the GPT-4 fallback instead of crashing the run.
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
        )
        if config.verbose:
            print("[Self-Operating Computer][Operate] error", e)
            traceback.print_exc()
        return gpt_4_fallback(messages, objective, model)
253+
139254def call_gemini_pro_vision (messages , objective ):
140255 """
141256 Get the next action for Self-Operating Computer using Gemini Pro Vision
@@ -227,7 +342,7 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
227342 messages .append (vision_message )
228343
229344 response = client .chat .completions .create (
230- model = "o1 " ,
345+ model = "gpt-4o " ,
231346 messages = messages ,
232347 )
233348
@@ -340,7 +455,7 @@ async def call_o1_with_ocr(messages, objective, model):
340455 messages .append (vision_message )
341456
342457 response = client .chat .completions .create (
343- model = "gpt-4o " ,
458+ model = "o1 " ,
344459 messages = messages ,
345460 )
346461
0 commit comments