
Commit 6e1c7c7

Author: 何涛
fix(apis): fix gpt-4o & o1 model use in apis.py && add qwen-vl apis
Parent commit: 14b579f

2 files changed (+143, -2 lines)

operate/config.py

Lines changed: 26 additions & 0 deletions
@@ -43,6 +43,9 @@ def __init__(self):
         self.anthropic_api_key = (
             None  # instance variables are backups in case saving to a `.env` fails
         )
+        self.qwen_api_key = (
+            None  # instance variables are backups in case saving to a `.env` fails
+        )
 
     def initialize_openai(self):
         if self.verbose:
@@ -66,6 +69,29 @@ def initialize_openai(self):
         client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
         return client
 
+    def initialize_qwen(self):
+        if self.verbose:
+            print("[Config][initialize_qwen]")
+
+        if self.qwen_api_key:
+            if self.verbose:
+                print("[Config][initialize_qwen] using cached qwen_api_key")
+            api_key = self.qwen_api_key
+        else:
+            if self.verbose:
+                print(
+                    "[Config][initialize_qwen] no cached qwen_api_key, try to get from env."
+                )
+            api_key = os.getenv("QWEN_API_KEY")
+
+        client = OpenAI(
+            api_key=api_key,
+            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
+        )
+        client.api_key = api_key
+        client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+        return client
+
     def initialize_google(self):
         if self.google_api_key:
             if self.verbose:
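
For orientation, a minimal usage sketch of the new helper (not part of the commit): it assumes the config class in operate/config.py is named Config and that QWEN_API_KEY is set in the environment or in `.env`.

# Illustrative sketch only; the Config class name and the environment setup
# are assumptions based on the surrounding code, not shown in this diff.
import os

from operate.config import Config

os.environ["QWEN_API_KEY"] = "your-dashscope-key"  # placeholder for illustration

config = Config()
client = config.initialize_qwen()  # OpenAI-compatible client pointed at DashScope

# The client is then used like the other OpenAI clients in apis.py, e.g.
# client.chat.completions.create(model="qwen2.5-vl-72b-instruct", messages=[...])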

operate/models/apis.py

Lines changed: 117 additions & 2 deletions
@@ -37,6 +37,9 @@ async def get_next_action(model, messages, objective, session_id):
         print("[Self-Operating Computer][get_next_action] model", model)
     if model == "gpt-4":
         return call_gpt_4o(messages), None
+    if model == "qwen-vl":
+        operation = await call_qwen_vl_with_ocr(messages, objective, model)
+        return operation, None
     if model == "gpt-4-with-som":
         operation = await call_gpt_4o_labeled(messages, objective, model)
         return operation, None
@@ -136,6 +139,118 @@ def call_gpt_4o(messages):
         return call_gpt_4o(messages)
 
 
+async def call_qwen_vl_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_qwen_vl_with_ocr]")
+
+    # Construct the path to the file within the package
+    try:
+        time.sleep(1)
+        client = config.initialize_qwen()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="qwen2.5-vl-72b-instruct",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        # used later for the messages
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_qwen_vl_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # add `coordinates` to `content`
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_qwen_vl_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_qwen_vl_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_qwen_vl_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
 def call_gemini_pro_vision(messages, objective):
     """
     Get the next action for Self-Operating Computer using Gemini Pro Vision
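
To illustrate the data this new path handles (hypothetical values, not captured model output): the Qwen response is expected to be a JSON list of operations, and click operations that only name on-screen text get x/y coordinates filled in via EasyOCR.

# Hypothetical example of `content` after json.loads(); the keys follow the
# handling in call_qwen_vl_with_ocr above, the values are made up.
content = [
    {"operation": "click", "text": "Submit"},
    {"operation": "write", "content": "hello world"},
]

# After the OCR loop, the click entry carries coordinates resolved from the
# screenshot (illustrative values):
processed_content = [
    {"operation": "click", "text": "Submit", "x": 0.42, "y": 0.63},
    {"operation": "write", "content": "hello world"},
]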
@@ -227,7 +342,7 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
         messages.append(vision_message)
 
         response = client.chat.completions.create(
-            model="o1",
+            model="gpt-4o",
             messages=messages,
         )
 

@@ -340,7 +455,7 @@ async def call_o1_with_ocr(messages, objective, model):
         messages.append(vision_message)
 
         response = client.chat.completions.create(
-            model="gpt-4o",
+            model="o1",
             messages=messages,
         )
 
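
Finally, a minimal sketch of how the new branch is reached through get_next_action (the messages, objective, and session_id plumbing is assumed from the existing operate loop; the system prompt content is a placeholder):

# Sketch only; message construction is simplified for illustration.
import asyncio

from operate.models.apis import get_next_action

messages = [{"role": "system", "content": "...system prompt..."}]

operations, session_id = asyncio.run(
    get_next_action(
        model="qwen-vl",
        messages=messages,
        objective="Open a text editor",
        session_id=None,
    )
)
# For the qwen-vl branch this returns (processed_content, None), where
# processed_content is the list produced by call_qwen_vl_with_ocr.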
