Commit fd0b8b3

Merge pull request #145 from OthersideAI/add-ocr

Add ocr

2 parents d0a972f + b1cc4fa

9 files changed: +453 additions, -57 deletions


README.md

Lines changed: 7 additions & 0 deletions

@@ -91,6 +91,13 @@ operate -m gemini-pro-vision

 **Enter your Google AI Studio API key when the terminal prompts you for it.** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need to [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working; if anyone knows a simpler way, please make a PR.

+### Optical Character Recognition Mode `-m gpt-4-with-ocr`
+The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable elements and their coordinates: GPT-4 can decide to `click` an element by its text, and the code then references the hash map to get the coordinates for that element.
+
+Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use OCR mode, simply run:
+
+Either `operate` or `operate -m gpt-4-with-ocr` will work.
+
 ### Set-of-Mark Prompting `-m gpt-4-with-som`
 The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models.
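Note: to make the README's click-by-text description concrete, here is a minimal sketch of the idea, assuming EasyOCR's readtext() output of (bounding_box, text, confidence) tuples. The helper name build_text_map is illustrative only, not the project's actual API:

    import easyocr

    def build_text_map(screenshot_path):
        # Map each OCR-detected text string to the center of its bounding box.
        reader = easyocr.Reader(["en"])
        text_map = {}
        for bbox, text, confidence in reader.readtext(screenshot_path):
            center_x = sum(point[0] for point in bbox) / 4  # bbox has 4 corner points
            center_y = sum(point[1] for point in bbox) / 4
            text_map[text] = (center_x, center_y)
        return text_map

    # GPT-4 answers with e.g. {"operation": "click", "text": "Submit"}; the
    # framework then resolves the coordinates before executing the click:
    # x, y = build_text_map("screenshots/screenshot.png")["Submit"]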

operate/config.py

Lines changed: 4 additions & 6 deletions

@@ -49,9 +49,6 @@ def initialize_openai(self):
         )
         api_key = os.getenv("OPENAI_API_KEY")

-        if self.verbose:
-            print("[Config][initialize_openai] api_key", api_key)
-
         client = OpenAI(
             api_key=api_key,
         )

@@ -65,9 +62,10 @@ def initialize_google(self):
             print("[Config][initialize_google] using cached google_api_key")
            api_key = self.google_api_key
         else:
-            print(
-                "[Config][initialize_google] no cached google_api_key, try to get from env."
-            )
+            if self.verbose:
+                print(
+                    "[Config][initialize_google] no cached google_api_key, try to get from env."
+                )
         api_key = os.getenv("GOOGLE_API_KEY")
         genai.configure(api_key=api_key, transport="rest")
         model = genai.GenerativeModel("gemini-pro-vision")

operate/main.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ def main_entry():
         "--model",
         help="Specify the model to use",
         required=False,
-        default="gpt-4",
+        default="gpt-4-with-ocr",
     )

     # Add a voice flag
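Note: a self-contained sketch of how the new default behaves at the CLI layer. The argument names match the diff and the README's `-m` flag; the surrounding argparse setup is assumed, not shown in this commit:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--model",
        help="Specify the model to use",
        required=False,
        default="gpt-4-with-ocr",
    )
    args = parser.parse_args([])           # plain `operate`, no flags
    assert args.model == "gpt-4-with-ocr"  # OCR mode is now the default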

operate/models/apis.py

Lines changed: 183 additions & 17 deletions

@@ -4,6 +4,7 @@
 import base64
 import traceback
 import io
+import easyocr


 from PIL import Image
@@ -19,6 +20,7 @@
     get_user_prompt,
     get_system_prompt,
 )
+from operate.utils.ocr import get_text_element, get_text_coordinates


 from operate.utils.label import (
@@ -48,6 +50,9 @@ async def get_next_action(model, messages, objective, session_id):
     if model == "gpt-4-with-som":
         operation = await call_gpt_4_vision_preview_labeled(messages, objective)
         return operation, None
+    if model == "gpt-4-with-ocr":
+        operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
+        return operation, None
     elif model == "agent-1":
         return "coming soon"
     elif model == "gemini-pro-vision":
@@ -58,7 +63,7 @@ async def get_next_action(model, messages, objective, session_id):

 def call_gpt_4_vision_preview(messages):
     if VERBOSE:
-        print("[Self Operating Computer][get_next_action][call_gpt_4_v]")
+        print("[call_gpt_4_v]")
     time.sleep(1)
     client = config.initialize_openai()
     try:
@@ -80,7 +85,7 @@ def call_gpt_4_vision_preview(messages):

         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt",
+                "[call_gpt_4_v] user_prompt",
                 user_prompt,
             )

@@ -115,7 +120,7 @@ def call_gpt_4_vision_preview(messages):
         assistant_message = {"role": "assistant", "content": content}
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_v] content",
+                "[call_gpt_4_v] content",
                 content,
             )
         content = json.loads(content)
@@ -157,25 +162,23 @@ def call_gemini_pro_vision(messages, objective):
         capture_screen_with_cursor(screenshot_filename)
         # sleep for a second
         time.sleep(1)
-        prompt = get_system_prompt(objective)
+        prompt = get_system_prompt("gemini-pro-vision", objective)

         model = config.initialize_google()
         if VERBOSE:
-            print("[Self Operating Computer][call_gemini_pro_vision] model", model)
+            print("[call_gemini_pro_vision] model", model)

         response = model.generate_content([prompt, Image.open(screenshot_filename)])

         content = response.text[1:]
         if VERBOSE:
-            print(
-                "[Self Operating Computer][call_gemini_pro_vision] response", response
-            )
-            print("[Self Operating Computer][call_gemini_pro_vision] content", content)
+            print("[call_gemini_pro_vision] response", response)
+            print("[call_gemini_pro_vision] content", content)

         content = json.loads(content)
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gemini_pro_vision] content",
+                "[get_next_action][call_gemini_pro_vision] content",
                 content,
             )

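Note: get_system_prompt now takes the model name as its first argument, so each mode gets a mode-specific system prompt. The call sites visible in this commit:

    prompt = get_system_prompt("gemini-pro-vision", objective)             # call_gemini_pro_vision
    system_prompt = get_system_prompt("gpt-4-vision-preview", objective)   # gpt_4_fallback
    system_prompt = get_system_prompt(model, objective)                    # confirm_system_prompt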
@@ -189,6 +192,132 @@ def call_gemini_pro_vision(messages, objective):
         return call_gpt_4_vision_preview(messages)


+async def call_gpt_4_vision_preview_ocr(messages, objective, model):
+    if VERBOSE:
+        print("[call_gpt_4_vision_preview_ocr]")
+
+    # Capture a screenshot, query the model, then resolve click targets via OCR
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if VERBOSE:
+            print(
+                "[call_gpt_4_vision_preview_ocr] user_prompt",
+                user_prompt,
+            )
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-4-vision-preview",
+            messages=messages,
+            presence_penalty=1,
+            frequency_penalty=1,
+            temperature=0.7,
+            max_tokens=1000,
+        )
+
+        content = response.choices[0].message.content
+
+        if content.startswith("```json"):
+            content = content[len("```json") :]  # Remove starting ```json
+        if content.endswith("```"):
+            content = content[: -len("```")]  # Remove ending ```
+
+        content_str = content
+
+        content = json.loads(content)
+        if VERBOSE:
+            print("[call_gpt_4_vision_preview_ocr] content", content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if VERBOSE:
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # add `coordinates` to the `operation`
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if VERBOSE:
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_4_vision_preview_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
+            e,
+        )
+        traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
 async def call_gpt_4_vision_preview_labeled(messages, objective):
     time.sleep(1)
     client = config.initialize_openai()
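Note: get_text_element and get_text_coordinates are imported from operate/utils/ocr.py, one of the nine changed files whose diff is not shown on this page. A minimal sketch of what such helpers could look like, assuming EasyOCR's (bounding_box, text, confidence) result tuples; the real definitions may differ:

    from PIL import Image

    def get_text_element(result, search_text, screenshot_filename):
        # Return the index of the first OCR result whose text matches search_text.
        for index, (bbox, text, confidence) in enumerate(result):
            if search_text in text:
                return index
        raise Exception(f"The text '{search_text}' was not found on the screen")

    def get_text_coordinates(result, index, screenshot_filename):
        # Center of the matched bounding box, normalized by the screenshot size
        # (passing the filename suggests coordinates are made resolution-independent).
        bbox = result[index][0]  # four corner points: [[x1,y1], ..., [x4,y4]]
        width, height = Image.open(screenshot_filename).size
        x = sum(point[0] for point in bbox) / 4 / width
        y = sum(point[1] for point in bbox) / 4 / height
        return {"x": x, "y": y}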
@@ -217,7 +346,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):

         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt",
+                "[call_gpt_4_vision_preview_labeled] user_prompt",
                 user_prompt,
             )

@@ -254,7 +383,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
         assistant_message = {"role": "assistant", "content": content}
         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content",
+                "[call_gpt_4_vision_preview_labeled] content",
                 content,
             )
         messages.append(assistant_message)
@@ -268,14 +397,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
                 label = operation.get("label")
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] label",
                         label,
                     )

                 coordinates = get_label_coordinates(label, label_coordinates)
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates",
                         coordinates,
                     )
                 image = Image.open(
@@ -287,7 +416,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
                 )
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent",
                         click_position_percent,
                     )
                 if not click_position_percent:
@@ -302,7 +431,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
                 operation["y"] = y_percent
                 if VERBOSE:
                     print(
-                        "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation",
+                        "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation",
                         operation,
                     )
                 processed_content.append(operation)
@@ -311,7 +440,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):

         if VERBOSE:
             print(
-                "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content",
+                "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content",
                 processed_content,
             )
         return processed_content
@@ -321,6 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
             f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
             e,
         )
+        traceback.print_exc()
         return call_gpt_4_vision_preview(messages)

@@ -336,3 +466,39 @@ def get_last_assistant_message(messages):
             else:
                 return messages[index]
     return None  # Return None if no assistant message is found
+
+
+def gpt_4_fallback(messages, objective, model):
+    if VERBOSE:
+        print("[gpt_4_fallback]")
+    system_prompt = get_system_prompt("gpt-4-vision-preview", objective)
+    new_system_message = {"role": "system", "content": system_prompt}
+    # remove and replace the first message in `messages` with `new_system_message`
+
+    messages[0] = new_system_message
+    if VERBOSE:
+        print("[gpt_4_fallback] new messages", messages)
+
+    if VERBOSE:
+        print("[gpt_4_fallback][updated]")
+        print("[gpt_4_fallback][updated] len(messages)", len(messages))
+
+    return call_gpt_4_vision_preview(messages)
+
+
+def confirm_system_prompt(messages, objective, model):
+    """
+    On `Exception` we fall back to `call_gpt_4_vision_preview`, so this function reassigns the system prompt in case of a previous failure.
+    """
+    if VERBOSE:
+        print("[confirm_system_prompt]")
+
+    system_prompt = get_system_prompt(model, objective)
+    new_system_message = {"role": "system", "content": system_prompt}
+    # remove and replace the first message in `messages` with `new_system_message`
+
+    messages[0] = new_system_message
+
+    if VERBOSE:
+        print("[confirm_system_prompt][updated]")
+        print("[confirm_system_prompt][updated] len(messages)", len(messages))
