
Commit 87cc2f3

Local III EasyOCR
1 parent e05265d commit 87cc2f3

10 files changed: +463 -64 lines changed


interpreter/core/computer/utils/computer_vision.py

Lines changed: 6 additions & 3 deletions
```diff
@@ -9,14 +9,17 @@
 except:
     cv2 = None  # Fixes colab error
 PIL = lazy_import("PIL")
-# pytesseract is very very optional, we don't even recommend it unless the api has failed
 pytesseract = lazy_import("pytesseract")
 
 
 def pytesseract_get_text(img):
-    import pytesseract
+    # List the attributes of pytesseract, which will trigger lazy loading of it
+    attributes = dir(pytesseract)
+    if pytesseract == None:
+        raise ImportError("The pytesseract module could not be imported.")
 
-    return pytesseract.image_to_string(img)
+    result = pytesseract.image_to_string(img)
+    return result
 
 
 def pytesseract_get_text_bounding_boxes(img):
```
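The seemingly unused `attributes = dir(pytesseract)` line is what forces the deferred import to run: `lazy_import` hands back a lazily-loaded module (or `None` when the package is missing), and the first attribute access triggers the real import. A minimal sketch of the pattern, assuming a helper built on `importlib.util.LazyLoader` (the project's own `lazy_import` may differ in details):

```python
import importlib.util
import sys


def lazy_import(name):
    # Return a module whose body only runs on first attribute access,
    # or None if the package is not installed.
    try:
        spec = importlib.util.find_spec(name)
    except ModuleNotFoundError:
        return None
    if spec is None:
        return None
    loader = importlib.util.LazyLoader(spec.loader)
    spec.loader = loader
    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module
    loader.exec_module(module)  # registers the module; execution is deferred
    return module


pytesseract = lazy_import("pytesseract")
attributes = dir(pytesseract)  # touching any attribute runs the real import now
```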

interpreter/core/computer/vision/vision.py

Lines changed: 41 additions & 26 deletions
```diff
@@ -17,35 +17,44 @@ def __init__(self, computer):
         self.computer = computer
         self.model = None  # Will load upon first use
         self.tokenizer = None  # Will load upon first use
+        self.easyocr = None
 
-    def load(self):
-        print("\nLoading Moondream (vision)...\n")
+    def load(self, load_moondream=True, load_easyocr=True):
+        # print("Loading vision models (Moondream, EasyOCR)...\n")
 
         with contextlib.redirect_stdout(
             open(os.devnull, "w")
         ), contextlib.redirect_stderr(open(os.devnull, "w")):
-            import transformers  # Wait until we use it. Transformers can't be lazy loaded for some reason!
-
-            os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-            if self.computer.debug:
-                print(
-                    "Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
+            if self.easyocr == None and load_easyocr:
+                import easyocr
+
+                self.easyocr = easyocr.Reader(
+                    ["en"]
+                )  # this needs to run only once to load the model into memory
+
+            if self.model == None and load_moondream:
+                import transformers  # Wait until we use it. Transformers can't be lazy loaded for some reason!
+
+                os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+                if self.computer.debug:
+                    print(
+                        "Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
+                    )
+                    print(
+                        "Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`."
+                    )
+                model_id = "vikhyatk/moondream2"
+                revision = "2024-04-02"
+                print("loading model")
+
+                self.model = transformers.AutoModelForCausalLM.from_pretrained(
+                    model_id, trust_remote_code=True, revision=revision
                 )
-                print(
-                    "Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`."
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    model_id, revision=revision
                 )
-            model_id = "vikhyatk/moondream2"
-            revision = "2024-04-02"
-            print("loading model")
-
-            self.model = transformers.AutoModelForCausalLM.from_pretrained(
-                model_id, trust_remote_code=True, revision=revision
-            )
-            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                model_id, revision=revision
-            )
-        return True
+            return True
 
     def ocr(
         self,
```
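With the split signature, callers can warm only the model they need, and both branches are guarded (`self.easyocr == None`, `self.model == None`), so repeated calls are cheap. A usage sketch (`interpreter` is Open Interpreter's top-level entry point; the call pattern follows the diff above):

```python
from interpreter import interpreter

# Warm only EasyOCR ahead of an OCR-heavy task, skipping the Moondream download.
interpreter.computer.vision.load(load_moondream=False)

# Later, a plain load() adds Moondream; the cached EasyOCR reader is reused.
interpreter.computer.vision.load()
```
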
```diff
@@ -99,7 +108,11 @@ def ocr(
             path = temp_file_path
 
         try:
-            return pytesseract_get_text(path)
+            if not self.easyocr:
+                self.load(load_moondream=False)
+            result = self.easyocr.readtext(path)
+            text = " ".join([item[1] for item in result])
+            return text.strip()
         except ImportError:
             print(
                 "\nTo use local vision, run `pip install 'open-interpreter[local]'`.\n"
```
```diff
@@ -108,7 +121,7 @@ def ocr(
 
     def query(
         self,
-        query="Describe this image.",
+        query="Describe this image. Also tell me what text is in the image, if any.",
         base_64=None,
         path=None,
         lmc=None,
```

```diff
@@ -119,7 +132,7 @@ def query(
         """
 
         if self.model == None and self.tokenizer == None:
-            success = self.load()
+            success = self.load(load_easyocr=False)
             if not success:
                 return ""
```
```diff
@@ -149,6 +162,8 @@ def query(
 
         with contextlib.redirect_stdout(open(os.devnull, "w")):
             enc_image = self.model.encode_image(img)
-            answer = self.model.answer_question(enc_image, query, self.tokenizer)
+            answer = self.model.answer_question(
+                enc_image, query, self.tokenizer, max_length=400
+            )
 
         return answer
```
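Putting the Moondream pieces together, `query()` now reduces to the calls below (a sketch assembled from the diff; `max_length=400` caps the generated answer so image descriptions stay short enough to inline into the conversation):

```python
import transformers
from PIL import Image

model_id = "vikhyatk/moondream2"
revision = "2024-04-02"  # pinned revision, as in load() above

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=revision
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, revision=revision)

img = Image.open("screenshot.png")  # illustrative path
enc_image = model.encode_image(img)
answer = model.answer_question(
    enc_image,
    "Describe this image. Also tell me what text is in the image, if any.",
    tokenizer,
    max_length=400,
)
print(answer)
```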

interpreter/core/llm/llm.py

Lines changed: 19 additions & 2 deletions
```diff
@@ -48,6 +48,7 @@ def __init__(self, interpreter):
         self.api_base = None
         self.api_key = None
         self.api_version = None
+        self._is_loaded = False
 
         # Budget manager powered by LiteLLM
         self.max_budget = None
```

```diff
@@ -143,7 +144,7 @@ def run(self, messages):
                 img_msg["content"] = (
                     precursor
                     + image_description
-                    + "\n---\nThe image contains the following text exactly, which may or may not be relevant (if it's not relevant, ignore this): '''\n"
+                    + "\n---\nI've OCR'd the image, this is the result (this may or may not be relevant. If it's not relevant, ignore this): '''\n"
                     + ocr
                     + "\n'''"
                     + postcursor
```

```diff
@@ -273,7 +274,20 @@ def run(self, messages):
         else:
             yield from run_text_llm(self, params)
 
+    # If you change model, set _is_loaded to false
+    @property
+    def model(self):
+        return self._model
+
+    @model.setter
+    def model(self, value):
+        self._model = value
+        self._is_loaded = False
+
     def load(self):
+        if self._is_loaded:
+            return
+
         if self.model.startswith("ollama/"):
             # WOAH we should also hit up ollama and set max_tokens and context_window based on the LLM. I think they let u do that
 
```

```diff
@@ -302,7 +316,7 @@ def load(self):
             subprocess.run(["ollama", "pull", model_name], check=True)
 
         # Send a ping, which will actually load the model
-        print(f"\nLoading {model_name}...\n")
+        # print(f"\nLoading {model_name}...\n")
 
         old_max_tokens = self.max_tokens
         self.max_tokens = 1
```

```diff
@@ -313,6 +327,9 @@ def load(self):
 
         # Validate LLM should be moved here!!
 
+        self._is_loaded = True
+        return
+
 
 def fixed_litellm_completions(**params):
     """
```

interpreter/core/utils/system_debug_info.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -86,9 +86,9 @@ def interpreter_info(interpreter):
     for message in interpreter.messages:
         message = message.copy()
         try:
-            if len(message["content"]) > 600:
+            if len(message["content"]) > 5000:
                 message["content"] = (
-                    message["content"][:300] + "..." + message["content"][-300:]
+                    message["content"][:800] + "..." + message["content"][-800:]
                 )
         except Exception as e:
             print(str(e), "for message:", message)
```
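The new bounds keep far more context in debug reports while still capping very long messages. The rule, extracted as a standalone helper (names are illustrative):

```python
def truncate_middle(content, limit=5000, keep=800):
    # Keep the head and tail of an over-long message and elide the middle,
    # mirroring the bounds used in interpreter_info() above.
    if len(content) > limit:
        return content[:keep] + "..." + content[-keep:]
    return content


assert truncate_middle("a" * 6000) == "a" * 800 + "..." + "a" * 800
assert truncate_middle("short") == "short"
```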

interpreter/terminal_interface/magic_commands.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -82,7 +82,10 @@ def handle_verbose(self, arguments=None):
         print("\n\nCurrent messages:\n")
         for message in self.messages:
             message = message.copy()
-            if message["type"] == "image" and message.get("format") != "path":
+            if message["type"] == "image" and message.get("format") not in [
+                "path",
+                "description",
+            ]:
                 message["content"] = (
                     message["content"][:30] + "..." + message["content"][-30:]
                 )
@@ -102,7 +105,10 @@ def handle_debug(self, arguments=None):
         print("\n\nCurrent messages:\n")
         for message in self.messages:
             message = message.copy()
-            if message["type"] == "image" and message.get("format") != "path":
+            if message["type"] == "image" and message.get("format") not in [
+                "path",
+                "description",
+            ]:
                 message["content"] = (
                     message["content"][:30] + "..." + message["content"][-30:]
                 )
```
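The check now also leaves `description`-format image messages intact: like paths, they are short human-readable strings rather than base64 payloads. Illustrative message shapes (assumed for the example, not taken from the repo):

```python
messages = [
    # base64 payloads are huge, so they get elided for display:
    {"type": "image", "format": "base64", "content": "iVBORw0KGgo" * 40},
    # paths and descriptions are already short and readable:
    {"type": "image", "format": "path", "content": "/tmp/screenshot.png"},
    {"type": "image", "format": "description", "content": "A login form with two fields."},
]

for message in messages:
    message = message.copy()
    if message["type"] == "image" and message.get("format") not in ["path", "description"]:
        message["content"] = message["content"][:30] + "..." + message["content"][-30:]
    print(message)
```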

interpreter/terminal_interface/profiles/defaults/codestral-os.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -113,13 +113,14 @@
 interpreter.offline = True
 interpreter.os = True
 
+# Vision setup
+interpreter.computer.vision.load()
+
 # Final message
 interpreter.display_message(
     "**Warning:** In this mode, Open Interpreter will not require approval before performing actions. Be ready to close your terminal."
 )
 interpreter.display_message(
     "\n**Note:** Codestral is a relatively weak model, so OS mode is highly experimental. Try using a more powerful model for OS mode with `interpreter --os`."
 )
-interpreter.display_message(
-    "> Model set to `codestral`, experimental OS control enabled"
-)
+interpreter.display_message("> Experimental OS control enabled.")
```

interpreter/terminal_interface/profiles/defaults/codestral-vision.py

Lines changed: 22 additions & 2 deletions
```diff
@@ -17,7 +17,7 @@
 User: The code you ran produced no output. Was this expected, or are we finished?
 Assistant: No further action is required; the provided snippet opens Chrome.
 
-You have access to ONE special function called `computer.vision.query(query="Describe this image.", path="image.jpg")`. This will ask a vision AI model the query, regarding the image at path. For example:
+You have access to TWO special functions called `computer.vision.query(query="Describe this image.", path="image.jpg")` (asks a vision AI model the query, regarding the image at path) and `computer.vision.ocr(path="image.jpg")` (returns text in the image at path). For example:
 
 User: Rename the images on my desktop to something more descriptive.
 Assistant: Viewing and renaming images.
```
````diff
@@ -53,6 +53,25 @@
 ```
 User: The code you ran produced no output. Was this expected, or are we finished?
 Assistant: We are finished.
+User: What text is in the image 'user.png' on my desktop?
+Assistant: ```python
+import os
+import string
+from pathlib import Path
+
+# Get the user's home directory in a cross-platform way
+home_dir = Path.home()
+
+# Define the path to the image
+image_path = home_dir / 'Desktop' / 'user.png'
+
+# Get the text in the image
+text_in_image = computer.vision.ocr(path=str(image_path))
+
+text_in_image
+```
+User: The code you ran produced this output: "29294 is the username". What does this mean?
+Assistant: The output means that the `user.png` image on your desktop contains the text "29294 is the username".
 
 NEVER use placeholders. Always specify exact paths, and use cross-platform ways of determining the desktop, documents, etc. folders.
````
```diff
@@ -65,15 +84,16 @@
 
 # LLM settings
 interpreter.llm.model = "ollama/codestral"
-interpreter.llm.load()  # Loads Ollama models
 interpreter.llm.supports_functions = False
 interpreter.llm.execution_instructions = False
 interpreter.llm.max_tokens = 1000
 interpreter.llm.context_window = 7000
+interpreter.llm.load()  # Loads Ollama models
 
 # Computer settings
 interpreter.computer.import_computer_api = True
 interpreter.computer.system_message = ""  # The default will explain how to use the full Computer API, and append this to the system message. For local models, we want more control, so we set this to "". The system message will ONLY be what's above ^
+interpreter.computer.vision.load()  # Load vision models
 
 # Misc settings
 interpreter.auto_run = False
```
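With both profiles now calling the loaders eagerly, the first user request no longer stalls on model downloads. A rough end-to-end sketch of the two vision entry points the system message advertises (image path illustrative; exact return values depend on the models):

```python
from interpreter import interpreter

interpreter.computer.vision.load()  # warm EasyOCR and Moondream up front

# OCR: the text EasyOCR finds in the image.
print(interpreter.computer.vision.ocr(path="screenshot.png"))

# Query: ask Moondream about the image.
print(interpreter.computer.vision.query(path="screenshot.png"))
```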
