
Commit 193c606

Add updated support for llava through ollama python library
1 parent 10bb8bf commit 193c606

2 files changed: 136 additions, 7 deletions


operate/models/apis.py

Lines changed: 81 additions & 1 deletion
@@ -5,7 +5,7 @@
 import traceback
 import io
 import easyocr
-
+import ollama

 from PIL import Image
 from ultralytics import YOLO
@@ -53,6 +53,9 @@ async def get_next_action(model, messages, objective, session_id):
         return "coming soon"
     elif model == "gemini-pro-vision":
         return call_gemini_pro_vision(messages, objective), None
+    elif model == "ollama-llava":
+        operation = call_ollama_llava(messages), None
+        return operation

     raise ModelNotRecognizedException(model)

@@ -464,6 +467,83 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
         return call_gpt_4_vision_preview(messages)


+def call_ollama_llava(messages):
+    if VERBOSE:
+        print("[call_ollama_llava]")
+    time.sleep(1)
+    try:
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if VERBOSE:
+            print(
+                "[call_ollama_llava] user_prompt",
+                user_prompt,
+            )
+
+        vision_message = {
+            "role": "user",
+            "content": user_prompt,
+            "images": [screenshot_filename],
+        }
+        messages.append(vision_message)
+
+        response = ollama.chat(
+            model="llava",
+            messages=messages,
+        )
+
+        # Important: Remove the image path from the message history.
+        # Ollama will attempt to load each image reference and will
+        # eventually timeout.
+        messages[-1]["images"] = None
+
+        content = response["message"]["content"].strip()
+
+        if content.startswith("```json"):
+            content = content[len("```json") :]  # Remove starting ```json
+        if content.endswith("```"):
+            content = content[: -len("```")]  # Remove ending ```
+
+        assistant_message = {"role": "assistant", "content": content}
+        if VERBOSE:
+            print(
+                "[call_ollama_llava] content",
+                content,
+            )
+        content = json.loads(content)
+
+        messages.append(assistant_message)
+
+        return content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}",
+            e,
+        )
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
+            content,
+        )
+        if VERBOSE:
+            traceback.print_exc()
+        return call_ollama_llava(messages)
+
+
 def get_last_assistant_message(messages):
     """
     Retrieve the last message from the assistant in the messages array.
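
For context, here is a minimal, self-contained sketch of the chat pattern the new call_ollama_llava function relies on. It assumes the ollama Python package is installed, a local Ollama server is running, and `ollama pull llava` has already been executed; "screenshot.png" is a hypothetical existing image file and the prompt string is a placeholder, not the project's actual prompt.

import json
import ollama

# Build a single-turn vision message; Ollama loads the image from the
# given file path when the request is sent.
messages = [
    {
        "role": "user",
        "content": "Describe the next UI action as a JSON object.",
        "images": ["screenshot.png"],
    }
]

response = ollama.chat(model="llava", messages=messages)

# Drop the image reference afterwards so later turns do not force the
# server to re-load the file (the same reason the diff sets
# messages[-1]["images"] = None).
messages[-1]["images"] = None

content = response["message"]["content"].strip()

# LLaVA often wraps JSON replies in a ```json fence; strip it before parsing.
if content.startswith("```json"):
    content = content[len("```json"):]
if content.endswith("```"):
    content = content[:-len("```")]

action = json.loads(content)  # raises json.JSONDecodeError on malformed output
print(action)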

requirements.txt

Lines changed: 55 additions & 6 deletions
@@ -1,53 +1,102 @@
+aiohttp==3.9.1
+aiosignal==1.3.1
 annotated-types==0.6.0
 anyio==3.7.1
+attrs==23.2.0
+cachetools==5.3.2
 certifi==2023.7.22
 charset-normalizer==3.3.2
 colorama==0.4.6
 contourpy==1.2.0
 cycler==0.12.1
 distro==1.8.0
+easyocr==1.7.1
 EasyProcess==1.1
 entrypoint2==1.1
 exceptiongroup==1.1.3
+filelock==3.13.1
 fonttools==4.44.0
+frozenlist==1.4.1
+fsspec==2024.2.0
+google-ai-generativelanguage==0.4.0
+google-api-core==2.16.2
+google-auth==2.27.0
+google-generativeai==0.3.0
+googleapis-common-protos==1.62.0
+grpcio==1.60.1
+grpcio-status==1.60.1
 h11==0.14.0
 httpcore==1.0.2
-httpx==0.25.1
+httpx==0.25.2
 idna==3.4
+imageio==2.33.1
 importlib-resources==6.1.1
+Jinja2==3.1.3
 kiwisolver==1.4.5
+lazy_loader==0.3
+MarkupSafe==2.1.5
 matplotlib==3.8.1
 MouseInfo==0.1.3
+mpmath==1.3.0
 mss==9.0.1
+multidict==6.0.5
+networkx==3.2.1
+ninja==1.11.1.1
 numpy==1.26.1
+ollama==0.1.6
 openai==1.2.3
+opencv-python==4.9.0.80
+opencv-python-headless==4.9.0.80
 packaging==23.2
+pandas==2.2.0
 Pillow==10.1.0
 prompt-toolkit==3.0.39
+proto-plus==1.23.0
+protobuf==4.25.2
+psutil==5.9.8
+py-cpuinfo==9.0.0
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
 PyAutoGUI==0.9.54
+pyclipper==1.3.0.post5
 pydantic==2.4.2
 pydantic_core==2.10.1
 PyGetWindow==0.0.9
 PyMsgBox==1.0.9
+pyobjc-core==10.1
+pyobjc-framework-Cocoa==10.1
+pyobjc-framework-Quartz==10.1
 pyparsing==3.1.1
 pyperclip==1.8.2
 PyRect==0.2.0
 pyscreenshot==3.1
 PyScreeze==0.1.29
-python3-xlib==0.15
+python-bidi==0.4.2
 python-dateutil==2.8.2
 python-dotenv==1.0.0
+python3-xlib==0.15
 pytweening==1.0.7
+pytz==2024.1
+PyYAML==6.0.1
 requests==2.31.0
+rsa==4.9
 rubicon-objc==0.4.7
+scikit-image==0.22.0
+scipy==1.12.0
+seaborn==0.13.2
+shapely==2.0.2
 six==1.16.0
 sniffio==1.3.0
+sympy==1.12
+thop==0.1.1.post2209072238
+tifffile==2024.1.30
+torch==2.2.0
+torchvision==0.17.0
 tqdm==4.66.1
 typing_extensions==4.8.0
+tzdata==2023.4
+ultralytics==8.0.227
 urllib3==2.0.7
 wcwidth==0.2.9
+yarl==1.9.4
 zipp==3.17.0
-google-generativeai==0.3.0
-aiohttp==3.9.1
-ultralytics==8.0.227
-easyocr==1.7.1
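
Since the commit pins ollama==0.1.6, a quick way to sanity-check the new dependency (assuming a local Ollama server is running on its default port) is a short listing call:

import ollama

# Lists the models available to the local Ollama server; "llava" should
# appear in the output before the "ollama-llava" mode can work.
print(ollama.list())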
