
Commit 193c606

Add updated support for llava through ollama python library
1 parent 10bb8bf commit 193c606

2 files changed: 136 additions, 7 deletions


operate/models/apis.py

Lines changed: 81 additions & 1 deletion
@@ -5,7 +5,7 @@
 import traceback
 import io
 import easyocr
-
+import ollama

 from PIL import Image
 from ultralytics import YOLO
@@ -53,6 +53,9 @@ async def get_next_action(model, messages, objective, session_id):
         return "coming soon"
     elif model == "gemini-pro-vision":
         return call_gemini_pro_vision(messages, objective), None
+    elif model == "ollama-llava":
+        operation = call_ollama_llava(messages), None
+        return operation

     raise ModelNotRecognizedException(model)

@@ -464,6 +467,83 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
         return call_gpt_4_vision_preview(messages)


+def call_ollama_llava(messages):
+    if VERBOSE:
+        print("[call_ollama_llava]")
+    time.sleep(1)
+    try:
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if VERBOSE:
+            print(
+                "[call_ollama_llava] user_prompt",
+                user_prompt,
+            )
+
+        vision_message = {
+            "role": "user",
+            "content": user_prompt,
+            "images": [screenshot_filename],
+        }
+        messages.append(vision_message)
+
+        response = ollama.chat(
+            model="llava",
+            messages=messages,
+        )
+
+        # Important: Remove the image path from the message history.
+        # Ollama will attempt to load each image reference and will
+        # eventually timeout.
+        messages[-1]["images"] = None
+
+        content = response["message"]["content"].strip()
+
+        if content.startswith("```json"):
+            content = content[len("```json") :]  # Remove starting ```json
+        if content.endswith("```"):
+            content = content[: -len("```")]  # Remove ending ```
+
+        assistant_message = {"role": "assistant", "content": content}
+        if VERBOSE:
+            print(
+                "[call_ollama_llava] content",
+                content,
+            )
+        content = json.loads(content)
+
+        messages.append(assistant_message)
+
+        return content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}",
+            e,
+        )
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
+            content,
+        )
+        if VERBOSE:
+            traceback.print_exc()
+        return call_ollama_llava(messages)
+
+
 def get_last_assistant_message(messages):
     """
     Retrieve the last message from the assistant in the messages array.
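
For context, here is a minimal, self-contained sketch of the chat pattern the new call_ollama_llava function relies on. It assumes the ollama Python package is installed, a local Ollama server is running, and `ollama pull llava` has already been executed; "screenshot.png" is a hypothetical existing image file and the prompt string is a placeholder, not the project's actual prompt.

import json
import ollama

# Build a single-turn vision message; Ollama loads the image from the
# given file path when the request is sent.
messages = [
    {
        "role": "user",
        "content": "Describe the next UI action as a JSON object.",
        "images": ["screenshot.png"],
    }
]

response = ollama.chat(model="llava", messages=messages)

# Drop the image reference afterwards so later turns do not force the
# server to re-load the file (the same reason the diff sets
# messages[-1]["images"] = None).
messages[-1]["images"] = None

content = response["message"]["content"].strip()

# LLaVA often wraps JSON replies in a ```json fence; strip it before parsing.
if content.startswith("```json"):
    content = content[len("```json"):]
if content.endswith("```"):
    content = content[:-len("```")]

action = json.loads(content)  # raises json.JSONDecodeError on malformed output
print(action)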

requirements.txt

Lines changed: 55 additions & 6 deletions
@@ -1,53 +1,102 @@
+aiohttp==3.9.1
+aiosignal==1.3.1
 annotated-types==0.6.0
 anyio==3.7.1
+attrs==23.2.0
+cachetools==5.3.2
 certifi==2023.7.22
 charset-normalizer==3.3.2
 colorama==0.4.6
 contourpy==1.2.0
 cycler==0.12.1
 distro==1.8.0
+easyocr==1.7.1
 EasyProcess==1.1
 entrypoint2==1.1
 exceptiongroup==1.1.3
+filelock==3.13.1
 fonttools==4.44.0
+frozenlist==1.4.1
+fsspec==2024.2.0
+google-ai-generativelanguage==0.4.0
+google-api-core==2.16.2
+google-auth==2.27.0
+google-generativeai==0.3.0
+googleapis-common-protos==1.62.0
+grpcio==1.60.1
+grpcio-status==1.60.1
 h11==0.14.0
 httpcore==1.0.2
-httpx==0.25.1
+httpx==0.25.2
 idna==3.4
+imageio==2.33.1
 importlib-resources==6.1.1
+Jinja2==3.1.3
 kiwisolver==1.4.5
+lazy_loader==0.3
+MarkupSafe==2.1.5
 matplotlib==3.8.1
 MouseInfo==0.1.3
+mpmath==1.3.0
 mss==9.0.1
+multidict==6.0.5
+networkx==3.2.1
+ninja==1.11.1.1
 numpy==1.26.1
+ollama==0.1.6
 openai==1.2.3
+opencv-python==4.9.0.80
+opencv-python-headless==4.9.0.80
 packaging==23.2
+pandas==2.2.0
 Pillow==10.1.0
 prompt-toolkit==3.0.39
+proto-plus==1.23.0
+protobuf==4.25.2
+psutil==5.9.8
+py-cpuinfo==9.0.0
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
 PyAutoGUI==0.9.54
+pyclipper==1.3.0.post5
 pydantic==2.4.2
 pydantic_core==2.10.1
 PyGetWindow==0.0.9
 PyMsgBox==1.0.9
+pyobjc-core==10.1
+pyobjc-framework-Cocoa==10.1
+pyobjc-framework-Quartz==10.1
 pyparsing==3.1.1
 pyperclip==1.8.2
 PyRect==0.2.0
 pyscreenshot==3.1
 PyScreeze==0.1.29
-python3-xlib==0.15
+python-bidi==0.4.2
 python-dateutil==2.8.2
 python-dotenv==1.0.0
+python3-xlib==0.15
 pytweening==1.0.7
+pytz==2024.1
+PyYAML==6.0.1
 requests==2.31.0
+rsa==4.9
 rubicon-objc==0.4.7
+scikit-image==0.22.0
+scipy==1.12.0
+seaborn==0.13.2
+shapely==2.0.2
 six==1.16.0
 sniffio==1.3.0
+sympy==1.12
+thop==0.1.1.post2209072238
+tifffile==2024.1.30
+torch==2.2.0
+torchvision==0.17.0
 tqdm==4.66.1
 typing_extensions==4.8.0
+tzdata==2023.4
+ultralytics==8.0.227
 urllib3==2.0.7
 wcwidth==0.2.9
+yarl==1.9.4
 zipp==3.17.0
-google-generativeai==0.3.0
-aiohttp==3.9.1
-ultralytics==8.0.227
-easyocr==1.7.1
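
Since the commit pins ollama==0.1.6, a quick way to sanity-check the new dependency (assuming a local Ollama server is running on its default port) is a short listing call:

import ollama

# Lists the models available to the local Ollama server; "llava" should
# appear in the output before the "ollama-llava" mode can work.
print(ollama.list())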
