Add Ragie video-RAG

nikhil-1e9 · nikhil-1e9 · commit b5ad7c905349 · 2025-06-13T01:22:20.000+05:30
diff --git a/mcp-video-rag/.env.example b/mcp-video-rag/.env.example
@@ -0,0 +1 @@
+RAGIE_API_KEY=<YOUR_RAGIE_API_KEY>
diff --git a/mcp-video-rag/.gitignore b/mcp-video-rag/.gitignore
@@ -0,0 +1,71 @@
+# Python-generated files
+__pycache__/
+*.py[cod]
+build/
+dist/
+wheels/
+*.egg-info/
+*.egg
+.eggs/
+.Python
+develop-eggs/
+downloads/
+lib/
+lib64/
+parts/
+sdist/
+var/
+.installed.cfg
+
+# Virtual environments
+.venv
+venv/
+ENV/
+env/
+.env
+
+# IDE specific files
+.idea/
+.vscode/
+*.swp
+*.swo
+.DS_Store
+.project
+.pydevproject
+.settings/
+*.sublime-workspace
+*.sublime-project
+
+# Testing and coverage
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+htmlcov/
+
+# Documentation
+docs/_build/
+site/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Logs and databases
+*.log
+*.sqlite
+*.db
+
+# Environment variables
+.env
+.env.local
+.env.*.local
diff --git a/mcp-video-rag/.python-version b/mcp-video-rag/.python-version
@@ -0,0 +1 @@
+3.12
diff --git a/mcp-video-rag/README.md b/mcp-video-rag/README.md
@@ -0,0 +1,97 @@
+# MCP-powered video-RAG using Ragie
+
+This project demonstrates how to build a video-based Retrieval-Augmented Generation (RAG) system powered by the Model Context Protocol (MCP). It uses [Ragie's](https://www.ragie.ai/) video ingestion and retrieval capabilities to enable semantic search and Q&A over video content, and exposes them as MCP tools that can be used from the Cursor IDE.
+
+We use the following tech stack:
+- Ragie for video ingestion + retrieval (video-RAG)
+- Cursor as the MCP host
+
+---
+## Setup and Installation
+
+Ensure you have Python 3.12 or later installed on your system.
+
+### Install uv
+First, let’s install uv and set up our Python project and environment:
+```bash
+# MacOS/Linux
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Windows
+powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
+```
+
+### Install dependencies
+```bash
+# Move into the project directory (it already contains pyproject.toml and uv.lock)
+cd mcp-video-rag
+
+# Create virtual environment and activate it
+uv venv
+source .venv/bin/activate  # MacOS/Linux
+
+.venv\Scripts\activate     # Windows
+
+# Install dependencies
+uv sync
+```
+
+### Configure environment variables
+
+Copy `.env.example` to `.env` and configure the following environment variables:
+```
+RAGIE_API_KEY=your_ragie_api_key
+```
+
+## Run the project
+
+First, set up your MCP server as follows:
+- Go to Cursor settings
+- Select MCP Tools
+- Add new global MCP server.
+
+In the JSON file, add this:
+```json
+{
+    "mcpServers": {
+        "ragie": {
+            "command": "uv",
+            "args": [
+                "--directory",
+                "/absolute/path/to/project_root",
+                "run",
+                "server.py"
+            ],
+            "env": {
+                "RAGIE_API_KEY": "YOUR_RAGIE_API_KEY"
+            }
+        }
+    }
+}
+```
+
+You should now be able to see the MCP server listed in the MCP settings. In the Cursor MCP settings, make sure to toggle the button that connects the server to the host.
+
+Done! Your server is now up and running. 
+
+The custom MCP server has 3 tools:
+- `ingest_data_tool`: Clears the existing Ragie index, then ingests the videos from the given directory
+- `retrieve_data_tool`: Retrieves relevant data from the video based on user query
+- `show_video_tool`: Creates a short video chunk of the specified segment of the original video (read from `videos/`) and saves it in `video_chunks/`
+
+You can now ingest your videos, retrieve relevant data and query it all using the Cursor Agent.
+The agent can even create the desired chunks from your video just with a single query.
+
+---
+
+## 📬 Stay Updated with Our Newsletter!
+**Get a FREE Data Science eBook** 📖 with 150+ essential lessons in Data Science when you subscribe to our newsletter! Stay in the loop with the latest tutorials, insights, and exclusive resources. [Subscribe now!](https://join.dailydoseofds.com)
+
+[![Daily Dose of Data Science Newsletter](https://github.com/patchy631/ai-engineering/blob/main/resources/join_ddods.png)](https://join.dailydoseofds.com)
+
+---
+
+## Contribution
+
+Contributions are welcome! Please fork the repository and submit a pull request with your improvements. 
diff --git a/mcp-video-rag/main.py b/mcp-video-rag/main.py
@@ -0,0 +1,132 @@
+import os
+import time
+import logging
+from pathlib import Path
+
+from dotenv import load_dotenv
+from ragie import Ragie
+from moviepy import VideoFileClip
+
+# Load variables from a local .env file (if present) into the process environment
+load_dotenv()
+
+# Module-wide logging at INFO level; `logger` is shared by every function below
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# initialize ragie client
+# NOTE(review): os.getenv returns None when RAGIE_API_KEY is unset, so a missing
+# key is not detected here -- confirm how the client behaves with auth=None
+ragie = Ragie(
+    auth=os.getenv('RAGIE_API_KEY'),
+)
+
+# Remove previous docs from index
+def clear_index():
+    while True:
+        try:
+            # List all documents
+            response = ragie.documents.list()
+            documents = response.result.documents
+
+            # Process each document
+            for document in documents:
+                try:
+                    ragie.documents.delete(
+                        document_id=document.id
+                    )
+                    logger.info(f"Deleted document {document.id}")
+                except Exception as e:
+                    logger.error(f"Failed to delete document {document.id}: {str(e)}")
+                    raise
+
+            # Check if there are more documents
+            if not response.result.pagination.next_cursor:
+                logger.warning("No more documents\n")
+                break
+
+        except Exception as e:
+            logger.error(f"Failed to retrieve or process documents: {str(e)}")
+            raise
+
+# Ingest data from a directory into the Ragie index
+def ingest_data(directory):
+    # Get list of files in directory
+    directory_path = Path(directory)
+    files = os.listdir(directory_path)
+    
+    for file in files:
+        try:
+            file_path = directory_path / file
+            # Read file content
+            with open(file_path, mode='rb') as f:
+                file_content = f.read()   
+            # Create document in Ragie
+            response = ragie.documents.create(request={
+                "file": {
+                    "file_name": file,
+                    "content": file_content,
+                },
+                "mode": {
+                    "video": "audio_video",
+                    "audio": True
+                }
+            })
+            # Wait for document to be ready
+            while True:
+                res = ragie.documents.get(document_id=response.id)
+                if res.status == "ready":
+                    break
+        
+                time.sleep(2)
+
+            logger.info(f"Successfully uploaded {file}")
+            
+        except Exception as e:
+            logger.error(f"Failed to process file {file}: {str(e)}")
+            continue
+
+# Retrieve data from the Ragie index
+def retrieve_data(query):
+    try:
+        logger.info(f"Retrieving data for query: {query}")
+        retrieval_response = ragie.retrievals.retrieve(request={
+            "query": query
+        })
+
+        content = [
+            {
+                **chunk.document_metadata,
+                "text": chunk.text,
+                "document_name": chunk.document_name,
+                "start_time": chunk.metadata.get("start_time"),
+                "end_time": chunk.metadata.get("end_time")
+            }
+            for chunk in retrieval_response.scored_chunks
+        ]
+
+        logger.info(f"Successfully retrieved {len(content)} chunks")
+        return content
+
+    except Exception as e:
+        logger.error(f"Failed to retrieve data: {str(e)}")
+        raise
+
+def chunk_video(document_name, start_time, end_time, directory="videos"):
+    # Create output filename
+    output_dir = Path("video_chunks")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    chunk_filename = f"video_chunk_{start_time:.1f}_{end_time:.1f}.mp4"
+    output_path = output_dir / chunk_filename
+
+    with VideoFileClip(directory + "/" + document_name) as video:
+        video_duration = video.duration
+        actual_end_time = min(end_time, video_duration) if end_time is not None else video_duration
+
+        video_chunk = video.subclipped(start_time, actual_end_time)
+        video_chunk.write_videofile(str(output_path))
+
+    return output_path
+
+
+if __name__ == "__main__":
+    clear_index()
+    ingest_data("videos")
+    print(retrieve_data("What is the main topic of the video?"))
diff --git a/mcp-video-rag/pyproject.toml b/mcp-video-rag/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "mcp-video-rag"
+version = "0.1.0"
+description = "MCP-powered video RAG server built on Ragie"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "ipykernel>=6.29.5",
+    "mcp>=1.9.4",
+    "moviepy>=2.2.1",
+    "python-dotenv>=1.1.0",
+    "ragie>=1.9.0",
+]
diff --git a/mcp-video-rag/server.py b/mcp-video-rag/server.py
@@ -0,0 +1,67 @@
+from mcp.server.fastmcp import FastMCP
+from main import clear_index, ingest_data, retrieve_data, chunk_video
+
+# Server object named "ragie"; the functions decorated with @mcp.tool() below are attached to it
+mcp = FastMCP("ragie")
+
+@mcp.tool()
+def ingest_data_tool(directory: str) -> None:
+    """
+    Loads data from a directory into the Ragie index. Wait until the data is fully ingested before continuing.
+
+    Args:
+        directory (str): The directory to load data from.
+
+    Returns:
+        str: A message indicating that the data was loaded successfully.
+    """
+    try:
+        clear_index()
+        ingest_data(directory)
+        return "Data loaded successfully"   
+    except Exception as e:
+        return f"Failed to load data: {str(e)}"
+
+@mcp.tool()
+def retrieve_data_tool(query: str) -> list[dict]:
+    """
+    Retrieves data from the Ragie index based on the query. The data is returned as a list of dictionaries, each containing the following keys:
+    - text: The text of the retrieved chunk
+    - document_name: The name of the document the chunk belongs to
+    - start_time: The start time of the chunk
+    - end_time: The end time of the chunk
+
+    Args:
+        query (str): The query to retrieve data from the Ragie index.
+
+    Returns:
+        list[dict]: The retrieved data.
+    """
+    try:
+        content = retrieve_data(query)
+        return content
+    except Exception as e:
+        return f"Failed to retrieve data: {str(e)}"
+
+@mcp.tool()
+def show_video_tool(document_name: str, start_time: float, end_time: float) -> str:
+    """
+    Creates and saves a video chunk based on the document name, start time, and end time of the chunk.
+    Returns a message indicating that the video chunk was created successfully.
+
+    Args:
+        document_name (str): The name of the document the chunk belongs to
+        start_time (float): The start time of the chunk
+        end_time (float): The end time of the chunk
+
+    Returns:
+        str: A message indicating that the video chunk was created successfully
+    """
+    try:
+        chunk_video(document_name, start_time, end_time)
+        return "Video chunk created successfully"
+    except Exception as e:
+        return f"Failed to create video chunk: {str(e)}"
+
+# Run the server locally
+# Only when this file is executed directly (e.g. `uv run server.py`):
+# start the server on the stdio transport
+if __name__ == "__main__":
+    mcp.run(transport='stdio')
diff --git a/mcp-video-rag/uv.lock b/mcp-video-rag/uv.lock
diff --git a/mcp-video-rag/videos/messi-goals.mp4 b/mcp-video-rag/videos/messi-goals.mp4