From c0972ddb7c5f26210930356ad1cb3c2711a9e9ca Mon Sep 17 00:00:00 2001 From: Martin Bielik Date: Tue, 12 Aug 2025 19:59:06 +0200 Subject: [PATCH 1/9] Add CLAUDE.md with comprehensive project documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added development setup instructions, architecture overview, and configuration details for the RAG chatbot system including uv dependency management guidance. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- CLAUDE.md | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..d559245c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,95 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Essential Commands + +### Development Setup +```bash +# Install dependencies +uv sync + +# Add new dependencies +uv add package_name + +# Set up environment variables +cp .env.example .env +# Edit .env to add your ANTHROPIC_API_KEY +``` + +### Running the Application +```bash +# Quick start (recommended) +chmod +x run.sh +./run.sh + +# Manual start +cd backend && uv run uvicorn app:app --reload --port 8000 +``` + +### Development Commands +```bash +# Run from backend directory +cd backend && uv run uvicorn app:app --reload --port 8000 + +# Test API endpoints directly +curl http://localhost:8000/api/courses +curl -X POST http://localhost:8000/api/query -H "Content-Type: application/json" -d '{"query":"your question here"}' +``` + +## Architecture Overview + +This is a **RAG (Retrieval-Augmented Generation) System** with a three-layer architecture: + +### Core RAG Pipeline +1. **Document Processing**: Course transcripts → chunked text with metadata +2. **Vector Storage**: ChromaDB stores embeddings for semantic search +3. **AI Generation**: Claude API generates contextual responses using retrieved content +4. **Tool Integration**: AI can dynamically search the knowledge base using tools + +### Key Components + +**RAGSystem (`rag_system.py`)** - Central orchestrator that coordinates: +- Document processing and chunking +- Vector storage operations +- AI response generation with tool access +- Session management for conversation history + +**Tool-Based Search Architecture** - The system uses a tool-based approach where: +- `ToolManager` registers available search tools +- `CourseSearchTool` performs vector searches +- Claude API calls tools dynamically during response generation +- Tools return sources that are tracked and returned to frontend + +**Data Models** (`models.py`): +- `Course`: Contains title, instructor, lessons list +- `CourseChunk`: Text chunks with course/lesson metadata for vector storage +- `Lesson`: Individual lessons with titles and optional links + +### Configuration (`config.py`) +Key settings: +- `CHUNK_SIZE: 800` - Text chunk size for vector storage +- `CHUNK_OVERLAP: 100` - Overlap between chunks +- `MAX_RESULTS: 5` - Vector search result limit +- `MAX_HISTORY: 2` - Conversation memory depth + +### Data Flow +1. Course documents in `docs/` are processed into `CourseChunk` objects +2. Chunks are embedded and stored in ChromaDB (`./chroma_db/`) +3. User queries trigger tool-based searches via Claude API +4. Retrieved chunks provide context for AI response generation +5. Session history maintains conversation continuity + +### Frontend Integration +- FastAPI serves both API endpoints (`/api/*`) and static frontend files +- Frontend communicates via `/api/query` for chat and `/api/courses` for statistics +- CORS configured for development with live reload support + +## Environment Requirements + +Required environment variable: +``` +ANTHROPIC_API_KEY=your_anthropic_api_key_here +``` + +The system expects course documents in `docs/` folder as `.txt`, `.pdf`, or `.docx` files. From c8e39ebc5fb131d7b34e4874d47acb3d049812ef Mon Sep 17 00:00:00 2001 From: Martin Bielik Date: Wed, 13 Aug 2025 00:53:13 +0200 Subject: [PATCH 2/9] Add course outline tool and improve UI styling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add CourseOutlineTool for retrieving complete course structures - Update New Chat button styling to match sidebar design - Implement get_course_outline method in vector store - Update AI system prompt to use both search and outline tools - Register new tool in RAG system 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- backend/ai_generator.py | 35 ++++++++++---- backend/app.py | 16 ++++++- backend/rag_system.py | 4 +- backend/search_tools.py | 103 ++++++++++++++++++++++++++++++++++++++-- backend/vector_store.py | 42 ++++++++++++++++ frontend/index.html | 7 +++ frontend/script.js | 60 ++++++++++++++++++++++- frontend/style.css | 40 ++++++++++++++++ 8 files changed, 287 insertions(+), 20 deletions(-) diff --git a/backend/ai_generator.py b/backend/ai_generator.py index 0363ca90..2bff896c 100644 --- a/backend/ai_generator.py +++ b/backend/ai_generator.py @@ -5,21 +5,36 @@ class AIGenerator: """Handles interactions with Anthropic's Claude API for generating responses""" # Static system prompt to avoid rebuilding on each call - SYSTEM_PROMPT = """ You are an AI assistant specialized in course materials and educational content with access to a comprehensive search tool for course information. + SYSTEM_PROMPT = """ You are an AI assistant specialized in course materials and educational content with access to comprehensive tools for course information. -Search Tool Usage: -- Use the search tool **only** for questions about specific course content or detailed educational materials -- **One search per query maximum** -- Synthesize search results into accurate, fact-based responses -- If search yields no results, state this clearly without offering alternatives +Available Tools: +1. **get_course_outline**: Retrieves complete course structure with title, link, and all lessons + - Use for: course outlines, syllabus queries, lesson lists, course structure questions + - Returns: Course title, course link, and numbered lesson list + +2. **search_course_content**: Searches within course materials for specific content + - Use for: detailed content questions, specific topics, lesson details + - Returns: Relevant content excerpts from course materials + +Tool Usage Guidelines: +- **Course outline/structure questions**: Use get_course_outline tool +- **Specific content questions**: Use search_course_content tool +- **One tool call per query maximum** +- Synthesize tool results into accurate, fact-based responses +- If tools yield no results, state this clearly without offering alternatives Response Protocol: -- **General knowledge questions**: Answer using existing knowledge without searching -- **Course-specific questions**: Search first, then answer +- **General knowledge questions**: Answer using existing knowledge without tools +- **Course-specific questions**: Use appropriate tool first, then answer - **No meta-commentary**: - - Provide direct answers only — no reasoning process, search explanations, or question-type analysis - - Do not mention "based on the search results" + - Provide direct answers only — no reasoning process, tool explanations, or question-type analysis + - Do not mention "based on the search results" or "using the outline tool" +When presenting course outlines: +- Display the course title prominently +- Include the course link if available +- List all lessons with their numbers and titles +- Keep formatting clean and readable All responses must be: 1. **Brief, Concise and focused** - Get to the point quickly diff --git a/backend/app.py b/backend/app.py index 5a69d741..85545697 100644 --- a/backend/app.py +++ b/backend/app.py @@ -6,7 +6,7 @@ from fastapi.staticfiles import StaticFiles from fastapi.middleware.trustedhost import TrustedHostMiddleware from pydantic import BaseModel -from typing import List, Optional +from typing import List, Optional, Dict, Any import os from config import config @@ -43,7 +43,7 @@ class QueryRequest(BaseModel): class QueryResponse(BaseModel): """Response model for course queries""" answer: str - sources: List[str] + sources: List[Dict[str, Any]] # Changed to support sources with links session_id: str class CourseStats(BaseModel): @@ -85,6 +85,18 @@ async def get_course_stats(): except Exception as e: raise HTTPException(status_code=500, detail=str(e)) +@app.post("/api/session/clear") +async def clear_session(session_id: Optional[str] = None): + """Clear a session's conversation history""" + try: + if session_id: + rag_system.session_manager.clear_session(session_id) + # Always create and return a new session + new_session_id = rag_system.session_manager.create_session() + return {"session_id": new_session_id, "status": "success"} + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + @app.on_event("startup") async def startup_event(): """Load initial documents on startup""" diff --git a/backend/rag_system.py b/backend/rag_system.py index 50d848c8..1a79eb74 100644 --- a/backend/rag_system.py +++ b/backend/rag_system.py @@ -4,7 +4,7 @@ from vector_store import VectorStore from ai_generator import AIGenerator from session_manager import SessionManager -from search_tools import ToolManager, CourseSearchTool +from search_tools import ToolManager, CourseSearchTool, CourseOutlineTool from models import Course, Lesson, CourseChunk class RAGSystem: @@ -22,7 +22,9 @@ def __init__(self, config): # Initialize search tools self.tool_manager = ToolManager() self.search_tool = CourseSearchTool(self.vector_store) + self.outline_tool = CourseOutlineTool(self.vector_store) self.tool_manager.register_tool(self.search_tool) + self.tool_manager.register_tool(self.outline_tool) def add_course_document(self, file_path: str) -> Tuple[Course, int]: """ diff --git a/backend/search_tools.py b/backend/search_tools.py index adfe8235..3cc44cef 100644 --- a/backend/search_tools.py +++ b/backend/search_tools.py @@ -88,7 +88,7 @@ def execute(self, query: str, course_name: Optional[str] = None, lesson_number: def _format_results(self, results: SearchResults) -> str: """Format search results with course and lesson context""" formatted = [] - sources = [] # Track sources for the UI + sources = [] # Track sources for the UI with links for doc, meta in zip(results.documents, results.metadata): course_title = meta.get('course_title', 'unknown') @@ -100,11 +100,22 @@ def _format_results(self, results: SearchResults) -> str: header += f" - Lesson {lesson_num}" header += "]" - # Track source for the UI - source = course_title + # Track source for the UI with link + source_text = course_title if lesson_num is not None: - source += f" - Lesson {lesson_num}" - sources.append(source) + source_text += f" - Lesson {lesson_num}" + + # Get lesson link from vector store + lesson_link = None + if lesson_num is not None: + lesson_link = self.store.get_lesson_link(course_title, lesson_num) + + # Create source dictionary with text and optional link + source_data = { + "text": source_text, + "link": lesson_link + } + sources.append(source_data) formatted.append(f"{header}\n{doc}") @@ -113,6 +124,88 @@ def _format_results(self, results: SearchResults) -> str: return "\n\n".join(formatted) +class CourseOutlineTool(Tool): + """Tool for retrieving complete course outlines with lessons""" + + def __init__(self, vector_store: VectorStore): + self.store = vector_store + self.last_sources = [] # Track sources from last search + + def get_tool_definition(self) -> Dict[str, Any]: + """Return Anthropic tool definition for this tool""" + return { + "name": "get_course_outline", + "description": "Get complete course outline including title, link, and all lessons with their numbers and titles", + "input_schema": { + "type": "object", + "properties": { + "course_title": { + "type": "string", + "description": "Course title to get outline for (partial matches work, e.g. 'MCP', 'Computer Use')" + } + }, + "required": ["course_title"] + } + } + + def execute(self, course_title: str) -> str: + """ + Execute the course outline tool. + + Args: + course_title: Course name to get outline for + + Returns: + Formatted course outline or error message + """ + # Get course outline from vector store + outline = self.store.get_course_outline(course_title) + + # Handle not found + if not outline: + return f"No course found matching '{course_title}'. Please check the course name and try again." + + # Format the response + formatted = [] + sources = [] + + # Add course title and link + formatted.append(f"**Course Title:** {outline['title']}") + + if outline.get('course_link'): + formatted.append(f"**Course Link:** {outline['course_link']}") + # Track source with link + sources.append({ + "text": outline['title'], + "link": outline['course_link'] + }) + else: + # Track source without link + sources.append({ + "text": outline['title'], + "link": None + }) + + if outline.get('instructor'): + formatted.append(f"**Instructor:** {outline['instructor']}") + + # Add lessons + formatted.append(f"\n**Lessons ({outline.get('lesson_count', 0)} total):**") + + if outline.get('lessons'): + for lesson in outline['lessons']: + lesson_num = lesson.get('lesson_number', '?') + lesson_title = lesson.get('lesson_title', 'Unknown') + formatted.append(f" Lesson {lesson_num}: {lesson_title}") + else: + formatted.append(" No lessons found") + + # Store sources for retrieval + self.last_sources = sources + + return "\n".join(formatted) + + class ToolManager: """Manages available tools for the AI""" diff --git a/backend/vector_store.py b/backend/vector_store.py index 390abe71..8764b792 100644 --- a/backend/vector_store.py +++ b/backend/vector_store.py @@ -264,4 +264,46 @@ def get_lesson_link(self, course_title: str, lesson_number: int) -> Optional[str return None except Exception as e: print(f"Error getting lesson link: {e}") + + def get_course_outline(self, course_name: str) -> Optional[Dict[str, Any]]: + """ + Get complete course outline including title, link, and all lessons. + + Args: + course_name: Course name to search for (partial matches work) + + Returns: + Dictionary with course outline data or None if not found + """ + import json + + # First resolve the course name to get exact title + course_title = self._resolve_course_name(course_name) + if not course_title: + return None + + try: + # Get course metadata by ID (title is the ID) + results = self.course_catalog.get(ids=[course_title]) + if results and 'metadatas' in results and results['metadatas']: + metadata = results['metadatas'][0] + + # Parse lessons JSON + lessons = [] + lessons_json = metadata.get('lessons_json') + if lessons_json: + lessons = json.loads(lessons_json) + + # Return structured course outline + return { + 'title': metadata.get('title'), + 'course_link': metadata.get('course_link'), + 'instructor': metadata.get('instructor'), + 'lesson_count': metadata.get('lesson_count', 0), + 'lessons': lessons + } + return None + except Exception as e: + print(f"Error getting course outline: {e}") + return None \ No newline at end of file diff --git a/frontend/index.html b/frontend/index.html index f8e25a62..a6f1afe7 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -19,6 +19,13 @@

Course Materials Assistant