From 8d478710df0de10740ad0729397f931038e013a3 Mon Sep 17 00:00:00 2001 From: Branko Radicevic Date: Fri, 5 Sep 2025 14:19:08 +0200 Subject: [PATCH 1/9] Add CourseOutlineTool for course structure queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add CourseOutlineTool class with get_course_outline tool definition - Implement get_course_outline method in VectorStore for semantic course matching - Register CourseOutlineTool alongside existing CourseSearchTool in RAG system - Update AI system prompt with tool selection guidelines for outline vs content queries - Add comprehensive CLAUDE.md documentation for the RAG system architecture 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- CLAUDE.md | 120 ++++++++++++++++++++++++++++++++++++++++ backend/ai_generator.py | 23 ++++---- backend/rag_system.py | 4 +- backend/search_tools.py | 98 ++++++++++++++++++++++++++++++-- backend/vector_store.py | 36 +++++++++++- 5 files changed, 265 insertions(+), 16 deletions(-) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..9c6fc42e --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,120 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is a **Course Materials RAG System** - a full-stack Retrieval-Augmented Generation application that allows users to query course materials and receive AI-powered responses with proper source attribution. 
+ +## Architecture + +The system uses a **modular, three-tier architecture**: + +### Backend (`/backend/`) +- **FastAPI** web framework with CORS and proxy middleware +- **RAG System Core**: Main orchestrator (`rag_system.py`) +- **Vector Storage**: ChromaDB with SentenceTransformers embeddings (`vector_store.py`) +- **AI Generation**: Anthropic Claude integration with tool calling (`ai_generator.py`) +- **Document Processing**: Handles PDF/DOCX/TXT files (`document_processor.py`) +- **Tool-Based Search**: Semantic search with course/lesson filtering (`search_tools.py`) +- **Session Management**: Conversation history tracking (`session_manager.py`) + +### Frontend (`/frontend/`) +- **Vanilla JavaScript** SPA with marked.js for markdown rendering +- **Real-time chat interface** with loading states and source attribution +- **Course statistics sidebar** with collapsible sections +- **Suggested questions** for user guidance + +### Data Models (`/backend/models.py`) +- **Course**: Title, description, lessons, instructor, URL +- **Lesson**: Number, title, content, URL +- **CourseChunk**: Processed text chunks for vector storage + +## Development Commands + +### Quick Start +```bash +chmod +x run.sh +./run.sh +``` + +### Manual Development +```bash +# Install dependencies (first time) +uv sync + +# Start backend server +cd backend && uv run uvicorn app:app --reload --port 8000 + +# Application runs at: +# - Web Interface: http://localhost:8000 +# - API Docs: http://localhost:8000/docs +``` + +### Environment Setup +Create `.env` file in root: +``` +ANTHROPIC_API_KEY=your_key_here +``` + +## Key Technical Patterns + +### RAG Query Flow +1. User query → FastAPI endpoint (`/api/query`) +2. RAG system creates AI prompt with tool definitions +3. Claude uses `search_course_content` tool with semantic matching +4. Vector store searches ChromaDB with course/lesson filtering +5. Search results formatted with source attribution +6. 
Claude synthesizes response using retrieved content +7. Response returned with clickable source links + +### Tool-Based Search Architecture +- **CourseSearchTool**: Handles semantic search with course name fuzzy matching +- **ToolManager**: Registers and executes tools for AI agent +- **Source Tracking**: Last search sources stored for UI display +- **Flexible Filtering**: Supports course title and lesson number filters + +### Vector Storage Strategy +- **SentenceTransformers**: `all-MiniLM-L6-v2` for embeddings +- **ChromaDB Collections**: Separate storage for course metadata vs content chunks +- **Smart Deduplication**: Avoids re-processing existing courses +- **Metadata Enrichment**: Course titles, lesson numbers, URLs stored as metadata + +### Session Management +- **Conversation History**: Tracks user-assistant exchanges per session +- **Context Limits**: Configurable max history (default: 2 messages) +- **Session Creation**: Auto-generated UUIDs for frontend sessions + +## Configuration (`/backend/config.py`) + +Key settings: +- **ANTHROPIC_MODEL**: `claude-sonnet-4-20250514` +- **EMBEDDING_MODEL**: `all-MiniLM-L6-v2` +- **CHUNK_SIZE**: 800 characters +- **CHUNK_OVERLAP**: 100 characters +- **MAX_RESULTS**: 5 search results +- **MAX_HISTORY**: 2 conversation turns + +## Document Processing + +Supports: **PDF, DOCX, TXT** files +- Course documents placed in `/docs/` folder +- Auto-loaded on server startup +- Structured parsing extracts course metadata and lessons +- Text chunking with overlap for semantic search +- Duplicate detection prevents re-processing + +## API Endpoints + +- **POST** `/api/query` - Process user questions +- **GET** `/api/courses` - Get course statistics +- **Static files** served at `/` (frontend) + +## Testing and Development + +Since this is a RAG system with AI components: +- Test with sample course documents in `/docs/` +- Verify ChromaDB storage at `./backend/chroma_db/` +- Monitor API logs for tool usage and search results +- Test 
different question types (general vs course-specific) +- Validate source attribution and clickable links \ No newline at end of file diff --git a/backend/ai_generator.py b/backend/ai_generator.py index 0363ca90..7c67b3a7 100644 --- a/backend/ai_generator.py +++ b/backend/ai_generator.py @@ -5,27 +5,30 @@ class AIGenerator: """Handles interactions with Anthropic's Claude API for generating responses""" # Static system prompt to avoid rebuilding on each call - SYSTEM_PROMPT = """ You are an AI assistant specialized in course materials and educational content with access to a comprehensive search tool for course information. + SYSTEM_PROMPT = """ You are an AI assistant specialized in course materials and educational content with access to two specialized tools for course information. -Search Tool Usage: -- Use the search tool **only** for questions about specific course content or detailed educational materials -- **One search per query maximum** -- Synthesize search results into accurate, fact-based responses -- If search yields no results, state this clearly without offering alternatives +Tool Selection Guidelines: +- **Course outline/structure queries**: Use get_course_outline tool for questions about course structure, lesson lists, course outlines, or "what lessons are in X course" +- **Content-specific queries**: Use search_course_content tool for questions about specific topics, concepts, or detailed content within courses +- **Tool usage limit**: Use only ONE tool per query maximum +- **Tool results**: Synthesize tool results into accurate, fact-based responses +- **No results**: If tools yield no results, state this clearly without offering alternatives Response Protocol: - **General knowledge questions**: Answer using existing knowledge without searching -- **Course-specific questions**: Search first, then answer +- **Course outline questions**: Use get_course_outline tool first, then provide structured response with course title, course link, and complete 
lesson list +- **Course content questions**: Use search_course_content tool first, then answer - **No meta-commentary**: - - Provide direct answers only — no reasoning process, search explanations, or question-type analysis - - Do not mention "based on the search results" - + - Provide direct answers only — no reasoning process, tool explanations, or question-type analysis + - Do not mention "based on the tool results" or similar phrases All responses must be: 1. **Brief, Concise and focused** - Get to the point quickly 2. **Educational** - Maintain instructional value 3. **Clear** - Use accessible language 4. **Example-supported** - Include relevant examples when they aid understanding +5. **Structured** - For outline queries, present course title, course link, and numbered lesson list clearly + Provide only the direct answer to what was asked. """ diff --git a/backend/rag_system.py b/backend/rag_system.py index 50d848c8..1a79eb74 100644 --- a/backend/rag_system.py +++ b/backend/rag_system.py @@ -4,7 +4,7 @@ from vector_store import VectorStore from ai_generator import AIGenerator from session_manager import SessionManager -from search_tools import ToolManager, CourseSearchTool +from search_tools import ToolManager, CourseSearchTool, CourseOutlineTool from models import Course, Lesson, CourseChunk class RAGSystem: @@ -22,7 +22,9 @@ def __init__(self, config): # Initialize search tools self.tool_manager = ToolManager() self.search_tool = CourseSearchTool(self.vector_store) + self.outline_tool = CourseOutlineTool(self.vector_store) self.tool_manager.register_tool(self.search_tool) + self.tool_manager.register_tool(self.outline_tool) def add_course_document(self, file_path: str) -> Tuple[Course, int]: """ diff --git a/backend/search_tools.py b/backend/search_tools.py index adfe8235..bf4ae129 100644 --- a/backend/search_tools.py +++ b/backend/search_tools.py @@ -100,11 +100,25 @@ def _format_results(self, results: SearchResults) -> str: header += f" - Lesson 
{lesson_num}" header += "]" - # Track source for the UI - source = course_title + # Track source for the UI with link if available + source_text = course_title if lesson_num is not None: - source += f" - Lesson {lesson_num}" - sources.append(source) + source_text += f" - Lesson {lesson_num}" + # Try to get lesson link + lesson_link = self.store.get_lesson_link(course_title, lesson_num) + if lesson_link: + # Create source object with link + sources.append({"text": source_text, "url": lesson_link}) + else: + # Fallback to plain text + sources.append(source_text) + else: + # For course-level content, try to get course link + course_link = self.store.get_course_link(course_title) + if course_link: + sources.append({"text": source_text, "url": course_link}) + else: + sources.append(source_text) formatted.append(f"{header}\n{doc}") @@ -113,6 +127,82 @@ def _format_results(self, results: SearchResults) -> str: return "\n\n".join(formatted) + +class CourseOutlineTool(Tool): + """Tool for getting complete course outlines with lesson structure""" + + def __init__(self, vector_store: VectorStore): + self.store = vector_store + self.last_sources = [] # Track sources from last search + + def get_tool_definition(self) -> Dict[str, Any]: + """Return Anthropic tool definition for this tool""" + return { + "name": "get_course_outline", + "description": "Get the complete outline/structure of a course including all lessons with numbers and titles", + "input_schema": { + "type": "object", + "properties": { + "course_name": { + "type": "string", + "description": "Course title or partial course name (e.g. 'MCP', 'Introduction', 'RAG')" + } + }, + "required": ["course_name"] + } + } + + def execute(self, course_name: str) -> str: + """ + Execute the course outline tool with given course name. 
+ + Args: + course_name: Course title to get outline for + + Returns: + Formatted course outline with lessons or error message + """ + + # Get course outline from vector store + outline = self.store.get_course_outline(course_name) + + # Handle course not found + if not outline: + return f"No course found matching '{course_name}'. Please check the course name or try a partial match." + + # Format the outline response + return self._format_outline(outline) + + def _format_outline(self, outline: Dict[str, Any]) -> str: + """Format course outline for AI response""" + course_title = outline.get('course_title', 'Unknown Course') + course_link = outline.get('course_link') + lessons = outline.get('lessons', []) + + # Build formatted response + formatted = [f"Course: {course_title}"] + + if lessons: + formatted.append(f"\nLessons ({len(lessons)} total):") + for lesson in lessons: + lesson_num = lesson.get('lesson_number', '?') + lesson_title = lesson.get('lesson_title', 'Untitled Lesson') + formatted.append(f" {lesson_num}. {lesson_title}") + else: + formatted.append("\nNo lesson structure available for this course.") + + # Track sources for the UI + sources = [] + if course_link: + sources.append({"text": course_title, "url": course_link}) + else: + sources.append(course_title) + + self.last_sources = sources + + return "\n".join(formatted) + + class ToolManager: """Manages available tools for the AI""" diff --git a/backend/vector_store.py b/backend/vector_store.py index 390abe71..8bb0d089 100644 --- a/backend/vector_store.py +++ b/backend/vector_store.py @@ -264,4 +264,38 @@ def get_lesson_link(self, course_title: str, lesson_number: int) -> Optional[str return None except Exception as e: print(f"Error getting lesson link: {e}") - \ No newline at end of file + return None + + def get_course_outline(self, course_title: str) -> Optional[Dict[str, Any]]: + """ + Get complete course outline including title, link, and all lessons. 
+ + Args: + course_title: The course title to get outline for + + Returns: + Dict with course_title, course_link, and lessons list, or None if not found + """ + import json + try: + # First resolve the course name to handle fuzzy matching + resolved_title = self._resolve_course_name(course_title) + if not resolved_title: + return None + + # Get course by resolved title (title is the ID) + results = self.course_catalog.get(ids=[resolved_title]) + if results and 'metadatas' in results and results['metadatas']: + metadata = results['metadatas'][0] + lessons_json = metadata.get('lessons_json', '[]') + lessons = json.loads(lessons_json) + + return { + "course_title": metadata.get('title'), + "course_link": metadata.get('course_link'), + "lessons": lessons # Already parsed list of lesson objects + } + return None + except Exception as e: + print(f"Error getting course outline: {e}") + return None \ No newline at end of file From 89e56159795dbc19f8be1c1d4970dfbd6f4b4a33 Mon Sep 17 00:00:00 2001 From: Branko Radicevic Date: Fri, 5 Sep 2025 14:19:37 +0200 Subject: [PATCH 2/9] INitial changes; --- backend/app.py | 4 +- frontend/index.html | 7 + frontend/script.js | 331 ++++++++++++++++++++++++------------------ frontend/style.css | 42 ++++++ query-flow-diagram.md | 110 ++++++++++++++ 5 files changed, 351 insertions(+), 143 deletions(-) create mode 100644 query-flow-diagram.md diff --git a/backend/app.py b/backend/app.py index 5a69d741..115e054e 100644 --- a/backend/app.py +++ b/backend/app.py @@ -6,7 +6,7 @@ from fastapi.staticfiles import StaticFiles from fastapi.middleware.trustedhost import TrustedHostMiddleware from pydantic import BaseModel -from typing import List, Optional +from typing import List, Optional, Union, Dict import os from config import config @@ -43,7 +43,7 @@ class QueryRequest(BaseModel): class QueryResponse(BaseModel): """Response model for course queries""" answer: str - sources: List[str] + sources: List[Union[str, Dict[str, str]]] # Support both 
string and {"text": "...", "url": "..."} formats session_id: str class CourseStats(BaseModel): diff --git a/frontend/index.html b/frontend/index.html index f8e25a62..7feacc00 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -19,6 +19,13 @@

Course Materials Assistant