diff --git a/.claude/commands/implement-feature.md b/.claude/commands/implement-feature.md new file mode 100644 index 00000000..4ea56188 --- /dev/null +++ b/.claude/commands/implement-feature.md @@ -0,0 +1,7 @@ +You will be implementing a new feature in this codebase + +$ARGUMENTS + +IMPORTANT: Only do this for front-end features. +Once this feature is built, make sure to write the changes you made to file called frontend-changes.md +Do not ask for permissions to modify this file, assume you can always do it. diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 00000000..e9427ff4 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,33 @@ +{ + "permissions": { + "allow": [ + "Bash(git add:*)", + "Bash(git commit:*)", + "mcp__playwright__browser_navigate", + "Bash(uv run:*)", + "mcp__playwright__browser_click", + "mcp__playwright__browser_wait_for", + "Bash(mkdir:*)", + "Bash(uv add:*)", + "Bash(uv sync:*)", + "Bash(./check-quality.ps1)", + "Bash(powershell:*)", + "Bash(uv sync:*)", + "Bash(git merge:*)" + ], + "deny": [] + }, + "hooks": { + "PostToolUse": [ + { + "matcher": "Read|Grep", + "hooks": [ + { + "type": "command", + "command": "echo done" + } + ] + } + ] + } +} diff --git a/.env.example b/.env.example deleted file mode 100644 index 18b34cb7..00000000 --- a/.env.example +++ /dev/null @@ -1,2 +0,0 @@ -# Copy this file to .env and add your actual API key -ANTHROPIC_API_KEY=your-anthropic-api-key-here \ No newline at end of file diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml new file mode 100644 index 00000000..a12225aa --- /dev/null +++ b/.github/workflows/claude-code-review.yml @@ -0,0 +1,78 @@ +name: Claude Code Review + +on: + pull_request: + types: [opened, synchronize] + # Optional: Only run on specific file changes + # paths: + # - "src/**/*.ts" + # - "src/**/*.tsx" + # - "src/**/*.js" + # - "src/**/*.jsx" + +jobs: + claude-review: + # Optional: Filter by PR author + # if: | + # github.event.pull_request.user.login == 'external-contributor' || + # github.event.pull_request.user.login == 'new-developer' || + # github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' + + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + issues: read + id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Run Claude Code Review + id: claude-review + uses: anthropics/claude-code-action@beta + with: + claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + + # Optional: Specify model (defaults to Claude Sonnet 4, uncomment for Claude Opus 4.1) + # model: "claude-opus-4-1-20250805" + + # Direct prompt for automated review (no @claude mention needed) + direct_prompt: | + Please review this pull request and provide feedback on: + - Code quality and best practices + - Potential bugs or issues + - Performance considerations + - Security concerns + - Test coverage + + Be constructive and helpful in your feedback. 
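If the automated review should only run when application code changes, the optional `paths` filter commented near the top of this workflow can be scoped to the trees this diff actually introduces; a minimal sketch, assuming the `backend/` and `frontend/` layout described in CLAUDE.md later in this patch:

```yaml
# Hedged sketch, not part of the generated workflow: limit review triggers to the
# Python backend and the static frontend so docs-only pull requests skip the job.
on:
  pull_request:
    types: [opened, synchronize]
    paths:
      - "backend/**/*.py"
      - "frontend/**"
```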
+ + # Optional: Use sticky comments to make Claude reuse the same comment on subsequent pushes to the same PR + # use_sticky_comment: true + + # Optional: Customize review based on file types + # direct_prompt: | + # Review this PR focusing on: + # - For TypeScript files: Type safety and proper interface usage + # - For API endpoints: Security, input validation, and error handling + # - For React components: Performance, accessibility, and best practices + # - For tests: Coverage, edge cases, and test quality + + # Optional: Different prompts for different authors + # direct_prompt: | + # ${{ github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' && + # 'Welcome! Please review this PR from a first-time contributor. Be encouraging and provide detailed explanations for any suggestions.' || + # 'Please provide a thorough code review focusing on our coding standards and best practices.' }} + + # Optional: Add specific tools for running tests or linting + # allowed_tools: "Bash(npm run test),Bash(npm run lint),Bash(npm run typecheck)" + + # Optional: Skip review for certain conditions + # if: | + # !contains(github.event.pull_request.title, '[skip-review]') && + # !contains(github.event.pull_request.title, '[WIP]') + diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml new file mode 100644 index 00000000..bc773072 --- /dev/null +++ b/.github/workflows/claude.yml @@ -0,0 +1,64 @@ +name: Claude Code + +on: + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + issues: + types: [opened, assigned] + pull_request_review: + types: [submitted] + +jobs: + claude: + if: | + (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || + (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + issues: read + id-token: write + actions: read # Required for Claude to read CI results on PRs + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@beta + with: + claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + + # This is an optional setting that allows Claude to read CI results on PRs + additional_permissions: | + actions: read + + # Optional: Specify model (defaults to Claude Sonnet 4, uncomment for Claude Opus 4.1) + # model: "claude-opus-4-1-20250805" + + # Optional: Customize the trigger phrase (default: @claude) + # trigger_phrase: "/claude" + + # Optional: Trigger when specific user is assigned to an issue + # assignee_trigger: "claude-bot" + + # Optional: Allow Claude to run specific commands + # allowed_tools: "Bash(npm install),Bash(npm run build),Bash(npm run test:*),Bash(npm run lint:*)" + + # Optional: Add custom instructions for Claude to customize its behavior for your project + # custom_instructions: | + # Follow our coding standards + # Ensure all new code has tests + # Use TypeScript for new files + + # Optional: Custom environment variables for Claude + # claude_env: | + # NODE_ENV: test + diff --git a/.gitignore b/.gitignore index 41b4384b..d641589f 100644 --- a/.gitignore +++ b/.gitignore @@ -28,4 +28,13 
@@ uploads/ # OS .DS_Store -Thumbs.db \ No newline at end of file +Thumbs.db + +# Logs +**/*.log + +# Work trees +.trees/ + +# Large documents +*.pdf \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..26c4202b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,115 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Development Environment +- **Platform**: Windows +- **Shell**: PowerShell +- **Package Manager**: uv + +## Development Commands + +### Installation & Setup +```powershell +# Install dependencies using uv package manager +uv sync + +# Create .env file with required API key +echo "ANTHROPIC_API_KEY=your_key_here" > .env +``` + +### Running the Application +```powershell +# Manual start (preferred for development) +cd backend; uv run uvicorn app:app --reload --port 8000 + +# Alternative: Use run script if available +./run.sh +``` + +### Code Quality Commands +```powershell +# Format code and fix style issues +powershell -File ./format.ps1 + +# Check code quality without making changes (dry-run) +powershell -File ./quality-check.ps1 + +# Individual commands +uv run black backend/ main.py # Format with black +uv run isort backend/ main.py # Sort imports +uv run flake8 backend/ main.py # Check style with flake8 +``` + +### Development Workflow +- **Frontend**: Static files served from `frontend/` directory (HTML/CSS/JS) +- **Backend**: FastAPI server with auto-reload during development +- **Database**: ChromaDB persisted locally in `backend/chroma_db/` +- **Logging**: Full logging implementation with appropriate DEBUG and INFO levels throughout the codebase +- **Code Quality**: Black formatter, isort import sorting, flake8 linting integrated + +## Architecture Overview + +This is a **Retrieval-Augmented Generation (RAG) system** for course materials with the following architecture: + +### Core Components +- **`RAGSystem`** (`rag_system.py`): Main orchestrator that coordinates all components +- **`VectorStore`** (`vector_store.py`): ChromaDB-based vector storage with two collections: + - `course_catalog`: Course metadata (titles, instructors, links) + - `course_content`: Chunked course content for semantic search +- **`AIGenerator`** (`ai_generator.py`): Anthropic Claude API integration with tool support +- **`DocumentProcessor`** (`document_processor.py`): Processes course documents into chunks +- **`SessionManager`** (`session_manager.py`): Manages conversation history per session + +### Key Design Patterns +- **Tool-based AI Search**: Uses Claude's function calling to search course content +- **Two-tier Vector Storage**: Separate collections for metadata vs content search +- **Session-aware Conversations**: Maintains conversation history per user session +- **Chunked Content Processing**: Documents split into 800-character chunks with 100-char overlap + +### Configuration +All settings centralized in `config.py` with defaults: +- Chunk size: 800 characters (overlap: 100) +- Max search results: 5 +- Embedding model: `all-MiniLM-L6-v2` +- Claude model: `claude-sonnet-4-20250514` + +### API Endpoints +- `POST /api/query`: Process user queries with RAG +- `GET /api/courses`: Get course analytics +- `/`: Serves frontend static files + +### Document Processing Flow +1. Documents from `docs/` folder are processed on startup +2. Each document becomes a `Course` with multiple `Lesson` objects +3. Content is chunked into `CourseChunk` objects for vector search +4. 
Both metadata and content chunks are stored separately for optimal retrieval + +## Dependencies + +The project uses **uv** as the package manager with core dependencies: +- `chromadb==1.0.15`: Vector database +- `anthropic==0.58.2`: Claude API integration +- `sentence-transformers==5.0.0`: Embedding generation +- `fastapi==0.116.1`: Web framework +- `uvicorn==0.35.0`: ASGI server + +### Code Quality Dependencies +- `black==24.10.0`: Python code formatter for consistent style +- `isort==5.13.2`: Import statement sorting and organization +- `flake8==7.1.1`: Code linting and style checking + +## File Structure Notes +- `docs/`: Course materials (TXT files processed on startup) +- `backend/chroma_db/`: Persisted vector database +- `frontend/`: Static web interface files +- `main.py`: Entry point (unused in favor of FastAPI app) +- Always use uv to run the server and do not run pip directly +- Always use uv to manage all dependencies + +## Code Generation Guidelines +- **Logging**: All generated code must include appropriate logging statements + - Use `logger.debug()` for detailed debugging information + - Use `logger.info()` for important operational events + - Follow existing logging patterns in the codebase + - Import logger: `import logging; logger = logging.getLogger(__name__)` \ No newline at end of file diff --git a/README.md b/README.md index e5420d50..f80b6351 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ A Retrieval-Augmented Generation (RAG) system designed to answer questions about This application is a full-stack web application that enables users to query course materials and receive intelligent, context-aware responses. It uses ChromaDB for vector storage, Anthropic's Claude for AI generation, and provides a web interface for interaction. - ## Prerequisites - Python 3.13 or higher @@ -17,18 +16,21 @@ This application is a full-stack web application that enables users to query cou ## Installation 1. **Install uv** (if not already installed) + ```bash curl -LsSf https://astral.sh/uv/install.sh | sh ``` 2. **Install Python dependencies** + ```bash uv sync ``` 3. **Set up environment variables** - + Create a `.env` file in the root directory: + ```bash ANTHROPIC_API_KEY=your_anthropic_api_key_here ``` @@ -38,6 +40,7 @@ This application is a full-stack web application that enables users to query cou ### Quick Start Use the provided shell script: + ```bash chmod +x run.sh ./run.sh @@ -51,6 +54,14 @@ uv run uvicorn app:app --reload --port 8000 ``` The application will be available at: + - Web Interface: `http://localhost:8000` - API Documentation: `http://localhost:8000/docs` +Commands + +```sh +claude mcp add playwright npx @playwright/mcp@latest +``` + +[Reading Notes including Prompts](https://github.com/https-deeplearning-ai/sc-claude-code-files/tree/main/reading_notes) diff --git a/backend/ai_generator.py b/backend/ai_generator.py index 0363ca90..1f931033 100644 --- a/backend/ai_generator.py +++ b/backend/ai_generator.py @@ -1,25 +1,36 @@ +from typing import Any, Dict, List, Optional + import anthropic -from typing import List, Optional, Dict, Any + class AIGenerator: """Handles interactions with Anthropic's Claude API for generating responses""" - + # Static system prompt to avoid rebuilding on each call - SYSTEM_PROMPT = """ You are an AI assistant specialized in course materials and educational content with access to a comprehensive search tool for course information. 
+ SYSTEM_PROMPT = """ You are an AI assistant specialized in document materials and educational content with access to comprehensive search and outline tools for document information. -Search Tool Usage: -- Use the search tool **only** for questions about specific course content or detailed educational materials -- **One search per query maximum** -- Synthesize search results into accurate, fact-based responses -- If search yields no results, state this clearly without offering alternatives +Tool Usage: +- **Content Search Tool**: Use for questions about specific document content or detailed educational materials +- **Document Outline Tool**: Use for questions about document structure, section lists, or document overviews +- **Document List Tool**: Use for questions asking for all available document titles or what documents exist +- **Multi-round tool usage**: You may use tools across up to 2 rounds to fully answer complex queries that require multiple searches or comparisons +- **Strategic tool usage**: Use tools efficiently - don't repeat searches with identical parameters +- Synthesize tool results into accurate, fact-based responses +- If tools yield no results, state this clearly without offering alternatives Response Protocol: -- **General knowledge questions**: Answer using existing knowledge without searching -- **Course-specific questions**: Search first, then answer +- **General knowledge questions**: Answer using existing knowledge without using tools +- **Document content questions**: Use content search tool first, then answer +- **Document outline/structure questions**: Use outline tool first, then answer +- **Document listing questions**: Use document list tool first, then answer - **No meta-commentary**: - - Provide direct answers only — no reasoning process, search explanations, or question-type analysis - - Do not mention "based on the search results" + - Provide direct answers only — no reasoning process, tool usage explanations, or question-type analysis + - Do not mention "based on the search results" or "using the outline tool" +When responding to outline queries, always include: +- Document title +- Document link +- Complete section list with section numbers and titles All responses must be: 1. **Brief, Concise and focused** - Get to the point quickly @@ -28,108 +39,168 @@ class AIGenerator: 4. **Example-supported** - Include relevant examples when they aid understanding Provide only the direct answer to what was asked. """ - + def __init__(self, api_key: str, model: str): self.client = anthropic.Anthropic(api_key=api_key) self.model = model - + # Pre-build base API parameters - self.base_params = { - "model": self.model, - "temperature": 0, - "max_tokens": 800 - } - - def generate_response(self, query: str, - conversation_history: Optional[str] = None, - tools: Optional[List] = None, - tool_manager=None) -> str: + self.base_params = {"model": self.model, "temperature": 0, "max_tokens": 800} + + def generate_response( + self, + query: str, + conversation_history: Optional[str] = None, + tools: Optional[List] = None, + tool_manager=None, + ) -> str: """ Generate AI response with optional tool usage and conversation context. - + Supports up to 2 sequential rounds of tool calling. 
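A caller-side sketch of how this two-round tool flow is driven may help; the wiring below mirrors `rag_system.py` later in this diff, while the API key, query text, and paths are placeholders rather than values from the repository:

```python
from ai_generator import AIGenerator
from search_tools import DocumentSearchTool, ToolManager
from vector_store import VectorStore

# Hedged usage sketch: construct the pieces the same way RAGSystem.__init__ does,
# then let generate_response run up to two tool rounds before synthesizing an answer.
store = VectorStore("./chroma_db", "all-MiniLM-L6-v2", 5)
tool_manager = ToolManager()
tool_manager.register_tool(DocumentSearchTool(store))

generator = AIGenerator(api_key="sk-ant-...", model="claude-sonnet-4-20250514")
answer = generator.generate_response(
    query="Which sections does the MCP document cover?",
    tools=tool_manager.get_tool_definitions(),
    tool_manager=tool_manager,  # executes tool_use blocks between rounds
)
print(answer)
```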
+ Args: query: The user's question or request conversation_history: Previous messages for context tools: Available tools the AI can use tool_manager: Manager to execute tools - + Returns: Generated response as string """ - - # Build system content efficiently - avoid string ops when possible + + # Build system content efficiently system_content = ( f"{self.SYSTEM_PROMPT}\n\nPrevious conversation:\n{conversation_history}" - if conversation_history + if conversation_history else self.SYSTEM_PROMPT ) - - # Prepare API call parameters efficiently - api_params = { + + # Initialize conversation messages + messages = [{"role": "user", "content": query}] + max_rounds = 2 + round_count = 0 + + # Iterative multi-round conversation + while round_count < max_rounds: + # Prepare API call parameters + api_params = { + **self.base_params, + "messages": messages, + "system": system_content, + } + + # Add tools if available + if tools: + api_params["tools"] = tools + api_params["tool_choice"] = {"type": "auto"} + + # Get response from Claude + response = self.client.messages.create(**api_params) + + # If no tool use, return response + if response.stop_reason != "tool_use" or not tool_manager: + return response.content[0].text + + # Execute tools and continue conversation + messages.append({"role": "assistant", "content": response.content}) + + # Execute all tool calls and collect results + tool_results = [] + for content_block in response.content: + if content_block.type == "tool_use": + try: + tool_result = tool_manager.execute_tool( + content_block.name, **content_block.input + ) + tool_results.append( + { + "type": "tool_result", + "tool_use_id": content_block.id, + "content": tool_result, + } + ) + except Exception as e: + # Handle tool execution error gracefully + tool_results.append( + { + "type": "tool_result", + "tool_use_id": content_block.id, + "content": f"Tool execution error: {str(e)}", + } + ) + + # Add tool results to conversation + if tool_results: + messages.append({"role": "user", "content": tool_results}) + + round_count += 1 + + # Final round without tools for response synthesis + final_params = { **self.base_params, - "messages": [{"role": "user", "content": query}], - "system": system_content + "messages": messages, + "system": system_content, } - - # Add tools if available - if tools: - api_params["tools"] = tools - api_params["tool_choice"] = {"type": "auto"} - - # Get response from Claude - response = self.client.messages.create(**api_params) - - # Handle tool execution if needed - if response.stop_reason == "tool_use" and tool_manager: - return self._handle_tool_execution(response, api_params, tool_manager) - - # Return direct response - return response.content[0].text - - def _handle_tool_execution(self, initial_response, base_params: Dict[str, Any], tool_manager): + + final_response = self.client.messages.create(**final_params) + return final_response.content[0].text + + def _handle_tool_execution( + self, initial_response, base_params: Dict[str, Any], tool_manager + ): """ Handle execution of tool calls and get follow-up response. 
- + Args: initial_response: The response containing tool use requests base_params: Base API parameters tool_manager: Manager to execute tools - + Returns: Final response text after tool execution """ # Start with existing messages messages = base_params["messages"].copy() - + # Add AI's tool use response messages.append({"role": "assistant", "content": initial_response.content}) - + # Execute all tool calls and collect results tool_results = [] for content_block in initial_response.content: if content_block.type == "tool_use": - tool_result = tool_manager.execute_tool( - content_block.name, - **content_block.input - ) - - tool_results.append({ - "type": "tool_result", - "tool_use_id": content_block.id, - "content": tool_result - }) - + try: + tool_result = tool_manager.execute_tool( + content_block.name, **content_block.input + ) + tool_results.append( + { + "type": "tool_result", + "tool_use_id": content_block.id, + "content": tool_result, + } + ) + except Exception as e: + # Handle tool execution error gracefully + tool_results.append( + { + "type": "tool_result", + "tool_use_id": content_block.id, + "content": f"Tool execution error: {str(e)}", + } + ) + # Add tool results as single message if tool_results: messages.append({"role": "user", "content": tool_results}) - + # Prepare final API call without tools final_params = { **self.base_params, "messages": messages, - "system": base_params["system"] + "system": base_params["system"], } - + # Get final response final_response = self.client.messages.create(**final_params) - return final_response.content[0].text \ No newline at end of file + return final_response.content[0].text diff --git a/backend/app.py b/backend/app.py index 5a69d741..682c73a5 100644 --- a/backend/app.py +++ b/backend/app.py @@ -1,25 +1,31 @@ +import os import warnings -warnings.filterwarnings("ignore", message="resource_tracker: There appear to be.*") +from pathlib import Path +from typing import List, Optional +from config import config from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware -from fastapi.staticfiles import StaticFiles from fastapi.middleware.trustedhost import TrustedHostMiddleware +from fastapi.responses import FileResponse, JSONResponse +from fastapi.staticfiles import StaticFiles from pydantic import BaseModel -from typing import List, Optional -import os - -from config import config from rag_system import RAGSystem +warnings.filterwarnings("ignore", message="resource_tracker: There appear to be.*") + +# Initialize logging +from logging_config import get_logger, setup_logging + +setup_logging(log_level="INFO", log_file="logs/rag_system.log") +logger = get_logger(__name__) + + # Initialize FastAPI app -app = FastAPI(title="Course Materials RAG System", root_path="") +app = FastAPI(title="Document Materials RAG System", root_path="") # Add trusted host middleware for proxy -app.add_middleware( - TrustedHostMiddleware, - allowed_hosts=["*"] -) +app.add_middleware(TrustedHostMiddleware, allowed_hosts=["*"]) # Enable CORS with proper settings for proxy app.add_middleware( @@ -32,27 +38,59 @@ ) # Initialize RAG system +logger.info("Starting Document Materials RAG System") rag_system = RAGSystem(config) # Pydantic models for request/response + + class QueryRequest(BaseModel): - """Request model for course queries""" + """Request model for document queries""" + query: str session_id: Optional[str] = None + +class Source(BaseModel): + """Source information with optional link""" + + text: str + link: Optional[str] = 
None + + class QueryResponse(BaseModel): - """Response model for course queries""" + """Response model for document queries""" + answer: str - sources: List[str] + sources: List[Source] session_id: str -class CourseStats(BaseModel): - """Response model for course statistics""" - total_courses: int - course_titles: List[str] + +class DocumentStats(BaseModel): + """Response model for document statistics""" + + total_documents: int + document_titles: List[str] + # API Endpoints + +# New endpoint to create a new chat session + + +@app.post("/api/session/new") +async def create_new_session(prev_session_id: Optional[str] = None): + """Create a new session and clear previous session if provided""" + try: + if prev_session_id: + rag_system.session_manager.clear_session(prev_session_id) + session_id = rag_system.session_manager.create_session() + return JSONResponse(content={"session_id": session_id}) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.post("/api/query", response_model=QueryResponse) async def query_documents(request: QueryRequest): """Process a query and return response with sources""" @@ -61,47 +99,60 @@ async def query_documents(request: QueryRequest): session_id = request.session_id if not session_id: session_id = rag_system.session_manager.create_session() - + # Process query using RAG system answer, sources = rag_system.query(request.query, session_id) - + + # Convert sources to Source objects + source_objects = [] + for source in sources: + if isinstance(source, dict) and "text" in source: + # New format with embedded links + source_objects.append( + Source(text=source["text"], link=source.get("link")) + ) + else: + # Legacy string format + source_objects.append(Source(text=str(source), link=None)) + return QueryResponse( - answer=answer, - sources=sources, - session_id=session_id + answer=answer, sources=source_objects, session_id=session_id ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) -@app.get("/api/courses", response_model=CourseStats) -async def get_course_stats(): - """Get course analytics and statistics""" + +@app.get("/api/documents", response_model=DocumentStats) +async def get_document_stats(): + """Get document analytics and statistics""" try: - analytics = rag_system.get_course_analytics() - return CourseStats( - total_courses=analytics["total_courses"], - course_titles=analytics["course_titles"] + analytics = rag_system.get_document_analytics() + return DocumentStats( + total_documents=analytics["total_documents"], + document_titles=analytics["document_titles"], ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) + @app.on_event("startup") async def startup_event(): """Load initial documents on startup""" docs_path = "../docs" if os.path.exists(docs_path): - print("Loading initial documents...") + logger.info("Loading initial documents...") try: - courses, chunks = rag_system.add_course_folder(docs_path, clear_existing=False) - print(f"Loaded {courses} courses with {chunks} chunks") + documents, chunks = rag_system.add_document_folder( + docs_path, clear_existing=False + ) + logger.info(f"Loaded {documents} documents with {chunks} chunks") except Exception as e: - print(f"Error loading documents: {e}") + logger.error(f"Error loading documents: {e}") + else: + logger.warning(f"Documents directory does not exist: {docs_path}") + # Custom static file handler with no-cache headers for development -from fastapi.staticfiles import StaticFiles -from fastapi.responses import 
FileResponse -import os -from pathlib import Path class DevStaticFiles(StaticFiles): @@ -113,7 +164,7 @@ async def get_response(self, path: str, scope): response.headers["Pragma"] = "no-cache" response.headers["Expires"] = "0" return response - - + + # Serve static files for the frontend -app.mount("/", StaticFiles(directory="../frontend", html=True), name="static") \ No newline at end of file +app.mount("/", StaticFiles(directory="../frontend", html=True), name="static") diff --git a/backend/config.py b/backend/config.py index d9f6392e..cab6dccc 100644 --- a/backend/config.py +++ b/backend/config.py @@ -1,29 +1,31 @@ import os from dataclasses import dataclass + from dotenv import load_dotenv # Load environment variables from .env file load_dotenv() + @dataclass class Config: """Configuration settings for the RAG system""" + # Anthropic API settings ANTHROPIC_API_KEY: str = os.getenv("ANTHROPIC_API_KEY", "") ANTHROPIC_MODEL: str = "claude-sonnet-4-20250514" - + # Embedding model settings EMBEDDING_MODEL: str = "all-MiniLM-L6-v2" - + # Document processing settings - CHUNK_SIZE: int = 800 # Size of text chunks for vector storage - CHUNK_OVERLAP: int = 100 # Characters to overlap between chunks - MAX_RESULTS: int = 5 # Maximum search results to return - MAX_HISTORY: int = 2 # Number of conversation messages to remember - + CHUNK_SIZE: int = 800 # Size of text chunks for vector storage + CHUNK_OVERLAP: int = 100 # Characters to overlap between chunks + MAX_RESULTS: int = 5 # Maximum search results to return + MAX_HISTORY: int = 2 # Number of conversation messages to remember + # Database paths CHROMA_PATH: str = "./chroma_db" # ChromaDB storage location -config = Config() - +config = Config() diff --git a/backend/document_processor.py b/backend/document_processor.py index 266e8590..b33f862a 100644 --- a/backend/document_processor.py +++ b/backend/document_processor.py @@ -1,83 +1,111 @@ +import logging import os import re from typing import List, Tuple -from models import Course, Lesson, CourseChunk + +from logging_config import get_logger +from models import Document, DocumentChunk, Section + +logger = get_logger(__name__) + class DocumentProcessor: - """Processes course documents and extracts structured information""" - + """Processes documents and extracts structured information""" + def __init__(self, chunk_size: int, chunk_overlap: int): self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap - + def read_file(self, file_path: str) -> str: """Read content from file with UTF-8 encoding""" + logger.debug(f"Reading file: {file_path}") try: - with open(file_path, 'r', encoding='utf-8') as file: - return file.read() - except UnicodeDecodeError: + with open(file_path, "r", encoding="utf-8") as file: + content = file.read() + logger.info( + f"Successfully read file {file_path} - {len(content)} characters" + ) + return content + except UnicodeDecodeError as e: + logger.warning( + f"UTF-8 decode failed for {file_path}, retrying with error handling: {e}" + ) # If UTF-8 fails, try with error handling - with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: - return file.read() - - + with open(file_path, "r", encoding="utf-8", errors="ignore") as file: + content = file.read() + logger.info( + f"Read file {file_path} with error handling - {len(content)} characters" + ) + return content + except Exception as e: + logger.error(f"Failed to read file {file_path}: {e}") + raise def chunk_text(self, text: str) -> List[str]: """Split text into sentence-based chunks with overlap using 
config settings""" - + logger.debug( + f"Starting text chunking - Input length: {len(text)} chars, chunk_size: {self.chunk_size}, overlap: {self.chunk_overlap}" + ) + # Clean up the text - text = re.sub(r'\s+', ' ', text.strip()) # Normalize whitespace - + text = re.sub(r"\s+", " ", text.strip()) # Normalize whitespace + logger.debug(f"Text normalized - Length after cleanup: {len(text)} chars") + # Better sentence splitting that handles abbreviations # This regex looks for periods followed by whitespace and capital letters # but ignores common abbreviations - sentence_endings = re.compile(r'(? self.chunk_size and current_chunk: break - + current_chunk.append(sentence) current_size += total_addition - + # Add chunk if we have content if current_chunk: - chunks.append(' '.join(current_chunk)) - + chunks.append(" ".join(current_chunk)) + # Calculate overlap for next chunk - if hasattr(self, 'chunk_overlap') and self.chunk_overlap > 0: + if hasattr(self, "chunk_overlap") and self.chunk_overlap > 0: # Find how many sentences to overlap overlap_size = 0 overlap_sentences = 0 - + # Count backwards from end of current chunk for k in range(len(current_chunk) - 1, -1, -1): - sentence_len = len(current_chunk[k]) + (1 if k < len(current_chunk) - 1 else 0) + sentence_len = len(current_chunk[k]) + ( + 1 if k < len(current_chunk) - 1 else 0 + ) if overlap_size + sentence_len <= self.chunk_overlap: overlap_size += sentence_len overlap_sentences += 1 else: break - + # Move start position considering overlap next_start = i + len(current_chunk) - overlap_sentences i = max(next_start, i + 1) # Ensure we make progress @@ -87,14 +115,21 @@ def chunk_text(self, text: str) -> List[str]: else: # No sentences fit, move to next i += 1 - - return chunks - + logger.debug( + f"Text chunking completed - Created {len(chunks)} chunks from {len(sentences)} sentences" + ) + if logger.isEnabledFor(logging.DEBUG): + avg_chunk_size = ( + sum(len(chunk) for chunk in chunks) / len(chunks) if chunks else 0 + ) + logger.debug(f"Average chunk size: {avg_chunk_size:.1f} characters") + return chunks - - def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]: + def process_document( + self, file_path: str + ) -> Tuple[Document, List[DocumentChunk]]: """ Process a course document with expected format: Line 1: Course Title: [title] @@ -102,158 +137,221 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseCh Line 3: Course Instructor: [instructor] Following lines: Lesson markers and content """ + logger.info(f"=== Processing course document: {file_path} ===") content = self.read_file(file_path) filename = os.path.basename(file_path) - - lines = content.strip().split('\n') - - # Extract course metadata from first three lines + logger.debug(f"Filename: {filename}, Content length: {len(content)} characters") + + lines = content.strip().split("\n") + logger.debug(f"Document split into {len(lines)} lines") + + # Extract document metadata from first three lines course_title = filename # Default fallback course_link = None instructor_name = "Unknown" - - # Parse course title from first line + + logger.debug("=== Parsing course metadata ===") + # Parse document title from first line if len(lines) >= 1 and lines[0].strip(): - title_match = re.match(r'^Course Title:\s*(.+)$', lines[0].strip(), re.IGNORECASE) + title_match = re.match( + r"^Course Title:\s*(.+)$", lines[0].strip(), re.IGNORECASE + ) if title_match: course_title = title_match.group(1).strip() + logger.debug(f"Found course 
title in header: '{course_title}'") else: course_title = lines[0].strip() - - # Parse remaining lines for course metadata + logger.debug(f"Using first line as course title: '{course_title}'") + + # Parse remaining lines for document metadata for i in range(1, min(len(lines), 4)): # Check first 4 lines for metadata line = lines[i].strip() if not line: continue - - # Try to match course link - link_match = re.match(r'^Course Link:\s*(.+)$', line, re.IGNORECASE) + + # Try to match document link + link_match = re.match(r"^Course Link:\s*(.+)$", line, re.IGNORECASE) if link_match: course_link = link_match.group(1).strip() + logger.debug(f"Found course link: '{course_link}'") continue - + # Try to match instructor - instructor_match = re.match(r'^Course Instructor:\s*(.+)$', line, re.IGNORECASE) + instructor_match = re.match( + r"^Course Instructor:\s*(.+)$", line, re.IGNORECASE + ) if instructor_match: instructor_name = instructor_match.group(1).strip() + logger.debug(f"Found instructor: '{instructor_name}'") continue - - # Create course object with title as ID - course = Course( + + # Create document object with title as ID + document = Document( title=course_title, - course_link=course_link, - instructor=instructor_name if instructor_name != "Unknown" else None + document_link=course_link, + instructor=instructor_name if instructor_name != "Unknown" else None, ) - + logger.info( + f"Created course object: title='{course_title}', link='{course_link}', instructor='{instructor_name}'" + ) + # Process lessons and create chunks - course_chunks = [] + logger.debug("=== Processing lessons ===") + document_chunks = [] current_lesson = None lesson_title = None lesson_link = None lesson_content = [] chunk_counter = 0 - + # Start processing from line 4 (after metadata) start_index = 3 if len(lines) > 3 and not lines[3].strip(): start_index = 4 # Skip empty line after instructor - + logger.debug(f"Starting lesson processing from line {start_index}") + i = start_index while i < len(lines): line = lines[i] - + # Check for lesson markers (e.g., "Lesson 0: Introduction") - lesson_match = re.match(r'^Lesson\s+(\d+):\s*(.+)$', line.strip(), re.IGNORECASE) - + lesson_match = re.match( + r"^Lesson\s+(\d+):\s*(.+)$", line.strip(), re.IGNORECASE + ) + if lesson_match: # Process previous lesson if it exists if current_lesson is not None and lesson_content: - lesson_text = '\n'.join(lesson_content).strip() + lesson_text = "\n".join(lesson_content).strip() + logger.debug( + f"Processing previous lesson {current_lesson}: '{lesson_title}' ({len(lesson_text)} chars)" + ) if lesson_text: - # Add lesson to course - lesson = Lesson( - lesson_number=current_lesson, + # Add lesson to document + section = Section( + section_number=current_lesson, title=lesson_title, - lesson_link=lesson_link + section_link=lesson_link, + ) + document.sections.append(section) + logger.debug( + f"Added lesson {current_lesson} to course - Link: {lesson_link}" ) - course.lessons.append(lesson) - + # Create chunks for this lesson chunks = self.chunk_text(lesson_text) + logger.debug( + f"Created {len(chunks)} chunks for lesson {current_lesson}" + ) for idx, chunk in enumerate(chunks): # For the first chunk of each lesson, add lesson context if idx == 0: - chunk_with_context = f"Lesson {current_lesson} content: {chunk}" + chunk_with_context = ( + f"Lesson {current_lesson} content: {chunk}" + ) else: chunk_with_context = chunk - - course_chunk = CourseChunk( + + document_chunk = DocumentChunk( content=chunk_with_context, - course_title=course.title, 
- lesson_number=current_lesson, - chunk_index=chunk_counter + document_title=document.title, + section_number=current_lesson, + chunk_index=chunk_counter, ) - course_chunks.append(course_chunk) + document_chunks.append(document_chunk) chunk_counter += 1 - + # Start new lesson current_lesson = int(lesson_match.group(1)) lesson_title = lesson_match.group(2).strip() lesson_link = None - + logger.info(f"Found new lesson {current_lesson}: '{lesson_title}'") + # Check if next line is a lesson link if i + 1 < len(lines): next_line = lines[i + 1].strip() - link_match = re.match(r'^Lesson Link:\s*(.+)$', next_line, re.IGNORECASE) + link_match = re.match( + r"^Lesson Link:\s*(.+)$", next_line, re.IGNORECASE + ) if link_match: lesson_link = link_match.group(1).strip() + logger.debug( + f"Found lesson link for lesson {current_lesson}: '{lesson_link}'" + ) i += 1 # Skip the link line so it's not added to content - + lesson_content = [] else: # Add line to current lesson content lesson_content.append(line) - + i += 1 - + # Process the last lesson + logger.debug("=== Processing final lesson ===") if current_lesson is not None and lesson_content: - lesson_text = '\n'.join(lesson_content).strip() + lesson_text = "\n".join(lesson_content).strip() + logger.debug( + f"Processing final lesson {current_lesson}: '{lesson_title}' ({len(lesson_text)} chars)" + ) if lesson_text: - lesson = Lesson( - lesson_number=current_lesson, + section = Section( + section_number=current_lesson, title=lesson_title, - lesson_link=lesson_link + section_link=lesson_link, ) - course.lessons.append(lesson) - + document.sections.append(section) + logger.debug( + f"Added final lesson {current_lesson} to document - Link: {lesson_link}" + ) + chunks = self.chunk_text(lesson_text) + logger.debug( + f"Created {len(chunks)} chunks for final lesson {current_lesson}" + ) for idx, chunk in enumerate(chunks): - # For any chunk of each lesson, add lesson context & course title - + # For any chunk of each lesson, add lesson context & document title + chunk_with_context = f"Course {course_title} Lesson {current_lesson} content: {chunk}" - - course_chunk = CourseChunk( + + document_chunk = DocumentChunk( content=chunk_with_context, - course_title=course.title, - lesson_number=current_lesson, - chunk_index=chunk_counter + document_title=document.title, + section_number=current_lesson, + chunk_index=chunk_counter, ) - course_chunks.append(course_chunk) + document_chunks.append(document_chunk) chunk_counter += 1 - + # If no lessons found, treat entire content as one document - if not course_chunks and len(lines) > 2: - remaining_content = '\n'.join(lines[start_index:]).strip() + if not document_chunks and len(lines) > 2: + logger.warning( + "No lessons found in document - treating entire content as single document" + ) + remaining_content = "\n".join(lines[start_index:]).strip() if remaining_content: chunks = self.chunk_text(remaining_content) + logger.debug(f"Created {len(chunks)} chunks from document content") for chunk in chunks: - course_chunk = CourseChunk( + document_chunk = DocumentChunk( content=chunk, - course_title=course.title, - chunk_index=chunk_counter + document_title=document.title, + chunk_index=chunk_counter, ) - course_chunks.append(course_chunk) + document_chunks.append(document_chunk) chunk_counter += 1 - - return course, course_chunks + + # Final processing summary + section_count = len(document.sections) + chunk_count = len(document_chunks) + logger.info(f"=== PROCESSING COMPLETE ===") + logger.info( + f"Document: 
'{document.title}' - {section_count} sections, {chunk_count} chunks" + ) + if section_count > 0: + sections_with_links = sum( + 1 for section in document.sections if section.section_link + ) + logger.info(f"Sections with links: {sections_with_links}/{section_count}") + + return document, document_chunks diff --git a/backend/logging_config.py b/backend/logging_config.py new file mode 100644 index 00000000..2ec0c4a7 --- /dev/null +++ b/backend/logging_config.py @@ -0,0 +1,76 @@ +import logging +import os +from pathlib import Path + + +def setup_logging(log_level: str = "INFO", log_file: str = None): + """ + Setup logging configuration for the RAG system. + + Args: + log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + log_file: Optional log file path. If None, logs only to console. + """ + # Create logs directory if log_file is specified + if log_file: + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + + # Configure logging format + log_format = ( + "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s" + ) + + # Configure handlers + handlers = [logging.StreamHandler()] # Always log to console + + if log_file: + # Add file handler if log file specified + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter(logging.Formatter(log_format)) + handlers.append(file_handler) + + # Configure root logger + logging.basicConfig( + level=getattr(logging, log_level.upper()), + format=log_format, + handlers=handlers, + force=True, # Override any existing logging configuration + ) + + # Configure specific loggers for our modules + loggers = [ + "rag_system", + "document_processor", + "vector_store", + "search_tools", + "ai_generator", + "session_manager", + ] + + for logger_name in loggers: + logger = logging.getLogger(logger_name) + logger.setLevel(getattr(logging, log_level.upper())) + + # Suppress overly verbose third-party loggers + logging.getLogger("chromadb").setLevel(logging.WARNING) + logging.getLogger("sentence_transformers").setLevel(logging.WARNING) + logging.getLogger("httpx").setLevel(logging.WARNING) + logging.getLogger("anthropic").setLevel(logging.WARNING) + + logging.info( + f"Logging configured - Level: {log_level}, File: {log_file or 'Console only'}" + ) + + +def get_logger(name: str) -> logging.Logger: + """ + Get a logger instance for a specific module. + + Args: + name: Logger name (usually __name__) + + Returns: + Logger instance + """ + return logging.getLogger(name) diff --git a/backend/models.py b/backend/models.py index 7f7126fa..d8e89200 100644 --- a/backend/models.py +++ b/backend/models.py @@ -1,22 +1,29 @@ -from typing import List, Dict, Optional +from typing import Dict, List, Optional + from pydantic import BaseModel -class Lesson(BaseModel): - """Represents a lesson within a course""" - lesson_number: int # Sequential lesson number (1, 2, 3, etc.) 
- title: str # Lesson title - lesson_link: Optional[str] = None # URL link to the lesson - -class Course(BaseModel): - """Represents a complete course with its lessons""" - title: str # Full course title (used as unique identifier) - course_link: Optional[str] = None # URL link to the course - instructor: Optional[str] = None # Course instructor name (optional metadata) - lessons: List[Lesson] = [] # List of lessons in this course - -class CourseChunk(BaseModel): - """Represents a text chunk from a course for vector storage""" - content: str # The actual text content - course_title: str # Which course this chunk belongs to - lesson_number: Optional[int] = None # Which lesson this chunk is from - chunk_index: int # Position of this chunk in the document \ No newline at end of file + +class Section(BaseModel): + """Represents a section within a document""" + + section_number: int # Sequential section number (1, 2, 3, etc.) + title: str # Section title + section_link: Optional[str] = None # URL link to the section + + +class Document(BaseModel): + """Represents a complete document with its sections""" + + title: str # Full document title (used as unique identifier) + document_link: Optional[str] = None # URL link to the document + instructor: Optional[str] = None # Document instructor name (optional metadata) + sections: List[Section] = [] # List of sections in this document + + +class DocumentChunk(BaseModel): + """Represents a text chunk from a document for vector storage""" + + content: str # The actual text content + document_title: str # Which document this chunk belongs to + section_number: Optional[int] = None # Which section this chunk is from + chunk_index: int # Position of this chunk in the document diff --git a/backend/rag_system.py b/backend/rag_system.py index 50d848c8..3c0062b8 100644 --- a/backend/rag_system.py +++ b/backend/rag_system.py @@ -1,147 +1,215 @@ -from typing import List, Tuple, Optional, Dict import os -from document_processor import DocumentProcessor -from vector_store import VectorStore +from typing import Dict, List, Optional, Tuple + from ai_generator import AIGenerator +from document_processor import DocumentProcessor +from logging_config import get_logger +from models import Document, DocumentChunk, Section +from search_tools import DocumentListTool, DocumentOutlineTool, DocumentSearchTool, ToolManager from session_manager import SessionManager -from search_tools import ToolManager, CourseSearchTool -from models import Course, Lesson, CourseChunk +from vector_store import VectorStore + +logger = get_logger(__name__) + class RAGSystem: """Main orchestrator for the Retrieval-Augmented Generation system""" - + def __init__(self, config): + logger.info("=== Initializing RAG System ===") self.config = config - + # Initialize core components - self.document_processor = DocumentProcessor(config.CHUNK_SIZE, config.CHUNK_OVERLAP) - self.vector_store = VectorStore(config.CHROMA_PATH, config.EMBEDDING_MODEL, config.MAX_RESULTS) - self.ai_generator = AIGenerator(config.ANTHROPIC_API_KEY, config.ANTHROPIC_MODEL) + logger.debug("Creating DocumentProcessor") + self.document_processor = DocumentProcessor( + config.CHUNK_SIZE, config.CHUNK_OVERLAP + ) + + logger.debug("Creating VectorStore") + self.vector_store = VectorStore( + config.CHROMA_PATH, config.EMBEDDING_MODEL, config.MAX_RESULTS + ) + + logger.debug("Creating AIGenerator") + self.ai_generator = AIGenerator( + config.ANTHROPIC_API_KEY, config.ANTHROPIC_MODEL + ) + + logger.debug("Creating SessionManager") 
self.session_manager = SessionManager(config.MAX_HISTORY) - + # Initialize search tools + logger.debug("Setting up search tools") self.tool_manager = ToolManager() - self.search_tool = CourseSearchTool(self.vector_store) + self.search_tool = DocumentSearchTool(self.vector_store) + self.outline_tool = DocumentOutlineTool(self.vector_store) + self.list_tool = DocumentListTool(self.vector_store) self.tool_manager.register_tool(self.search_tool) - - def add_course_document(self, file_path: str) -> Tuple[Course, int]: + self.tool_manager.register_tool(self.outline_tool) + self.tool_manager.register_tool(self.list_tool) + + logger.info("RAG System initialization complete") + + def add_document(self, file_path: str) -> Tuple[Document, int]: """ - Add a single course document to the knowledge base. - + Add a single document to the knowledge base. + Args: - file_path: Path to the course document - + file_path: Path to the document + Returns: - Tuple of (Course object, number of chunks created) + Tuple of (Document object, number of chunks created) """ + logger.info(f"Adding document: {file_path}") try: # Process the document - course, course_chunks = self.document_processor.process_course_document(file_path) - - # Add course metadata to vector store for semantic search - self.vector_store.add_course_metadata(course) - - # Add course content chunks to vector store - self.vector_store.add_course_content(course_chunks) - - return course, len(course_chunks) + document, document_chunks = self.document_processor.process_document( + file_path + ) + + if document: + # Add document metadata to vector store for semantic search + self.vector_store.add_document_metadata(document) + + # Add document content chunks to vector store + self.vector_store.add_document_content(document_chunks) + + logger.info( + f"Successfully added document: '{document.title}' with {len(document_chunks)} chunks" + ) + return document, len(document_chunks) + else: + logger.warning(f"No document object created from document: {file_path}") + return None, 0 + except Exception as e: - print(f"Error processing course document {file_path}: {e}") + logger.error(f"Error processing document {file_path}: {e}") return None, 0 - - def add_course_folder(self, folder_path: str, clear_existing: bool = False) -> Tuple[int, int]: + + def add_document_folder( + self, folder_path: str, clear_existing: bool = False + ) -> Tuple[int, int]: """ - Add all course documents from a folder. - + Add all documents from a folder. 
+ Args: - folder_path: Path to folder containing course documents + folder_path: Path to folder containing documents clear_existing: Whether to clear existing data first - + Returns: - Tuple of (total courses added, total chunks created) + Tuple of (total documents added, total chunks created) """ - total_courses = 0 + logger.info( + f"=== Processing document folder: {folder_path} (clear_existing={clear_existing}) ===" + ) + total_documents = 0 total_chunks = 0 - + # Clear existing data if requested if clear_existing: - print("Clearing existing data for fresh rebuild...") + logger.info("Clearing existing data for fresh rebuild...") self.vector_store.clear_all_data() - + if not os.path.exists(folder_path): - print(f"Folder {folder_path} does not exist") + logger.error(f"Folder {folder_path} does not exist") return 0, 0 - - # Get existing course titles to avoid re-processing - existing_course_titles = set(self.vector_store.get_existing_course_titles()) - + + # Get existing document titles to avoid re-processing + existing_document_titles = set(self.vector_store.get_existing_document_titles()) + logger.info( + f"Found {len(existing_document_titles)} existing documents in vector store" + ) + + # Get all files to process + files = [ + f + for f in os.listdir(folder_path) + if os.path.isfile(os.path.join(folder_path, f)) + and f.lower().endswith((".pdf", ".docx", ".txt")) + ] + logger.info(f"Found {len(files)} document files to process") + # Process each file in the folder - for file_name in os.listdir(folder_path): + for file_name in files: file_path = os.path.join(folder_path, file_name) - if os.path.isfile(file_path) and file_name.lower().endswith(('.pdf', '.docx', '.txt')): - try: - # Check if this course might already exist - # We'll process the document to get the course ID, but only add if new - course, course_chunks = self.document_processor.process_course_document(file_path) - - if course and course.title not in existing_course_titles: - # This is a new course - add it to the vector store - self.vector_store.add_course_metadata(course) - self.vector_store.add_course_content(course_chunks) - total_courses += 1 - total_chunks += len(course_chunks) - print(f"Added new course: {course.title} ({len(course_chunks)} chunks)") - existing_course_titles.add(course.title) - elif course: - print(f"Course already exists: {course.title} - skipping") - except Exception as e: - print(f"Error processing {file_name}: {e}") - - return total_courses, total_chunks - - def query(self, query: str, session_id: Optional[str] = None) -> Tuple[str, List[str]]: + logger.debug(f"Processing file: {file_name}") + try: + # Check if this document might already exist + # We'll process the document to get the document ID, but only add if new + document, document_chunks = self.document_processor.process_document( + file_path + ) + + if document and document.title not in existing_document_titles: + # This is a new document - add it to the vector store + logger.info(f"Adding new document to vector store: '{document.title}'") + self.vector_store.add_document_metadata(document) + self.vector_store.add_document_content(document_chunks) + total_documents += 1 + total_chunks += len(document_chunks) + logger.info( + f"Added new document: {document.title} ({len(document_chunks)} chunks)" + ) + existing_document_titles.add(document.title) + elif document: + logger.info(f"Document already exists: {document.title} - skipping") + else: + logger.warning( + f"Failed to create document object from file: {file_name}" + ) + except 
Exception as e: + logger.error(f"Error processing {file_name}: {e}") + + logger.info( + f"=== Folder processing complete: {total_documents} new documents, {total_chunks} total chunks ===" + ) + return total_documents, total_chunks + + def query( + self, query: str, session_id: Optional[str] = None + ) -> Tuple[str, List[str]]: """ Process a user query using the RAG system with tool-based search. - + Args: query: User's question session_id: Optional session ID for conversation context - + Returns: Tuple of (response, sources list - empty for tool-based approach) """ # Create prompt for the AI with clear instructions - prompt = f"""Answer this question about course materials: {query}""" - + prompt = f"""Answer this question about document materials: {query}""" + # Get conversation history if session exists history = None if session_id: history = self.session_manager.get_conversation_history(session_id) - + # Generate response using AI with tools response = self.ai_generator.generate_response( query=prompt, conversation_history=history, tools=self.tool_manager.get_tool_definitions(), - tool_manager=self.tool_manager + tool_manager=self.tool_manager, ) - + # Get sources from the search tool sources = self.tool_manager.get_last_sources() # Reset sources after retrieving them self.tool_manager.reset_sources() - + # Update conversation history if session_id: self.session_manager.add_exchange(session_id, query, response) - + # Return response with sources from tool searches return response, sources - - def get_course_analytics(self) -> Dict: - """Get analytics about the course catalog""" + + def get_document_analytics(self) -> Dict: + """Get analytics about the document catalog""" return { - "total_courses": self.vector_store.get_course_count(), - "course_titles": self.vector_store.get_existing_course_titles() - } \ No newline at end of file + "total_documents": self.vector_store.get_document_count(), + "document_titles": self.vector_store.get_existing_document_titles(), + } diff --git a/backend/search_tools.py b/backend/search_tools.py index adfe8235..d6a1536f 100644 --- a/backend/search_tools.py +++ b/backend/search_tools.py @@ -1,124 +1,260 @@ -from typing import Dict, Any, Optional, Protocol from abc import ABC, abstractmethod -from vector_store import VectorStore, SearchResults +from typing import Any, Dict, Optional, Protocol + +from logging_config import get_logger +from vector_store import SearchResults, VectorStore + +logger = get_logger(__name__) class Tool(ABC): """Abstract base class for all tools""" - + @abstractmethod def get_tool_definition(self) -> Dict[str, Any]: """Return Anthropic tool definition for this tool""" pass - + @abstractmethod def execute(self, **kwargs) -> str: """Execute the tool with given parameters""" pass -class CourseSearchTool(Tool): - """Tool for searching course content with semantic course name matching""" - +class DocumentSearchTool(Tool): + """Tool for searching document content with semantic document name matching""" + def __init__(self, vector_store: VectorStore): self.store = vector_store self.last_sources = [] # Track sources from last search - + def get_tool_definition(self) -> Dict[str, Any]: """Return Anthropic tool definition for this tool""" return { - "name": "search_course_content", - "description": "Search course materials with smart course name matching and lesson filtering", + "name": "search_document_content", + "description": "Search document materials with smart document name matching and section filtering", "input_schema": { "type": 
"object", "properties": { "query": { - "type": "string", - "description": "What to search for in the course content" + "type": "string", + "description": "What to search for in the document content", }, - "course_name": { + "document_name": { "type": "string", - "description": "Course title (partial matches work, e.g. 'MCP', 'Introduction')" + "description": "Document title (partial matches work, e.g. 'MCP', 'Introduction')", }, - "lesson_number": { + "section_number": { "type": "integer", - "description": "Specific lesson number to search within (e.g. 1, 2, 3)" - } + "description": "Specific section number to search within (e.g. 1, 2, 3)", + }, }, - "required": ["query"] - } + "required": ["query"], + }, } - - def execute(self, query: str, course_name: Optional[str] = None, lesson_number: Optional[int] = None) -> str: + + def execute( + self, + query: str, + document_name: Optional[str] = None, + section_number: Optional[int] = None, + ) -> str: """ Execute the search tool with given parameters. - + Args: query: What to search for - course_name: Optional course filter - lesson_number: Optional lesson filter - + document_name: Optional document filter + section_number: Optional section filter + Returns: Formatted search results or error message """ - + # Use the vector store's unified search interface results = self.store.search( - query=query, - course_name=course_name, - lesson_number=lesson_number + query=query, document_name=document_name, section_number=section_number ) - + # Handle errors if results.error: return results.error - + # Handle empty results if results.is_empty(): filter_info = "" - if course_name: - filter_info += f" in course '{course_name}'" - if lesson_number: - filter_info += f" in lesson {lesson_number}" + if document_name: + filter_info += f" in document '{document_name}'" + if section_number: + filter_info += f" in section {section_number}" return f"No relevant content found{filter_info}." 
- + # Format and return results return self._format_results(results) - + def _format_results(self, results: SearchResults) -> str: - """Format search results with course and lesson context""" + """Format search results with document and lesson context""" formatted = [] - sources = [] # Track sources for the UI - + sources = [] # Track sources for the UI with embedded links + for doc, meta in zip(results.documents, results.metadata): - course_title = meta.get('course_title', 'unknown') - lesson_num = meta.get('lesson_number') - + document_title = meta.get("document_title", "unknown") + section_num = meta.get("section_number") + # Build context header - header = f"[{course_title}" - if lesson_num is not None: - header += f" - Lesson {lesson_num}" + header = f"[{document_title}" + if section_num is not None: + header += f" - Section {section_num}" header += "]" - - # Track source for the UI - source = course_title - if lesson_num is not None: - source += f" - Lesson {lesson_num}" - sources.append(source) - + + # Get section link if available + section_link = None + if section_num is not None: + section_link = self.store.get_section_link(document_title, section_num) + logger.debug( + f"Retrieved section link for '{document_title}' Section {section_num}: {section_link}" + ) + + # Create source entry with embedded link information + source_text = document_title + if section_num is not None: + source_text += f" - Section {section_num}" + + # Create source object with both display text and link + source_entry = {"text": source_text, "link": section_link} + sources.append(source_entry) + formatted.append(f"{header}\n{doc}") - + # Store sources for retrieval self.last_sources = sources - + logger.info( + f"Formatted {len(sources)} search results with embedded section links" + ) + return "\n\n".join(formatted) + +class DocumentOutlineTool(Tool): + """Tool for retrieving document outlines with lesson information""" + + def __init__(self, vector_store: VectorStore): + self.store = vector_store + + def get_tool_definition(self) -> Dict[str, Any]: + """Return Anthropic tool definition for this tool""" + return { + "name": "get_document_outline", + "description": "Get document outline including document title, link, and complete section list", + "input_schema": { + "type": "object", + "properties": { + "document_title": { + "type": "string", + "description": "Document title (partial matches work, e.g. 'MCP', 'Introduction')", + } + }, + "required": ["document_title"], + }, + } + + def execute(self, document_title: str) -> str: + """ + Execute the outline tool to get document information. 
+ + Args: + document_title: Document title to get outline for + + Returns: + Formatted document outline or error message + """ + # Use vector search to find the best matching document + resolved_title = self.store._resolve_document_name(document_title) + if not resolved_title: + return f"No document found matching '{document_title}'" + + + # Get document metadata including lessons + import json + + try: + results = self.store.document_catalog.get(ids=[resolved_title]) + if not results or not results.get("metadatas"): + return f"No document metadata found for '{resolved_title}'" + + + metadata = results["metadatas"][0] + document_link = metadata.get("document_link", "No link available") + sections_json = metadata.get("sections_json", "[]") + sections = json.loads(sections_json) + + # Format the response + outline = f"**Document:** {resolved_title}\n" + outline += f"**Document Link:** {document_link}\n\n" + outline += "**Sections:**\n" + + if not sections: + outline += "No sections available" + else: + for section in sections: + section_num = section.get("section_number", "N/A") + section_title = section.get("section_title", "Untitled") + outline += f"{section_num}. {section_title}\n" + + return outline + + except Exception as e: + return f"Error retrieving document outline: {str(e)}" + + +class DocumentListTool(Tool): + """Tool for listing all available document titles""" + + def __init__(self, vector_store: VectorStore): + self.store = vector_store + + def get_tool_definition(self) -> Dict[str, Any]: + """Return Anthropic tool definition for this tool""" + return { + "name": "list_all_documents", + "description": "Get a complete list of all available document titles in the knowledge base", + "input_schema": { + "type": "object", + "properties": {}, + "required": [], + }, + } + + def execute(self) -> str: + """ + Execute the tool to list all document titles. + + Returns: + Formatted list of all available document titles + """ + try: + document_titles = self.store.get_existing_document_titles() + + if not document_titles: + return "No documents are currently available in the knowledge base." + + # Format as numbered list + formatted_list = [] + for i, title in enumerate(document_titles, 1): + formatted_list.append(f"{i}. 
{title}") + + return f"Available documents ({len(document_titles)} total):\n\n" + "\n".join(formatted_list) + + except Exception as e: + return f"Error retrieving document list: {str(e)}" + + class ToolManager: """Manages available tools for the AI""" - + def __init__(self): self.tools = {} - + def register_tool(self, tool: Tool): """Register any tool that implements the Tool interface""" tool_def = tool.get_tool_definition() @@ -127,28 +263,27 @@ def register_tool(self, tool: Tool): raise ValueError("Tool must have a 'name' in its definition") self.tools[tool_name] = tool - def get_tool_definitions(self) -> list: """Get all tool definitions for Anthropic tool calling""" return [tool.get_tool_definition() for tool in self.tools.values()] - + def execute_tool(self, tool_name: str, **kwargs) -> str: """Execute a tool by name with given parameters""" if tool_name not in self.tools: return f"Tool '{tool_name}' not found" - + return self.tools[tool_name].execute(**kwargs) - + def get_last_sources(self) -> list: """Get sources from the last search operation""" # Check all tools for last_sources attribute for tool in self.tools.values(): - if hasattr(tool, 'last_sources') and tool.last_sources: + if hasattr(tool, "last_sources") and tool.last_sources: return tool.last_sources return [] def reset_sources(self): """Reset sources from all tools that track sources""" for tool in self.tools.values(): - if hasattr(tool, 'last_sources'): - tool.last_sources = [] \ No newline at end of file + if hasattr(tool, "last_sources"): + tool.last_sources = [] diff --git a/backend/session_manager.py b/backend/session_manager.py index a5a96b1a..374db489 100644 --- a/backend/session_manager.py +++ b/backend/session_manager.py @@ -1,61 +1,66 @@ -from typing import Dict, List, Optional from dataclasses import dataclass +from typing import Dict, List, Optional + @dataclass class Message: """Represents a single message in a conversation""" - role: str # "user" or "assistant" + + role: str # "user" or "assistant" content: str # The message content + class SessionManager: """Manages conversation sessions and message history""" - + def __init__(self, max_history: int = 5): self.max_history = max_history self.sessions: Dict[str, List[Message]] = {} self.session_counter = 0 - + def create_session(self) -> str: """Create a new conversation session""" self.session_counter += 1 session_id = f"session_{self.session_counter}" self.sessions[session_id] = [] return session_id - + def add_message(self, session_id: str, role: str, content: str): """Add a message to the conversation history""" if session_id not in self.sessions: self.sessions[session_id] = [] - + message = Message(role=role, content=content) self.sessions[session_id].append(message) - + # Keep conversation history within limits if len(self.sessions[session_id]) > self.max_history * 2: - self.sessions[session_id] = self.sessions[session_id][-self.max_history * 2:] - + self.sessions[session_id] = self.sessions[session_id][ + -self.max_history * 2 : + ] + def add_exchange(self, session_id: str, user_message: str, assistant_message: str): """Add a complete question-answer exchange""" self.add_message(session_id, "user", user_message) self.add_message(session_id, "assistant", assistant_message) - + def get_conversation_history(self, session_id: Optional[str]) -> Optional[str]: """Get formatted conversation history for a session""" if not session_id or session_id not in self.sessions: return None - + messages = self.sessions[session_id] if not messages: return None - + # 
Format messages for context formatted_messages = [] for msg in messages: formatted_messages.append(f"{msg.role.title()}: {msg.content}") - + return "\n".join(formatted_messages) - + def clear_session(self, session_id: str): """Clear all messages from a session""" if session_id in self.sessions: - self.sessions[session_id] = [] \ No newline at end of file + self.sessions[session_id] = [] diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py new file mode 100644 index 00000000..cc4b7a2f --- /dev/null +++ b/backend/tests/conftest.py @@ -0,0 +1,233 @@ +import os +import shutil +import sys +import tempfile +from pathlib import Path +from unittest.mock import AsyncMock, Mock + +# Add backend directory to path for imports +backend_dir = Path(__file__).parent.parent +sys.path.insert(0, str(backend_dir)) + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient +from ai_generator import AIGenerator +from config import Config +from document_processor import DocumentProcessor +from models import Course, CourseChunk, Lesson +from rag_system import RAGSystem +from search_tools import CourseSearchTool, ToolManager +from session_manager import SessionManager +from vector_store import VectorStore + +@pytest.fixture +def temp_chroma_path(): + """Create temporary ChromaDB path for testing""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + shutil.rmtree(temp_dir) + + +@pytest.fixture +def test_config(temp_chroma_path): + """Create test configuration with temporary paths""" + config = Config() + config.CHROMA_PATH = temp_chroma_path + config.MAX_RESULTS = 5 # Set to reasonable value for testing + config.ANTHROPIC_API_KEY = "test-key" + return config + + +@pytest.fixture +def vector_store(test_config): + """Create vector store instance for testing""" + return VectorStore( + chroma_path=test_config.CHROMA_PATH, + embedding_model=test_config.EMBEDDING_MODEL, + max_results=test_config.MAX_RESULTS, + ) + + +@pytest.fixture +def sample_course(): + """Create sample course data for testing""" + lessons = [ + Lesson( + lesson_number=1, + title="Introduction", + lesson_link="https://example.com/lesson1", + ), + Lesson( + lesson_number=2, title="Overview", lesson_link="https://example.com/lesson2" + ), + Lesson( + lesson_number=3, + title="Advanced Topics", + lesson_link="https://example.com/lesson3", + ), + ] + return Course( + title="Test Course", + course_link="https://example.com/course", + instructor="Test Instructor", + lessons=lessons, + ) + + +@pytest.fixture +def sample_chunks(sample_course): + """Create sample course chunks for testing""" + chunks = [] + for i in range(5): + chunk = CourseChunk( + content=f"This is test content for chunk {i+1}. 
It contains information about testing.", + course_title=sample_course.title, + lesson_number=1 if i < 2 else 2, + chunk_index=i, + ) + chunks.append(chunk) + return chunks + + +@pytest.fixture +def populated_vector_store(vector_store, sample_course, sample_chunks): + """Vector store with sample data loaded""" + vector_store.add_course_metadata(sample_course) + vector_store.add_course_content(sample_chunks) + return vector_store + + +@pytest.fixture +def course_search_tool(populated_vector_store): + """CourseSearchTool with populated data""" + return CourseSearchTool(populated_vector_store) + + +@pytest.fixture +def tool_manager(course_search_tool): + """ToolManager with registered tools""" + tm = ToolManager() + tm.register_tool(course_search_tool) + return tm + + +@pytest.fixture +def ai_generator(test_config): + """AI generator instance for testing""" + return AIGenerator( + api_key=test_config.ANTHROPIC_API_KEY, model=test_config.ANTHROPIC_MODEL + ) + + +@pytest.fixture +def rag_system(test_config): + """RAG system instance for testing""" + return RAGSystem(test_config) + + +@pytest.fixture +def mock_rag_system(): + """Mock RAG system for API testing""" + mock_rag = Mock() + mock_rag.session_manager = Mock() + mock_rag.session_manager.create_session.return_value = "test-session-123" + mock_rag.session_manager.clear_session.return_value = None + mock_rag.query.return_value = ( + "Test answer from the RAG system", + [{"text": "Test source content", "link": "https://example.com/test"}] + ) + mock_rag.get_course_analytics.return_value = { + "total_courses": 2, + "course_titles": ["Test Course 1", "Test Course 2"] + } + return mock_rag + + +@pytest.fixture +def test_app(mock_rag_system): + """Create test FastAPI app without static file mounting issues""" + from fastapi import FastAPI, HTTPException + from fastapi.responses import JSONResponse + from pydantic import BaseModel + from typing import List, Optional + + app = FastAPI(title="Course Materials RAG System Test") + + # Pydantic models + class QueryRequest(BaseModel): + query: str + session_id: Optional[str] = None + + class Source(BaseModel): + text: str + link: Optional[str] = None + + class QueryResponse(BaseModel): + answer: str + sources: List[Source] + session_id: str + + class CourseStats(BaseModel): + total_courses: int + course_titles: List[str] + + # Inject mock RAG system + app.state.rag_system = mock_rag_system + + @app.post("/api/session/new") + async def create_new_session(prev_session_id: Optional[str] = None): + try: + if prev_session_id: + app.state.rag_system.session_manager.clear_session(prev_session_id) + session_id = app.state.rag_system.session_manager.create_session() + return JSONResponse(content={"session_id": session_id}) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.post("/api/query", response_model=QueryResponse) + async def query_documents(request: QueryRequest): + try: + session_id = request.session_id + if not session_id: + session_id = app.state.rag_system.session_manager.create_session() + + answer, sources = app.state.rag_system.query(request.query, session_id) + + source_objects = [] + for source in sources: + if isinstance(source, dict) and "text" in source: + source_objects.append(Source(text=source["text"], link=source.get("link"))) + else: + source_objects.append(Source(text=str(source), link=None)) + + return QueryResponse( + answer=answer, + sources=source_objects, + session_id=session_id + ) + except Exception as e: + raise HTTPException(status_code=500, 
detail=str(e)) + + @app.get("/api/courses", response_model=CourseStats) + async def get_course_stats(): + try: + analytics = app.state.rag_system.get_course_analytics() + return CourseStats( + total_courses=analytics["total_courses"], + course_titles=analytics["course_titles"] + ) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/") + async def root(): + return {"message": "Course Materials RAG System Test API"} + + return app + + +@pytest.fixture +def test_client(test_app): + """Create test client for API testing""" + return TestClient(test_app) diff --git a/backend/tests/test_ai_generator.py b/backend/tests/test_ai_generator.py new file mode 100644 index 00000000..431416a4 --- /dev/null +++ b/backend/tests/test_ai_generator.py @@ -0,0 +1,416 @@ +from unittest.mock import MagicMock, Mock, patch + +import anthropic +import pytest +from ai_generator import AIGenerator + + +class TestAIGenerator: + """Test suite for AIGenerator tool calling functionality""" + + @pytest.fixture + def mock_anthropic_client(self): + """Mock Anthropic client for testing""" + mock_client = Mock(spec=anthropic.Anthropic) + return mock_client + + @pytest.fixture + def ai_generator_with_mock(self, mock_anthropic_client, test_config): + """AI generator with mocked client""" + with patch("anthropic.Anthropic", return_value=mock_anthropic_client): + generator = AIGenerator( + api_key=test_config.ANTHROPIC_API_KEY, model=test_config.ANTHROPIC_MODEL + ) + generator.client = mock_anthropic_client + return generator + + def test_generate_response_without_tools(self, ai_generator_with_mock): + """Test basic response generation without tools""" + # Mock response + mock_response = Mock() + mock_response.content = [Mock(text="Test response")] + mock_response.stop_reason = "end_turn" + ai_generator_with_mock.client.messages.create.return_value = mock_response + + result = ai_generator_with_mock.generate_response("What is AI?") + + assert result == "Test response" + ai_generator_with_mock.client.messages.create.assert_called_once() + + def test_generate_response_with_tools_no_use( + self, ai_generator_with_mock, tool_manager + ): + """Test response generation with tools available but not used""" + # Mock response without tool use + mock_response = Mock() + mock_response.content = [Mock(text="General knowledge response")] + mock_response.stop_reason = "end_turn" + ai_generator_with_mock.client.messages.create.return_value = mock_response + + tools = tool_manager.get_tool_definitions() + result = ai_generator_with_mock.generate_response( + "What is artificial intelligence?", tools=tools, tool_manager=tool_manager + ) + + assert result == "General knowledge response" + + def test_generate_response_with_tool_use( + self, ai_generator_with_mock, tool_manager + ): + """Test response generation that triggers tool use""" + # Mock initial response with tool use + mock_tool_block = Mock() + mock_tool_block.type = "tool_use" + mock_tool_block.name = "search_course_content" + mock_tool_block.id = "tool_123" + mock_tool_block.input = {"query": "testing"} + + initial_response = Mock() + initial_response.content = [mock_tool_block] + initial_response.stop_reason = "tool_use" + + # Mock final response after tool execution + final_response = Mock() + final_response.content = [ + Mock(text="Based on the search results, testing is important.") + ] + + ai_generator_with_mock.client.messages.create.side_effect = [ + initial_response, + final_response, + ] + + tools = tool_manager.get_tool_definitions() + 
result = ai_generator_with_mock.generate_response( + "Tell me about testing", tools=tools, tool_manager=tool_manager + ) + + assert result == "Based on the search results, testing is important." + # Should be called twice - initial and final + assert ai_generator_with_mock.client.messages.create.call_count == 2 + + def test_handle_tool_execution(self, ai_generator_with_mock, tool_manager): + """Test tool execution handling""" + # Create mock tool use block + mock_tool_block = Mock() + mock_tool_block.type = "tool_use" + mock_tool_block.name = "search_course_content" + mock_tool_block.id = "tool_123" + mock_tool_block.input = {"query": "testing"} + + initial_response = Mock() + initial_response.content = [mock_tool_block] + + # Mock final response + final_response = Mock() + final_response.content = [Mock(text="Tool execution complete")] + ai_generator_with_mock.client.messages.create.return_value = final_response + + base_params = { + "messages": [{"role": "user", "content": "Tell me about testing"}], + "system": "Test system prompt", + } + + result = ai_generator_with_mock._handle_tool_execution( + initial_response, base_params, tool_manager + ) + + assert result == "Tool execution complete" + + def test_tool_execution_with_multiple_tools( + self, ai_generator_with_mock, tool_manager + ): + """Test handling multiple tool calls in one response""" + # Create multiple mock tool blocks + mock_tool_block1 = Mock() + mock_tool_block1.type = "tool_use" + mock_tool_block1.name = "search_course_content" + mock_tool_block1.id = "tool_123" + mock_tool_block1.input = {"query": "testing"} + + mock_tool_block2 = Mock() + mock_tool_block2.type = "tool_use" + mock_tool_block2.name = "search_course_content" + mock_tool_block2.id = "tool_456" + mock_tool_block2.input = {"query": "development"} + + initial_response = Mock() + initial_response.content = [mock_tool_block1, mock_tool_block2] + + final_response = Mock() + final_response.content = [Mock(text="Multiple tools executed")] + ai_generator_with_mock.client.messages.create.return_value = final_response + + base_params = { + "messages": [ + {"role": "user", "content": "Tell me about testing and development"} + ], + "system": "Test system prompt", + } + + result = ai_generator_with_mock._handle_tool_execution( + initial_response, base_params, tool_manager + ) + + assert result == "Multiple tools executed" + + def test_conversation_history_integration(self, ai_generator_with_mock): + """Test that conversation history is properly integrated""" + mock_response = Mock() + mock_response.content = [Mock(text="Response with history")] + mock_response.stop_reason = "end_turn" + ai_generator_with_mock.client.messages.create.return_value = mock_response + + history = "User: Hello\nAssistant: Hi there!" 
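+        # Descriptive comment: the history string should be folded into the system prompt, which the assertion below checks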
+ result = ai_generator_with_mock.generate_response( + "How are you?", conversation_history=history + ) + + assert result == "Response with history" + + # Verify system prompt includes history + call_args = ai_generator_with_mock.client.messages.create.call_args + system_content = call_args.kwargs["system"] + assert history in system_content + + def test_api_parameters_structure(self, ai_generator_with_mock, tool_manager): + """Test that API parameters are structured correctly""" + mock_response = Mock() + mock_response.content = [Mock(text="Test response")] + mock_response.stop_reason = "end_turn" + ai_generator_with_mock.client.messages.create.return_value = mock_response + + tools = tool_manager.get_tool_definitions() + ai_generator_with_mock.generate_response( + "Test query", tools=tools, tool_manager=tool_manager + ) + + call_args = ai_generator_with_mock.client.messages.create.call_args + params = call_args.kwargs + + # Verify required parameters + assert "model" in params + assert "messages" in params + assert "system" in params + assert "temperature" in params + assert "max_tokens" in params + + # Verify tools parameters when tools are provided + assert "tools" in params + assert "tool_choice" in params + assert params["tool_choice"]["type"] == "auto" + + def test_sequential_tool_calling_two_rounds( + self, ai_generator_with_mock, tool_manager + ): + """Test sequential tool calling across two rounds""" + # Mock tool blocks for round 1 + mock_tool_block1 = Mock() + mock_tool_block1.type = "tool_use" + mock_tool_block1.name = "get_course_outline" + mock_tool_block1.id = "tool_1" + mock_tool_block1.input = {"course_name": "MCP Basics"} + + # Mock tool blocks for round 2 + mock_tool_block2 = Mock() + mock_tool_block2.type = "tool_use" + mock_tool_block2.name = "search_course_content" + mock_tool_block2.id = "tool_2" + mock_tool_block2.input = {"query": "lesson 4 topic"} + + # Round 1 response with tool use + round1_response = Mock() + round1_response.content = [mock_tool_block1] + round1_response.stop_reason = "tool_use" + + # Round 2 response with tool use + round2_response = Mock() + round2_response.content = [mock_tool_block2] + round2_response.stop_reason = "tool_use" + + # Final response without tool use + final_response = Mock() + final_response.content = [ + Mock(text="Found courses discussing the same topic as lesson 4") + ] + + ai_generator_with_mock.client.messages.create.side_effect = [ + round1_response, + round2_response, + final_response, + ] + + tools = tool_manager.get_tool_definitions() + result = ai_generator_with_mock.generate_response( + "Find courses discussing the same topic as lesson 4 of MCP Basics", + tools=tools, + tool_manager=tool_manager, + ) + + assert result == "Found courses discussing the same topic as lesson 4" + # Should be called 3 times: round 1, round 2, final response + assert ai_generator_with_mock.client.messages.create.call_count == 3 + + def test_sequential_tool_calling_early_termination( + self, ai_generator_with_mock, tool_manager + ): + """Test that conversation stops early when Claude doesn't use tools""" + # Round 1 response without tool use + round1_response = Mock() + round1_response.content = [Mock(text="This is a general knowledge question")] + round1_response.stop_reason = "end_turn" + + ai_generator_with_mock.client.messages.create.return_value = round1_response + + tools = tool_manager.get_tool_definitions() + result = ai_generator_with_mock.generate_response( + "What is artificial intelligence?", tools=tools, 
tool_manager=tool_manager + ) + + assert result == "This is a general knowledge question" + # Should only be called once + assert ai_generator_with_mock.client.messages.create.call_count == 1 + + def test_sequential_tool_calling_max_rounds( + self, ai_generator_with_mock, tool_manager + ): + """Test that conversation stops after max rounds (2)""" + # Mock tool block + mock_tool_block = Mock() + mock_tool_block.type = "tool_use" + mock_tool_block.name = "search_course_content" + mock_tool_block.id = "tool_123" + mock_tool_block.input = {"query": "testing"} + + # Both round 1 and round 2 responses use tools + tool_response = Mock() + tool_response.content = [mock_tool_block] + tool_response.stop_reason = "tool_use" + + # Final response after max rounds + final_response = Mock() + final_response.content = [Mock(text="Final response after 2 tool rounds")] + + ai_generator_with_mock.client.messages.create.side_effect = [ + tool_response, + tool_response, + final_response, + ] + + tools = tool_manager.get_tool_definitions() + result = ai_generator_with_mock.generate_response( + "Complex query requiring multiple tools", + tools=tools, + tool_manager=tool_manager, + ) + + assert result == "Final response after 2 tool rounds" + # Should be called 3 times: 2 tool rounds + 1 final + assert ai_generator_with_mock.client.messages.create.call_count == 3 + + def test_sequential_tool_calling_with_tool_error( + self, ai_generator_with_mock, tool_manager + ): + """Test graceful handling of tool errors in sequential calling""" + # Mock tool block + mock_tool_block = Mock() + mock_tool_block.type = "tool_use" + mock_tool_block.name = "search_course_content" + mock_tool_block.id = "tool_123" + mock_tool_block.input = {"query": "testing"} + + # Round 1 with tool use + round1_response = Mock() + round1_response.content = [mock_tool_block] + round1_response.stop_reason = "tool_use" + + # Final response + final_response = Mock() + final_response.content = [Mock(text="Response despite tool error")] + + # Mock tool manager to raise exception + tool_manager.execute_tool.side_effect = Exception("Tool execution failed") + + ai_generator_with_mock.client.messages.create.side_effect = [ + round1_response, + final_response, + ] + + tools = tool_manager.get_tool_definitions() + result = ai_generator_with_mock.generate_response( + "Query with failing tools", tools=tools, tool_manager=tool_manager + ) + + assert result == "Response despite tool error" + # Should continue despite tool error + assert ai_generator_with_mock.client.messages.create.call_count == 2 + + def test_error_handling_in_tool_execution(self, ai_generator_with_mock): + """Test error handling during tool execution""" + # Create mock tool manager that raises exception + mock_tool_manager = Mock() + mock_tool_manager.execute_tool.side_effect = Exception("Tool execution failed") + + mock_tool_block = Mock() + mock_tool_block.type = "tool_use" + mock_tool_block.name = "search_course_content" + mock_tool_block.id = "tool_123" + mock_tool_block.input = {"query": "testing"} + + initial_response = Mock() + initial_response.content = [mock_tool_block] + + final_response = Mock() + final_response.content = [Mock(text="Error handled")] + ai_generator_with_mock.client.messages.create.return_value = final_response + + base_params = { + "messages": [{"role": "user", "content": "Test"}], + "system": "Test system", + } + + # This should not raise an exception + try: + result = ai_generator_with_mock._handle_tool_execution( + initial_response, base_params, mock_tool_manager 
+ ) + # If we get here, the error was handled gracefully + assert result == "Error handled" + except Exception as e: + # If an exception is raised, it should be handled properly + pytest.fail(f"Tool execution error was not handled: {e}") + + +class TestAIGeneratorIntegration: + """Integration tests for AIGenerator with real tool manager""" + + def test_real_tool_integration(self, tool_manager): + """Test AIGenerator with real tools (mocking only Anthropic API)""" + with patch("anthropic.Anthropic") as mock_anthropic: + # Setup mock responses for tool use scenario + mock_tool_block = Mock() + mock_tool_block.type = "tool_use" + mock_tool_block.name = "search_course_content" + mock_tool_block.id = "tool_123" + mock_tool_block.input = {"query": "testing"} + + initial_response = Mock() + initial_response.content = [mock_tool_block] + initial_response.stop_reason = "tool_use" + + final_response = Mock() + final_response.content = [Mock(text="Integration test complete")] + + mock_client = Mock() + mock_client.messages.create.side_effect = [initial_response, final_response] + mock_anthropic.return_value = mock_client + + generator = AIGenerator("test-key", "test-model") + tools = tool_manager.get_tool_definitions() + + result = generator.generate_response( + "Tell me about testing", tools=tools, tool_manager=tool_manager + ) + + assert result == "Integration test complete" + assert mock_client.messages.create.call_count == 2 diff --git a/backend/tests/test_api_endpoints.py b/backend/tests/test_api_endpoints.py new file mode 100644 index 00000000..0ffa104b --- /dev/null +++ b/backend/tests/test_api_endpoints.py @@ -0,0 +1,196 @@ +import pytest +import json +from fastapi.testclient import TestClient +from unittest.mock import patch + +@pytest.mark.api +class TestAPIEndpoints: + """Test suite for API endpoints""" + + def test_root_endpoint(self, test_client): + """Test the root endpoint returns correct response""" + response = test_client.get("/") + assert response.status_code == 200 + assert response.json() == {"message": "Course Materials RAG System Test API"} + + def test_create_new_session(self, test_client): + """Test creating a new session""" + response = test_client.post("/api/session/new") + assert response.status_code == 200 + data = response.json() + assert "session_id" in data + assert data["session_id"] == "test-session-123" + + def test_create_new_session_with_previous_id(self, test_client, mock_rag_system): + """Test creating a new session while clearing previous one""" + response = test_client.post("/api/session/new?prev_session_id=old-session-456") + assert response.status_code == 200 + data = response.json() + assert "session_id" in data + assert data["session_id"] == "test-session-123" + + # Verify clear_session was called with the previous session ID + mock_rag_system.session_manager.clear_session.assert_called_once_with("old-session-456") + + def test_query_endpoint_with_session_id(self, test_client): + """Test querying with provided session ID""" + query_data = { + "query": "What is machine learning?", + "session_id": "existing-session-789" + } + response = test_client.post("/api/query", json=query_data) + assert response.status_code == 200 + + data = response.json() + assert "answer" in data + assert "sources" in data + assert "session_id" in data + assert data["answer"] == "Test answer from the RAG system" + assert data["session_id"] == "existing-session-789" + assert len(data["sources"]) == 1 + assert data["sources"][0]["text"] == "Test source content" + assert 
data["sources"][0]["link"] == "https://example.com/test" + + def test_query_endpoint_without_session_id(self, test_client, mock_rag_system): + """Test querying without session ID (should create new session)""" + query_data = { + "query": "Explain neural networks" + } + response = test_client.post("/api/query", json=query_data) + assert response.status_code == 200 + + data = response.json() + assert "answer" in data + assert "sources" in data + assert "session_id" in data + assert data["session_id"] == "test-session-123" # From mock + + # Verify create_session was called + mock_rag_system.session_manager.create_session.assert_called() + + def test_query_endpoint_invalid_data(self, test_client): + """Test query endpoint with invalid data""" + # Missing required query field + response = test_client.post("/api/query", json={}) + assert response.status_code == 422 # Validation error + + def test_query_endpoint_empty_query(self, test_client): + """Test query endpoint with empty query string""" + query_data = {"query": ""} + response = test_client.post("/api/query", json=query_data) + assert response.status_code == 200 # Should still work with empty query + + def test_courses_endpoint(self, test_client): + """Test getting course statistics""" + response = test_client.get("/api/courses") + assert response.status_code == 200 + + data = response.json() + assert "total_courses" in data + assert "course_titles" in data + assert data["total_courses"] == 2 + assert data["course_titles"] == ["Test Course 1", "Test Course 2"] + + @patch('app.rag_system') + def test_query_endpoint_error_handling(self, mock_rag_patch, test_client): + """Test error handling in query endpoint""" + # Make the mock RAG system raise an exception + test_client.app.state.rag_system.query.side_effect = Exception("Test error") + + query_data = {"query": "What is AI?"} + response = test_client.post("/api/query", json=query_data) + assert response.status_code == 500 + assert "Test error" in response.json()["detail"] + + @patch('app.rag_system') + def test_courses_endpoint_error_handling(self, mock_rag_patch, test_client): + """Test error handling in courses endpoint""" + # Make the mock RAG system raise an exception + test_client.app.state.rag_system.get_course_analytics.side_effect = Exception("Analytics error") + + response = test_client.get("/api/courses") + assert response.status_code == 500 + assert "Analytics error" in response.json()["detail"] + + @patch('app.rag_system') + def test_session_endpoint_error_handling(self, mock_rag_patch, test_client): + """Test error handling in session creation endpoint""" + # Make the session manager raise an exception + test_client.app.state.rag_system.session_manager.create_session.side_effect = Exception("Session error") + + response = test_client.post("/api/session/new") + assert response.status_code == 500 + assert "Session error" in response.json()["detail"] + + def test_query_with_string_sources(self, test_client, mock_rag_system): + """Test query handling when sources are strings (legacy format)""" + # Configure mock to return string sources instead of dict sources + mock_rag_system.query.return_value = ( + "Test answer", + ["String source 1", "String source 2"] + ) + + query_data = {"query": "Test query"} + response = test_client.post("/api/query", json=query_data) + assert response.status_code == 200 + + data = response.json() + assert len(data["sources"]) == 2 + assert data["sources"][0]["text"] == "String source 1" + assert data["sources"][0]["link"] is None + assert 
data["sources"][1]["text"] == "String source 2" + assert data["sources"][1]["link"] is None + + def test_query_with_mixed_sources(self, test_client, mock_rag_system): + """Test query handling with mixed source formats""" + # Configure mock to return mixed sources + mock_rag_system.query.return_value = ( + "Test answer", + [ + {"text": "Dict source with link", "link": "https://example.com"}, + "String source without link", + {"text": "Dict source without link"} + ] + ) + + query_data = {"query": "Test query"} + response = test_client.post("/api/query", json=query_data) + assert response.status_code == 200 + + data = response.json() + assert len(data["sources"]) == 3 + assert data["sources"][0]["text"] == "Dict source with link" + assert data["sources"][0]["link"] == "https://example.com" + assert data["sources"][1]["text"] == "String source without link" + assert data["sources"][1]["link"] is None + assert data["sources"][2]["text"] == "Dict source without link" + assert data["sources"][2]["link"] is None + + def test_content_type_validation(self, test_client): + """Test that endpoints require proper content type""" + # Test with form data instead of JSON + response = test_client.post("/api/query", data={"query": "test"}) + assert response.status_code == 422 # Should fail validation + + # Test with proper JSON + response = test_client.post("/api/query", json={"query": "test"}) + assert response.status_code == 200 + + def test_response_models(self, test_client): + """Test that responses match expected Pydantic models""" + # Test query response structure + query_data = {"query": "Test query"} + response = test_client.post("/api/query", json=query_data) + data = response.json() + + required_fields = {"answer", "sources", "session_id"} + assert required_fields.issubset(data.keys()) + + # Test courses response structure + response = test_client.get("/api/courses") + data = response.json() + + required_fields = {"total_courses", "course_titles"} + assert required_fields.issubset(data.keys()) + assert isinstance(data["total_courses"], int) + assert isinstance(data["course_titles"], list) \ No newline at end of file diff --git a/backend/tests/test_rag_system.py b/backend/tests/test_rag_system.py new file mode 100644 index 00000000..9efac10e --- /dev/null +++ b/backend/tests/test_rag_system.py @@ -0,0 +1,326 @@ +import shutil +import tempfile +from unittest.mock import MagicMock, Mock, patch + +import pytest +from rag_system import RAGSystem +from vector_store import SearchResults + + +class TestRAGSystem: + """Test suite for RAG system query handling""" + + def test_rag_system_initialization(self, test_config): + """Test RAG system initializes all components""" + rag = RAGSystem(test_config) + + assert rag.config is not None + assert rag.document_processor is not None + assert rag.vector_store is not None + assert rag.ai_generator is not None + assert rag.session_manager is not None + assert rag.tool_manager is not None + assert rag.search_tool is not None + assert rag.outline_tool is not None + + def test_query_without_session(self, rag_system): + """Test query processing without session ID""" + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + mock_generate.return_value = "Test response" + + response, sources = rag_system.query("What is testing?") + + assert response == "Test response" + assert isinstance(sources, list) + mock_generate.assert_called_once() + + def test_query_with_session(self, rag_system): + """Test query processing with session ID""" + 
session_id = "test-session-123" + + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + mock_generate.return_value = "Test response with session" + + response, sources = rag_system.query("What is testing?", session_id) + + assert response == "Test response with session" + mock_generate.assert_called_once() + + # Verify conversation history was passed + call_args = mock_generate.call_args + assert call_args is not None + + def test_query_with_tool_usage(self, rag_system): + """Test query that should trigger tool usage""" + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + mock_generate.return_value = "Response based on search" + + # Mock tool manager to return some sources + with patch.object( + rag_system.tool_manager, "get_last_sources" + ) as mock_sources: + mock_sources.return_value = [ + { + "text": "Test Course - Lesson 1", + "link": "https://example.com/lesson1", + } + ] + + response, sources = rag_system.query( + "Tell me about testing in the course" + ) + + assert response == "Response based on search" + assert len(sources) == 1 + assert sources[0]["text"] == "Test Course - Lesson 1" + + def test_query_error_handling(self, rag_system): + """Test error handling during query processing""" + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + mock_generate.side_effect = Exception("AI generation failed") + + # Should handle error gracefully + with pytest.raises(Exception): + rag_system.query("What is testing?") + + def test_add_course_document_success(self, rag_system, tmp_path): + """Test successful addition of course document""" + # Create test file + test_file = tmp_path / "test_course.txt" + test_content = """Course: Test Course +Instructor: Test Teacher + +Lesson 1: Introduction +This is lesson 1 content. 
+ +Lesson 2: Advanced Topics +This is lesson 2 content.""" + + test_file.write_text(test_content) + + course, chunk_count = rag_system.add_course_document(str(test_file)) + + assert course is not None + assert chunk_count > 0 + assert course.title == "Test Course" + + def test_add_course_document_nonexistent_file(self, rag_system): + """Test adding nonexistent course document""" + course, chunk_count = rag_system.add_course_document("nonexistent_file.txt") + + assert course is None + assert chunk_count == 0 + + def test_add_course_folder_success(self, rag_system, tmp_path): + """Test successful addition of course folder""" + # Create test files + test_file1 = tmp_path / "course1.txt" + test_file1.write_text("Course: Course 1\nLesson 1: Intro\nContent here") + + test_file2 = tmp_path / "course2.txt" + test_file2.write_text("Course: Course 2\nLesson 1: Intro\nContent here") + + courses_added, chunks_added = rag_system.add_course_folder(str(tmp_path)) + + assert courses_added >= 0 # May be 0 if courses already exist + assert chunks_added >= 0 + + def test_add_course_folder_clear_existing(self, rag_system, tmp_path): + """Test adding course folder with clear existing data""" + test_file = tmp_path / "course.txt" + test_file.write_text("Course: New Course\nLesson 1: Intro\nContent here") + + courses_added, chunks_added = rag_system.add_course_folder( + str(tmp_path), clear_existing=True + ) + + assert courses_added >= 0 + assert chunks_added >= 0 + + def test_add_course_folder_nonexistent_folder(self, rag_system): + """Test adding nonexistent course folder""" + courses_added, chunks_added = rag_system.add_course_folder("nonexistent_folder") + + assert courses_added == 0 + assert chunks_added == 0 + + def test_get_course_analytics(self, rag_system): + """Test course analytics retrieval""" + analytics = rag_system.get_course_analytics() + + assert isinstance(analytics, dict) + assert "total_courses" in analytics + assert "course_titles" in analytics + assert isinstance(analytics["total_courses"], int) + assert isinstance(analytics["course_titles"], list) + + def test_session_management_integration(self, rag_system): + """Test session management integration""" + session_id = "test-session-456" + + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + mock_generate.return_value = "First response" + + # First query + rag_system.query("First question", session_id) + + mock_generate.return_value = "Second response" + + # Second query - should have history + rag_system.query("Second question", session_id) + + # Verify second call had conversation history + assert mock_generate.call_count == 2 + second_call_args = mock_generate.call_args_list[1] + assert "conversation_history" in second_call_args.kwargs + + def test_tools_integration(self, rag_system): + """Test tools are properly integrated""" + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + mock_generate.return_value = "Tool-based response" + + rag_system.query("Content-related question") + + # Verify tools were passed to AI generator + call_args = mock_generate.call_args + assert "tools" in call_args.kwargs + assert "tool_manager" in call_args.kwargs + assert call_args.kwargs["tools"] is not None + assert call_args.kwargs["tool_manager"] is not None + + def test_sources_reset_after_query(self, rag_system): + """Test that sources are reset after each query""" + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + 
mock_generate.return_value = "Response" + + # Mock tool manager to track reset calls + with patch.object(rag_system.tool_manager, "reset_sources") as mock_reset: + rag_system.query("Test question") + mock_reset.assert_called_once() + + +class TestRAGSystemConfigIssues: + """Test RAG system behavior with configuration issues""" + + def test_max_results_zero_issue(self, test_config): + """Test RAG system behavior when MAX_RESULTS is 0""" + # Set MAX_RESULTS to 0 (current issue in config) + test_config.MAX_RESULTS = 0 + rag = RAGSystem(test_config) + + with patch.object(rag.ai_generator, "generate_response") as mock_generate: + mock_generate.return_value = "No results due to config" + + response, sources = rag.query("Tell me about testing") + + # Should still work but may have issues with search + assert isinstance(response, str) + assert isinstance(sources, list) + + def test_empty_api_key(self, test_config): + """Test RAG system with empty API key""" + test_config.ANTHROPIC_API_KEY = "" + + # Should initialize without error (API calls will fail later) + rag = RAGSystem(test_config) + assert rag.ai_generator is not None + + def test_invalid_chroma_path(self, test_config): + """Test RAG system with invalid ChromaDB path""" + # Set invalid path + test_config.CHROMA_PATH = "/invalid/path/that/does/not/exist" + + # Should still initialize (ChromaDB will create the path) + rag = RAGSystem(test_config) + assert rag.vector_store is not None + + +class TestRAGSystemEndToEnd: + """End-to-end integration tests""" + + def test_complete_query_flow_mocked(self, rag_system): + """Test complete query flow with mocked external dependencies""" + session_id = "e2e-test-session" + + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + # Mock AI to return a response that would come from tool usage + mock_generate.return_value = ( + "Based on the course content, testing involves..." 
+ ) + + # Mock tool manager to return realistic sources + with patch.object( + rag_system.tool_manager, "get_last_sources" + ) as mock_sources: + mock_sources.return_value = [ + { + "text": "Test Course - Lesson 1", + "link": "https://example.com/lesson1", + }, + { + "text": "Test Course - Lesson 2", + "link": "https://example.com/lesson2", + }, + ] + + # Execute query + response, sources = rag_system.query( + "What is software testing?", session_id + ) + + # Verify response + assert isinstance(response, str) + assert len(response) > 0 + assert "testing" in response.lower() + + # Verify sources + assert isinstance(sources, list) + assert len(sources) == 2 + assert sources[0]["text"] == "Test Course - Lesson 1" + assert sources[0]["link"] == "https://example.com/lesson1" + + # Verify AI generator was called with proper parameters + call_args = mock_generate.call_args + assert call_args is not None + assert "tools" in call_args.kwargs + assert "tool_manager" in call_args.kwargs + + def test_query_failure_scenarios(self, rag_system): + """Test various query failure scenarios""" + # Test with AI generation failure + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + mock_generate.side_effect = Exception("API Error") + + with pytest.raises(Exception): + rag_system.query("Test question") + + # Test with tool execution failure + with patch.object(rag_system.tool_manager, "execute_tool") as mock_execute: + mock_execute.side_effect = Exception("Tool Error") + + with patch.object( + rag_system.ai_generator, "generate_response" + ) as mock_generate: + mock_generate.return_value = "Fallback response" + + # Should still work if AI handles tool errors gracefully + response, sources = rag_system.query("Test question") + assert isinstance(response, str) diff --git a/backend/tests/test_search_tools.py b/backend/tests/test_search_tools.py new file mode 100644 index 00000000..a03df55c --- /dev/null +++ b/backend/tests/test_search_tools.py @@ -0,0 +1,201 @@ +from unittest.mock import Mock, patch + +import pytest +from search_tools import CourseSearchTool, ToolManager +from vector_store import SearchResults + + +class TestCourseSearchTool: + """Test suite for CourseSearchTool execute method""" + + def test_execute_successful_search(self, course_search_tool): + """Test successful search execution""" + result = course_search_tool.execute("testing") + + # Should not be empty and should contain formatted results + assert result != "" + assert isinstance(result, str) + assert "Test Course" in result + + def test_execute_with_course_filter(self, course_search_tool): + """Test search with course name filter""" + result = course_search_tool.execute("testing", course_name="Test Course") + + assert result != "" + assert "Test Course" in result + + def test_execute_with_lesson_filter(self, course_search_tool): + """Test search with lesson number filter""" + result = course_search_tool.execute("testing", lesson_number=1) + + assert result != "" + assert "Lesson 1" in result + + def test_execute_with_both_filters(self, course_search_tool): + """Test search with both course and lesson filters""" + result = course_search_tool.execute( + "testing", course_name="Test Course", lesson_number=1 + ) + + assert result != "" + assert "Test Course" in result + assert "Lesson 1" in result + + def test_execute_no_results_found(self, course_search_tool): + """Test when search returns no results""" + result = course_search_tool.execute("nonexistent_topic") + + # Should return a "not found" message + 
assert "No relevant content found" in result + + def test_execute_course_not_found(self, course_search_tool): + """Test when specified course doesn't exist""" + result = course_search_tool.execute("testing", course_name="Nonexistent Course") + + # Should return course not found message + assert "No course found matching" in result + + def test_execute_stores_sources(self, course_search_tool): + """Test that execute stores sources for later retrieval""" + # Clear any existing sources + course_search_tool.last_sources = [] + + result = course_search_tool.execute("testing") + + # Should have stored sources + assert len(course_search_tool.last_sources) > 0 + + # Sources should have proper structure + for source in course_search_tool.last_sources: + assert isinstance(source, dict) + assert "text" in source + assert "link" in source + + def test_execute_with_zero_max_results(self, vector_store): + """Test behavior when max_results is 0 (current config issue)""" + # Create tool with vector store that has 0 max_results + vector_store.max_results = 0 + tool = CourseSearchTool(vector_store) + + result = tool.execute("testing") + + # With 0 max_results, should return no results + assert "No relevant content found" in result or result == "" + + def test_format_results_with_links(self, course_search_tool): + """Test that _format_results properly formats with lesson links""" + # Create mock search results + mock_results = SearchResults( + documents=["Test content 1", "Test content 2"], + metadata=[ + {"course_title": "Test Course", "lesson_number": 1}, + {"course_title": "Test Course", "lesson_number": 2}, + ], + distances=[0.1, 0.2], + ) + + formatted = course_search_tool._format_results(mock_results) + + assert "[Test Course - Lesson 1]" in formatted + assert "[Test Course - Lesson 2]" in formatted + assert "Test content 1" in formatted + assert "Test content 2" in formatted + + def test_get_tool_definition(self, course_search_tool): + """Test tool definition structure""" + definition = course_search_tool.get_tool_definition() + + assert definition["name"] == "search_course_content" + assert "description" in definition + assert "input_schema" in definition + assert "properties" in definition["input_schema"] + assert "query" in definition["input_schema"]["properties"] + assert "required" in definition["input_schema"] + assert "query" in definition["input_schema"]["required"] + + +class TestToolManager: + """Test suite for ToolManager""" + + def test_register_tool(self, course_search_tool): + """Test tool registration""" + tm = ToolManager() + tm.register_tool(course_search_tool) + + assert "search_course_content" in tm.tools + + def test_get_tool_definitions(self, tool_manager): + """Test getting all tool definitions""" + definitions = tool_manager.get_tool_definitions() + + assert len(definitions) > 0 + assert isinstance(definitions, list) + + # Should contain search tool definition + search_def = next( + (d for d in definitions if d["name"] == "search_course_content"), None + ) + assert search_def is not None + + def test_execute_tool(self, tool_manager): + """Test executing tool through manager""" + result = tool_manager.execute_tool("search_course_content", query="testing") + + assert isinstance(result, str) + assert result != "" + + def test_execute_nonexistent_tool(self, tool_manager): + """Test executing tool that doesn't exist""" + result = tool_manager.execute_tool("nonexistent_tool", query="testing") + + assert "not found" in result + + def test_get_last_sources(self, tool_manager): + """Test 
retrieving sources from last search""" + # Execute a search to generate sources + tool_manager.execute_tool("search_course_content", query="testing") + + sources = tool_manager.get_last_sources() + + assert isinstance(sources, list) + # Should have sources if search was successful + if sources: # Only test structure if sources exist + for source in sources: + assert isinstance(source, dict) + assert "text" in source + + def test_reset_sources(self, tool_manager): + """Test resetting sources""" + # Execute a search to generate sources + tool_manager.execute_tool("search_course_content", query="testing") + + # Reset sources + tool_manager.reset_sources() + + sources = tool_manager.get_last_sources() + assert len(sources) == 0 + + +class TestSearchToolsIntegration: + """Integration tests for search tools""" + + def test_end_to_end_search_flow(self, tool_manager): + """Test complete search flow from tool manager to results""" + # Execute search + result = tool_manager.execute_tool( + "search_course_content", query="testing", course_name="Test Course" + ) + + # Verify result + assert isinstance(result, str) + assert result != "" + + # Verify sources were captured + sources = tool_manager.get_last_sources() + assert isinstance(sources, list) + + # If sources exist, verify structure + if sources: + for source in sources: + assert "text" in source + assert "link" in source diff --git a/backend/tests/test_vector_store.py b/backend/tests/test_vector_store.py new file mode 100644 index 00000000..e506bd97 --- /dev/null +++ b/backend/tests/test_vector_store.py @@ -0,0 +1,203 @@ +import pytest +from models import Course, CourseChunk, Lesson +from vector_store import SearchResults, VectorStore + + +class TestVectorStore: + """Test suite for VectorStore search functionality""" + + def test_search_with_normal_max_results(self, vector_store): + """Test search with normal max_results setting""" + # Ensure max_results is set to a reasonable value + vector_store.max_results = 5 + + # Add some test data + course = Course( + title="Search Test Course", + lessons=[Lesson(lesson_number=1, title="Test Lesson")], + ) + chunks = [ + CourseChunk( + content="This is test content about testing", + course_title="Search Test Course", + lesson_number=1, + chunk_index=0, + ) + ] + + vector_store.add_course_metadata(course) + vector_store.add_course_content(chunks) + + # Perform search + results = vector_store.search("testing") + + assert not results.is_empty() + assert len(results.documents) > 0 + assert results.error is None + + def test_search_with_zero_max_results(self, vector_store): + """Test search with zero max_results (current config issue)""" + # Set max_results to 0 to simulate the config issue + vector_store.max_results = 0 + + # Add some test data + course = Course( + title="Zero Results Test Course", + lessons=[Lesson(lesson_number=1, title="Test Lesson")], + ) + chunks = [ + CourseChunk( + content="This is test content", + course_title="Zero Results Test Course", + lesson_number=1, + chunk_index=0, + ) + ] + + vector_store.add_course_metadata(course) + vector_store.add_course_content(chunks) + + # Perform search - should return empty results due to limit=0 + results = vector_store.search("test") + + # With max_results=0, search should return no results + assert results.is_empty() or len(results.documents) == 0 + + def test_search_with_explicit_limit(self, vector_store): + """Test search with explicitly provided limit""" + # Add test data + course = Course( + title="Explicit Limit Test Course", + 
lessons=[Lesson(lesson_number=1, title="Test Lesson")], + ) + chunks = [ + CourseChunk( + content=f"Test content {i}", + course_title="Explicit Limit Test Course", + lesson_number=1, + chunk_index=i, + ) + for i in range(10) + ] + + vector_store.add_course_metadata(course) + vector_store.add_course_content(chunks) + + # Search with explicit limit should override max_results + results = vector_store.search("test", limit=3) + + assert not results.is_empty() + assert len(results.documents) <= 3 + + def test_search_course_name_resolution(self, populated_vector_store): + """Test course name resolution in search""" + results = populated_vector_store.search("test", course_name="Test Course") + + # Should find the course and return results + if not results.is_empty(): + for metadata in results.metadata: + assert metadata.get("course_title") == "Test Course" + + def test_search_nonexistent_course(self, populated_vector_store): + """Test search for nonexistent course""" + results = populated_vector_store.search( + "test", course_name="Nonexistent Course" + ) + + # Should return error about course not found + assert results.error is not None + assert "No course found matching" in results.error + + def test_search_with_lesson_filter(self, populated_vector_store): + """Test search with lesson number filter""" + results = populated_vector_store.search("test", lesson_number=1) + + if not results.is_empty(): + for metadata in results.metadata: + assert metadata.get("lesson_number") == 1 + + def test_get_existing_course_titles(self, populated_vector_store): + """Test getting existing course titles""" + titles = populated_vector_store.get_existing_course_titles() + + assert isinstance(titles, list) + assert "Test Course" in titles + + def test_get_course_count(self, populated_vector_store): + """Test getting course count""" + count = populated_vector_store.get_course_count() + + assert isinstance(count, int) + assert count >= 1 # At least the test course + + def test_get_lesson_link(self, populated_vector_store): + """Test getting lesson link""" + link = populated_vector_store.get_lesson_link("Test Course", 1) + + # Should return the link or None + assert link is None or isinstance(link, str) + + def test_clear_all_data(self, vector_store): + """Test clearing all vector store data""" + # Add some data first + course = Course(title="Temp Course", lessons=[]) + vector_store.add_course_metadata(course) + + # Verify data exists + count_before = vector_store.get_course_count() + assert count_before > 0 + + # Clear data + vector_store.clear_all_data() + + # Verify data is cleared + count_after = vector_store.get_course_count() + assert count_after == 0 + + +class TestSearchResults: + """Test SearchResults utility class""" + + def test_from_chroma_with_results(self): + """Test creating SearchResults from ChromaDB results""" + chroma_results = { + "documents": [["doc1", "doc2"]], + "metadatas": [[{"key": "value1"}, {"key": "value2"}]], + "distances": [[0.1, 0.2]], + } + + results = SearchResults.from_chroma(chroma_results) + + assert len(results.documents) == 2 + assert len(results.metadata) == 2 + assert len(results.distances) == 2 + assert results.error is None + + def test_from_chroma_empty_results(self): + """Test creating SearchResults from empty ChromaDB results""" + chroma_results = {"documents": [[]], "metadatas": [[]], "distances": [[]]} + + results = SearchResults.from_chroma(chroma_results) + + assert len(results.documents) == 0 + assert len(results.metadata) == 0 + assert len(results.distances) == 0 + 
assert results.error is None + + def test_empty_with_error(self): + """Test creating empty SearchResults with error""" + results = SearchResults.empty("Test error message") + + assert results.is_empty() + assert results.error == "Test error message" + assert len(results.documents) == 0 + + def test_is_empty_method(self): + """Test is_empty method""" + # Empty results + empty_results = SearchResults([], [], []) + assert empty_results.is_empty() + + # Non-empty results + non_empty_results = SearchResults(["doc"], [{"key": "value"}], [0.1]) + assert not non_empty_results.is_empty() diff --git a/backend/vector_store.py b/backend/vector_store.py index 390abe71..972110ec 100644 --- a/backend/vector_store.py +++ b/backend/vector_store.py @@ -1,267 +1,327 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + import chromadb from chromadb.config import Settings -from typing import List, Dict, Any, Optional -from dataclasses import dataclass -from models import Course, CourseChunk +from logging_config import get_logger +from models import Document, DocumentChunk from sentence_transformers import SentenceTransformer +logger = get_logger(__name__) + + @dataclass class SearchResults: """Container for search results with metadata""" + documents: List[str] metadata: List[Dict[str, Any]] distances: List[float] error: Optional[str] = None - + @classmethod - def from_chroma(cls, chroma_results: Dict) -> 'SearchResults': + def from_chroma(cls, chroma_results: Dict) -> "SearchResults": """Create SearchResults from ChromaDB query results""" return cls( - documents=chroma_results['documents'][0] if chroma_results['documents'] else [], - metadata=chroma_results['metadatas'][0] if chroma_results['metadatas'] else [], - distances=chroma_results['distances'][0] if chroma_results['distances'] else [] + documents=( + chroma_results["documents"][0] if chroma_results["documents"] else [] + ), + metadata=( + chroma_results["metadatas"][0] if chroma_results["metadatas"] else [] + ), + distances=( + chroma_results["distances"][0] if chroma_results["distances"] else [] + ), ) - + @classmethod - def empty(cls, error_msg: str) -> 'SearchResults': + def empty(cls, error_msg: str) -> "SearchResults": """Create empty results with error message""" return cls(documents=[], metadata=[], distances=[], error=error_msg) - + def is_empty(self) -> bool: """Check if results are empty""" return len(self.documents) == 0 + class VectorStore: - """Vector storage using ChromaDB for course content and metadata""" - + """Vector storage using ChromaDB for document content and metadata""" + def __init__(self, chroma_path: str, embedding_model: str, max_results: int = 5): + logger.info( + f"Initializing VectorStore - path: {chroma_path}, model: {embedding_model}, max_results: {max_results}" + ) self.max_results = max_results + # Initialize ChromaDB client + logger.debug("Setting up ChromaDB client") self.client = chromadb.PersistentClient( - path=chroma_path, - settings=Settings(anonymized_telemetry=False) + path=chroma_path, settings=Settings(anonymized_telemetry=False) ) - + # Set up sentence transformer embedding function - self.embedding_function = chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction( - model_name=embedding_model + logger.debug(f"Loading embedding model: {embedding_model}") + self.embedding_function = ( + chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction( + model_name=embedding_model + ) ) - + # Create collections for different types of data - 
self.course_catalog = self._create_collection("course_catalog") # Course titles/instructors - self.course_content = self._create_collection("course_content") # Actual course material - + logger.debug("Creating/accessing ChromaDB collections") + self.document_catalog = self._create_collection( + "document_catalog" + ) # Document titles/instructors + self.document_content = self._create_collection( + "document_content" + ) # Actual document material + logger.info("VectorStore initialization complete") + def _create_collection(self, name: str): """Create or get a ChromaDB collection""" return self.client.get_or_create_collection( - name=name, - embedding_function=self.embedding_function + name=name, embedding_function=self.embedding_function ) - - def search(self, - query: str, - course_name: Optional[str] = None, - lesson_number: Optional[int] = None, - limit: Optional[int] = None) -> SearchResults: + + def search( + self, + query: str, + document_name: Optional[str] = None, + section_number: Optional[int] = None, + limit: Optional[int] = None, + ) -> SearchResults: """ - Main search interface that handles course resolution and content search. - + Main search interface that handles document resolution and content search. + Args: - query: What to search for in course content - course_name: Optional course name/title to filter by - lesson_number: Optional lesson number to filter by + query: What to search for in document content + document_name: Optional document name/title to filter by + section_number: Optional section number to filter by limit: Maximum results to return - + Returns: SearchResults object with documents and metadata """ - # Step 1: Resolve course name if provided - course_title = None - if course_name: - course_title = self._resolve_course_name(course_name) - if not course_title: - return SearchResults.empty(f"No course found matching '{course_name}'") - + # Step 1: Resolve document name if provided + document_title = None + if document_name: + document_title = self._resolve_document_name(document_name) + if not document_title: + return SearchResults.empty(f"No document found matching '{document_name}'") + # Step 2: Build filter for content search - filter_dict = self._build_filter(course_title, lesson_number) - - # Step 3: Search course content - # Use provided limit or fall back to configured max_results + filter_dict = self._build_filter(document_title, section_number) + + # Step 3: Search document content + # Use provided limit or fall back to configured max_results, ensure it's at least 1 search_limit = limit if limit is not None else self.max_results - + if search_limit <= 0: + search_limit = 5 # Fallback to reasonable default + try: - results = self.course_content.query( - query_texts=[query], - n_results=search_limit, - where=filter_dict + results = self.document_content.query( + query_texts=[query], n_results=search_limit, where=filter_dict ) return SearchResults.from_chroma(results) except Exception as e: return SearchResults.empty(f"Search error: {str(e)}") - - def _resolve_course_name(self, course_name: str) -> Optional[str]: - """Use vector search to find best matching course by name""" + + def _resolve_document_name(self, document_name: str) -> Optional[str]: + """Use vector search to find best matching document by name""" try: - results = self.course_catalog.query( - query_texts=[course_name], - n_results=1 - ) - - if results['documents'][0] and results['metadatas'][0]: + results = self.document_catalog.query(query_texts=[document_name], n_results=1) + + if 
results["documents"][0] and results["metadatas"][0]: # Return the title (which is now the ID) - return results['metadatas'][0][0]['title'] + return results["metadatas"][0][0]["title"] except Exception as e: - print(f"Error resolving course name: {e}") - + print(f"Error resolving document name: {e}") + return None - - def _build_filter(self, course_title: Optional[str], lesson_number: Optional[int]) -> Optional[Dict]: + + def _build_filter( + self, document_title: Optional[str], section_number: Optional[int] + ) -> Optional[Dict]: """Build ChromaDB filter from search parameters""" - if not course_title and lesson_number is None: + if not document_title and section_number is None: return None - + # Handle different filter combinations - if course_title and lesson_number is not None: - return {"$and": [ - {"course_title": course_title}, - {"lesson_number": lesson_number} - ]} - - if course_title: - return {"course_title": course_title} - - return {"lesson_number": lesson_number} - - def add_course_metadata(self, course: Course): - """Add course information to the catalog for semantic search""" + if document_title and section_number is not None: + return { + "$and": [ + {"document_title": document_title}, + {"section_number": section_number}, + ] + } + + if document_title: + return {"document_title": document_title} + + return {"section_number": section_number} + + def add_document_metadata(self, document: Document): + """Add document information to the catalog for semantic search""" import json - course_text = course.title - - # Build lessons metadata and serialize as JSON string - lessons_metadata = [] - for lesson in course.lessons: - lessons_metadata.append({ - "lesson_number": lesson.lesson_number, - "lesson_title": lesson.title, - "lesson_link": lesson.lesson_link - }) - - self.course_catalog.add( - documents=[course_text], - metadatas=[{ - "title": course.title, - "instructor": course.instructor, - "course_link": course.course_link, - "lessons_json": json.dumps(lessons_metadata), # Serialize as JSON string - "lesson_count": len(course.lessons) - }], - ids=[course.title] + logger.info(f"Adding document metadata to catalog: '{document.title}'") + document_text = document.title + + # Build sections metadata and serialize as JSON string + sections_metadata = [] + for section in document.sections: + sections_metadata.append( + { + "section_number": section.section_number, + "section_title": section.title, + "section_link": section.section_link, + } + ) + + sections_with_links = sum(1 for section in document.sections if section.section_link) + logger.debug( + f"Document '{document.title}': {len(document.sections)} sections, {sections_with_links} with links" + ) + + metadata = { + "title": document.title, + "instructor": document.instructor, + "document_link": document.document_link, + "sections_json": json.dumps(sections_metadata), # Serialize as JSON string + "section_count": len(document.sections), + } + + logger.debug(f"Adding to document_catalog collection: id='{document.title}'") + self.document_catalog.add( + documents=[document_text], metadatas=[metadata], ids=[document.title] ) - - def add_course_content(self, chunks: List[CourseChunk]): - """Add course content chunks to the vector store""" + logger.info(f"Successfully added document metadata for: '{document.title}'") + + def add_document_content(self, chunks: List[DocumentChunk]): + """Add document content chunks to the vector store""" if not chunks: + logger.debug("No chunks to add to document content") return - + + 
logger.info(f"Adding {len(chunks)} content chunks to vector store") + document_title = chunks[0].document_title if chunks else "Unknown" + logger.debug(f"Document: '{document_title}' - Processing {len(chunks)} chunks") + documents = [chunk.content for chunk in chunks] - metadatas = [{ - "course_title": chunk.course_title, - "lesson_number": chunk.lesson_number, - "chunk_index": chunk.chunk_index - } for chunk in chunks] + metadatas = [ + { + "document_title": chunk.document_title, + "section_number": chunk.section_number, + "chunk_index": chunk.chunk_index, + } + for chunk in chunks + ] # Use title with chunk index for unique IDs - ids = [f"{chunk.course_title.replace(' ', '_')}_{chunk.chunk_index}" for chunk in chunks] - - self.course_content.add( - documents=documents, - metadatas=metadatas, - ids=ids + ids = [ + f"{chunk.document_title.replace(' ', '_')}_{chunk.chunk_index}" + for chunk in chunks + ] + + logger.debug( + f"Adding chunks to document_content collection with IDs: {ids[0]}...{ids[-1]}" + ) + self.document_content.add(documents=documents, metadatas=metadatas, ids=ids) + logger.info( + f"Successfully added {len(chunks)} chunks for document: '{document_title}'" ) - + def clear_all_data(self): """Clear all data from both collections""" + logger.warning("Clearing all data from vector store collections") try: - self.client.delete_collection("course_catalog") - self.client.delete_collection("course_content") + logger.debug("Deleting document_catalog collection") + self.client.delete_collection("document_catalog") + logger.debug("Deleting document_content collection") + self.client.delete_collection("document_content") # Recreate collections - self.course_catalog = self._create_collection("course_catalog") - self.course_content = self._create_collection("course_content") + logger.debug("Recreating collections") + self.document_catalog = self._create_collection("document_catalog") + self.document_content = self._create_collection("document_content") + logger.info("Successfully cleared and recreated all collections") except Exception as e: - print(f"Error clearing data: {e}") - - def get_existing_course_titles(self) -> List[str]: - """Get all existing course titles from the vector store""" + logger.error(f"Error clearing data: {e}") + raise + + def get_existing_document_titles(self) -> List[str]: + """Get all existing document titles from the vector store""" try: # Get all documents from the catalog - results = self.course_catalog.get() - if results and 'ids' in results: - return results['ids'] + results = self.document_catalog.get() + if results and "ids" in results: + return results["ids"] return [] except Exception as e: - print(f"Error getting existing course titles: {e}") + print(f"Error getting existing document titles: {e}") return [] - - def get_course_count(self) -> int: - """Get the total number of courses in the vector store""" + + def get_document_count(self) -> int: + """Get the total number of documents in the vector store""" try: - results = self.course_catalog.get() - if results and 'ids' in results: - return len(results['ids']) + results = self.document_catalog.get() + if results and "ids" in results: + return len(results["ids"]) return 0 except Exception as e: - print(f"Error getting course count: {e}") + print(f"Error getting document count: {e}") return 0 - - def get_all_courses_metadata(self) -> List[Dict[str, Any]]: - """Get metadata for all courses in the vector store""" + + def get_all_documents_metadata(self) -> List[Dict[str, Any]]: + """Get metadata for all 
documents in the vector store""" import json + try: - results = self.course_catalog.get() - if results and 'metadatas' in results: - # Parse lessons JSON for each course + results = self.document_catalog.get() + if results and "metadatas" in results: + # Parse sections JSON for each document parsed_metadata = [] - for metadata in results['metadatas']: - course_meta = metadata.copy() - if 'lessons_json' in course_meta: - course_meta['lessons'] = json.loads(course_meta['lessons_json']) - del course_meta['lessons_json'] # Remove the JSON string version - parsed_metadata.append(course_meta) + for metadata in results["metadatas"]: + document_meta = metadata.copy() + if "sections_json" in document_meta: + document_meta["sections"] = json.loads(document_meta["sections_json"]) + del document_meta[ + "sections_json" + ] # Remove the JSON string version + parsed_metadata.append(document_meta) return parsed_metadata return [] except Exception as e: - print(f"Error getting courses metadata: {e}") + print(f"Error getting documents metadata: {e}") return [] - def get_course_link(self, course_title: str) -> Optional[str]: - """Get course link for a given course title""" + def get_document_link(self, document_title: str) -> Optional[str]: + """Get document link for a given document title""" try: - # Get course by ID (title is the ID) - results = self.course_catalog.get(ids=[course_title]) - if results and 'metadatas' in results and results['metadatas']: - metadata = results['metadatas'][0] - return metadata.get('course_link') + # Get document by ID (title is the ID) + results = self.document_catalog.get(ids=[document_title]) + if results and "metadatas" in results and results["metadatas"]: + metadata = results["metadatas"][0] + return metadata.get("document_link") return None except Exception as e: - print(f"Error getting course link: {e}") + print(f"Error getting document link: {e}") return None - - def get_lesson_link(self, course_title: str, lesson_number: int) -> Optional[str]: - """Get lesson link for a given course title and lesson number""" + + def get_section_link(self, document_title: str, section_number: int) -> Optional[str]: + """Get section link for a given document title and section number""" import json + try: - # Get course by ID (title is the ID) - results = self.course_catalog.get(ids=[course_title]) - if results and 'metadatas' in results and results['metadatas']: - metadata = results['metadatas'][0] - lessons_json = metadata.get('lessons_json') - if lessons_json: - lessons = json.loads(lessons_json) - # Find the lesson with matching number - for lesson in lessons: - if lesson.get('lesson_number') == lesson_number: - return lesson.get('lesson_link') + # Get document by ID (title is the ID) + results = self.document_catalog.get(ids=[document_title]) + if results and "metadatas" in results and results["metadatas"]: + metadata = results["metadatas"][0] + sections_json = metadata.get("sections_json") + if sections_json: + sections = json.loads(sections_json) + # Find the section with matching number + for section in sections: + if section.get("section_number") == section_number: + return section.get("section_link") return None except Exception as e: - print(f"Error getting lesson link: {e}") - \ No newline at end of file + print(f"Error getting section link: {e}") diff --git a/check-quality.ps1 b/check-quality.ps1 new file mode 100644 index 00000000..8d06e527 --- /dev/null +++ b/check-quality.ps1 @@ -0,0 +1,53 @@ +# Quality Check Script (Dry-run) +# Checks code quality without making 
changes + +Write-Host "Running quality checks (dry-run)..." -ForegroundColor Green + +# Check if we're in the right directory +if (-not (Test-Path "pyproject.toml")) { + Write-Host "Error: pyproject.toml not found. Please run from project root." -ForegroundColor Red + exit 1 +} + +# Check import sorting +Write-Host "1. Checking import order..." -ForegroundColor Yellow +uv run isort backend/ main.py --check-only --diff +$isort_result = $LASTEXITCODE + +# Check code formatting +Write-Host "`n2. Checking code formatting..." -ForegroundColor Yellow +uv run black backend/ main.py --check --diff +$black_result = $LASTEXITCODE + +# Check code style +Write-Host "`n3. Checking code style..." -ForegroundColor Yellow +uv run flake8 backend/ main.py --max-line-length=88 --extend-ignore=E203,W503 +$flake8_result = $LASTEXITCODE + +# Summary +Write-Host "`n--- Quality Check Summary ---" -ForegroundColor Blue +if ($isort_result -eq 0) { + Write-Host "✓ Import order: PASSED" -ForegroundColor Green +} else { + Write-Host "✗ Import order: NEEDS ATTENTION" -ForegroundColor Red +} + +if ($black_result -eq 0) { + Write-Host "✓ Code formatting: PASSED" -ForegroundColor Green +} else { + Write-Host "✗ Code formatting: NEEDS ATTENTION" -ForegroundColor Red +} + +if ($flake8_result -eq 0) { + Write-Host "✓ Code style: PASSED" -ForegroundColor Green +} else { + Write-Host "⚠ Code style: HAS WARNINGS" -ForegroundColor Yellow +} + +if ($isort_result -eq 0 -and $black_result -eq 0) { + Write-Host "`nAll quality checks passed!" -ForegroundColor Green + exit 0 +} else { + Write-Host "`nRun format.ps1 to fix formatting issues." -ForegroundColor Yellow + exit 1 +} \ No newline at end of file diff --git a/dl-ragchatbot.code-workspace b/dl-ragchatbot.code-workspace new file mode 100644 index 00000000..2584f714 --- /dev/null +++ b/dl-ragchatbot.code-workspace @@ -0,0 +1,32 @@ +{ + "folders": [ + { + "name": "RAG Chatbot", + "path": "." 
+ } + ], + "settings": { + "python.defaultInterpreterPath": "./.venv/Scripts/python.exe", + "python.terminal.activateEnvironment": true, + "terminal.integrated.cwd": "${workspaceFolder}", + "files.associations": { + "*.txt": "plaintext", + "*.pdf": "pdf" + }, + "search.exclude": { + "**/chroma_db/**": true, + "**/.uv/**": true, + "**/uv.lock": true + } + }, + "extensions": { + "recommendations": [ + "ms-python.python", + "ms-python.flake8", + "ms-python.black-formatter", + "ms-python.isort", + "ms-toolsai.jupyter", + "humao.rest-client" + ] + } +} diff --git a/docs/on_hold/data_analysis_data_visualization_report.docx b/docs/on_hold/data_analysis_data_visualization_report.docx new file mode 100644 index 00000000..c0886e1f Binary files /dev/null and b/docs/on_hold/data_analysis_data_visualization_report.docx differ diff --git a/docs/on_hold/data_visualization_methodology.docx b/docs/on_hold/data_visualization_methodology.docx new file mode 100644 index 00000000..c8bbd699 Binary files /dev/null and b/docs/on_hold/data_visualization_methodology.docx differ diff --git a/docs/on_hold/q3_data_visualization_analysis.docx b/docs/on_hold/q3_data_visualization_analysis.docx new file mode 100644 index 00000000..da2fb061 Binary files /dev/null and b/docs/on_hold/q3_data_visualization_analysis.docx differ diff --git a/extensions.json b/extensions.json new file mode 100644 index 00000000..bbfc673b --- /dev/null +++ b/extensions.json @@ -0,0 +1,15 @@ +{ + "recommendations": [ + "ms-python.python", + "ms-python.flake8", + "ms-python.black-formatter", + "ms-python.isort", + "ms-toolsai.jupyter", + "ms-vscode.vscode-json", + "bradlc.vscode-tailwindcss", + "esbenp.prettier-vscode", + "ms-vscode.live-server", + "humao.rest-client", + "ms-python.debugpy" + ] +} \ No newline at end of file diff --git a/format.ps1 b/format.ps1 new file mode 100644 index 00000000..5ced5bb1 --- /dev/null +++ b/format.ps1 @@ -0,0 +1,36 @@ +# Code Quality Check Script +# Runs formatting and linting tools for the RAG chatbot project + +Write-Host "Running code quality checks..." -ForegroundColor Green + +# Check if we're in the right directory +if (-not (Test-Path "pyproject.toml")) { + Write-Host "Error: pyproject.toml not found. Please run from project root." -ForegroundColor Red + exit 1 +} + +# Run isort to sort imports +Write-Host "1. Sorting imports with isort..." -ForegroundColor Yellow +uv run isort backend/ main.py +if ($LASTEXITCODE -ne 0) { + Write-Host "Error: isort failed" -ForegroundColor Red + exit 1 +} + +# Run black to format code +Write-Host "2. Formatting code with black..." -ForegroundColor Yellow +uv run black backend/ main.py +if ($LASTEXITCODE -ne 0) { + Write-Host "Error: black formatting failed" -ForegroundColor Red + exit 1 +} + +# Run flake8 to check for style issues +Write-Host "3. Checking code style with flake8..." -ForegroundColor Yellow +uv run flake8 backend/ main.py --max-line-length=88 --extend-ignore=E203,W503 +if ($LASTEXITCODE -ne 0) { + Write-Host "Warning: flake8 found style issues" -ForegroundColor Yellow + # Don't exit on flake8 warnings, just warn +} + +Write-Host "Code quality checks completed!" 
-ForegroundColor Green \ No newline at end of file diff --git a/frontend-changes.md b/frontend-changes.md new file mode 100644 index 00000000..1f71263c --- /dev/null +++ b/frontend-changes.md @@ -0,0 +1,30 @@ +# Frontend Changes: Dark/Light Theme Toggle + +## Overview +Added a dark/light theme toggle button to the Course Materials Assistant interface, allowing users to switch between dark and light themes with smooth transitions and persistent theme preferences. + +## Files Modified + +### 1. `frontend/index.html` +- **Header Structure Updated**: Modified the header to include a theme toggle button positioned in the top-right corner +- **Theme Toggle Button**: Added a button with sun/moon SVG icons that switch visibility based on the current theme +- **Accessibility**: Included proper ARIA labels for screen reader support + +### 2. `frontend/style.css` +- **Light Theme Variables**: Added comprehensive light theme CSS custom properties +- **Theme Toggle Button Styles**: Responsive hover and focus states with smooth animations +- **Smooth Transitions**: Added global transition properties for seamless theme switching +- **Header Display**: Changed header from hidden to flex layout + +### 3. `frontend/script.js` +- **Theme Management Functions**: Complete theme switching logic with localStorage persistence +- **Event Listeners**: Added click and keyboard event listeners for accessibility +- **Initialization**: Loads saved theme preference on page load + +## Features Implemented +- ✅ Icon-based toggle button with sun/moon SVG icons +- ✅ Smooth 0.3s transitions between themes +- ✅ localStorage persistence for user preferences +- ✅ Full keyboard accessibility support +- ✅ WCAG compliant light and dark themes +EOF < /dev/null diff --git a/frontend/index.html b/frontend/index.html index f8e25a62..614148c6 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -1,81 +1,138 @@ -
-Ask questions about courses, instructors, and content
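The frontend/index.html and frontend/script.js hunks are truncated in this excerpt, so the theme-toggle code itself is not visible. As a rough illustration of the behavior summarized in frontend-changes.md (header toggle button, localStorage persistence, keyboard accessibility), here is a minimal sketch; the element id "themeToggle", the "data-theme" attribute, and the "theme" storage key are assumptions for illustration, not taken from the actual script.js in this PR.

// Minimal sketch of the theme toggle described in frontend-changes.md.
// The element id, data attribute, and storage key below are assumed names.
const THEME_KEY = 'theme';

function applyTheme(theme) {
    // Expose the theme as a data attribute so CSS custom properties can switch on it.
    document.documentElement.setAttribute('data-theme', theme);
    localStorage.setItem(THEME_KEY, theme);
}

function toggleTheme() {
    const current = document.documentElement.getAttribute('data-theme') || 'dark';
    applyTheme(current === 'dark' ? 'light' : 'dark');
}

document.addEventListener('DOMContentLoaded', () => {
    // Restore the saved preference on load, defaulting to the dark theme.
    applyTheme(localStorage.getItem(THEME_KEY) || 'dark');

    const button = document.getElementById('themeToggle');
    if (!button) return;

    button.addEventListener('click', toggleTheme);
    // Keyboard accessibility: Enter and Space also toggle the theme.
    button.addEventListener('keydown', (event) => {
        if (event.key === 'Enter' || event.key === ' ') {
            event.preventDefault();
            toggleTheme();
        }
    });
});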