From c69202453a88b1adc7f7779c084d0975c98b9366 Mon Sep 17 00:00:00 2001
From: nacharki
Date: Wed, 6 Aug 2025 11:05:30 +0200
Subject: [PATCH] Using ZeroEntropy for Semantic Search

---
 README.md | 2 +
 .../.env.example | 7 +
 .../semantic_search_over_articles/.gitignore | 90 ++++
 .../semantic_search_over_articles/Dockerfile | 47 ++
 .../semantic_search_over_articles/README.md | 112 ++++
 .../backend/__main__.py | 220 ++++++++
 .../backend/indexer_ze.py | 178 +++++++
 .../backend/logger/__init__.py | 1 +
 .../backend/logger/logging.py | 121 +++++
 .../backend/search_ze.py | 359 +++++++++++++
 .../backend/utils_ze.py | 371 ++++++++++++++
 .../demo_notebook.ipynb | 478 ++++++++++++++++++
 .../docker-compose.yml | 38 ++
 .../frontend/streamlit_app_ze.py | 352 +++++++++++++
 14 files changed, 2376 insertions(+)
 create mode 100644 guides/semantic_search_over_articles/.env.example
 create mode 100644 guides/semantic_search_over_articles/.gitignore
 create mode 100644 guides/semantic_search_over_articles/Dockerfile
 create mode 100644 guides/semantic_search_over_articles/README.md
 create mode 100644 guides/semantic_search_over_articles/backend/__main__.py
 create mode 100644 guides/semantic_search_over_articles/backend/indexer_ze.py
 create mode 100644 guides/semantic_search_over_articles/backend/logger/__init__.py
 create mode 100644 guides/semantic_search_over_articles/backend/logger/logging.py
 create mode 100644 guides/semantic_search_over_articles/backend/search_ze.py
 create mode 100644 guides/semantic_search_over_articles/backend/utils_ze.py
 create mode 100644 guides/semantic_search_over_articles/demo_notebook.ipynb
 create mode 100644 guides/semantic_search_over_articles/docker-compose.yml
 create mode 100644 guides/semantic_search_over_articles/frontend/streamlit_app_ze.py

diff --git a/README.md b/README.md
index 2330694..77c84da 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,8 @@ As of now, the guides in this cookbook are written in Python, but the same conce
 Learn how to use ZeroEntropy as an Agent's search tool to access a knowledge base when responding to user queries.
 8. **[Use LlamaParse in combination with ZeroEntropy to search and rerank PDFs](guides/rerank_llamaparsed_pdfs)**
 Learn how to use ZeroEntropy and LlamaParse to parse, search, and rerank complex PDF documents.
+9. **[Use ZeroEntropy for Semantic Search over articles (French Gossip Media)](guides/semantic_search_over_articles)**
+ Learn how to use ZeroEntropy in a semantic search RAG to scrape, index, search, and rerank media articles.
*(More guides coming soon...)* diff --git a/guides/semantic_search_over_articles/.env.example b/guides/semantic_search_over_articles/.env.example new file mode 100644 index 0000000..f8497ac --- /dev/null +++ b/guides/semantic_search_over_articles/.env.example @@ -0,0 +1,7 @@ +# AI Provider Configuration +OPENAI_API_KEY="" +ZEROENTROPY_API_KEY="" + +# Environment +ENVIRONMENT=development +EOF < /dev/null diff --git a/guides/semantic_search_over_articles/.gitignore b/guides/semantic_search_over_articles/.gitignore new file mode 100644 index 0000000..2499ae3 --- /dev/null +++ b/guides/semantic_search_over_articles/.gitignore @@ -0,0 +1,90 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt +*.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +.xml + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyPI configuration file +.pypirc diff --git a/guides/semantic_search_over_articles/Dockerfile b/guides/semantic_search_over_articles/Dockerfile new file mode 100644 index 0000000..2f53512 --- /dev/null +++ b/guides/semantic_search_over_articles/Dockerfile @@ -0,0 +1,47 @@ +# Set python version +FROM python:3.10-slim + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONPATH=/app + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements file +COPY requirements.txt . 
+
+# Install Python dependencies using uv
+RUN pip install --no-cache-dir uv
+RUN uv pip install --system --no-cache -r requirements.txt
+
+# Copy application code
+COPY backend/ ./backend/
+COPY frontend/ ./frontend/
+
+# Copy environment files (if any)
+COPY .env* ./
+
+# Create necessary directories
+RUN mkdir -p /app/data /app/logs
+
+# Set proper permissions
+RUN chmod -R 755 /app
+
+# Expose Streamlit port
+EXPOSE 8501
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8501/_stcore/health || exit 1
+
+# Set the entrypoint to run Streamlit
+CMD ["streamlit", "run", "frontend/streamlit_app_ze.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true"]
\ No newline at end of file
diff --git a/guides/semantic_search_over_articles/README.md b/guides/semantic_search_over_articles/README.md
new file mode 100644
index 0000000..04ad5f9
--- /dev/null
+++ b/guides/semantic_search_over_articles/README.md
@@ -0,0 +1,112 @@
+# French Gossip Semantic Search with ZeroEntropy
+
+This guide builds production-ready semantic search over French gossip articles from **vsd.fr** and **public.fr** using ZeroEntropy.
+
+[![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+[![ZeroEntropy](https://img.shields.io/badge/zeroentropy-latest-purple.svg)](https://zeroentropy.dev/)
+[![Streamlit](https://img.shields.io/badge/streamlit-1.30+-green.svg)](https://streamlit.io/)
+[![Docker](https://img.shields.io/badge/docker-ready-blue.svg)](https://docker.com/)
+
+## Features
+
+- **Advanced AI Retrieval**: Powered by ZeroEntropy's state-of-the-art search & reranking
+- **Multiple Search Types**: Documents, snippets, pages, and advanced reranked results
+- **Real-time RSS Scraping**: Automatically indexes articles from gossip websites
+- **Interactive Web UI**: Beautiful Streamlit interface with advanced filtering
+- **Smart Reranking**: Uses the `zerank-1-small` model for improved relevance
+
+## Quick Start
+
+### 1. Setup Environment
+```bash
+# Clone repository
+git clone 
+cd guides/semantic_search_over_articles
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Copy the environment template, then add your ZeroEntropy credentials
+cp .env.example .env
+
+```
+
+### 2. Index Articles
+```bash
+# Scrape RSS feeds and index articles
+python backend scrape --collection my_articles
+```
+
+### 3. Search Articles
+```bash
+# Search for articles (CLI)
+python backend search "TPMP" --k 5 --collection my_articles
+python backend search "famille royale" --search-type snippets
+python backend search "célébrités" --search-type advanced --k 10
+```
+
+### 4. Web Interface
+```bash
+# Launch Streamlit app
+streamlit run frontend/streamlit_app_ze.py
+```
+Access at: `http://localhost:8501`
+
+## Docker Deployment
+
+```bash
+# Build and run with Docker Compose
+docker-compose up --build
+
+# Or build the Docker image, start a container, and run commands inside it
+docker build -t gossip-search .
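+
+# NOTE (assumption): a container must be running before the docker exec commands below can work;
+# the container name matches those commands, while the port mapping and --env-file flag are illustrative.
+docker run -d --name gossip-search-container --env-file .env -p 8501:8501 gossip-search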
+
+docker exec -it gossip-search-container python backend scrape --collection my_articles
+docker exec -it gossip-search-container python backend search "TPMP" --k 5
+```
+
+## Project Structure
+
+```
+├── backend/
+│   ├── __main__.py       # Main CLI interface
+│   ├── indexer_ze.py     # RSS scraping & indexing
+│   ├── search_ze.py      # Search functionality
+│   ├── utils_ze.py       # Advanced utilities & reranking
+│   └── logger/           # Logging configuration
+├── frontend/
+│   └── streamlit_app_ze.py  # Web interface
+├── demo_notebook.ipynb   # Interactive demo
+├── docker-compose.yml    # Container orchestration
+├── requirements.txt      # Dependencies
+└── README.md             # This file
+```
+
+## Usage Examples
+
+### CLI Commands
+```bash
+# Collection management
+python backend manage list
+python backend manage status --collection my_articles
+
+# Advanced search with filters
+python backend search "mode" --filter-creator "Public" --reranker zerank-1-small
+
+# Different search types
+python backend search "actualité" --search-type documents --k 10
+python backend search "télé" --search-type snippets --k 5
+python backend search "people" --search-type advanced --k 8
+```
+
+
+## Demo Notebook
+
+A Jupyter notebook is also available to explore the code and run it step by step:
+```bash
+jupyter notebook demo_notebook.ipynb
+```
+
+---
+## Author & Contribution
+
+**Created by [Naoufal Acharki](https://github.com/nacharki)**: This project demonstrates ZeroEntropy-powered RAG over French gossip content.
diff --git a/guides/semantic_search_over_articles/backend/__main__.py b/guides/semantic_search_over_articles/backend/__main__.py
new file mode 100644
index 0000000..e54768d
--- /dev/null
+++ b/guides/semantic_search_over_articles/backend/__main__.py
@@ -0,0 +1,220 @@
+# __main__.py
+import asyncio
+import json
+import argparse
+from dotenv import load_dotenv
+
+# Internal imports
+from indexer_ze import ZeroEntropyArticleIndexer
+from search_ze import ZeroEntropyArticleSearcher
+from utils_ze import ZeroEntropyUtils
+from logger import getLogger
+
+# Load environment variables
+load_dotenv()
+
+# Configure logger to display log messages
+logger = getLogger()
+
+
+class ZeroEntropyArticleManager:
+    """
+    Main class that orchestrates RSS scraping, indexing, and searching using ZeroEntropy.
+ """ + + def __init__(self, collection_name: str = "articles"): + self.collection_name = collection_name + self.indexer = ZeroEntropyArticleIndexer(collection_name) + self.searcher = ZeroEntropyArticleSearcher(collection_name) + self.utils = ZeroEntropyUtils(collection_name) + + async def scrape_and_index(self): + """Scrape RSS feeds and index articles""" + # Initialize RSS feed URLs + rss_public_urls = [ + "https://www.public.fr/feed", + "https://www.public.fr/people/feed", + "https://www.public.fr/tele/feed", + "https://www.public.fr/mode/feed", + "https://www.public.fr/people/familles-royales/feed", + ] + + rss_vsd_urls = [ + "https://vsd.fr/actu-people/feed/", + "https://vsd.fr/tele/feed/", + "https://vsd.fr/societe/feed/", + "https://vsd.fr/culture/feed/", + "https://vsd.fr/loisirs/feed/", + ] + + # Initialize collection + await self.indexer.initialize_collection() + + # Extract content from RSS feeds + articles = [] + for url in rss_public_urls + rss_vsd_urls: + content = self.indexer.get_rss_feed_content(url) + if content: + articles.extend(content) + logger.info("Successfully extracted content from %s", url) + else: + logger.warning("Failed to extract content from %s", url) + + # Save all content to a JSON file for backup + with open("articles.json", "w", encoding="utf-8") as f: + json.dump(articles, f, ensure_ascii=False, indent=4) + + logger.info(f"Extracted {len(articles)} articles total") + + # Index articles in ZeroEntropy + if articles: + await self.indexer.index_articles(articles) + logger.info("Successfully scraped and indexed articles in ZeroEntropy.") + else: + logger.warning("No articles to index.") + + async def search_articles( + self, + query: str, + search_type: str = "documents", + k: int = 10, + filter_creator: str = None, + filter_category: str = None, + reranker: str = "zerank-1-small", + show_status: bool = False, + ): + """Search for articles""" + # Show status if requested + if show_status: + await self.searcher.get_collection_status() + + # Prepare filter if specified + filter_dict = {} + if filter_creator: + filter_dict["creator"] = {"$eq": filter_creator} + if filter_category: + filter_dict["categories"] = {"$eq": filter_category} + + filter_dict = filter_dict if filter_dict else None + + # Perform search based on type + if search_type == "documents": + results = await self.searcher.search_documents( + query=query, + k=k, + filter_dict=filter_dict, + reranker=reranker, + ) + self.searcher.display_document_results(results, query) + + elif search_type == "snippets": + results = await self.searcher.search_snippets( + query=query, + k=k, + filter_dict=filter_dict, + reranker=reranker, + ) + self.searcher.display_snippet_results(results, query) + + elif search_type == "pages": + results = await self.searcher.search_pages( + query=query, k=k, filter_dict=filter_dict + ) + self.searcher.display_page_results(results, query) + + elif search_type == "advanced": + results = await self.utils.search_and_rerank( + query=query, k=k * 2, rerank_top_n=k + ) + self.utils.display_advanced_results(results, query) + + return results + + async def manage_collections(self, action: str, collection_name: str = None): + """Manage collections (list, delete, status)""" + if action == "list": + collections = await self.utils.list_all_collections() + logger.info(f"Available collections: {collections}") + return collections + + elif action == "delete" and collection_name: + success = await self.utils.delete_collection(collection_name) + if success: + logger.info(f"Successfully deleted 
collection: {collection_name}") + else: + logger.info(f"Failed to delete collection: {collection_name}") + return success + + elif action == "status": + status = await self.searcher.get_collection_status() + return status + + else: + logger.info("Invalid action or missing collection name") + return None + + +async def main(): + # Set up argument parser + parser = argparse.ArgumentParser(description="ZeroEntropy RSS Article Manager") + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # Scrape command + scrape_parser = subparsers.add_parser("scrape", help="Scrape RSS feeds and index articles") + scrape_parser.add_argument("--collection", type=str, default="articles", + help="Collection name to use") + + # Search command + search_parser = subparsers.add_parser("search", help="Search for articles") + search_parser.add_argument("query", type=str, help="Search query string") + search_parser.add_argument("--k", type=int, default=10, help="Number of results to return") + search_parser.add_argument("--search-type", choices=["documents", "snippets", "pages", "advanced"], + default="documents", help="Type of search to perform") + search_parser.add_argument("--collection", type=str, default="articles", + help="Collection name to search in") + search_parser.add_argument("--filter-creator", type=str, help="Filter by creator/author") + search_parser.add_argument("--filter-category", type=str, help="Filter by category") + search_parser.add_argument("--status", action="store_true", + help="Show collection status before searching") + search_parser.add_argument("--reranker", type=str, default="zerank-1-small", + help="Reranker model to use") + + # Collection management command + manage_parser = subparsers.add_parser("manage", help="Manage collections") + manage_parser.add_argument("action", choices=["list", "delete", "status"], + help="Management action to perform") + manage_parser.add_argument("--collection", type=str, help="Collection name (required for delete)") + + # Parse arguments + args = parser.parse_args() + + if args.command == "scrape": + # Initialize manager and scrape + manager = ZeroEntropyArticleManager(args.collection) + await manager.scrape_and_index() + + elif args.command == "search": + # Initialize manager and search + manager = ZeroEntropyArticleManager(args.collection) + await manager.search_articles( + query=args.query, + search_type=args.search_type, + k=args.k, + filter_creator=args.filter_creator, + filter_category=args.filter_category, + reranker=args.reranker, + show_status=args.status + ) + + elif args.command == "manage": + # Initialize manager and manage collections + manager = ZeroEntropyArticleManager() + await manager.manage_collections( + action=args.action, + collection_name=args.collection + ) + + +if __name__ == "__main__": + # Run the main async function + asyncio.run(main()) diff --git a/guides/semantic_search_over_articles/backend/indexer_ze.py b/guides/semantic_search_over_articles/backend/indexer_ze.py new file mode 100644 index 0000000..b2f5563 --- /dev/null +++ b/guides/semantic_search_over_articles/backend/indexer_ze.py @@ -0,0 +1,178 @@ +# indexer_ze.py +import requests +from typing import List, Dict +from bs4 import BeautifulSoup +from dotenv import load_dotenv +from zeroentropy import AsyncZeroEntropy, ConflictError + +# Logger import +from logger import getLogger + +# Load environment variables +load_dotenv() + +# Configure logger to display log messages +logger = getLogger() + + +class ZeroEntropyArticleIndexer: + """ + 
ZeroEntropyArticleIndexer handles RSS feed scraping and article indexing using ZeroEntropy API. + """ + def __init__(self, collection_name: str = "articles"): + self.collection_name = collection_name + self.zclient = AsyncZeroEntropy() + + async def initialize_collection(self): + """ + Initialize ZeroEntropy collection. Creates a new collection in ZeroEntropy if it doesn't exist. + If the collection already exists, logs a message and continues. + + Raises + ------ + ConflictError + If there's a conflict during collection creation (handled gracefully) + """ + try: + await self.zclient.collections.add(collection_name=self.collection_name) + logger.info(f"Created new collection: {self.collection_name}") + except ConflictError: + logger.error(f"Collection '{self.collection_name}' already exists") + + @staticmethod + def get_rss_feed_content(url: str) -> List[Dict]: + """ + Extract content from an RSS feed URL and returns a list of dictionaries containing the extracted content. + + Parameters + ---------- + url : str + The RSS feed URL to extract content from + + Returns + ------- + List[Dict] + A list of dictionaries, each containing article information with keys: + - title : str + The article title + - creator : str + The article author/creator + - categories : List[str] + List of article categories + - description : str + Brief description of the article + - pub_date : str + Publication date of the article + - content : str + Full cleaned text content of the article + - source_url : str + The original RSS feed URL + + Raises + ------ + requests.exceptions.HTTPError + If the HTTP request fails + requests.exceptions.RequestException + If there's a general request error + """ + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36" # noqa: E501 + } + + response = requests.get(url, headers=headers) + response.raise_for_status() + response.encoding = "utf-8" + + soup = BeautifulSoup(response.content, "xml") + items = soup.find_all("item") + content_list = [] + + for item in items: + title = item.find("title").get_text(strip=True) if item.find("title") else "N/A" + creator = item.find("dc:creator").get_text(strip=True) if item.find("dc:creator") else "N/A" + categories = [category.get_text(strip=True) for category in item.find_all("category")] + description = item.find("description").get_text(strip=True) if item.find("description") else "N/A" + publication_date = item.find("pubDate").get_text(strip=True) if item.find("pubDate") else "N/A" + content_encoded = ( + item.find("content:encoded").get_text(strip=True) if item.find("content:encoded") else "N/A" + ) + + # Clean up HTML content + content_text = "" + if content_encoded != "N/A": + content_encoded_soup = BeautifulSoup(content_encoded, "html.parser") + for script_or_style in content_encoded_soup(["script", "style"]): + script_or_style.decompose() + content_text = content_encoded_soup.get_text(separator=" ", strip=True).replace("\n", " ") + else: + description_soup = BeautifulSoup(description, "html.parser") + for script_or_style in description_soup(["script", "style"]): + script_or_style.decompose() + content_text = description_soup.get_text(separator=" ", strip=True).replace("\n", " ") + + content_list.append({ + "title": title, + "creator": creator, + "categories": categories, + "description": description, + "pub_date": publication_date, + "content": content_text, + "source_url": url + }) + + return content_list + + async def index_articles(self, articles: 
List[Dict]): + """ + Index articles in ZeroEntropy using the documents API. + + Takes a list of article dictionaries and indexes them in the ZeroEntropy + collection. Creates unique document paths and prepares metadata for each article. + + Parameters + ---------- + articles : List[Dict] + List of article dictionaries containing article information. + Each dictionary should have keys: title, creator, categories, + description, pub_date, content, source_url + """ + indexed_count = 0 + failed_count = 0 + + for idx, article in enumerate(articles): + try: + # Create unique document path + doc_path = f"article_{idx}_{hash(article['title'][:50])}" + + # Prepare content for indexing - combine title, description, and content + full_content = f"Title: {article['title']}\n\n" + full_content += f"Description: {article['description']}\n\n" + full_content += f"Content: {article['content']}" + + # Prepare metadata + metadata = { + "title": article["title"][:500], # Limit length for metadata + "creator": article["creator"][:200], + "categories": ", ".join(article["categories"][:5])[:300], # Limit categories + "pub_date": article["pub_date"][:100], + "source_url": article["source_url"][:300], + "type": "rss_article", + } + + # Add document to ZeroEntropy + await self.zclient.documents.add( + collection_name=self.collection_name, + path=doc_path, + content={"type": "text", "text": full_content}, + metadata=metadata, + ) + + indexed_count += 1 + if indexed_count % 10 == 0: + logger.info(f"Indexed {indexed_count} articles...") + + except ConflictError: + logger.warning(f"Article {idx} already exists, skipping...") + continue + + logger.info(f"Indexing complete. Success: {indexed_count}, Failed: {failed_count}") diff --git a/guides/semantic_search_over_articles/backend/logger/__init__.py b/guides/semantic_search_over_articles/backend/logger/__init__.py new file mode 100644 index 0000000..ae07c67 --- /dev/null +++ b/guides/semantic_search_over_articles/backend/logger/__init__.py @@ -0,0 +1 @@ +from .logging import CustomLogRecord, getLogger # noqa: F401 diff --git a/guides/semantic_search_over_articles/backend/logger/logging.py b/guides/semantic_search_over_articles/backend/logger/logging.py new file mode 100644 index 0000000..5d4a26d --- /dev/null +++ b/guides/semantic_search_over_articles/backend/logger/logging.py @@ -0,0 +1,121 @@ +from coloredlogs import ColoredFormatter +import logging +import socket +import os + + +class CustomLogRecord(logging.LogRecord): + """ + A custom log record class that extends logging.LogRecord. + + Attributes + ---------- + hostname : str + The hostname where the log record originated. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the object with the hostname of the system. + + Parameters + ---------- + *args : Variable length argument list. + **kwargs : Keyword arguments. + + Returns + ------- + None + + Notes + ----- + This function initializes the object with the hostname of the system using the socket module. + + Raises + ------ + None + """ + super().__init__(*args, **kwargs) + self.hostname = socket.gethostname() + + +def getLogger(name: str = "main", loglevel: str = "INFO", logdir: str = "./logs", stream=None, color_logs=True): + """ + Get a logger object with custom settings and formatters. + + Parameters: + name (str): Name of the logger. + loglevel (str): Log level for the logger. + logdir (str): Directory to store log files. + stream (stream): Stream to write logs to. + color_logs (bool): Whether to colorize logs. 
+ + Returns: + logger: A logger object with custom settings and formatters. + """ + logging.setLogRecordFactory(CustomLogRecord) + logger = logging.getLogger(name) + logger.propagate = False + + if logger.handlers: + return logger + else: + loglevel = getattr(logging, loglevel.upper(), logging.INFO) + logger.setLevel(loglevel) + + log_format = "%(asctime)s | %(module)s | %(levelname)-8s | %(message)s [%(filename)s:%(lineno)s]" + + simple_formatter = logging.Formatter(log_format, datefmt="%Y-%m-%d %H:%M:%S") + + colored_formatter = ColoredFormatter( + log_format, + datefmt="%Y-%m-%d %H:%M:%S", + level_styles={ + "critical": {"bold": True, "color": "red"}, + "debug": {"color": "green"}, + "error": {"color": "red"}, + "info": {"color": "white"}, + "notice": {"color": "magenta"}, + "spam": {"color": "green", "faint": True}, + "success": {"bold": True, "color": "green"}, + "verbose": {"color": "blue"}, + "warning": {"color": "yellow"}, + }, + field_styles={ + "asctime": {"color": "green"}, + "hostname": {"color": "magenta"}, + "levelname": {"bold": True, "color": "magenta"}, + "module": {"color": "blue"}, + "programname": {"color": "cyan"}, + "username": {"color": "yellow"}, + }, + ) + + if not os.path.isdir(logdir): + os.mkdir(logdir) + + fileHandler = logging.FileHandler(os.path.join(logdir, "logs.txt")) + fileHandler.setLevel(logging.DEBUG) + fileHandler.setFormatter(simple_formatter) + + streamHandler = logging.StreamHandler(stream=stream) + streamHandler.setLevel(loglevel) + if color_logs: + streamHandler.setFormatter(colored_formatter) + else: + streamHandler.setFormatter(simple_formatter) + + logger.addHandler(fileHandler) + logger.addHandler(streamHandler) + + return logger + + +# a simple usecase +if __name__ == "__main__": + logger = getLogger(loglevel="DEBUG") + logger.debug("A message only developers care about") + logger.info("Curious users might want to know this") + logger.warning("Something is wrong and any user should be informed") + logger.error("Serious stuff, this is red for a reason") + logger.critical("!OH NO everything is on fire") diff --git a/guides/semantic_search_over_articles/backend/search_ze.py b/guides/semantic_search_over_articles/backend/search_ze.py new file mode 100644 index 0000000..7058ff8 --- /dev/null +++ b/guides/semantic_search_over_articles/backend/search_ze.py @@ -0,0 +1,359 @@ +# search_ze.py +from dotenv import load_dotenv +from zeroentropy import AsyncZeroEntropy + +# Logger import +from logger import getLogger + +# Load environment variables +load_dotenv() + +# Configure logger to display log messages +logger = getLogger() + + +class ZeroEntropyArticleSearcher: + """ + ZeroEntropyArticleSearcher handles article searching using ZeroEntropy's advanced retrieval capabilities. + + Parameters + ---------- + collection_name : str, optional + Name of the ZeroEntropy collection to search in, by default "articles" + + Attributes + ---------- + collection_name : str + The name of the ZeroEntropy collection + zclient : AsyncZeroEntropy + The ZeroEntropy async client instance + """ + + def __init__(self, collection_name: str = "articles"): + self.collection_name = collection_name + self.zclient = AsyncZeroEntropy() + + async def search_documents( + self, + query: str, + k: int = 10, + filter_dict: dict = None, + include_metadata: bool = True, + reranker: str = "zerank-1-small", + ) -> list: + """Search for documents using ZeroEntropy's top_documents query. 
+ + Parameters + ---------- + query : str + The search query string + k : int, optional + Number of top results to return, by default 10 + filter_dict : dict, optional + Dictionary containing filters for metadata fields, by default None + include_metadata : bool, optional + Whether to include document metadata in results, by default True + reranker : str, optional + Name of the reranker model to use, by default "zerank-1-small" + + Returns + ------- + List: + List of document result dictionaries, each containing: + - path : str + Unique document path identifier + - score : float + Relevance score for the document + - file_url : str or None + URL to access the full document file + - metadata : Dict + Document metadata including title, creator, categories, etc. + """ + response = await self.zclient.queries.top_documents( + collection_name=self.collection_name, + query=query, + k=k, + filter=filter_dict, + include_metadata=include_metadata, + reranker=reranker, + latency_mode="low", + ) + + # Convert Result objects to dictionaries + results = [] + for result in response.results: + result_dict = { + 'path': result.path, + 'score': result.score, + 'file_url': getattr(result, 'file_url', None), + 'metadata': getattr(result, 'metadata', {}) + } + results.append(result_dict) + + return results + + async def search_snippets( + self, + query: str, + k: int = 10, + filter_dict: dict = None, + precise_responses: bool = True, + reranker: str = "zerank-1-small", + ) -> list: + """ + Search for specific snippets using ZeroEntropy's top_snippets query. + + Parameters + ---------- + query : str + The search query string + k : int, optional + Number of top snippets to return, by default 10 + filter_dict : dict, optional + Dictionary containing filters for metadata fields, by default None + precise_responses : bool, optional + Whether to return precise snippet boundaries, by default True + reranker : str, optional + Name of the reranker model to use, by default "zerank-1-small" + + Returns + ------- + List: + List of snippet result dictionaries, each containing: + - path : str + Document path where the snippet was found + - score : float + Relevance score for the snippet + - start_index : int + Starting character index of the snippet in the document + - end_index : int + Ending character index of the snippet in the document + - page_span : List[int] + List indicating which pages the snippet spans + - content : str + The actual text content of the snippet + - metadata : Dict + Document metadata including title, creator, categories, etc. + """ + response = await self.zclient.queries.top_snippets( + collection_name=self.collection_name, + query=query, + k=k, + filter=filter_dict, + precise_responses=precise_responses, + include_document_metadata=True, + reranker=reranker, + ) + + # Convert Result objects to dictionaries + results = [] + for result in response.results: + result_dict = { + 'path': result.path, + 'score': result.score, + 'start_index': getattr(result, 'start_index', 0), + 'end_index': getattr(result, 'end_index', 0), + 'page_span': getattr(result, 'page_span', []), + 'content': getattr(result, 'content', ''), + 'metadata': getattr(result, 'metadata', {}) + } + results.append(result_dict) + + return results + + async def search_pages( + self, + query: str, + k: int = 10, + filter_dict: dict = None, + include_content: bool = True, + ) -> list: + """ + Search for pages using ZeroEntropy's top_pages query. 
+ + Parameters + ---------- + query : str + The search query string + k : int, optional + Number of top pages to return, by default 10 + filter_dict : dict, optional + Dictionary containing filters for metadata fields, by default None + include_content : bool, optional + Whether to include page content in results, by default True + + Returns + ------- + List: + List of page result dictionaries, each containing: + - path : str + Document path containing the page + - score : float + Relevance score for the page + - page_index : int + Index of the page within the document (0-based) + - content : str + The text content of the page (if include_content=True) + - metadata : Dict + Document metadata including title, creator, categories, etc. + """ + response = await self.zclient.queries.top_pages( + collection_name=self.collection_name, + query=query, + k=k, + filter=filter_dict, + include_content=include_content, + latency_mode="low", + ) + + # Convert Result objects to dictionaries + results = [] + for result in response.results: + result_dict = { + 'path': result.path, + 'score': result.score, + 'page_index': getattr(result, 'page_index', 0), + 'content': getattr(result, 'content', ''), + 'metadata': getattr(result, 'metadata', {}) + } + results.append(result_dict) + + return results + + @staticmethod + def display_document_results(results: list, query: str) -> None: + """ + Display document search results in a formatted way. + + Parameters + ---------- + results : list + List of document result dictionaries from search_documents() + query : str + The original search query string for display purposes + """ + if not results: + print(f"No results found for query: '{query}'") + return + + print(f"\n{'=' * 60}") + print(f"DOCUMENT SEARCH RESULTS FOR: '{query}'") + print(f"Found {len(results)} results") + print(f"{'=' * 60}\n") + + for i, result in enumerate(results, 1): + print(f"Result {i}") + print(f"Document Path: {result['path']}") + print(f"Relevance Score: {result['score']:.4f}") + + if result.get("metadata"): + metadata = result["metadata"] + print(f"Title: {metadata.get('title', 'N/A')}") + print(f"Author: {metadata.get('creator', 'N/A')}") + print(f"Publication Date: {metadata.get('pub_date', 'N/A')}") + print(f"Categories: {metadata.get('categories', 'N/A')}") + print(f"Source URL: {metadata.get('source_url', 'N/A')}") + + # NOTE: file_url contains sensitive tokens - UNCOMMENT ONLY FOR DEBUGGING/TESTING + if result.get("file_url"): + # print(f"File URL: {result['file_url']}") + print("File URL: [Available - hidden for security]") + + print("\n" + "-" * 50 + "\n") + + @staticmethod + def display_snippet_results(results: list, query: str): + """Display snippet search results in a formatted way. 
+ + Parameters + ---------- + results : list + List of snippet result dictionaries from search_snippets() + query : str + The original search query string for display purposes + """ + if not results: + print(f"No snippets found for query: '{query}'") + return + + print(f"\n{'=' * 60}") + print(f"SNIPPET SEARCH RESULTS FOR: '{query}'") + print(f"Found {len(results)} snippets") + print(f"{'=' * 60}\n") + + for i, result in enumerate(results, 1): + print(f"Snippet {i}") + print(f"Document Path: {result['path']}") + print(f"Relevance Score: {result['score']:.4f}") + print(f"Character Range: {result['start_index']}-{result['end_index']}") + print(f"Page Span: {result['page_span']}") + + if result.get("content"): + print(f"Content: {result['content'][:300]}...") + + if result.get("metadata"): + metadata = result["metadata"] + print(f"Title: {metadata.get('title', 'N/A')}") + print(f"Author: {metadata.get('creator', 'N/A')}") + + print("\n" + "-" * 40 + "\n") + + @staticmethod + def display_page_results(results: list, query: str): + """ + Display page search results in a formatted way. + + Parameters + ---------- + results : list + List of page result dictionaries from search_pages() + query : str + The original search query string for display purposes + """ + if not results: + print(f"No page results found for query: '{query}'") + return + + print(f"\n{'=' * 60}") + print(f"PAGE SEARCH RESULTS FOR: '{query}'") + print(f"Found {len(results)} page results") + print(f"{'=' * 60}\n") + + for i, result in enumerate(results, 1): + print(f"Page {i}") + print(f"Document Path: {result['path']}") + print(f"Page Index: {result['page_index']}") + print(f"Relevance Score: {result['score']:.4f}") + + if result.get("content"): + print(f"Content: {result['content'][:300]}...") + + if result.get("metadata"): + metadata = result["metadata"] + print(f"Title: {metadata.get('title', 'N/A')}") + print(f"Author: {metadata.get('creator', 'N/A')}") + + print("\n" + "-" * 40 + "\n") + + async def get_collection_status(self): + """Get and display collection status information. + + Returns + ------- + status : object + ZeroEntropy status response object containing collection statistics + + Notes + ----- + Prints collection status including total documents, indexed documents, + documents currently being parsed/indexed, and failed documents. + This is useful for monitoring the health and progress of document indexing. 
+ """ + status = await self.zclient.status.get(collection_name=self.collection_name) + print(f"\nCollection Status for '{self.collection_name}':") + print(f"Total Documents: {status.num_documents}") + print(f"Indexed Documents: {status.num_indexed_documents}") + print(f"Parsing Documents: {status.num_parsing_documents}") + print(f"Indexing Documents: {status.num_indexing_documents}") + print(f"Failed Documents: {status.num_failed_documents}") + return status diff --git a/guides/semantic_search_over_articles/backend/utils_ze.py b/guides/semantic_search_over_articles/backend/utils_ze.py new file mode 100644 index 0000000..6122510 --- /dev/null +++ b/guides/semantic_search_over_articles/backend/utils_ze.py @@ -0,0 +1,371 @@ +# utils_ze.py +from dotenv import load_dotenv +from zeroentropy import AsyncZeroEntropy + +# Logger import +from logger import getLogger + +# Load environment variables +load_dotenv() + +# Configure logger to display log messages +logger = getLogger() + + +class ZeroEntropyUtils: + """ + Utility class for advanced ZeroEntropy operations including reranking, collection management, and batch operations. + + Parameters + ---------- + collection_name : str, optional + Name of the default ZeroEntropy collection to work with, by default "articles" + + Attributes + ---------- + collection_name : str + The default collection name for operations + zclient : AsyncZeroEntropy + The ZeroEntropy async client instance + """ + def __init__(self, collection_name: str = "articles"): + self.collection_name = collection_name + self.zclient = AsyncZeroEntropy() + + async def rerank_documents( + self, + query: str, + document_texts: list, + model: str = "zerank-1-small", + top_n: int = 10, + ) -> list: + """ + Use ZeroEntropy's reranking model to rerank a list of documents. + + Parameters + ---------- + query : str + The search query to use for reranking + document_texts : list + List of document text strings to be reranked + model : str, optional + Name of the reranking model to use, by default "zerank-1-small" + top_n : int, optional + Number of top reranked results to return, by default 10 + + Returns + ------- + list + List of reranked result dictionaries, each containing: + - index : int + Original index of the document in the input list + - relevance_score : float + Reranking relevance score for the document + """ + response = await self.zclient.models.rerank( + query=query, documents=document_texts, model=model, top_n=top_n + ) + return response.results + + async def list_all_collections(self) -> list: + """ + List all available collections in the ZeroEntropy instance. + + Returns + ------- + list + List of collection names available in the ZeroEntropy instance + """ + response = await self.zclient.collections.get_list() + return response.collection_names + + async def delete_collection(self, collection_name: str) -> bool: + """ + Delete a collection from ZeroEntropy. + + Parameters + ---------- + collection_name : str + Name of the collection to delete + + Returns + ------- + bool + True if deletion was successful + """ + await self.zclient.collections.delete(collection_name=collection_name) + logger.info(f"Successfully deleted collection: {collection_name}") + return True + + async def list_documents_in_collection( + self, collection_name: str = None, limit: int = 100 + ) -> list: + """ + List all documents in a collection with pagination support. + + Parameters + ---------- + collection_name : str, optional + Name of the collection to list documents from. 
If None, uses the default + collection name, by default None + limit : int, optional + Maximum number of documents to return, by default 100 + + Returns + ------- + list + List of document metadata dictionaries containing information about + each document in the collection + """ + if collection_name is None: + collection_name = self.collection_name + + response = await self.zclient.documents.get_info_list( + collection_name=collection_name, limit=limit + ) + return response.documents + + async def get_document_info( + self, + path: str, + collection_name: str = None, + include_content: bool = False, + ) -> dict: + """ + Get detailed information about a specific document. + + Parameters + ---------- + path : str + Unique path identifier of the document + collection_name : str, optional + Name of the collection containing the document. If None, uses the default + collection name, by default None + include_content : bool, optional + Whether to include the full document content in the response, by default False + + Returns + ------- + Optional[Dict] + Dictionary containing document information including metadata, status, + and optionally the full content. Returns None if document not found. + """ + if collection_name is None: + collection_name = self.collection_name + + response = await self.zclient.documents.get_info( + collection_name=collection_name, + path=path, + include_content=include_content, + ) + return response.document + + async def update_document_metadata( + self, path: str, metadata: dict, collection_name: str = None + ) -> bool: + """ + Update metadata for a specific document. + + Parameters + ---------- + path : str + Unique path identifier of the document to update + metadata : Dict[str, str] + Dictionary of metadata keys and values to update + collection_name : str, optional + Name of the collection containing the document. If None, uses the default + collection name, by default None + + Returns + ------- + bool + True if the metadata update was successful + """ + if collection_name is None: + collection_name = self.collection_name + + await self.zclient.documents.update( + collection_name=collection_name, path=path, metadata=metadata + ) + logger.info(f"Successfully updated metadata for document: {path}") + return True + + async def delete_document( + self, path: str, collection_name: str = None + ) -> bool: + """ + Delete a specific document from a collection. + + Parameters + ---------- + path : str + Unique path identifier of the document to delete + collection_name : str, optional + Name of the collection containing the document. If None, uses the default + collection name, by default None + + Returns + ------- + bool + True if the document deletion was successful + """ + if collection_name is None: + collection_name = self.collection_name + + await self.zclient.documents.delete( + collection_name=collection_name, path=path + ) + logger.info(f"Successfully deleted document: {path}") + return True + + async def batch_delete_documents( + self, paths: list, collection_name: str = None + ) -> dict: + """ + Delete multiple documents in batch operation. + + Parameters + ---------- + paths : list + List of document path identifiers to delete + collection_name : str, optional + Name of the collection containing the documents. If None, uses the default + collection name, by default None + + Returns + ------- + Dict[str, bool] + Dictionary mapping each document path to its deletion success status. + True indicates successful deletion, False indicates failure. 
+ """ + if collection_name is None: + collection_name = self.collection_name + + results = {} + for path in paths: + success = await self.delete_document(path, collection_name) + results[path] = success + + return results + + async def search_and_rerank( + self, + query: str, + k: int = 20, + rerank_top_n: int = 10, + collection_name: str = None, + ) -> list: + """ + Advanced search that first retrieves more documents, then reranks them for improved relevance. + + Parameters + ---------- + query : str + The search query string + k : int, optional + Number of initial documents to retrieve before reranking, by default 20 + rerank_top_n : int, optional + Number of top documents to return after reranking, by default 10 + collection_name : str, optional + Name of the collection to search in. If None, uses the default + collection name, by default None + + Returns + ------- + list + List of reranked document result dictionaries, each containing: + - All original document fields (path, metadata, etc.) + - original_score : float + The original search relevance score + - rerank_score : float + The improved relevance score from reranking + """ + if collection_name is None: + collection_name = self.collection_name + + # First, get more documents than needed + search_results = await self.zclient.queries.top_documents( + collection_name=collection_name, + query=query, + k=k, + include_metadata=True, + latency_mode="low", + ) + + if not search_results.results: + return [] + + # Extract document texts for reranking + document_texts = [] + for result in search_results.results: + # Get full document content + doc_info = await self.get_document_info( + path=result["path"], + collection_name=collection_name, + include_content=True, + ) + if doc_info and doc_info.get("content"): + document_texts.append(doc_info["content"]) + else: + # Fallback to using metadata if content not available + metadata = result.get("metadata", {}) + fallback_text = ( + f"{metadata.get('title', '')} {metadata.get('description', '')}" + ) + document_texts.append(fallback_text) + + # Rerank the documents + rerank_results = await self.rerank_documents( + query=query, + document_texts=document_texts, + top_n=min(rerank_top_n, len(document_texts)), + ) + + # Combine rerank results with original metadata + final_results = [] + for rerank_result in rerank_results: + original_result = search_results.results[rerank_result["index"]] + final_results.append( + { + **original_result, + "rerank_score": rerank_result["relevance_score"], + "original_score": original_result["score"], + } + ) + + return final_results + + def display_advanced_results(self, results: list, query: str): + """ + Display results with both original and rerank scores in a formatted way. 
+ + Parameters + ---------- + results : list + List of advanced search result dictionaries from search_and_rerank() + query : str + The original search query string for display purposes + """ + if not results: + print(f"No results found for query: '{query}'") + return + + print(f"\n{'='*70}") + print(f"ADVANCED SEARCH & RERANK RESULTS FOR: '{query}'") + print(f"Found {len(results)} results") + print(f"{'='*70}\n") + + for i, result in enumerate(results, 1): + print(f"Result {i}") + print(f"Document Path: {result['path']}") + print(f"Original Score: {result['original_score']:.4f}") + print(f"Rerank Score: {result['rerank_score']:.4f}") + + if result.get("metadata"): + metadata = result["metadata"] + print(f"Title: {metadata.get('title', 'N/A')}") + print(f"Author: {metadata.get('creator', 'N/A')}") + print(f"Publication Date: {metadata.get('pub_date', 'N/A')}") + print(f"Categories: {metadata.get('categories', 'N/A')}") + + print("\n" + "-" * 60 + "\n") diff --git a/guides/semantic_search_over_articles/demo_notebook.ipynb b/guides/semantic_search_over_articles/demo_notebook.ipynb new file mode 100644 index 0000000..4d31b4d --- /dev/null +++ b/guides/semantic_search_over_articles/demo_notebook.ipynb @@ -0,0 +1,478 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "ff59aa32", + "metadata": {}, + "outputs": [], + "source": [ + "# main.py\n", + "import json\n", + "from dotenv import load_dotenv\n", + "\n", + "# Internal imports\n", + "from indexer_ze import ZeroEntropyArticleIndexer\n", + "from search_ze import ZeroEntropyArticleSearcher\n", + "from utils_ze import ZeroEntropyUtils\n", + "from logger import getLogger\n", + "\n", + "# Load environment variables\n", + "load_dotenv()\n", + "\n", + "# Configure logger to display log messages\n", + "logger = getLogger()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c6413325", + "metadata": {}, + "outputs": [], + "source": [ + "class ZeroEntropyArticleManager:\n", + " \"\"\"\n", + " Main class that orchestrates RSS scraping, indexing, and searching using ZeroEntropy.\n", + " \"\"\"\n", + "\n", + " def __init__(self, collection_name: str = \"articles\"):\n", + " self.collection_name = collection_name\n", + " self.indexer = ZeroEntropyArticleIndexer(collection_name)\n", + " self.searcher = ZeroEntropyArticleSearcher(collection_name)\n", + " self.utils = ZeroEntropyUtils(collection_name)\n", + "\n", + " async def scrape_and_index(self):\n", + " \"\"\"Scrape RSS feeds and index articles\"\"\"\n", + " # Initialize RSS feed URLs\n", + " rss_public_urls = [\n", + " \"https://www.public.fr/feed\",\n", + " \"https://www.public.fr/people/feed\",\n", + " \"https://www.public.fr/tele/feed\",\n", + " \"https://www.public.fr/mode/feed\",\n", + " \"https://www.public.fr/people/familles-royales/feed\",\n", + " ]\n", + "\n", + " rss_vsd_urls = [\n", + " \"https://vsd.fr/actu-people/feed/\",\n", + " \"https://vsd.fr/tele/feed/\",\n", + " \"https://vsd.fr/societe/feed/\",\n", + " \"https://vsd.fr/culture/feed/\",\n", + " \"https://vsd.fr/loisirs/feed/\",\n", + " ]\n", + "\n", + " # Initialize collection\n", + " await self.indexer.initialize_collection()\n", + "\n", + " # Extract content from RSS feeds\n", + " articles = []\n", + " for url in rss_public_urls + rss_vsd_urls:\n", + " content = self.indexer.get_rss_feed_content(url)\n", + " if content:\n", + " articles.extend(content)\n", + " logger.info(\"Successfully extracted content from %s\", url)\n", + " else:\n", + " logger.warning(\"Failed to extract 
content from %s\", url)\n", + "\n", + " # Save all content to a JSON file for backup\n", + " with open(\"articles.json\", \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(articles, f, ensure_ascii=False, indent=4)\n", + "\n", + " logger.info(f\"Extracted {len(articles)} articles total\")\n", + "\n", + " # Index articles in ZeroEntropy\n", + " if articles:\n", + " await self.indexer.index_articles(articles)\n", + " logger.info(\"Successfully scraped and indexed articles in ZeroEntropy.\")\n", + " else:\n", + " logger.warning(\"No articles to index.\")\n", + "\n", + " async def search_articles(\n", + " self,\n", + " query: str,\n", + " search_type: str = \"documents\",\n", + " k: int = 10,\n", + " filter_creator: str = None,\n", + " filter_category: str = None,\n", + " reranker: str = \"zerank-1-small\",\n", + " show_status: bool = False,\n", + " ):\n", + " \"\"\"Search for articles\"\"\"\n", + " # Show status if requested\n", + " if show_status:\n", + " await self.searcher.get_collection_status()\n", + "\n", + " # Prepare filter if specified\n", + " filter_dict = {}\n", + " if filter_creator:\n", + " filter_dict[\"creator\"] = {\"$eq\": filter_creator}\n", + " if filter_category:\n", + " filter_dict[\"categories\"] = {\"$eq\": filter_category}\n", + "\n", + " filter_dict = filter_dict if filter_dict else None\n", + "\n", + " # Perform search based on type\n", + " if search_type == \"documents\":\n", + " results = await self.searcher.search_documents(\n", + " query=query,\n", + " k=k,\n", + " filter_dict=filter_dict,\n", + " reranker=reranker,\n", + " )\n", + " self.searcher.display_document_results(results, query)\n", + "\n", + " elif search_type == \"snippets\":\n", + " results = await self.searcher.search_snippets(\n", + " query=query,\n", + " k=k,\n", + " filter_dict=filter_dict,\n", + " reranker=reranker,\n", + " )\n", + " self.searcher.display_snippet_results(results, query)\n", + "\n", + " elif search_type == \"pages\":\n", + " results = await self.searcher.search_pages(\n", + " query=query, k=k, filter_dict=filter_dict\n", + " )\n", + " self.searcher.display_page_results(results, query)\n", + "\n", + " elif search_type == \"advanced\":\n", + " results = await self.utils.search_and_rerank(\n", + " query=query, k=k * 2, rerank_top_n=k\n", + " )\n", + " self.utils.display_advanced_results(results, query)\n", + "\n", + " return results\n", + "\n", + " async def manage_collections(self, action: str, collection_name: str = None):\n", + " \"\"\"Manage collections (list, delete, status)\"\"\"\n", + " if action == \"list\":\n", + " collections = await self.utils.list_all_collections()\n", + " print(f\"Available collections: {collections}\")\n", + " return collections\n", + "\n", + " elif action == \"delete\" and collection_name:\n", + " success = await self.utils.delete_collection(collection_name)\n", + " if success:\n", + " print(f\"Successfully deleted collection: {collection_name}\")\n", + " else:\n", + " print(f\"Failed to delete collection: {collection_name}\")\n", + " return success\n", + "\n", + " elif action == \"status\":\n", + " status = await self.searcher.get_collection_status()\n", + " return status\n", + "\n", + " else:\n", + " print(\"Invalid action or missing collection name\")\n", + " return None\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3e0d6bbe", + "metadata": {}, + "outputs": [], + "source": [ + "collection = \"my_articles\"\n", + "manager = ZeroEntropyArticleManager(collection)" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "ae95b984", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2025-08-06 10:30:52\u001b[0m | \u001b[34mindexer_ze\u001b[0m | \u001b[1;35mERROR \u001b[0m | \u001b[31mCollection 'my_articles' already exists\u001b[0m [indexer_ze.py:40]\n", + "\u001b[32m2025-08-06 10:31:04\u001b[0m | \u001b[34m1820264054\u001b[0m | \u001b[1;35mINFO \u001b[0m | \u001b[37mSuccessfully extracted content from https://www.public.fr/feed\u001b[0m [1820264054.py:40]\n", + "\u001b[32m2025-08-06 10:31:06\u001b[0m | \u001b[34m1820264054\u001b[0m | \u001b[1;35mINFO \u001b[0m | \u001b[37mSuccessfully extracted content from https://www.public.fr/people/feed\u001b[0m [1820264054.py:40]\n" + ] + } + ], + "source": [ + "await manager.scrape_and_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c271f12", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "DOCUMENT SEARCH RESULTS FOR: 'famille royale'\n", + "Found 10 results\n", + "============================================================\n", + "\n", + "Result 1\n", + "Document Path: article_204_2420503972517589745\n", + "Relevance Score: 1.1434\n", + "Title: Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu\n", + "Author: Elisabeth Sall\n", + "Publication Date: Tue, 22 Jul 2025 07:45:00 +0000\n", + "Categories: People, Royauté, Famille royale britannique, Mort, Prince Harry\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n", + "Result 2\n", + "Document Path: article_244_4511532550014461900\n", + "Relevance Score: 1.0142\n", + "Title: Prince Harry: \"Les médecins ont recommandé au roi de ne plus parler à son fils\", Stéphane Bern s’en mêle\n", + "Author: Clément Garin\n", + "Publication Date: Sat, 03 May 2025 07:50:00 +0000\n", + "Categories: People, Royauté, Cancer, Clash, Famille royale britannique\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n", + "Result 3\n", + "Document Path: article_204_5642323426295626288\n", + "Relevance Score: 0.8656\n", + "Title: Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu\n", + "Author: Elisabeth Sall\n", + "Publication Date: Tue, 22 Jul 2025 07:45:00 +0000\n", + "Categories: People, Royauté, Famille royale britannique, Mort, Prince Harry\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n", + "Result 4\n", + "Document Path: article_244_-4945558259102855923\n", + "Relevance Score: 0.7137\n", + "Title: Prince Harry: \"Les médecins ont recommandé au roi de ne plus parler à son fils\", Stéphane Bern s’en mêle\n", + "Author: Clément Garin\n", + "Publication Date: Sat, 03 May 2025 07:50:00 +0000\n", + "Categories: People, Royauté, Cancer, Clash, Famille royale britannique\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n", + "Result 5\n", + "Document 
Path: article_204_-5586505152836586607\n", + "Relevance Score: 0.5834\n", + "Title: Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu\n", + "Author: Elisabeth Sall\n", + "Publication Date: Tue, 22 Jul 2025 07:45:00 +0000\n", + "Categories: People, Royauté, Famille royale britannique, Mort, Prince Harry\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n", + "Result 6\n", + "Document Path: article_204_4431767949249707685\n", + "Relevance Score: 0.5344\n", + "Title: Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu\n", + "Author: Elisabeth Sall\n", + "Publication Date: Tue, 22 Jul 2025 07:45:00 +0000\n", + "Categories: People, Royauté, Famille royale britannique, Mort, Prince Harry\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n", + "Result 7\n", + "Document Path: article_204_-7449109881057781290\n", + "Relevance Score: 0.4924\n", + "Title: Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu\n", + "Author: Elisabeth Sall\n", + "Publication Date: Tue, 22 Jul 2025 07:45:00 +0000\n", + "Categories: People, Royauté, Famille royale britannique, Mort, Prince Harry\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n", + "Result 8\n", + "Document Path: article_244_1906632056819888007\n", + "Relevance Score: 0.4643\n", + "Title: Prince Harry: \"Les médecins ont recommandé au roi de ne plus parler à son fils\", Stéphane Bern s’en mêle\n", + "Author: Clément Garin\n", + "Publication Date: Sat, 03 May 2025 07:50:00 +0000\n", + "Categories: People, Royauté, Cancer, Clash, Famille royale britannique\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n", + "Result 9\n", + "Document Path: article_244_522793638848069728\n", + "Relevance Score: 0.4395\n", + "Title: Prince Harry: \"Les médecins ont recommandé au roi de ne plus parler à son fils\", Stéphane Bern s’en mêle\n", + "Author: Clément Garin\n", + "Publication Date: Sat, 03 May 2025 07:50:00 +0000\n", + "Categories: People, Royauté, Cancer, Clash, Famille royale britannique\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n", + "Result 10\n", + "Document Path: article_204_5853657413749066345\n", + "Relevance Score: 0.4209\n", + "Title: Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu\n", + "Author: Elisabeth Sall\n", + "Publication Date: Tue, 22 Jul 2025 07:45:00 +0000\n", + "Categories: People, Royauté, Famille royale britannique, Mort, Prince Harry\n", + "Source URL: https://www.public.fr/people/familles-royales/feed\n", + "File URL: [Available - hidden for security]\n", + "\n", + "--------------------------------------------------\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'path': 'article_204_2420503972517589745',\n", 
+ " 'score': 1.1434071251552826,\n", + " 'file_url': '[Available - hidden for security]',\n", + " 'metadata': {'type': 'rss_article',\n", + " 'title': 'Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu',\n", + " 'creator': 'Elisabeth Sall',\n", + " 'pub_date': 'Tue, 22 Jul 2025 07:45:00 +0000',\n", + " 'categories': 'People, Royauté, Famille royale britannique, Mort, Prince Harry',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}},\n", + " {'path': 'article_244_4511532550014461900',\n", + " 'score': 1.0141609646223106,\n", + " 'file_url': 'Available - hidden for security',\n", + " 'metadata': {'type': 'rss_article',\n", + " 'title': 'Prince Harry: \"Les médecins ont recommandé au roi de ne plus parler à son fils\", Stéphane Bern s’en mêle',\n", + " 'creator': 'Clément Garin',\n", + " 'pub_date': 'Sat, 03 May 2025 07:50:00 +0000',\n", + " 'categories': 'People, Royauté, Cancer, Clash, Famille royale britannique',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}},\n", + " {'path': 'article_204_5642323426295626288',\n", + " 'score': 0.865620797153619,\n", + " 'file_url': 'Available - hidden for security',\n", + " 'metadata': {'type': 'rss_article',\n", + " 'title': 'Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu',\n", + " 'creator': 'Elisabeth Sall',\n", + " 'pub_date': 'Tue, 22 Jul 2025 07:45:00 +0000',\n", + " 'categories': 'People, Royauté, Famille royale britannique, Mort, Prince Harry',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}},\n", + " {'path': 'article_244_-4945558259102855923',\n", + " 'score': 0.7137316794271147,\n", + " 'file_url': 'Available - hidden for security',\n", + " 'metadata': {'type': 'rss_article',\n", + " 'title': 'Prince Harry: \"Les médecins ont recommandé au roi de ne plus parler à son fils\", Stéphane Bern s’en mêle',\n", + " 'creator': 'Clément Garin',\n", + " 'pub_date': 'Sat, 03 May 2025 07:50:00 +0000',\n", + " 'categories': 'People, Royauté, Cancer, Clash, Famille royale britannique',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}},\n", + " {'path': 'article_204_-5586505152836586607',\n", + " 'score': 0.5834117259799423,\n", + " 'file_url': 'Available - hidden for security',\n", + " 'metadata': {'type': 'rss_article',\n", + " 'title': 'Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu',\n", + " 'creator': 'Elisabeth Sall',\n", + " 'pub_date': 'Tue, 22 Jul 2025 07:45:00 +0000',\n", + " 'categories': 'People, Royauté, Famille royale britannique, Mort, Prince Harry',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}},\n", + " {'path': 'article_204_4431767949249707685',\n", + " 'score': 0.5343611770621242,\n", + " 'file_url': 'Available - hidden for security',\n", + " 'metadata': {'type': 'rss_article',\n", + " 'title': 'Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu',\n", + " 'creator': 'Elisabeth Sall',\n", + " 'pub_date': 'Tue, 22 Jul 2025 07:45:00 +0000',\n", + " 'categories': 'People, Royauté, Famille royale britannique, Mort, Prince Harry',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}},\n", + " {'path': 'article_204_-7449109881057781290',\n", + " 'score': 0.49244116519499875,\n", + " 'file_url': 'Available - hidden for security',\n", + " 'metadata': {'type': 'rss_article',\n", + " 
'title': 'Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu',\n", + " 'creator': 'Elisabeth Sall',\n", + " 'pub_date': 'Tue, 22 Jul 2025 07:45:00 +0000',\n", + " 'categories': 'People, Royauté, Famille royale britannique, Mort, Prince Harry',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}},\n", + " {'path': 'article_244_1906632056819888007',\n", + " 'score': 0.46429313152067825,\n", + " 'file_url': 'Available - hidden for security',\n", + " 'metadata': {'type': 'rss_article',\n", + " 'title': 'Prince Harry: \"Les médecins ont recommandé au roi de ne plus parler à son fils\", Stéphane Bern s’en mêle',\n", + " 'creator': 'Clément Garin',\n", + " 'pub_date': 'Sat, 03 May 2025 07:50:00 +0000',\n", + " 'categories': 'People, Royauté, Cancer, Clash, Famille royale britannique',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}},\n", + " {'path': 'article_244_522793638848069728',\n", + " 'score': 0.43949404810417025,\n", + " 'file_url': 'Available - hidden for security',\n", + " 'metadata': {'type': 'rss_article',\n", + " 'title': 'Prince Harry: \"Les médecins ont recommandé au roi de ne plus parler à son fils\", Stéphane Bern s’en mêle',\n", + " 'creator': 'Clément Garin',\n", + " 'pub_date': 'Sat, 03 May 2025 07:50:00 +0000',\n", + " 'categories': 'People, Royauté, Cancer, Clash, Famille royale britannique',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}},\n", + " {'path': 'article_204_5853657413749066345',\n", + " 'score': 0.42092474401628305,\n", + " 'file_url': 'Available - hidden for security',\n", + " 'metadata': {'type': 'rss_article',\n", + " 'title': 'Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu',\n", + " 'creator': 'Elisabeth Sall',\n", + " 'pub_date': 'Tue, 22 Jul 2025 07:45:00 +0000',\n", + " 'categories': 'People, Royauté, Famille royale britannique, Mort, Prince Harry',\n", + " 'source_url': 'https://www.public.fr/people/familles-royales/feed'}}]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"famille royale\"\n", + "search_type = \"documents\"\n", + "k = 10\n", + "reranker = \"zerank-1-small\"\n", + "\n", + "manager = ZeroEntropyArticleManager(collection)\n", + "await manager.search_articles(\n", + " query=query,\n", + " search_type=search_type,\n", + " k=k,\n", + " reranker=reranker,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/guides/semantic_search_over_articles/docker-compose.yml b/guides/semantic_search_over_articles/docker-compose.yml new file mode 100644 index 0000000..4a7b061 --- /dev/null +++ b/guides/semantic_search_over_articles/docker-compose.yml @@ -0,0 +1,38 @@ +# docker-compose.yml +version: '3.8' + +services: + gossip-search: + build: + context: . 
+ dockerfile: Dockerfile + container_name: zeroentropy-gossip-search + ports: + - "8501:8501" + environment: + - ZEROENTROPY_API_KEY=${ZEROENTROPY_API_KEY} + - PYTHONPATH=/app + env_file: + - .env + volumes: + - ./data:/app/data + - ./logs:/app/logs + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + + # Add a reverse proxy + nginx: + image: nginx:alpine + container_name: gossip-search-proxy + ports: + - "80:80" + volumes: + - ./nginx.conf:/etc/nginx/conf.d/default.conf + depends_on: + - gossip-search + restart: unless-stopped \ No newline at end of file diff --git a/guides/semantic_search_over_articles/frontend/streamlit_app_ze.py b/guides/semantic_search_over_articles/frontend/streamlit_app_ze.py new file mode 100644 index 0000000..f73e593 --- /dev/null +++ b/guides/semantic_search_over_articles/frontend/streamlit_app_ze.py @@ -0,0 +1,352 @@ +# streamlit_app.py +import os +import sys +import asyncio +import streamlit as st +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Set the working directory to the backend +current_dir = os.path.dirname(os.path.abspath(__file__)) +backend_path = os.path.abspath(os.path.join(current_dir, "../..", "backend")) +sys.path.append(backend_path) + +# Internal imports +from backend.search_ze import ZeroEntropyArticleSearcher # noqa: E402 +from backend.utils_ze import ZeroEntropyUtils # noqa: E402 + + +# Cache the searcher and utils to avoid reinitializing +@st.cache_resource +def get_searcher_and_utils(collection_name="articles"): + """Initialize and cache ZeroEntropy searcher and utils""" + searcher = ZeroEntropyArticleSearcher(collection_name) + utils = ZeroEntropyUtils(collection_name) + return searcher, utils + + +async def get_search_results( + query: str, + search_type: str, + k: int, + collection_name: str, + filter_creator: str = None, + filter_category: str = None, + reranker: str = "zerank-1-small", +): + """Async search function that uses ZeroEntropy backend functionality""" + searcher, utils = get_searcher_and_utils(collection_name) + + # Prepare filter if specified + filter_dict = {} + if filter_creator: + filter_dict["creator"] = {"$eq": filter_creator} + if filter_category: + filter_dict["categories"] = {"$eq": filter_category} + filter_dict = filter_dict if filter_dict else None + + # Perform search based on type + if search_type == "documents": + results = await searcher.search_documents( + query=query, + k=k, + filter_dict=filter_dict, + reranker=reranker, + ) + elif search_type == "snippets": + results = await searcher.search_snippets( + query=query, + k=k, + filter_dict=filter_dict, + reranker=reranker, + ) + elif search_type == "pages": + results = await searcher.search_pages( + query=query, k=k, filter_dict=filter_dict + ) + elif search_type == "advanced": + results = await utils.search_and_rerank( + query=query, k=k * 2, rerank_top_n=k # Get more documents initially + ) + else: + results = [] + + return results, search_type + + +def run_async_search( + query: str, + search_type: str, + k: int, + collection_name: str, + filter_creator: str = None, + filter_category: str = None, + reranker: str = "zerank-1-small", +): + """Wrapper to run async search in Streamlit""" + # Create new event loop for this thread + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Run the async function + results, search_type = loop.run_until_complete( + get_search_results( + query, + 
search_type, + k, + collection_name, + filter_creator, + filter_category, + reranker, + ) + ) + + loop.close() + return results, search_type + + +def display_document_results(results): + """Display document search results""" + for i, result in enumerate(results, 1): + with st.container(): + col1, col2 = st.columns([3, 1]) + + with col1: + metadata = result.get("metadata", {}) + title = metadata.get("title", "N/A") + + # Make title clickable if file_url exists + if result.get("file_url"): + st.markdown(f"### [{title}]({result['file_url']})") + else: + st.markdown(f"### {title}") + + st.markdown(f"**Author:** {metadata.get('creator', 'N/A')}") + st.markdown(f"**Publication Date:** {metadata.get('pub_date', 'N/A')}") + st.markdown(f"**Categories:** {metadata.get('categories', 'N/A')}") + + if metadata.get("source_url"): + st.markdown(f"**Source:** {metadata.get('source_url', 'N/A')}") + + with col2: + st.metric("Relevance Score", f"{result['score']:.3f}") + if result.get("rerank_score"): + st.metric("Rerank Score", f"{result['rerank_score']:.3f}") + + st.markdown("---") + + +def display_snippet_results(results): + """Display snippet search results""" + for i, result in enumerate(results, 1): + with st.container(): + col1, col2 = st.columns([3, 1]) + + with col1: + metadata = result.get("metadata", {}) + st.markdown(f"### Snippet {i}") + st.markdown(f"**From:** {metadata.get('title', 'N/A')}") + st.markdown(f"**Author:** {metadata.get('creator', 'N/A')}") + + # Show snippet content + if result.get("content"): + with st.expander("Show Content", expanded=True): + st.write(result["content"]) + + with col2: + st.metric("Relevance Score", f"{result['score']:.3f}") + st.write( + f"**Char Range:** {result.get('start_index', 0)}-{result.get('end_index', 0)}" + ) + st.write(f"**Page Span:** {result.get('page_span', [])}") + + st.markdown("---") + + +def display_page_results(results): + """Display page search results""" + for i, result in enumerate(results, 1): + with st.container(): + col1, col2 = st.columns([3, 1]) + + with col1: + metadata = result.get("metadata", {}) + st.markdown(f"### Page {i}") + st.markdown(f"**From:** {metadata.get('title', 'N/A')}") + st.markdown(f"**Page Index:** {result.get('page_index', 0)}") + + # Show page content + if result.get("content"): + with st.expander("Show Content", expanded=False): + st.write( + result["content"][:500] + "..." 
+ if len(result["content"]) > 500 + else result["content"] + ) + + with col2: + st.metric("Relevance Score", f"{result['score']:.3f}") + + st.markdown("---") + + +def display_advanced_results(results): + """Display advanced search results with reranking""" + for i, result in enumerate(results, 1): + with st.container(): + col1, col2 = st.columns([3, 1]) + + with col1: + metadata = result.get("metadata", {}) + title = metadata.get("title", "N/A") + + st.markdown(f"### {title}") + st.markdown(f"**Author:** {metadata.get('creator', 'N/A')}") + st.markdown(f"**Publication Date:** {metadata.get('pub_date', 'N/A')}") + st.markdown(f"**Categories:** {metadata.get('categories', 'N/A')}") + + with col2: + st.metric("Original Score", f"{result.get('original_score', 0):.3f}") + st.metric("Rerank Score", f"{result.get('rerank_score', 0):.3f}") + + st.markdown("---") + + +async def get_collection_status(collection_name): + """Get collection status""" + try: + searcher, _ = get_searcher_and_utils(collection_name) + status = await searcher.get_collection_status() + return status + except Exception as e: + st.error(f"Error getting status: {str(e)}") + return None + + +def run_async_status(collection_name): + """Wrapper to run async status check in Streamlit""" + try: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + status = loop.run_until_complete(get_collection_status(collection_name)) + loop.close() + return status + except Exception as e: + st.error(f"Error getting status: {str(e)}") + return None + + +def main(): + st.set_page_config( + page_title="ZeroEntropy Gossip Search", layout="wide", page_icon="📰" + ) + + st.title("📰 RAG Gossip Semantic Search with ZeroEntropy") + st.write( + "Search for the latest articles from VSD and Public using advanced AI-powered retrieval." 
+ ) + + # Sidebar for configuration + with st.sidebar: + st.header("🔧 Search Configuration") + + collection_name = st.text_input("Collection Name", value="my_articles") + + search_type = st.selectbox( + "Search Type", + options=["documents", "snippets", "pages", "advanced"], + index=0, + help="Choose the type of search to perform", + ) + + k = st.slider("Number of results", min_value=1, max_value=20, value=5) + + # Advanced options + with st.expander("🔍 Advanced Options"): + reranker = st.selectbox( + "Reranker Model", options=["zerank-1-small"], index=0 + ) + + filter_creator = st.text_input( + "Filter by Creator", placeholder="e.g., Arno Crampont" + ) + filter_category = st.text_input( + "Filter by Category", placeholder="e.g., TPMP" + ) + + # Collection status + if st.button("📊 Check Collection Status"): + with st.spinner("Getting status..."): + status = run_async_status(collection_name) + if status: + st.success("✅ Collection Status") + st.write(f"**Total Documents:** {status.num_documents}") + st.write(f"**Indexed:** {status.num_indexed_documents}") + st.write(f"**Parsing:** {status.num_parsing_documents}") + st.write(f"**Indexing:** {status.num_indexing_documents}") + st.write(f"**Failed:** {status.num_failed_documents}") + + # Main search interface + query = st.text_input( + "🔍 Enter your search query:", + value="", + placeholder="e.g., TPMP, famille royale, célébrités...", + ) + + if st.button("Search for a keyword", type="primary"): + if query.strip(): + with st.spinner(f"Searching with {search_type} mode..."): + results, result_type = run_async_search( + query=query, + search_type=search_type, + k=k, + collection_name=collection_name, + filter_creator=filter_creator if filter_creator else None, + filter_category=filter_category if filter_category else None, + reranker=reranker, + ) + + if results: + st.success(f"✅ Found {len(results)} results for '{query}'") + + # Display results based on search type + if result_type == "documents": + display_document_results(results) + elif result_type == "snippets": + display_snippet_results(results) + elif result_type == "pages": + display_page_results(results) + elif result_type == "advanced": + display_advanced_results(results) + + else: + st.warning("❌ No results found for your query.") + else: + st.error("⚠️ Please enter a valid query.") + + # Help section + with st.expander("ℹ️ How to use this app"): + st.markdown( + """ + ### Search Types: + - **Documents**: Search entire articles for the most relevant matches + - **Snippets**: Find specific text snippets within articles + - **Pages**: Search individual pages of documents + - **Advanced**: Uses reranking for improved relevance + + ### Filters: + - **Creator**: Filter by article author + - **Category**: Filter by article category + + ### Tips: + - Use specific keywords for better results + - Try different search types for different use cases + - Use filters to narrow down results + """ + ) + + +if __name__ == "__main__": + main()
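
For readers who want to exercise the backend searcher that the Streamlit frontend above wraps, here is a minimal standalone sketch. It is not part of the patch: it assumes it is run from the guide's root directory (guides/semantic_search_over_articles/), that ZEROENTROPY_API_KEY is available via the environment or .env, and that the "my_articles" collection has already been indexed by the demo notebook. It reuses the same ZeroEntropyArticleSearcher class, search_documents() signature, and metadata filter format that streamlit_app_ze.py calls.

    # sketch_search.py -- illustrative only, not included in this patch
    import asyncio

    from dotenv import load_dotenv

    # Same backend import the Streamlit frontend uses
    from backend.search_ze import ZeroEntropyArticleSearcher


    async def main() -> None:
        load_dotenv()  # pick up ZEROENTROPY_API_KEY, as the frontend does
        searcher = ZeroEntropyArticleSearcher("my_articles")

        # Same call signature and filter format used in streamlit_app_ze.py;
        # pass filter_dict=None for an unfiltered query like the notebook's.
        results = await searcher.search_documents(
            query="famille royale",
            k=5,
            filter_dict={"creator": {"$eq": "Elisabeth Sall"}},
            reranker="zerank-1-small",
        )

        # Result shape as shown in the demo notebook output: path, score, metadata
        for rank, result in enumerate(results, 1):
            metadata = result.get("metadata", {})
            print(f"{rank}. [{result['score']:.3f}] {metadata.get('title', 'N/A')}")


    if __name__ == "__main__":
        asyncio.run(main())

Swapping search_documents for search_snippets or search_pages mirrors the other search modes exposed in the app's sidebar.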