From 96ff796fc5c02da40ffee9bba192c140ab0da281 Mon Sep 17 00:00:00 2001 From: ajosh0504 Date: Fri, 10 Oct 2025 23:41:40 +0000 Subject: [PATCH 1/9] Initial commit --- .../contextual_chunk_embedding.ipynb | 635 ++++++++++++++++++ 1 file changed, 635 insertions(+) create mode 100644 notebooks/advanced_techniques/contextual_chunk_embedding.ipynb diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb new file mode 100644 index 0000000..5e9414d --- /dev/null +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -0,0 +1,635 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b5718b88-0de5-4874-8a9b-e9cad42d1a86", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mongodb-developer/GenAI-Showcase/blob/main/notebooks/advanced_techniques/contextual_chunk_embeddings.ipynb)\n", + "\n", + "[![View Article](https://img.shields.io/badge/View%20Article-blue)](https://www.mongodb.com/developer/products/atlas/contextual-chunk-embeddings/?utm_campaign=devrel&utm_source=cross-post&utm_medium=organic_social&utm_content=https%3A%2F%2Fgithub.com%2Fmongodb-developer%2FGenAI-Showcase&utm_term=apoorva.joshi)" + ] + }, + { + "cell_type": "markdown", + "id": "21c19fbe-e42e-4a46-9928-f439eb54caf2", + "metadata": {}, + "source": [ + "## Step 1: Install required libraries\n", + "\n", + "- **datasets**: Python library to get access to datasets available on Hugging Face Hub\n", + "- **pdfplumber**: Python library to extract text and other content from PDF files\n", + "- **voyageai**: Python client for the Voyage AI API, used here to generate embeddings\n", + "- **pymongo**: Python driver for MongoDB" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "91a9f9cb-d865-4451-b507-63be68291e0b", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -qU datasets pdfplumber langchain-text-splitters tiktoken voyageai pymongo " + ] + }, + { + "cell_type": "markdown", + "id": 
"e8a58c57-6b0d-4b72-aa60-325605a8ce6a", + "metadata": {}, + "source": [ + "## Step 2: Setup prerequisites\n", + "\n", + "* **Voyage AI**\n", + " * [**Obtain a Voyage AI API key**](https://dashboard.voyageai.com/organization/api-keys)\n", + "\n", + "* **MongoDB**\n", + " * **Register for a [free MongoDB Atlas account](https://www.mongodb.com/cloud/atlas/register)**\n", + " * **Create a database cluster**: Once you register and sign into your Atlas account for the first time, you will be directed to the Cluster Deployment page.\n", + " * Select the _Free_ option to create a free tier cluster.\n", + " * Click _Create Deployment_ to create the cluster.\n", + " * In the modal that appears, click _Create database user_. Then click _Choose a connection method_.\n", + " * In the next screen, click _Drivers_.\n", + " * Next, copy the connection string (starts with `mongodb+srv://`) to a safe place.\n", + " * Click _Done_.\n", + " * **Allow Access from anywhere**: To connect to your MongoDB cluster from this notebook, you will need to open up network access to your cluster.\n", + " * From the side navigation bar in the Atlas UI, select _Security_ > _Network Access_.\n", + " * On the screen that appears, click _Add IP Address_.\n", + " * In the modal that appears, click _Allow Access From Anywhere_ and click _Confirm_.\n", + "\n", + "NOTE: Opening access to your MongoDB clusters from anywhere is not recommended in production environments. 
We are just doing it for easy access here.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d413f794-23b8-4b1b-98ae-2443a53457df", + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "import voyageai\n", + "from pymongo import MongoClient" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "bf6ff564-053c-4d2b-8a38-6a0b1a17a85e", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter your VoyageAI API key: ········\n" + ] + } + ], + "source": [ + "# Set Voyage AI API Key as an environment variable\n", + "os.environ[\"VOYAGE_API_KEY\"] = getpass.getpass(\"Enter your VoyageAI API key:\")\n", + "# Initialize the Voyage AI client\n", + "voyage_client = voyageai.Client()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "94332d4b-63ea-49b9-9802-ac18b59a12ca", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter your MongoDB URI: ········\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ok': 1.0,\n", + " '$clusterTime': {'clusterTime': Timestamp(1760126544, 1),\n", + " 'signature': {'hash': b'\\xb9$\\x83\\x89y\\xb0\\xfdF\\x10V\\x81n\\xa5\\xb7odn\\xab[\\x10',\n", + " 'keyId': 7522922054039896066}},\n", + " 'operationTime': Timestamp(1760126544, 1)}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set your MongoDB connection string\n", + "MONGODB_URI = getpass.getpass(\"Enter your MongoDB URI: \")\n", + "# Initialize the MongoDB client\n", + "mongodb_client = MongoClient(\n", + " MONGODB_URI, appname=\"devrel.showcase.contextual_embeddings_tutorial\"\n", + ")\n", + "mongodb_client.admin.command(\"ping\")" + ] + }, + { + "cell_type": "markdown", + "id": "4df97e59-d542-4851-81fe-d6a38988e609", + "metadata": {}, + "source": [ + "## Step 3: Download a dataset" + ] + }, + { + "cell_type": "code", + 
"execution_count": 22, + "id": "3ec450d7-6d6f-42ec-ab7b-c007fd0c1c68", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Pdf, load_dataset\n", + "\n", + "# Download a dataset from Hugging Face\n", + "docs = load_dataset(\"MongoDB/legal-docs\", split=\"train\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "2bdac794-1454-42d0-8c4e-fafce420042d", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the first PDF in the dataset\n", + "pdf = docs[0][\"pdf\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "f95f5434-9acf-491f-bca5-02b61fde9bd2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "40" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the number of pages in the PDF\n", + "len(pdf.pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "be8f15a9-573d-48d5-86a9-6c876d3a4aa1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Exhibit 10.2\\nExecution Version\\nINTELLECTUAL PROPERTY AGREEMENT\\nThis INTELLECTUAL PROPERTY AGREEMENT (this “Agreement”), dated as of December 31, 2018 (the “Effective Date”) is entered into by and\\nbetween Armstrong Flooring, Inc., a Delaware corporation (“Seller”) and AFI Licensing LLC, a Delaware limited liability company (“Licensing” and\\ntogether with Seller, “Arizona”) and AHF Holding, Inc. 
(formerly known as Tarzan HoldCo, Inc.), a Delaware corporation (“Buyer”) and Armstrong\\nHardwood Flooring Company, a Tennessee corporation (the “Company” and together with Buyer the “Buyer Entities”) (each of Arizona on the one hand\\nand the Buyer Entities on the other hand, a “Party” and collectively, the “Parties”).\\nWHEREAS, Seller and Buyer have entered into that certain Stock Purchase Agreement, dated November 14, 2018 (the “Stock Purchase\\nAgreement”); WHEREAS, pursuant to the Stock Purchase Agreement, Seller has agreed to sell and transfer, and Buyer has agreed to purchase and\\nacquire, all of Seller’s right, title and interest in and to Armstrong Wood Products, Inc., a Delaware corporation (“AWP”) and its Subsidiaries, the\\nCompany and HomerWood Hardwood Flooring Company, a Delaware corporation (“HHFC,” and together with the Company, the “Company\\nSubsidiaries” and together with AWP, the “Company Entities” and each a “Company Entity”) by way of a purchase by Buyer and sale by Seller of the\\nShares, all upon the terms and condition set forth therein;\\nWHEREAS, Arizona owns certain Copyrights, Know-How, Patents and Trademarks which may be used in the Company Field, and in connection\\nwith the transactions contemplated by the Stock Purchase Agreement the Company desires to acquire all of Arizona’s right, title and interest in and to\\nsuch Intellectual Property used exclusively in the Company Field, and obtain a license from Arizona to use other such Intellectual Property on the terms\\nand subject to the conditions set forth herein;\\nWHEREAS, Seller is signatory to the Trademark License Agreement pursuant to which Seller obtains a license to the Arizona Licensed\\nTrademarks;\\nWHEREAS, the Company desires to obtain a sublicense to use the Arizona Licensed Trademarks in the Company Field;\\nWHEREAS, Arizona has obtained consent from all counterparties to the Trademark License Agreement to grant to the Company the sublicenses\\nto the Arizona 
Licensed Trademarks included in this Agreement; and\\nWHEREAS, the Company Entities own certain Copyrights and Know-How which may be used in the Arizona Field, and in connection with the\\ntransactions contemplated by the Stock Purchase Agreement, Arizona desires to obtain a license from the Company Entities to use such Intellectual\\nProperty on the terms and subject to the conditions set forth herein.\\nNOW, THEREFORE, in consideration of the foregoing and the mutual agreements, provisions and covenants contained in this Agreement, and\\nfor other good and valuable consideration, the receipt and sufficiency of which are hereby acknowledged, the Parties hereby agree as follows:\\nSource: ARMSTRONG FLOORING, INC., 8-K, 1/7/2019'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Preview the first page in the PDF\n", + "pdf.pages[0].extract_text()" + ] + }, + { + "cell_type": "markdown", + "id": "95ef4336-38a5-485c-81ee-075692e3dab5", + "metadata": {}, + "source": [ + "## Step 4: Chunk the PDF content" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "b45e326c-da92-4f2e-9cc7-dd32bd72c2e4", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_text_splitters import RecursiveCharacterTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e8e3e365-7393-47bc-90e2-4df6fad34c05", + "metadata": {}, + "outputs": [], + "source": [ + "separators = [\"\\n\\n\", \"\\n\", \" \", \"\", \"#\", \"##\", \"###\"]\n", + "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " model_name=\"gpt-4\", separators=separators, chunk_size=200, chunk_overlap=0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "ba9771e9-c8ee-4cb8-a6fc-d3a14292e0b3", + "metadata": {}, + "outputs": [], + "source": [ + "chunked_docs = []\n", + "# Iterate through the documents\n", + "for doc_id, doc in enumerate(docs):\n", + " pages = 
doc[\"pdf\"].pages\n", + " # Keep track of chunk IDs per document\n", + " chunk_id = 0\n", + " # Iterate through the pages in each document\n", + " for page in pages:\n", + " chunks = text_splitter.split_text(page.extract_text())\n", + " for chunk in chunks:\n", + " chunked_docs.append(\n", + " {\"chunk\": chunk, \"chunk_id\": chunk_id, \"doc_id\": doc_id}\n", + " )\n", + " chunk_id += 1" + ] + }, + { + "cell_type": "markdown", + "id": "a99b4582-0aa4-4783-80ad-7e689a0d04e4", + "metadata": {}, + "source": [ + "## Step 5: Embed the chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "76c96323-c60c-42a0-883c-e3e9e0746949", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "47073960-ce27-4331-9447-9c69c46e8bb9", + "metadata": {}, + "outputs": [], + "source": [ + "def get_std_embeddings(input: List[str], input_type: str) -> List[float]:\n", + " \"\"\"\n", + " Generate context-agnostic embeddings.\n", + "\n", + " Args:\n", + " input (List[str]): List of document chunks or query wrapped in a list\n", + " input_type: Either \"document\" or \"query\"\n", + "\n", + " Returns:\n", + " List[float]: List of embedding vectors\n", + " \"\"\"\n", + " response = voyage_client.embed(input, model=\"voyage-3-large\", input_type=input_type)\n", + " return response.embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "0ce67a4e-8474-4610-a4af-b43b4a397e05", + "metadata": {}, + "outputs": [], + "source": [ + "def get_contextualized_embeddings(\n", + " input: List[List[str]], input_type: str\n", + ") -> List[float]:\n", + " \"\"\"\n", + " Generate contextualized chunk embeddings.\n", + "\n", + " Args:\n", + " input (List[List[str]]): List of document chunks or query wrapped in a list of lists\n", + " input_type: Either \"document\" or \"query\"\n", + "\n", + " Returns:\n", + " List[float]: List of embedding vectors\n", + " \"\"\"\n", + " 
response = voyage_client.contextualized_embed(\n", + " input, model=\"voyage-context-3\", input_type=input_type\n", + " )\n", + " return [emb for r in response.results for emb in r.embeddings]" + ] + }, + { + "cell_type": "markdown", + "id": "dedb68ed-43d5-4aee-8166-8c032bb41f49", + "metadata": {}, + "source": [ + "## Step 6: Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "c3e29bf9-4170-4109-8c0b-c5c4d38523e7", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "a2914983-bafb-404a-a005-c70b9eeba327", + "metadata": {}, + "outputs": [], + "source": [ + "queries = [\n", + " {\n", + " \"question\": \"Which state’s law governs the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 44,\n", + " },\n", + " {\n", + " \"question\": \"Under the Armstrong Flooring-AHF Holding agreement, how long is the Arizona Trademark License Term?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 9,\n", + " },\n", + " {\n", + " \"question\": \"In the agreement between Armstrong Flooring, Inc. 
and AHF Holding, Inc., how long is the Diamond Trademark License Term?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 11,\n", + " },\n", + " {\n", + " \"question\": \"Where will disputes be resolved under the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 44,\n", + " },\n", + " {\n", + " \"question\": \"What happens if either party materially breaches the Armstrong-AHF Intellectual Property Agreement?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 35,\n", + " },\n", + " {\n", + " \"question\": \"When does Playa Hotels & Resorts' right of first offer expire?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 4,\n", + " },\n", + " {\n", + " \"question\": \"Which state’s law governs the agreement between Hyatt Franchising Latin America and Playa Hotels & Resorts B.V.?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 13,\n", + " },\n", + " {\n", + " \"question\": \"What countries can Hyatt Franchising Latin America and Playa develop Hyatt All-Inclusive Resorts in?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 1,\n", + " },\n", + " {\n", + " \"question\": \"How many years of hotel experience must arbitrators have under the Hyatt-Playa agreement?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 15,\n", + " },\n", + " {\n", + " \"question\": \"Where will arbitration take place for disputes under the Hyatt-Playa Strategic Alliance Agreement?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 15,\n", + " },\n", + " {\n", + " \"question\": \"When was the Quaker/Gulf Houghton non-compete agreement effective?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 0,\n", + " },\n", + " {\n", + " \"question\": \"Which state’s law governs the Quaker/Gulf Houghton non-competition agreement?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 18,\n", + " },\n", + " {\n", + " \"question\": \"What is the geographic scope of the Quaker/Gulf Houghton non-compete?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 9,\n", + " 
},\n", + " {\n", + " \"question\": \"What percentage of publicly traded securities can Gulf Houghton sellers own as passive investors?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 9,\n", + " },\n", + " {\n", + " \"question\": \"How long must former employees be terminated before Gulf Houghton sellers can hire them?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 12,\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "id": "766918f0-ffa3-4068-8a33-03e28f586924", + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_metrics(query, chunk_embds, embd_type, k):\n", + " if embd_type == \"standard\":\n", + " query_embd = get_std_embeddings([query[\"question\"]], \"query\")[0]\n", + " elif embd_type == \"contextual\":\n", + " query_embd = get_contextualized_embeddings([[query[\"question\"]]], \"query\")[0]\n", + " similarities = np.dot(chunk_embds, query_embd)\n", + " top_k_idxs = np.argsort(similarities)[::-1][:k]\n", + " top_k_docs = [chunked_docs[i] for i in top_k_idxs]\n", + " golden_rank = None\n", + " for rank, doc in enumerate(top_k_docs):\n", + " if doc[\"doc_id\"] == query[\"doc_id\"] and doc[\"chunk_id\"] == query[\"chunk_id\"]:\n", + " golden_rank = rank + 1\n", + " break\n", + "\n", + " recall = 1 if golden_rank else 0\n", + " return recall, golden_rank" + ] + }, + { + "cell_type": "markdown", + "id": "70ccdfe0-363e-40d5-b7a1-c96b5bf7afae", + "metadata": {}, + "source": [ + "### Standard Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "5e0bcbdb-53cd-495e-b39a-4c08529d6ba4", + "metadata": {}, + "outputs": [], + "source": [ + "std_embds = get_std_embeddings([record[\"chunk\"] for record in chunked_docs], \"document\")" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "id": "4222a5e2-45cd-4f33-9bdc-cf8b6a5330c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Which state’s law governs the Intellectual Property 
Agreement between Armstrong Flooring and AHF Holding?: 9\n", + "Under the Armstrong Flooring-AHF Holding agreement, how long is the Arizona Trademark License Term?: 4\n", + "In the agreement between Armstrong Flooring, Inc. and AHF Holding, Inc., how long is the Diamond Trademark License Term?: 2\n", + "Where will disputes be resolved under the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?: 7\n", + "What happens if either party materially breaches the Armstrong-AHF Intellectual Property Agreement?: 1\n", + "When does Playa Hotels & Resorts' right of first offer expire?: 5\n", + "Which state’s law governs the agreement between Hyatt Franchising Latin America and Playa Hotels & Resorts B.V.?: 3\n", + "What countries can Hyatt Franchising Latin America and Playa develop Hyatt All-Inclusive Resorts in?: 2\n", + "How many years of hotel experience must arbitrators have under the Hyatt-Playa agreement?: 1\n", + "Where will arbitration take place for disputes under the Hyatt-Playa Strategic Alliance Agreement?: 3\n", + "When was the Quaker/Gulf Houghton non-compete agreement effective?: 1\n", + "Which state’s law governs the Quaker/Gulf Houghton non-competition agreement?: 7\n", + "What is the geographic scope of the Quaker/Gulf Houghton non-compete?: 9\n", + "What percentage of publicly traded securities can Gulf Houghton sellers own as passive investors?: 1\n", + "How long must former employees be terminated before Gulf Houghton sellers can hire them?: 1\n", + "Mean recall: 100.00%\n", + "Mean reciprocal rank: 50.83%\n" + ] + } + ], + "source": [ + "recalls = []\n", + "reciprocal_ranks = []\n", + "for query in queries:\n", + " recall, rank = calculate_metrics(query, std_embds, \"standard\", 10)\n", + " recalls.append(recall)\n", + " print(f\"{query['question']}: {rank}\")\n", + " reciprocal_ranks.append(1 / rank if rank else 0.0)\n", + "\n", + "print(f\"Mean recall: {np.mean(recalls) * 100:.2f}%\")\n", + "print(f\"Mean reciprocal rank: 
{np.mean(reciprocal_ranks) * 100:.2f}%\")" + ] + }, + { + "cell_type": "markdown", + "id": "7c6f40f1-d46b-4d04-b0ff-3e95cfb2278d", + "metadata": {}, + "source": [ + "### Contextualized Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "f1b618d5-8fe0-4eb0-ac54-465ced3d2000", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "d0c2ea78-ce96-4743-9b2a-2bc3ad054a3e", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_docs = defaultdict(list)\n", + "for chunk in chunked_docs:\n", + " grouped_docs[chunk[\"doc_id\"]].append(chunk[\"chunk\"])\n", + "\n", + "chunks_by_doc = list(grouped_docs.values())" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "a3117edc-ec2b-481c-a8d7-479080d11118", + "metadata": {}, + "outputs": [], + "source": [ + "ctxt_embds = get_contextualized_embeddings(chunks_by_doc, \"document\")" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "id": "95d9c26f-be8c-4962-a437-8debf0cce199", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Which state’s law governs the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?: 1\n", + "Under the Armstrong Flooring-AHF Holding agreement, how long is the Arizona Trademark License Term?: 7\n", + "In the agreement between Armstrong Flooring, Inc. 
and AHF Holding, Inc., how long is the Diamond Trademark License Term?: None\n", + "Where will disputes be resolved under the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?: 2\n", + "What happens if either party materially breaches the Armstrong-AHF Intellectual Property Agreement?: 1\n", + "When does Playa Hotels & Resorts' right of first offer expire?: 5\n", + "Which state’s law governs the agreement between Hyatt Franchising Latin America and Playa Hotels & Resorts B.V.?: 1\n", + "What countries can Hyatt Franchising Latin America and Playa develop Hyatt All-Inclusive Resorts in?: 3\n", + "How many years of hotel experience must arbitrators have under the Hyatt-Playa agreement?: 1\n", + "Where will arbitration take place for disputes under the Hyatt-Playa Strategic Alliance Agreement?: 1\n", + "When was the Quaker/Gulf Houghton non-compete agreement effective?: 3\n", + "Which state’s law governs the Quaker/Gulf Houghton non-competition agreement?: 1\n", + "What is the geographic scope of the Quaker/Gulf Houghton non-compete?: 1\n", + "What percentage of publicly traded securities can Gulf Houghton sellers own as passive investors?: 1\n", + "How long must former employees be terminated before Gulf Houghton sellers can hire them?: 1\n", + "Mean recall: 93.33%\n", + "Mean reciprocal rank: 70.06%\n" + ] + } + ], + "source": [ + "recalls = []\n", + "ranks = []\n", + "reciprocal_ranks = []\n", + "for query in queries:\n", + " recall, rank = calculate_metrics(query, ctxt_embds, \"contextual\", 10)\n", + " recalls.append(recall)\n", + " print(f\"{query['question']}: {rank}\")\n", + " reciprocal_ranks.append(1 / rank if rank else 0.0)\n", + "\n", + "print(f\"Mean recall: {np.mean(recalls) * 100:.2f}%\")\n", + "print(f\"Mean reciprocal rank: {np.mean(reciprocal_ranks) * 100:.2f}%\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {} + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From fa5511b1ab8da69d62259572440b3245db762957 Mon Sep 17 00:00:00 2001 From: ajosh0504 Date: Thu, 16 Oct 2025 23:07:05 +0000 Subject: [PATCH 2/9] Eval done# Please enter the commit message for your changes. Lines starting --- .../contextual_chunk_embedding.ipynb | 136 +++++++++--------- ruff.toml | 1 + 2 files changed, 69 insertions(+), 68 deletions(-) diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb index 5e9414d..50ecdc5 100644 --- a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "91a9f9cb-d865-4451-b507-63be68291e0b", "metadata": {}, "outputs": [], @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 1, "id": "d413f794-23b8-4b1b-98ae-2443a53457df", "metadata": {}, "outputs": [], @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 2, "id": "bf6ff564-053c-4d2b-8a38-6a0b1a17a85e", "metadata": {}, "outputs": [ @@ -97,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 4, "id": "94332d4b-63ea-49b9-9802-ac18b59a12ca", "metadata": {}, "outputs": [ @@ -112,13 +112,13 @@ "data": { "text/plain": [ "{'ok': 1.0,\n", - " '$clusterTime': {'clusterTime': Timestamp(1760126544, 1),\n", - " 'signature': {'hash': b'\\xb9$\\x83\\x89y\\xb0\\xfdF\\x10V\\x81n\\xa5\\xb7odn\\xab[\\x10',\n", + " '$clusterTime': {'clusterTime': Timestamp(1760655633, 
1),\n", + " 'signature': {'hash': b'\\xf8Q\"\\x1e@6\\x99\\xed+R\\xc1\\xc4\\x06\\xfdz \\x8a\\xb8v\\x86',\n", " 'keyId': 7522922054039896066}},\n", - " 'operationTime': Timestamp(1760126544, 1)}" + " 'operationTime': Timestamp(1760655633, 1)}" ] }, - "execution_count": 20, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -143,12 +143,12 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 2, "id": "3ec450d7-6d6f-42ec-ab7b-c007fd0c1c68", "metadata": {}, "outputs": [], "source": [ - "from datasets import Pdf, load_dataset\n", + "from datasets import load_dataset\n", "\n", "# Download a dataset from Hugging Face\n", "docs = load_dataset(\"MongoDB/legal-docs\", split=\"train\")" @@ -156,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 3, "id": "2bdac794-1454-42d0-8c4e-fafce420042d", "metadata": {}, "outputs": [], @@ -167,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 4, "id": "f95f5434-9acf-491f-bca5-02b61fde9bd2", "metadata": {}, "outputs": [ @@ -177,7 +177,7 @@ "40" ] }, - "execution_count": 24, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -189,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 5, "id": "be8f15a9-573d-48d5-86a9-6c876d3a4aa1", "metadata": {}, "outputs": [ @@ -199,7 +199,7 @@ "'Exhibit 10.2\\nExecution Version\\nINTELLECTUAL PROPERTY AGREEMENT\\nThis INTELLECTUAL PROPERTY AGREEMENT (this “Agreement”), dated as of December 31, 2018 (the “Effective Date”) is entered into by and\\nbetween Armstrong Flooring, Inc., a Delaware corporation (“Seller”) and AFI Licensing LLC, a Delaware limited liability company (“Licensing” and\\ntogether with Seller, “Arizona”) and AHF Holding, Inc. 
(formerly known as Tarzan HoldCo, Inc.), a Delaware corporation (“Buyer”) and Armstrong\\nHardwood Flooring Company, a Tennessee corporation (the “Company” and together with Buyer the “Buyer Entities”) (each of Arizona on the one hand\\nand the Buyer Entities on the other hand, a “Party” and collectively, the “Parties”).\\nWHEREAS, Seller and Buyer have entered into that certain Stock Purchase Agreement, dated November 14, 2018 (the “Stock Purchase\\nAgreement”); WHEREAS, pursuant to the Stock Purchase Agreement, Seller has agreed to sell and transfer, and Buyer has agreed to purchase and\\nacquire, all of Seller’s right, title and interest in and to Armstrong Wood Products, Inc., a Delaware corporation (“AWP”) and its Subsidiaries, the\\nCompany and HomerWood Hardwood Flooring Company, a Delaware corporation (“HHFC,” and together with the Company, the “Company\\nSubsidiaries” and together with AWP, the “Company Entities” and each a “Company Entity”) by way of a purchase by Buyer and sale by Seller of the\\nShares, all upon the terms and condition set forth therein;\\nWHEREAS, Arizona owns certain Copyrights, Know-How, Patents and Trademarks which may be used in the Company Field, and in connection\\nwith the transactions contemplated by the Stock Purchase Agreement the Company desires to acquire all of Arizona’s right, title and interest in and to\\nsuch Intellectual Property used exclusively in the Company Field, and obtain a license from Arizona to use other such Intellectual Property on the terms\\nand subject to the conditions set forth herein;\\nWHEREAS, Seller is signatory to the Trademark License Agreement pursuant to which Seller obtains a license to the Arizona Licensed\\nTrademarks;\\nWHEREAS, the Company desires to obtain a sublicense to use the Arizona Licensed Trademarks in the Company Field;\\nWHEREAS, Arizona has obtained consent from all counterparties to the Trademark License Agreement to grant to the Company the sublicenses\\nto the Arizona 
Licensed Trademarks included in this Agreement; and\\nWHEREAS, the Company Entities own certain Copyrights and Know-How which may be used in the Arizona Field, and in connection with the\\ntransactions contemplated by the Stock Purchase Agreement, Arizona desires to obtain a license from the Company Entities to use such Intellectual\\nProperty on the terms and subject to the conditions set forth herein.\\nNOW, THEREFORE, in consideration of the foregoing and the mutual agreements, provisions and covenants contained in this Agreement, and\\nfor other good and valuable consideration, the receipt and sufficiency of which are hereby acknowledged, the Parties hereby agree as follows:\\nSource: ARMSTRONG FLOORING, INC., 8-K, 1/7/2019'" ] }, - "execution_count": 25, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -219,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 12, "id": "b45e326c-da92-4f2e-9cc7-dd32bd72c2e4", "metadata": {}, "outputs": [], @@ -229,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 13, "id": "e8e3e365-7393-47bc-90e2-4df6fad34c05", "metadata": {}, "outputs": [], @@ -242,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 14, "id": "ba9771e9-c8ee-4cb8-a6fc-d3a14292e0b3", "metadata": {}, "outputs": [], @@ -273,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 15, "id": "76c96323-c60c-42a0-883c-e3e9e0746949", "metadata": {}, "outputs": [], @@ -283,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 16, "id": "47073960-ce27-4331-9447-9c69c46e8bb9", "metadata": {}, "outputs": [], @@ -305,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 17, "id": "0ce67a4e-8474-4610-a4af-b43b4a397e05", "metadata": {}, "outputs": [], @@ -339,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 18, "id": 
"c3e29bf9-4170-4109-8c0b-c5c4d38523e7", "metadata": {}, "outputs": [], @@ -349,36 +349,36 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 19, "id": "a2914983-bafb-404a-a005-c70b9eeba327", "metadata": {}, "outputs": [], "source": [ "queries = [\n", " {\n", - " \"question\": \"Which state’s law governs the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?\",\n", + " \"question\": \"Which state’s law governs the agreement between Armstrong Flooring and AHF Holding?\",\n", " \"doc_id\": 0,\n", " \"chunk_id\": 44,\n", " },\n", " {\n", - " \"question\": \"Under the Armstrong Flooring-AHF Holding agreement, how long is the Arizona Trademark License Term?\",\n", + " \"question\": \"In the Armstrong-AHF agreement, how many days' notice is required to remedy a breach?\",\n", " \"doc_id\": 0,\n", " \"chunk_id\": 9,\n", " },\n", " {\n", - " \"question\": \"In the agreement between Armstrong Flooring, Inc. and AHF Holding, Inc., how long is the Diamond Trademark License Term?\",\n", + " \"question\": \"Where will disputes be resolved under the agreement between Armstrong Flooring and AHF Holding?\",\n", " \"doc_id\": 0,\n", - " \"chunk_id\": 11,\n", + " \"chunk_id\": 44,\n", " },\n", " {\n", - " \"question\": \"Where will disputes be resolved under the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?\",\n", + " \"question\": \"What happens if either party materially breaches the Armstrong-AHF Intellectual agreement?\",\n", " \"doc_id\": 0,\n", - " \"chunk_id\": 44,\n", + " \"chunk_id\": 35,\n", " },\n", " {\n", - " \"question\": \"What happens if either party materially breaches the Armstrong-AHF Intellectual Property Agreement?\",\n", + " \"question\": \"Under the Armstrong Flooring-AHF Holding agreement, what is the minimum logo size?\",\n", " \"doc_id\": 0,\n", - " \"chunk_id\": 35,\n", + " \"chunk_id\": 94,\n", " },\n", " {\n", " \"question\": \"When does Playa Hotels & Resorts' right of first 
offer expire?\",\n", @@ -401,22 +401,22 @@ " \"chunk_id\": 15,\n", " },\n", " {\n", - " \"question\": \"Where will arbitration take place for disputes under the Hyatt-Playa Strategic Alliance Agreement?\",\n", + " \"question\": \"Where will arbitration take place for disputes under the Hyatt-Playa agreement?\",\n", " \"doc_id\": 1,\n", " \"chunk_id\": 15,\n", " },\n", " {\n", - " \"question\": \"When was the Quaker/Gulf Houghton non-compete agreement effective?\",\n", + " \"question\": \"When was the Quaker/Gulf Houghton agreement effective?\",\n", " \"doc_id\": 2,\n", " \"chunk_id\": 0,\n", " },\n", " {\n", - " \"question\": \"Which state’s law governs the Quaker/Gulf Houghton non-competition agreement?\",\n", + " \"question\": \"Which state’s law governs the Quaker/Gulf Houghton agreement?\",\n", " \"doc_id\": 2,\n", " \"chunk_id\": 18,\n", " },\n", " {\n", - " \"question\": \"What is the geographic scope of the Quaker/Gulf Houghton non-compete?\",\n", + " \"question\": \"What is the geographic scope of the Quaker/Gulf Houghton agreement?\",\n", " \"doc_id\": 2,\n", " \"chunk_id\": 9,\n", " },\n", @@ -426,7 +426,7 @@ " \"chunk_id\": 9,\n", " },\n", " {\n", - " \"question\": \"How long must former employees be terminated before Gulf Houghton sellers can hire them?\",\n", + " \"question\": \"How long must before Gulf Houghton sellers can hire former employees?\",\n", " \"doc_id\": 2,\n", " \"chunk_id\": 12,\n", " },\n", @@ -435,7 +435,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 20, "id": "766918f0-ffa3-4068-8a33-03e28f586924", "metadata": {}, "outputs": [], @@ -468,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 21, "id": "5e0bcbdb-53cd-495e-b39a-4c08529d6ba4", "metadata": {}, "outputs": [], @@ -478,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 22, "id": "4222a5e2-45cd-4f33-9bdc-cf8b6a5330c4", "metadata": {}, "outputs": [ @@ -486,23 +486,23 @@ "name": 
"stdout", "output_type": "stream", "text": [ - "Which state’s law governs the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?: 9\n", - "Under the Armstrong Flooring-AHF Holding agreement, how long is the Arizona Trademark License Term?: 4\n", - "In the agreement between Armstrong Flooring, Inc. and AHF Holding, Inc., how long is the Diamond Trademark License Term?: 2\n", - "Where will disputes be resolved under the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?: 7\n", - "What happens if either party materially breaches the Armstrong-AHF Intellectual Property Agreement?: 1\n", + "Which state’s law governs the agreement between Armstrong Flooring and AHF Holding?: 1\n", + "In the Armstrong-AHF agreement, how many days' notice is required to remedy a breach?: None\n", + "Where will disputes be resolved under the agreement between Armstrong Flooring and AHF Holding?: None\n", + "What happens if either party materially breaches the Armstrong-AHF Intellectual agreement?: 1\n", + "Under the Armstrong Flooring-AHF Holding agreement, what is the minimum logo size?: 2\n", "When does Playa Hotels & Resorts' right of first offer expire?: 5\n", "Which state’s law governs the agreement between Hyatt Franchising Latin America and Playa Hotels & Resorts B.V.?: 3\n", "What countries can Hyatt Franchising Latin America and Playa develop Hyatt All-Inclusive Resorts in?: 2\n", "How many years of hotel experience must arbitrators have under the Hyatt-Playa agreement?: 1\n", - "Where will arbitration take place for disputes under the Hyatt-Playa Strategic Alliance Agreement?: 3\n", - "When was the Quaker/Gulf Houghton non-compete agreement effective?: 1\n", - "Which state’s law governs the Quaker/Gulf Houghton non-competition agreement?: 7\n", - "What is the geographic scope of the Quaker/Gulf Houghton non-compete?: 9\n", + "Where will arbitration take place for disputes under the Hyatt-Playa agreement?: 2\n", + "When was the 
Quaker/Gulf Houghton agreement effective?: 1\n", + "Which state’s law governs the Quaker/Gulf Houghton agreement?: 2\n", + "What is the geographic scope of the Quaker/Gulf Houghton agreement?: None\n", "What percentage of publicly traded securities can Gulf Houghton sellers own as passive investors?: 1\n", - "How long must former employees be terminated before Gulf Houghton sellers can hire them?: 1\n", - "Mean recall: 100.00%\n", - "Mean reciprocal rank: 50.83%\n" + "How long must before Gulf Houghton sellers can hire former employees?: 2\n", + "Mean recall: 80.00%\n", + "Mean reciprocal rank: 53.56%\n" ] } ], @@ -510,7 +510,7 @@ "recalls = []\n", "reciprocal_ranks = []\n", "for query in queries:\n", - " recall, rank = calculate_metrics(query, std_embds, \"standard\", 10)\n", + " recall, rank = calculate_metrics(query, std_embds, \"standard\", 5)\n", " recalls.append(recall)\n", " print(f\"{query['question']}: {rank}\")\n", " reciprocal_ranks.append(1 / rank if rank else 0.0)\n", @@ -529,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 23, "id": "f1b618d5-8fe0-4eb0-ac54-465ced3d2000", "metadata": {}, "outputs": [], @@ -539,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 24, "id": "d0c2ea78-ce96-4743-9b2a-2bc3ad054a3e", "metadata": {}, "outputs": [], @@ -553,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 25, "id": "a3117edc-ec2b-481c-a8d7-479080d11118", "metadata": {}, "outputs": [], @@ -563,7 +563,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 26, "id": "95d9c26f-be8c-4962-a437-8debf0cce199", "metadata": {}, "outputs": [ @@ -571,23 +571,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "Which state’s law governs the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?: 1\n", - "Under the Armstrong Flooring-AHF Holding agreement, how long is the Arizona Trademark License Term?: 7\n", - "In 
the agreement between Armstrong Flooring, Inc. and AHF Holding, Inc., how long is the Diamond Trademark License Term?: None\n", - "Where will disputes be resolved under the Intellectual Property Agreement between Armstrong Flooring and AHF Holding?: 2\n", - "What happens if either party materially breaches the Armstrong-AHF Intellectual Property Agreement?: 1\n", + "Which state’s law governs the agreement between Armstrong Flooring and AHF Holding?: 1\n", + "In the Armstrong-AHF agreement, how many days' notice is required to remedy a breach?: None\n", + "Where will disputes be resolved under the agreement between Armstrong Flooring and AHF Holding?: 2\n", + "What happens if either party materially breaches the Armstrong-AHF Intellectual agreement?: 1\n", + "Under the Armstrong Flooring-AHF Holding agreement, what is the minimum logo size?: 2\n", "When does Playa Hotels & Resorts' right of first offer expire?: 5\n", "Which state’s law governs the agreement between Hyatt Franchising Latin America and Playa Hotels & Resorts B.V.?: 1\n", "What countries can Hyatt Franchising Latin America and Playa develop Hyatt All-Inclusive Resorts in?: 3\n", "How many years of hotel experience must arbitrators have under the Hyatt-Playa agreement?: 1\n", - "Where will arbitration take place for disputes under the Hyatt-Playa Strategic Alliance Agreement?: 1\n", - "When was the Quaker/Gulf Houghton non-compete agreement effective?: 3\n", - "Which state’s law governs the Quaker/Gulf Houghton non-competition agreement?: 1\n", - "What is the geographic scope of the Quaker/Gulf Houghton non-compete?: 1\n", + "Where will arbitration take place for disputes under the Hyatt-Playa agreement?: 1\n", + "When was the Quaker/Gulf Houghton agreement effective?: 1\n", + "Which state’s law governs the Quaker/Gulf Houghton agreement?: 1\n", + "What is the geographic scope of the Quaker/Gulf Houghton agreement?: 2\n", "What percentage of publicly traded securities can Gulf Houghton sellers own as 
passive investors?: 1\n", - "How long must former employees be terminated before Gulf Houghton sellers can hire them?: 1\n", + "How long must before Gulf Houghton sellers can hire former employees?: 1\n", "Mean recall: 93.33%\n", - "Mean reciprocal rank: 70.06%\n" + "Mean reciprocal rank: 73.56%\n" ] } ], @@ -596,7 +596,7 @@ "ranks = []\n", "reciprocal_ranks = []\n", "for query in queries:\n", - " recall, rank = calculate_metrics(query, ctxt_embds, \"contextual\", 10)\n", + " recall, rank = calculate_metrics(query, ctxt_embds, \"contextual\", 5)\n", " recalls.append(recall)\n", " print(f\"{query['question']}: {rank}\")\n", " reciprocal_ranks.append(1 / rank if rank else 0.0)\n", diff --git a/ruff.toml b/ruff.toml index 4faecee..88ec02d 100644 --- a/ruff.toml +++ b/ruff.toml @@ -24,6 +24,7 @@ ignore = [ "B007", # Loop control variable `index` not used within loop body "B008", # Do not perform function call `File` in argument defaults "B904", # Within an `except` clause, raise exceptions with `raise ... 
from err`" + "RUF001", # String contains ambiguous `’` "RUF005", # Consider iterable unpacking instead of concatenation" "RUF015", # Prefer `next(iter(queries.items()))` over single element slice "F841", # Local variable `full_text_search_result` is assigned to but never used" From 2a1ad60150612c897b963a7a77b3269acce3048d Mon Sep 17 00:00:00 2001 From: ajosh0504 Date: Mon, 27 Oct 2025 23:14:51 +0000 Subject: [PATCH 3/9] Updating markdown --- .../contextual_chunk_embedding.ipynb | 81 +++++-------------- 1 file changed, 18 insertions(+), 63 deletions(-) diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb index 50ecdc5..a3eec6c 100644 --- a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -10,6 +10,16 @@ "[![View Article](https://img.shields.io/badge/View%20Article-blue)](https://www.mongodb.com/developer/products/atlas/contextual-chunk-embeddings/?utm_campaign=devrel&utm_source=cross-post&utm_medium=organic_social&utm_content=https%3A%2F%2Fgithub.com%2Fmongodb-developer%2FGenAI-Showcase&utm_term=apoorva.joshi)" ] }, + { + "cell_type": "markdown", + "id": "b93d31ee-ebe0-4393-88f9-7ce6b9dca241", + "metadata": {}, + "source": [ + "# Contextualized chunk embeddings: Combining local detail with global context\n", + "\n", + "This notebook shows you how to implement and evaluate Voyage AI's _voyage-context-3_ contextualized chunk embedding model." 
+ ] + }, { "cell_type": "markdown", "id": "21c19fbe-e42e-4a46-9928-f439eb54caf2", @@ -18,9 +28,10 @@ "## Step 1: Install required libraries\n", "\n", "- **datasets**: Python library to get access to datasets available on Hugging Face Hub\n", - "- **pdfplumber**: Python library to interact with OpenAI APIs\n", - "- **voyageai**:\n", - "- **pymongo**:" + "- **pdfplumber**: Python library to parse and analyze PDFs\n", + "- **langchain-text-splitters**: Text chunking utilities in LangChain\n", + "- **tiktoken**: Token counting and encoding library for GPT models\n", + "- **voyageai**: Python library to interact with Voyage AI's APIs" ] }, { @@ -30,7 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -qU datasets pdfplumber langchain-text-splitters tiktoken voyageai pymongo " + "!pip install -qU datasets pdfplumber langchain-text-splitters tiktoken voyageai" ] }, { @@ -40,24 +51,7 @@ "source": [ "## Step 2: Setup prerequisites\n", "\n", - "* **Voyage AI**\n", - " * [**Obtain a Voyage AI API key**](https://dashboard.voyageai.com/organization/api-keys)\n", - "\n", - "* **MongoDB**\n", - " * **Register for a [free MongoDB Atlas account](https://www.mongodb.com/cloud/atlas/register)**\n", - " * **Create a database cluster**: Once you register and sign into your Atlas account for the first time, you will be directed to the Cluster Deployment page.\n", - " * Select the _Free_ option to create a free tier cluster.\n", - " * Click _Create Deployment_ to create the cluster.\n", - " * In the modal that appears, click _Create database user_. 
Then click _Choose a connection method_.\n", - " * In the next screen, click _Drivers_.\n", - " * Next, copy the connection string (starts with `mongodb+srv://`) to a safe place.\n", - " * Click _Done_.\n", - " * **Allow Access from anywhere**: To connect to your MongoDB cluster from this notebook, you will need to open up network access to your cluster.\n", - " * From the side navigation bar in the Atlas UI, select _Security_ > _Network Access.\n", - " * On the screen that appears, click _Add IP Address_.\n", - " * In the modal that appears, click _Allow Acess From Anywhere_ and click _Confirm_.\n", - "\n", - "NOTE: Opening access to your MongoDB clusters from anywhere is not recommended in production environments. We are just doing it for easy access here.\n" + "Follow the step [here](https://dashboard.voyageai.com/organization/api-keys) to obtain a Voyage AI API key." ] }, { @@ -70,8 +64,7 @@ "import getpass\n", "import os\n", "\n", - "import voyageai\n", - "from pymongo import MongoClient" + "import voyageai" ] }, { @@ -95,44 +88,6 @@ "voyage_client = voyageai.Client()" ] }, - { - "cell_type": "code", - "execution_count": 4, - "id": "94332d4b-63ea-49b9-9802-ac18b59a12ca", - "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Enter your MongoDB URI: ········\n" - ] - }, - { - "data": { - "text/plain": [ - "{'ok': 1.0,\n", - " '$clusterTime': {'clusterTime': Timestamp(1760655633, 1),\n", - " 'signature': {'hash': b'\\xf8Q\"\\x1e@6\\x99\\xed+R\\xc1\\xc4\\x06\\xfdz \\x8a\\xb8v\\x86',\n", - " 'keyId': 7522922054039896066}},\n", - " 'operationTime': Timestamp(1760655633, 1)}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Set your MongoDB connection string\n", - "MONGODB_URI = getpass.getpass(\"Enter your MongoDB URI: \")\n", - "# Initialize the MongoDB client\n", - "mongodb_client = MongoClient(\n", - " MONGODB_URI, 
appname=\"devrel.showcase.contextual_embeddings_tutorial\"\n", - ")\n", - "mongodb_client.admin.command(\"ping\")" - ] - }, { "cell_type": "markdown", "id": "4df97e59-d542-4851-81fe-d6a38988e609", @@ -524,7 +479,7 @@ "id": "7c6f40f1-d46b-4d04-b0ff-3e95cfb2278d", "metadata": {}, "source": [ - "## Contextualized Embeddings" + "### Contextualized Embeddings" ] }, { From a7fac7fae6153c34f69fbffdb18258082366afd7 Mon Sep 17 00:00:00 2001 From: ajosh0504 Date: Mon, 27 Oct 2025 23:47:41 +0000 Subject: [PATCH 4/9] Nit --- notebooks/advanced_techniques/contextual_chunk_embedding.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb index a3eec6c..09b0890 100644 --- a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -93,7 +93,7 @@ "id": "4df97e59-d542-4851-81fe-d6a38988e609", "metadata": {}, "source": [ - "## Step 3: Download a dataset" + "## Step 3: Download the dataset" ] }, { From 8b999a2e0ad8bfe98bd1f4827346495cc6205e61 Mon Sep 17 00:00:00 2001 From: ajosh0504 Date: Tue, 28 Oct 2025 18:24:27 +0000 Subject: [PATCH 5/9] Changing types --- .../contextual_chunk_embedding.ipynb | 128 +++++++++++++++--- 1 file changed, 111 insertions(+), 17 deletions(-) diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb index 09b0890..5b2fcea 100644 --- a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -41,7 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -qU datasets pdfplumber langchain-text-splitters tiktoken voyageai" + "!pip install -qU datasets pdfplumber langchain-text-splitters voyageai" ] }, { @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 1, + 
"execution_count": 8, "id": "d413f794-23b8-4b1b-98ae-2443a53457df", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "id": "bf6ff564-053c-4d2b-8a38-6a0b1a17a85e", "metadata": {}, "outputs": [ @@ -98,10 +98,81 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "id": "3ec450d7-6d6f-42ec-ab7b-c007fd0c1c68", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c05936151bfb482e993d8342747bbed0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "README.md: 0%| | 0.00/190 [00:00 List[float]:\n", + "def get_std_embeddings(input: List[str], input_type: str) -> List[List[float]]:\n", " \"\"\"\n", " Generate context-agnostic embeddings.\n", "\n", @@ -252,7 +346,7 @@ " input_type: Either \"document\" or \"query\"\n", "\n", " Returns:\n", - " List[float]: List of embedding vectors\n", + " List[List[float]]: List of embedding vectors\n", " \"\"\"\n", " response = voyage_client.embed(input, model=\"voyage-3-large\", input_type=input_type)\n", " return response.embeddings" @@ -267,7 +361,7 @@ "source": [ "def get_contextualized_embeddings(\n", " input: List[List[str]], input_type: str\n", - ") -> List[float]:\n", + ") -> List[List[float]]:\n", " \"\"\"\n", " Generate contextualized chunk embeddings.\n", "\n", @@ -276,7 +370,7 @@ " input_type: Either \"document\" or \"query\"\n", "\n", " Returns:\n", - " List[float]: List of embedding vectors\n", + " List[List[float]]: List of embedding vectors\n", " \"\"\"\n", " response = voyage_client.contextualized_embed(\n", " input, model=\"voyage-context-3\", input_type=input_type\n", From 9f302e47f57e291be81209c85dfb80e7db707225 Mon Sep 17 00:00:00 2001 From: ajosh0504 Date: Tue, 28 Oct 2025 19:46:23 +0000 Subject: [PATCH 6/9] Updating calculate_metrics --- .../contextual_chunk_embedding.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 
deletions(-) diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb index 5b2fcea..46e4f23 100644 --- a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -497,14 +497,14 @@ " similarities = np.dot(chunk_embds, query_embd)\n", " top_k_idxs = np.argsort(similarities)[::-1][:k]\n", " top_k_docs = [chunked_docs[i] for i in top_k_idxs]\n", - " golden_rank = None\n", - " for rank, doc in enumerate(top_k_docs):\n", + " rank = None\n", + " for i, doc in enumerate(top_k_docs):\n", " if doc[\"doc_id\"] == query[\"doc_id\"] and doc[\"chunk_id\"] == query[\"chunk_id\"]:\n", - " golden_rank = rank + 1\n", + " rank = i + 1\n", " break\n", "\n", - " recall = 1 if golden_rank else 0\n", - " return recall, golden_rank" + " recall = 1 if rank else 0\n", + " return recall, rank" ] }, { From 3f63b7aac7812b91c75d0d3d227057008e795aa4 Mon Sep 17 00:00:00 2001 From: ajosh0504 Date: Tue, 28 Oct 2025 22:23:51 +0000 Subject: [PATCH 7/9] Adding comments --- .../contextual_chunk_embedding.ipynb | 81 ++----------------- 1 file changed, 8 insertions(+), 73 deletions(-) diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb index 46e4f23..414d1e8 100644 --- a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -98,81 +98,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 39, "id": "3ec450d7-6d6f-42ec-ab7b-c007fd0c1c68", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c05936151bfb482e993d8342747bbed0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "README.md: 0%| | 0.00/190 [00:00 Date: Fri, 31 Oct 2025 09:26:47 -0700 Subject: [PATCH 8/9] Pinning versions 
--- .../contextual_chunk_embedding.ipynb | 70 ++++++++----------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb index 414d1e8..663d4ff 100644 --- a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -30,7 +30,7 @@ "- **datasets**: Python library to get access to datasets available on Hugging Face Hub\n", "- **pdfplumber**: Python library to parse and analyze PDFs\n", "- **langchain-text-splitters**: Text chunking utilities in LangChain\n", - "- **tiktoken**: Token counting and encoding library for GPT models\n", + "- **tiktoken**: Token counting and encoding library\n", "- **voyageai**: Python library to interact with Voyage AI's APIs" ] }, @@ -41,7 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -qU datasets pdfplumber langchain-text-splitters voyageai" + "!pip install -qU datasets==4.3.0 pdfplumber==0.11.7 langchain-text-splitters==0.3.11 voyageai==0.3.5 tiktoken==0.12.0" ] }, { @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 29, "id": "d413f794-23b8-4b1b-98ae-2443a53457df", "metadata": {}, "outputs": [], @@ -69,18 +69,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 30, "id": "bf6ff564-053c-4d2b-8a38-6a0b1a17a85e", "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Enter your VoyageAI API key: ········\n" - ] - } - ], + "outputs": [], "source": [ "# Set Voyage AI API Key as an environment variable\n", "os.environ[\"VOYAGE_API_KEY\"] = getpass.getpass(\"Enter your VoyageAI API key:\")\n", @@ -98,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 31, "id": "3ec450d7-6d6f-42ec-ab7b-c007fd0c1c68", "metadata": {}, "outputs": [], @@ -111,7 +103,7 @@ }, { "cell_type": "code", - 
"execution_count": 11, + "execution_count": 32, "id": "2bdac794-1454-42d0-8c4e-fafce420042d", "metadata": {}, "outputs": [], @@ -122,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 33, "id": "f95f5434-9acf-491f-bca5-02b61fde9bd2", "metadata": {}, "outputs": [ @@ -132,7 +124,7 @@ "40" ] }, - "execution_count": 12, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -144,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 34, "id": "be8f15a9-573d-48d5-86a9-6c876d3a4aa1", "metadata": {}, "outputs": [ @@ -154,7 +146,7 @@ "'Exhibit 10.2\\nExecution Version\\nINTELLECTUAL PROPERTY AGREEMENT\\nThis INTELLECTUAL PROPERTY AGREEMENT (this “Agreement”), dated as of December 31, 2018 (the “Effective Date”) is entered into by and\\nbetween Armstrong Flooring, Inc., a Delaware corporation (“Seller”) and AFI Licensing LLC, a Delaware limited liability company (“Licensing” and\\ntogether with Seller, “Arizona”) and AHF Holding, Inc. 
(formerly known as Tarzan HoldCo, Inc.), a Delaware corporation (“Buyer”) and Armstrong\\nHardwood Flooring Company, a Tennessee corporation (the “Company” and together with Buyer the “Buyer Entities”) (each of Arizona on the one hand\\nand the Buyer Entities on the other hand, a “Party” and collectively, the “Parties”).\\nWHEREAS, Seller and Buyer have entered into that certain Stock Purchase Agreement, dated November 14, 2018 (the “Stock Purchase\\nAgreement”); WHEREAS, pursuant to the Stock Purchase Agreement, Seller has agreed to sell and transfer, and Buyer has agreed to purchase and\\nacquire, all of Seller’s right, title and interest in and to Armstrong Wood Products, Inc., a Delaware corporation (“AWP”) and its Subsidiaries, the\\nCompany and HomerWood Hardwood Flooring Company, a Delaware corporation (“HHFC,” and together with the Company, the “Company\\nSubsidiaries” and together with AWP, the “Company Entities” and each a “Company Entity”) by way of a purchase by Buyer and sale by Seller of the\\nShares, all upon the terms and condition set forth therein;\\nWHEREAS, Arizona owns certain Copyrights, Know-How, Patents and Trademarks which may be used in the Company Field, and in connection\\nwith the transactions contemplated by the Stock Purchase Agreement the Company desires to acquire all of Arizona’s right, title and interest in and to\\nsuch Intellectual Property used exclusively in the Company Field, and obtain a license from Arizona to use other such Intellectual Property on the terms\\nand subject to the conditions set forth herein;\\nWHEREAS, Seller is signatory to the Trademark License Agreement pursuant to which Seller obtains a license to the Arizona Licensed\\nTrademarks;\\nWHEREAS, the Company desires to obtain a sublicense to use the Arizona Licensed Trademarks in the Company Field;\\nWHEREAS, Arizona has obtained consent from all counterparties to the Trademark License Agreement to grant to the Company the sublicenses\\nto the Arizona 
Licensed Trademarks included in this Agreement; and\\nWHEREAS, the Company Entities own certain Copyrights and Know-How which may be used in the Arizona Field, and in connection with the\\ntransactions contemplated by the Stock Purchase Agreement, Arizona desires to obtain a license from the Company Entities to use such Intellectual\\nProperty on the terms and subject to the conditions set forth herein.\\nNOW, THEREFORE, in consideration of the foregoing and the mutual agreements, provisions and covenants contained in this Agreement, and\\nfor other good and valuable consideration, the receipt and sufficiency of which are hereby acknowledged, the Parties hereby agree as follows:\\nSource: ARMSTRONG FLOORING, INC., 8-K, 1/7/2019'" ] }, - "execution_count": 13, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -174,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 35, "id": "b45e326c-da92-4f2e-9cc7-dd32bd72c2e4", "metadata": {}, "outputs": [], @@ -184,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 36, "id": "e8e3e365-7393-47bc-90e2-4df6fad34c05", "metadata": {}, "outputs": [], @@ -197,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 37, "id": "ba9771e9-c8ee-4cb8-a6fc-d3a14292e0b3", "metadata": {}, "outputs": [], @@ -220,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 38, "id": "c35afc6b-745e-4a2e-ba8a-73f328858bef", "metadata": {}, "outputs": [ @@ -232,7 +224,7 @@ " 'doc_id': 0}" ] }, - "execution_count": 17, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -251,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 39, "id": "76c96323-c60c-42a0-883c-e3e9e0746949", "metadata": {}, "outputs": [], @@ -261,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 40, "id": "47073960-ce27-4331-9447-9c69c46e8bb9", 
"metadata": {}, "outputs": [], @@ -283,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 41, "id": "0ce67a4e-8474-4610-a4af-b43b4a397e05", "metadata": {}, "outputs": [], @@ -317,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 42, "id": "c3e29bf9-4170-4109-8c0b-c5c4d38523e7", "metadata": {}, "outputs": [], @@ -327,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 43, "id": "a2914983-bafb-404a-a005-c70b9eeba327", "metadata": {}, "outputs": [], @@ -413,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 44, "id": "766918f0-ffa3-4068-8a33-03e28f586924", "metadata": {}, "outputs": [], @@ -451,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 45, "id": "5e0bcbdb-53cd-495e-b39a-4c08529d6ba4", "metadata": {}, "outputs": [], @@ -461,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 46, "id": "4222a5e2-45cd-4f33-9bdc-cf8b6a5330c4", "metadata": {}, "outputs": [ @@ -512,7 +504,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 47, "id": "f1b618d5-8fe0-4eb0-ac54-465ced3d2000", "metadata": {}, "outputs": [], @@ -522,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 48, "id": "d0c2ea78-ce96-4743-9b2a-2bc3ad054a3e", "metadata": {}, "outputs": [], @@ -537,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 49, "id": "a3117edc-ec2b-481c-a8d7-479080d11118", "metadata": {}, "outputs": [], @@ -547,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 50, "id": "95d9c26f-be8c-4962-a437-8debf0cce199", "metadata": {}, "outputs": [ @@ -592,9 +584,9 @@ ], "metadata": { "kernelspec": { - "display_name": "conda_python3", + "display_name": ".venv", "language": "python", - "name": "conda_python3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -606,7 +598,7 
@@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.18" + "version": "3.9.6" }, "widgets": { "application/vnd.jupyter.widget-state+json": { From 96afe622e85705f4a516736a02c8d3fc9a1104cc Mon Sep 17 00:00:00 2001 From: ajosh0504 Date: Fri, 31 Oct 2025 14:50:01 -0700 Subject: [PATCH 9/9] Nit --- .../advanced_techniques/contextual_chunk_embedding.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb index 663d4ff..a20eaf9 100644 --- a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -51,7 +51,7 @@ "source": [ "## Step 2: Setup prerequisites\n", "\n", - "Follow the step [here](https://dashboard.voyageai.com/organization/api-keys) to obtain a Voyage AI API key." + "Follow the steps [here](https://dashboard.voyageai.com/organization/api-keys) to obtain a Voyage AI API key." ] }, { @@ -69,12 +69,12 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "bf6ff564-053c-4d2b-8a38-6a0b1a17a85e", "metadata": {}, "outputs": [], "source": [ - "# Set Voyage AI API Key as an environment variable\n", + "# Set Voyage AI API key as an environment variable\n", "os.environ[\"VOYAGE_API_KEY\"] = getpass.getpass(\"Enter your VoyageAI API key:\")\n", "# Initialize the Voyage AI client\n", "voyage_client = voyageai.Client()"