
Commit a113821

Fix/few shot selection (#530)
* try to always use at least 3 few shot examples
* add args for auto tune
* use content-based KNN to select most relevant chunks
* enforce at least 3 few shot examples for generated prompts
* utils for content-based KNN
* sem version
* fix callback arg
* fixes
* switch back to no op callbacks
* make n few shot, user controlled. default to 2
1 parent 2ddee65 commit a113821

File tree

5 files changed: +112 -12 lines changed

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+{
+    "type": "minor",
+    "description": "Add content-based KNN for selecting prompt tune few shot examples"
+}

graphrag/prompt_tune/__main__.py

Lines changed: 31 additions & 3 deletions
@@ -10,7 +10,7 @@
 from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
 from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE

-from .cli import fine_tune
+from .cli import prompt_tune


 class DocSelectionType(Enum):
@@ -19,6 +19,7 @@ class DocSelectionType(Enum):
     ALL = "all"
     RANDOM = "random"
     TOP = "top"
+    AUTO = "auto"

     def __str__(self):
         """Return the string representation of the enum value."""
@@ -46,13 +47,29 @@ def __str__(self):

     parser.add_argument(
         "--method",
-        help="The method to select documents, one of: all, random or top",
+        help="The method to select documents, one of: all, random, top or auto",
         required=False,
         type=DocSelectionType,
         choices=list(DocSelectionType),
         default=DocSelectionType.RANDOM,
     )

+    parser.add_argument(
+        "--n_subset_max",
+        help="The number of text chunks to embed when using auto selection method",
+        required=False,
+        type=int,
+        default=300,
+    )
+
+    parser.add_argument(
+        "--k",
+        help="The maximum number of documents to select from each centroid when using auto selection method",
+        required=False,
+        type=int,
+        default=15,
+    )
+
     parser.add_argument(
         "--limit",
         help="The limit of files to load when doing random or top selection",
@@ -69,6 +86,14 @@ def __str__(self):
         default=MAX_TOKEN_COUNT,
     )

+    parser.add_argument(
+        "--min-examples-required",
+        help="The minimum number of examples required in entity extraction prompt",
+        type=int,
+        required=False,
+        default=2,
+    )
+
     parser.add_argument(
         "--chunk-size",
         help="Max token count for prompt generation",
@@ -106,7 +131,7 @@ def __str__(self):
     loop = asyncio.get_event_loop()

     loop.run_until_complete(
-        fine_tune(
+        prompt_tune(
             args.root,
             args.domain,
             str(args.method),
@@ -116,5 +141,8 @@ def __str__(self):
             args.language,
             args.no_entity_types,
             args.output,
+            args.n_subset_max,
+            args.k,
+            args.min_examples_required,
         )
     )
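
With these flags wired up, the auto path can be exercised from the command line. A hedged usage sketch: --method, --n_subset_max, --k, and --min-examples-required come from the argparse definitions above, while the module invocation path and the --root/--domain flags are assumptions for illustration:

    python -m graphrag.prompt_tune --root ./ragtest --domain "environmental news" --method auto --n_subset_max 300 --k 15 --min-examples-required 3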

graphrag/prompt_tune/cli.py

Lines changed: 25 additions & 6 deletions
@@ -32,7 +32,7 @@
 )


-async def fine_tune(
+async def prompt_tune(
     root: str,
     domain: str,
     select: str = "random",
@@ -42,8 +42,11 @@ async def fine_tune(
     language: str | None = None,
     skip_entity_types: bool = False,
     output: str = "prompts",
+    n_subset_max: int = 300,
+    k: int = 15,
+    min_examples_required: int = 2,
 ):
-    """Fine tune the model.
+    """Prompt tune the model.

     Parameters
     ----------
@@ -55,11 +58,13 @@ async def fine_tune(
     - chunk_size: The chunk token size to use.
     - skip_entity_types: Skip generating entity types.
     - output: The output folder to store the prompts.
+    - n_subset_max: The number of text chunks to embed when using auto selection method.
+    - k: The number of documents to select when using auto selection method.
     """
     reporter = PrintProgressReporter("")
     config = read_config_parameters(root, reporter)

-    await fine_tune_with_config(
+    await prompt_tune_with_config(
         root,
         config,
         domain,
@@ -71,10 +76,13 @@ async def fine_tune(
         skip_entity_types,
         output,
         reporter,
+        n_subset_max,
+        k,
+        min_examples_required,
     )


-async def fine_tune_with_config(
+async def prompt_tune_with_config(
     root: str,
     config: GraphRagConfig,
     domain: str,
@@ -86,8 +94,11 @@ async def fine_tune_with_config(
     skip_entity_types: bool = False,
     output: str = "prompts",
     reporter: ProgressReporter | None = None,
+    n_subset_max: int = 300,
+    k: int = 15,
+    min_examples_required: int = 2,
 ):
-    """Fine tune the model with a configuration.
+    """Prompt tune the model with a configuration.

     Parameters
     ----------
@@ -101,6 +112,8 @@ async def fine_tune_with_config(
     - skip_entity_types: Skip generating entity types.
     - output: The output folder to store the prompts.
     - reporter: The progress reporter.
+    - n_subset_max: The number of text chunks to embed when using auto selection method.
+    - k: The number of documents to select when using auto selection method.

     Returns
     -------
@@ -118,11 +131,13 @@ async def fine_tune_with_config(
         select_method=select,
         reporter=reporter,
         chunk_size=chunk_size,
+        n_subset_max=n_subset_max,
+        k=k,
     )

     # Create LLM from config
     llm = load_llm(
-        "fine_tuning",
+        "prompt_tuning",
         config.llm.type,
         NoopVerbCallbacks(),
         None,
@@ -139,6 +154,7 @@ async def fine_tune_with_config(
         language,
         max_tokens,
         skip_entity_types,
+        min_examples_required,
     )


@@ -152,6 +168,7 @@ async def generate_indexing_prompts(
     language: str | None = None,
     max_tokens: int = MAX_TOKEN_COUNT,
     skip_entity_types: bool = False,
+    min_examples_required: int = 2,
 ):
     """Generate indexing prompts.

@@ -165,6 +182,7 @@ async def generate_indexing_prompts(
     - domain: The domain to map the input documents to.
     - max_tokens: The maximum number of tokens to use on entity extraction prompts
     - skip_entity_types: Skip generating entity types.
+    - min_examples_required: The minimum number of examples required for entity extraction prompts.
     """
     if not domain:
         reporter.info("Generating domain...")
@@ -221,6 +239,7 @@ async def generate_indexing_prompts(
         output_path=output_path,
         encoding_model=config.encoding_model,
         max_token_count=max_tokens,
+        min_examples_required=min_examples_required,
     )
     reporter.info(f"Generated entity extraction prompt, stored in folder {output_path}")
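
With fine_tune renamed and the three new parameters threaded end to end, the entry point can also be driven directly from Python. A minimal sketch, assuming only the signature shown in this diff (the root path, domain, and values are illustrative):

    import asyncio

    from graphrag.prompt_tune.cli import prompt_tune

    # select="auto" routes chunk selection through the content-based KNN path;
    # n_subset_max bounds how many chunks get embedded, k caps how many are kept,
    # and min_examples_required sets the few-shot floor for the generated prompt.
    asyncio.run(
        prompt_tune(
            "./ragtest",           # root: project directory (illustrative path)
            "environmental news",  # domain (illustrative)
            select="auto",
            n_subset_max=300,
            k=15,
            min_examples_required=3,
        )
    )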

graphrag/prompt_tune/generator/entity_extraction_prompt.py

Lines changed: 4 additions & 2 deletions
@@ -27,6 +27,7 @@ def create_entity_extraction_prompt(
     encoding_model: str = defs.ENCODING_MODEL,
     json_mode: bool = False,
     output_path: Path | None = None,
+    min_examples_required: int = 2,
 ) -> str:
     """
     Create a prompt for entity extraction.
@@ -41,6 +42,7 @@
     - max_token_count (int): The maximum number of tokens to use for the prompt
     - json_mode (bool): Whether to use JSON mode for the prompt. Default is False
     - output_path (Path | None): The path to write the prompt to. Default is None. If None, the prompt is not written to a file. Default is None.
+    - min_examples_required (int): The minimum number of examples required. Default is 2.

     Returns
     -------
@@ -79,8 +81,8 @@

         example_tokens = num_tokens_from_string(example_formatted, model=encoding_model)

-        # Squeeze in at least one example
-        if i > 0 and example_tokens > tokens_left:
+        # Ensure at least min_examples_required examples are included
+        if i >= min_examples_required and example_tokens > tokens_left:
             break

         examples_prompt += example_formatted
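
The hunk above is the core behavioral change: the example-packing loop may now only stop once min_examples_required examples are in, instead of guaranteeing just one. A self-contained sketch of that selection logic (pack_examples is a hypothetical stand-in, and len() substitutes for the real num_tokens_from_string tokenizer):

    def pack_examples(
        formatted_examples: list[str],
        tokens_left: int,
        min_examples_required: int = 2,
    ) -> str:
        """Append examples until the token budget runs out, keeping a floor."""
        prompt = ""
        for i, example in enumerate(formatted_examples):
            example_tokens = len(example)  # stand-in tokenizer
            # Break only once the minimum count is met AND the next example
            # would not fit in the remaining budget.
            if i >= min_examples_required and example_tokens > tokens_left:
                break
            prompt += example
            tokens_left -= example_tokens
        return prompt

    # With a tight budget, the floor still forces the first two examples in.
    print(pack_examples(["aaaa", "bbbb", "cccc"], tokens_left=3))  # "aaaabbbb"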

graphrag/prompt_tune/loader/input.py

Lines changed: 48 additions & 1 deletion
@@ -5,16 +5,45 @@

 from typing import cast

+import numpy as np
 import pandas as pd
 from datashaper import NoopVerbCallbacks, TableContainer, VerbInput

 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.input import load_input
+from graphrag.index.llm import load_llm_embeddings
 from graphrag.index.progress.types import ProgressReporter
 from graphrag.index.verbs import chunk
+from graphrag.llm.types.llm_types import EmbeddingLLM

-MIN_CHUNK_SIZE = 200
 MIN_CHUNK_OVERLAP = 0
+MIN_CHUNK_SIZE = 200
+N_SUBSET_MAX = 300
+K = 15
+
+
+async def _embed_chunks(
+    text_chunks: pd.DataFrame,
+    embedding_llm: EmbeddingLLM,
+    n_subset_max: int = N_SUBSET_MAX,
+) -> tuple[pd.DataFrame, np.ndarray]:
+    """Convert text chunks into dense text embeddings."""
+    sampled_text_chunks = text_chunks.sample(n=min(n_subset_max, len(text_chunks)))
+    embeddings = await embedding_llm(sampled_text_chunks["chunks"].tolist())
+    return text_chunks, np.array(embeddings.output)
+
+
+def _sample_chunks_from_embeddings(
+    text_chunks: pd.DataFrame,
+    embeddings,
+    k: int = K,
+) -> pd.DataFrame:
+    """Sample text chunks from embeddings."""
+    center = np.mean(embeddings, axis=0)
+    distances = np.linalg.norm(embeddings - center, axis=1)
+    nearest_indices = np.argsort(distances)[:k]
+
+    return text_chunks.iloc[nearest_indices]


 async def load_docs_in_chunks(
@@ -24,6 +53,8 @@ async def load_docs_in_chunks(
     limit: int,
     reporter: ProgressReporter,
     chunk_size: int = MIN_CHUNK_SIZE,
+    n_subset_max: int = N_SUBSET_MAX,
+    k: int = K,
 ) -> list[str]:
     """Load docs into chunks for generating prompts."""
     dataset = await load_input(config.input, reporter, root)
@@ -57,6 +88,22 @@ async def load_docs_in_chunks(
         chunks_df = chunks_df[:limit]
     elif select_method == "random":
         chunks_df = chunks_df.sample(n=limit)
+    elif select_method == "auto":
+        if k is None or k <= 0:
+            msg = "k must be an integer > 0"
+            raise ValueError(msg)
+        embedding_llm = load_llm_embeddings(
+            name="prompt_tuning_embeddings",
+            llm_type=config.embeddings.resolved_strategy()["llm"]["type"],
+            callbacks=NoopVerbCallbacks(),
+            cache=None,
+            llm_config=config.embeddings.resolved_strategy()["llm"],
+        )
+
+        chunks_df, embeddings = await _embed_chunks(
+            chunks_df, embedding_llm, n_subset_max=n_subset_max
+        )
+        chunks_df = _sample_chunks_from_embeddings(chunks_df, embeddings, k=k)

     # Convert the dataset to list form, so we have a list of documents
     return chunks_df["chunks"].tolist()
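
The two helpers above implement the "auto" selection: embed a random sample of up to n_subset_max chunks, then keep the k chunks whose embeddings lie nearest (by Euclidean distance) to the mean embedding, preferring central, representative chunks over a uniform random draw. A self-contained sketch of that distance math on synthetic data (no embedding LLM involved; the names here are illustrative, not the module's API):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)

    # Ten fake chunks with fake 4-dimensional embeddings.
    chunks = pd.DataFrame({"chunks": [f"chunk {i}" for i in range(10)]})
    embeddings = rng.normal(size=(10, 4))

    # Same steps as _sample_chunks_from_embeddings: centroid, distances,
    # then the k nearest rows.
    center = np.mean(embeddings, axis=0)
    distances = np.linalg.norm(embeddings - center, axis=1)
    nearest_indices = np.argsort(distances)[:3]  # k = 3 for this toy run

    print(chunks.iloc[nearest_indices])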
