fix: streamline theme extraction from overlaps in MultiHopSpecificQuerySynthesizer (#2347)

kenzoyan · Kenzo Yan · web-flow · commit b634a5c28de6 · 2025-10-09T13:24:01.000+05:30
## Issue Link / Problem Description  - Fixes #2275 - I also encountered this problem when testing the multi-hop specific query synthesizer: 1 validation error for ThemesPersonasInput themes.0 Input should be a valid string [type=string_type, input_value=['Vedenjäähdytyskoneen', 'Vedenjäähdytyskone'], input_type=list] For further information visit https://errors.pydantic.dev/2.11/v/string_type - @czhiming-maker has not updated it in the two weeks since, so I am giving the fix a try. ## Changes Made  - Add helper function _extract_themes_from_overlaps, based on the discussion in #2275 - - ## Testing  ### How to Test ``` from ragas.testset import TestsetGenerator from langchain_community.document_loaders import DirectoryLoader, TextLoader from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI from ragas.testset.synthesizers.multi_hop.specific import ( MultiHopSpecificQuerySynthesizer, ) import os loader = DirectoryLoader("./data/", glob="**/*.md", loader_cls=TextLoader) documents = loader.load() # Set your Azure OpenAI credentials AZURE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", ) AZURE_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "") AZURE_API_VERSION = "2024-12-01-preview" AZURE_DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini") EMBEDDING_DEPLOYMENT = "text-embedding-ada-002" # Initialize embeddings embeddings = AzureOpenAIEmbeddings( azure_endpoint=AZURE_ENDPOINT, api_key=AZURE_API_KEY, api_version=AZURE_API_VERSION, azure_deployment=EMBEDDING_DEPLOYMENT ) # Initialize LLM with JSON mode enabled llm = AzureChatOpenAI( azure_endpoint=AZURE_ENDPOINT, api_key=AZURE_API_KEY, openai_api_version=AZURE_API_VERSION, azure_deployment=AZURE_DEPLOYMENT, temperature=0.3, model_kwargs={ "response_format": {"type": "json_object"} # Force clean JSON output } ) generator_llm = LangchainLLMWrapper(llm) generator_embeddings = LangchainEmbeddingsWrapper(embeddings) distribution = [ (MultiHopSpecificQuerySynthesizer(llm=generator_llm), 1), ] 
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) async def generate(): # generate testset testset = generator.generate_with_langchain_docs( documents, testset_size=4, query_distribution=distribution, ) testset.to_evaluation_dataset().to_jsonl("testset.jsonl") import asyncio asyncio.run(generate()) ``` ## References  - Related issues: #2275 ## Screenshots/Examples (if applicable)  Samples Generation in test <img width="1701" height="111" alt="image" src="https://github.com/user-attachments/assets/890d14bc-cd31-4940-8ce3-df8bc5ea459b" /> ---  Co-authored-by: Kenzo Yan <kenzo.yan@granlund.fi>
diff --git a/src/ragas/testset/synthesizers/multi_hop/specific.py b/src/ragas/testset/synthesizers/multi_hop/specific.py
@@ -87,11 +87,8 @@ async def _generate_scenarios(
                     ):
                         logger.debug("Overlapped items are not strings or iterables.")
                         continue
-                    themes = (
-                        list(overlapped_items.keys())
-                        if isinstance(overlapped_items, dict)
-                        else overlapped_items
-                    )
+                    themes = self._extract_themes_from_overlaps(overlapped_items)
+
                     prompt_input = ThemesPersonasInput(
                         themes=themes, personas=persona_list
                     )
@@ -100,10 +97,9 @@ async def _generate_scenarios(
                             data=prompt_input, llm=self.llm, callbacks=callbacks
                         )
                     )
-                    combinations = [
-                        [item] if isinstance(item, str) else list(item)
-                        for item in themes
-                    ]
+
+                    combinations = [[theme] for theme in themes]
+
                     base_scenarios = self.prepare_combinations(
                         [node_a, node_b],
                         combinations,
@@ -117,3 +113,30 @@ async def _generate_scenarios(
                     scenarios.extend(base_scenarios)
 
         return scenarios
+
+    def _extract_themes_from_overlaps(self, overlapped_items: t.Any) -> t.List[str]:
+        """
+        Extract unique entity names from overlapped items, in first-seen order.
+
+        Handles multiple formats:
+        - List of (str, str) pairs, as tuples or as lists (tuples come back as
+          lists after a JSON round-trip): every string member is a theme
+        - List[str]: Direct entity names
+        - Dict[str, Any]: Keys as entity names
+
+        Any other input yields an empty list; non-string members are skipped.
+        """
+        if isinstance(overlapped_items, dict):
+            return list(overlapped_items)
+
+        if not isinstance(overlapped_items, list):
+            return []
+
+        # A dict keeps insertion order, so deduplicating through it (rather than
+        # a set) gives the same theme order on every run.
+        ordered: t.Dict[str, None] = {}
+        for item in overlapped_items:
+            members = item if isinstance(item, (tuple, list)) else (item,)
+            ordered.update((m, None) for m in members if isinstance(m, str))
+
+        return list(ordered)