Commit 29d44b3

Neehar Duvvuri authored and committed

change prompty output to dict

1 parent d0323df

File tree

2 files changed: +59, -31 lines
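At a glance, the change replaces the prompty flow's positional 8-tuple return value with a single dict. A minimal sketch of the new shape, using the key names that appear in the diffs below (the example values are made up):

```python
# Before: order-sensitive positional unpacking (what the old call sites did).
# llm_output, input_token_count, output_token_count, total_token_count, \
#     finish_reason, model_id, sample_input, sample_output = await flow(**inputs)

# After: one dict, keyed by name. Values here are illustrative only.
prompty_output_dict = {
    "llm_output": "<S0>...</S0><S1>...</S1><S2>4</S2>",  # str, Mapping, or async generator
    "input_token_count": 120,
    "output_token_count": 48,
    "total_token_count": 168,
    "finish_reason": "stop",
    "model_id": "gpt-4o",  # hypothetical model name
    "sample_input": "...",
    "sample_output": "...",
}
```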

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 10 additions & 2 deletions
```diff
@@ -133,12 +133,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 target=ErrorTarget.CONVERSATION,
             )
         # Call the prompty flow to get the evaluation result.
-        llm_output, input_token_count, output_token_count, total_token_count, finish_reason, model_id, sample_input, sample_output = await self._flow(
+        prompty_output_dict = await self._flow(
             timeout=self._LLM_CALL_TIMEOUT, **eval_input
         )

         score = math.nan
-        if llm_output:
+        if prompty_output_dict:
+            llm_output = prompty_output_dict.get("llm_output", "")
+            input_token_count = prompty_output_dict.get("input_token_count", 0)
+            output_token_count = prompty_output_dict.get("output_token_count", 0)
+            total_token_count = prompty_output_dict.get("total_token_count", 0)
+            finish_reason = prompty_output_dict.get("finish_reason", "")
+            model_id = prompty_output_dict.get("model_id", "")
+            sample_input = prompty_output_dict.get("sample_input", "")
+            sample_output = prompty_output_dict.get("sample_output", "")
             # Parse out score and reason from evaluators known to possess them.
             if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
                 score, reason = parse_quality_evaluator_reason_score(llm_output)
```
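Because the evaluator unpacks the dict with `.get()` and defaults, a flow that omits a key degrades to a benign default instead of raising. A quick illustration of that behavior (the partial dict below is a made-up input, not from the commit):

```python
# Made-up partial output: only two of the eight keys are present.
partial = {"llm_output": "<S0>...</S0>", "finish_reason": "stop"}

llm_output = partial.get("llm_output", "")               # "<S0>...</S0>"
input_token_count = partial.get("input_token_count", 0)  # missing -> 0
model_id = partial.get("model_id", "")                   # missing -> ""
```

Contrast this with the old tuple contract, where any arity change in the flow's return value would have raised a ValueError at every unpacking call site.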

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_prompty_async.py

Lines changed: 49 additions & 29 deletions
```diff
@@ -81,13 +81,15 @@ def test_load_images(self, prompty_config: Dict[str, Any]):
     @pytest.mark.asyncio
     async def test_first_match_text(self, prompty_config: Dict[str, Any]):
         prompty = AsyncPrompty(COHERENCE_PROMPTY, **prompty_config)
-        result, _, _, _, _, _, _, _ = await prompty(query="What is the capital of France?", response="France capital Paris")
+        result = await prompty(query="What is the capital of France?", response="France capital Paris")
+        assert isinstance(result, dict)
+        llm_output = result["llm_output"]

         # We expect an output string that contains <S0>chain of thoughts</S0> <S1>explanation<S1> <S2>int_score</S2>
-        assert isinstance(result, str)
+        assert isinstance(llm_output, str)
         matched = re.match(
             r"^\s*<S0>(.*)</S0>\s*<S1>(.*)</S1>\s*<S2>(.*)</S2>\s*$",
-            result,
+            llm_output,
             re.MULTILINE | re.DOTALL,
         )
         assert matched
```
```diff
@@ -99,19 +101,27 @@ async def test_first_match_text(self, prompty_config: Dict[str, Any]):
     @pytest.mark.asyncio
     async def test_first_match_image(self, prompty_config: Dict[str, Any]):
         prompty = AsyncPrompty(IMAGE_PROMPTY, **prompty_config)
-        result, _, _, _, _, _, _, _ = await prompty(image="image1.jpg", question="What is this a picture of?")
-        assert isinstance(result, str)
-        assert "apple" in result.lower()
+        result = await prompty(image="image1.jpg", question="What is this a picture of?")
+        assert isinstance(result, dict)
+        llm_output = result["llm_output"]
+        assert isinstance(llm_output, AsyncGenerator)
+        combined = ""
+        async for chunk in llm_output:
+            assert isinstance(chunk, str)
+            combined += chunk
+        assert "apple" in combined

     @pytest.mark.asyncio
     async def test_first_match_text_streaming(self, prompty_config: Dict[str, Any]):
         prompty_config["model"]["parameters"]["stream"] = True
         prompty = AsyncPrompty(BASIC_PROMPTY, **prompty_config)
-        result, _, _, _, _, _, _, _ = await prompty(firstName="Bob", question="What is the capital of France?")
+        result = await prompty(firstName="Bob", question="What is the capital of France?")
+        assert isinstance(result, dict)
+        llm_output = result["llm_output"]

-        assert isinstance(result, AsyncGenerator)
+        assert isinstance(llm_output, AsyncGenerator)
         combined = ""
-        async for chunk in result:
+        async for chunk in llm_output:
             assert isinstance(chunk, str)
             combined += chunk

```

```diff
@@ -122,11 +132,13 @@ async def test_first_match_text_streaming(self, prompty_config: Dict[str, Any]):
     async def test_first_match_image_streaming(self, prompty_config: Dict[str, Any]):
         prompty_config["model"]["parameters"]["stream"] = True
         prompty = AsyncPrompty(IMAGE_PROMPTY, **prompty_config)
-        result, _, _, _, _, _, _, _ = await prompty(image="image1.jpg", question="What is this a picture of?")
+        result = await prompty(image="image1.jpg", question="What is this a picture of?")
+        assert isinstance(result, dict)
+        llm_output = result["llm_output"]

-        assert isinstance(result, AsyncGenerator)
+        assert isinstance(llm_output, AsyncGenerator)
         combined = ""
-        async for chunk in result:
+        async for chunk in llm_output:
             assert isinstance(chunk, str)
             combined += chunk

```

```diff
@@ -143,20 +155,22 @@ async def test_first_match_image_streaming(self, prompty_config: Dict[str, Any]):
     async def test_first_match_text_json(self, prompty_config: Dict[str, Any], outputs: Mapping[str, Any]):
         prompty_config["outputs"] = outputs
         prompty = AsyncPrompty(JSON_PROMPTY, **prompty_config)
-        result, _, _, _, _, _, _, _ = await prompty(question="What is the capital of France?")
+        result = await prompty(question="What is the capital of France?")
+        assert isinstance(result, dict)
+        llm_output = result["llm_output"]

-        assert isinstance(result, Mapping)
-        assert "firstName" in result
-        assert result["firstName"] == "John"
-        assert "answer" in result
-        assert "Paris" in result["answer"]
+        assert isinstance(llm_output, Mapping)
+        assert "firstName" in llm_output
+        assert llm_output["firstName"] == "John"
+        assert "answer" in llm_output
+        assert "Paris" in llm_output["answer"]

         if outputs:
-            # Should ahve only first name, and answer
-            assert "lastName" not in result
+            # Should have only first name, and answer
+            assert "lastName" not in llm_output
         else:
-            assert "lastName" in result
-            assert result["lastName"] == "Doh"
+            assert "lastName" in llm_output
+            assert llm_output["lastName"] == "Doh"

     @pytest.mark.asyncio
     async def test_first_match_text_json_missing(self, prompty_config: Dict[str, Any]):
```
```diff
@@ -170,20 +184,24 @@ async def test_first_match_text_json_missing(self, prompty_config: Dict[str, Any]):
     async def test_first_match_text_json_streaming(self, prompty_config: Dict[str, Any]):
         prompty_config["model"]["parameters"]["stream"] = True
         prompty = AsyncPrompty(JSON_PROMPTY, **prompty_config)
-        result, _, _, _, _, _, _, _ = await prompty(
+        result = await prompty(
             question="What is the capital of France?", firstName="Barbra", lastName="Streisand"
         )
-        assert isinstance(result, Mapping)
-        assert result["firstName"] == "Barbra"
-        assert result["lastName"] == "Streisand"
-        assert "Paris" in result["answer"]
+        assert isinstance(result, dict)
+        llm_output = result["llm_output"]
+        assert isinstance(llm_output, Mapping)
+        assert llm_output["firstName"] == "Barbra"
+        assert llm_output["lastName"] == "Streisand"
+        assert "Paris" in llm_output["answer"]

     @pytest.mark.asyncio
     async def test_full_text(self, prompty_config: Dict[str, Any]):
         prompty_config["model"]["response"] = "full"
         prompty = AsyncPrompty(BASIC_PROMPTY, **prompty_config)
-        result, _, _, _, _, _, _, _ = await prompty(firstName="Bob", question="What is the capital of France?")
-        assert isinstance(result, ChatCompletion)
-        response: str = result.choices[0].message.content or ""
+        result = await prompty(firstName="Bob", question="What is the capital of France?")
+        assert isinstance(result, dict)
+        llm_output = result["llm_output"]
+        assert isinstance(llm_output, ChatCompletion)
+        response: str = llm_output.choices[0].message.content or ""
         assert "Bob" in response
         assert "Paris" in response
```
