@@ -177,7 +177,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
177177 eval_input ["query" ] = reformat_conversation_history (eval_input ["query" ], logger )
178178 if not isinstance (eval_input ["response" ], str ):
179179 eval_input ["response" ] = reformat_agent_response (eval_input ["response" ], logger )
180- llm_output = await self ._flow (timeout = self ._LLM_CALL_TIMEOUT , ** eval_input )
180+ result = await self ._flow (timeout = self ._LLM_CALL_TIMEOUT , ** eval_input )
181+ llm_output = result ["llm_output" ]
181182 score = math .nan
182183
183184 if isinstance (llm_output , dict ):
@@ -191,6 +192,13 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
191192 f"{ self ._result_key } _reason" : reason ,
192193 f"{ self ._result_key } _result" : binary_result ,
193194 f"{ self ._result_key } _threshold" : self ._threshold ,
195+ f"{ self ._result_key } _prompt_tokens" : result .get ("input_token_count" , 0 ),
196+ f"{ self ._result_key } _completion_tokens" : result .get ("output_token_count" , 0 ),
197+ f"{ self ._result_key } _total_tokens" : result .get ("total_token_count" , 0 ),
198+ f"{ self ._result_key } _finish_reason" : result .get ("finish_reason" , "" ),
199+ f"{ self ._result_key } _model" : result .get ("model_id" , "" ),
200+ f"{ self ._result_key } _sample_input" : result .get ("sample_input" , "" ),
201+ f"{ self ._result_key } _sample_output" : result .get ("sample_output" , "" ),
194202 }
195203
196204 binary_result = self ._get_binary_result (score )
0 commit comments