@@ -179,7 +179,14 @@ async def _cascade_evaluate(self, program_path: str) -> Dict[str, float]:
179179 return await self ._direct_evaluate (program_path )
180180
181181 # Run first stage
182- stage1_result = await run_in_executor (module .evaluate_stage1 )(program_path )
182+ try :
183+ stage1_result = await run_in_executor (module .evaluate_stage1 )(program_path )
184+ if not isinstance (stage1_result , dict ):
185+ logger .warning (f"Stage 1 evaluation returned non-dictionary result: { stage1_result } " )
186+ return {"error" : 0.0 }
187+ except Exception as e :
188+ logger .error (f"Error in stage 1 evaluation: { str (e )} " )
189+ return {"error" : 0.0 }
183190
184191 # Check threshold
185192 if not self ._passes_threshold (stage1_result , self .config .cascade_thresholds [0 ]):
@@ -190,10 +197,25 @@ async def _cascade_evaluate(self, program_path: str) -> Dict[str, float]:
190197 return stage1_result
191198
192199 # Run second stage
193- stage2_result = await run_in_executor (module .evaluate_stage2 )(program_path )
200+ try :
201+ stage2_result = await run_in_executor (module .evaluate_stage2 )(program_path )
202+ if not isinstance (stage2_result , dict ):
203+ logger .warning (f"Stage 2 evaluation returned non-dictionary result: { stage2_result } " )
204+ return stage1_result
205+ except Exception as e :
206+ logger .error (f"Error in stage 2 evaluation: { str (e )} " )
207+ return stage1_result
194208
195209 # Merge results
196- result = {** stage1_result , ** stage2_result }
210+ result = {}
211+ # Convert all values to float to avoid type errors
212+ for name , value in stage1_result .items ():
213+ if isinstance (value , (int , float )) and name != "error" :
214+ result [name ] = float (value )
215+
216+ for name , value in stage2_result .items ():
217+ if isinstance (value , (int , float )) and name != "error" :
218+ result [name ] = float (value )
197219
198220 # Check threshold
199221 if len (self .config .cascade_thresholds ) < 2 or not self ._passes_threshold (
@@ -206,10 +228,19 @@ async def _cascade_evaluate(self, program_path: str) -> Dict[str, float]:
206228 return result
207229
208230 # Run third stage
209- stage3_result = await run_in_executor (module .evaluate_stage3 )(program_path )
231+ try :
232+ stage3_result = await run_in_executor (module .evaluate_stage3 )(program_path )
233+ if not isinstance (stage3_result , dict ):
234+ logger .warning (f"Stage 3 evaluation returned non-dictionary result: { stage3_result } " )
235+ return result
236+ except Exception as e :
237+ logger .error (f"Error in stage 3 evaluation: { str (e )} " )
238+ return result
210239
211240 # Merge results
212- result = {** result , ** stage3_result }
241+ for name , value in stage3_result .items ():
242+ if isinstance (value , (int , float )) and name != "error" :
243+ result [name ] = float (value )
213244
214245 return result
215246
@@ -308,8 +339,21 @@ def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool
308339 if not metrics :
309340 return False
310341
311- # Calculate average score
312- avg_score = sum (metrics .values ()) / len (metrics )
342+ # Calculate average score, skipping non-numeric values and 'error' key
343+ valid_metrics = []
344+ for name , value in metrics .items ():
345+ # Skip 'error' keys and ensure values are numeric
346+ if name != 'error' and isinstance (value , (int , float )):
347+ try :
348+ valid_metrics .append (float (value ))
349+ except (TypeError , ValueError ):
350+ logger .warning (f"Skipping non-numeric metric: { name } ={ value } " )
351+ continue
352+
353+ if not valid_metrics :
354+ return False
355+
356+ avg_score = sum (valid_metrics ) / len (valid_metrics )
313357 return avg_score >= threshold
314358
315359 async def evaluate_multiple (
0 commit comments