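Handle errors in cascade evaluation more gracefully: catch per-stage exceptions, reject non-dictionary stage results, and skip non-numeric metrics and the 'error' key when merging results and checking thresholds.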

codelion · codelion · commit e76a2544e397 · 2025-05-17T09:01:52.000+08:00
diff --git a/examples/function_minimization/evaluator.py b/examples/function_minimization/evaluator.py
@@ -118,6 +118,11 @@ def evaluate(program_path):
             except TimeoutError as e:
                 print(f"Trial {trial}: {str(e)}")
                 continue
+            except IndexError as e:
+                # Specifically handle IndexError which often happens with early termination checks
+                print(f"Trial {trial}: IndexError - {str(e)}")
+                print("This is likely due to a list index check before the list is fully populated.")
+                continue
             except Exception as e:
                 print(f"Trial {trial}: Error - {str(e)}")
                 print(traceback.format_exc())
@@ -226,6 +231,11 @@ def evaluate_stage1(program_path):
         except TimeoutError as e:
             print(f"Stage 1 evaluation timed out: {e}")
             return {"runs_successfully": 0.0, "error": "Timeout"}
+        except IndexError as e:
+            # Specifically handle IndexError which often happens with early termination checks
+            print(f"Stage 1 evaluation failed with IndexError: {e}")
+            print("This is likely due to a list index check before the list is fully populated.")
+            return {"runs_successfully": 0.0, "error": f"IndexError: {str(e)}"}
         except Exception as e:
             print(f"Stage 1 evaluation failed: {e}")
             print(traceback.format_exc())
diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
@@ -179,7 +179,14 @@ async def _cascade_evaluate(self, program_path: str) -> Dict[str, float]:
                 return await self._direct_evaluate(program_path)
             
             # Run first stage
-            stage1_result = await run_in_executor(module.evaluate_stage1)(program_path)
+            try:
+                stage1_result = await run_in_executor(module.evaluate_stage1)(program_path)
+                if not isinstance(stage1_result, dict):
+                    logger.warning(f"Stage 1 evaluation returned non-dictionary result: {stage1_result}")
+                    return {"error": 0.0}
+            except Exception as e:
+                logger.error(f"Error in stage 1 evaluation: {str(e)}")
+                return {"error": 0.0}
             
             # Check threshold
             if not self._passes_threshold(stage1_result, self.config.cascade_thresholds[0]):
@@ -190,10 +197,25 @@ async def _cascade_evaluate(self, program_path: str) -> Dict[str, float]:
                 return stage1_result
             
             # Run second stage
-            stage2_result = await run_in_executor(module.evaluate_stage2)(program_path)
+            try:
+                stage2_result = await run_in_executor(module.evaluate_stage2)(program_path)
+                if not isinstance(stage2_result, dict):
+                    logger.warning(f"Stage 2 evaluation returned non-dictionary result: {stage2_result}")
+                    return stage1_result
+            except Exception as e:
+                logger.error(f"Error in stage 2 evaluation: {str(e)}")
+                return stage1_result
             
             # Merge results
-            result = {**stage1_result, **stage2_result}
+            result = {}
+            # Convert all values to float to avoid type errors
+            for name, value in stage1_result.items():
+                if isinstance(value, (int, float)) and name != "error":
+                    result[name] = float(value)
+            
+            for name, value in stage2_result.items():
+                if isinstance(value, (int, float)) and name != "error":
+                    result[name] = float(value)
             
             # Check threshold
             if len(self.config.cascade_thresholds) < 2 or not self._passes_threshold(
@@ -206,10 +228,19 @@ async def _cascade_evaluate(self, program_path: str) -> Dict[str, float]:
                 return result
             
             # Run third stage
-            stage3_result = await run_in_executor(module.evaluate_stage3)(program_path)
+            try:
+                stage3_result = await run_in_executor(module.evaluate_stage3)(program_path)
+                if not isinstance(stage3_result, dict):
+                    logger.warning(f"Stage 3 evaluation returned non-dictionary result: {stage3_result}")
+                    return result
+            except Exception as e:
+                logger.error(f"Error in stage 3 evaluation: {str(e)}")
+                return result
             
             # Merge results
-            result = {**result, **stage3_result}
+            for name, value in stage3_result.items():
+                if isinstance(value, (int, float)) and name != "error":
+                    result[name] = float(value)
             
             return result
         
@@ -308,8 +339,21 @@ def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool
         if not metrics:
             return False
         
-        # Calculate average score
-        avg_score = sum(metrics.values()) / len(metrics)
+        # Calculate average score, skipping non-numeric values and 'error' key
+        valid_metrics = []
+        for name, value in metrics.items():
+            # Skip 'error' keys and ensure values are numeric
+            if name != 'error' and isinstance(value, (int, float)):
+                try:
+                    valid_metrics.append(float(value))
+                except (TypeError, ValueError):
+                    logger.warning(f"Skipping non-numeric metric: {name}={value}")
+                    continue
+        
+        if not valid_metrics:
+            return False
+            
+        avg_score = sum(valid_metrics) / len(valid_metrics)
         return avg_score >= threshold
     
     async def evaluate_multiple(