Skip to content

Commit e76a254

Browse files
committed
handle cascading errors better.
1 parent 644272d commit e76a254

File tree

2 files changed

+61
-7
lines changed

2 files changed

+61
-7
lines changed

examples/function_minimization/evaluator.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ def evaluate(program_path):
118118
except TimeoutError as e:
119119
print(f"Trial {trial}: {str(e)}")
120120
continue
121+
except IndexError as e:
122+
# Handle IndexError separately: it usually means an early-termination check indexed a list before it was fully populated
123+
print(f"Trial {trial}: IndexError - {str(e)}")
124+
print("This is likely due to a list index check before the list is fully populated.")
125+
continue
121126
except Exception as e:
122127
print(f"Trial {trial}: Error - {str(e)}")
123128
print(traceback.format_exc())
@@ -226,6 +231,11 @@ def evaluate_stage1(program_path):
226231
except TimeoutError as e:
227232
print(f"Stage 1 evaluation timed out: {e}")
228233
return {"runs_successfully": 0.0, "error": "Timeout"}
234+
except IndexError as e:
235+
# Handle IndexError separately: it usually means an early-termination check indexed a list before it was fully populated
236+
print(f"Stage 1 evaluation failed with IndexError: {e}")
237+
print("This is likely due to a list index check before the list is fully populated.")
238+
return {"runs_successfully": 0.0, "error": f"IndexError: {str(e)}"}
229239
except Exception as e:
230240
print(f"Stage 1 evaluation failed: {e}")
231241
print(traceback.format_exc())

openevolve/evaluator.py

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,14 @@ async def _cascade_evaluate(self, program_path: str) -> Dict[str, float]:
179179
return await self._direct_evaluate(program_path)
180180

181181
# Run first stage
182-
stage1_result = await run_in_executor(module.evaluate_stage1)(program_path)
182+
try:
183+
stage1_result = await run_in_executor(module.evaluate_stage1)(program_path)
184+
if not isinstance(stage1_result, dict):
185+
logger.warning(f"Stage 1 evaluation returned non-dictionary result: {stage1_result}")
186+
return {"error": 0.0}
187+
except Exception as e:
188+
logger.error(f"Error in stage 1 evaluation: {str(e)}")
189+
return {"error": 0.0}
183190

184191
# Check threshold
185192
if not self._passes_threshold(stage1_result, self.config.cascade_thresholds[0]):
@@ -190,10 +197,25 @@ async def _cascade_evaluate(self, program_path: str) -> Dict[str, float]:
190197
return stage1_result
191198

192199
# Run second stage
193-
stage2_result = await run_in_executor(module.evaluate_stage2)(program_path)
200+
try:
201+
stage2_result = await run_in_executor(module.evaluate_stage2)(program_path)
202+
if not isinstance(stage2_result, dict):
203+
logger.warning(f"Stage 2 evaluation returned non-dictionary result: {stage2_result}")
204+
return stage1_result
205+
except Exception as e:
206+
logger.error(f"Error in stage 2 evaluation: {str(e)}")
207+
return stage1_result
194208

195209
# Merge results
196-
result = {**stage1_result, **stage2_result}
210+
result = {}
211+
# Keep only numeric metrics (dropping the "error" key and non-numeric values) and convert them to float to avoid type errors
212+
for name, value in stage1_result.items():
213+
if isinstance(value, (int, float)) and name != "error":
214+
result[name] = float(value)
215+
216+
for name, value in stage2_result.items():
217+
if isinstance(value, (int, float)) and name != "error":
218+
result[name] = float(value)
197219

198220
# Check threshold
199221
if len(self.config.cascade_thresholds) < 2 or not self._passes_threshold(
@@ -206,10 +228,19 @@ async def _cascade_evaluate(self, program_path: str) -> Dict[str, float]:
206228
return result
207229

208230
# Run third stage
209-
stage3_result = await run_in_executor(module.evaluate_stage3)(program_path)
231+
try:
232+
stage3_result = await run_in_executor(module.evaluate_stage3)(program_path)
233+
if not isinstance(stage3_result, dict):
234+
logger.warning(f"Stage 3 evaluation returned non-dictionary result: {stage3_result}")
235+
return result
236+
except Exception as e:
237+
logger.error(f"Error in stage 3 evaluation: {str(e)}")
238+
return result
210239

211240
# Merge results
212-
result = {**result, **stage3_result}
241+
for name, value in stage3_result.items():
242+
if isinstance(value, (int, float)) and name != "error":
243+
result[name] = float(value)
213244

214245
return result
215246

@@ -308,8 +339,21 @@ def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool
308339
if not metrics:
309340
return False
310341

311-
# Calculate average score
312-
avg_score = sum(metrics.values()) / len(metrics)
342+
# Calculate average score, skipping non-numeric values and 'error' key
343+
valid_metrics = []
344+
for name, value in metrics.items():
345+
# Skip 'error' keys and ensure values are numeric
346+
if name != 'error' and isinstance(value, (int, float)):
347+
try:
348+
valid_metrics.append(float(value))
349+
except (TypeError, ValueError):
350+
logger.warning(f"Skipping non-numeric metric: {name}={value}")
351+
continue
352+
353+
if not valid_metrics:
354+
return False
355+
356+
avg_score = sum(valid_metrics) / len(valid_metrics)
313357
return avg_score >= threshold
314358

315359
async def evaluate_multiple(

0 commit comments

Comments
 (0)