Merge pull request #187 from stanfordnlp/evaluation_error_handling

okhat · web-flow · commit e7f163d22e0a · 2023-10-25T08:19:09.000-07:00
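Added proper error handling within evaluate and bootstrap: each failure increments a lock-protected error counter, and the exception is re-raised once the count reaches max_errors.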
diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
@@ -28,6 +28,8 @@ def __init__(self, *, devset, metric=None, num_threads=1, display_progress=False
         self.display_table = display_table
         self.display = display
         self.max_errors = max_errors
+        self.error_count = 0
+        self.error_lock = threading.Lock()
 
     def _execute_single_thread(self, wrapped_program, devset, display_progress):
         ncorrect = 0
@@ -95,6 +97,11 @@ def wrapped_program(example_idx, example):
                 score = metric(example, prediction)  # FIXME: TODO: What's the right order? Maybe force name-based kwargs!
                 return example_idx, example, prediction, score
             except Exception as e:
+                with self.error_lock:
+                    self.error_count += 1
+                    current_error_count = self.error_count
+                if current_error_count >= self.max_errors:
+                    raise e
                 print(f"Error for example in dev set: \t\t {e}")
                 return example_idx, example, dict(), 0.0
             finally:
diff --git a/dspy/teleprompt/bootstrap.py b/dspy/teleprompt/bootstrap.py
@@ -1,6 +1,7 @@
 import dsp
 import tqdm
 import random
+import threading
 
 from dspy.primitives import Example
 
@@ -30,13 +31,16 @@
 
 
 class BootstrapFewShot(Teleprompter):
-    def __init__(self, metric=None, teacher_settings={}, max_bootstrapped_demos=4, max_labeled_demos=16, max_rounds=1):
+    def __init__(self, metric=None, teacher_settings={}, max_bootstrapped_demos=4, max_labeled_demos=16, max_rounds=1, max_errors=5):
         self.metric = metric
         self.teacher_settings = teacher_settings
 
         self.max_bootstrapped_demos = max_bootstrapped_demos
         self.max_labeled_demos = max_labeled_demos
         self.max_rounds = max_rounds
+        self.max_errors= max_errors
+        self.error_count = 0
+        self.error_lock = threading.Lock()
 
     def compile(self, student, *, teacher=None, trainset, valset=None):
         self.trainset = trainset
@@ -135,7 +139,11 @@ def _bootstrap_one_example(self, example, round_idx=0):
                 # print(success, example, prediction)
         except Exception as e:
             success = False
-            # FIXME: remove the reliance on uuid here so the error is printed
+            with self.error_lock:
+                self.error_count += 1
+                current_error_count = self.error_count
+            if current_error_count >= self.max_errors:
+                raise e
             print(f'Failed to run or to evaluate example {example} with {self.metric} due to {e}.')
         
         if success: