Merge pull request #79 from bigcode-project/e2b_debug

terryyz · web-flow · commit e70f177ae0d6 · 2025-02-14T03:53:13.000+08:00
Fix E2B execution debugging
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
@@ -189,10 +189,6 @@ def evaluate(
         
         # run the evaluation
         print(f"Command run in sandbox {e2b_endpoint}")
-        if not isinstance(pass_k, str):
-            pass_k = ",".join(pass_k)
-        if not isinstance(selective_evaluate, str):
-            selective_evaluate = ",".join(selective_evaluate)
         sandbox.commands.run("bigcodebench.evaluate  --execution 'local' "
                         f"--split {split} --subset {subset} --samples {samples} "
                         f"--pass_k {pass_k} --save_pass_rate {save_pass_rate} --calibrated {calibrated} "
@@ -209,9 +205,16 @@ def evaluate(
     else:
         
         pass_at_k = dict()
-
-        passk = [int(k) for k in pass_k.split(",")]
+        passk = list(pass_k)
         
+        if isinstance(selective_evaluate, str):
+            selected_ids = set(selective_evaluate.split(","))
+        else:
+            try:
+                selected_ids = set(selective_evaluate)
+            except:
+                selected_ids = ""
+
         if parallel < 1:
             n_workers = max(1, multiprocessing.cpu_count() // 2)
         else:
@@ -224,11 +227,7 @@ def evaluate(
         problems = get_bigcodebench(subset=subset)
         
         # Add selective evaluation logic
-        if selective_evaluate:
-            if isinstance(selective_evaluate, str):
-                selected_ids = set(selective_evaluate.split(","))
-            else:
-                selected_ids = set(selective_evaluate)
+        if selected_ids:
             problems = {k: v for k, v in problems.items() if k in selected_ids}
             if not problems:
                 raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset")
diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile
@@ -33,7 +33,7 @@ RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench
 
 RUN pip install numpy==1.24.3 pyarrow==14.0.1
 
-RUN cd /bigcodebench && \
+RUN cd /bigcodebench && git checkout e2b_debug && \
     pip install . --no-deps
     
 RUN pip install --timeout 2000 \