41 changes: 40 additions & 1 deletion lmms_eval/api/metrics.py
@@ -531,11 +531,37 @@ def bootstrap_stderr(f, xs, iters):
    return sample_stddev(res)


def bootstrap_chair_metric(metric_fn, xs, iters):
    """Bootstrap the stderr of a CHAIR-style aggregate metric without multiprocessing."""
    from tqdm import tqdm

    print(f"bootstrapping for stddev: {metric_fn.__name__}")
    res = []
    for _ in tqdm(range(iters), desc="Bootstrap"):
        bootstrap_sample = random.choices(xs, k=len(xs))
        res.append(metric_fn(bootstrap_sample))

    return sample_stddev(res)

def stderr_for_metric(metric, bootstrap_iters: int):
    if bootstrap_iters <= 0:
        # return no function (don't compute stderr) if bootstrap_iters <= 0
        return None

    # for coco_cap_chair
    from lmms_eval.tasks.coco_cap_chair.utils import (
        coco_cap_chair_aggregate_results_chair_i,
        coco_cap_chair_aggregate_results_chair_s,
        coco_cap_chair_aggregate_results_recall,
    )

    # for amber_g
    from lmms_eval.tasks.amber_g.utils import (
        amber_g_aggregate_chair,
        amber_g_aggregate_cover,
        amber_g_aggregate_hal,
        amber_g_aggregate_cog,
    )

    bootstrappable = [
        median,
        matthews_corrcoef,
@@ -544,11 +570,24 @@ def stderr_for_metric(metric, bootstrap_iters: int):
        bleu,
        chrf,
        ter,
        coco_cap_chair_aggregate_results_chair_i,
        coco_cap_chair_aggregate_results_chair_s,
        coco_cap_chair_aggregate_results_recall,
        amber_g_aggregate_chair,
        amber_g_aggregate_cover,
        amber_g_aggregate_hal,
        amber_g_aggregate_cog,
    ]

    if metric in bootstrappable:
        return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)

    # Fall back to name matching so wrapped CHAIR/AMBER aggregates still get a bootstrap
    # ("amber_" already covers "amber_g", so a single check suffices).
    if hasattr(metric, "__name__") and ("coco_cap_chair" in metric.__name__ or "amber_" in metric.__name__):
        return lambda x: bootstrap_chair_metric(metric, x, iters=bootstrap_iters)

    stderr = {mean: mean_stderr, acc_all: acc_all_stderr}

    return stderr.get(metric, None)
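For context, the effect of the new fallback is that any aggregate whose __name__ contains "coco_cap_chair" or "amber_" gets a single-process bootstrap of its standard error. Below is a minimal, self-contained sketch of that bootstrap; the toy_chair aggregate and the sample data are illustrative assumptions, not code from this repository.

import random

def sample_stddev(xs):
    # plays the role of lmms_eval's sample_stddev: std dev of the bootstrap replicates
    m = sum(xs) / len(xs)
    return (sum((x - m) ** 2 for x in xs) / (len(xs) - 1)) ** 0.5

def bootstrap_chair_metric(metric_fn, xs, iters):
    # resample the per-example results with replacement and re-aggregate each time
    res = [metric_fn(random.choices(xs, k=len(xs))) for _ in range(iters)]
    return sample_stddev(res)

def toy_chair(results):
    # toy CHAIR-style aggregate: hallucinated mentions / all mentions
    hallucinated = sum(r["hallucinated"] for r in results)
    mentioned = sum(r["mentioned"] for r in results)
    return hallucinated / max(mentioned, 1)

results = [{"hallucinated": i % 3, "mentioned": 5} for i in range(100)]
print("CHAIR:", toy_chair(results))
print("bootstrap stderr:", bootstrap_chair_metric(toy_chair, results, iters=200))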
1 change: 1 addition & 0 deletions lmms_eval/evaluator.py
Reviewer comment (Collaborator): This extra line should be removed.
@@ -582,6 +582,7 @@ def evaluate(
"doc_id": doc_id,
"doc": saved_doc,
"target": target,
# "pred": metrics['coco_cap_chair_i']['pred'],
"arguments": filtered_arguments,
"resps": [req.resps for req in requests],
"filtered_resps": [req.filtered_resps[filter_key] for req in requests],
23 changes: 16 additions & 7 deletions lmms_eval/models/simple/llava_onevision1_5.py
@@ -46,6 +46,7 @@ def __init__(
        max_image_size: Optional[int] = None,  # Only applicable if use_custom_video_loader is True
        system_prompt: Optional[str] = "You are a helpful assistant.",
        interleave_visuals: Optional[bool] = False,
        image_first: Optional[bool] = True,
        reasoning_prompt: Optional[str] = None,
        max_length: int = 2048,
        **kwargs,
@@ -86,7 +87,7 @@ def __init__(
        self.max_pixels = max_pixels
        self.min_pixels = min_pixels
        self.max_num_frames = max_num_frames

        self.image_first = image_first
        if reasoning_prompt:
            self.reasoning_prompt = reasoning_prompt.replace("\\n", "\n")
        else:
@@ -236,12 +237,20 @@ def _collate(x):
                    processed_visuals.append({"type": "image", "image": visual.convert("RGB")})

            if self.interleave_visuals is False:
                message.append(
                    {
                        "role": "user",
                        "content": processed_visuals + [{"type": "text", "text": context}],
                    }
                )
                if self.image_first:
                    message.append(
                        {
                            "role": "user",
                            "content": processed_visuals + [{"type": "text", "text": context}],
                        }
                    )
                else:
                    message.append(
                        {
                            "role": "user",
                            "content": [{"type": "text", "text": context}] + processed_visuals,
                        }
                    )
            else:  # currently supports <image x> placeholders in the context
                image_placeholders = re.findall(r"<image \d+>", context)
                content_parts = []
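To make the new flag concrete, here is a small sketch of the two non-interleaved message layouts the image_first switch selects between; the visual and prompt values are placeholders, assuming a single image.

# Assumed placeholder inputs, for illustration only.
processed_visuals = [{"type": "image", "image": "<PIL.Image (RGB)>"}]
context = "Describe the image."

# image_first=True (the default here): visuals precede the text prompt.
image_first_message = {
    "role": "user",
    "content": processed_visuals + [{"type": "text", "text": context}],
}

# image_first=False: the text prompt precedes the visuals.
text_first_message = {
    "role": "user",
    "content": [{"type": "text", "text": context}] + processed_visuals,
}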
21 changes: 17 additions & 4 deletions lmms_eval/models/simple/vllm.py
@@ -156,6 +156,7 @@ def __init__(
        chat_template: Optional[str] = None,
        min_image_pixels: int = 28,  # minimum image dimension, required for Qwen 2/2.5-VL models
        disable_log_stats: bool = False,
        image_first: bool = False,
        **kwargs,
    ) -> None:
        super().__init__()
@@ -167,6 +168,7 @@ def __init__(
        self.chat_template = chat_template
        self.min_image_pixels = min_image_pixels
        self.data_parallel_size = data_parallel_size
        self.image_first = image_first
        # Qwen 2/2.5-VL models enforce minimum image dimensions
        self._enforce_image_resize = self._is_qwen_vl_model(model)

@@ -338,11 +340,22 @@ def generate_until(self, requests) -> List[str]:
                    imgs.append(task.result())

            messages = [{"role": "user", "content": []}]
            if self.image_first:
                # Images first, then the text prompt
                for img in self.flatten(imgs):
                    messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
                messages[0]["content"].append({"type": "text", "text": contexts})
            else:
                # Text prompt first, then images
                messages[0]["content"].append({"type": "text", "text": contexts})
                for img in self.flatten(imgs):
                    messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
            # Original code
            # Add images first, then text
            for img in self.flatten(imgs):
                messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
            messages[0]["content"].append({"type": "text", "text": contexts})

            # for img in self.flatten(imgs):
            #     messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
            # messages[0]["content"].append({"type": "text", "text": contexts})
            batched_messages.append(messages)

sampling_params = SamplingParams(**params)
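For comparison, the vLLM wrapper builds OpenAI-style content parts, and only their order changes with image_first (which defaults to False here). A minimal sketch, with a placeholder base64 string:

img_b64 = "<base64-encoded PNG>"  # placeholder, not a real encoding
contexts = "Describe the image."

def build_content(image_first: bool):
    # assemble the user message content in the requested order
    image_part = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}}
    text_part = {"type": "text", "text": contexts}
    return [image_part, text_part] if image_first else [text_part, image_part]

messages = [{"role": "user", "content": build_content(image_first=False)}]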
4 changes: 2 additions & 2 deletions lmms_eval/tasks/_task_utils/file_utils.py
@@ -2,11 +2,11 @@


def generate_submission_file(file_name, args, subpath="submissions"):
    if args.output_path is None:
    if args is None or args.output is None:
        # If no output path is specified, use the current directory
        path = subpath
    else:
        path = os.path.join(args.output_path, subpath)
        path = os.path.join(args.output, subpath)
    os.makedirs(path, exist_ok=True)
    path = os.path.join(path, file_name)
    return os.path.abspath(path)
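A quick usage sketch of the changed fallback behaviour, assuming args is a simple namespace exposing an output attribute; the file name and directory are illustrative, not taken from the repository.

import argparse
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

# args=None (or args.output=None) now falls back to ./submissions
print(generate_submission_file("amber_g_results.json", args=None))

# otherwise files land under <output>/submissions/
args = argparse.Namespace(output="./logs")
print(generate_submission_file("amber_g_results.json", args))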
42 changes: 42 additions & 0 deletions lmms_eval/tasks/amber_g/amber_g.yaml
@@ -0,0 +1,42 @@
# AMBER-G (Generative Task) Evaluation Configuration
# Based on: https://github.com/junyangwang0410/AMBER
# Dataset includes: images, questions, and complete ground truth annotations

dataset_path: Kyunnilee/amber_g # use this dataset
dataset_kwargs:
  trust_remote_code: true
task: "amber_g"
output_type: generate_until

doc_to_visual: !function utils.amber_g_doc_to_visual
doc_to_text: !function utils.amber_g_doc_to_text
doc_to_target: "truth"
test_split: train

generation_kwargs:
  max_new_tokens: 2048
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
  until: []  # important: the default is ["\n\n"], which would truncate generations

process_results: !function utils.amber_g_process_result

# AMBER-G Metrics:
metric_list:
  - metric: amber_chair
    aggregation: !function utils.amber_g_aggregate_chair
    higher_is_better: false
  - metric: amber_cover
    aggregation: !function utils.amber_g_aggregate_cover
    higher_is_better: true
  - metric: amber_hal
    aggregation: !function utils.amber_g_aggregate_hal
    higher_is_better: false
  - metric: amber_cog
    aggregation: !function utils.amber_g_aggregate_cog
    higher_is_better: false  # in AMBER, Cog counts hallucinations, so lower is better

metadata:
  - version: 0.0
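The aggregation functions referenced above live in the task's utils.py, which is not shown in this diff. As a rough illustration of the kind of bookkeeping CHAIR/Cover-style aggregates perform, here is a minimal sketch; the per-example keys (hallucinated_objects, mentioned_objects, gt_objects) and the function names are assumptions for illustration, not the actual amber_g schema.

def aggregate_chair(results):
    # CHAIR-style score: hallucinated object mentions / all object mentions (lower is better)
    hallucinated = sum(len(r["hallucinated_objects"]) for r in results)
    mentioned = sum(len(r["mentioned_objects"]) for r in results)
    return hallucinated / max(mentioned, 1)

def aggregate_cover(results):
    # Cover-style score: fraction of ground-truth objects mentioned in the caption (higher is better)
    covered = sum(len(set(r["mentioned_objects"]) & set(r["gt_objects"])) for r in results)
    total = sum(len(r["gt_objects"]) for r in results)
    return covered / max(total, 1)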