From 1f15a45052b2e25e18f21ab822f4e1fcc03de607 Mon Sep 17 00:00:00 2001 From: Patrick Wu Date: Sat, 4 Oct 2025 00:50:01 +0000 Subject: [PATCH 01/15] add coco captioning chair --- .../tasks/coco_cap_chair/coco_cap_chair.yaml | 29 ++++ lmms_eval/tasks/coco_cap_chair/utils.py | 136 ++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 lmms_eval/tasks/coco_cap_chair/coco_cap_chair.yaml create mode 100644 lmms_eval/tasks/coco_cap_chair/utils.py diff --git a/lmms_eval/tasks/coco_cap_chair/coco_cap_chair.yaml b/lmms_eval/tasks/coco_cap_chair/coco_cap_chair.yaml new file mode 100644 index 000000000..aaaed0ce9 --- /dev/null +++ b/lmms_eval/tasks/coco_cap_chair/coco_cap_chair.yaml @@ -0,0 +1,29 @@ +dataset_path: tsunghanwu/mscoco_chair +dataset_kwargs: + token: True +task: "coco_cap_chair" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.coco_cap_chair_doc_to_visual +doc_to_text: !function utils.coco_cap_chair_doc_to_text +doc_to_target: "gt_object" +generation_kwargs: + max_new_tokens: 2048 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false + until: [] # really important!!! 
the default would be ["\n\n"] and that will cause truncation + +process_results: !function utils.coco_cap_chair_process_result +# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results +metric_list: + - metric: coco_cap_chair_i + aggregation : !function utils.coco_cap_chair_aggregate_results_chair_i + higher_is_better : false + - metric: coco_cap_chair_s + aggregation : !function utils.coco_cap_chair_aggregate_results_chair_s + higher_is_better : false + +metadata: + - version: 0.0 \ No newline at end of file diff --git a/lmms_eval/tasks/coco_cap_chair/utils.py b/lmms_eval/tasks/coco_cap_chair/utils.py new file mode 100644 index 000000000..e2928ece7 --- /dev/null +++ b/lmms_eval/tasks/coco_cap_chair/utils.py @@ -0,0 +1,136 @@ +import nltk +from nltk.corpus import wordnet +from nltk.stem import WordNetLemmatizer + + +CHAIR_METRICS = ["chair_s", "chair_i"] + +MSCOCO_OBJECTS = ['person', 'girl', 'boy', 'man', 'woman', 'kid', 'child', 'chef', 'baker', 'people', 'adult', 'rider', 'children', 'baby', 'worker', 'passenger', 'sister', 'biker', 'policeman', 'cop', 'officer', 'lady', 'cowboy', 'bride', 'groom', 'male', 'female', 'guy', 'traveler', 'mother', 'father', 'gentleman', 'pitcher', 'player', 'skier', 'snowboarder', 'skater', 'skateboarder', 'person', 'woman', 'guy', 'foreigner', 'child', 'gentleman', 'caller', 'offender', 'coworker', 'trespasser', 'patient', 'politician', 'soldier', 'grandchild', 'serviceman', 'walker', 'drinker', 'doctor', 'bicyclist', 'thief', 'buyer', 'teenager', 'student', 'camper', 'driver', 'solider', 'hunter', 'shopper', 'villager', 'bicycle', 'bike', 'bicycle', 'bike', 'unicycle', 'minibike', 'trike', 'car', 'automobile', 'van', 'minivan', 'sedan', 'suv', 'hatchback', 'cab', 'jeep', 'coupe', 'taxicab', 'limo', 'taxi', 'motorcycle', 'scooter', ' motor bike', 'motor cycle', 'motorbike', 'scooter', 'moped', 'airplane', 'jetliner', 'plane', 'air plane', 'monoplane', 
'aircraft', 'jet', 'jetliner', 'airbus', 'biplane', 'seaplane', 'bus', 'minibus', 'trolley', 'train', 'locomotive', 'tramway', 'caboose', 'truck', 'pickup', 'lorry', 'hauler', 'firetruck', 'boat', 'ship', 'liner', 'sailboat', 'motorboat', 'dinghy', 'powerboat', 'speedboat', 'canoe', 'skiff', 'yacht', 'kayak', 'catamaran', 'pontoon', 'houseboat', 'vessel', 'rowboat', 'trawler', 'ferryboat', 'watercraft', 'tugboat', 'schooner', 'barge', 'ferry', 'sailboard', 'paddleboat', 'lifeboat', 'freighter', 'steamboat', 'riverboat', 'battleship', 'steamship', 'traffic light', 'street light', 'traffic signal', 'stop light', 'streetlight', 'stoplight', 'fire hydrant', 'hydrant', 'stop sign', 'parking meter', 'bench', 'pew', 'bird', 'ostrich', 'owl', 'seagull', 'goose', 'duck', 'parakeet', 'falcon', 'robin', 'pelican', 'waterfowl', 'heron', 'hummingbird', 'mallard', 'finch', 'pigeon', 'sparrow', 'seabird', 'osprey', 'blackbird', 'fowl', 'shorebird', 'woodpecker', 'egret', 'chickadee', 'quail', 'bluebird', 'kingfisher', 'buzzard', 'willet', 'gull', 'swan', 'bluejay', 'flamingo', 'cormorant', 'parrot', 'loon', 'gosling', 'waterbird', 'pheasant', 'rooster', 'sandpiper', 'crow', 'raven', 'turkey', 'oriole', 'cowbird', 'warbler', 'magpie', 'peacock', 'cockatiel', 'lorikeet', 'puffin', 'vulture', 'condor', 'macaw', 'peafowl', 'cockatoo', 'songbird', 'cat', 'kitten', 'feline', 'tabby', 'dog', 'puppy', 'beagle', 'pup', 'chihuahua', 'schnauzer', 'dachshund', 'rottweiler', 'canine', 'pitbull', 'collie', 'pug', 'terrier', 'poodle', 'labrador', 'doggie', 'doberman', 'mutt', 'doggy', 'spaniel', 'bulldog', 'sheepdog', 'weimaraner', 'corgi', 'cocker', 'greyhound', 'retriever', 'brindle', 'hound', 'whippet', 'husky', 'horse', 'colt', 'pony', 'racehorse', 'stallion', 'equine', 'mare', 'foal', 'palomino', 'mustang', 'clydesdale', 'bronc', 'bronco', 'sheep', 'lamb', 'ram', 'lamb', 'goat', 'ewe', 'cow', 'cattle', 'oxen', 'ox', 'calf', 'cattle', 'holstein', 'heifer', 'buffalo', 'bull', 'zebu', 
'bison', 'elephant', 'bear', 'panda', 'zebra', 'giraffe', 'backpack', 'knapsack', 'umbrella', 'handbag', 'wallet', 'purse', 'briefcase', 'tie', 'bow', 'bow tie', 'suitcase', 'suit case', 'luggage', 'frisbee', 'skis', 'ski', 'snowboard', 'sports ball', 'ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'longboard', 'skimboard', 'shortboard', 'wakeboard', 'tennis racket', 'racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'pocketknife', 'knive', 'spoon', 'bowl', 'container', 'banana', 'apple', 'sandwich', 'burger', 'sub', 'cheeseburger', 'hamburger', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'doughnut', 'bagel', 'cake', ' cheesecake', 'cupcake', 'shortcake', 'coffeecake', 'pancake', 'chair', 'seat', 'stool', 'couch', 'sofa', 'recliner', 'futon', 'loveseat', 'settee', 'chesterfield', 'potted plant', 'houseplant', 'bed', 'dining table', 'table', 'desk', 'toilet', 'urinal', 'commode', 'toilet', 'lavatory', 'potty', 'tv', 'monitor', 'televison', 'television', 'laptop', 'computer', 'notebook', 'netbook', 'lenovo', 'macbook', 'laptop computer', 'mouse', 'remote', 'keyboard', 'cell phone', 'mobile phone', 'phone', 'cellphone', 'telephone', 'phon', 'smartphone', 'iPhone', 'microwave', 'oven', 'stovetop', 'stove', 'stove top oven', 'toaster', 'sink', 'refrigerator', 'fridge', 'fridge', 'freezer', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'teddybear', 'hair drier', 'hairdryer', 'toothbrush'] +INVERSE_SYNONYM_DICT = {'person': 'person', 'girl': 'person', 'boy': 'person', 'man': 'person', 'woman': 'person', 'kid': 'person', 'child': 'person', 'chef': 'person', 'baker': 'person', 'people': 'person', 'adult': 'person', 'rider': 'person', 'children': 'person', 'baby': 'person', 'worker': 'person', 'passenger': 'person', 'sister': 'person', 'biker': 'person', 'policeman': 'person', 'cop': 'person', 'officer': 'person', 'lady': 'person', 'cowboy': 'person', 'bride': 'person', 'groom': 'person', 'male': 'person', 'female': 
'person', 'guy': 'person', 'traveler': 'person', 'mother': 'person', 'father': 'person', 'gentleman': 'person', 'pitcher': 'person', 'player': 'person', 'skier': 'person', 'snowboarder': 'person', 'skater': 'person', 'skateboarder': 'person', 'foreigner': 'person', 'caller': 'person', 'offender': 'person', 'coworker': 'person', 'trespasser': 'person', 'patient': 'person', 'politician': 'person', 'soldier': 'person', 'grandchild': 'person', 'serviceman': 'person', 'walker': 'person', 'drinker': 'person', 'doctor': 'person', 'bicyclist': 'person', 'thief': 'person', 'buyer': 'person', 'teenager': 'person', 'student': 'person', 'camper': 'person', 'driver': 'person', 'solider': 'person', 'hunter': 'person', 'shopper': 'person', 'villager': 'person', 'bicycle': 'bicycle', 'bike': 'bicycle', 'unicycle': 'bicycle', 'minibike': 'bicycle', 'trike': 'bicycle', 'car': 'car', 'automobile': 'car', 'van': 'car', 'minivan': 'car', 'sedan': 'car', 'suv': 'car', 'hatchback': 'car', 'cab': 'car', 'jeep': 'car', 'coupe': 'car', 'taxicab': 'car', 'limo': 'car', 'taxi': 'car', 'motorcycle': 'motorcycle', 'scooter': 'motorcycle', ' motor bike': 'motorcycle', 'motor cycle': 'motorcycle', 'motorbike': 'motorcycle', 'moped': 'motorcycle', 'airplane': 'airplane', 'jetliner': 'airplane', 'plane': 'airplane', 'air plane': 'airplane', 'monoplane': 'airplane', 'aircraft': 'airplane', 'jet': 'airplane', 'airbus': 'airplane', 'biplane': 'airplane', 'seaplane': 'airplane', 'bus': 'bus', 'minibus': 'bus', 'trolley': 'bus', 'train': 'train', 'locomotive': 'train', 'tramway': 'train', 'caboose': 'train', 'truck': 'truck', 'pickup': 'truck', 'lorry': 'truck', 'hauler': 'truck', 'firetruck': 'truck', 'boat': 'boat', 'ship': 'boat', 'liner': 'boat', 'sailboat': 'boat', 'motorboat': 'boat', 'dinghy': 'boat', 'powerboat': 'boat', 'speedboat': 'boat', 'canoe': 'boat', 'skiff': 'boat', 'yacht': 'boat', 'kayak': 'boat', 'catamaran': 'boat', 'pontoon': 'boat', 'houseboat': 'boat', 'vessel': 'boat', 
'rowboat': 'boat', 'trawler': 'boat', 'ferryboat': 'boat', 'watercraft': 'boat', 'tugboat': 'boat', 'schooner': 'boat', 'barge': 'boat', 'ferry': 'boat', 'sailboard': 'boat', 'paddleboat': 'boat', 'lifeboat': 'boat', 'freighter': 'boat', 'steamboat': 'boat', 'riverboat': 'boat', 'battleship': 'boat', 'steamship': 'boat', 'traffic light': 'traffic light', 'street light': 'traffic light', 'traffic signal': 'traffic light', 'stop light': 'traffic light', 'streetlight': 'traffic light', 'stoplight': 'traffic light', 'fire hydrant': 'fire hydrant', 'hydrant': 'fire hydrant', 'stop sign': 'stop sign', 'parking meter': 'parking meter', 'bench': 'bench', 'pew': 'bench', 'bird': 'bird', 'ostrich': 'bird', 'owl': 'bird', 'seagull': 'bird', 'goose': 'bird', 'duck': 'bird', 'parakeet': 'bird', 'falcon': 'bird', 'robin': 'bird', 'pelican': 'bird', 'waterfowl': 'bird', 'heron': 'bird', 'hummingbird': 'bird', 'mallard': 'bird', 'finch': 'bird', 'pigeon': 'bird', 'sparrow': 'bird', 'seabird': 'bird', 'osprey': 'bird', 'blackbird': 'bird', 'fowl': 'bird', 'shorebird': 'bird', 'woodpecker': 'bird', 'egret': 'bird', 'chickadee': 'bird', 'quail': 'bird', 'bluebird': 'bird', 'kingfisher': 'bird', 'buzzard': 'bird', 'willet': 'bird', 'gull': 'bird', 'swan': 'bird', 'bluejay': 'bird', 'flamingo': 'bird', 'cormorant': 'bird', 'parrot': 'bird', 'loon': 'bird', 'gosling': 'bird', 'waterbird': 'bird', 'pheasant': 'bird', 'rooster': 'bird', 'sandpiper': 'bird', 'crow': 'bird', 'raven': 'bird', 'turkey': 'bird', 'oriole': 'bird', 'cowbird': 'bird', 'warbler': 'bird', 'magpie': 'bird', 'peacock': 'bird', 'cockatiel': 'bird', 'lorikeet': 'bird', 'puffin': 'bird', 'vulture': 'bird', 'condor': 'bird', 'macaw': 'bird', 'peafowl': 'bird', 'cockatoo': 'bird', 'songbird': 'bird', 'cat': 'cat', 'kitten': 'cat', 'feline': 'cat', 'tabby': 'cat', 'dog': 'dog', 'puppy': 'dog', 'beagle': 'dog', 'pup': 'dog', 'chihuahua': 'dog', 'schnauzer': 'dog', 'dachshund': 'dog', 'rottweiler': 'dog', 'canine': 'dog', 
'pitbull': 'dog', 'collie': 'dog', 'pug': 'dog', 'terrier': 'dog', 'poodle': 'dog', 'labrador': 'dog', 'doggie': 'dog', 'doberman': 'dog', 'mutt': 'dog', 'doggy': 'dog', 'spaniel': 'dog', 'bulldog': 'dog', 'sheepdog': 'dog', 'weimaraner': 'dog', 'corgi': 'dog', 'cocker': 'dog', 'greyhound': 'dog', 'retriever': 'dog', 'brindle': 'dog', 'hound': 'dog', 'whippet': 'dog', 'husky': 'dog', 'horse': 'horse', 'colt': 'horse', 'pony': 'horse', 'racehorse': 'horse', 'stallion': 'horse', 'equine': 'horse', 'mare': 'horse', 'foal': 'horse', 'palomino': 'horse', 'mustang': 'horse', 'clydesdale': 'horse', 'bronc': 'horse', 'bronco': 'horse', 'sheep': 'sheep', 'lamb': 'sheep', 'ram': 'sheep', 'goat': 'sheep', 'ewe': 'sheep', 'cow': 'cow', 'cattle': 'cow', 'oxen': 'cow', 'ox': 'cow', 'calf': 'cow', 'holstein': 'cow', 'heifer': 'cow', 'buffalo': 'cow', 'bull': 'cow', 'zebu': 'cow', 'bison': 'cow', 'elephant': 'elephant', 'bear': 'bear', 'panda': 'bear', 'zebra': 'zebra', 'giraffe': 'giraffe', 'backpack': 'backpack', 'knapsack': 'backpack', 'umbrella': 'umbrella', 'handbag': 'handbag', 'wallet': 'handbag', 'purse': 'handbag', 'briefcase': 'handbag', 'tie': 'tie', 'bow': 'tie', 'bow tie': 'tie', 'suitcase': 'suitcase', 'suit case': 'suitcase', 'luggage': 'suitcase', 'frisbee': 'frisbee', 'skis': 'skis', 'ski': 'skis', 'snowboard': 'snowboard', 'sports ball': 'sports ball', 'ball': 'sports ball', 'kite': 'kite', 'baseball bat': 'baseball bat', 'baseball glove': 'baseball glove', 'skateboard': 'skateboard', 'surfboard': 'surfboard', 'longboard': 'surfboard', 'skimboard': 'surfboard', 'shortboard': 'surfboard', 'wakeboard': 'surfboard', 'tennis racket': 'tennis racket', 'racket': 'tennis racket', 'bottle': 'bottle', 'wine glass': 'wine glass', 'cup': 'cup', 'fork': 'fork', 'knife': 'knife', 'pocketknife': 'knife', 'knive': 'knife', 'spoon': 'spoon', 'bowl': 'bowl', 'container': 'bowl', 'banana': 'banana', 'apple': 'apple', 'sandwich': 'sandwich', 'burger': 'sandwich', 'sub': 'sandwich', 
'cheeseburger': 'sandwich', 'hamburger': 'sandwich', 'orange': 'orange', 'broccoli': 'broccoli', 'carrot': 'carrot', 'hot dog': 'hot dog', 'pizza': 'pizza', 'donut': 'donut', 'doughnut': 'donut', 'bagel': 'donut', 'cake': 'cake', ' cheesecake': 'cake', 'cupcake': 'cake', 'shortcake': 'cake', 'coffeecake': 'cake', 'pancake': 'cake', 'chair': 'chair', 'seat': 'chair', 'stool': 'chair', 'couch': 'couch', 'sofa': 'couch', 'recliner': 'couch', 'futon': 'couch', 'loveseat': 'couch', 'settee': 'couch', 'chesterfield': 'couch', 'potted plant': 'potted plant', 'houseplant': 'potted plant', 'bed': 'bed', 'dining table': 'dining table', 'table': 'dining table', 'desk': 'dining table', 'toilet': 'toilet', 'urinal': 'toilet', 'commode': 'toilet', 'lavatory': 'toilet', 'potty': 'toilet', 'tv': 'tv', 'monitor': 'tv', 'televison': 'tv', 'television': 'tv', 'laptop': 'laptop', 'computer': 'laptop', 'notebook': 'laptop', 'netbook': 'laptop', 'lenovo': 'laptop', 'macbook': 'laptop', 'laptop computer': 'laptop', 'mouse': 'mouse', 'remote': 'remote', 'keyboard': 'keyboard', 'cell phone': 'cell phone', 'mobile phone': 'cell phone', 'phone': 'cell phone', 'cellphone': 'cell phone', 'telephone': 'cell phone', 'phon': 'cell phone', 'smartphone': 'cell phone', 'iPhone': 'cell phone', 'microwave': 'microwave', 'oven': 'oven', 'stovetop': 'oven', 'stove': 'oven', 'stove top oven': 'oven', 'toaster': 'toaster', 'sink': 'sink', 'refrigerator': 'refrigerator', 'fridge': 'refrigerator', 'freezer': 'refrigerator', 'book': 'book', 'clock': 'clock', 'vase': 'vase', 'scissors': 'scissors', 'teddy bear': 'teddy bear', 'teddybear': 'teddy bear', 'hair drier': 'hair drier', 'hairdryer': 'hair drier', 'toothbrush': 'toothbrush'} +DOUBLE_WORD_DICT = {'motor bike': 'motor bike', 'motor cycle': 'motor cycle', 'air plane': 'air plane', 'traffic light': 'traffic light', 'street light': 'street light', 'traffic signal': 'traffic signal', 'stop light': 'stop light', 'fire hydrant': 'fire hydrant', 'stop sign': 
'stop sign', 'parking meter': 'parking meter', 'suit case': 'suit case', 'sports ball': 'sports ball', 'baseball bat': 'baseball bat', 'baseball glove': 'baseball glove', 'tennis racket': 'tennis racket', 'wine glass': 'wine glass', 'hot dog': 'hot dog', 'cell phone': 'cell phone', 'mobile phone': 'mobile phone', 'teddy bear': 'teddy bear', 'hair drier': 'hair drier', 'potted plant': 'potted plant', 'bow tie': 'tie', 'laptop computer': 'laptop computer', 'stove top oven': 'stove top oven', 'home plate': 'home plate', 'train track': 'train track', 'baby bird': 'bird', 'adult bird': 'bird', 'baby cat': 'cat', 'adult cat': 'cat', 'baby dog': 'dog', 'adult dog': 'dog', 'baby horse': 'horse', 'adult horse': 'horse', 'baby sheep': 'sheep', 'adult sheep': 'sheep', 'baby cow': 'cow', 'adult cow': 'cow', 'baby elephant': 'elephant', 'adult elephant': 'elephant', 'baby bear': 'bear', 'adult bear': 'bear', 'baby zebra': 'zebra', 'adult zebra': 'zebra', 'baby giraffe': 'giraffe', 'adult giraffe': 'giraffe', 'baby animal': 'animal', 'adult animal': 'animal', 'baby cub': 'cub', 'adult cub': 'cub', 'passenger jet': 'jet', 'passenger train': 'train', 'toilet seat': 'toilet', 'wine glas': 'wine glass'} + +def coco_cap_chair_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def coco_cap_chair_doc_to_text(doc): + return f"Please describe this image in detail." 
def caption_to_words(caption):
    """Extract MSCOCO object mentions from a caption.

    Args:
        caption: a raw caption string.

    Returns:
        A 4-tuple:
            words: MSCOCO-vocabulary words found in the caption,
            node_words: their canonical MSCOCO category names,
            idxs: token start indices of those words in the double-word
                merged token list,
            double_words: the full lemmatized token list after double-word
                merging.
    """

    def get_wordnet_pos(tag):
        # Map a Penn Treebank POS tag prefix to a WordNet POS constant.
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('N'):
            return wordnet.NOUN
        if tag.startswith('R'):
            return wordnet.ADV
        return None

    # Standard preprocessing: tokenize, POS-tag, then lemmatize each token,
    # defaulting to noun when the tag has no WordNet equivalent.
    tokens = nltk.word_tokenize(caption.lower())
    tagged_sent = nltk.pos_tag(tokens)
    wnl = WordNetLemmatizer()
    words = [wnl.lemmatize(tok, pos=get_wordnet_pos(tag) or wordnet.NOUN)
             for tok, tag in tagged_sent]

    # Merge two-word MSCOCO expressions ("hot dog", "stop sign", ...) into
    # single tokens; idxs records each merged token's start position.
    i = 0
    double_words = []
    idxs = []
    while i < len(words):
        idxs.append(i)
        double_word = ' '.join(words[i:i + 2])
        if double_word in DOUBLE_WORD_DICT:
            double_words.append(DOUBLE_WORD_DICT[double_word])
            i += 2
        else:
            double_words.append(words[i])
            i += 1
    words = double_words

    # "toilet seat" is not a chair: sentences like "the seat of the toilet"
    # would otherwise fire for "chair" if we did not drop "seat" here.
    # NOTE(review): this filter removes tokens without updating idxs, so idxs
    # can be misaligned afterwards — same behavior as the reference CHAIR
    # implementation; idxs is unused by the downstream metrics here.
    if 'toilet' in words and 'seat' in words:
        words = [word for word in words if word != 'seat']

    # Keep only MSCOCO-vocabulary words. Build the membership set once
    # instead of once per element, then map each surviving word to its
    # canonical synonym.
    mscoco_vocab = set(MSCOCO_OBJECTS)
    idxs = [idxs[idx] for idx, word in enumerate(words) if word in mscoco_vocab]
    words = [word for word in words if word in mscoco_vocab]
    node_words = [INVERSE_SYNONYM_DICT[word] for word in words]
    # Return all the MSCOCO objects in the caption.
    return words, node_words, idxs, double_words


def coco_cap_chair_process_result(doc, result):
    """Convert one generated caption into the per-sample record consumed by
    the CHAIR aggregators.

    Args:
        doc: an instance of the eval dataset; must provide "gt_object"
            (ground-truth object list) and "question_id".
        result: [pred] — the model generations; only the first is scored.

    Returns:
        A dict mapping each metric key ("coco_cap_<metric>") to a record
        holding the ground-truth objects, predicted canonical objects, and
        image id.
    """
    pred = result[0] if result else ""
    words, node_words, idxs, raw_words = caption_to_words(pred)
    image_id = int(doc["question_id"])
    data_dict = {"answer": doc["gt_object"], "pred": node_words, "image_id": image_id}
    return {f"coco_cap_{metric}": data_dict for metric in CHAIR_METRICS}


def coco_cap_chair_aggregate_results_chair_i(results):
    """CHAIR-i: percentage of mentioned object instances that are
    hallucinated (lower is better).

    Args:
        results: a list of records returned by process_results.

    Returns:
        Hallucinated-instance rate in percent; 0.0 when no objects were
        mentioned at all (previously raised ZeroDivisionError).
    """
    num_all_mentioned_objects = 0
    num_hallucinated_objects = 0
    for result in results:
        gt_object = result["answer"]
        pred = result["pred"]
        num_all_mentioned_objects += len(pred)
        # An object is hallucinated if its canonical name is absent from
        # the ground-truth object list.
        num_hallucinated_objects += sum(
            1 for node_word in pred if node_word not in gt_object
        )
    if num_all_mentioned_objects == 0:
        return 0.0
    return (num_hallucinated_objects / num_all_mentioned_objects) * 100


def coco_cap_chair_aggregate_results_chair_s(results):
    """CHAIR-s: percentage of captions containing at least one hallucinated
    object (lower is better).

    Args:
        results: a list of records returned by process_results.

    Returns:
        Hallucinated-sentence rate in percent; 0.0 for empty input
        (previously raised ZeroDivisionError).
    """
    num_samples = len(results)
    if num_samples == 0:
        return 0.0
    num_hallucinated_samples = 0
    for result in results:
        gt_object = result["answer"]
        # A sample counts as hallucinated if any predicted object is
        # missing from the ground truth.
        if any(node_word not in gt_object for node_word in result["pred"]):
            num_hallucinated_samples += 1
    return (num_hallucinated_samples / num_samples) * 100

# --- remainder of the flattened patch stream, preserved verbatim below ---
# \ No newline at end of file From 48eb7de748073f33b24d77da286809af7ef81b38 Mon Sep 17 00:00:00 2001 From: Patrick Wu Date: Sat, 4 Oct 2025 05:03:38 +0000 Subject: [PATCH 02/15] add chair recall --- .../tasks/coco_cap_chair/coco_cap_chair.yaml | 3 +++ lmms_eval/tasks/coco_cap_chair/utils.py | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/lmms_eval/tasks/coco_cap_chair/coco_cap_chair.yaml b/lmms_eval/tasks/coco_cap_chair/coco_cap_chair.yaml index aaaed0ce9..e2ce4fc94 100644 --- a/lmms_eval/tasks/coco_cap_chair/coco_cap_chair.yaml +++ b/lmms_eval/tasks/coco_cap_chair/coco_cap_chair.yaml @@ -24,6 +24,9 @@ metric_list: - metric: coco_cap_chair_s aggregation : !function utils.coco_cap_chair_aggregate_results_chair_s higher_is_better : false + - metric:
coco_cap_recall + aggregation : !function utils.coco_cap_chair_aggregate_results_recall + higher_is_better : true metadata: - version: 0.0 \ No newline at end of file diff --git a/lmms_eval/tasks/coco_cap_chair/utils.py b/lmms_eval/tasks/coco_cap_chair/utils.py index e2928ece7..f4aba5d7c 100644 --- a/lmms_eval/tasks/coco_cap_chair/utils.py +++ b/lmms_eval/tasks/coco_cap_chair/utils.py @@ -3,7 +3,7 @@ from nltk.stem import WordNetLemmatizer -CHAIR_METRICS = ["chair_s", "chair_i"] +CHAIR_METRICS = ["chair_s", "chair_i", "recall"] MSCOCO_OBJECTS = ['person', 'girl', 'boy', 'man', 'woman', 'kid', 'child', 'chef', 'baker', 'people', 'adult', 'rider', 'children', 'baby', 'worker', 'passenger', 'sister', 'biker', 'policeman', 'cop', 'officer', 'lady', 'cowboy', 'bride', 'groom', 'male', 'female', 'guy', 'traveler', 'mother', 'father', 'gentleman', 'pitcher', 'player', 'skier', 'snowboarder', 'skater', 'skateboarder', 'person', 'woman', 'guy', 'foreigner', 'child', 'gentleman', 'caller', 'offender', 'coworker', 'trespasser', 'patient', 'politician', 'soldier', 'grandchild', 'serviceman', 'walker', 'drinker', 'doctor', 'bicyclist', 'thief', 'buyer', 'teenager', 'student', 'camper', 'driver', 'solider', 'hunter', 'shopper', 'villager', 'bicycle', 'bike', 'bicycle', 'bike', 'unicycle', 'minibike', 'trike', 'car', 'automobile', 'van', 'minivan', 'sedan', 'suv', 'hatchback', 'cab', 'jeep', 'coupe', 'taxicab', 'limo', 'taxi', 'motorcycle', 'scooter', ' motor bike', 'motor cycle', 'motorbike', 'scooter', 'moped', 'airplane', 'jetliner', 'plane', 'air plane', 'monoplane', 'aircraft', 'jet', 'jetliner', 'airbus', 'biplane', 'seaplane', 'bus', 'minibus', 'trolley', 'train', 'locomotive', 'tramway', 'caboose', 'truck', 'pickup', 'lorry', 'hauler', 'firetruck', 'boat', 'ship', 'liner', 'sailboat', 'motorboat', 'dinghy', 'powerboat', 'speedboat', 'canoe', 'skiff', 'yacht', 'kayak', 'catamaran', 'pontoon', 'houseboat', 'vessel', 'rowboat', 'trawler', 'ferryboat', 'watercraft', 
'tugboat', 'schooner', 'barge', 'ferry', 'sailboard', 'paddleboat', 'lifeboat', 'freighter', 'steamboat', 'riverboat', 'battleship', 'steamship', 'traffic light', 'street light', 'traffic signal', 'stop light', 'streetlight', 'stoplight', 'fire hydrant', 'hydrant', 'stop sign', 'parking meter', 'bench', 'pew', 'bird', 'ostrich', 'owl', 'seagull', 'goose', 'duck', 'parakeet', 'falcon', 'robin', 'pelican', 'waterfowl', 'heron', 'hummingbird', 'mallard', 'finch', 'pigeon', 'sparrow', 'seabird', 'osprey', 'blackbird', 'fowl', 'shorebird', 'woodpecker', 'egret', 'chickadee', 'quail', 'bluebird', 'kingfisher', 'buzzard', 'willet', 'gull', 'swan', 'bluejay', 'flamingo', 'cormorant', 'parrot', 'loon', 'gosling', 'waterbird', 'pheasant', 'rooster', 'sandpiper', 'crow', 'raven', 'turkey', 'oriole', 'cowbird', 'warbler', 'magpie', 'peacock', 'cockatiel', 'lorikeet', 'puffin', 'vulture', 'condor', 'macaw', 'peafowl', 'cockatoo', 'songbird', 'cat', 'kitten', 'feline', 'tabby', 'dog', 'puppy', 'beagle', 'pup', 'chihuahua', 'schnauzer', 'dachshund', 'rottweiler', 'canine', 'pitbull', 'collie', 'pug', 'terrier', 'poodle', 'labrador', 'doggie', 'doberman', 'mutt', 'doggy', 'spaniel', 'bulldog', 'sheepdog', 'weimaraner', 'corgi', 'cocker', 'greyhound', 'retriever', 'brindle', 'hound', 'whippet', 'husky', 'horse', 'colt', 'pony', 'racehorse', 'stallion', 'equine', 'mare', 'foal', 'palomino', 'mustang', 'clydesdale', 'bronc', 'bronco', 'sheep', 'lamb', 'ram', 'lamb', 'goat', 'ewe', 'cow', 'cattle', 'oxen', 'ox', 'calf', 'cattle', 'holstein', 'heifer', 'buffalo', 'bull', 'zebu', 'bison', 'elephant', 'bear', 'panda', 'zebra', 'giraffe', 'backpack', 'knapsack', 'umbrella', 'handbag', 'wallet', 'purse', 'briefcase', 'tie', 'bow', 'bow tie', 'suitcase', 'suit case', 'luggage', 'frisbee', 'skis', 'ski', 'snowboard', 'sports ball', 'ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'longboard', 'skimboard', 'shortboard', 'wakeboard', 'tennis racket', 'racket', 
'bottle', 'wine glass', 'cup', 'fork', 'knife', 'pocketknife', 'knive', 'spoon', 'bowl', 'container', 'banana', 'apple', 'sandwich', 'burger', 'sub', 'cheeseburger', 'hamburger', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'doughnut', 'bagel', 'cake', ' cheesecake', 'cupcake', 'shortcake', 'coffeecake', 'pancake', 'chair', 'seat', 'stool', 'couch', 'sofa', 'recliner', 'futon', 'loveseat', 'settee', 'chesterfield', 'potted plant', 'houseplant', 'bed', 'dining table', 'table', 'desk', 'toilet', 'urinal', 'commode', 'toilet', 'lavatory', 'potty', 'tv', 'monitor', 'televison', 'television', 'laptop', 'computer', 'notebook', 'netbook', 'lenovo', 'macbook', 'laptop computer', 'mouse', 'remote', 'keyboard', 'cell phone', 'mobile phone', 'phone', 'cellphone', 'telephone', 'phon', 'smartphone', 'iPhone', 'microwave', 'oven', 'stovetop', 'stove', 'stove top oven', 'toaster', 'sink', 'refrigerator', 'fridge', 'fridge', 'freezer', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'teddybear', 'hair drier', 'hairdryer', 'toothbrush'] INVERSE_SYNONYM_DICT = {'person': 'person', 'girl': 'person', 'boy': 'person', 'man': 'person', 'woman': 'person', 'kid': 'person', 'child': 'person', 'chef': 'person', 'baker': 'person', 'people': 'person', 'adult': 'person', 'rider': 'person', 'children': 'person', 'baby': 'person', 'worker': 'person', 'passenger': 'person', 'sister': 'person', 'biker': 'person', 'policeman': 'person', 'cop': 'person', 'officer': 'person', 'lady': 'person', 'cowboy': 'person', 'bride': 'person', 'groom': 'person', 'male': 'person', 'female': 'person', 'guy': 'person', 'traveler': 'person', 'mother': 'person', 'father': 'person', 'gentleman': 'person', 'pitcher': 'person', 'player': 'person', 'skier': 'person', 'snowboarder': 'person', 'skater': 'person', 'skateboarder': 'person', 'foreigner': 'person', 'caller': 'person', 'offender': 'person', 'coworker': 'person', 'trespasser': 'person', 'patient': 'person', 'politician': 'person', 'soldier': 
'person', 'grandchild': 'person', 'serviceman': 'person', 'walker': 'person', 'drinker': 'person', 'doctor': 'person', 'bicyclist': 'person', 'thief': 'person', 'buyer': 'person', 'teenager': 'person', 'student': 'person', 'camper': 'person', 'driver': 'person', 'solider': 'person', 'hunter': 'person', 'shopper': 'person', 'villager': 'person', 'bicycle': 'bicycle', 'bike': 'bicycle', 'unicycle': 'bicycle', 'minibike': 'bicycle', 'trike': 'bicycle', 'car': 'car', 'automobile': 'car', 'van': 'car', 'minivan': 'car', 'sedan': 'car', 'suv': 'car', 'hatchback': 'car', 'cab': 'car', 'jeep': 'car', 'coupe': 'car', 'taxicab': 'car', 'limo': 'car', 'taxi': 'car', 'motorcycle': 'motorcycle', 'scooter': 'motorcycle', ' motor bike': 'motorcycle', 'motor cycle': 'motorcycle', 'motorbike': 'motorcycle', 'moped': 'motorcycle', 'airplane': 'airplane', 'jetliner': 'airplane', 'plane': 'airplane', 'air plane': 'airplane', 'monoplane': 'airplane', 'aircraft': 'airplane', 'jet': 'airplane', 'airbus': 'airplane', 'biplane': 'airplane', 'seaplane': 'airplane', 'bus': 'bus', 'minibus': 'bus', 'trolley': 'bus', 'train': 'train', 'locomotive': 'train', 'tramway': 'train', 'caboose': 'train', 'truck': 'truck', 'pickup': 'truck', 'lorry': 'truck', 'hauler': 'truck', 'firetruck': 'truck', 'boat': 'boat', 'ship': 'boat', 'liner': 'boat', 'sailboat': 'boat', 'motorboat': 'boat', 'dinghy': 'boat', 'powerboat': 'boat', 'speedboat': 'boat', 'canoe': 'boat', 'skiff': 'boat', 'yacht': 'boat', 'kayak': 'boat', 'catamaran': 'boat', 'pontoon': 'boat', 'houseboat': 'boat', 'vessel': 'boat', 'rowboat': 'boat', 'trawler': 'boat', 'ferryboat': 'boat', 'watercraft': 'boat', 'tugboat': 'boat', 'schooner': 'boat', 'barge': 'boat', 'ferry': 'boat', 'sailboard': 'boat', 'paddleboat': 'boat', 'lifeboat': 'boat', 'freighter': 'boat', 'steamboat': 'boat', 'riverboat': 'boat', 'battleship': 'boat', 'steamship': 'boat', 'traffic light': 'traffic light', 'street light': 'traffic light', 'traffic signal': 'traffic 
light', 'stop light': 'traffic light', 'streetlight': 'traffic light', 'stoplight': 'traffic light', 'fire hydrant': 'fire hydrant', 'hydrant': 'fire hydrant', 'stop sign': 'stop sign', 'parking meter': 'parking meter', 'bench': 'bench', 'pew': 'bench', 'bird': 'bird', 'ostrich': 'bird', 'owl': 'bird', 'seagull': 'bird', 'goose': 'bird', 'duck': 'bird', 'parakeet': 'bird', 'falcon': 'bird', 'robin': 'bird', 'pelican': 'bird', 'waterfowl': 'bird', 'heron': 'bird', 'hummingbird': 'bird', 'mallard': 'bird', 'finch': 'bird', 'pigeon': 'bird', 'sparrow': 'bird', 'seabird': 'bird', 'osprey': 'bird', 'blackbird': 'bird', 'fowl': 'bird', 'shorebird': 'bird', 'woodpecker': 'bird', 'egret': 'bird', 'chickadee': 'bird', 'quail': 'bird', 'bluebird': 'bird', 'kingfisher': 'bird', 'buzzard': 'bird', 'willet': 'bird', 'gull': 'bird', 'swan': 'bird', 'bluejay': 'bird', 'flamingo': 'bird', 'cormorant': 'bird', 'parrot': 'bird', 'loon': 'bird', 'gosling': 'bird', 'waterbird': 'bird', 'pheasant': 'bird', 'rooster': 'bird', 'sandpiper': 'bird', 'crow': 'bird', 'raven': 'bird', 'turkey': 'bird', 'oriole': 'bird', 'cowbird': 'bird', 'warbler': 'bird', 'magpie': 'bird', 'peacock': 'bird', 'cockatiel': 'bird', 'lorikeet': 'bird', 'puffin': 'bird', 'vulture': 'bird', 'condor': 'bird', 'macaw': 'bird', 'peafowl': 'bird', 'cockatoo': 'bird', 'songbird': 'bird', 'cat': 'cat', 'kitten': 'cat', 'feline': 'cat', 'tabby': 'cat', 'dog': 'dog', 'puppy': 'dog', 'beagle': 'dog', 'pup': 'dog', 'chihuahua': 'dog', 'schnauzer': 'dog', 'dachshund': 'dog', 'rottweiler': 'dog', 'canine': 'dog', 'pitbull': 'dog', 'collie': 'dog', 'pug': 'dog', 'terrier': 'dog', 'poodle': 'dog', 'labrador': 'dog', 'doggie': 'dog', 'doberman': 'dog', 'mutt': 'dog', 'doggy': 'dog', 'spaniel': 'dog', 'bulldog': 'dog', 'sheepdog': 'dog', 'weimaraner': 'dog', 'corgi': 'dog', 'cocker': 'dog', 'greyhound': 'dog', 'retriever': 'dog', 'brindle': 'dog', 'hound': 'dog', 'whippet': 'dog', 'husky': 'dog', 'horse': 'horse', 'colt': 
def coco_cap_chair_aggregate_results_recall(results):
    """Aggregate object recall (%) over all processed samples.

    Args:
        results: list of per-sample dicts with keys "answer" (ground-truth
            object words) and "pred" (object words mentioned by the model).

    Returns:
        Percentage of ground-truth objects that the model mentioned.
        Returns 0.0 when there are no ground-truth objects at all
        (fixes a ZeroDivisionError on an empty result list).
    """
    num_total_objects = 0
    num_matched_objects = 0
    for result in results:
        gt_objects = set(result["answer"])
        pred_objects = set(result["pred"])
        num_matched_objects += len(gt_objects & pred_objects)
        num_total_objects += len(gt_objects)
    if num_total_objects == 0:
        # No ground-truth objects (e.g. empty results): define recall as 0.
        return 0.0
    return (num_matched_objects / num_total_objects) * 100
def bootstrap_chair_metric(metric_fn, xs, iters):
    """Bootstrap stddev for CHAIR aggregate metrics, without multiprocessing.

    Resamples *xs* with replacement *iters* times, recomputes *metric_fn*
    on each resample, and returns the sample standard deviation of the
    resulting metric values.
    """
    print(f"bootstrapping for stddev: {metric_fn.__name__}")
    from tqdm import tqdm

    resampled_values = [
        metric_fn(random.choices(xs, k=len(xs)))
        for _ in tqdm(range(iters), desc="Bootstrap")
    ]
    return sample_stddev(resampled_values)
def coco_cap_chair_doc_to_visual(doc):
    """Visual inputs for a CHAIR doc.

    NOTE(review): deliberately supplies the same image twice (duplicate-image
    experiment in this patch series) — confirm this is still the intended
    configuration before relying on single-image results.
    """
    image = doc["image"]
    return [image.convert("RGB"), image.convert("RGB")]
lmms_eval/tasks/amber_g/utils.py | 438 +++++++++++++++++++++++++++ 2 files changed, 480 insertions(+) create mode 100644 lmms_eval/tasks/amber_g/amber_g.yaml create mode 100644 lmms_eval/tasks/amber_g/utils.py diff --git a/lmms_eval/tasks/amber_g/amber_g.yaml b/lmms_eval/tasks/amber_g/amber_g.yaml new file mode 100644 index 000000000..813723f48 --- /dev/null +++ b/lmms_eval/tasks/amber_g/amber_g.yaml @@ -0,0 +1,42 @@ +# AMBER-G (Generative Task) Evaluation Configuration +# Based on: https://github.com/junyangwang0410/AMBER +# Dataset includes: images, questions, and complete ground truth annotations + +dataset_path: Kyunnilee/amber_g_new # use this dataset +dataset_kwargs: + trust_remote_code: true +task: "amber_g" +output_type: generate_until + +doc_to_visual: !function utils.amber_g_doc_to_visual +doc_to_text: !function utils.amber_g_doc_to_text +doc_to_target: "truth" +test_split: train + +generation_kwargs: + max_new_tokens: 2048 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false + until: [] # really important!!! 
# --- AMBER configuration ------------------------------------------------
AMBER_BASE_DIR = os.environ.get("AMBER_BASE_DIR", "/home/annelee/playground/data/eval/reverse_eval/amber")
SIMILARITY_THRESHOLD = 0.8  # spaCy similarity cut-off for synonym matching
EVALUATION_TYPE = 'g'  # Default to g
METADATA_DIR = os.path.join(AMBER_BASE_DIR, "metadata")
QUESTIONS_DIR = os.path.join(AMBER_BASE_DIR, "questions")

# Module-level caches, populated once by load_metadata() / get_nlp().
_ASSOCIATION = None
_HALLUCINATION_WORDS = None
_SAFE_WORDS = None
_ANNOTATIONS = None
_METRICS_INIT = None
_NLP = None  # Lazy load spaCy


def get_nlp():
    """Lazily load and cache a spaCy model, trying md/sm/lg in order.

    Raises:
        OSError: if no spaCy model is installed.
    """
    global _NLP
    if _NLP is not None:
        return _NLP

    models_to_try = ["en_core_web_md", "en_core_web_sm", "en_core_web_lg"]

    for model_name in models_to_try:
        try:
            _NLP = spacy.load(model_name)
            return _NLP
        except OSError:
            continue

    raise OSError(
        "No spaCy model found. Please install one:\n"
        "  pip install spacy\n"
        "  python -m spacy download en_core_web_sm\n"
        "or for better accuracy:\n"
        "  python -m spacy download en_core_web_md"
    )


def load_metadata():
    """Load AMBER metadata files once into the module-level caches."""
    global _ASSOCIATION, _HALLUCINATION_WORDS, _SAFE_WORDS, _ANNOTATIONS, _METRICS_INIT

    if _ASSOCIATION is not None:
        return

    association_file = os.path.join(METADATA_DIR, 'relation.json')
    _ASSOCIATION = load_json(association_file)
    _HALLUCINATION_WORDS = set()
    for word1, related in _ASSOCIATION.items():
        _HALLUCINATION_WORDS.add(word1)
        _HALLUCINATION_WORDS.update(related)

    # Load safe words
    safe_words_file = os.path.join(METADATA_DIR, 'safe_words.txt')
    _SAFE_WORDS = load_text_lines(safe_words_file)

    # Load annotations
    annotation_file = os.path.join(METADATA_DIR, 'annotations.json')
    _ANNOTATIONS = load_json(annotation_file)

    # Load metrics initialization
    metrics_file = os.path.join(METADATA_DIR, 'metrics.txt')
    _METRICS_INIT = load_metrics(metrics_file)


########################################
#          Utility Functions           #
########################################

def load_json(file_path):
    """Load a JSON file and return the parsed data."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data


def load_text_lines(file_path):
    """Load a text file and return the list of stripped lines."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file.readlines()]


def load_metrics(metrics_path):
    """Initialize a metrics dict from a file of ``key=value`` lines.

    Values are parsed with ast.literal_eval instead of eval: the metrics
    file only carries literal initial values (ints/floats), and literal_eval
    cannot execute arbitrary code if the file is tampered with.
    """
    import ast

    metrics = {}
    with open(metrics_path, "r", encoding="utf-8") as file:
        lines = file.readlines()
    for line in lines:
        parts = line.strip().split('=')
        if len(parts) == 2:
            variable_name = parts[0].strip()
            variable_value = ast.literal_eval(parts[1].strip())
            metrics[variable_name] = variable_value
    return metrics
def check_synonyms_word(word1, word2, similarity_threshold):
    """Return True when the spaCy similarity of the two words exceeds the threshold."""
    nlp = get_nlp()  # Lazy load spaCy model
    return nlp(word1).similarity(nlp(word2)) > similarity_threshold


def extract_nouns(text):
    """Return lemmatized nouns (NN* POS tags) extracted from *text* with NLTK."""
    lemmatizer = WordNetLemmatizer()
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    return [lemmatizer.lemmatize(token) for token, tag in tagged if tag.startswith('NN')]


########################################
#       Evaluation Computations        #
########################################

def setup_dimensions(evaluation_type):
    """Map an evaluation-type flag to the dict of dimensions to run.

    'a' enables everything, 'g' only generative, 'd' all discriminative
    sub-dimensions; any other value enables exactly that key.
    """
    dimensions = dict.fromkeys(('g', 'de', 'da', 'dr'), False)
    if evaluation_type == 'a':
        dimensions = dict.fromkeys(dimensions, True)
    elif evaluation_type == 'g':
        dimensions['g'] = True
    elif evaluation_type == 'd':
        for key in ('de', 'da', 'dr'):
            dimensions[key] = True
    else:
        dimensions[evaluation_type] = True
    return dimensions


def prepare_association(association):
    """Return (association, set of every word mentioned in it)."""
    vocabulary = set(association)
    for related in association.values():
        vocabulary.update(related)
    return association, vocabulary
def process_generative_task(data_item, ground_truth_item, association, hallucination_words,
                            global_safe_words, similarity_threshold, metrics):
    """Score one generative (captioning) response against AMBER annotations.

    Mutates *metrics* in place, updating the counters behind CHAIR, Cover,
    Hal and Cog:
      - chair_score / chair_num: hallucinated nouns over scored nouns
      - safe_cover_score / safe_cover_num: ground-truth objects mentioned
      - hallu_cover_score / hallu_cover_num: known hallucination words mentioned
      - non_hallu_score / non_hallu_num: responses with zero hallucinations

    Args:
        data_item: dict with 'question_id' and 'text' (the model response).
        ground_truth_item: dict with 'truth' and 'hallu' word lists.
        association: word -> related-word list (AMBER relation.json).
        hallucination_words: set of every word the benchmark can score.
        global_safe_words: words that are never counted (safe_words.txt).
        similarity_threshold: spaCy similarity cut-off for synonym matching.
        metrics: running counter dict, mutated in place.
    """
    question_id = data_item['question_id']  # unused below; kept for parity with upstream
    # Only nouns that belong to the benchmark vocabulary are scored at all.
    nouns = extract_nouns(data_item['text'])
    filtered_nouns = [noun for noun in nouns if noun in hallucination_words]

    # safe_words: the related words of every truth object, followed by the
    # truth objects themselves.  For the related-word prefix, safe_list holds
    # the index of the owning truth object; the trailing safe_len slots are
    # 0/1 coverage flags for the truth objects.
    safe_words = []
    safe_list = []
    for idx, word in enumerate(ground_truth_item['truth']):
        related = association.get(word, [])
        safe_words += related
        safe_list += [idx] * len(related)

    # Same prefix/suffix layout for the annotated hallucination words.
    ha_words = []
    ha_list = []
    for idx, word in enumerate(ground_truth_item['hallu']):
        related = association.get(word, [])
        ha_words += related
        ha_list += [idx] * len(related)

    safe_words += ground_truth_item['truth']
    safe_len = len(ground_truth_item['truth'])
    safe_list += [0] * safe_len

    ha_words += ground_truth_item['hallu']
    ha_len = len(ground_truth_item['hallu'])
    ha_list += [0] * ha_len

    # 1 at position idx => filtered_nouns[idx] is judged a hallucination.
    safe_flag_list = [0] * len(filtered_nouns)

    for idx, noun in enumerate(filtered_nouns):
        if noun in global_safe_words:
            continue

        if noun in safe_words:
            # Exact match against a truth object or one of its related words.
            for j in range(len(safe_words)):
                if noun == safe_words[j]:
                    if j < (len(safe_list) - safe_len):
                        # Related-word hit: flag the owning truth object.
                        # NOTE(review): the offset arithmetic mirrors upstream
                        # AMBER inference.py — verify against that reference.
                        safe_list[safe_list[j] + len(safe_list) - safe_len] = 1
                    else:
                        safe_list[j] = 1
                    break
            continue

        if noun in ha_words:
            # Exact match against an annotated hallucination word.
            for j in range(len(ha_words)):
                if noun == ha_words[j]:
                    if j < (len(ha_list) - ha_len):
                        ha_list[ha_list[j] + len(ha_list) - ha_len] = 1
                    else:
                        ha_list[j] = 1
                    break

        # Synonym (similarity) match against the hallucination words.
        for j, check_word in enumerate(ha_words):
            if check_synonyms_word(noun, check_word, similarity_threshold):
                if j < (len(ha_list) - ha_len):
                    ha_list[ha_list[j] + len(ha_list) - ha_len] = 1
                else:
                    ha_list[j] = 1
                break

        # A synonym match against the safe words rescues the noun from being
        # counted as a hallucination.
        flag = False
        for j, check_word in enumerate(safe_words):
            if check_synonyms_word(noun, check_word, similarity_threshold):
                flag = True
                if j < (len(safe_list) - safe_len):
                    safe_list[safe_list[j] + len(safe_list) - safe_len] = 1
                else:
                    safe_list[j] = 1
                break
        if flag:
            continue

        # No safe match anywhere: count the noun as hallucinated.
        safe_flag_list[idx] = 1

    metrics['chair_score'] += sum(safe_flag_list)
    metrics['chair_num'] += len(safe_flag_list)
    # NOTE(review): when safe_len or ha_len is 0 the [-0:] slice covers the
    # whole list rather than nothing — confirm annotations are never empty.
    metrics['safe_cover_score'] += sum(safe_list[-safe_len:])
    metrics['safe_cover_num'] += len(safe_list[-safe_len:])
    metrics['hallu_cover_score'] += sum(ha_list[-ha_len:])
    metrics['hallu_cover_num'] += len(ha_list[-ha_len:])
    if sum(safe_flag_list) == 0:
        metrics['non_hallu_score'] += 1
    metrics['non_hallu_num'] += 1
def process_discriminative_task(data_item, ground_truth_item, metrics):
    """Score one discriminative (yes/no) response and update *metrics*.

    Tracks overall and per-question-type counters (attribute-state,
    attribute-number, attribute-action, hallucination, association) for
    accuracy, "no"-question recall, and "no"-answer precision.

    Args:
        data_item: dict with 'question_id' and 'text' (the model response).
        ground_truth_item: dict with 'type' and 'truth' ('yes' or 'no').
        metrics: running counter dict, mutated in place.
    """
    question_id = data_item['question_id']  # unused below; kept for parity with upstream
    metrics['qa_correct_num'] += 1

    gt_type = ground_truth_item['type']
    if gt_type == 'discriminative-attribute-state':
        metrics['as_qa_correct_num'] += 1
    elif gt_type == 'discriminative-attribute-number':
        metrics['an_qa_correct_num'] += 1
    elif gt_type == 'discriminative-attribute-action':
        metrics['aa_qa_correct_num'] += 1
    elif gt_type == 'discriminative-hallucination':
        metrics['ha_qa_correct_num'] += 1
    else:
        metrics['asso_qa_correct_num'] += 1

    truth = ground_truth_item['truth']
    response = data_item['text']
    # Normalize free-form responses to Yes/No by substring match.
    # NOTE(review): substring matching also fires on words like "nothing";
    # a response containing neither substring keeps its raw text.
    if "yes" in response.lower():
        response = "Yes"
    elif "no" in response.lower():
        response = "No"

    if truth == 'yes':
        if response == "Yes":
            metrics['qa_correct_score'] += 1
            if gt_type == 'discriminative-attribute-state':
                metrics['as_qa_correct_score'] += 1
            elif gt_type == 'discriminative-attribute-number':
                metrics['an_qa_correct_score'] += 1
            elif gt_type == 'discriminative-attribute-action':
                metrics['aa_qa_correct_score'] += 1
            elif gt_type == 'discriminative-hallucination':
                metrics['ha_qa_correct_score'] += 1
            else:
                metrics['asso_qa_correct_score'] += 1
    else:
        # Ground truth is "no": count the negative question...
        metrics['qa_no_num'] += 1
        if gt_type == 'discriminative-attribute-state':
            metrics['as_qa_no_num'] += 1
        elif gt_type == 'discriminative-attribute-number':
            metrics['an_qa_no_num'] += 1
        elif gt_type == 'discriminative-attribute-action':
            metrics['aa_qa_no_num'] += 1
        elif gt_type == 'discriminative-hallucination':
            metrics['ha_qa_no_num'] += 1
        else:
            metrics['asso_qa_no_num'] += 1

        # ...and credit a correct "No" answer toward accuracy and "no"-recall.
        if response == "No":
            metrics['qa_correct_score'] += 1
            metrics['qa_no_score'] += 1
            if gt_type == 'discriminative-attribute-state':
                metrics['as_qa_correct_score'] += 1
                metrics['as_qa_no_score'] += 1
            elif gt_type == 'discriminative-attribute-number':
                metrics['an_qa_correct_score'] += 1
                metrics['an_qa_no_score'] += 1
            elif gt_type == 'discriminative-attribute-action':
                metrics['aa_qa_correct_score'] += 1
                metrics['aa_qa_no_score'] += 1
            elif gt_type == 'discriminative-hallucination':
                metrics['ha_qa_correct_score'] += 1
                metrics['ha_qa_no_score'] += 1
            else:
                metrics['asso_qa_correct_score'] += 1
                metrics['asso_qa_no_score'] += 1

    # Precision-style counters over every "No" answer the model gave.
    if response == "No":
        metrics['qa_ans_no_num'] += 1
        if gt_type == 'discriminative-attribute-state':
            metrics['as_qa_ans_no_num'] += 1
        elif gt_type == 'discriminative-attribute-number':
            metrics['an_qa_ans_no_num'] += 1
        elif gt_type == 'discriminative-attribute-action':
            metrics['aa_qa_ans_no_num'] += 1
        elif gt_type == 'discriminative-hallucination':
            metrics['ha_qa_ans_no_num'] += 1
        else:
            metrics['asso_qa_ans_no_num'] += 1
        if truth == 'no':
            metrics['qa_ans_no_score'] += 1
            if gt_type == 'discriminative-attribute-state':
                metrics['as_qa_ans_no_score'] += 1
            elif gt_type == 'discriminative-attribute-number':
                metrics['an_qa_ans_no_score'] += 1
            elif gt_type == 'discriminative-attribute-action':
                metrics['aa_qa_ans_no_score'] += 1
            elif gt_type == 'discriminative-hallucination':
                metrics['ha_qa_ans_no_score'] += 1
            else:
                metrics['asso_qa_ans_no_score'] += 1


########################################
#         lmms-eval Interface          #
########################################

def amber_g_doc_to_visual(doc):
    """Convert document to visual input (a single RGB image)."""
    return [doc["image"].convert("RGB")]


def amber_g_doc_to_text(doc):
    """Prompt for the doc: fixed caption instruction for generative items,
    the item's own question text otherwise."""
    task_type = doc.get("task_type", "generative")

    if task_type == 'generative':
        return "Describe this image"
    else:
        return doc.get("text", "")
def amber_g_process_result(doc, result):
    """Score one model response and return a counters dict per metric key.

    The same counter values are reported under every metric key so each
    aggregate function can sum the fields it needs.
    """
    load_metadata()
    pred_text = result[0] if len(result) > 0 else ""
    question_id = doc.get("question_id", 0)

    gt_item = {
        'question_id': question_id,
        'type': doc.get('task_type', 'generative'),
        'truth': doc.get('truth', []),
        'hallu': doc.get('hallu', []),
    }
    data_item = {'question_id': question_id, 'text': pred_text}

    # Fresh per-sample counters; aggregation sums them across samples.
    counters = _METRICS_INIT.copy()

    metric_keys = ('amber_chair', 'amber_cover', 'amber_hal', 'amber_cog')
    if gt_item['type'] == 'generative':
        process_generative_task(data_item, gt_item, _ASSOCIATION, _HALLUCINATION_WORDS,
                                _SAFE_WORDS, SIMILARITY_THRESHOLD, counters)
        return {key: counters.copy() for key in metric_keys}

    process_discriminative_task(data_item, gt_item, counters)
    return {key: counters for key in metric_keys}


def amber_g_aggregate_chair(results):
    """CHAIR (%): hallucinated nouns over scored nouns; lower is better."""
    score = sum(item['chair_score'] for item in results if item is not None)
    total = sum(item['chair_num'] for item in results if item is not None)
    return 0.0 if total == 0 else (score / total) * 100


def amber_g_aggregate_cover(results):
    """Cover (%): ground-truth objects mentioned; higher is better."""
    score = sum(item['safe_cover_score'] for item in results if item is not None)
    total = sum(item['safe_cover_num'] for item in results if item is not None)
    return 0.0 if total == 0 else (score / total) * 100


def amber_g_aggregate_hal(results):
    """Hal (%): responses with at least one hallucination; lower is better."""
    score = sum(item['non_hallu_score'] for item in results if item is not None)
    total = sum(item['non_hallu_num'] for item in results if item is not None)
    return 0.0 if total == 0 else 100 - (score / total) * 100
def amber_g_aggregate_cog(results):
    """Cog (%): known hallucination words the model repeated; higher means
    the model's errors align with human-annotated hallucination tendencies."""
    score = sum(item['hallu_cover_score'] for item in results if item is not None)
    total = sum(item['hallu_cover_num'] for item in results if item is not None)
    return 0.0 if total == 0 else (score / total) * 100
self._is_qwen_vl_model(model) @@ -338,11 +340,20 @@ def generate_until(self, requests) -> List[str]: imgs.append(task.result()) messages = [{"role": "user", "content": []}] + print(f"image_first: {self.image_first}", flush=True) + if self.image_first: + for img in self.flatten(imgs): + messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}) + messages[0]["content"].append({"type": "text", "text": contexts}) + else: + messages[0]["content"].append({"type": "text", "text": contexts}) + for img in self.flatten(imgs): + messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}) + # Ooriginal Code # Add images first, then text - for img in self.flatten(imgs): - messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}) - messages[0]["content"].append({"type": "text", "text": contexts}) - + # for img in self.flatten(imgs): + # messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}) + # messages[0]["content"].append({"type": "text", "text": contexts}) batched_messages.append(messages) sampling_params = SamplingParams(**params) From 1a99b5199ee19ffb4cdcd6db50b8aab1a0ce1579 Mon Sep 17 00:00:00 2001 From: Patrick Wu Date: Sat, 18 Oct 2025 22:51:57 +0000 Subject: [PATCH 07/15] we nned to do interleaved at the end of the day... 
--- lmms_eval/models/simple/llava_onevision1_5.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/lmms_eval/models/simple/llava_onevision1_5.py b/lmms_eval/models/simple/llava_onevision1_5.py index d35715581..8258a3770 100644 --- a/lmms_eval/models/simple/llava_onevision1_5.py +++ b/lmms_eval/models/simple/llava_onevision1_5.py @@ -46,6 +46,7 @@ def __init__( max_image_size: Optional[int] = None, # Only applicable if use_custom_video_loader is True system_prompt: Optional[str] = "You are a helpful assistant.", interleave_visuals: Optional[bool] = False, + image_first: Optional[bool] = False, reasoning_prompt: Optional[str] = None, max_length: int = 2048, **kwargs, @@ -86,7 +87,7 @@ def __init__( self.max_pixels = max_pixels self.min_pixels = min_pixels self.max_num_frames = max_num_frames - + self.image_first = image_first if reasoning_prompt: self.reasoning_prompt = reasoning_prompt.replace("\\n", "\n") else: @@ -236,12 +237,20 @@ def _collate(x): processed_visuals.append({"type": "image", "image": visual.convert("RGB")}) if self.interleave_visuals is False: - message.append( - { - "role": "user", - "content": processed_visuals + [{"type": "text", "text": context}], - } - ) + if self.image_first: + message.append( + { + "role": "user", + "content": [{"type": "text", "text": context}] + processed_visuals, + } + ) + else: + message.append( + { + "role": "user", + "content": [{"type": "text", "text": context}] + processed_visuals, + } + ) else: # currently support find in the context image_placeholders = re.findall(r"", context) content_parts = [] From 4a617543ba32b1f58edef798a09c9e13966ef0e6 Mon Sep 17 00:00:00 2001 From: Patrick Wu Date: Sun, 19 Oct 2025 01:14:04 +0000 Subject: [PATCH 08/15] fix typo --- lmms_eval/models/simple/llava_onevision1_5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmms_eval/models/simple/llava_onevision1_5.py b/lmms_eval/models/simple/llava_onevision1_5.py index 
8258a3770..d23c02c60 100644 --- a/lmms_eval/models/simple/llava_onevision1_5.py +++ b/lmms_eval/models/simple/llava_onevision1_5.py @@ -46,7 +46,7 @@ def __init__( max_image_size: Optional[int] = None, # Only applicable if use_custom_video_loader is True system_prompt: Optional[str] = "You are a helpful assistant.", interleave_visuals: Optional[bool] = False, - image_first: Optional[bool] = False, + image_first: Optional[bool] = True, reasoning_prompt: Optional[str] = None, max_length: int = 2048, **kwargs, @@ -241,7 +241,7 @@ def _collate(x): message.append( { "role": "user", - "content": [{"type": "text", "text": context}] + processed_visuals, + "content": processed_visuals + [{"type": "text", "text": context}], } ) else: From 3bbf771eb4c317ce16b19b14cd978f8776b69c0d Mon Sep 17 00:00:00 2001 From: Patrick Wu Date: Thu, 23 Oct 2025 04:00:57 +0000 Subject: [PATCH 09/15] update evaluator to prevent chair customized --- lmms_eval/evaluator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py index c1dc315a6..8927520e0 100755 --- a/lmms_eval/evaluator.py +++ b/lmms_eval/evaluator.py @@ -582,6 +582,7 @@ def evaluate( "doc_id": doc_id, "doc": saved_doc, "target": target, + # "pred": metrics['coco_cap_chair_i']['pred'], "arguments": filtered_arguments, "resps": [req.resps for req in requests], "filtered_resps": [req.filtered_resps[filter_key] for req in requests], From 55fb8b935d99ec4194ec42cf576e263eba565efe Mon Sep 17 00:00:00 2001 From: Kyunnilee Date: Mon, 27 Oct 2025 06:34:42 +0000 Subject: [PATCH 10/15] mmbench two --- lmms_eval/tasks/_task_utils/file_utils.py | 2 +- lmms_eval/tasks/mmbench/en_utils.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lmms_eval/tasks/_task_utils/file_utils.py b/lmms_eval/tasks/_task_utils/file_utils.py index 0438ffb6b..29ee72dcb 100755 --- a/lmms_eval/tasks/_task_utils/file_utils.py +++ b/lmms_eval/tasks/_task_utils/file_utils.py @@ -2,7 +2,7 @@ def 
def generate_submission_file(file_name, args, subpath="submissions"):
    """Return the absolute path for a submission file, creating its directory.

    Args:
        file_name: name of the file to place inside the submissions dir.
        args: CLI namespace (may be None). Its output-directory attribute is
            used as the base directory when present.
        subpath: sub-directory grouping submission files.

    Returns:
        Absolute path of ``<out_dir>/<subpath>/<file_name>`` (or
        ``<subpath>/<file_name>`` under the current directory when no
        output directory is configured).
    """
    # The CLI namespace has carried the output directory under two names in
    # this codebase's history (`output_path`, later `output`); accept either
    # so both entry points keep working instead of raising AttributeError.
    out_dir = None
    if args is not None:
        out_dir = getattr(args, "output_path", None)
        if out_dir is None:
            out_dir = getattr(args, "output", None)
    path = subpath if out_dir is None else os.path.join(out_dir, subpath)
    os.makedirs(path, exist_ok=True)
    return os.path.abspath(os.path.join(path, file_name))
def amber_g_doc_to_visual(doc):
    """Visual inputs for one AMBER doc.

    The NUM_IMAGE environment variable (default "1") selects whether the
    image is supplied once or duplicated (image-ordering ablation; same
    pattern as the mmbench and coco_cap_chair tasks).
    """
    num_image = int(os.environ.get("NUM_IMAGE", "1"))
    if num_image not in (1, 2):
        raise ValueError(f"num_image must be 1 or 2, got {num_image}")
    return [doc["image"].convert("RGB") for _ in range(num_image)]
'bow tie': 'tie', 'laptop computer': 'laptop computer', 'stove top oven': 'stove top oven', 'home plate': 'home plate', 'train track': 'train track', 'baby bird': 'bird', 'adult bird': 'bird', 'baby cat': 'cat', 'adult cat': 'cat', 'baby dog': 'dog', 'adult dog': 'dog', 'baby horse': 'horse', 'adult horse': 'horse', 'baby sheep': 'sheep', 'adult sheep': 'sheep', 'baby cow': 'cow', 'adult cow': 'cow', 'baby elephant': 'elephant', 'adult elephant': 'elephant', 'baby bear': 'bear', 'adult bear': 'bear', 'baby zebra': 'zebra', 'adult zebra': 'zebra', 'baby giraffe': 'giraffe', 'adult giraffe': 'giraffe', 'baby animal': 'animal', 'adult animal': 'animal', 'baby cub': 'cub', 'adult cub': 'cub', 'passenger jet': 'jet', 'passenger train': 'train', 'toilet seat': 'toilet', 'wine glas': 'wine glass'} def coco_cap_chair_doc_to_visual(doc): - return [doc["image"].convert("RGB"), doc["image"].convert("RGB")] - + num_image = int(os.environ.get("NUM_IMAGE", "1")) + + if num_image == 1: + # print("one image!") + return [doc["image"].convert("RGB")] + elif num_image == 2: + # print("two images!") + return [doc["image"].convert("RGB"), doc["image"].convert("RGB")] + else: + raise ValueError(f"num_image must be 1 or 2, got {num_image}") def coco_cap_chair_doc_to_text(doc): return f"Please describe this image in detail." 
diff --git a/lmms_eval/tasks/mmbench/en_utils.py b/lmms_eval/tasks/mmbench/en_utils.py index e2c568e46..b952c2837 100755 --- a/lmms_eval/tasks/mmbench/en_utils.py +++ b/lmms_eval/tasks/mmbench/en_utils.py @@ -37,11 +37,17 @@ def mmbench_doc_to_visual(doc): - print("two image") - return [doc["image"].convert("RGB"), doc["image"].convert("RGB")] - # return [doc["image"].convert("RGB")] - - + num_image = int(os.environ.get("NUM_IMAGE", "1")) + + if num_image == 1: + # print("one image!") + return [doc["image"].convert("RGB")] + elif num_image == 2: + # print("two images!") + return [doc["image"].convert("RGB"), doc["image"].convert("RGB")] + else: + raise ValueError(f"num_image must be 1 or 2, got {num_image}") + def mmbench_doc_to_text(doc, lmms_eval_specific_kwargs=None): option_candidate = ["A", "B", "C", "D", "E"] options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate) From 7611727a3e81cd4e392fb834ea85703db796dd8d Mon Sep 17 00:00:00 2001 From: Kyunnilee Date: Tue, 28 Oct 2025 13:07:54 +0000 Subject: [PATCH 12/15] file upload bug fix --- lmms_eval/tasks/_task_utils/file_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmms_eval/tasks/_task_utils/file_utils.py b/lmms_eval/tasks/_task_utils/file_utils.py index 29ee72dcb..79a6cc33b 100755 --- a/lmms_eval/tasks/_task_utils/file_utils.py +++ b/lmms_eval/tasks/_task_utils/file_utils.py @@ -2,11 +2,11 @@ def generate_submission_file(file_name, args, subpath="submissions"): - if args is None or args.output_path is None: + if args is None or args.output is None: # If no output path is specified, use current directory path = subpath else: - path = os.path.join(args.output_path, subpath) + path = os.path.join(args.output, subpath) os.makedirs(path, exist_ok=True) path = os.path.join(path, file_name) return os.path.abspath(path) From 4e5fe57153a61ed8fcca49cbc612c5c9aad86948 Mon Sep 17 00:00:00 2001 From: Kyunnilee Date: Tue, 28 Oct 2025 13:08:54 +0000 
Subject: [PATCH 13/15] enable double img mmmu --- lmms_eval/tasks/mmmu/utils.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py index dbb37d7d1..1f3f6602c 100755 --- a/lmms_eval/tasks/mmmu/utils.py +++ b/lmms_eval/tasks/mmmu/utils.py @@ -77,14 +77,32 @@ def mmmu_doc_to_messages(doc, lmms_eval_specific_kwargs=None): config["metadata"]["interleaved_format"] = True question = mmmu_doc_to_text(doc, lmms_eval_specific_kwargs) visuals = mmmu_doc_to_visual(doc) + + # Duplicate the single image when NUM_IMAGE=2 + num_image = int(os.environ.get("NUM_IMAGE", "1")) + if num_image == 1: + pass + elif num_image == 2: + if len(visuals) == 1: + visuals = [visuals[0], visuals[0]] + else: + raise ValueError(f"num_image must be 1 or 2, got {num_image}") + messages = [{"role": "user", "content": []}] interleaved_content = question.split("") - for i, (image, text) in enumerate(zip(visuals, interleaved_content)): - if text.strip() != "": - messages[0]["content"].append({"type": "text", "text": text.strip()}) - messages[0]["content"].append({"type": "image", "url": image}) - # There will be one more text part after the last image - messages[0]["content"].append({"type": "text", "text": interleaved_content[-1].strip()}) + + # Allow more visuals than placeholders by only attaching pre-image text + # if a corresponding segment exists. Always append the final trailing text. 
+ for i in range(len(visuals)): + if i < len(interleaved_content) - 1: + text = interleaved_content[i].strip() + if text != "": + messages[0]["content"].append({"type": "text", "text": text}) + messages[0]["content"].append({"type": "image", "url": visuals[i]}) + + # Append the trailing text after the last image + if len(interleaved_content) > 0: + messages[0]["content"].append({"type": "text", "text": interleaved_content[-1].strip()}) return messages From 8e22040dbccafaaa83ba619f190348e12979dfb8 Mon Sep 17 00:00:00 2001 From: Kyunnilee Date: Wed, 29 Oct 2025 00:53:05 +0000 Subject: [PATCH 14/15] hallusion_bench --- lmms_eval/tasks/hallusion_bench/evaluate_hb.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/lmms_eval/tasks/hallusion_bench/evaluate_hb.py b/lmms_eval/tasks/hallusion_bench/evaluate_hb.py index 08ca2bec2..2127cd18a 100755 --- a/lmms_eval/tasks/hallusion_bench/evaluate_hb.py +++ b/lmms_eval/tasks/hallusion_bench/evaluate_hb.py @@ -29,13 +29,24 @@ def hb_doc_to_text(doc, lmms_eval_specific_kwargs=None): return f"{pre_prompt}{doc['question']}{post_prompt}" -def hb_doc_to_visual(doc): - return [doc["image"].convert("RGB")] +def hb_doc_to_visual(doc): + """Convert document to visual input.""" + num_image = int(os.environ.get("NUM_IMAGE", "1")) + + if num_image == 1: + # print("one image!") + return [doc["image"].convert("RGB")] + elif num_image == 2: + # print("two images!") + return [doc["image"].convert("RGB"), doc["image"].convert("RGB")] + else: + raise ValueError(f"num_image must be 1 or 2, got {num_image}") + def hb_process_results(doc, result): sample = doc - doc.pop("image") + doc.pop("image", None) sample["model_prediction"] = result[0] return {k: sample for k in metric} From a818ab8f761fcf81cdb7c505a07aad2c99dc10d4 Mon Sep 17 00:00:00 2001 From: Kyunnilee Date: Mon, 3 Nov 2025 02:36:17 +0000 Subject: [PATCH 15/15] bootstrap for amber --- lmms_eval/api/metrics.py | 15 ++++++++++++++- 1 file changed, 14 
insertions(+), 1 deletion(-) diff --git a/lmms_eval/api/metrics.py b/lmms_eval/api/metrics.py index 4764e63cb..8da313419 100755 --- a/lmms_eval/api/metrics.py +++ b/lmms_eval/api/metrics.py @@ -554,7 +554,14 @@ def stderr_for_metric(metric, bootstrap_iters: int): coco_cap_chair_aggregate_results_chair_s, coco_cap_chair_aggregate_results_recall, ) - + # for amber_g + from lmms_eval.tasks.amber_g.utils import ( + amber_g_aggregate_chair, + amber_g_aggregate_cover, + amber_g_aggregate_hal, + amber_g_aggregate_cog, + ) + bootstrappable = [ median, matthews_corrcoef, @@ -566,6 +573,10 @@ def stderr_for_metric(metric, bootstrap_iters: int): coco_cap_chair_aggregate_results_chair_i, coco_cap_chair_aggregate_results_chair_s, coco_cap_chair_aggregate_results_recall, + amber_g_aggregate_chair, + amber_g_aggregate_cover, + amber_g_aggregate_hal, + amber_g_aggregate_cog, ] if metric in bootstrappable: @@ -574,6 +585,8 @@ def stderr_for_metric(metric, bootstrap_iters: int): if hasattr(metric, '__name__'): if 'coco_cap_chair' in metric.__name__: return lambda x: bootstrap_chair_metric(metric, x, iters=bootstrap_iters) + if 'amber_g' in metric.__name__ or 'amber_' in metric.__name__: + return lambda x: bootstrap_chair_metric(metric, x, iters=bootstrap_iters) stderr = {mean: mean_stderr, acc_all: acc_all_stderr}