41 changes: 40 additions & 1 deletion lmms_eval/api/metrics.py
@@ -531,11 +531,37 @@ def bootstrap_stderr(f, xs, iters):
    return sample_stddev(res)


def bootstrap_chair_metric(metric_fn, xs, iters):
    """Bootstrap the stderr of a CHAIR-style aggregate metric without multiprocessing."""
    from tqdm import tqdm

    print(f"bootstrapping for stddev: {metric_fn.__name__}")
    res = []
    for _ in tqdm(range(iters), desc="Bootstrap"):
        bootstrap_sample = random.choices(xs, k=len(xs))
        res.append(metric_fn(bootstrap_sample))

    return sample_stddev(res)

def stderr_for_metric(metric, bootstrap_iters: int):
    if bootstrap_iters <= 0:
        # return no function (don't compute stderr) if bootstrap_iters <= 0
        return None

    # for coco_cap_chair
    from lmms_eval.tasks.coco_cap_chair.utils import (
        coco_cap_chair_aggregate_results_chair_i,
        coco_cap_chair_aggregate_results_chair_s,
        coco_cap_chair_aggregate_results_recall,
    )

    # for amber_g
    from lmms_eval.tasks.amber_g.utils import (
        amber_g_aggregate_chair,
        amber_g_aggregate_cover,
        amber_g_aggregate_hal,
        amber_g_aggregate_cog,
    )

    bootstrappable = [
        median,
        matthews_corrcoef,
@@ -544,11 +570,24 @@ def stderr_for_metric(metric, bootstrap_iters: int):
        bleu,
        chrf,
        ter,
        coco_cap_chair_aggregate_results_chair_i,
        coco_cap_chair_aggregate_results_chair_s,
        coco_cap_chair_aggregate_results_recall,
        amber_g_aggregate_chair,
        amber_g_aggregate_cover,
        amber_g_aggregate_hal,
        amber_g_aggregate_cog,
    ]

    if metric in bootstrappable:
        return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)

    # Fall back to name matching so wrapped CHAIR/AMBER aggregates still get a bootstrap
    # ("amber_" already covers "amber_g", so a single check suffices).
    if hasattr(metric, "__name__") and ("coco_cap_chair" in metric.__name__ or "amber_" in metric.__name__):
        return lambda x: bootstrap_chair_metric(metric, x, iters=bootstrap_iters)

    stderr = {mean: mean_stderr, acc_all: acc_all_stderr}

    return stderr.get(metric, None)
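For context, the effect of the new fallback is that any aggregate whose __name__ contains "coco_cap_chair" or "amber_" gets a single-process bootstrap of its standard error. Below is a minimal, self-contained sketch of that bootstrap; the toy_chair aggregate and the sample data are illustrative assumptions, not code from this repository.

import random

def sample_stddev(xs):
    # plays the role of lmms_eval's sample_stddev: std dev of the bootstrap replicates
    m = sum(xs) / len(xs)
    return (sum((x - m) ** 2 for x in xs) / (len(xs) - 1)) ** 0.5

def bootstrap_chair_metric(metric_fn, xs, iters):
    # resample the per-example results with replacement and re-aggregate each time
    res = [metric_fn(random.choices(xs, k=len(xs))) for _ in range(iters)]
    return sample_stddev(res)

def toy_chair(results):
    # toy CHAIR-style aggregate: hallucinated mentions / all mentions
    hallucinated = sum(r["hallucinated"] for r in results)
    mentioned = sum(r["mentioned"] for r in results)
    return hallucinated / max(mentioned, 1)

results = [{"hallucinated": i % 3, "mentioned": 5} for i in range(100)]
print("CHAIR:", toy_chair(results))
print("bootstrap stderr:", bootstrap_chair_metric(toy_chair, results, iters=200))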
1 change: 1 addition & 0 deletions lmms_eval/evaluator.py
Reviewer comment (Collaborator): This extra line should be removed.
@@ -582,6 +582,7 @@ def evaluate(
"doc_id": doc_id,
"doc": saved_doc,
"target": target,
# "pred": metrics['coco_cap_chair_i']['pred'],
"arguments": filtered_arguments,
"resps": [req.resps for req in requests],
"filtered_resps": [req.filtered_resps[filter_key] for req in requests],
23 changes: 16 additions & 7 deletions lmms_eval/models/simple/llava_onevision1_5.py
@@ -46,6 +46,7 @@ def __init__(
        max_image_size: Optional[int] = None,  # Only applicable if use_custom_video_loader is True
        system_prompt: Optional[str] = "You are a helpful assistant.",
        interleave_visuals: Optional[bool] = False,
        image_first: Optional[bool] = True,
        reasoning_prompt: Optional[str] = None,
        max_length: int = 2048,
        **kwargs,
@@ -86,7 +87,7 @@ def __init__(
        self.max_pixels = max_pixels
        self.min_pixels = min_pixels
        self.max_num_frames = max_num_frames

        self.image_first = image_first
        if reasoning_prompt:
            self.reasoning_prompt = reasoning_prompt.replace("\\n", "\n")
        else:
@@ -236,12 +237,20 @@ def _collate(x):
                    processed_visuals.append({"type": "image", "image": visual.convert("RGB")})

            if self.interleave_visuals is False:
                message.append(
                    {
                        "role": "user",
                        "content": processed_visuals + [{"type": "text", "text": context}],
                    }
                )
                if self.image_first:
                    message.append(
                        {
                            "role": "user",
                            "content": processed_visuals + [{"type": "text", "text": context}],
                        }
                    )
                else:
                    message.append(
                        {
                            "role": "user",
                            "content": [{"type": "text", "text": context}] + processed_visuals,
                        }
                    )
            else:  # currently supports <image x> placeholders in the context
                image_placeholders = re.findall(r"<image \d+>", context)
                content_parts = []
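To make the new flag concrete, here is a small sketch of the two non-interleaved message layouts the image_first switch selects between; the visual and prompt values are placeholders, assuming a single image.

# Assumed placeholder inputs, for illustration only.
processed_visuals = [{"type": "image", "image": "<PIL.Image (RGB)>"}]
context = "Describe the image."

# image_first=True (the default here): visuals precede the text prompt.
image_first_message = {
    "role": "user",
    "content": processed_visuals + [{"type": "text", "text": context}],
}

# image_first=False: the text prompt precedes the visuals.
text_first_message = {
    "role": "user",
    "content": [{"type": "text", "text": context}] + processed_visuals,
}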
21 changes: 17 additions & 4 deletions lmms_eval/models/simple/vllm.py
@@ -156,6 +156,7 @@ def __init__(
        chat_template: Optional[str] = None,
        min_image_pixels: int = 28,  # minimum image dimension, required for Qwen 2/2.5-VL models
        disable_log_stats: bool = False,
        image_first: bool = False,
        **kwargs,
    ) -> None:
        super().__init__()
@@ -167,6 +168,7 @@ def __init__(
        self.chat_template = chat_template
        self.min_image_pixels = min_image_pixels
        self.data_parallel_size = data_parallel_size
        self.image_first = image_first
        # Qwen 2/2.5-VL models enforce minimum image dimensions
        self._enforce_image_resize = self._is_qwen_vl_model(model)

@@ -338,11 +340,22 @@ def generate_until(self, requests) -> List[str]:
                    imgs.append(task.result())

            messages = [{"role": "user", "content": []}]
            if self.image_first:
                # Images first, then the text prompt
                for img in self.flatten(imgs):
                    messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
                messages[0]["content"].append({"type": "text", "text": contexts})
            else:
                # Text prompt first, then images
                messages[0]["content"].append({"type": "text", "text": contexts})
                for img in self.flatten(imgs):
                    messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
            # Original code
            # Add images first, then text
            for img in self.flatten(imgs):
                messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
            messages[0]["content"].append({"type": "text", "text": contexts})

            # for img in self.flatten(imgs):
            #     messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
            # messages[0]["content"].append({"type": "text", "text": contexts})
            batched_messages.append(messages)

sampling_params = SamplingParams(**params)
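For comparison, the vLLM wrapper builds OpenAI-style content parts, and only their order changes with image_first (which defaults to False here). A minimal sketch, with a placeholder base64 string:

img_b64 = "<base64-encoded PNG>"  # placeholder, not a real encoding
contexts = "Describe the image."

def build_content(image_first: bool):
    # assemble the user message content in the requested order
    image_part = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}}
    text_part = {"type": "text", "text": contexts}
    return [image_part, text_part] if image_first else [text_part, image_part]

messages = [{"role": "user", "content": build_content(image_first=False)}]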
4 changes: 2 additions & 2 deletions lmms_eval/tasks/_task_utils/file_utils.py
@@ -2,11 +2,11 @@


def generate_submission_file(file_name, args, subpath="submissions"):
    if args.output_path is None:
    if args is None or args.output is None:
        # If no output path is specified, use the current directory
        path = subpath
    else:
        path = os.path.join(args.output_path, subpath)
        path = os.path.join(args.output, subpath)
    os.makedirs(path, exist_ok=True)
    path = os.path.join(path, file_name)
    return os.path.abspath(path)
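A quick usage sketch of the changed fallback behaviour, assuming args is a simple namespace exposing an output attribute; the file name and directory are illustrative, not taken from the repository.

import argparse
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

# args=None (or args.output=None) now falls back to ./submissions
print(generate_submission_file("amber_g_results.json", args=None))

# otherwise files land under <output>/submissions/
args = argparse.Namespace(output="./logs")
print(generate_submission_file("amber_g_results.json", args))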
42 changes: 42 additions & 0 deletions lmms_eval/tasks/amber_g/amber_g.yaml
@@ -0,0 +1,42 @@
# AMBER-G (Generative Task) Evaluation Configuration
# Based on: https://github.com/junyangwang0410/AMBER
# Dataset includes: images, questions, and complete ground truth annotations

dataset_path: Kyunnilee/amber_g # use this dataset
dataset_kwargs:
  trust_remote_code: true
task: "amber_g"
output_type: generate_until

doc_to_visual: !function utils.amber_g_doc_to_visual
doc_to_text: !function utils.amber_g_doc_to_text
doc_to_target: "truth"
test_split: train

generation_kwargs:
  max_new_tokens: 2048
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
  until: []  # important: the default is ["\n\n"], which would truncate generations

process_results: !function utils.amber_g_process_result

# AMBER-G Metrics:
metric_list:
  - metric: amber_chair
    aggregation: !function utils.amber_g_aggregate_chair
    higher_is_better: false
  - metric: amber_cover
    aggregation: !function utils.amber_g_aggregate_cover
    higher_is_better: true
  - metric: amber_hal
    aggregation: !function utils.amber_g_aggregate_hal
    higher_is_better: false
  - metric: amber_cog
    aggregation: !function utils.amber_g_aggregate_cog
    higher_is_better: false  # in AMBER, Cog counts hallucinations, so lower is better

metadata:
  - version: 0.0
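The aggregation functions referenced above live in the task's utils.py, which is not shown in this diff. As a rough illustration of the kind of bookkeeping CHAIR/Cover-style aggregates perform, here is a minimal sketch; the per-example keys (hallucinated_objects, mentioned_objects, gt_objects) and the function names are assumptions for illustration, not the actual amber_g schema.

def aggregate_chair(results):
    # CHAIR-style score: hallucinated object mentions / all object mentions (lower is better)
    hallucinated = sum(len(r["hallucinated_objects"]) for r in results)
    mentioned = sum(len(r["mentioned_objects"]) for r in results)
    return hallucinated / max(mentioned, 1)

def aggregate_cover(results):
    # Cover-style score: fraction of ground-truth objects mentioned in the caption (higher is better)
    covered = sum(len(set(r["mentioned_objects"]) & set(r["gt_objects"])) for r in results)
    total = sum(len(r["gt_objects"]) for r in results)
    return covered / max(total, 1)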