Skip to content
28 changes: 1 addition & 27 deletions Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -1,31 +1,5 @@
# from gs://nm-vllm-certs/model-validation/lmeval/Qwen/Qwen2.5-7B/cuda/0.9.1.dev238+g922878ced/k8s-a100-duo/llm_eval_16218386266.json
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.6373

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.8074

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8024

- name: mmlu
metrics:
- name: acc,none
value: 0.7424

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.5633

- name: winogrande
metrics:
- name: acc,none
value: 0.7505
value: 0.6944
5 changes: 5 additions & 0 deletions Qwen/Qwen3-8B-FP8/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
tasks:
- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.8756
3 changes: 3 additions & 0 deletions Qwen/Qwen3-8B-FP8/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/Qwen/Qwen3-8B-FP8
model: hf
data: hf
2 changes: 1 addition & 1 deletion Qwen/Qwen3-Next-80B-A3B-Instruct/storage.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# storage configs for https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct
# storage configs for https://huggingface.co/Qwen/Qwen3-Embedding-0.6B-GGUF
model: hf
data: hf
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
trust-remote-code: true
tensor-parallel-size: 2
tensor-parallel-size: 4
max-model-len: 16384
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
tasks:
- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.6944
3 changes: 3 additions & 0 deletions RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic
model: hf
data: hf
7 changes: 7 additions & 0 deletions RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic/accuracy/server.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
enable-chunked-prefill: true
max-model-len: 9000
tensor-parallel-size: 1
trust-remote-code: true
tokenizer_mode: mistral
config_format: mistral
load_format: mistral
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
max-model-len: 448
trust-remote-code: true
5 changes: 1 addition & 4 deletions common/performance/server.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
enable-chunked-prefill: true
max-model-len: 8192
tensor-parallel-size: 1
trust-remote-code: true
uvicorn-log-level: debug
no-enable-prefix-caching: true
uvicorn-log-level: debug
5 changes: 5 additions & 0 deletions deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
tasks:
- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.8271
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
enable-chunked-prefill: true
max-model-len: 10000
tensor-parallel-size: 1
trust-remote-code: true
3 changes: 3 additions & 0 deletions deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
model: hf
data: hf
4 changes: 4 additions & 0 deletions distil-whisper/distil-large-v3/accuracy/server.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
max-model-len: 448
tensor-parallel-size: 1
trust-remote-code: true
uvicorn-log-level: debug
30 changes: 30 additions & 0 deletions google/gemma-3-12b-it/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.6024

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.7665

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.7494

- name: mmlu
metrics:
- name: acc,none
value: 0.6414

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.5487

- name: winogrande
metrics:
- name: acc,none
value: 0.6835
3 changes: 3 additions & 0 deletions google/gemma-3-12b-it/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# https://huggingface.co/google/gemma-3-12b-it
model: hf
data: hf
30 changes: 30 additions & 0 deletions ibm-granite/granite-4.0-h-small/accuracy/model_card_tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.7278

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.85

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8370

- name: mmlu
metrics:
- name: acc,none
value: 0.8067

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.7062

- name: winogrande
metrics:
- name: acc,none
value: 0.8374
60 changes: 60 additions & 0 deletions ibm-granite/granite-4.0-h-small/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.715

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.85

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8573

- name: mmlu
metrics:
- name: acc,none
value: 0.8109

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.6409

- name: winogrande
metrics:
- name: acc,none
value: 0.8374

- name: leaderboard_ifeval
metrics:
- name: inst_level_strict_acc,none
value: 0.729

# - name: leaderboard_bbh
# metrics:
# - name: acc-norm,none
# value: 0.5319

# - name: leaderboard_math_hard
# metrics:
# - name: exact_match,none
# value: 0.1477

# - name: leaderboard_gpqa
# metrics:
# - name: acc-norm,none
# value: 0.3176

# - name: leaderboard_musr
# metrics:
# - name: acc-norm,none
# value: 0.4601

- name: leaderboard_mmlu_pro
metrics:
- name: acc,none
value: 0.5545
4 changes: 4 additions & 0 deletions ibm-granite/granite-4.0-h-small/performance/server.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
max-model-len: 4025
tensor-parallel-size: 2
trust-remote-code: true
uvicorn-log-level: debug
3 changes: 3 additions & 0 deletions ibm-granite/granite-4.0-h-small/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/ibm-granite/granite-4.0-h-small
model: hf
data: hf
30 changes: 30 additions & 0 deletions ibm-granite/granite-4.0-micro/accuracy/model_card_tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.7278

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.79

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8370

- name: mmlu
metrics:
- name: acc,none
value: 0.8067

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.7062

- name: winogrande
metrics:
- name: acc,none
value: 0.8374
60 changes: 60 additions & 0 deletions ibm-granite/granite-4.0-micro/accuracy/tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.715

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.79

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8573

- name: mmlu
metrics:
- name: acc,none
value: 0.8109

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.6409

- name: winogrande
metrics:
- name: acc,none
value: 0.8374

- name: leaderboard_ifeval
metrics:
- name: inst_level_strict_acc,none
value: 0.729

# - name: leaderboard_bbh
# metrics:
# - name: acc-norm,none
# value: 0.5319

# - name: leaderboard_math_hard
# metrics:
# - name: exact_match,none
# value: 0.1477

# - name: leaderboard_gpqa
# metrics:
# - name: acc-norm,none
# value: 0.3176

# - name: leaderboard_musr
# metrics:
# - name: acc-norm,none
# value: 0.4601

- name: leaderboard_mmlu_pro
metrics:
- name: acc,none
value: 0.5545
3 changes: 3 additions & 0 deletions ibm-granite/granite-4.0-micro/storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# storage configs for https://huggingface.co/ibm-granite/granite-4.0-micro
model: hf
data: hf
30 changes: 30 additions & 0 deletions microsoft/Phi-3-medium-4k-instruct/accuracy/model_card_tasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
tasks:
- name: arc_challenge
metrics:
- name: acc_norm,none
value: 0.6442

- name: gsm8k
metrics:
- name: exact_match,strict-match
value: 0.85

- name: hellaswag
metrics:
- name: acc_norm,none
value: 0.8437

- name: mmlu
metrics:
- name: acc,none
value: 0.803

- name: truthfulqa_mc2
metrics:
- name: acc,none
value: 0.5937

- name: winogrande
metrics:
- name: acc,none
value: 0.8058
Loading