diff --git a/Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml b/Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml index 231aeaf..50fc1bc 100644 --- a/Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml +++ b/Qwen/Qwen2.5-VL-7B-Instruct/accuracy/tasks.yml @@ -1,31 +1,5 @@ -# from gs://nm-vllm-certs/model-validation/lmeval/Qwen/Qwen2.5-7B/cuda/0.9.1.dev238+g922878ced/k8s-a100-duo/llm_eval_16218386266.json tasks: - - name: arc_challenge - metrics: - - name: acc_norm,none - value: 0.6373 - - name: gsm8k metrics: - name: exact_match,strict-match - value: 0.8074 - - - name: hellaswag - metrics: - - name: acc_norm,none - value: 0.8024 - - - name: mmlu - metrics: - - name: acc,none - value: 0.7424 - - - name: truthfulqa_mc2 - metrics: - - name: acc,none - value: 0.5633 - - - name: winogrande - metrics: - - name: acc,none - value: 0.7505 + value: 0.6944 diff --git a/Qwen/Qwen3-8B-FP8/accuracy/tasks.yml b/Qwen/Qwen3-8B-FP8/accuracy/tasks.yml new file mode 100644 index 0000000..c4a91b6 --- /dev/null +++ b/Qwen/Qwen3-8B-FP8/accuracy/tasks.yml @@ -0,0 +1,5 @@ +tasks: + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.8756 diff --git a/Qwen/Qwen3-8B-FP8/storage.yml b/Qwen/Qwen3-8B-FP8/storage.yml new file mode 100644 index 0000000..e31645c --- /dev/null +++ b/Qwen/Qwen3-8B-FP8/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/Qwen/Qwen3-8B-FP8 +model: hf +data: hf diff --git a/Qwen/Qwen3-Next-80B-A3B-Instruct/storage.yml b/Qwen/Qwen3-Next-80B-A3B-Instruct/storage.yml index de0b347..5b30f65 100644 --- a/Qwen/Qwen3-Next-80B-A3B-Instruct/storage.yml +++ b/Qwen/Qwen3-Next-80B-A3B-Instruct/storage.yml @@ -1,3 +1,3 @@ -# storage configs for https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct +# storage configs for https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct model: hf data: hf diff --git a/RedHatAI/Kimi-K2-Instruct-quantized.w4a16/accuracy/server.yml b/RedHatAI/Kimi-K2-Instruct-quantized.w4a16/accuracy/server.yml index 51d25d0..22eafe9 
100644 --- a/RedHatAI/Kimi-K2-Instruct-quantized.w4a16/accuracy/server.yml +++ b/RedHatAI/Kimi-K2-Instruct-quantized.w4a16/accuracy/server.yml @@ -1,3 +1,3 @@ trust-remote-code: true -tensor-parallel-size: 2 +tensor-parallel-size: 4 max-model-len: 16384 diff --git a/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/accuracy/tasks.yml b/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/accuracy/tasks.yml new file mode 100644 index 0000000..50fc1bc --- /dev/null +++ b/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/accuracy/tasks.yml @@ -0,0 +1,5 @@ +tasks: + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.6944 diff --git a/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/storage.yml b/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/storage.yml new file mode 100644 index 0000000..51a9ad6 --- /dev/null +++ b/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/RedHatAI/Qwen2.5-VL-7B-Instruct-FP8-Dynamic +model: hf +data: hf diff --git a/RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic/accuracy/server.yml b/RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic/accuracy/server.yml new file mode 100644 index 0000000..de66392 --- /dev/null +++ b/RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic/accuracy/server.yml @@ -0,0 +1,7 @@ +enable-chunked-prefill: true +max-model-len: 9000 +tensor-parallel-size: 1 +trust-remote-code: true +tokenizer_mode: mistral +config_format: mistral +load_format: mistral diff --git a/RedHatAI/whisper-large-v3-turbo-FP8-dynamic/accuracy/server.yml b/RedHatAI/whisper-large-v3-turbo-FP8-dynamic/accuracy/server.yml new file mode 100644 index 0000000..860ae1f --- /dev/null +++ b/RedHatAI/whisper-large-v3-turbo-FP8-dynamic/accuracy/server.yml @@ -0,0 +1,2 @@ +max-model-len: 448 +trust-remote-code: true diff --git a/common/performance/server.yml b/common/performance/server.yml index 122d583..176522a 100644 --- a/common/performance/server.yml +++ b/common/performance/server.yml @@ -1,6 +1,3 @@ -enable-chunked-prefill: 
true -max-model-len: 8192 tensor-parallel-size: 1 trust-remote-code: true -uvicorn-log-level: debug -no-enable-prefix-caching: true +uvicorn-log-level: debug \ No newline at end of file diff --git a/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/accuracy/tasks.yml b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/accuracy/tasks.yml new file mode 100644 index 0000000..14e1988 --- /dev/null +++ b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/accuracy/tasks.yml @@ -0,0 +1,5 @@ +tasks: + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.8271 diff --git a/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/performance/server.yml b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/performance/server.yml new file mode 100644 index 0000000..7bbd353 --- /dev/null +++ b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/performance/server.yml @@ -0,0 +1,4 @@ +enable-chunked-prefill: true +max-model-len: 10000 +tensor-parallel-size: 1 +trust-remote-code: true diff --git a/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/storage.yml b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/storage.yml new file mode 100644 index 0000000..fb93360 --- /dev/null +++ b/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +model: hf +data: hf diff --git a/distil-whisper/distil-large-v3/accuracy/server.yml b/distil-whisper/distil-large-v3/accuracy/server.yml new file mode 100644 index 0000000..4a4677e --- /dev/null +++ b/distil-whisper/distil-large-v3/accuracy/server.yml @@ -0,0 +1,4 @@ +max-model-len: 448 +tensor-parallel-size: 1 +trust-remote-code: true +uvicorn-log-level: debug diff --git a/google/gemma-3-12b-it/accuracy/tasks.yml b/google/gemma-3-12b-it/accuracy/tasks.yml new file mode 100644 index 0000000..b284156 --- /dev/null +++ b/google/gemma-3-12b-it/accuracy/tasks.yml @@ -0,0 +1,30 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.6024 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + 
value: 0.7665 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.7494 + + - name: mmlu + metrics: + - name: acc,none + value: 0.6414 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.5487 + + - name: winogrande + metrics: + - name: acc,none + value: 0.6835 diff --git a/google/gemma-3-12b-it/storage.yml b/google/gemma-3-12b-it/storage.yml new file mode 100644 index 0000000..5c72893 --- /dev/null +++ b/google/gemma-3-12b-it/storage.yml @@ -0,0 +1,3 @@ +# https://huggingface.co/google/gemma-3-12b-it +model: hf +data: hf diff --git a/ibm-granite/granite-3.3-8b-instruct/accuracy/tasks.yml b/ibm-granite/granite-3.3-8b-instruct/accuracy/tasks.yml new file mode 100644 index 0000000..e3b35aa --- /dev/null +++ b/ibm-granite/granite-3.3-8b-instruct/accuracy/tasks.yml @@ -0,0 +1,60 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.6663 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.6543 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8412 + + - name: mmlu + metrics: + - name: acc,none + value: 0.656 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.6048 + + - name: winogrande + metrics: + - name: acc,none + value: 0.7987 + + - name: leaderboard_ifeval + metrics: + - name: inst_level_strict_acc,none + value: 0.6666 + + # - name: leaderboard_bbh + # metrics: + # - name: acc-norm,none + # value: 0.5319 + + # - name: leaderboard_math_hard + # metrics: + # - name: exact_match,none + # value: 0.1477 + + # - name: leaderboard_gpqa + # metrics: + # - name: acc-norm,none + # value: 0.3176 + + # - name: leaderboard_musr + # metrics: + # - name: acc-norm,none + # value: 0.4601 + + - name: leaderboard_mmlu_pro + metrics: + - name: acc,none + value: 0.3517 diff --git a/ibm-granite/granite-3.3-8b-instruct/storage.yml b/ibm-granite/granite-3.3-8b-instruct/storage.yml new file mode 100644 index 0000000..1f29c96 --- /dev/null +++ 
b/ibm-granite/granite-3.3-8b-instruct/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/ibm-granite/granite-3.3-8b-instruct +model: hf +data: hf diff --git a/ibm-granite/granite-4.0-h-small/accuracy/model_card_tasks.yml b/ibm-granite/granite-4.0-h-small/accuracy/model_card_tasks.yml new file mode 100644 index 0000000..795309c --- /dev/null +++ b/ibm-granite/granite-4.0-h-small/accuracy/model_card_tasks.yml @@ -0,0 +1,30 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.7278 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.85 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8370 + + - name: mmlu + metrics: + - name: acc,none + value: 0.8067 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.7062 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8374 diff --git a/ibm-granite/granite-4.0-h-small/accuracy/tasks.yml b/ibm-granite/granite-4.0-h-small/accuracy/tasks.yml new file mode 100644 index 0000000..293aa1f --- /dev/null +++ b/ibm-granite/granite-4.0-h-small/accuracy/tasks.yml @@ -0,0 +1,60 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.715 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.85 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8573 + + - name: mmlu + metrics: + - name: acc,none + value: 0.8109 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.6409 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8374 + + - name: leaderboard_ifeval + metrics: + - name: inst_level_strict_acc,none + value: 0.729 + + # - name: leaderboard_bbh + # metrics: + # - name: acc-norm,none + # value: 0.5319 + + # - name: leaderboard_math_hard + # metrics: + # - name: exact_match,none + # value: 0.1477 + + # - name: leaderboard_gpqa + # metrics: + # - name: acc-norm,none + # value: 0.3176 + + # - name: leaderboard_musr + # metrics: + # - name: 
acc-norm,none + # value: 0.4601 + + - name: leaderboard_mmlu_pro + metrics: + - name: acc,none + value: 0.5545 diff --git a/ibm-granite/granite-4.0-h-small/performance/server.yml b/ibm-granite/granite-4.0-h-small/performance/server.yml new file mode 100644 index 0000000..c06be8e --- /dev/null +++ b/ibm-granite/granite-4.0-h-small/performance/server.yml @@ -0,0 +1,4 @@ +max-model-len: 4025 +tensor-parallel-size: 2 +trust-remote-code: true +uvicorn-log-level: debug diff --git a/ibm-granite/granite-4.0-h-small/storage.yml b/ibm-granite/granite-4.0-h-small/storage.yml new file mode 100644 index 0000000..1f29c96 --- /dev/null +++ b/ibm-granite/granite-4.0-h-small/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/ibm-granite/granite-4.0-h-small +model: hf +data: hf diff --git a/ibm-granite/granite-4.0-micro/accuracy/model_card_tasks.yml b/ibm-granite/granite-4.0-micro/accuracy/model_card_tasks.yml new file mode 100644 index 0000000..76bbb3b --- /dev/null +++ b/ibm-granite/granite-4.0-micro/accuracy/model_card_tasks.yml @@ -0,0 +1,30 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.7278 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.79 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8370 + + - name: mmlu + metrics: + - name: acc,none + value: 0.8067 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.7062 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8374 diff --git a/ibm-granite/granite-4.0-micro/accuracy/tasks.yml b/ibm-granite/granite-4.0-micro/accuracy/tasks.yml new file mode 100644 index 0000000..3640c76 --- /dev/null +++ b/ibm-granite/granite-4.0-micro/accuracy/tasks.yml @@ -0,0 +1,60 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.715 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.79 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8573 + + - 
name: mmlu + metrics: + - name: acc,none + value: 0.8109 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.6409 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8374 + + - name: leaderboard_ifeval + metrics: + - name: inst_level_strict_acc,none + value: 0.729 + + # - name: leaderboard_bbh + # metrics: + # - name: acc-norm,none + # value: 0.5319 + + # - name: leaderboard_math_hard + # metrics: + # - name: exact_match,none + # value: 0.1477 + + # - name: leaderboard_gpqa + # metrics: + # - name: acc-norm,none + # value: 0.3176 + + # - name: leaderboard_musr + # metrics: + # - name: acc-norm,none + # value: 0.4601 + + - name: leaderboard_mmlu_pro + metrics: + - name: acc,none + value: 0.5545 diff --git a/ibm-granite/granite-4.0-micro/storage.yml b/ibm-granite/granite-4.0-micro/storage.yml new file mode 100644 index 0000000..1f29c96 --- /dev/null +++ b/ibm-granite/granite-4.0-micro/storage.yml @@ -0,0 +1,3 @@ +# storage configs for https://huggingface.co/ibm-granite/granite-4.0-micro +model: hf +data: hf diff --git a/microsoft/Phi-3-medium-4k-instruct/accuracy/model_card_tasks.yml b/microsoft/Phi-3-medium-4k-instruct/accuracy/model_card_tasks.yml new file mode 100644 index 0000000..50fc02e --- /dev/null +++ b/microsoft/Phi-3-medium-4k-instruct/accuracy/model_card_tasks.yml @@ -0,0 +1,30 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.6442 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.85 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8437 + + - name: mmlu + metrics: + - name: acc,none + value: 0.803 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.5937 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8058 diff --git a/microsoft/Phi-3-medium-4k-instruct/accuracy/tasks.yml b/microsoft/Phi-3-medium-4k-instruct/accuracy/tasks.yml new file mode 100644 index 0000000..36dff96 --- /dev/null +++ 
b/microsoft/Phi-3-medium-4k-instruct/accuracy/tasks.yml @@ -0,0 +1,55 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.6825 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.82 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8435 + + - name: mmlu + metrics: + - name: acc,none + value: 0.803 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.5934 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8011 + + - name: leaderboard_ifeval + metrics: + - name: inst_level_strict_acc,none + value: 0.0587 + + - name: leaderboard_gpqa_diamond + metrics: + - name: acc-norm,none + value: 0.3939 + + - name: leaderboard_gpqa_extended + metrics: + - name: acc-norm,none + value: 0.3882 + + - name: leaderboard_gpqa_main + metrics: + - name: acc-norm,none + value: 0.4129 + + - name: leaderboard_mmlu_pro + metrics: + - name: acc,none + value: 0.53 diff --git a/microsoft/Phi-4-reasoning/accuracy/tasks.yml b/microsoft/Phi-4-reasoning/accuracy/tasks.yml new file mode 100644 index 0000000..46de8cb --- /dev/null +++ b/microsoft/Phi-4-reasoning/accuracy/tasks.yml @@ -0,0 +1,30 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.6024 + + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0.9257 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.7494 + + - name: mmlu + metrics: + - name: acc,none + value: 0.6414 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.5487 + + - name: winogrande + metrics: + - name: acc,none + value: 0.6835 diff --git a/microsoft/Phi-4-reasoning/storage.yml b/microsoft/Phi-4-reasoning/storage.yml new file mode 100644 index 0000000..7dca6be --- /dev/null +++ b/microsoft/Phi-4-reasoning/storage.yml @@ -0,0 +1,3 @@ +# https://huggingface.co/microsoft/Phi-4-reasoning +model: hf +data: hf diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/performance/server.yml 
b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/performance/server.yml new file mode 100644 index 0000000..452fa19 --- /dev/null +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/performance/server.yml @@ -0,0 +1,8 @@ +enable-chunked-prefill: true +max-model-len: 9000 +tensor-parallel-size: 1 +trust-remote-code: true +uvicorn-log-level: debug +tokenizer_mode: mistral +config_format: mistral +load_format: mistral diff --git a/mistralai/Voxtral-Mini-3B-2507/accuracy/server.yml b/mistralai/Voxtral-Mini-3B-2507/accuracy/server.yml new file mode 100644 index 0000000..1c30da7 --- /dev/null +++ b/mistralai/Voxtral-Mini-3B-2507/accuracy/server.yml @@ -0,0 +1,7 @@ +enable-chunked-prefill: true +max-model-len: 9000 +tensor-parallel-size: 1 +trust-remote-code: true +tokenizer_mode: mistral +config_format: mistral +load_format: mistral diff --git a/mistralai/Voxtral-Small-24B-2507/performance/server.yml b/mistralai/Voxtral-Small-24B-2507/performance/server.yml new file mode 100644 index 0000000..452fa19 --- /dev/null +++ b/mistralai/Voxtral-Small-24B-2507/performance/server.yml @@ -0,0 +1,8 @@ +enable-chunked-prefill: true +max-model-len: 9000 +tensor-parallel-size: 1 +trust-remote-code: true +uvicorn-log-level: debug +tokenizer_mode: mistral +config_format: mistral +load_format: mistral diff --git a/mistralai/Voxtral-Small-24B-2507/storage.yml b/mistralai/Voxtral-Small-24B-2507/storage.yml new file mode 100644 index 0000000..fc064a6 --- /dev/null +++ b/mistralai/Voxtral-Small-24B-2507/storage.yml @@ -0,0 +1,3 @@ +# https://huggingface.co/mistralai/Voxtral-Small-24B-2507 +model: hf +data: hf diff --git a/openai/gpt-oss-120b/performance/server.yml b/openai/gpt-oss-120b/performance/server.yml new file mode 100644 index 0000000..6652e2b --- /dev/null +++ b/openai/gpt-oss-120b/performance/server.yml @@ -0,0 +1,4 @@ +enable-chunked-prefill: true +max-model-len: 10000 +tensor-parallel-size: 2 +trust-remote-code: true diff --git a/openai/gpt-oss-20b/accuracy/tasks.yml 
b/openai/gpt-oss-20b/accuracy/tasks.yml index 0b4633c..cf07e78 100644 --- a/openai/gpt-oss-20b/accuracy/tasks.yml +++ b/openai/gpt-oss-20b/accuracy/tasks.yml @@ -2,4 +2,4 @@ tasks: - name: gsm8k metrics: - name: exact_match,strict-match - value: 0 + value: 0.2494 diff --git a/openai/gpt-oss-20b/performance/server.yml b/openai/gpt-oss-20b/performance/server.yml new file mode 100644 index 0000000..7bbd353 --- /dev/null +++ b/openai/gpt-oss-20b/performance/server.yml @@ -0,0 +1,4 @@ +enable-chunked-prefill: true +max-model-len: 10000 +tensor-parallel-size: 1 +trust-remote-code: true diff --git a/openai/whisper-large-v3/accuracy/server.yml b/openai/whisper-large-v3/accuracy/server.yml new file mode 100644 index 0000000..c00b7fb --- /dev/null +++ b/openai/whisper-large-v3/accuracy/server.yml @@ -0,0 +1,3 @@ +max-model-len: 448 +tensor-parallel-size: 1 +trust-remote-code: true diff --git a/openai/whisper-large-v3/accuracy/tasks.yml b/openai/whisper-large-v3/accuracy/tasks.yml new file mode 100644 index 0000000..0b4633c --- /dev/null +++ b/openai/whisper-large-v3/accuracy/tasks.yml @@ -0,0 +1,5 @@ +tasks: + - name: gsm8k + metrics: + - name: exact_match,strict-match + value: 0