|
| 1 | +from automation.tasks import ArenaHardJudgeTask |
| 2 | + |
| 3 | +model_queue_list_dict = [ |
| 4 | + {"Qwen/Qwen3-0.6B" :"oneshot-a100x1"}, |
| 5 | + {"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"}, |
| 6 | + |
| 7 | + {"Qwen/Qwen3-1.7B":"oneshot-a100x1"}, |
| 8 | + {"RedHatAI/Qwen3-1.7B-quantized.w4a16":"oneshot-a100x1"}, |
| 9 | + |
| 10 | + {"Qwen/Qwen3-4B":"oneshot-a100x1"}, |
| 11 | + {"RedHatAI/Qwen3-4B-quantized.w4a16":"oneshot-a100x1"}, |
| 12 | + |
| 13 | + {"Qwen/Qwen3-8B":"oneshot-a100x1"}, |
| 14 | + {"RedHatAI/Qwen3-8B-quantized.w4a16":"oneshot-a100x1"}, |
| 15 | + |
| 16 | + {"Qwen/Qwen3-14B":"oneshot-a100x1"}, |
| 17 | + {"RedHatAI/Qwen3-14B-quantized.w4a16":"oneshot-a100x1"}, |
| 18 | + |
| 19 | + {"Qwen/Qwen3-32B":"oneshot-a100x2"}, |
| 20 | + {"RedHatAI/Qwen3-32B-quantized.w4a16":"oneshot-a100x2"}, |
| 21 | + |
| 22 | + {"Qwen/Qwen3-30B-A3B":"oneshot-a100x2"}, |
| 23 | + {"RedHatAI/Qwen3-30B-A3B-quantized.w4a16":"oneshot-a100x2"}, |
| 24 | + |
| 25 | + {"meta-llama/Llama-3.1-8B-Instruct":"oneshot-a100x1"}, |
| 26 | + {"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16":"oneshot-a100x1"}, |
| 27 | + {"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8":"oneshot-a100x1"}, |
| 28 | + |
| 29 | + {"meta-llama/Llama-3.2-1B-Instruct":"oneshot-a100x1"}, |
| 30 | + {"RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8":"oneshot-a100x1"}, |
| 31 | + |
| 32 | + {"meta-llama/Llama-3.2-3B-Instruct":"oneshot-a100x1"}, |
| 33 | + {"RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8":"oneshot-a100x1"}, |
| 34 | + |
| 35 | + {"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"}, |
| 36 | + {"RedHatAI/Llama-3.3-70B-Instruct-quantized.w4a16": "oneshot-a100x4"}, |
| 37 | + {"RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8": "oneshot-a100x4"}, |
| 38 | + |
| 39 | + {"meta-llama/Llama-4-Scout-17B-16E": "oneshot-a100x8"}, |
| 40 | + {"RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16": "oneshot-a100x8"}, |
| 41 | + |
| 42 | + {"microsoft/phi-4" :"oneshot-a100x1"}, |
| 43 | + {"RedHatAI/phi-4-quantized.w4a16" :"oneshot-a100x1"}, |
| 44 | + {"RedHatAI/phi-4-quantized.w8a8" :"oneshot-a100x1"}, |
| 45 | + |
| 46 | + {"mistralai/Mistral-Small-3.1-24B-Instruct-2503" :"oneshot-a100x2"}, |
| 47 | + {"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8" :"oneshot-a100x2"}, |
| 48 | + {"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16" :"oneshot-a100x2"}, |
| 49 | +] |
| 50 | + |
| 51 | +model_queue_list_dict = [ |
| 52 | + #{"Qwen/Qwen3-0.6B" :"oneshot-a100x1"}, |
| 53 | + {"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"}, |
| 54 | + {"meta-llama/Llama-3.2-1B-Instruct":"oneshot-a100x1"}, |
| 55 | + {"RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8":"oneshot-a100x1"}, |
| 56 | + {"microsoft/phi-4" :"oneshot-a100x1"}, |
| 57 | + {"RedHatAI/phi-4-quantized.w4a16" :"oneshot-a100x1"}, |
| 58 | + {"RedHatAI/phi-4-quantized.w8a8" :"oneshot-a100x1"}, |
| 59 | +] |
| 60 | + |
| 61 | +model_queue_list_dict = [ |
| 62 | + {"Qwen/Qwen3-0.6B" :"oneshot-a100x1"}, |
| 63 | +] |
| 64 | + |
| 65 | +judgement_model_dict = {"model": "openai/gpt-oss-120b", "queue": "oneshot-a100x8" } |
| 66 | + |
| 67 | + |
def run_task(model_queue_dict):
    """Create and remotely launch an ArenaHard judgement task for one model.

    Args:
        model_queue_dict: single-entry dict mapping an answer-model id
            (e.g. "Qwen/Qwen3-0.6B") to the queue its answers were generated
            on. Only the key is used here; the judgement itself runs on the
            judgement model's queue.
    """
    # Read the single key WITHOUT mutating the caller's dict. The original
    # popitem() emptied the entry as a side effect, corrupting
    # model_queue_list_dict for any later reuse.
    answer_model = next(iter(model_queue_dict))
    judgement_model = judgement_model_dict["model"]
    queue = judgement_model_dict["queue"]

    task = ArenaHardJudgeTask(
        project_name="jira_arenahard_gpt_judgement_v0.1_final",
        task_name=f"judge_{answer_model.lower()}_task",
        packages=["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
        answer_project_name="jira_arenahard_generation_v0.1",
        answer_task_name=f"generate_task_{answer_model.lower()}",
        # Drop the org prefix: "Qwen/Qwen3-0.6B" -> "qwen3-0.6b".
        answer_model=answer_model.split("/", 1)[1].lower(),
        judgement_model=judgement_model,
        rate_type="throughput",
        backend="aiohttp_server",
        target="http://localhost:8000/v1",
        api_type="openai",
        bench_name="arena-hard-v0.1",
        branch="arena_upgrade",
        vllm_kwargs={"enable-chunked-prefill": True},
        max_tokens=64000,
    )

    # Enqueue on the judgement model's queue rather than running locally.
    task.execute_remotely(queue)
| 96 | + |
# Kick off one remote judgement task per configured answer model.
for entry in model_queue_list_dict:
    run_task(entry)
0 commit comments