Commit a79c0df
[None][fix] Update GLM model accuracy test (#9286)
Signed-off-by: Xuanyu Chen <xuanyuc@nvidia.com>
1 parent 255e4ea commit a79c0df

2 files changed: +7 -9 lines changed


tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 2 additions & 1 deletion
@@ -263,4 +263,5 @@ ByteDance-Seed/Seed-OSS-36B-Instruct:
 zai-org/GLM-4.6:
   - accuracy: 81.3
   - quant_algo: NVFP4
-    accuracy: 91.0
+    spec_dec_algo: MTP
+    accuracy: 88.0
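
The new reference row keys the 88.0 score to both quant_algo: NVFP4 and spec_dec_algo: MTP, while the bare 81.3 baseline row is untouched. A minimal sketch of how such multi-key rows can be resolved, assuming the harness matches a row's extra keys against the running test's configuration; the helper name lookup_threshold and the inline REFERENCES dict are illustrative, not the repository's actual harness code (requires PyYAML):

import yaml

# Hypothetical in-memory copy of the updated gsm8k.yaml entry.
REFERENCES = yaml.safe_load("""
zai-org/GLM-4.6:
  - accuracy: 81.3
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 88.0
""")

def lookup_threshold(model: str, **test_config) -> float:
    """Return the reference accuracy whose non-accuracy keys match test_config exactly."""
    for row in REFERENCES[model]:
        extra = {k: v for k, v in row.items() if k != "accuracy"}
        if extra == test_config:
            return row["accuracy"]
    raise KeyError(f"no reference entry for {model} with {test_config}")

# The NVFP4 + MTP configuration is now held to the 88.0 reference,
# while the unquantized run keeps its 81.3 baseline.
assert lookup_threshold("zai-org/GLM-4.6",
                        quant_algo="NVFP4", spec_dec_algo="MTP") == 88.0
assert lookup_threshold("zai-org/GLM-4.6") == 81.3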

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 5 additions & 8 deletions
@@ -2558,11 +2558,11 @@ def test_bfloat16_4gpus(self, tp_size, ep_size, mtp_nextn,
 
     @pytest.mark.skip_less_device(4)
     @pytest.mark.parametrize(
-        "tp_size,pp_size,mtp_nextn,fp8kv,cuda_graph,overlap_scheduler,chunked_prefill,max_batch_size,moe_backend",
-        [pytest.param(4, 1, 2, True, True, True, True, 16, "CUTLASS")],
+        "tp_size,pp_size,mtp_nextn,cuda_graph,overlap_scheduler,chunked_prefill,max_batch_size,moe_backend",
+        [pytest.param(4, 1, 2, True, True, True, 16, "CUTLASS")],
         ids=["throughput"])
-    def test_nvfp4_multi_gpus(self, tp_size, pp_size, mtp_nextn, fp8kv,
-                              cuda_graph, overlap_scheduler, chunked_prefill,
+    def test_nvfp4_multi_gpus(self, tp_size, pp_size, mtp_nextn, cuda_graph,
+                              overlap_scheduler, chunked_prefill,
                               max_batch_size, moe_backend):
 
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
@@ -2571,13 +2571,10 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, mtp_nextn, fp8kv,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        if fp8kv:
-            kv_cache_config.dtype = "fp8"
-
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
-        with LLM(f"{llm_models_root()}/GLM-4.6/GLM-4.6-FP4",
+        with LLM(f"{llm_models_root()}/glm-4.6-fp4",
                  max_batch_size=max_batch_size,
                  tensor_parallel_size=tp_size,
                  pipeline_parallel_size=pp_size,
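
Put together, the rewritten test exercises GLM-4.6 NVFP4 with MTP speculative decoding and a default-dtype KV cache (the fp8kv parameter and its "fp8" branch are removed). A hedged sketch of the equivalent standalone LLM-API setup follows; the import paths, the model path, and the speculative_config / enable_chunked_prefill keywords are assumptions inferred from the identifiers visible in the diff, not confirmed by this commit:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, KvCacheConfig,
                                 MTPDecodingConfig, MoeConfig)

# Values mirror pytest.param(4, 1, 2, True, True, True, 16, "CUTLASS").
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=2)  # mtp_nextn=2

llm = LLM("/models/glm-4.6-fp4",             # hypothetical local path; note the
                                             # lower-case checkpoint dir per the diff
          max_batch_size=16,
          tensor_parallel_size=4,
          pipeline_parallel_size=1,
          kv_cache_config=kv_cache_config,   # dtype left at default: no fp8 KV cache
          enable_chunked_prefill=True,       # chunked_prefill=True
          speculative_config=mtp_config,     # MTP speculative decoding
          cuda_graph_config=CudaGraphConfig(),
          moe_config=MoeConfig(backend="CUTLASS"))

Dropping fp8kv means the accuracy threshold no longer has to cover an FP8-KV variant, which is consistent with the single NVFP4 + MTP reference row added to gsm8k.yaml above.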
