@@ -2558,11 +2558,11 @@ def test_bfloat16_4gpus(self, tp_size, ep_size, mtp_nextn,
 
     @pytest.mark.skip_less_device(4)
     @pytest.mark.parametrize(
-        "tp_size,pp_size,mtp_nextn,fp8kv,cuda_graph,overlap_scheduler,chunked_prefill,max_batch_size,moe_backend",
-        [pytest.param(4, 1, 2, True, True, True, True, 16, "CUTLASS")],
+        "tp_size,pp_size,mtp_nextn,cuda_graph,overlap_scheduler,chunked_prefill,max_batch_size,moe_backend",
+        [pytest.param(4, 1, 2, True, True, True, 16, "CUTLASS")],
         ids=["throughput"])
-    def test_nvfp4_multi_gpus(self, tp_size, pp_size, mtp_nextn, fp8kv,
-                              cuda_graph, overlap_scheduler, chunked_prefill,
+    def test_nvfp4_multi_gpus(self, tp_size, pp_size, mtp_nextn, cuda_graph,
+                              overlap_scheduler, chunked_prefill,
                               max_batch_size, moe_backend):
 
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
@@ -2571,13 +2571,10 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, mtp_nextn, fp8kv,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        if fp8kv:
-            kv_cache_config.dtype = "fp8"
-
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
-        with LLM(f"{llm_models_root()}/GLM-4.6/GLM-4.6-FP4",
+        with LLM(f"{llm_models_root()}/glm-4.6-fp4",
                  max_batch_size=max_batch_size,
                  tensor_parallel_size=tp_size,
                  pipeline_parallel_size=pp_size,
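
For readers who want to try the updated configuration outside the test harness, below is a minimal standalone sketch of the same LLM setup. The import paths, the local model path, and the speculative_config keyword that receives the MTP config are assumptions inferred from the hunks above, not lines taken from this diff.

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, KvCacheConfig, MoeConfig,
                                 MTPDecodingConfig)

# Values mirror the "throughput" pytest.param in the first hunk.
tp_size, pp_size, mtp_nextn = 4, 1, 2
cuda_graph, max_batch_size, moe_backend = True, 16, "CUTLASS"

kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)

# MTP speculative decoding is configured only when next-n prediction
# layers are requested, matching the second hunk.
mtp_config = None
if mtp_nextn > 0:
    mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)

# "/models/glm-4.6-fp4" is a hypothetical local checkpoint path; the
# test resolves the real one via llm_models_root().
with LLM("/models/glm-4.6-fp4",
         max_batch_size=max_batch_size,
         tensor_parallel_size=tp_size,
         pipeline_parallel_size=pp_size,
         kv_cache_config=kv_cache_config,
         cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
         moe_config=MoeConfig(backend=moe_backend),
         speculative_config=mtp_config) as llm:
    output = llm.generate("Hello, world!")
    print(output.outputs[0].text)

Note the removed fp8kv branch in the second hunk: with this change the FP4 checkpoint no longer forces the KV cache dtype to fp8, so kv_cache_config keeps its default dtype.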