@@ -4167,103 +4167,88 @@ class TestDeepSeekR1LongBenchV2(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip_less_mpi_world_size(8)
     def test_fp8_8gpus(self):
-        original_model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528"
-        if not os.path.exists(original_model_dir):
-            pytest.skip(f"Model directory {original_model_dir} does not exist")
+        model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528"
+        if not os.path.exists(model_dir):
+            pytest.skip(f"Model directory {model_dir} does not exist")
 
-        temp_dir = None
-        try:
-            # Create modified model directory using LongBenchV2 static method
-            # This is a WAR for the fact that the model config is not modified to support long context.
-            # TODO: remove this once the model config is modified to support long context.
-            temp_dir = LongBenchV2.create_modified_model_dir(original_model_dir)
-
-            # Configure model settings
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True,
-                                            enable_partial_reuse=False,
-                                            dtype="fp8")
-
-            cuda_graph_config = CudaGraphConfig(enable_padding=True,
-                                                max_batch_size=32)
+        # Configure model settings
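+        # FP8-quantized KV cache; reuse whole cached blocks but not partial ones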
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
+                                        enable_block_reuse=True,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")
 
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=32)
 
-            moe_config = MoeConfig(backend='DEEPGEMM', max_num_tokens=32000)
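+        # MTP speculative decoding: three next-n prediction layers draft ahead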
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
 
-            pytorch_config = dict(cuda_graph_config=cuda_graph_config,
-                                  kv_cache_config=kv_cache_config,
-                                  speculative_config=mtp_config,
-                                  moe_config=moe_config,
-                                  enable_chunked_prefill=True,
-                                  enable_autotuner=True)
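+        # Run the MoE layers through the DeepGEMM FP8 backend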
+        moe_config = MoeConfig(backend='DEEPGEMM', max_num_tokens=32000)
 
-            # Create LLM instance and evaluate
-            with LLM(temp_dir,
-                     tensor_parallel_size=8,
-                     moe_expert_parallel_size=8,
-                     max_num_tokens=32000,
-                     max_batch_size=32,
-                     **pytorch_config) as llm:
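+        # Chunked prefill processes long prompts in max_num_tokens-sized chunks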
+        pytorch_config = dict(cuda_graph_config=cuda_graph_config,
+                              kv_cache_config=kv_cache_config,
+                              speculative_config=mtp_config,
+                              moe_config=moe_config,
+                              enable_chunked_prefill=True,
+                              enable_autotuner=True)
 
-                task = LongBenchV2(self.MODEL_NAME)
+        # Create LLM instance and evaluate
+        with LLM(model_dir,
+                 tensor_parallel_size=8,
+                 moe_expert_parallel_size=8,
+                 max_num_tokens=32000,
+                 max_batch_size=32,
+                 **pytorch_config) as llm:
 
-                sampling_params = SamplingParams(max_tokens=32000)
+            task = LongBenchV2(self.MODEL_NAME)
 
-                task.evaluate(llm, sampling_params=sampling_params)
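+            # Keep only the last 128K prompt tokens (assumed DeepSeek-R1 context
+            # limit) so over-long LongBench-V2 samples fit without config changes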
+            sampling_params = SamplingParams(
+                max_tokens=32000,
+                truncate_prompt_tokens=128000,
+            )
 
-        finally:
-            # Cleanup temporary files
-            if temp_dir and os.path.exists(temp_dir):
-                import shutil
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            task.evaluate(llm, sampling_params=sampling_params)
 
     @pytest.mark.skip_less_mpi_world_size(4)
     def test_nvfp4_4gpus(self):
-        original_model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4"
-        temp_dir = None
-        try:
-            # Create modified model directory using LongBenchV2 static method
-            temp_dir = LongBenchV2.create_modified_model_dir(original_model_dir)
-
-            # Configure model settings (no MOE config for FP4 version)
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True,
-                                            enable_partial_reuse=False,
-                                            dtype="fp8")
-
-            cuda_graph_config = CudaGraphConfig(enable_padding=True,
-                                                max_batch_size=32)
-
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
-
-            pytorch_config = dict(cuda_graph_config=cuda_graph_config,
-                                  kv_cache_config=kv_cache_config,
-                                  speculative_config=mtp_config,
-                                  enable_chunked_prefill=True,
-                                  enable_autotuner=True)
-
-            # Create LLM instance and evaluate
-            with LLM(temp_dir,
-                     tensor_parallel_size=4,
-                     moe_expert_parallel_size=4,
-                     max_num_tokens=32000,
-                     max_batch_size=32,
-                     **pytorch_config) as llm:
+        model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4"
+        if not os.path.exists(model_dir):
+            pytest.skip(f"Model directory {model_dir} does not exist")
 
-                assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        # Configure model settings (no MOE config for FP4 version)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
+                                        enable_block_reuse=True,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")
+
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=32)
 
-                task = LongBenchV2(self.MODEL_NAME)
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
 
-                sampling_params = SamplingParams(max_tokens=32000)
+        pytorch_config = dict(cuda_graph_config=cuda_graph_config,
+                              kv_cache_config=kv_cache_config,
+                              speculative_config=mtp_config,
+                              enable_chunked_prefill=True,
+                              enable_autotuner=True)
 
-                task.evaluate(llm, sampling_params=sampling_params)
+        # Create LLM instance and evaluate
+        with LLM(model_dir,
+                 tensor_parallel_size=4,
+                 moe_expert_parallel_size=4,
+                 max_num_tokens=32000,
+                 max_batch_size=32,
+                 **pytorch_config) as llm:
 
-        finally:
-            # Cleanup temporary files
-            if temp_dir and os.path.exists(temp_dir):
-                import shutil
-                shutil.rmtree(temp_dir, ignore_errors=True)
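+            # Sanity-check that the checkpoint loaded with NVFP4 quantization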
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = LongBenchV2(self.MODEL_NAME)
+
+            sampling_params = SamplingParams(
+                max_tokens=32000,
+                truncate_prompt_tokens=128000,
+            )
+
+            task.evaluate(llm, sampling_params=sampling_params)
 
 
 class TestStarcoder2_3B(LlmapiAccuracyTestHarness):