@@ -4167,103 +4167,88 @@ class TestDeepSeekR1LongBenchV2(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip_less_mpi_world_size(8)
     def test_fp8_8gpus(self):
-        original_model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528"
-        if not os.path.exists(original_model_dir):
-            pytest.skip(f"Model directory {original_model_dir} does not exist")
+        model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528"
+        if not os.path.exists(model_dir):
+            pytest.skip(f"Model directory {model_dir} does not exist")
 
-        temp_dir = None
-        try:
-            # Create modified model directory using LongBenchV2 static method
-            # This is a WAR for the fact that the model config is not modified to support long context.
-            # TODO: remove this once the model config is modified to support long context.
-            temp_dir = LongBenchV2.create_modified_model_dir(original_model_dir)
-
-            # Configure model settings
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True,
-                                            enable_partial_reuse=False,
-                                            dtype="fp8")
-
-            cuda_graph_config = CudaGraphConfig(enable_padding=True,
-                                                max_batch_size=32)
+        # Configure model settings
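+        # FP8-quantized KV cache; reuse whole cached blocks but not partial ones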
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
+                                        enable_block_reuse=True,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")
 
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=32)
 
-            moe_config = MoeConfig(backend='DEEPGEMM', max_num_tokens=32000)
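+        # MTP speculative decoding: three next-n prediction layers draft ahead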
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
 
-            pytorch_config = dict(cuda_graph_config=cuda_graph_config,
-                                  kv_cache_config=kv_cache_config,
-                                  speculative_config=mtp_config,
-                                  moe_config=moe_config,
-                                  enable_chunked_prefill=True,
-                                  enable_autotuner=True)
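+        # Run the MoE layers through the DeepGEMM FP8 backend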
+        moe_config = MoeConfig(backend='DEEPGEMM', max_num_tokens=32000)
 
-            # Create LLM instance and evaluate
-            with LLM(temp_dir,
-                     tensor_parallel_size=8,
-                     moe_expert_parallel_size=8,
-                     max_num_tokens=32000,
-                     max_batch_size=32,
-                     **pytorch_config) as llm:
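+        # Chunked prefill processes long prompts in max_num_tokens-sized chunks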
+        pytorch_config = dict(cuda_graph_config=cuda_graph_config,
+                              kv_cache_config=kv_cache_config,
+                              speculative_config=mtp_config,
+                              moe_config=moe_config,
+                              enable_chunked_prefill=True,
+                              enable_autotuner=True)
 
-                task = LongBenchV2(self.MODEL_NAME)
+        # Create LLM instance and evaluate
+        with LLM(model_dir,
+                 tensor_parallel_size=8,
+                 moe_expert_parallel_size=8,
+                 max_num_tokens=32000,
+                 max_batch_size=32,
+                 **pytorch_config) as llm:
 
-                sampling_params = SamplingParams(max_tokens=32000)
+            task = LongBenchV2(self.MODEL_NAME)
 
-                task.evaluate(llm, sampling_params=sampling_params)
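+            # Keep only the last 128K prompt tokens (assumed DeepSeek-R1 context
+            # limit) so over-long LongBench-V2 samples fit without config changes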
+            sampling_params = SamplingParams(
+                max_tokens=32000,
+                truncate_prompt_tokens=128000,
+            )
 
-        finally:
-            # Cleanup temporary files
-            if temp_dir and os.path.exists(temp_dir):
-                import shutil
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            task.evaluate(llm, sampling_params=sampling_params)
 
     @pytest.mark.skip_less_mpi_world_size(4)
     def test_nvfp4_4gpus(self):
-        original_model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4"
-        temp_dir = None
-        try:
-            # Create modified model directory using LongBenchV2 static method
-            temp_dir = LongBenchV2.create_modified_model_dir(original_model_dir)
-
-            # Configure model settings (no MOE config for FP4 version)
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True,
-                                            enable_partial_reuse=False,
-                                            dtype="fp8")
-
-            cuda_graph_config = CudaGraphConfig(enable_padding=True,
-                                                max_batch_size=32)
-
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
-
-            pytorch_config = dict(cuda_graph_config=cuda_graph_config,
-                                  kv_cache_config=kv_cache_config,
-                                  speculative_config=mtp_config,
-                                  enable_chunked_prefill=True,
-                                  enable_autotuner=True)
-
-            # Create LLM instance and evaluate
-            with LLM(temp_dir,
-                     tensor_parallel_size=4,
-                     moe_expert_parallel_size=4,
-                     max_num_tokens=32000,
-                     max_batch_size=32,
-                     **pytorch_config) as llm:
+        model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4"
+        if not os.path.exists(model_dir):
+            pytest.skip(f"Model directory {model_dir} does not exist")
 
-                assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        # Configure model settings (no MOE config for FP4 version)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
+                                        enable_block_reuse=True,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")
+
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=32)
 
-                task = LongBenchV2(self.MODEL_NAME)
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
 
-                sampling_params = SamplingParams(max_tokens=32000)
+        pytorch_config = dict(cuda_graph_config=cuda_graph_config,
+                              kv_cache_config=kv_cache_config,
+                              speculative_config=mtp_config,
+                              enable_chunked_prefill=True,
+                              enable_autotuner=True)
 
-                task.evaluate(llm, sampling_params=sampling_params)
+        # Create LLM instance and evaluate
+        with LLM(model_dir,
+                 tensor_parallel_size=4,
+                 moe_expert_parallel_size=4,
+                 max_num_tokens=32000,
+                 max_batch_size=32,
+                 **pytorch_config) as llm:
 
-        finally:
-            # Cleanup temporary files
-            if temp_dir and os.path.exists(temp_dir):
-                import shutil
-                shutil.rmtree(temp_dir, ignore_errors=True)
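+            # Sanity-check that the checkpoint loaded with NVFP4 quantization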
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = LongBenchV2(self.MODEL_NAME)
+
+            sampling_params = SamplingParams(
+                max_tokens=32000,
+                truncate_prompt_tokens=128000,
+            )
+
+            task.evaluate(llm, sampling_params=sampling_params)
 
 
 class TestStarcoder2_3B(LlmapiAccuracyTestHarness):