
Commit a2d9e62

[https://nvbugs/5667922][fix] Update long context evaluation config (#9426)
Signed-off-by: mni <125171826+baize97@users.noreply.github.com>
1 parent a38d91a

File tree

2 files changed: +66 -160 lines

tests/integration/defs/accuracy/accuracy_core.py

Lines changed: 1 addition & 80 deletions
@@ -450,90 +450,11 @@ class LongBenchV2(AccuracyTask):
     EVALUATOR_KWARGS = dict(
         dataset_path=DATASET_DIR,
         length="medium",
-        max_len=1280000,
+        max_len=120000,
         apply_chat_template=True,
         random_seed=0,
     )

-    @staticmethod
-    def create_modified_model_dir(original_model_dir: str,
-                                  max_position_embeddings: int = 1280000,
-                                  model_max_length: int = 1280000) -> str:
-        """
-        Create temporary directory with modified config files for long context evaluation.
-
-        This method creates a temporary directory with symlinks to all model files except
-        config files, which are copied and modified to support longer context lengths.
-        This is useful for evaluating models on long context tasks that exceed the
-        original model's max_position_embeddings.
-
-        Args:
-            original_model_dir: Path to the original model directory
-            max_position_embeddings: New value for max_position_embeddings in config.json
-            model_max_length: New value for model_max_length in tokenizer_config.json
-
-        Returns:
-            Path to the temporary modified model directory
-
-        Note:
-            The caller is responsible for cleaning up the temporary directory after use.
-        """
-        import tempfile
-
-        # Create temporary model directory with symlinks
-        temp_dir = tempfile.mkdtemp(prefix="longbench_v2_modified_model_")
-        logger.info(f"Created temporary model directory: {temp_dir}")
-
-        # Create symlinks for all files except config files
-        for item in os.listdir(original_model_dir):
-            src = os.path.join(original_model_dir, item)
-            dst = os.path.join(temp_dir, item)
-
-            # Skip config files - will handle them separately
-            if item in ["config.json", "tokenizer_config.json"]:
-                continue
-
-            # Create symlink for other files/directories
-            os.symlink(src, dst)
-            logger.info(f"  Symlinked: {item}")
-
-        # Modify and copy config.json
-        config_src = os.path.join(original_model_dir, "config.json")
-        config_dst = os.path.join(temp_dir, "config.json")
-        if os.path.exists(config_src):
-            with open(config_src, 'r', encoding='utf-8') as f:
-                config = json.load(f)
-
-            # Modify max_position_embeddings
-            original_max_pos = config.get('max_position_embeddings')
-            config['max_position_embeddings'] = max_position_embeddings
-            logger.info(
-                f"  Modified config.json: max_position_embeddings {original_max_pos} -> {max_position_embeddings}"
-            )
-
-            with open(config_dst, 'w', encoding='utf-8') as f:
-                json.dump(config, f, indent=2, ensure_ascii=False)
-
-        # Modify and copy tokenizer_config.json
-        tokenizer_config_src = os.path.join(original_model_dir,
-                                            "tokenizer_config.json")
-        tokenizer_config_dst = os.path.join(temp_dir, "tokenizer_config.json")
-        if os.path.exists(tokenizer_config_src):
-            with open(tokenizer_config_src, 'r', encoding='utf-8') as f:
-                tokenizer_config = json.load(f)
-
-            # Modify model_max_length
-            original_max_len = tokenizer_config.get('model_max_length')
-            tokenizer_config['model_max_length'] = model_max_length
-            logger.info(
-                f"  Modified tokenizer_config.json: model_max_length {original_max_len} -> {model_max_length}"
-            )
-
-            with open(tokenizer_config_dst, 'w', encoding='utf-8') as f:
-                json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)
-
-        return temp_dir
-

 class CliFlowAccuracyTestHarness:
     # Model
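Note: the removed create_modified_model_dir helper implemented a symlink-and-patch workaround: mirror the checkpoint directory with symlinks, then rewrite max_position_embeddings in config.json (and model_max_length in tokenizer_config.json) so the runtime accepts prompts longer than the checkpoint advertises. If that workaround is ever needed outside this harness, it reduces to a short standalone utility. Below is a condensed sketch of the deleted logic that patches only config.json; the function name and temp-dir prefix are illustrative, not part of the codebase.

import json
import os
import tempfile


def patched_model_dir(model_dir: str, max_pos: int = 1280000) -> str:
    """Mirror a checkpoint dir, rewriting max_position_embeddings.

    Caller is responsible for removing the returned temporary directory.
    """
    tmp = tempfile.mkdtemp(prefix="patched_model_")
    for item in os.listdir(model_dir):
        if item != "config.json":
            # Symlink everything except the config we are about to rewrite.
            os.symlink(os.path.join(model_dir, item), os.path.join(tmp, item))
    with open(os.path.join(model_dir, "config.json"), encoding="utf-8") as f:
        config = json.load(f)
    config["max_position_embeddings"] = max_pos
    with open(os.path.join(tmp, "config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
    return tmp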

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 65 additions & 80 deletions
@@ -4167,103 +4167,88 @@ class TestDeepSeekR1LongBenchV2(LlmapiAccuracyTestHarness):

     @pytest.mark.skip_less_mpi_world_size(8)
     def test_fp8_8gpus(self):
-        original_model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528"
-        if not os.path.exists(original_model_dir):
-            pytest.skip(f"Model directory {original_model_dir} does not exist")
+        model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528"
+        if not os.path.exists(model_dir):
+            pytest.skip(f"Model directory {model_dir} does not exist")

-        temp_dir = None
-        try:
-            # Create modified model directory using LongBenchV2 static method
-            # This is a WAR for the fact that the model config is not modified to support long context.
-            # TODO: remove this once the model config is modified to support long context.
-            temp_dir = LongBenchV2.create_modified_model_dir(original_model_dir)
-
-            # Configure model settings
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True,
-                                            enable_partial_reuse=False,
-                                            dtype="fp8")
-
-            cuda_graph_config = CudaGraphConfig(enable_padding=True,
-                                                max_batch_size=32)
+        # Configure model settings
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
+                                        enable_block_reuse=True,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")

-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=32)

-            moe_config = MoeConfig(backend='DEEPGEMM', max_num_tokens=32000)
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)

-            pytorch_config = dict(cuda_graph_config=cuda_graph_config,
-                                  kv_cache_config=kv_cache_config,
-                                  speculative_config=mtp_config,
-                                  moe_config=moe_config,
-                                  enable_chunked_prefill=True,
-                                  enable_autotuner=True)
+        moe_config = MoeConfig(backend='DEEPGEMM', max_num_tokens=32000)

-            # Create LLM instance and evaluate
-            with LLM(temp_dir,
-                     tensor_parallel_size=8,
-                     moe_expert_parallel_size=8,
-                     max_num_tokens=32000,
-                     max_batch_size=32,
-                     **pytorch_config) as llm:
+        pytorch_config = dict(cuda_graph_config=cuda_graph_config,
+                              kv_cache_config=kv_cache_config,
+                              speculative_config=mtp_config,
+                              moe_config=moe_config,
+                              enable_chunked_prefill=True,
+                              enable_autotuner=True)

-                task = LongBenchV2(self.MODEL_NAME)
+        # Create LLM instance and evaluate
+        with LLM(model_dir,
+                 tensor_parallel_size=8,
+                 moe_expert_parallel_size=8,
+                 max_num_tokens=32000,
+                 max_batch_size=32,
+                 **pytorch_config) as llm:

-                sampling_params = SamplingParams(max_tokens=32000)
+            task = LongBenchV2(self.MODEL_NAME)

-                task.evaluate(llm, sampling_params=sampling_params)
+            sampling_params = SamplingParams(
+                max_tokens=32000,
+                truncate_prompt_tokens=128000,
+            )

-        finally:
-            # Cleanup temporary files
-            if temp_dir and os.path.exists(temp_dir):
-                import shutil
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            task.evaluate(llm, sampling_params=sampling_params)

     @pytest.mark.skip_less_mpi_world_size(4)
     def test_nvfp4_4gpus(self):
-        original_model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4"
-        temp_dir = None
-        try:
-            # Create modified model directory using LongBenchV2 static method
-            temp_dir = LongBenchV2.create_modified_model_dir(original_model_dir)
-
-            # Configure model settings (no MOE config for FP4 version)
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True,
-                                            enable_partial_reuse=False,
-                                            dtype="fp8")
-
-            cuda_graph_config = CudaGraphConfig(enable_padding=True,
-                                                max_batch_size=32)
-
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
-
-            pytorch_config = dict(cuda_graph_config=cuda_graph_config,
-                                  kv_cache_config=kv_cache_config,
-                                  speculative_config=mtp_config,
-                                  enable_chunked_prefill=True,
-                                  enable_autotuner=True)
-
-            # Create LLM instance and evaluate
-            with LLM(temp_dir,
-                     tensor_parallel_size=4,
-                     moe_expert_parallel_size=4,
-                     max_num_tokens=32000,
-                     max_batch_size=32,
-                     **pytorch_config) as llm:
+        model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4"
+        if not os.path.exists(model_dir):
+            pytest.skip(f"Model directory {model_dir} does not exist")

-                assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        # Configure model settings (no MOE config for FP4 version)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
+                                        enable_block_reuse=True,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")
+
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=32)

-                task = LongBenchV2(self.MODEL_NAME)
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)

-                sampling_params = SamplingParams(max_tokens=32000)
+        pytorch_config = dict(cuda_graph_config=cuda_graph_config,
+                              kv_cache_config=kv_cache_config,
+                              speculative_config=mtp_config,
+                              enable_chunked_prefill=True,
+                              enable_autotuner=True)

-                task.evaluate(llm, sampling_params=sampling_params)
+        # Create LLM instance and evaluate
+        with LLM(model_dir,
+                 tensor_parallel_size=4,
+                 moe_expert_parallel_size=4,
+                 max_num_tokens=32000,
+                 max_batch_size=32,
+                 **pytorch_config) as llm:

-        finally:
-            # Cleanup temporary files
-            if temp_dir and os.path.exists(temp_dir):
-                import shutil
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = LongBenchV2(self.MODEL_NAME)
+
+            sampling_params = SamplingParams(
+                max_tokens=32000,
+                truncate_prompt_tokens=128000,
+            )
+
+            task.evaluate(llm, sampling_params=sampling_params)


 class TestStarcoder2_3B(LlmapiAccuracyTestHarness):
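Note: instead of evaluating through a patched checkpoint, the tests now cap inputs at request time: SamplingParams(..., truncate_prompt_tokens=128000) truncates any prompt beyond 128K tokens, which appears to act as a backstop for the evaluator's new max_len=120000. A minimal sketch of the same pattern with the LLM API follows; the checkpoint path and prompt are placeholders, and it runs single-GPU for brevity (the tests above use 4- and 8-way parallelism).

from tensorrt_llm import LLM, SamplingParams

llm = LLM("/path/to/DeepSeek-R1-0528")  # placeholder checkpoint path
sampling_params = SamplingParams(
    max_tokens=1024,
    truncate_prompt_tokens=128000,  # cap the prompt at 128K tokens
)
outputs = llm.generate(["<very long prompt>"], sampling_params)
print(outputs[0].outputs[0].text)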
