|
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test Dual Batch Overlap (DBO) with Data Parallelism + Expert Parallelism.

DBO is specifically designed for DP+EP scenarios to hide communication latency
by overlapping computation of two batches. This test validates that DBO works
correctly with the DeepSeek-V2-Lite-Chat model using GSM8K evaluation.
"""
| 10 | + |
| 11 | +import pytest |
| 12 | + |
| 13 | +from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k |
| 14 | +from tests.utils import RemoteOpenAIServer |
| 15 | + |
# Model served for the test and the data-parallel degree it is launched with.
MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
DP_SIZE = 2

# GSM8K evaluation settings.
# The question count is kept small for CI speed, but it must stay large enough
# for batches to reach the DBO thresholds.
NUM_QUESTIONS = 256
# Number of few-shot examples in each prompt.
NUM_SHOTS = 5
# Expected accuracy is 0.64; allow a 2% buffer (based on vLLM test data).
MIN_ACCURACY = 0.62

# Upper bound on concurrently scheduled sequences. It is set well above the
# decode token threshold the test passes to the server
# (--dbo-decode-token-threshold 16) so that decode batches are intended to
# grow large enough to trigger DBO.
MAX_NUM_SEQS = 64

# DeepEP all2all backends the test is parametrized over.
DEEPEP_BACKENDS = ["deepep_low_latency", "deepep_high_throughput"]
| 33 | + |
| 34 | + |
@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
    """
    Check GSM8K accuracy of a server launched with DBO, DP and EP enabled.

    A server for MODEL_NAME is started with the given all2all backend, the
    GSM8K evaluation is run against it, and the reported accuracy must be at
    least MIN_ACCURACY. The test is skipped when fewer than DP_SIZE GPUs are
    available.
    """
    required_gpus = DP_SIZE
    if required_gpus > num_gpus_available:
        pytest.skip(f"Need at least {required_gpus} GPUs (DP={DP_SIZE})")

    # Context length and batch capacity; the larger max-num-seqs is meant to
    # let decode batches reach the DBO decode threshold.
    capacity_flags = [
        "--max-model-len",
        "4096",
        "--max-num-seqs",
        str(MAX_NUM_SEQS),
    ]
    # Data parallelism combined with expert parallelism.
    parallel_flags = [
        "--data-parallel-size",
        str(DP_SIZE),
        "--enable-expert-parallel",
    ]
    # DBO itself, with both thresholds pinned so triggering does not depend
    # on server defaults.
    dbo_flags = [
        "--enable-dbo",
        "--dbo-decode-token-threshold",
        "16",
        "--dbo-prefill-token-threshold",
        "256",
    ]
    # NOTE: --enforce-eager is deliberately omitted so that DBO's alternate
    # CUDA graph dispatching is exercised.
    server_args = [
        *capacity_flags,
        "--trust-remote-code",
        *parallel_flags,
        *dbo_flags,
        "--all2all-backend",
        all2all_backend,
    ]

    # Generous startup budget: model loading with DP+EP can be slow.
    with RemoteOpenAIServer(
        MODEL_NAME, server_args, max_wait_seconds=600
    ) as remote_server:
        # Point the GSM8K evaluation at the server that was just started.
        results = evaluate_gsm8k(
            num_questions=NUM_QUESTIONS,
            num_shots=NUM_SHOTS,
            host=f"http://{remote_server.host}",
            port=remote_server.port,
        )

        # Fail with a message naming the backend and the raw counts.
        accuracy = results["accuracy"]
        assert accuracy >= MIN_ACCURACY, (
            f"DBO+DP+EP accuracy too low ({all2all_backend}): "
            f"{accuracy:.3f} < {MIN_ACCURACY:.3f} "
            f"(correct: {results['num_correct']}/{results['num_questions']})"
        )
0 commit comments