Commit 9b2f3f1

test to tp2 instead of tp4, fixed llm_root fixture
Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
1 parent 3640691 commit 9b2f3f1


tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py

Lines changed: 10 additions & 4 deletions
@@ -5,8 +5,10 @@
 from pathlib import Path
 
 import pytest
+import torch
 import yaml
 from click.testing import CliRunner
+from utils.cpp_paths import llm_root  # noqa: F401
 
 from tensorrt_llm.commands.bench import main
 
@@ -43,7 +45,7 @@ def timeout_handler(signum, frame):
 
 
 @pytest.fixture(scope="module")
-def shared_dataset(llm_root):
+def shared_dataset(llm_root):  # noqa: F811
     """Prepare dataset once for all tests in this module."""
     model_name = "meta-llama/Llama-3.1-8B"
     with tempfile.TemporaryDirectory() as temp_dir:
@@ -102,14 +104,15 @@ def _prepare_dataset(root_dir: str, temp_dir: str, model_path_or_name: str, num_
         "NCCL",
     ],
 )
-def test_allreduce_strategies(llm_root, shared_dataset, allreduce_strategy):
+def test_allreduce_strategies(llm_root, shared_dataset, allreduce_strategy):  # noqa: F811
     """Test different allreduce strategies with multi-GPU configuration.
 
-    This test validates that all allreduce strategies work correctly with TP=4.
+    This test validates that all allreduce strategies work correctly with TP=2.
     Note: TWOSHOT strategy will automatically fall back to ONESHOT when sequence
     length is smaller than TP size during initialization.
 
     Test has a 300 second timeout to prevent indefinite hangs.
+    Test will be skipped if fewer than 2 GPUs are available.
 
     Args:
         llm_root: Root directory fixture
@@ -120,10 +123,13 @@ def test_allreduce_strategies(llm_root, shared_dataset, allreduce_strategy):
     TEST_TIMEOUT_SECONDS = 300
 
     model_name = "meta-llama/Llama-3.1-8B"
-    tp_size = 4
+    tp_size = 2
     max_batch_size = 256
     max_num_tokens = 8192
 
+    if not torch.cuda.is_available() or torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Allreduce strategy test requires at least {tp_size} GPUs, skipping")
+
     with tempfile.TemporaryDirectory() as temp_dir:
         # Write shared dataset to temp location
         dataset_path = Path(temp_dir, "synthetic_128_128.txt")
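
For readers skimming the diff, two patterns are worth calling out. First, importing a fixture such as llm_root from a shared helper module makes it visible to pytest but looks like an unused import to flake8 (F401), and reusing the same name as a test parameter looks like a redefinition (F811), hence the noqa markers added here. Second, the new guard skips the test instead of failing it on machines with fewer GPUs than the tensor-parallel degree. The sketch below is a minimal, hypothetical illustration of both patterns, not the actual test; the test name and its body are placeholders.

import pytest
import torch

# Imported only so pytest can discover the fixture; flake8 flags the import as
# unused (F401) and the same-named test parameter as a redefinition (F811),
# so both carry noqa markers, mirroring the change above.
from utils.cpp_paths import llm_root  # noqa: F401


def test_requires_two_gpus(llm_root):  # noqa: F811
    """Hypothetical test showing the skip-guard pattern from this commit."""
    tp_size = 2  # tensor-parallel degree the real test now targets

    # Skip (rather than fail) when the machine cannot provide enough GPUs.
    if not torch.cuda.is_available() or torch.cuda.device_count() < tp_size:
        pytest.skip(f"requires at least {tp_size} GPUs")

    # Placeholder assertion; the real test launches a TP=2 benchmark run here.
    assert llm_root is not None

Skipping rather than failing keeps single-GPU CI runners green while the multi-GPU path still runs on machines that can satisfy the requirement.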

Comments (0)