2 files changed: +13 −3 lines changed
1212from torch .nn .parallel import DistributedDataParallel as DDP
1313
14+ def verify_min_gpu_count (min_gpus : int = 2 ) -> bool :
15+ """ verification that we have at least 2 gpus to run dist examples """
16+ has_gpu = torch .accelerator .is_available ()
17+ gpu_count = torch .accelerator .device_count ()
18+ return has_gpu and gpu_count >= min_gpus
19+
1420class ToyModel (nn .Module ):
1521 def __init__ (self ):
1622 super (ToyModel , self ).__init__ ()
@@ -88,4 +94,8 @@ def main():
8894 dist .destroy_process_group ()
8995
if __name__ == "__main__":
    # DDP needs at least two devices; bail out gracefully when the host
    # cannot satisfy that requirement instead of crashing inside torchrun.
    _min_gpu_count = 2
    if verify_min_gpu_count(min_gpus=_min_gpu_count):
        main()
    else:
        print(f"Unable to locate sufficient {_min_gpu_count} gpus to run this example. Exiting.")
        sys.exit()
#!/bin/bash
# bash run_example.sh {file_to_run.py} {num_gpus}
# where file_to_run = example to run. Default = 'example.py'
# num_gpus = num local gpus to use (must be at least 2). Default = 2
#
# NOTE: the original first line was "# /bin/bash" — a plain comment, not a
# shebang — so the script's interpreter was whatever the caller's shell
# defaulted to. Fixed to a proper "#!/bin/bash" shebang.

# samples to run include:
#   example.py

echo "Launching ${1:-example.py} with ${2:-2} gpus"
torchrun --nnodes=1 --nproc_per_node="${2:-2}" --rdzv_id=101 --rdzv_endpoint="localhost:5972" "${1:-example.py}"
You can’t perform that action at this time.
0 commit comments