@@ -270,6 +270,7 @@ def find_self_ip(self):
270270 ip = snics [0 ].address
271271 if ipaddress .ip_address (ip ) in ipaddress .ip_network (cidr ):
272272 logger .info ("Node IP address: %s" , ip )
273+ # Specify the network interface for NCCL/GLOO
273274 os .environ ["GLOO_SOCKET_IFNAME" ] = interface
274275 os .environ ["NCCL_SOCKET_IFNAME" ] = interface
275276 return ip
@@ -579,13 +580,22 @@ def save_deepspeed_env(self):
579580 # as the .deepspeed_env file is parsed line by line.
580581 if not v or "\n " in v :
581582 continue
583+ # Ignore variables that are node-specific.
584+ # The network interface name is a unique string for each job run, e.g. ens300f0v1604
585+ if k in ["NCCL_SOCKET_IFNAME" , "GLOO_SOCKET_IFNAME" , "JOB_RUN_OCID" ]:
586+ continue
582587 # Quote the value if it contains space
583588 # Environment variable containing space may not be exported correctly when using pdsh
584589 # https://github.com/microsoft/DeepSpeed/blob/v0.9.2/deepspeed/launcher/multinode_runner.py#L79
585590 if " " in v :
586591 v = shlex .quote (v )
587592
588593 f .write (f"{ k } ={ v } \n " )
594+ # The following are required for specifying the network interface to be used by NCCL/GLOO
595+ # The value should be the prefix of the expected network interface name
596+ # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-socket-ifname
597+ f .write ("NCCL_SOCKET_IFNAME=ens\n " )
598+ f .write ("GLOO_SOCKET_IFNAME=ens\n " )
589599 logger .debug ("Environment variables saved to %s" , self .ENV_FILE )
590600 self .run_command (f"cat { self .ENV_FILE } " )
591601
0 commit comments