@@ -48,23 +48,75 @@ def with_git(
4848
@property
def git(self) -> str:
    """The specification for the source code from a Git repository.

    Returns
    -------
    str
        The value stored in the runtime spec under ``CONST_GIT``.
    """
    git_spec = self.get_spec(self.CONST_GIT)
    return git_spec
5253
def with_inputs(self, mappings: dict):
    """Specifies the input files to be copied into the job run.

    Parameters
    ----------
    mappings : dict
        Maps each source path (URI) to a destination path.
        A source can be an http/ftp link or an OCI object storage URI.
        The destination is a path in the job run, relative to the working directory.

    Returns
    -------
    self
        The runtime instance.

    Examples
    --------
    >>> pt_runtime.with_inputs({"oci://bucket@namespace/path/to/file.txt": "data/input.txt"})

    """
    # The mapping is stored as-is under the input key of the runtime spec.
    input_key = self.CONST_INPUT
    return self.set_spec(input_key, mappings)
5574
@property
def inputs(self) -> dict:
    """The input files to be copied into the job run.

    Returns
    -------
    dict
        The value stored in the runtime spec under ``CONST_INPUT``.
    """
    input_mappings = self.get_spec(self.CONST_INPUT)
    return input_mappings
5979
def with_replica(self, count: int):
    """Specifies the number of nodes (job runs) for the job.

    Parameters
    ----------
    count : int
        Number of nodes (job runs).

    Returns
    -------
    self
        The runtime instance.
    """
    # The count is stored as-is under the replica key of the runtime spec.
    replica_key = self.CONST_REPLICA
    return self.set_spec(replica_key, count)
6294
@property
def replica(self) -> int:
    """The number of nodes (job runs).

    Returns
    -------
    int
        The value stored in the runtime spec under ``CONST_REPLICA``.
    """
    node_count = self.get_spec(self.CONST_REPLICA)
    return node_count
6699
67100 def with_dependency (self , pip_req = None , pip_pkg = None ):
101+ """Specifies additional dependencies to be installed using pip.
102+
103+ Parameters
104+ ----------
105+ pip_req : str, optional
106+ Path of the requirements.txt file, relative to the working directory, by default None
107+ pip_pkg : str, optional
108+ Command line args for `pip install`, by default None.
109+ Packages with version specifications need to be quoted.
110+
111+ Returns
112+ -------
113+ self
114+ The runtime instance.
115+
116+ Examples
117+ --------
118+ >>> pt_runtime.with_dependency(pip_pkg='"package>1.0"')
119+ """
68120 dep = {}
69121 if pip_req :
70122 dep [self .CONST_PIP_REQ ] = pip_req
@@ -76,24 +128,72 @@ def with_dependency(self, pip_req=None, pip_pkg=None):
76128
@property
def dependencies(self) -> dict:
    """Additional pip dependencies.

    Returns
    -------
    dict
        The value stored in the runtime spec under ``CONST_DEP``.
    """
    dependency_spec = self.get_spec(self.CONST_DEP)
    return dependency_spec
80133
def with_command(self, command: str, use_deepspeed=False):
    """Specifies the command for launching the workload.

    Parameters
    ----------
    command : str
        The command for launching the workload.
        The command should start with `torchrun`, `deepspeed` or `accelerate launch`.

        For `torchrun`,
        ADS will set `--nnode`, `--nproc_per_node`, `--rdzv_backend` and `--rdzv_endpoint` automatically.
        The default `rdzv_backend` will be `c10d`.
        The default port for `rdzv_endpoint` is 29400.

        For `deepspeed`,
        ADS will generate the hostfile automatically and set up the SSH configurations.

        For `accelerate launch`,
        you can add your config YAML to the source code and specify it using the `--config_file` argument.
        In your config, please use `LOCAL_MACHINE` as the compute environment.
        The same config file will be used by all nodes in a multi-node workload.
        ADS will set `--num_processes`, `--num_machines`, `--machine_rank`, `--main_process_ip`
        and `--main_process_port` automatically. These values will override the ones from your config YAML.
        The default `main_process_port` is 29400.

        If you don't want to use the options set by ADS automatically,
        you can specify them explicitly in the command.

    use_deepspeed : bool, optional
        Indicates whether to configure deepspeed for a multi-node workload, by default False.
        If your command starts with "deepspeed" or contains the argument "--use_deepspeed",
        your job runs will be configured for deepspeed regardless of this setting.
        Make sure to set use_deepspeed to `True` here
        if you are using `accelerate launch` with a deepspeed setting in the config YAML.

    Returns
    -------
    self
        The runtime instance.

    Examples
    --------
    >>> pt_runtime.with_command("torchrun train.py")
    """
    # The deepspeed flag is only ever switched on here; a falsy value leaves
    # whatever is already stored in the spec untouched.
    deepspeed_requested = use_deepspeed
    if deepspeed_requested:
        self.set_spec(self.CONST_DEEPSPEED, True)

    command_key = self.CONST_COMMAND
    return self.set_spec(command_key, command)
85181
@property
def command(self):
    """The command for launching the workload.

    Returns the value stored in the runtime spec under ``CONST_COMMAND``.
    """
    launch_command = self.get_spec(self.CONST_COMMAND)
    return launch_command
89186
@property
def use_deepspeed(self):
    """Indicates whether to configure deepspeed for a multi-node workload.

    Returns
    -------
    bool
        True when the value stored under ``CONST_DEEPSPEED`` is truthy,
        False otherwise (including when it is missing or empty).
    """
    # Normalize whatever is stored in the spec to a strict boolean.
    return bool(self.get_spec(self.CONST_DEEPSPEED))
95193
96194 def run (self , dsc_job , ** kwargs ):
195+ """Starts the job runs
196+ """
97197 replicas = self .replica if self .replica else 1
98198 main_run = None
99199 for i in range (replicas ):
0 commit comments