@@ -45,6 +45,7 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
4545from typing import TYPE_CHECKING, Optional, Dict, List
4646from pathlib import Path
4747
48+ import openshift as oc
4849from torchx.components.dist import ddp
4950from torchx.runner import get_runner
5051from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
@@ -88,8 +89,10 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
8889 max_retries: int = 0,
8990 mounts: Optional[List[str]] = None,
9091 rdzv_port: int = 29500,
92+ rdzv_backend: str = None,
9193 scheduler_args: Optional[Dict[str, str]] = None,
9294 image: Optional[str] = None,
95+ workspace: Optional[str] = f"file://{Path.cwd()}",
9396 ):
9497 if bool(script) == bool(m): # logical XOR
9598 raise ValueError(
@@ -108,10 +111,12 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
108111 self.max_retries = max_retries
109112 self.mounts: List[str] = mounts if mounts is not None else []
110113 self.rdzv_port = rdzv_port
114+ self.rdzv_backend = rdzv_backend
111115 self.scheduler_args: Dict[str, str] = (
112116 scheduler_args if scheduler_args is not None else dict()
113117 )
114118 self.image = image
119+ self.workspace = workspace
115120
116121 def _dry_run(self, cluster: "Cluster"):
117122 j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}" # # of proc. = # of gpus
@@ -131,17 +136,23 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
131136 env=self.env,
132137 max_retries=self.max_retries,
133138 rdzv_port=self.rdzv_port,
139+ rdzv_backend=self.rdzv_backend
140+ if self.rdzv_backend is not None
141+ else "static",
134142 mounts=self.mounts,
135143 ),
136144 scheduler=cluster.torchx_scheduler,
137145 cfg=cluster.torchx_config(**self.scheduler_args),
138- workspace=f"file://{Path.cwd()}" ,
146+ workspace=self.workspace ,
139147 )
140148
141149 def _missing_spec(self, spec: str):
142150 raise ValueError(f"Job definition missing arg: {spec}")
143151
144152 def _dry_run_no_cluster(self):
153+ if self.scheduler_args is not None:
154+ if self.scheduler_args.get("namespace") is None:
155+ self.scheduler_args["namespace"] = oc.get_project_name()
145156 return torchx_runner.dryrun(
146157 app=ddp(
147158 *self.script_args,
@@ -166,13 +177,16 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
166177 env=self.env, # should this still exist?
167178 max_retries=self.max_retries,
168179 rdzv_port=self.rdzv_port, # should this still exist?
180+ rdzv_backend=self.rdzv_backend
181+ if self.rdzv_backend is not None
182+ else "c10d",
169183 mounts=self.mounts,
170184 image=self.image
171185 if self.image is not None
172186 else self._missing_spec("image"),
173187 ),
174188 scheduler="kubernetes_mcad",
175- cfg=self.scheduler_args if self.scheduler_args is not None else None ,
189+ cfg=self.scheduler_args,
176190 workspace="",
177191 )
178192
@@ -291,7 +305,7 @@ <h3>Methods</h3>
291305</dd>
292306<dt id="codeflare_sdk.job.jobs.DDPJobDefinition"><code class="flex name class">
293307<span>class <span class="ident">DDPJobDefinition</span></span>
294- <span>(</span><span>script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None)</span>
308+ <span>(</span><span>script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, rdzv_backend: str = None, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None, workspace: Optional[str] = 'file:///home/meyceoz/Documents/codeflare-sdk')</span>
295309</code></dt>
296310<dd>
297311<div class="desc"></div>
@@ -315,8 +329,10 @@ <h3>Methods</h3>
315329 max_retries: int = 0,
316330 mounts: Optional[List[str]] = None,
317331 rdzv_port: int = 29500,
332+ rdzv_backend: str = None,
318333 scheduler_args: Optional[Dict[str, str]] = None,
319334 image: Optional[str] = None,
335+ workspace: Optional[str] = f"file://{Path.cwd()}",
320336 ):
321337 if bool(script) == bool(m): # logical XOR
322338 raise ValueError(
@@ -335,10 +351,12 @@ <h3>Methods</h3>
335351 self.max_retries = max_retries
336352 self.mounts: List[str] = mounts if mounts is not None else []
337353 self.rdzv_port = rdzv_port
354+ self.rdzv_backend = rdzv_backend
338355 self.scheduler_args: Dict[str, str] = (
339356 scheduler_args if scheduler_args is not None else dict()
340357 )
341358 self.image = image
359+ self.workspace = workspace
342360
343361 def _dry_run(self, cluster: "Cluster"):
344362 j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}" # # of proc. = # of gpus
@@ -358,17 +376,23 @@ <h3>Methods</h3>
358376 env=self.env,
359377 max_retries=self.max_retries,
360378 rdzv_port=self.rdzv_port,
379+ rdzv_backend=self.rdzv_backend
380+ if self.rdzv_backend is not None
381+ else "static",
361382 mounts=self.mounts,
362383 ),
363384 scheduler=cluster.torchx_scheduler,
364385 cfg=cluster.torchx_config(**self.scheduler_args),
365- workspace=f"file://{Path.cwd()}" ,
386+ workspace=self.workspace ,
366387 )
367388
368389 def _missing_spec(self, spec: str):
369390 raise ValueError(f"Job definition missing arg: {spec}")
370391
371392 def _dry_run_no_cluster(self):
393+ if self.scheduler_args is not None:
394+ if self.scheduler_args.get("namespace") is None:
395+ self.scheduler_args["namespace"] = oc.get_project_name()
372396 return torchx_runner.dryrun(
373397 app=ddp(
374398 *self.script_args,
@@ -393,13 +417,16 @@ <h3>Methods</h3>
393417 env=self.env, # should this still exist?
394418 max_retries=self.max_retries,
395419 rdzv_port=self.rdzv_port, # should this still exist?
420+ rdzv_backend=self.rdzv_backend
421+ if self.rdzv_backend is not None
422+ else "c10d",
396423 mounts=self.mounts,
397424 image=self.image
398425 if self.image is not None
399426 else self._missing_spec("image"),
400427 ),
401428 scheduler="kubernetes_mcad",
402- cfg=self.scheduler_args if self.scheduler_args is not None else None ,
429+ cfg=self.scheduler_args,
403430 workspace="",
404431 )
405432
0 commit comments