6161from torchx .specs import AppDryRunInfo , AppDef
6262from torchx .runner import get_runner , Runner
6363from torchx .schedulers .ray_scheduler import RayJob
64+ from torchx .schedulers .kubernetes_mcad_scheduler import KubernetesMCADJob
6465import pytest
6566
6667
@@ -1686,6 +1687,40 @@ def test_DDPJobDefinition_dry_run():
16861687 assert ddp_job ._scheduler == "ray"
16871688
16881689
1690+ def test_DDPJobDefinition_dry_run_no_cluster ():
1691+ """
1692+ Test that the dry run method returns the correct type: AppDryRunInfo,
1693+ that the attributes of the returned object are of the correct type,
1694+ and that the values from cluster and job definition are correctly passed.
1695+ """
1696+ ddp = test_DDPJobDefinition_creation ()
1697+ ddp .image = "fake-image"
1698+ ddp_job = ddp ._dry_run_no_cluster ()
1699+ assert type (ddp_job ) == AppDryRunInfo
1700+ assert ddp_job ._fmt is not None
1701+ assert type (ddp_job .request ) == KubernetesMCADJob
1702+ assert type (ddp_job ._app ) == AppDef
1703+ assert type (ddp_job ._cfg ) == type (dict ())
1704+ assert type (ddp_job ._scheduler ) == type (str ())
1705+
1706+ assert (
1707+ ddp_job .request .resource ["spec" ]["resources" ]["GenericItems" ][0 ][
1708+ "generictemplate"
1709+ ]
1710+ .spec .containers [0 ]
1711+ .image
1712+ == "fake-image"
1713+ )
1714+
1715+ assert ddp_job ._app .roles [0 ].resource .cpu == 1
1716+ assert ddp_job ._app .roles [0 ].resource .gpu == 0
1717+ assert ddp_job ._app .roles [0 ].resource .memMB == 1024
1718+
1719+ assert ddp_job ._cfg ["requirements" ] == "test"
1720+
1721+ assert ddp_job ._scheduler == "kubernetes_mcad"
1722+
1723+
16891724def test_DDPJobDefinition_dry_run_no_resource_args ():
16901725 """
16911726 Test that the dry run correctly gets resources from the cluster object
@@ -1715,6 +1750,55 @@ def test_DDPJobDefinition_dry_run_no_resource_args():
17151750 )
17161751
17171752
1753+ def test_DDPJobDefinition_dry_run_no_cluster_no_resource_args ():
1754+ """
1755+ Test that the dry run method returns the correct type: AppDryRunInfo,
1756+ that the attributes of the returned object are of the correct type,
1757+ and that the values from cluster and job definition are correctly passed.
1758+ """
1759+ ddp = test_DDPJobDefinition_creation ()
1760+ try :
1761+ ddp ._dry_run_no_cluster ()
1762+ assert 0 == 1
1763+ except ValueError as e :
1764+ assert str (e ) == "Job definition missing arg: image"
1765+ ddp .image = "fake-image"
1766+ ddp .name = None
1767+ try :
1768+ ddp ._dry_run_no_cluster ()
1769+ assert 0 == 1
1770+ except ValueError as e :
1771+ assert str (e ) == "Job definition missing arg: name"
1772+ ddp .name = "fake"
1773+ ddp .cpu = None
1774+ try :
1775+ ddp ._dry_run_no_cluster ()
1776+ assert 0 == 1
1777+ except ValueError as e :
1778+ assert str (e ) == "Job definition missing arg: cpu (# cpus per worker)"
1779+ ddp .cpu = 1
1780+ ddp .gpu = None
1781+ try :
1782+ ddp ._dry_run_no_cluster ()
1783+ assert 0 == 1
1784+ except ValueError as e :
1785+ assert str (e ) == "Job definition missing arg: gpu (# gpus per worker)"
1786+ ddp .gpu = 1
1787+ ddp .memMB = None
1788+ try :
1789+ ddp ._dry_run_no_cluster ()
1790+ assert 0 == 1
1791+ except ValueError as e :
1792+ assert str (e ) == "Job definition missing arg: memMB (memory in MB)"
1793+ ddp .memMB = 1
1794+ ddp .j = None
1795+ try :
1796+ ddp ._dry_run_no_cluster ()
1797+ assert 0 == 1
1798+ except ValueError as e :
1799+ assert str (e ) == "Job definition missing arg: j (`workers`x`procs`)"
1800+
1801+
17181802def test_DDPJobDefinition_submit (mocker ):
17191803 """
17201804 Tests that the submit method returns the correct type: DDPJob
@@ -1733,6 +1817,14 @@ def test_DDPJobDefinition_submit(mocker):
17331817 assert type (ddp_job ._app_handle ) == str
17341818 assert ddp_job ._app_handle == "fake-dashboard-url"
17351819
1820+ ddp_def .image = "fake-image"
1821+ ddp_job = ddp_def .submit ()
1822+ assert type (ddp_job ) == DDPJob
1823+ assert type (ddp_job .job_definition ) == DDPJobDefinition
1824+ assert ddp_job .cluster == None
1825+ assert type (ddp_job ._app_handle ) == str
1826+ assert ddp_job ._app_handle == "fake-dashboard-url"
1827+
17361828
17371829def test_DDPJob_creation (mocker ):
17381830 ddp_def = test_DDPJobDefinition_creation ()
@@ -1757,6 +1849,29 @@ def test_DDPJob_creation(mocker):
17571849 return ddp_job
17581850
17591851
1852+ def test_DDPJob_creation_no_cluster (mocker ):
1853+ ddp_def = test_DDPJobDefinition_creation ()
1854+ ddp_def .image = "fake-image"
1855+ mocker .patch (
1856+ "codeflare_sdk.job.jobs.torchx_runner.schedule" ,
1857+ return_value = "fake-app-handle" ,
1858+ ) # a fake app_handle
1859+ ddp_job = DDPJob (ddp_def , None )
1860+ assert type (ddp_job ) == DDPJob
1861+ assert type (ddp_job .job_definition ) == DDPJobDefinition
1862+ assert ddp_job .cluster == None
1863+ assert type (ddp_job ._app_handle ) == str
1864+ assert ddp_job ._app_handle == "fake-app-handle"
1865+ _ , args , kwargs = torchx_runner .schedule .mock_calls [0 ]
1866+ assert type (args [0 ]) == AppDryRunInfo
1867+ job_info = args [0 ]
1868+ assert type (job_info .request ) == KubernetesMCADJob
1869+ assert type (job_info ._app ) == AppDef
1870+ assert type (job_info ._cfg ) == type (dict ())
1871+ assert type (job_info ._scheduler ) == type (str ())
1872+ return ddp_job
1873+
1874+
17601875def test_DDPJob_status (mocker ):
17611876 ddp_job = test_DDPJob_creation (mocker )
17621877 mocker .patch (
@@ -1777,6 +1892,18 @@ def test_DDPJob_logs(mocker):
17771892 assert args [0 ] == "fake-dashboard-url"
17781893
17791894
1895+ def arg_check_side_effect (* args ):
1896+ assert args [0 ] == "fake-app-handle"
1897+
1898+
1899+ def test_DDPJob_cancel (mocker ):
1900+ ddp_job = test_DDPJob_creation_no_cluster (mocker )
1901+ mocker .patch (
1902+ "codeflare_sdk.job.jobs.torchx_runner.cancel" , side_effect = arg_check_side_effect
1903+ )
1904+ ddp_job .cancel ()
1905+
1906+
17801907def parse_j (cmd ):
17811908
17821909 pattern = r"--nnodes\s+\d+\s+--nproc_per_node\s+\d+"
0 commit comments