Skip to content

Commit a108deb

Browse files
committed
feat: add delete command, make cancel abort for k8s (#1156)
1 parent 1e0492c commit a108deb

File tree

7 files changed

+149
-3
lines changed

7 files changed

+149
-3
lines changed

torchx/cli/cmd_delete.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
# pyre-strict
9+
10+
import argparse
11+
import logging
12+
13+
from torchx.cli.cmd_base import SubCommand
14+
from torchx.runner import get_runner
15+
16+
logger: logging.Logger = logging.getLogger(__name__)
17+
18+
19+
class CmdDelete(SubCommand):
20+
def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
21+
subparser.add_argument(
22+
"app_handle",
23+
type=str,
24+
help="torchx app handle (e.g. local://session-name/app-id)",
25+
)
26+
27+
def run(self, args: argparse.Namespace) -> None:
28+
app_handle = args.app_handle
29+
runner = get_runner()
30+
runner.delete(app_handle)

torchx/cli/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from torchx.cli.cmd_base import SubCommand
1717
from torchx.cli.cmd_cancel import CmdCancel
1818
from torchx.cli.cmd_configure import CmdConfigure
19+
from torchx.cli.cmd_delete import CmdDelete
1920
from torchx.cli.cmd_describe import CmdDescribe
2021
from torchx.cli.cmd_list import CmdList
2122
from torchx.cli.cmd_log import CmdLog
@@ -37,6 +38,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
3738
"builtins": CmdBuiltins(),
3839
"cancel": CmdCancel(),
3940
"configure": CmdConfigure(),
41+
"delete": CmdDelete(),
4042
"describe": CmdDescribe(),
4143
"list": CmdList(),
4244
"log": CmdLog(),

torchx/cli/test/cmd_delete_test.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
# pyre-strict
9+
10+
import argparse
11+
import unittest
12+
from unittest.mock import MagicMock, patch
13+
14+
from torchx.cli.cmd_delete import CmdDelete
15+
16+
17+
class CmdDeleteTest(unittest.TestCase):
18+
@patch("torchx.runner.api.Runner.delete")
19+
def test_run(self, delete: MagicMock) -> None:
20+
parser = argparse.ArgumentParser()
21+
cmd_delete = CmdDelete()
22+
cmd_delete.add_arguments(parser)
23+
24+
args = parser.parse_args(["foo://session/id"])
25+
cmd_delete.run(args)
26+
27+
self.assertEqual(delete.call_count, 1)
28+
delete.assert_called_with("foo://session/id")

torchx/runner/api.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,16 @@ def cancel(self, app_handle: AppHandle) -> None:
579579
if status is not None and not status.is_terminal():
580580
scheduler.cancel(app_id)
581581

582+
def delete(self, app_handle: AppHandle) -> None:
583+
"""
584+
Deletes the application from the scheduler.
585+
"""
586+
scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
587+
with log_event("delete", scheduler_backend, app_id):
588+
status = self.status(app_handle)
589+
if status is not None and not status.is_terminal():
590+
scheduler.delete(app_id)
591+
582592
def stop(self, app_handle: AppHandle) -> None:
583593
"""
584594
See method ``cancel``.

torchx/schedulers/api.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,13 @@ def cancel(self, app_id: str) -> None:
264264
# do nothing if the app does not exist
265265
return
266266

267+
def delete(self, app_id: str) -> None:
268+
"""
269+
Deletes the application. By default, this calls cancel.
270+
Schedulers can override to provide delete-specific behavior.
271+
"""
272+
self.cancel(app_id)
273+
267274
def log_iter(
268275
self,
269276
app_id: str,

torchx/schedulers/kubernetes_scheduler.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,16 @@ class KubernetesScheduler(
622622
$ torchx status kubernetes://torchx_user/1234
623623
...
624624
625+
**Cancellation**
626+
627+
Canceling a job aborts it while preserving the job spec for inspection
628+
and cloning via kubectl apply. Use the delete command to remove the job entirely:
629+
630+
.. code-block:: bash
631+
632+
$ torchx cancel kubernetes://namespace/jobname # abort, preserves spec
633+
$ torchx delete kubernetes://namespace/jobname # delete completely
634+
625635
**Config Options**
626636
627637
.. runopts::
@@ -818,6 +828,33 @@ def _validate(self, app: AppDef, scheduler: str, cfg: KubernetesOpts) -> None:
818828
pass
819829

820830
def _cancel_existing(self, app_id: str) -> None:
831+
"""
832+
Abort a Volcano job while preserving the spec for inspection.
833+
"""
834+
namespace, name = app_id.split(":")
835+
vcjob = self._custom_objects_api().get_namespaced_custom_object(
836+
group="batch.volcano.sh",
837+
version="v1alpha1",
838+
namespace=namespace,
839+
plural="jobs",
840+
name=name,
841+
)
842+
vcjob["status"]["state"]["phase"] = "Aborted"
843+
self._custom_objects_api().replace_namespaced_custom_object_status(
844+
group="batch.volcano.sh",
845+
version="v1alpha1",
846+
namespace=namespace,
847+
plural="jobs",
848+
name=name,
849+
body=vcjob,
850+
)
851+
852+
def delete(self, app_id: str) -> None:
853+
"""
854+
Delete a Volcano job completely from the cluster.
855+
"""
856+
if not self.exists(app_id):
857+
return
821858
namespace, name = app_id.split(":")
822859
self._custom_objects_api().delete_namespaced_custom_object(
823860
group="batch.volcano.sh",

torchx/schedulers/test/kubernetes_scheduler_test.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -800,11 +800,26 @@ def test_runopts(self) -> None:
800800
},
801801
)
802802

803-
@patch("kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object")
804-
def test_cancel_existing(self, delete_namespaced_custom_object: MagicMock) -> None:
803+
@patch("kubernetes.client.CustomObjectsApi.get_namespaced_custom_object")
804+
@patch("kubernetes.client.CustomObjectsApi.replace_namespaced_custom_object_status")
805+
def test_cancel_existing(
806+
self,
807+
replace_namespaced_custom_object_status: MagicMock,
808+
get_namespaced_custom_object: MagicMock,
809+
) -> None:
805810
scheduler = create_scheduler("test")
811+
get_namespaced_custom_object.return_value = {
812+
"status": {"state": {"phase": "Running"}}
813+
}
806814
scheduler._cancel_existing("testnamespace:testjob")
807-
call = delete_namespaced_custom_object.call_args
815+
get_namespaced_custom_object.assert_called_once_with(
816+
group="batch.volcano.sh",
817+
version="v1alpha1",
818+
namespace="testnamespace",
819+
plural="jobs",
820+
name="testjob",
821+
)
822+
call = replace_namespaced_custom_object_status.call_args
808823
args, kwargs = call
809824
self.assertEqual(
810825
kwargs,
@@ -814,9 +829,26 @@ def test_cancel_existing(self, delete_namespaced_custom_object: MagicMock) -> No
814829
"namespace": "testnamespace",
815830
"plural": "jobs",
816831
"name": "testjob",
832+
"body": {"status": {"state": {"phase": "Aborted"}}},
817833
},
818834
)
819835

836+
@patch("kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object")
837+
@patch("torchx.schedulers.kubernetes_scheduler.KubernetesScheduler.exists")
838+
def test_delete(
839+
self, exists: MagicMock, delete_namespaced_custom_object: MagicMock
840+
) -> None:
841+
scheduler = create_scheduler("test")
842+
exists.return_value = True
843+
scheduler.delete("testnamespace:testjob")
844+
delete_namespaced_custom_object.assert_called_once_with(
845+
group="batch.volcano.sh",
846+
version="v1alpha1",
847+
namespace="testnamespace",
848+
plural="jobs",
849+
name="testjob",
850+
)
851+
820852
@patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")
821853
def test_list(self, list_namespaced_custom_object: MagicMock) -> None:
822854
with patch(

0 commit comments

Comments
 (0)