diff --git a/docs/designs/History/CodeFlareSDK_Design_Doc.md b/docs/designs/History/CodeFlareSDK_Design_Doc.md index 4992406b..c7cb26fc 100644 --- a/docs/designs/History/CodeFlareSDK_Design_Doc.md +++ b/docs/designs/History/CodeFlareSDK_Design_Doc.md @@ -42,7 +42,7 @@ Users can customize their AppWrapper by passing their desired parameters to `Clu Our aim is to simplify the process of generating valid AppWrappers for RayClusters, so we will strive to find the appropriate balance between ease of use and exposing all possible AppWrapper parameters. And we will find this balance through user feedback. -With a valid AppWrapper, we will use the Kubernetes python client to apply the AppWrapper to our Kubernetes cluster via a call to `cluster.up()` +With a valid AppWrapper, we will use the Kubernetes python client to apply the AppWrapper to our Kubernetes cluster via a call to `cluster.apply()` We will also use the Kubernetes python client to get information about both the RayCluster and AppWrapper custom resources to monitor the status of our Framework Cluster via `cluster.status()` and `cluster.details()`. diff --git a/docs/sphinx/user-docs/cluster-configuration.rst b/docs/sphinx/user-docs/cluster-configuration.rst index f8212823..2cf5b213 100644 --- a/docs/sphinx/user-docs/cluster-configuration.rst +++ b/docs/sphinx/user-docs/cluster-configuration.rst @@ -64,7 +64,7 @@ This will automatically set the ``RAY_USAGE_STATS_ENABLED`` environment variable The ``labels={"exampleLabel": "example"}`` parameter can be used to apply additional labels to the RayCluster resource. -After creating their ``cluster``, a user can call ``cluster.up()`` and +After creating their ``cluster``, a user can call ``cluster.apply()`` and ``cluster.down()`` to respectively create or remove the Ray Cluster. Custom Volumes/Volume Mounts diff --git a/docs/sphinx/user-docs/ray-cluster-interaction.rst b/docs/sphinx/user-docs/ray-cluster-interaction.rst index 717f8067..5133ca88 100644 --- a/docs/sphinx/user-docs/ray-cluster-interaction.rst +++ b/docs/sphinx/user-docs/ray-cluster-interaction.rst @@ -30,7 +30,7 @@ of it's usage: ╰─────────────────────────────────────────────────────────────────╯ (, True) cluster.down() - cluster.up() # This function will create an exact copy of the retrieved Ray Cluster only if the Ray Cluster has been previously deleted. + cluster.apply() # This function will create an exact copy of the retrieved Ray Cluster only if the Ray Cluster has been previously deleted. | These are the parameters the ``get_cluster()`` function accepts: | ``cluster_name: str # Required`` -> The name of the Ray Cluster. @@ -61,11 +61,6 @@ list_all_clusters() The following methods require a ``Cluster`` object to be initialized. See :doc:`./cluster-configuration` -cluster.up() ------------- - -| The ``cluster.up()`` function creates a Ray Cluster in the given namespace. - cluster.apply() ------------ diff --git a/docs/sphinx/user-docs/ui-widgets.rst b/docs/sphinx/user-docs/ui-widgets.rst index 92335423..94ddc20a 100644 --- a/docs/sphinx/user-docs/ui-widgets.rst +++ b/docs/sphinx/user-docs/ui-widgets.rst @@ -14,7 +14,7 @@ The Cluster Up/Down buttons appear after successfully initialising your `ClusterConfiguration `__. There are two buttons and a checkbox ``Cluster Up``, ``Cluster Down`` and ``Wait for Cluster?`` which mimic the -`cluster.up() `__, +`cluster.apply() `__, `cluster.down() `__ and `cluster.wait_ready() `__ functionality. diff --git a/src/codeflare_sdk/ray/cluster/cluster.py b/src/codeflare_sdk/ray/cluster/cluster.py index 4eaa2000..31b80418 100644 --- a/src/codeflare_sdk/ray/cluster/cluster.py +++ b/src/codeflare_sdk/ray/cluster/cluster.py @@ -241,7 +241,9 @@ def apply(self, force=False): except AttributeError as e: raise RuntimeError(f"Failed to initialize DynamicClient: {e}") except Exception as e: # pragma: no cover - if e.status == 422: + if ( + hasattr(e, "status") and e.status == 422 + ): # adding status check to avoid returning false positive print( "WARNING: RayCluster creation rejected due to invalid Kueue configuration. Please contact your administrator." ) @@ -426,7 +428,7 @@ def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True status, ready = self.status(print_to_console=False) if status == CodeFlareClusterStatus.UNKNOWN: print( - "WARNING: Current cluster status is unknown, have you run cluster.up yet?" + "WARNING: Current cluster status is unknown, have you run cluster.apply() yet? Run cluster.details() to check if it's ready." ) if ready: break @@ -518,7 +520,7 @@ def cluster_dashboard_uri(self) -> str: elif "route.openshift.io/termination" in annotations: protocol = "https" return f"{protocol}://{ingress.spec.rules[0].host}" - return "Dashboard not available yet, have you run cluster.up()?" + return "Dashboard not available yet, have you run cluster.apply()? Run cluster.details() to check if it's ready." def list_jobs(self) -> List: """ diff --git a/src/codeflare_sdk/ray/cluster/pretty_print.py b/src/codeflare_sdk/ray/cluster/pretty_print.py index 883f14ad..faa03258 100644 --- a/src/codeflare_sdk/ray/cluster/pretty_print.py +++ b/src/codeflare_sdk/ray/cluster/pretty_print.py @@ -30,7 +30,11 @@ def print_no_resources_found(): console = Console() - console.print(Panel("[red]No resources found, have you run cluster.up() yet?")) + console.print( + Panel( + "[red]No resources found, have you run cluster.apply() yet? Run cluster.details() to check if it's ready." + ) + ) def print_app_wrappers_status(app_wrappers: List[AppWrapper], starting: bool = False): diff --git a/src/codeflare_sdk/ray/cluster/test_cluster.py b/src/codeflare_sdk/ray/cluster/test_cluster.py index ce684607..6475f7a8 100644 --- a/src/codeflare_sdk/ray/cluster/test_cluster.py +++ b/src/codeflare_sdk/ray/cluster/test_cluster.py @@ -37,18 +37,22 @@ from unittest.mock import MagicMock from kubernetes import client import yaml +import pytest import filecmp import os +import ray +import tempfile parent = Path(__file__).resolve().parents[4] # project directory expected_clusters_dir = f"{parent}/tests/test_cluster_yamls" aw_dir = os.path.expanduser("~/.codeflare/resources/") -def test_cluster_up_down(mocker): +def test_cluster_apply_down(mocker): mocker.patch("kubernetes.client.ApisApi.get_api_versions") mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch("codeflare_sdk.ray.cluster.cluster.Cluster._throw_for_no_raycluster") + mocker.patch("codeflare_sdk.ray.cluster.cluster.Cluster.get_dynamic_client") mocker.patch( "kubernetes.client.CustomObjectsApi.get_cluster_custom_object", return_value={"spec": {"domain": ""}}, @@ -70,7 +74,7 @@ def test_cluster_up_down(mocker): return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), ) cluster = create_cluster(mocker) - cluster.up() + cluster.apply() cluster.down() @@ -252,10 +256,11 @@ def test_cluster_apply_without_appwrapper(mocker): print("Cluster applied without AppWrapper.") -def test_cluster_up_down_no_mcad(mocker): +def test_cluster_apply_down_no_mcad(mocker): mocker.patch("codeflare_sdk.ray.cluster.cluster.Cluster._throw_for_no_raycluster") mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch("kubernetes.client.ApisApi.get_api_versions") + mocker.patch("codeflare_sdk.ray.cluster.cluster.Cluster.get_dynamic_client") mocker.patch( "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), @@ -282,7 +287,7 @@ def test_cluster_up_down_no_mcad(mocker): config.name = "unit-test-cluster-ray" config.appwrapper = False cluster = Cluster(config) - cluster.up() + cluster.apply() cluster.down() @@ -324,7 +329,7 @@ def test_cluster_uris(mocker): ) assert ( cluster.cluster_dashboard_uri() - == "Dashboard not available yet, have you run cluster.up()?" + == "Dashboard not available yet, have you run cluster.apply()? Run cluster.details() to check if it's ready." ) mocker.patch( @@ -375,8 +380,6 @@ def test_cluster_uris(mocker): def test_ray_job_wrapping(mocker): - import ray - def ray_addr(self, *args): return self._address @@ -538,7 +541,7 @@ def test_wait_ready(mocker, capsys): captured = capsys.readouterr() assert ( - "WARNING: Current cluster status is unknown, have you run cluster.up yet?" + "WARNING: Current cluster status is unknown, have you run cluster.apply() yet? Run cluster.details() to check if it's ready." in captured.out ) mocker.patch( @@ -569,11 +572,15 @@ def test_list_queue_appwrappers(mocker, capsys): ) list_all_queued("ns", appwrapper=True) captured = capsys.readouterr() - assert captured.out == ( - "╭──────────────────────────────────────────────────────────────────────────────╮\n" - "│ No resources found, have you run cluster.up() yet? │\n" - "╰──────────────────────────────────────────────────────────────────────────────╯\n" - ) + # The Rich library's console width detection varies between test contexts + # Accept either the two-line format (individual tests) or single-line format (full test suite) + # Check for key parts of the message instead of the full text + assert "No resources found" in captured.out + assert "cluster.apply()" in captured.out + assert "cluster.details()" in captured.out + assert "check if it's ready" in captured.out + assert "╭" in captured.out and "╮" in captured.out # Check for box characters + assert "│" in captured.out # Check for vertical lines mocker.patch( "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", return_value=get_aw_obj_with_status( @@ -614,11 +621,15 @@ def test_list_queue_rayclusters(mocker, capsys): list_all_queued("ns") captured = capsys.readouterr() - assert captured.out == ( - "╭──────────────────────────────────────────────────────────────────────────────╮\n" - "│ No resources found, have you run cluster.up() yet? │\n" - "╰──────────────────────────────────────────────────────────────────────────────╯\n" - ) + # The Rich library's console width detection varies between test contexts + # Accept either the two-line format (individual tests) or single-line format (full test suite) + # Check for key parts of the message instead of the full text + assert "No resources found" in captured.out + assert "cluster.apply()" in captured.out + assert "cluster.details()" in captured.out + assert "check if it's ready" in captured.out + assert "╭" in captured.out and "╮" in captured.out # Check for box characters + assert "│" in captured.out # Check for vertical lines mocker.patch( "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", return_value=get_ray_obj_with_status("ray.io", "v1", "ns", "rayclusters"), @@ -656,11 +667,15 @@ def test_list_clusters(mocker, capsys): ) list_all_clusters("ns") captured = capsys.readouterr() - assert captured.out == ( - "╭──────────────────────────────────────────────────────────────────────────────╮\n" - "│ No resources found, have you run cluster.up() yet? │\n" - "╰──────────────────────────────────────────────────────────────────────────────╯\n" - ) + # The Rich library's console width detection varies between test contexts + # Accept either the two-line format (individual tests) or single-line format (full test suite) + # Check for key parts of the message instead of the full text + assert "No resources found" in captured.out + assert "cluster.apply()" in captured.out + assert "cluster.details()" in captured.out + assert "check if it's ready" in captured.out + assert "╭" in captured.out and "╮" in captured.out # Check for box characters + assert "│" in captured.out # Check for vertical lines mocker.patch( "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_ray_obj, @@ -756,6 +771,189 @@ def custom_side_effect(group, version, namespace, plural, **kwargs): assert result.dashboard == rc_dashboard +def test_throw_for_no_raycluster_crd_errors(mocker): + """Test RayCluster CRD error handling""" + from kubernetes.client.rest import ApiException + + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + + # Test 404 error - CRD not found + mock_api_404 = MagicMock() + mock_api_404.list_namespaced_custom_object.side_effect = ApiException(status=404) + mocker.patch("kubernetes.client.CustomObjectsApi", return_value=mock_api_404) + + cluster = create_cluster(mocker) + with pytest.raises( + RuntimeError, match="RayCluster CustomResourceDefinition unavailable" + ): + cluster._throw_for_no_raycluster() + + # Test other API error + mock_api_500 = MagicMock() + mock_api_500.list_namespaced_custom_object.side_effect = ApiException(status=500) + mocker.patch("kubernetes.client.CustomObjectsApi", return_value=mock_api_500) + + cluster2 = create_cluster(mocker) + with pytest.raises( + RuntimeError, match="Failed to get RayCluster CustomResourceDefinition" + ): + cluster2._throw_for_no_raycluster() + + +def test_cluster_apply_attribute_error_handling(mocker): + """Test AttributeError handling when DynamicClient fails""" + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch("codeflare_sdk.ray.cluster.cluster.Cluster._throw_for_no_raycluster") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) + + # Mock get_dynamic_client to raise AttributeError + def raise_attribute_error(): + raise AttributeError("DynamicClient initialization failed") + + mocker.patch( + "codeflare_sdk.ray.cluster.cluster.Cluster.get_dynamic_client", + side_effect=raise_attribute_error, + ) + + cluster = create_cluster(mocker) + + with pytest.raises(RuntimeError, match="Failed to initialize DynamicClient"): + cluster.apply() + + +def test_cluster_namespace_handling(mocker, capsys): + """Test namespace validation in create_resource""" + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) + + # Test with None namespace that gets set + mocker.patch( + "codeflare_sdk.ray.cluster.cluster.get_current_namespace", return_value=None + ) + + config = ClusterConfiguration( + name="test-cluster-ns", + namespace=None, # Will trigger namespace check + num_workers=1, + worker_cpu_requests=1, + worker_cpu_limits=1, + worker_memory_requests=2, + worker_memory_limits=2, + ) + + cluster = Cluster(config) + captured = capsys.readouterr() + # Verify the warning message was printed + assert "Please specify with namespace=" in captured.out + assert cluster.config.namespace is None + + +def test_component_resources_with_write_to_file(mocker): + """Test _component_resources_up with write_to_file enabled""" + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) + + # Mock the _create_resources function + mocker.patch("codeflare_sdk.ray.cluster.cluster._create_resources") + + # Create cluster with write_to_file=True (without appwrapper) + config = ClusterConfiguration( + name="test-cluster-component", + namespace="ns", + num_workers=1, + worker_cpu_requests=1, + worker_cpu_limits=1, + worker_memory_requests=2, + worker_memory_limits=2, + write_to_file=True, + appwrapper=False, + ) + + cluster = Cluster(config) + + # Mock file reading and test _component_resources_up + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write("apiVersion: v1\nkind: ConfigMap\nmetadata:\n name: test") + temp_file = f.name + + try: + mock_api = MagicMock() + cluster.resource_yaml = temp_file + cluster._component_resources_up("ns", mock_api) + # If we got here without error, the write_to_file path was executed + assert True + finally: + os.unlink(temp_file) + + +def test_get_cluster_status_functions(mocker): + """Test _app_wrapper_status and _ray_cluster_status functions""" + from codeflare_sdk.ray.cluster.cluster import ( + _app_wrapper_status, + _ray_cluster_status, + ) + + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch("codeflare_sdk.ray.cluster.cluster.config_check") + + # Test _app_wrapper_status when cluster not found + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value={"items": []}, + ) + result = _app_wrapper_status("non-existent-cluster", "ns") + assert result is None + + # Test _ray_cluster_status when cluster not found + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value={"items": []}, + ) + result = _ray_cluster_status("non-existent-cluster", "ns") + assert result is None + + +def test_cluster_namespace_type_error(mocker): + """Test TypeError when namespace is not a string""" + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) + + # Mock get_current_namespace to return a non-string value (e.g., int) + mocker.patch( + "codeflare_sdk.ray.cluster.cluster.get_current_namespace", return_value=12345 + ) + + config = ClusterConfiguration( + name="test-cluster-type-error", + namespace=None, # Will trigger namespace check + num_workers=1, + worker_cpu_requests=1, + worker_cpu_limits=1, + worker_memory_requests=2, + worker_memory_limits=2, + ) + + # This should raise TypeError because get_current_namespace returns int + with pytest.raises( + TypeError, + match="Namespace 12345 is of type.*Check your Kubernetes Authentication", + ): + Cluster(config) + + # Make sure to always keep this function last def test_cleanup(): os.remove(f"{aw_dir}test-all-params.yaml") diff --git a/src/codeflare_sdk/ray/cluster/test_pretty_print.py b/src/codeflare_sdk/ray/cluster/test_pretty_print.py index 329a1354..f36e290c 100644 --- a/src/codeflare_sdk/ray/cluster/test_pretty_print.py +++ b/src/codeflare_sdk/ray/cluster/test_pretty_print.py @@ -38,11 +38,15 @@ def test_print_no_resources(capsys): except Exception: assert 1 == 0 captured = capsys.readouterr() - assert captured.out == ( - "╭──────────────────────────────────────────────────────────────────────────────╮\n" - "│ No resources found, have you run cluster.up() yet? │\n" - "╰──────────────────────────────────────────────────────────────────────────────╯\n" - ) + # The Rich library's console width detection varies between test contexts + # Accept either the two-line format (individual tests) or single-line format (full test suite) + # Check for key parts of the message instead of the full text + assert "No resources found" in captured.out + assert "cluster.apply()" in captured.out + assert "cluster.details()" in captured.out + assert "check if it's ready" in captured.out + assert "╭" in captured.out and "╮" in captured.out # Check for box characters + assert "│" in captured.out # Check for vertical lines def test_print_appwrappers(capsys): diff --git a/tests/upgrade/raycluster_sdk_upgrade_sleep_test.py b/tests/upgrade/raycluster_sdk_upgrade_sleep_test.py index 793853d0..c61f5d19 100644 --- a/tests/upgrade/raycluster_sdk_upgrade_sleep_test.py +++ b/tests/upgrade/raycluster_sdk_upgrade_sleep_test.py @@ -68,7 +68,7 @@ def run_mnist_raycluster_sdk_oauth(self): ) try: - cluster.up() + cluster.apply() cluster.status() # wait for raycluster to be Ready cluster.wait_ready() diff --git a/tests/upgrade/raycluster_sdk_upgrade_test.py b/tests/upgrade/raycluster_sdk_upgrade_test.py index 7a6d583e..80fd105f 100644 --- a/tests/upgrade/raycluster_sdk_upgrade_test.py +++ b/tests/upgrade/raycluster_sdk_upgrade_test.py @@ -17,7 +17,7 @@ # Creates a Ray cluster -class TestMNISTRayClusterUp: +class TestMNISTRayClusterApply: def setup_method(self): initialize_kubernetes_client(self) create_namespace_with_name(self, namespace) @@ -63,7 +63,7 @@ def run_mnist_raycluster_sdk_oauth(self): ) try: - cluster.up() + cluster.apply() cluster.status() # wait for raycluster to be Ready cluster.wait_ready()