From 2d2cea60f72baf2ec88cc532a96c0617a30c000f Mon Sep 17 00:00:00 2001 From: kryanbeane Date: Tue, 12 Aug 2025 21:28:35 +0100 Subject: [PATCH 1/2] feat(RHOAIENG-26482): add gcs fault tolerance --- src/codeflare_sdk/ray/rayjobs/__init__.py | 1 + src/codeflare_sdk/ray/rayjobs/config.py | 62 +++++++++++++++++++- src/codeflare_sdk/ray/rayjobs/rayjob.py | 1 - src/codeflare_sdk/ray/rayjobs/test_rayjob.py | 21 +++++++ 4 files changed, 83 insertions(+), 2 deletions(-) diff --git a/src/codeflare_sdk/ray/rayjobs/__init__.py b/src/codeflare_sdk/ray/rayjobs/__init__.py index 756fad91..c415c606 100644 --- a/src/codeflare_sdk/ray/rayjobs/__init__.py +++ b/src/codeflare_sdk/ray/rayjobs/__init__.py @@ -1,2 +1,3 @@ from .rayjob import RayJob, RayJobClusterConfig from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo +from .config import RayJobClusterConfig diff --git a/src/codeflare_sdk/ray/rayjobs/config.py b/src/codeflare_sdk/ray/rayjobs/config.py index 24c89a64..0d61753c 100644 --- a/src/codeflare_sdk/ray/rayjobs/config.py +++ b/src/codeflare_sdk/ray/rayjobs/config.py @@ -13,7 +13,7 @@ # limitations under the License. """ -The config sub-module contains the definition of the RayJobClusterConfigV2 dataclass, +The config sub-module contains the definition of the RayJobClusterConfig dataclass, which is used to specify resource requirements and other details when creating a Cluster object. """ @@ -139,6 +139,14 @@ class RayJobClusterConfig: A list of V1Volume objects to add to the Cluster volume_mounts: A list of V1VolumeMount objects to add to the Cluster + enable_gcs_ft: + A boolean indicating whether to enable GCS fault tolerance. + redis_address: + The address of the Redis server to use for GCS fault tolerance, required when enable_gcs_ft is True. + redis_password_secret: + Kubernetes secret reference containing Redis password. ex: {"name": "secret-name", "key": "password-key"} + external_storage_namespace: + The storage namespace to use for GCS fault tolerance. By default, KubeRay sets it to the UID of RayCluster. """ head_cpu_requests: Union[int, str] = 2 @@ -165,8 +173,33 @@ class RayJobClusterConfig: annotations: Dict[str, str] = field(default_factory=dict) volumes: list[V1Volume] = field(default_factory=list) volume_mounts: list[V1VolumeMount] = field(default_factory=list) + enable_gcs_ft: bool = False + redis_address: Optional[str] = None + redis_password_secret: Optional[Dict[str, str]] = None + external_storage_namespace: Optional[str] = None def __post_init__(self): + if self.enable_gcs_ft: + if not self.redis_address: + raise ValueError( + "redis_address must be provided when enable_gcs_ft is True" + ) + + if self.redis_password_secret and not isinstance( + self.redis_password_secret, dict + ): + raise ValueError( + "redis_password_secret must be a dictionary with 'name' and 'key' fields" + ) + + if self.redis_password_secret and ( + "name" not in self.redis_password_secret + or "key" not in self.redis_password_secret + ): + raise ValueError( + "redis_password_secret must contain both 'name' and 'key' fields" + ) + self._validate_types() self._memory_to_string() self._validate_gpu_config(self.head_accelerators) @@ -251,6 +284,11 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]: "workerGroupSpecs": [self._build_worker_group_spec(cluster_name)], } + # Add GCS fault tolerance if enabled + if self.enable_gcs_ft: + gcs_ft_options = self._build_gcs_ft_options() + ray_cluster_spec["gcsFaultToleranceOptions"] = gcs_ft_options + return ray_cluster_spec def _build_head_group_spec(self) -> Dict[str, Any]: @@ -453,3 +491,25 @@ def _generate_volumes(self) -> list: def _build_env_vars(self) -> list: """Build environment variables list.""" return [V1EnvVar(name=key, value=value) for key, value in self.envs.items()] + + def _build_gcs_ft_options(self) -> Dict[str, Any]: + """Build GCS fault tolerance options.""" + gcs_ft_options = {"redisAddress": self.redis_address} + + if ( + hasattr(self, "external_storage_namespace") + and self.external_storage_namespace + ): + gcs_ft_options["externalStorageNamespace"] = self.external_storage_namespace + + if hasattr(self, "redis_password_secret") and self.redis_password_secret: + gcs_ft_options["redisPassword"] = { + "valueFrom": { + "secretKeyRef": { + "name": self.redis_password_secret["name"], + "key": self.redis_password_secret["key"], + } + } + } + + return gcs_ft_options diff --git a/src/codeflare_sdk/ray/rayjobs/rayjob.py b/src/codeflare_sdk/ray/rayjobs/rayjob.py index ab0899d2..93b3ed71 100644 --- a/src/codeflare_sdk/ray/rayjobs/rayjob.py +++ b/src/codeflare_sdk/ray/rayjobs/rayjob.py @@ -140,7 +140,6 @@ def __init__( self.cluster_name = cluster_name logger.info(f"Using existing cluster: {self.cluster_name}") - # Initialize the KubeRay job API client self._api = RayjobApi() logger.info(f"Initialized RayJob: {self.name} in namespace: {self.namespace}") diff --git a/src/codeflare_sdk/ray/rayjobs/test_rayjob.py b/src/codeflare_sdk/ray/rayjobs/test_rayjob.py index 970f0159..c1ebaaa8 100644 --- a/src/codeflare_sdk/ray/rayjobs/test_rayjob.py +++ b/src/codeflare_sdk/ray/rayjobs/test_rayjob.py @@ -971,3 +971,24 @@ def test_rayjob_user_override_shutdown_behavior(mocker): ) assert rayjob_override_priority.shutdown_after_job_finishes is True + + +def test_build_ray_cluster_spec_with_gcs_ft(mocker): + """Test build_ray_cluster_spec with GCS fault tolerance enabled.""" + from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + + # Create a test cluster config with GCS FT enabled + cluster_config = RayJobClusterConfig( + enable_gcs_ft=True, + redis_address="redis://redis-service:6379", + external_storage_namespace="storage-ns", + ) + + # Build the spec using the method on the cluster config + spec = cluster_config.build_ray_cluster_spec("test-cluster") + + # Verify GCS fault tolerance options + assert "gcsFaultToleranceOptions" in spec + gcs_ft = spec["gcsFaultToleranceOptions"] + assert gcs_ft["redisAddress"] == "redis://redis-service:6379" + assert gcs_ft["externalStorageNamespace"] == "storage-ns" From e5f60440dcad6cc453c4e9c6a464823dba46b402 Mon Sep 17 00:00:00 2001 From: kryanbeane Date: Wed, 13 Aug 2025 17:07:34 +0100 Subject: [PATCH 2/2] feat(RHOAIENG-26482): disable usage stats and rename RayJobClusterConfig --- codecov.yml | 15 ++++ src/codeflare_sdk/__init__.py | 2 +- src/codeflare_sdk/ray/__init__.py | 2 +- src/codeflare_sdk/ray/rayjobs/__init__.py | 4 +- src/codeflare_sdk/ray/rayjobs/config.py | 12 +-- src/codeflare_sdk/ray/rayjobs/rayjob.py | 4 +- src/codeflare_sdk/ray/rayjobs/test_config.py | 67 +++++++++++++++-- src/codeflare_sdk/ray/rayjobs/test_rayjob.py | 78 ++++++++++---------- 8 files changed, 126 insertions(+), 58 deletions(-) create mode 100644 codecov.yml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000..803ac581 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,15 @@ +coverage: + precision: 2 + round: down + status: + project: + default: + target: auto + threshold: 2.5% + patch: + default: + target: 85% + threshold: 2.5% + +ignore: + - "**/__init__.py" diff --git a/src/codeflare_sdk/__init__.py b/src/codeflare_sdk/__init__.py index f9a06524..a27702e7 100644 --- a/src/codeflare_sdk/__init__.py +++ b/src/codeflare_sdk/__init__.py @@ -11,7 +11,7 @@ AppWrapperStatus, RayJobClient, RayJob, - RayJobClusterConfig, + ManagedClusterConfig, ) from .common.widgets import view_clusters diff --git a/src/codeflare_sdk/ray/__init__.py b/src/codeflare_sdk/ray/__init__.py index 806ed9a4..7bd0b2c8 100644 --- a/src/codeflare_sdk/ray/__init__.py +++ b/src/codeflare_sdk/ray/__init__.py @@ -6,7 +6,7 @@ from .rayjobs import ( RayJob, - RayJobClusterConfig, + ManagedClusterConfig, RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo, diff --git a/src/codeflare_sdk/ray/rayjobs/__init__.py b/src/codeflare_sdk/ray/rayjobs/__init__.py index c415c606..cd6b4123 100644 --- a/src/codeflare_sdk/ray/rayjobs/__init__.py +++ b/src/codeflare_sdk/ray/rayjobs/__init__.py @@ -1,3 +1,3 @@ -from .rayjob import RayJob, RayJobClusterConfig +from .rayjob import RayJob, ManagedClusterConfig from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo -from .config import RayJobClusterConfig +from .config import ManagedClusterConfig diff --git a/src/codeflare_sdk/ray/rayjobs/config.py b/src/codeflare_sdk/ray/rayjobs/config.py index 0d61753c..08c352ee 100644 --- a/src/codeflare_sdk/ray/rayjobs/config.py +++ b/src/codeflare_sdk/ray/rayjobs/config.py @@ -13,7 +13,7 @@ # limitations under the License. """ -The config sub-module contains the definition of the RayJobClusterConfig dataclass, +The config sub-module contains the definition of the ManagedClusterConfig dataclass, which is used to specify resource requirements and other details when creating a Cluster object. """ @@ -104,7 +104,7 @@ @dataclass -class RayJobClusterConfig: +class ManagedClusterConfig: """ This dataclass is used to specify resource requirements and other details for RayJobs. The cluster name and namespace are automatically derived from the RayJob configuration. @@ -179,6 +179,8 @@ class RayJobClusterConfig: external_storage_namespace: Optional[str] = None def __post_init__(self): + self.envs["RAY_USAGE_STATS_ENABLED"] = "0" + if self.enable_gcs_ft: if not self.redis_address: raise ValueError( @@ -223,7 +225,7 @@ def _memory_to_string(self): self.worker_memory_limits = f"{self.worker_memory_limits}G" def _validate_types(self): - """Validate the types of all fields in the RayJobClusterConfig dataclass.""" + """Validate the types of all fields in the ManagedClusterConfig dataclass.""" errors = [] for field_info in fields(self): value = getattr(self, field_info.name) @@ -268,10 +270,10 @@ def check_type(value, expected_type): def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]: """ - Build the RayCluster spec from RayJobClusterConfig for embedding in RayJob. + Build the RayCluster spec from ManagedClusterConfig for embedding in RayJob. Args: - self: The cluster configuration object (RayJobClusterConfig) + self: The cluster configuration object (ManagedClusterConfig) cluster_name: The name for the cluster (derived from RayJob name) Returns: diff --git a/src/codeflare_sdk/ray/rayjobs/rayjob.py b/src/codeflare_sdk/ray/rayjobs/rayjob.py index 93b3ed71..a1577d91 100644 --- a/src/codeflare_sdk/ray/rayjobs/rayjob.py +++ b/src/codeflare_sdk/ray/rayjobs/rayjob.py @@ -20,7 +20,7 @@ from typing import Dict, Any, Optional, Tuple from python_client.kuberay_job_api import RayjobApi -from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig +from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig from ...common.utils import get_current_namespace @@ -48,7 +48,7 @@ def __init__( job_name: str, entrypoint: str, cluster_name: Optional[str] = None, - cluster_config: Optional[RayJobClusterConfig] = None, + cluster_config: Optional[ManagedClusterConfig] = None, namespace: Optional[str] = None, runtime_env: Optional[Dict[str, Any]] = None, shutdown_after_job_finishes: Optional[bool] = None, diff --git a/src/codeflare_sdk/ray/rayjobs/test_config.py b/src/codeflare_sdk/ray/rayjobs/test_config.py index cefe9606..80736295 100644 --- a/src/codeflare_sdk/ray/rayjobs/test_config.py +++ b/src/codeflare_sdk/ray/rayjobs/test_config.py @@ -1,14 +1,14 @@ """ -Tests for the simplified RayJobClusterConfig accelerator_configs behavior. +Tests for the simplified ManagedClusterConfig accelerator_configs behavior. """ import pytest -from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig, DEFAULT_ACCELERATORS +from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig, DEFAULT_ACCELERATORS def test_accelerator_configs_defaults_to_default_accelerators(): """Test that accelerator_configs defaults to DEFAULT_ACCELERATORS.copy()""" - config = RayJobClusterConfig() + config = ManagedClusterConfig() # Should have all the default accelerators assert "nvidia.com/gpu" in config.accelerator_configs @@ -27,7 +27,7 @@ def test_accelerator_configs_can_be_overridden(): "custom.com/accelerator": "CUSTOM_ACCELERATOR", } - config = RayJobClusterConfig(accelerator_configs=custom_configs) + config = ManagedClusterConfig(accelerator_configs=custom_configs) # Should have custom configs assert config.accelerator_configs == custom_configs @@ -46,7 +46,7 @@ def test_accelerator_configs_can_extend_defaults(): "custom.com/accelerator": "CUSTOM_ACCEL", } - config = RayJobClusterConfig(accelerator_configs=extended_configs) + config = ManagedClusterConfig(accelerator_configs=extended_configs) # Should have all defaults plus custom assert "nvidia.com/gpu" in config.accelerator_configs @@ -57,7 +57,7 @@ def test_accelerator_configs_can_extend_defaults(): def test_gpu_validation_works_with_defaults(): """Test that GPU validation works with default accelerator configs""" - config = RayJobClusterConfig(head_accelerators={"nvidia.com/gpu": 1}) + config = ManagedClusterConfig(head_accelerators={"nvidia.com/gpu": 1}) # Should not raise any errors assert config.head_accelerators == {"nvidia.com/gpu": 1} @@ -65,7 +65,7 @@ def test_gpu_validation_works_with_defaults(): def test_gpu_validation_works_with_custom_configs(): """Test that GPU validation works with custom accelerator configs""" - config = RayJobClusterConfig( + config = ManagedClusterConfig( accelerator_configs={"custom.com/accelerator": "CUSTOM_ACCEL"}, head_accelerators={"custom.com/accelerator": 1}, ) @@ -79,4 +79,55 @@ def test_gpu_validation_fails_with_unsupported_accelerator(): with pytest.raises( ValueError, match="GPU configuration 'unsupported.com/accelerator' not found" ): - RayJobClusterConfig(head_accelerators={"unsupported.com/accelerator": 1}) + ManagedClusterConfig(head_accelerators={"unsupported.com/accelerator": 1}) + + +def test_ray_usage_stats_always_disabled_by_default(): + """Test that RAY_USAGE_STATS_ENABLED is always set to '0' by default""" + config = ManagedClusterConfig() + + # Should always have the environment variable set to "0" + assert "RAY_USAGE_STATS_ENABLED" in config.envs + assert config.envs["RAY_USAGE_STATS_ENABLED"] == "0" + + +def test_ray_usage_stats_overwrites_user_env(): + """Test that RAY_USAGE_STATS_ENABLED is always set to '0' even if user specifies it""" + # User tries to enable usage stats + config = ManagedClusterConfig(envs={"RAY_USAGE_STATS_ENABLED": "1"}) + + # Should still be disabled (our setting takes precedence) + assert "RAY_USAGE_STATS_ENABLED" in config.envs + assert config.envs["RAY_USAGE_STATS_ENABLED"] == "0" + + +def test_ray_usage_stats_overwrites_user_env_string(): + """Test that RAY_USAGE_STATS_ENABLED is always set to '0' even if user specifies it as string""" + # User tries to enable usage stats with string + config = ManagedClusterConfig(envs={"RAY_USAGE_STATS_ENABLED": "true"}) + + # Should still be disabled (our setting takes precedence) + assert "RAY_USAGE_STATS_ENABLED" in config.envs + assert config.envs["RAY_USAGE_STATS_ENABLED"] == "0" + + +def test_ray_usage_stats_with_other_user_envs(): + """Test that RAY_USAGE_STATS_ENABLED is set correctly while preserving other user envs""" + # User sets other environment variables + user_envs = { + "CUSTOM_VAR": "custom_value", + "ANOTHER_VAR": "another_value", + "RAY_USAGE_STATS_ENABLED": "1", # This should be overwritten + } + + config = ManagedClusterConfig(envs=user_envs) + + # Our setting should take precedence + assert config.envs["RAY_USAGE_STATS_ENABLED"] == "0" + + # Other user envs should be preserved + assert config.envs["CUSTOM_VAR"] == "custom_value" + assert config.envs["ANOTHER_VAR"] == "another_value" + + # Total count should be correct (3 user envs) + assert len(config.envs) == 3 diff --git a/src/codeflare_sdk/ray/rayjobs/test_rayjob.py b/src/codeflare_sdk/ray/rayjobs/test_rayjob.py index c1ebaaa8..1ecd4b48 100644 --- a/src/codeflare_sdk/ray/rayjobs/test_rayjob.py +++ b/src/codeflare_sdk/ray/rayjobs/test_rayjob.py @@ -265,10 +265,10 @@ def test_build_ray_cluster_spec(mocker): "workerGroupSpecs": [{"replicas": 2}], }, } - # Use RayJobClusterConfig which has the build_ray_cluster_spec method - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + # Use ManagedClusterConfig which has the build_ray_cluster_spec method + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig - cluster_config = RayJobClusterConfig(num_workers=2) + cluster_config = ManagedClusterConfig(num_workers=2) # Mock the method that will be called mocker.patch.object( @@ -345,10 +345,10 @@ def test_build_rayjob_cr_with_auto_cluster(mocker): "workerGroupSpecs": [{"replicas": 2}], }, } - # Use RayJobClusterConfig and mock its build_ray_cluster_spec method - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + # Use ManagedClusterConfig and mock its build_ray_cluster_spec method + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig - cluster_config = RayJobClusterConfig(num_workers=2) + cluster_config = ManagedClusterConfig(num_workers=2) # Mock the method that will be called mocker.patch.object( @@ -406,10 +406,10 @@ def test_submit_with_auto_cluster(mocker): mock_api_class.return_value = mock_api_instance mock_api_instance.submit_job.return_value = True - # Use RayJobClusterConfig and mock its build_ray_cluster_spec method - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + # Use ManagedClusterConfig and mock its build_ray_cluster_spec method + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig - cluster_config = RayJobClusterConfig(num_workers=1) + cluster_config = ManagedClusterConfig(num_workers=1) # Mock the method that will be called mocker.patch.object( @@ -488,9 +488,9 @@ def test_shutdown_behavior_with_cluster_config(mocker): """Test that shutdown_after_job_finishes is True when cluster_config is provided.""" mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi") - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig - cluster_config = RayJobClusterConfig() + cluster_config = ManagedClusterConfig() rayjob = RayJob( job_name="test-job", @@ -517,12 +517,12 @@ def test_shutdown_behavior_with_existing_cluster(mocker): def test_rayjob_with_rayjob_cluster_config(mocker): - """Test RayJob with the new RayJobClusterConfig.""" + """Test RayJob with the new ManagedClusterConfig.""" mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi") - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig - cluster_config = RayJobClusterConfig( + cluster_config = ManagedClusterConfig( num_workers=2, head_cpu_requests="500m", head_memory_requests="512Mi", @@ -540,13 +540,13 @@ def test_rayjob_with_rayjob_cluster_config(mocker): def test_rayjob_cluster_config_validation(mocker): - """Test validation of RayJobClusterConfig parameters.""" + """Test validation of ManagedClusterConfig parameters.""" mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi") - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig # Test with minimal valid config - cluster_config = RayJobClusterConfig() + cluster_config = ManagedClusterConfig() rayjob = RayJob( job_name="test-job", @@ -581,9 +581,9 @@ def test_build_ray_cluster_spec_integration(mocker): # Mock the RayjobApi class entirely mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi") - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig - cluster_config = RayJobClusterConfig() + cluster_config = ManagedClusterConfig() # Mock the build_ray_cluster_spec method on the cluster config mock_spec = {"spec": "test-spec"} @@ -658,9 +658,9 @@ def test_rayjob_cluster_name_generation_with_config(mocker): """Test cluster name generation when using cluster_config.""" mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi") - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig - cluster_config = RayJobClusterConfig() + cluster_config = ManagedClusterConfig() rayjob = RayJob( job_name="my-job", @@ -685,9 +685,9 @@ def test_rayjob_namespace_propagation_to_cluster_config(mocker): return_value="detected-ns", ) - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig - cluster_config = RayJobClusterConfig() + cluster_config = ManagedClusterConfig() rayjob = RayJob( job_name="test-job", @@ -735,10 +735,10 @@ def test_rayjob_constructor_parameter_validation(mocker): def test_build_ray_cluster_spec_function(mocker): """Test the build_ray_cluster_spec method directly.""" - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig # Create a test cluster config - cluster_config = RayJobClusterConfig( + cluster_config = ManagedClusterConfig( num_workers=2, head_cpu_requests="500m", head_memory_requests="512Mi", @@ -774,10 +774,10 @@ def test_build_ray_cluster_spec_function(mocker): def test_build_ray_cluster_spec_with_accelerators(mocker): """Test build_ray_cluster_spec with GPU accelerators.""" - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig # Create a test cluster config with GPU accelerators - cluster_config = RayJobClusterConfig( + cluster_config = ManagedClusterConfig( head_accelerators={"nvidia.com/gpu": 1}, worker_accelerators={"nvidia.com/gpu": 2}, ) @@ -801,7 +801,7 @@ def test_build_ray_cluster_spec_with_accelerators(mocker): def test_build_ray_cluster_spec_with_custom_volumes(mocker): """Test build_ray_cluster_spec with custom volumes and volume mounts.""" - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig from kubernetes.client import V1Volume, V1VolumeMount # Create custom volumes and volume mounts @@ -809,7 +809,7 @@ def test_build_ray_cluster_spec_with_custom_volumes(mocker): custom_volume_mount = V1VolumeMount(name="custom-data", mount_path="/data") # Create a test cluster config with custom volumes - cluster_config = RayJobClusterConfig( + cluster_config = ManagedClusterConfig( volumes=[custom_volume], volume_mounts=[custom_volume_mount], ) @@ -831,10 +831,10 @@ def test_build_ray_cluster_spec_with_custom_volumes(mocker): def test_build_ray_cluster_spec_with_environment_variables(mocker): """Test build_ray_cluster_spec with environment variables.""" - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig # Create a test cluster config with environment variables - cluster_config = RayJobClusterConfig( + cluster_config = ManagedClusterConfig( envs={"CUDA_VISIBLE_DEVICES": "0", "RAY_DISABLE_IMPORT_WARNING": "1"}, ) @@ -863,7 +863,7 @@ def test_build_ray_cluster_spec_with_environment_variables(mocker): def test_build_ray_cluster_spec_with_tolerations(mocker): """Test build_ray_cluster_spec with tolerations.""" - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig from kubernetes.client import V1Toleration # Create test tolerations @@ -875,7 +875,7 @@ def test_build_ray_cluster_spec_with_tolerations(mocker): ) # Create a test cluster config with tolerations - cluster_config = RayJobClusterConfig( + cluster_config = ManagedClusterConfig( head_tolerations=[head_toleration], worker_tolerations=[worker_toleration], ) @@ -900,10 +900,10 @@ def test_build_ray_cluster_spec_with_tolerations(mocker): def test_build_ray_cluster_spec_with_image_pull_secrets(mocker): """Test build_ray_cluster_spec with image pull secrets.""" - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig # Create a test cluster config with image pull secrets - cluster_config = RayJobClusterConfig( + cluster_config = ManagedClusterConfig( image_pull_secrets=["my-registry-secret", "another-secret"] ) @@ -947,9 +947,9 @@ def test_rayjob_user_override_shutdown_behavior(mocker): assert rayjob_existing_override.shutdown_after_job_finishes is True # Test 2: User overrides shutdown to False even when creating new cluster - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig - cluster_config = RayJobClusterConfig() + cluster_config = ManagedClusterConfig() rayjob_new_override = RayJob( job_name="test-job", @@ -975,10 +975,10 @@ def test_rayjob_user_override_shutdown_behavior(mocker): def test_build_ray_cluster_spec_with_gcs_ft(mocker): """Test build_ray_cluster_spec with GCS fault tolerance enabled.""" - from codeflare_sdk.ray.rayjobs.config import RayJobClusterConfig + from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig # Create a test cluster config with GCS FT enabled - cluster_config = RayJobClusterConfig( + cluster_config = ManagedClusterConfig( enable_gcs_ft=True, redis_address="redis://redis-service:6379", external_storage_namespace="storage-ns",