1313# limitations under the License.
1414
1515"""
16- The config sub-module contains the definition of the RayJobClusterConfigV2 dataclass,
16+ The config sub-module contains the definition of the RayJobClusterConfig dataclass,
1717which is used to specify resource requirements and other details when creating a
1818Cluster object.
1919"""
@@ -141,6 +141,14 @@ class RayJobClusterConfig:
141141 A list of V1Volume objects to add to the Cluster
142142 volume_mounts:
143143 A list of V1VolumeMount objects to add to the Cluster
144+ enable_gcs_ft:
145+ A boolean indicating whether to enable GCS fault tolerance.
146+ redis_address:
147+ The address of the Redis server to use for GCS fault tolerance, required when enable_gcs_ft is True.
148+ redis_password_secret:
149+ Kubernetes secret reference containing Redis password. ex: {"name": "secret-name", "key": "password-key"}
150+ external_storage_namespace:
151+ The storage namespace to use for GCS fault tolerance. By default, KubeRay sets it to the UID of RayCluster.
144152 """
145153
146154 head_cpu_requests : Union [int , str ] = 2
@@ -167,8 +175,33 @@ class RayJobClusterConfig:
167175 annotations : Dict [str , str ] = field (default_factory = dict )
168176 volumes : list [V1Volume ] = field (default_factory = list )
169177 volume_mounts : list [V1VolumeMount ] = field (default_factory = list )
178+ enable_gcs_ft : bool = False
179+ redis_address : Optional [str ] = None
180+ redis_password_secret : Optional [Dict [str , str ]] = None
181+ external_storage_namespace : Optional [str ] = None
170182
171183 def __post_init__ (self ):
184+ if self .enable_gcs_ft :
185+ if not self .redis_address :
186+ raise ValueError (
187+ "redis_address must be provided when enable_gcs_ft is True"
188+ )
189+
190+ if self .redis_password_secret and not isinstance (
191+ self .redis_password_secret , dict
192+ ):
193+ raise ValueError (
194+ "redis_password_secret must be a dictionary with 'name' and 'key' fields"
195+ )
196+
197+ if self .redis_password_secret and (
198+ "name" not in self .redis_password_secret
199+ or "key" not in self .redis_password_secret
200+ ):
201+ raise ValueError (
202+ "redis_password_secret must contain both 'name' and 'key' fields"
203+ )
204+
172205 self ._validate_types ()
173206 self ._memory_to_string ()
174207 self ._validate_gpu_config (self .head_accelerators )
@@ -253,6 +286,11 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]:
253286 "workerGroupSpecs" : [self ._build_worker_group_spec (cluster_name )],
254287 }
255288
289+ # Add GCS fault tolerance if enabled
290+ if self .enable_gcs_ft :
291+ gcs_ft_options = self ._build_gcs_ft_options ()
292+ ray_cluster_spec ["gcsFaultToleranceOptions" ] = gcs_ft_options
293+
256294 return ray_cluster_spec
257295
258296 def _build_head_group_spec (self ) -> Dict [str , Any ]:
@@ -455,3 +493,25 @@ def _generate_volumes(self) -> list:
455493 def _build_env_vars (self ) -> list :
456494 """Build environment variables list."""
457495 return [V1EnvVar (name = key , value = value ) for key , value in self .envs .items ()]
496+
497+ def _build_gcs_ft_options (self ) -> Dict [str , Any ]:
498+ """Build GCS fault tolerance options."""
499+ gcs_ft_options = {"redisAddress" : self .redis_address }
500+
501+ if (
502+ hasattr (self , "external_storage_namespace" )
503+ and self .external_storage_namespace
504+ ):
505+ gcs_ft_options ["externalStorageNamespace" ] = self .external_storage_namespace
506+
507+ if hasattr (self , "redis_password_secret" ) and self .redis_password_secret :
508+ gcs_ft_options ["redisPassword" ] = {
509+ "valueFrom" : {
510+ "secretKeyRef" : {
511+ "name" : self .redis_password_secret ["name" ],
512+ "key" : self .redis_password_secret ["key" ],
513+ }
514+ }
515+ }
516+
517+ return gcs_ft_options
0 commit comments