Skip to content

Commit a90d540

Browse files
CLOUDP-317886 - block removing cluster from MC Sharded deployment (#495)
# Summary During investigation of https://jira.mongodb.org/browse/CLOUDP-317886 we have found that the operator panics, when the cluster is removed from `clusterSpecList`. Initially we thought about allowing to do that, but later on I have discussed the issue with @lsierant and the better approach was chosen: - when the user wants to remove cluster that already has 0 members, we allow this - when the user wants to remove cluster that has more than 0 members, we block this, because it can lead to unexpected issues. This is described in the code comment: https://github.com/mongodb/mongodb-kubernetes/blob/31c346ee4770c3caabb620f3e7ad3f38ebdc4d3c/controllers/operator/mongodbshardedcluster_controller.go#L840-L846 ...and in the changelog: **MultiClusterSharded**: Block removing non-zero member cluster from MongoDB resource. This prevents from scaling down member cluster without current configuration available, which can lead to unexpected issues. Previously operator was crashing in that scenario, after the fix it will mark reconciliation as `Failed` with appropriate message. Example unsafe scenario that is now blocked: * User has 2 member clusters: `main` is used for application traffic, `read-analytics` is used for read-only analytics * `main` cluster has 7 voting members * `read-analytics` cluster has 3 non-voting members * User decides to remove `read-analytics` cluster, by removing the `clusterSpecItem` completely * Operator scales down members from `read-analytics` cluster one by one * Because the configuration does not have voting options specified anymore and by default `priority` is set to 1, the operator will remove one member, but the other two members will be reconfigured as voting members * `replicaset` contains now 9 voting members, which is not [supported by MongoDB](https://www.mongodb.com/docs/manual/reference/limits/#mongodb-limit-Number-of-Voting-Members-of-a-Replica-Set) ## Proof of Work Passing CI + new unit tests for `blockNonEmptyClusterSpecItemRemoval` function. ## Checklist - [x] Have you linked a jira ticket and/or is the ticket in the title? - [x] Have you checked whether your jira ticket required DOCSP changes? - [x] Have you added changelog file? - use `skip-changelog` label if not needed - refer to [Changelog files and Release Notes](https://github.com/mongodb/mongodb-kubernetes/blob/master/CONTRIBUTING.md#changelog-files-and-release-notes) section in CONTRIBUTING.md for more details --------- Co-authored-by: Vivek Singh <vsingh.ggits.2010@gmail.com>
1 parent c5839ff commit a90d540

8 files changed

+457
-138
lines changed

api/v1/mdb/mongodb_types.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"github.com/mongodb/mongodb-kubernetes/pkg/dns"
2626
"github.com/mongodb/mongodb-kubernetes/pkg/fcv"
2727
"github.com/mongodb/mongodb-kubernetes/pkg/kube"
28+
"github.com/mongodb/mongodb-kubernetes/pkg/multicluster"
2829
"github.com/mongodb/mongodb-kubernetes/pkg/util"
2930
"github.com/mongodb/mongodb-kubernetes/pkg/util/env"
3031
"github.com/mongodb/mongodb-kubernetes/pkg/util/stringutil"
@@ -1665,6 +1666,48 @@ func (m *MongoDbSpec) IsMultiCluster() bool {
16651666
return m.GetTopology() == ClusterTopologyMultiCluster
16661667
}
16671668

1669+
func (m *MongoDbSpec) GetShardClusterSpecList() ClusterSpecList {
1670+
if m.IsMultiCluster() {
1671+
return m.ShardSpec.ClusterSpecList
1672+
} else {
1673+
return ClusterSpecList{
1674+
{
1675+
ClusterName: multicluster.LegacyCentralClusterName,
1676+
Members: m.MongodsPerShardCount,
1677+
MemberConfig: m.MemberConfig,
1678+
},
1679+
}
1680+
}
1681+
}
1682+
1683+
func (m *MongoDbSpec) GetMongosClusterSpecList() ClusterSpecList {
1684+
if m.IsMultiCluster() {
1685+
return m.MongosSpec.ClusterSpecList
1686+
} else {
1687+
return ClusterSpecList{
1688+
{
1689+
ClusterName: multicluster.LegacyCentralClusterName,
1690+
Members: m.MongosCount,
1691+
ExternalAccessConfiguration: m.ExternalAccessConfiguration,
1692+
},
1693+
}
1694+
}
1695+
}
1696+
1697+
func (m *MongoDbSpec) GetConfigSrvClusterSpecList() ClusterSpecList {
1698+
if m.IsMultiCluster() {
1699+
return m.ConfigSrvSpec.ClusterSpecList
1700+
} else {
1701+
return ClusterSpecList{
1702+
{
1703+
ClusterName: multicluster.LegacyCentralClusterName,
1704+
Members: m.ConfigServerCount,
1705+
MemberConfig: m.MemberConfig,
1706+
},
1707+
}
1708+
}
1709+
}
1710+
16681711
type MongoDBConnectionStringBuilder struct {
16691712
MongoDB
16701713
hostnames []string

api/v1/mdb/shardedcluster.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,12 +91,25 @@ func (s *ShardedClusterComponentSpec) GetAgentConfig() *AgentConfig {
9191
return &s.Agent
9292
}
9393

94+
func (s *ShardedClusterComponentSpec) ClusterSpecItemExists(clusterName string) bool {
95+
return s.getClusterSpecItemOrNil(clusterName) != nil
96+
}
97+
9498
func (s *ShardedClusterComponentSpec) GetClusterSpecItem(clusterName string) ClusterSpecItem {
99+
if clusterSpecItem := s.getClusterSpecItemOrNil(clusterName); clusterSpecItem != nil {
100+
return *clusterSpecItem
101+
}
102+
103+
// it should never occur - we preprocess all clusterSpecLists
104+
panic(fmt.Errorf("clusterName %s not found in clusterSpecList", clusterName))
105+
}
106+
107+
func (s *ShardedClusterComponentSpec) getClusterSpecItemOrNil(clusterName string) *ClusterSpecItem {
95108
for i := range s.ClusterSpecList {
96109
if s.ClusterSpecList[i].ClusterName == clusterName {
97-
return s.ClusterSpecList[i]
110+
return &s.ClusterSpecList[i]
98111
}
99112
}
100-
// it should never occur - we preprocess all clusterSpecLists
101-
panic(fmt.Errorf("clusterName %s not found in clusterSpecList", clusterName))
113+
114+
return nil
102115
}

api/v1/om/appdb_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,10 @@ func (m *AppDBSpec) GetMemberClusterSpecByName(memberClusterName string) mdbv1.C
494494

495495
// In case the member cluster is not found in the cluster spec list, we return an empty ClusterSpecItem
496496
// with 0 members to handle the case of removing a cluster from the spec list without a panic.
497+
//
498+
// This is not ideal, because we don't consider other fields that were removed i.e.MemberConfig.
499+
// When scaling down we should consider the full spec that was used to create the cluster.
500+
// https://jira.mongodb.org/browse/CLOUDP-349925
497501
return mdbv1.ClusterSpecItem{
498502
ClusterName: memberClusterName,
499503
Members: 0,
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
kind: fix
3+
date: 2025-10-06
4+
---
5+
6+
* **MultiClusterSharded**: Blocked removing non-zero member cluster from MongoDB resource. This prevents from scaling down member cluster without current configuration available, which could lead to unexpected issues.

controllers/operator/authentication_test.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ func TestX509ClusterAuthentication_CanBeEnabled_IfX509AuthenticationIsEnabled_Sh
6565
ctx := context.Background()
6666
scWithTls := test.DefaultClusterBuilder().EnableTLS().EnableX509().SetName("sc-with-tls").SetTLSCA("custom-ca").Build()
6767

68-
reconciler, _, client, _, err := defaultClusterReconciler(ctx, nil, "", "", scWithTls, nil)
68+
reconciler, _, client, _, err := defaultShardedClusterReconciler(ctx, nil, "", "", scWithTls, nil)
6969
require.NoError(t, err)
7070
addKubernetesTlsResources(ctx, client, scWithTls)
7171

@@ -76,7 +76,7 @@ func TestX509CanBeEnabled_WhenThereAreOnlyTlsDeployments_ShardedCluster(t *testi
7676
ctx := context.Background()
7777
scWithTls := test.DefaultClusterBuilder().EnableTLS().EnableX509().SetName("sc-with-tls").SetTLSCA("custom-ca").Build()
7878

79-
reconciler, _, client, _, err := defaultClusterReconciler(ctx, nil, "", "", scWithTls, nil)
79+
reconciler, _, client, _, err := defaultShardedClusterReconciler(ctx, nil, "", "", scWithTls, nil)
8080
require.NoError(t, err)
8181
addKubernetesTlsResources(ctx, client, scWithTls)
8282

@@ -333,7 +333,7 @@ func TestX509InternalClusterAuthentication_CanBeEnabledWithScram_ShardedCluster(
333333
EnableX509InternalClusterAuth().
334334
Build()
335335

336-
r, _, kubeClient, omConnectionFactory, _ := defaultClusterReconciler(ctx, nil, "", "", sc, nil)
336+
r, _, kubeClient, omConnectionFactory, _ := defaultShardedClusterReconciler(ctx, nil, "", "", sc, nil)
337337
addKubernetesTlsResources(ctx, r.client, sc)
338338
checkReconcileSuccessful(ctx, t, r, sc, kubeClient)
339339

@@ -770,15 +770,16 @@ func Test_NoAdditionalDomainsPresent(t *testing.T) {
770770
// The default secret we create does not contain additional domains so it will not be valid for this RS
771771
rs.Spec.Security.TLSConfig.AdditionalCertificateDomains = []string{"foo"}
772772

773-
reconciler, _, client, _, err := defaultClusterReconciler(ctx, nil, "", "", rs, nil)
774-
require.NoError(t, err)
775-
addKubernetesTlsResources(ctx, client, rs)
773+
kubeClient, omConnectionFactory := mock.NewDefaultFakeClient(rs)
774+
reconciler := newReplicaSetReconciler(ctx, kubeClient, nil, "", "", false, false, omConnectionFactory.GetConnectionFunc)
775+
addKubernetesTlsResources(ctx, kubeClient, rs)
776776

777-
secret := &corev1.Secret{}
777+
certSecret := &corev1.Secret{}
778778

779-
_ = client.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("%s-cert", rs.Name), Namespace: rs.Namespace}, secret)
779+
_ = kubeClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("%s-cert", rs.Name), Namespace: rs.Namespace}, certSecret)
780780

781-
err = certs.VerifyAndEnsureCertificatesForStatefulSet(ctx, reconciler.SecretClient, reconciler.SecretClient, fmt.Sprintf("%s-cert", rs.Name), certs.ReplicaSetConfig(*rs), nil)
781+
err := certs.VerifyAndEnsureCertificatesForStatefulSet(ctx, reconciler.SecretClient, reconciler.SecretClient, fmt.Sprintf("%s-cert", rs.Name), certs.ReplicaSetConfig(*rs), nil)
782+
require.Error(t, err)
782783
for i := 0; i < rs.Spec.Members; i++ {
783784
expectedErrorMessage := fmt.Sprintf("domain %s-%d.foo is not contained in the list of DNSNames", rs.Name, i)
784785
assert.Contains(t, err.Error(), expectedErrorMessage)

0 commit comments

Comments
 (0)