Skip to content

Commit b04a3db

Browse files
authored
[Feature] Disable member removal in case of health failure (#957)
1 parent d3a6c05 commit b04a3db

File tree

3 files changed

+13
-43
lines changed

3 files changed

+13
-43
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
- (Bugfix) Fix GetClient lock system
99
- (Feature) Backup InProgress Agency key discovery
1010
- (Feature) Backup & Maintenance Conditions
11+
- (Bugfix) Disable member removal in case of health failure
1112

1213
## [1.2.9](https://github.com/arangodb/kube-arangodb/tree/1.2.9) (2022-03-30)
1314
- (Feature) Improve Kubernetes clientsets management

pkg/deployment/deployment_inspector.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -346,8 +346,8 @@ func (d *Deployment) inspectDeploymentWithError(ctx context.Context, lastInterva
346346
return minInspectionInterval, errors.Wrapf(err, "AccessPackage creation failed")
347347
}
348348

349-
// Inspect deployment for obsolete members
350-
if err := d.resources.CleanupRemovedMembers(ctx, d.GetMembersState().Health()); err != nil {
349+
// Sync the member-of-cluster condition of the deployment members
350+
if err := d.resources.SyncMembersInCluster(ctx, d.GetMembersState().Health()); err != nil {
351351
return minInspectionInterval, errors.Wrapf(err, "Removed member cleanup failed")
352352
}
353353

pkg/deployment/resources/member_cleanup.go

Lines changed: 10 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ import (
3333
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
3434
memberState "github.com/arangodb/kube-arangodb/pkg/deployment/member"
3535
"github.com/arangodb/kube-arangodb/pkg/metrics"
36-
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3736
arangomemberv1 "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/arangomember/v1"
3837
)
3938

@@ -47,8 +46,8 @@ var (
4746
cleanupRemovedMembersCounters = metrics.MustRegisterCounterVec(metricsComponent, "cleanup_removed_members", "Number of cleanup-removed-members actions", metrics.DeploymentName, metrics.Result)
4847
)
4948

50-
// CleanupRemovedMembers removes all arangod members that are no longer part of ArangoDB deployment.
51-
func (r *Resources) CleanupRemovedMembers(ctx context.Context, health memberState.Health) error {
49+
// SyncMembersInCluster sets proper condition for all arangod members that belong to the deployment.
50+
func (r *Resources) SyncMembersInCluster(ctx context.Context, health memberState.Health) error {
5251
if health.Error != nil {
5352
r.log.Info().Err(health.Error).Msg("Health of the cluster is missing")
5453
return nil
@@ -58,7 +57,7 @@ func (r *Resources) CleanupRemovedMembers(ctx context.Context, health memberStat
5857
switch r.context.GetSpec().GetMode() {
5958
case api.DeploymentModeCluster:
6059
deploymentName := r.context.GetAPIObject().GetName()
61-
if err := r.cleanupRemovedClusterMembers(ctx, health); err != nil {
60+
if err := r.syncMembersInCluster(ctx, health); err != nil {
6261
cleanupRemovedMembersCounters.WithLabelValues(deploymentName, metrics.Failed).Inc()
6362
return errors.WithStack(err)
6463
}
@@ -70,29 +69,25 @@ func (r *Resources) CleanupRemovedMembers(ctx context.Context, health memberStat
7069
}
7170
}
7271

73-
// cleanupRemovedClusterMembers removes all arangod members that are no longer part of the cluster.
74-
func (r *Resources) cleanupRemovedClusterMembers(ctx context.Context, health memberState.Health) error {
72+
// syncMembersInCluster sets proper condition for all arangod members that are part of the cluster.
73+
func (r *Resources) syncMembersInCluster(ctx context.Context, health memberState.Health) error {
7574
log := r.log
7675

7776
serverFound := func(id string) bool {
7877
_, found := health.Members[driver.ServerID(id)]
7978
return found
8079
}
8180

82-
// For over all members that can be removed
8381
status, lastVersion := r.context.GetStatus()
8482
updateStatusNeeded := false
85-
var podNamesToRemove, pvcNamesToRemove []string
83+
8684
status.Members.ForeachServerGroup(func(group api.ServerGroup, list api.MemberStatusList) error {
8785
if group != api.ServerGroupCoordinators && group != api.ServerGroupDBServers {
8886
// We're not interested in these other groups
8987
return nil
9088
}
9189
for _, m := range list {
92-
log := log.With().
93-
Str("member", m.ID).
94-
Str("role", group.AsRole()).
95-
Logger()
90+
log := log.With().Str("member", m.ID).Str("role", group.AsRole()).Logger()
9691
if serverFound(m.ID) {
9792
// Member is (still) found, skip it
9893
if m.Conditions.Update(api.ConditionTypeMemberOfCluster, true, "", "") {
@@ -104,25 +99,13 @@ func (r *Resources) cleanupRemovedClusterMembers(ctx context.Context, health mem
10499
}
105100
continue
106101
} else if !m.Conditions.IsTrue(api.ConditionTypeMemberOfCluster) {
107-
// Member is not yet recorded as member of cluster
108102
if m.Age() < minMemberAge {
109-
log.Debug().Dur("age", m.Age()).Msg("Member age is below minimum for removal")
103+
log.Debug().Dur("age", m.Age()).Msg("Member is not yet recorded as member of cluster")
110104
continue
111105
}
112-
log.Info().Msg("Member has never been part of the cluster for a long time. Removing it.")
106+
log.Warn().Msg("Member can not be found in cluster")
113107
} else {
114-
// Member no longer part of cluster, remove it
115-
log.Info().Msg("Member is no longer part of the ArangoDB cluster. Removing it.")
116-
}
117-
log.Info().Msg("Removing member")
118-
status.Members.RemoveByID(m.ID, group)
119-
updateStatusNeeded = true
120-
// Remove Pod & PVC (if any)
121-
if m.PodName != "" {
122-
podNamesToRemove = append(podNamesToRemove, m.PodName)
123-
}
124-
if m.PersistentVolumeClaimName != "" {
125-
pvcNamesToRemove = append(pvcNamesToRemove, m.PersistentVolumeClaimName)
108+
log.Info().Msg("Member is no longer part of the ArangoDB cluster")
126109
}
127110
}
128111
return nil
@@ -137,20 +120,6 @@ func (r *Resources) cleanupRemovedClusterMembers(ctx context.Context, health mem
137120
}
138121
}
139122

140-
for _, podName := range podNamesToRemove {
141-
log.Info().Str("pod", podName).Msg("Removing obsolete member pod")
142-
if err := r.context.DeletePod(ctx, podName, metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
143-
log.Warn().Err(err).Str("pod", podName).Msg("Failed to remove obsolete pod")
144-
}
145-
}
146-
147-
for _, pvcName := range pvcNamesToRemove {
148-
log.Info().Str("pvc", pvcName).Msg("Removing obsolete member PVC")
149-
if err := r.context.DeletePvc(ctx, pvcName); err != nil && !k8sutil.IsNotFound(err) {
150-
log.Warn().Err(err).Str("pvc", pvcName).Msg("Failed to remove obsolete PVC")
151-
}
152-
}
153-
154123
return nil
155124
}
156125

0 commit comments

Comments
 (0)