Skip to content

Commit f77a5d6

Browse files
authored
[Bugifx] CleanOut Discovery fix (#781)
1 parent 29b979d commit f77a5d6

15 files changed

+125
-30
lines changed

pkg/apis/deployment/v1/member_phase.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ func (p MemberPhase) IsFailed() bool {
6060
return p == MemberPhaseFailed
6161
}
6262

63+
// IsReady returns true when given phase == "Created"
64+
func (p MemberPhase) IsReady() bool {
65+
return p == MemberPhaseCreated
66+
}
67+
6368
// IsCreatedOrDrain returns true when given phase is MemberPhaseCreated or MemberPhaseDrain
6469
func (p MemberPhase) IsCreatedOrDrain() bool {
6570
return p == MemberPhaseCreated || p == MemberPhaseDrain

pkg/apis/deployment/v1/member_status_list.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,11 @@ func (l MemberStatusList) SelectMemberToRemove() (MemberStatus, error) {
160160
return m, nil
161161
}
162162
}
163+
for _, m := range l {
164+
if m.Conditions.IsTrue(ConditionTypeCleanedOut) {
165+
return m, nil
166+
}
167+
}
163168
// Pick a random member that is in created state
164169
perm := rand.Perm(len(l))
165170
for _, idx := range perm {

pkg/apis/deployment/v2alpha1/member_phase.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ func (p MemberPhase) IsFailed() bool {
6060
return p == MemberPhaseFailed
6161
}
6262

63+
// IsReady returns true when given phase == "Created"
64+
func (p MemberPhase) IsReady() bool {
65+
return p == MemberPhaseCreated
66+
}
67+
6368
// IsCreatedOrDrain returns true when given phase is MemberPhaseCreated or MemberPhaseDrain
6469
func (p MemberPhase) IsCreatedOrDrain() bool {
6570
return p == MemberPhaseCreated || p == MemberPhaseDrain

pkg/apis/deployment/v2alpha1/member_status_list.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,11 @@ func (l MemberStatusList) SelectMemberToRemove() (MemberStatus, error) {
160160
return m, nil
161161
}
162162
}
163+
for _, m := range l {
164+
if m.Conditions.IsTrue(ConditionTypeCleanedOut) {
165+
return m, nil
166+
}
167+
}
163168
// Pick a random member that is in created state
164169
perm := rand.Perm(len(l))
165170
for _, idx := range perm {

pkg/deployment/reconcile/action_cleanout_member.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ type actionCleanoutMember struct {
5959
// Returns true if the action is completely finished, false in case
6060
// the start time needs to be recorded and a ready condition needs to be checked.
6161
func (a *actionCleanoutMember) Start(ctx context.Context) (bool, error) {
62+
if a.action.Group != api.ServerGroupDBServers {
63+
// Proceed only on DBServers
64+
return true, nil
65+
}
66+
6267
m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID)
6368
if !ok {
6469
// We wanted to remove and it is already gone. All ok
@@ -87,6 +92,10 @@ func (a *actionCleanoutMember) Start(ctx context.Context) (bool, error) {
8792
var jobID string
8893
ctxJobID := driver.WithJobIDResponse(ctxChild, &jobID)
8994
if err := cluster.CleanOutServer(ctxJobID, a.action.MemberID); err != nil {
95+
if driver.IsNotFound(err) {
96+
// Member not found, it could be that it never connected to the cluster
97+
return true, nil
98+
}
9099
log.Debug().Err(err).Msg("Failed to cleanout member")
91100
return false, errors.WithStack(err)
92101
}
@@ -182,6 +191,7 @@ func (a *actionCleanoutMember) CheckProgress(ctx context.Context) (bool, bool, e
182191
return false, false, errors.WithStack(err)
183192
}
184193
}
194+
185195
// Cleanout completed
186196
return true, false, nil
187197
}

pkg/deployment/reconcile/action_remove_member.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ package reconcile
2626
import (
2727
"context"
2828

29+
apiErrors "k8s.io/apimachinery/pkg/api/errors"
30+
2931
"github.com/arangodb/kube-arangodb/pkg/util/errors"
3032

3133
"github.com/rs/zerolog"
@@ -105,14 +107,20 @@ func (a *actionRemoveMember) Start(ctx context.Context) (bool, error) {
105107
}
106108
}
107109
}
108-
// Remove the pod (if any)
109-
if err := a.actionCtx.DeletePod(ctx, m.PodName); err != nil {
110-
return false, errors.WithStack(err)
110+
if m.PodName != "" {
111+
// Remove the pod (if any)
112+
if err := a.actionCtx.DeletePod(ctx, m.PodName); err != nil {
113+
if !apiErrors.IsNotFound(err) {
114+
return false, errors.WithStack(err)
115+
}
116+
}
111117
}
112118
// Remove the pvc (if any)
113119
if m.PersistentVolumeClaimName != "" {
114120
if err := a.actionCtx.DeletePvc(ctx, m.PersistentVolumeClaimName); err != nil {
115-
return false, errors.WithStack(err)
121+
if !apiErrors.IsNotFound(err) {
122+
return false, errors.WithStack(err)
123+
}
116124
}
117125
}
118126
// Remove member

pkg/deployment/reconcile/helper_shutdown.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ func (s shutdownHelperAPI) Start(ctx context.Context) (bool, error) {
6161
log.Error().Msg("No such member")
6262
return true, nil
6363
}
64+
if m.PodName == "" {
65+
log.Warn().Msgf("Pod is empty")
66+
return true, nil
67+
}
6468
// Remove finalizers, so Kubernetes will quickly terminate the pod
6569
if err := s.actionCtx.RemovePodFinalizers(ctx, m.PodName); err != nil {
6670
return false, errors.WithStack(err)
@@ -130,6 +134,11 @@ func (s shutdownHelperDelete) Start(ctx context.Context) (bool, error) {
130134
return true, nil
131135
}
132136

137+
if m.PodName == "" {
138+
log.Warn().Msgf("Pod is empty")
139+
return true, nil
140+
}
141+
133142
// Terminate pod
134143
if err := s.actionCtx.DeletePod(ctx, m.PodName); err != nil {
135144
if !k8sutil.IsNotFound(err) {

pkg/deployment/reconcile/helper_wrap.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,19 @@ func withResignLeadership(group api.ServerGroup, member api.MemberStatus, reason
5151

5252
return api.AsPlan(plan).Before(api.NewAction(api.ActionTypeResignLeadership, group, member.ID, reason))
5353
}
54+
55+
func cleanOutMember(group api.ServerGroup, m api.MemberStatus) api.Plan {
56+
var plan api.Plan
57+
58+
if group == api.ServerGroupDBServers {
59+
plan = append(plan,
60+
api.NewAction(api.ActionTypeCleanOutMember, group, m.ID),
61+
)
62+
}
63+
plan = append(plan,
64+
api.NewAction(api.ActionTypeShutdownMember, group, m.ID),
65+
api.NewAction(api.ActionTypeRemoveMember, group, m.ID),
66+
)
67+
68+
return plan
69+
}

pkg/deployment/reconcile/plan_builder_clean_out.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ func createCleanOutPlan(ctx context.Context, log zerolog.Logger, _ k8sutil.APIOb
4343
return nil
4444
}
4545

46+
if !status.Conditions.IsTrue(api.ConditionTypeUpToDate) {
47+
// Do not consider to mark cleanedout servers when changes are propagating
48+
return nil
49+
}
50+
4651
cluster, err := getCluster(ctx, planCtx)
4752
if err != nil {
4853
log.Warn().Err(err).Msgf("Unable to get cluster")
@@ -57,6 +62,8 @@ func createCleanOutPlan(ctx context.Context, log zerolog.Logger, _ k8sutil.APIOb
5762
return nil
5863
}
5964

65+
var plan api.Plan
66+
6067
for id, member := range health.Health {
6168
switch member.Role {
6269
case driver.ServerRoleDBServer:
@@ -79,13 +86,13 @@ func createCleanOutPlan(ctx context.Context, log zerolog.Logger, _ k8sutil.APIOb
7986
Msgf("server is cleaned out so operator must do the same")
8087

8188
action := api.NewAction(api.ActionTypeSetMemberCondition, api.ServerGroupDBServers, string(id),
82-
"server is cleaned out so operator must do the same")
83-
action = action.AddParam(string(api.ConditionTypeCleanedOut), strconv.FormatBool(true))
89+
"server is cleaned out so operator must do the same").
90+
AddParam(string(api.ConditionTypeCleanedOut), strconv.FormatBool(true))
8491

85-
return api.Plan{action}
92+
plan = append(plan, action)
8693
}
8794
}
8895
}
8996

90-
return nil
97+
return plan
9198
}

pkg/deployment/reconcile/plan_builder_normal.go

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,14 +83,14 @@ func createNormalPlan(ctx context.Context, log zerolog.Logger, apiObject k8sutil
8383
return newPlanAppender(NewWithPlanBuilder(ctx, log, apiObject, spec, status, cachedStatus, builderCtx), nil).
8484
// Check for failed members
8585
ApplyIfEmpty(createMemberFailedRestorePlan).
86-
// Check for cleaned out dbserver in created state
87-
ApplyIfEmpty(createRemoveCleanedDBServersPlan).
8886
// Update status
8987
ApplySubPlanIfEmpty(createEncryptionKeyStatusPropagatedFieldUpdate, createEncryptionKeyStatusUpdate).
9088
ApplyIfEmpty(createTLSStatusUpdate).
9189
ApplyIfEmpty(createJWTStatusUpdate).
9290
// Check for scale up/down
9391
ApplyIfEmpty(createScaleMemberPlan).
92+
// Check for cleaned out dbserver in created state
93+
ApplyIfEmpty(createRemoveCleanedDBServersPlan).
9494
// Check for members to be removed
9595
ApplyIfEmpty(createReplaceMemberPlan).
9696
// Check for the need to rotate one or more members
@@ -205,10 +205,7 @@ func createRemoveCleanedDBServersPlan(ctx context.Context,
205205
Str("id", m.ID).
206206
Str("role", api.ServerGroupDBServers.AsRole()).
207207
Msg("Creating dbserver replacement plan because server is cleanout in created phase")
208-
return api.Plan{
209-
api.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, m.ID),
210-
api.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, ""),
211-
}
208+
return cleanOutMember(api.ServerGroupDBServers, m)
212209
}
213210
}
214211

0 commit comments

Comments
 (0)