Skip to content

Commit e86c907

Browse files
authored
Merge pull request #342 from arangodb/bug-fix/remove-server
Try harder to remove server from cluster.
2 parents edf5d2f + f963832 commit e86c907

File tree

2 files changed

+33
-2
lines changed

2 files changed

+33
-2
lines changed

pkg/deployment/reconcile/action_remove_member.go

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,22 @@ func (a *actionRemoveMember) Start(ctx context.Context) (bool, error) {
7070
if err := arangod.RemoveServerFromCluster(ctx, client.Connection(), driver.ServerID(m.ID)); err != nil {
7171
if !driver.IsNotFound(err) && !driver.IsPreconditionFailed(err) {
7272
return false, maskAny(errors.Wrapf(err, "Failed to remove server from cluster: %#v", err))
73+
} else if driver.IsPreconditionFailed(err) {
74+
cluster, err := client.Cluster(ctx)
75+
if err != nil {
76+
return false, maskAny(errors.Wrapf(err, "Failed to obtain cluster: %#v", err))
77+
}
78+
health, err := cluster.Health(ctx)
79+
if err != nil {
80+
return false, maskAny(errors.Wrapf(err, "Failed to obtain cluster health: %#v", err))
81+
}
82+
// We don't care if not found
83+
if record, ok := health.Health[driver.ServerID(m.ID)]; ok {
84+
if record.Status != driver.ServerStatusFailed {
85+
return false, maskAny(fmt.Errorf("can not remove server from cluster. Not yet terminated. Retry later"))
86+
}
87+
a.log.Warn().Msg("dbserver is failed but still in use")
88+
}
7389
} else {
7490
a.log.Warn().Msgf("ignoring error: %s", err.Error())
7591
}

pkg/deployment/resources/pod_termination.go

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,12 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
123123
log.Debug().Msg("Pod is already failed, safe to remove dbserver pod")
124124
return nil
125125
}
126+
// If pod is not member of cluster, do nothing
127+
if !memberStatus.Conditions.IsTrue(api.ConditionTypeMemberOfCluster) {
128+
log.Debug().Msg("Pod is not member of cluster")
129+
return nil
130+
}
131+
126132
// Inspect deployment deletion state
127133
apiObject := r.context.GetAPIObject()
128134
if apiObject.GetDeletionTimestamp() != nil {
@@ -154,6 +160,11 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
154160
dbserverDataWillBeGone = true
155161
}
156162

163+
// Once decided to drain the member, never go back
164+
if memberStatus.Phase == api.MemberPhaseCreated {
165+
dbserverDataWillBeGone = true
166+
}
167+
157168
// Is this a simple pod restart?
158169
if !dbserverDataWillBeGone {
159170
log.Debug().Msg("Pod is just being restarted, safe to remove dbserver pod")
@@ -230,11 +241,15 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
230241
return maskAny(err)
231242
}
232243
if jobStatus.IsFailed() {
233-
log.Warn().Str("reason", jobStatus.Reason()).Msg("Cleanout Job failed. Aborting plan")
244+
log.Warn().Str("reason", jobStatus.Reason()).Msg("Cleanout Job failed")
234245
// Revert cleanout state
235246
memberStatus.Phase = api.MemberPhaseCreated
236247
memberStatus.CleanoutJobID = ""
237-
return maskAny(fmt.Errorf("Clean out server job failed"))
248+
if err := updateMember(memberStatus); err != nil {
249+
return maskAny(err)
250+
}
251+
log.Error().Msg("Cleanout server job failed, continue anyway")
252+
return nil
238253
}
239254
}
240255

0 commit comments

Comments
 (0)