Skip to content

Commit 32ac512

Browse files
authored
Merge pull request #337 from arangodb/bug-fix/drain-node-cleanout
Schedule only one CleanOutServer job. Introduced new Drain phase.
2 parents 79dda1d + a64d92a commit 32ac512

File tree

6 files changed

+63
-7
lines changed

6 files changed

+63
-7
lines changed

pkg/apis/deployment/v1alpha/member_phase.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ const (
3434
MemberPhaseFailed MemberPhase = "Failed"
3535
// MemberPhaseCleanOut indicates that a dbserver is in the process of being cleaned out
3636
MemberPhaseCleanOut MemberPhase = "CleanOut"
37+
// MemberPhaseDrain indicates that a dbserver is in the process of being cleaned out as a result of draining a node
38+
MemberPhaseDrain MemberPhase = "Drain"
3739
// MemberPhaseShuttingDown indicates that a member is shutting down
3840
MemberPhaseShuttingDown MemberPhase = "ShuttingDown"
3941
// MemberPhaseRotating indicates that a member is being rotated
@@ -46,3 +48,8 @@ const (
4648
func (p MemberPhase) IsFailed() bool {
4749
return p == MemberPhaseFailed
4850
}
51+
52+
// IsCreatedOrDrain returns true when the given phase is MemberPhaseCreated or MemberPhaseDrain
53+
func (p MemberPhase) IsCreatedOrDrain() bool {
54+
return p == MemberPhaseCreated || p == MemberPhaseDrain
55+
}

pkg/deployment/reconcile/plan_builder.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
120120

121121
// Check for cleaned out dbserver in created state
122122
for _, m := range status.Members.DBServers {
123-
if len(plan) == 0 && m.Phase == api.MemberPhaseCreated && m.Conditions.IsTrue(api.ConditionTypeCleanedOut) {
123+
if len(plan) == 0 && m.Phase.IsCreatedOrDrain() && m.Conditions.IsTrue(api.ConditionTypeCleanedOut) {
124124
log.Debug().
125125
Str("id", m.ID).
126126
Str("role", api.ServerGroupDBServers.AsRole()).

pkg/deployment/resources/context.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"context"
2727

2828
driver "github.com/arangodb/go-driver"
29+
"github.com/arangodb/go-driver/agency"
2930
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
3031
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3132
"k8s.io/api/core/v1"
@@ -85,4 +86,6 @@ type Context interface {
8586
// GetDatabaseClient returns a cached client for the entire database (cluster coordinators or single server),
8687
// creating one if needed.
8788
GetDatabaseClient(ctx context.Context) (driver.Client, error)
89+
// GetAgency returns a connection to the entire agency.
90+
GetAgency(ctx context.Context) (agency.Agency, error)
8891
}

pkg/deployment/resources/pod_finalizers.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ func (r *Resources) inspectFinalizerPodDrainDBServer(ctx context.Context, log ze
123123
}
124124

125125
// If this DBServer is cleaned out, we need to remove the PVC.
126-
if memberStatus.Conditions.IsTrue(api.ConditionTypeCleanedOut) {
126+
if memberStatus.Conditions.IsTrue(api.ConditionTypeCleanedOut) || memberStatus.Phase == api.MemberPhaseDrain {
127127
pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace())
128128
if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
129129
log.Warn().Err(err).Msg("Failed to delete PVC for member")

pkg/deployment/resources/pod_inspector.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,9 +182,11 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
182182
for _, m := range members {
183183
if podName := m.PodName; podName != "" {
184184
if !podExists(podName) {
185+
log.Debug().Str("pod-name", podName).Msg("Does not exist")
185186
switch m.Phase {
186187
case api.MemberPhaseNone:
187188
// Do nothing
189+
log.Debug().Str("pod-name", podName).Msg("PodPhase is None, waiting for the pod to be recreated")
188190
case api.MemberPhaseShuttingDown, api.MemberPhaseRotating, api.MemberPhaseUpgrading, api.MemberPhaseFailed:
189191
// Shutdown was intended, so not need to do anything here.
190192
// Just mark terminated

pkg/deployment/resources/pod_termination.go

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,10 @@ import (
3131
"k8s.io/api/core/v1"
3232
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3333

34+
driver "github.com/arangodb/go-driver"
3435
"github.com/arangodb/go-driver/agency"
3536
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
37+
"github.com/arangodb/kube-arangodb/pkg/util/arangod"
3638
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3739
)
3840

@@ -187,13 +189,55 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
187189
// Not cleaned out yet, check member status
188190
if memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
189191
log.Warn().Msg("Member is already terminated before it could be cleaned out. Not good, but removing dbserver pod because we cannot do anything further")
192+
// At this point we have to set CleanedOut to true,
193+
// because we can no longer reason about the state in the agency and
194+
// bringing back the dbserver again may result in a cleaned-out server without us knowing
195+
memberStatus.Conditions.Update(api.ConditionTypeCleanedOut, true, "Draining server failed", "")
196+
memberStatus.CleanoutJobID = ""
197+
if memberStatus.Phase == api.MemberPhaseDrain {
198+
memberStatus.Phase = api.MemberPhaseCreated
199+
}
200+
if err := updateMember(memberStatus); err != nil {
201+
return maskAny(err)
202+
}
190203
return nil
191204
}
192-
// Ensure the cleanout is triggered
193-
log.Debug().Msg("Server is not yet cleaned out. Triggering a clean out now")
194-
if err := cluster.CleanOutServer(ctx, memberStatus.ID); err != nil {
195-
log.Debug().Err(err).Msg("Failed to clean out server")
196-
return maskAny(err)
205+
if memberStatus.Phase == api.MemberPhaseCreated {
206+
// No cleanout job triggered
207+
var jobID string
208+
ctx = driver.WithJobIDResponse(ctx, &jobID)
209+
// Ensure the cleanout is triggered
210+
log.Debug().Msg("Server is not yet clean out. Triggering a clean out now")
211+
if err := cluster.CleanOutServer(ctx, memberStatus.ID); err != nil {
212+
log.Debug().Err(err).Msg("Failed to clean out server")
213+
return maskAny(err)
214+
}
215+
memberStatus.CleanoutJobID = jobID
216+
memberStatus.Phase = api.MemberPhaseDrain
217+
if err := updateMember(memberStatus); err != nil {
218+
return maskAny(err)
219+
}
220+
} else if memberStatus.Phase == api.MemberPhaseDrain {
221+
// Check the job progress
222+
agency, err := r.context.GetAgency(ctx)
223+
if err != nil {
224+
log.Debug().Err(err).Msg("Failed to create agency client")
225+
return maskAny(err)
226+
}
227+
jobStatus, err := arangod.CleanoutServerJobStatus(ctx, memberStatus.CleanoutJobID, c, agency)
228+
if err != nil {
229+
log.Debug().Err(err).Msg("Failed to fetch cleanout job status")
230+
return maskAny(err)
231+
}
232+
if jobStatus.IsFailed() {
233+
log.Warn().Str("reason", jobStatus.Reason()).Msg("Cleanout Job failed. Aborting plan")
234+
// Revert cleanout state
235+
memberStatus.Phase = api.MemberPhaseCreated
236+
memberStatus.CleanoutJobID = ""
237+
return maskAny(fmt.Errorf("Clean out server job failed"))
238+
}
197239
}
240+
198241
return maskAny(fmt.Errorf("Server is not yet cleaned out"))
242+
199243
}

0 commit comments

Comments
 (0)