Skip to content

Commit 4d8d37d

Browse files
author
lamai93
committed
Schedule only one CleanOutServer job. Introduced new Drain phase.
1 parent eed8ea1 commit 4d8d37d

File tree

5 files changed

+50
-7
lines changed

5 files changed

+50
-7
lines changed

pkg/apis/deployment/v1alpha/member_phase.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ const (
3434
MemberPhaseFailed MemberPhase = "Failed"
3535
// MemberPhaseCleanOut indicates that a dbserver is in the process of being cleaned out
3636
MemberPhaseCleanOut MemberPhase = "CleanOut"
37+
// MemberPhaseDrain indicates that a dbserver is in the process of being cleaned out as a result of draining a node
38+
MemberPhaseDrain MemberPhase = "Drain"
3739
// MemberPhaseShuttingDown indicates that a member is shutting down
3840
MemberPhaseShuttingDown MemberPhase = "ShuttingDown"
3941
// MemberPhaseRotating indicates that a member is being rotated

pkg/deployment/reconcile/plan_builder.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
120120

121121
// Check for cleaned out dbserver in created state
122122
for _, m := range status.Members.DBServers {
123-
if len(plan) == 0 && m.Phase == api.MemberPhaseCreated && m.Conditions.IsTrue(api.ConditionTypeCleanedOut) {
123+
if len(plan) == 0 && (m.Phase == api.MemberPhaseCreated || m.Phase == api.MemberPhaseDrain) && m.Conditions.IsTrue(api.ConditionTypeCleanedOut) {
124124
log.Debug().
125125
Str("id", m.ID).
126126
Str("role", api.ServerGroupDBServers.AsRole()).

pkg/deployment/resources/context.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"context"
2727

2828
driver "github.com/arangodb/go-driver"
29+
"github.com/arangodb/go-driver/agency"
2930
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
3031
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3132
"k8s.io/api/core/v1"
@@ -85,4 +86,6 @@ type Context interface {
8586
// GetDatabaseClient returns a cached client for the entire database (cluster coordinators or single server),
8687
// creating one if needed.
8788
GetDatabaseClient(ctx context.Context) (driver.Client, error)
89+
// GetAgency returns a connection to the entire agency.
90+
GetAgency(ctx context.Context) (agency.Agency, error)
8891
}

pkg/deployment/resources/pod_inspector.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,9 +182,11 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
182182
for _, m := range members {
183183
if podName := m.PodName; podName != "" {
184184
if !podExists(podName) {
185+
log.Debug().Str("pod-name", podName).Msg("Does not exist")
185186
switch m.Phase {
186187
case api.MemberPhaseNone:
187188
// Do nothing
189+
log.Debug().Str("pod-name", podName).Msg("PodPhase is None, waiting for the pod to be recreated")
188190
case api.MemberPhaseShuttingDown, api.MemberPhaseRotating, api.MemberPhaseUpgrading, api.MemberPhaseFailed:
189191
// Shutdown was intended, so no need to do anything here.
190192
// Just mark terminated

pkg/deployment/resources/pod_termination.go

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,10 @@ import (
3131
"k8s.io/api/core/v1"
3232
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3333

34+
driver "github.com/arangodb/go-driver"
3435
"github.com/arangodb/go-driver/agency"
3536
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
37+
"github.com/arangodb/kube-arangodb/pkg/util/arangod"
3638
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3739
)
3840

@@ -185,15 +187,49 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
185187
return nil
186188
}
187189
// Not cleaned out yet, check member status
188-
if memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
190+
if memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) || (memberStatus.Phase == api.MemberPhaseCreated && !k8sutil.IsPodReady(p)) {
189191
log.Warn().Msg("Member is already terminated before it could be cleaned out. Not good, but removing dbserver pod because we cannot do anything further")
190192
return nil
191193
}
192-
// Ensure the cleanout is triggered
193-
log.Debug().Msg("Server is not yet clean out. Triggering a clean out now")
194-
if err := cluster.CleanOutServer(ctx, memberStatus.ID); err != nil {
195-
log.Debug().Err(err).Msg("Failed to clean out server")
196-
return maskAny(err)
194+
if memberStatus.Phase == api.MemberPhaseCreated {
195+
// No cleanout job triggered
196+
var jobID string
197+
ctx = driver.WithJobIDResponse(ctx, &jobID)
198+
// Ensure the cleanout is triggered
199+
log.Debug().Msg("Server is not yet clean out. Triggering a clean out now")
200+
if err := cluster.CleanOutServer(ctx, memberStatus.ID); err != nil {
201+
log.Debug().Err(err).Msg("Failed to clean out server")
202+
return maskAny(err)
203+
}
204+
memberStatus.CleanoutJobID = jobID
205+
memberStatus.Phase = api.MemberPhaseDrain
206+
if err := updateMember(memberStatus); err != nil {
207+
return maskAny(err)
208+
}
209+
} else if memberStatus.Phase == api.MemberPhaseDrain {
210+
// Check the job progress
211+
agency, err := r.context.GetAgency(ctx)
212+
if err != nil {
213+
log.Debug().Err(err).Msg("Failed to create agency client")
214+
return maskAny(err)
215+
}
216+
jobStatus, err := arangod.CleanoutServerJobStatus(ctx, memberStatus.CleanoutJobID, c, agency)
217+
if err != nil {
218+
log.Debug().Err(err).Msg("Failed to fetch cleanout job status")
219+
return maskAny(err)
220+
}
221+
if jobStatus.IsFailed() {
222+
log.Warn().Str("reason", jobStatus.Reason()).Msg("Cleanout Job failed. Aborting plan")
223+
// Revert cleanout state
224+
memberStatus.Phase = api.MemberPhaseCreated
225+
memberStatus.CleanoutJobID = ""
226+
return maskAny(fmt.Errorf("Clean out server job failed"))
227+
}
228+
} else {
229+
log.Warn().Msgf("Unexpected MemberPhase %s, allow removal", memberStatus.Phase)
230+
return nil
197231
}
232+
198233
return maskAny(fmt.Errorf("Server is not yet cleaned out"))
234+
199235
}

0 commit comments

Comments
 (0)