Skip to content

Commit a47d506

Browse files
committed
Various resilience fixes
1 parent adc3616 commit a47d506

File tree

7 files changed

+73
-60
lines changed

7 files changed

+73
-60
lines changed

pkg/apis/deployment/v1alpha/conditions.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ const (
4040
// ConditionTypeCleanedOut indicates that the member (dbserver) has been cleaned out.
4141
// Always check in combination with ConditionTypeTerminated.
4242
ConditionTypeCleanedOut ConditionType = "CleanedOut"
43+
// ConditionTypeAgentRecoveryNeeded indicates that the member (agent) will no
44+
// longer recover from its current volume and therefore has to be rebuilt
45+
// using the recovery procedure.
46+
ConditionTypeAgentRecoveryNeeded ConditionType = "AgentRecoveryNeeded"
4347
// ConditionTypePodSchedulingFailure indicates that one or more pods belonging to the deployment cannot be scheduled.
4448
ConditionTypePodSchedulingFailure ConditionType = "PodSchedulingFailure"
4549
// ConditionTypeSecretsChanged indicates that the value of one or more secrets used by

pkg/deployment/resources/pod_creator.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, memberID string,
606606
m.Phase = newPhase
607607
m.Conditions.Remove(api.ConditionTypeReady)
608608
m.Conditions.Remove(api.ConditionTypeTerminated)
609+
m.Conditions.Remove(api.ConditionTypeAgentRecoveryNeeded)
609610
m.Conditions.Remove(api.ConditionTypeAutoUpgrade)
610611
if err := status.Members.Update(m, group); err != nil {
611612
return maskAny(err)

pkg/deployment/resources/pod_finalizers.go

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu
4848
switch f {
4949
case constants.FinalizerPodAgencyServing:
5050
log.Debug().Msg("Inspecting agency-serving finalizer")
51-
if err := r.inspectFinalizerPodAgencyServing(ctx, log, p, memberStatus); err == nil {
51+
if err := r.inspectFinalizerPodAgencyServing(ctx, log, p, memberStatus, updateMember); err == nil {
5252
removalList = append(removalList, f)
5353
} else {
5454
log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove finalizer yet")
@@ -80,18 +80,26 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu
8080

8181
// inspectFinalizerPodAgencyServing checks the finalizer condition for agency-serving.
8282
// It returns nil if the finalizer can be removed.
83-
func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus) error {
84-
if err := r.prepareAgencyPodTermination(ctx, log, p, memberStatus); err != nil {
83+
func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
84+
if err := r.prepareAgencyPodTermination(ctx, log, p, memberStatus, func(update api.MemberStatus) error {
85+
if err := updateMember(update); err != nil {
86+
return maskAny(err)
87+
}
88+
memberStatus = update
89+
return nil
90+
}); err != nil {
8591
// Pod cannot be terminated yet
8692
return maskAny(err)
8793
}
8894

89-
// Remaining agents are healthy, we can remove this one and trigger a delete of the PVC
90-
pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace())
91-
if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
92-
log.Warn().Err(err).Msg("Failed to delete PVC for member")
93-
return maskAny(err)
94-
} else {
95+
// Remaining agents are healthy. If we need to perform a complete recovery
96+
// of the agent, also remove the PVC
97+
if memberStatus.Conditions.IsTrue(api.ConditionTypeAgentRecoveryNeeded) {
98+
pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace())
99+
if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
100+
log.Warn().Err(err).Msg("Failed to delete PVC for member")
101+
return maskAny(err)
102+
}
95103
log.Debug().Str("pvc-name", memberStatus.PersistentVolumeClaimName).Msg("Removed PVC of member so agency can be completely replaced")
96104
}
97105

@@ -101,17 +109,24 @@ func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log ze
101109
// inspectFinalizerPodDrainDBServer checks the finalizer condition for drain-dbserver.
102110
// It returns nil if the finalizer can be removed.
103111
func (r *Resources) inspectFinalizerPodDrainDBServer(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
104-
if err := r.prepareDBServerPodTermination(ctx, log, p, memberStatus, updateMember); err != nil {
112+
if err := r.prepareDBServerPodTermination(ctx, log, p, memberStatus, func(update api.MemberStatus) error {
113+
if err := updateMember(update); err != nil {
114+
return maskAny(err)
115+
}
116+
memberStatus = update
117+
return nil
118+
}); err != nil {
105119
// Pod cannot be terminated yet
106120
return maskAny(err)
107121
}
108122

109-
// Remaining agents are healthy, we can remove this one and trigger a delete of the PVC
110-
pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace())
111-
if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
112-
log.Warn().Err(err).Msg("Failed to delete PVC for member")
113-
return maskAny(err)
114-
} else {
123+
// If this DBServer is cleaned out, we need to remove the PVC.
124+
if memberStatus.Conditions.IsTrue(api.ConditionTypeCleanedOut) {
125+
pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace())
126+
if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
127+
log.Warn().Err(err).Msg("Failed to delete PVC for member")
128+
return maskAny(err)
129+
}
115130
log.Debug().Str("pvc-name", memberStatus.PersistentVolumeClaimName).Msg("Removed PVC of member")
116131
}
117132

pkg/deployment/resources/pod_termination.go

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,26 +36,10 @@ import (
3636
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3737
)
3838

39-
// preparePodTermination checks if the given pod is allowed to terminate and if so,
40-
// prepares it for termination.
41-
// It returns nil if the pod is allowed to terminate yet, an error otherwise.
42-
func (r *Resources) preparePodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
43-
var err error
44-
switch group {
45-
case api.ServerGroupAgents:
46-
err = r.prepareAgencyPodTermination(ctx, log, p, memberStatus)
47-
case api.ServerGroupDBServers:
48-
err = r.prepareDBServerPodTermination(ctx, log, p, memberStatus, updateMember)
49-
default:
50-
err = nil
51-
}
52-
return maskAny(err)
53-
}
54-
5539
// prepareAgencyPodTermination checks if the given agency pod is allowed to terminate
5640
// and if so, prepares it for termination.
5741
// It returns nil if the pod is allowed to terminate, an error otherwise.
58-
func (r *Resources) prepareAgencyPodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus) error {
42+
func (r *Resources) prepareAgencyPodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
5943
// Inspect member phase
6044
if memberStatus.Phase.IsFailed() {
6145
log.Debug().Msg("Pod is already failed, safe to remove agency serving finalizer")
@@ -117,6 +101,14 @@ func (r *Resources) prepareAgencyPodTermination(ctx context.Context, log zerolog
117101
return maskAny(err)
118102
}
119103

104+
// Complete agent recovery is needed, since data is already gone or not accessible
105+
if memberStatus.Conditions.Update(api.ConditionTypeAgentRecoveryNeeded, true, "Data Gone", "") {
106+
if err := updateMember(memberStatus); err != nil {
107+
return maskAny(err)
108+
}
109+
}
110+
log.Debug().Msg("Agent is ready to be completely recovered.")
111+
120112
return nil
121113
}
122114

pkg/deployment/resources/pvc_finalizers.go

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -41,18 +41,17 @@ const (
4141
)
4242

4343
// runPVCFinalizers goes through the list of PVC finalizers to see if they can be removed.
44-
// Returns: Interval_till_next_inspection, error
45-
func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) (time.Duration, error) {
44+
func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) (time.Duration, error) {
4645
log := r.log.With().Str("pvc-name", p.GetName()).Logger()
4746
var removalList []string
4847
for _, f := range p.ObjectMeta.GetFinalizers() {
4948
switch f {
5049
case constants.FinalizerPVCMemberExists:
5150
log.Debug().Msg("Inspecting member exists finalizer")
52-
if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus, updateMember); err == nil {
51+
if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus); err == nil {
5352
removalList = append(removalList, f)
5453
} else {
55-
log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove PVC finalizer yet")
54+
log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove finalizer yet")
5655
}
5756
}
5857
}
@@ -63,8 +62,6 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume
6362
if err := k8sutil.RemovePVCFinalizers(log, kubecli, p, removalList, ignoreNotFound); err != nil {
6463
log.Debug().Err(err).Msg("Failed to update PVC (to remove finalizers)")
6564
return 0, maskAny(err)
66-
} else {
67-
log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from PVC")
6865
}
6966
} else {
7067
// Check again at given interval
@@ -75,7 +72,7 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume
7572

7673
// inspectFinalizerPVCMemberExists checks the finalizer condition for member-exists.
7774
// It returns nil if the finalizer can be removed.
78-
func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zerolog.Logger, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
75+
func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zerolog.Logger, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) error {
7976
// Inspect member phase
8077
if memberStatus.Phase.IsFailed() {
8178
log.Debug().Msg("Member is already failed, safe to remove member-exists finalizer")
@@ -102,22 +99,10 @@ func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zer
10299
}
103100
}
104101

105-
// Member still exists, let's trigger a delete of it, if we're allowed to do so
102+
// Member still exists, let's trigger a delete of it
106103
if memberStatus.PodName != "" {
107-
pods := r.context.GetKubeCli().CoreV1().Pods(apiObject.GetNamespace())
108-
log.Info().Msg("Checking in Pod of member can be removed, because PVC is being removed")
109-
if pod, err := pods.Get(memberStatus.PodName, metav1.GetOptions{}); err != nil && !k8sutil.IsNotFound(err) {
110-
log.Debug().Err(err).Msg("Failed to get pod for PVC")
111-
return maskAny(err)
112-
} else if err == nil {
113-
// We've got the pod, check & prepare its termination
114-
if err := r.preparePodTermination(ctx, log, pod, group, memberStatus, updateMember); err != nil {
115-
log.Debug().Err(err).Msg("Not allowed to remove pod yet")
116-
return maskAny(err)
117-
}
118-
}
119-
120104
log.Info().Msg("Removing Pod of member, because PVC is being removed")
105+
pods := r.context.GetKubeCli().CoreV1().Pods(apiObject.GetNamespace())
121106
if err := pods.Delete(memberStatus.PodName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
122107
log.Debug().Err(err).Msg("Failed to delete pod")
123108
return maskAny(err)

pkg/deployment/resources/pvc_inspector.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ import (
2626
"context"
2727
"time"
2828

29-
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
3029
"github.com/arangodb/kube-arangodb/pkg/metrics"
3130
"github.com/arangodb/kube-arangodb/pkg/util"
3231
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
@@ -76,11 +75,7 @@ func (r *Resources) InspectPVCs(ctx context.Context) (time.Duration, error) {
7675
updateMemberStatusNeeded := false
7776
if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(&p) {
7877
// Process finalizers
79-
if x, err := r.runPVCFinalizers(ctx, &p, group, memberStatus, func(m api.MemberStatus) error {
80-
updateMemberStatusNeeded = true
81-
memberStatus = m
82-
return nil
83-
}); err != nil {
78+
if x, err := r.runPVCFinalizers(ctx, &p, group, memberStatus); err != nil {
8479
// Only log here, since we'll be called to try again.
8580
log.Warn().Err(err).Msg("Failed to run PVC finalizers")
8681
} else {

tests/resilience_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232

3333
"github.com/dchest/uniuri"
3434
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
35+
"k8s.io/apimachinery/pkg/types"
3536

3637
driver "github.com/arangodb/go-driver"
3738
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
@@ -92,6 +93,17 @@ func TestResiliencePod(t *testing.T) {
9293
if err != nil {
9394
t.Fatalf("Failed to get pod %s: %v", m.PodName, err)
9495
}
96+
// Get current PVC so we can compare UID later
97+
var originalPVCUID types.UID
98+
if m.PersistentVolumeClaimName != "" {
99+
originalPVC, err := kubecli.CoreV1().PersistentVolumeClaims(ns).Get(m.PersistentVolumeClaimName, metav1.GetOptions{})
100+
if err != nil {
101+
t.Fatalf("Failed to get PVC %s: %v", m.PersistentVolumeClaimName, err)
102+
} else {
103+
originalPVCUID = originalPVC.GetUID()
104+
}
105+
}
106+
// Now delete the pod
95107
if err := kubecli.CoreV1().Pods(ns).Delete(m.PodName, &metav1.DeleteOptions{}); err != nil {
96108
t.Fatalf("Failed to delete pod %s: %v", m.PodName, err)
97109
}
@@ -109,6 +121,15 @@ func TestResiliencePod(t *testing.T) {
109121
if err := retry.Retry(op, time.Minute); err != nil {
110122
t.Fatalf("Pod did not restart: %v", err)
111123
}
124+
// Now that the Pod has been replaced, check that the PVC has NOT been replaced (if any)
125+
if m.PersistentVolumeClaimName != "" {
126+
pvc, err := kubecli.CoreV1().PersistentVolumeClaims(ns).Get(m.PersistentVolumeClaimName, metav1.GetOptions{})
127+
if err != nil {
128+
t.Fatalf("Failed to get PVC %s: %v", m.PersistentVolumeClaimName, err)
129+
} else if originalPVCUID != pvc.GetUID() {
130+
t.Errorf("PVC for member %s has been replaced", m.ID)
131+
}
132+
}
112133
// Wait for deployment to be ready
113134
if _, err = waitUntilDeployment(c, depl.GetName(), ns, deploymentIsReady()); err != nil {
114135
t.Fatalf("Deployment not running in time: %v", err)

0 commit comments

Comments
 (0)