Skip to content

Commit 85918bd

Browse files
committed
Preparing pods for termination when PVC is deleted
1 parent 95dd05c commit 85918bd

File tree

6 files changed

+260
-71
lines changed

6 files changed

+260
-71
lines changed

pkg/deployment/reconcile/plan_builder.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
101101
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
102102
for _, m := range members {
103103
if m.Phase == api.MemberPhaseFailed && len(plan) == 0 {
104+
log.Debug().
105+
Str("id", m.ID).
106+
Str("role", group.AsRole()).
107+
Msg("Creating member replacement plan because member has failed")
104108
newID := ""
105109
if group == api.ServerGroupAgents {
106110
newID = m.ID // Agents cannot (yet) be replaced with new IDs
@@ -117,6 +121,10 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
117121
// Check for cleaned out dbserver in created state
118122
for _, m := range status.Members.DBServers {
119123
if len(plan) == 0 && m.Phase == api.MemberPhaseCreated && m.Conditions.IsTrue(api.ConditionTypeCleanedOut) {
124+
log.Debug().
125+
Str("id", m.ID).
126+
Str("role", api.ServerGroupDBServers.AsRole()).
127+
Msg("Creating dbserver replacement plan because server is cleanout in created phase")
120128
plan = append(plan,
121129
api.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, m.ID),
122130
api.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, ""),

pkg/deployment/resources/pod_finalizers.go

Lines changed: 3 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ import (
3131
"k8s.io/api/core/v1"
3232
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3333

34-
"github.com/arangodb/go-driver/agency"
3534
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
3635
"github.com/arangodb/kube-arangodb/pkg/util/constants"
3736
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
@@ -84,68 +83,13 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu
8483
// inspectFinalizerPodAgencyServing checks the finalizer condition for agency-serving.
8584
// It returns nil if the finalizer can be removed.
8685
func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus) error {
87-
// Inspect member phase
88-
if memberStatus.Phase.IsFailed() {
89-
log.Debug().Msg("Pod is already failed, safe to remove agency serving finalizer")
90-
return nil
91-
}
92-
// Inspect deployment deletion state
93-
apiObject := r.context.GetAPIObject()
94-
if apiObject.GetDeletionTimestamp() != nil {
95-
log.Debug().Msg("Entire deployment is being deleted, safe to remove agency serving finalizer")
96-
return nil
97-
}
98-
99-
// Check node the pod is scheduled on
100-
agentDataWillBeGone := false
101-
if p.Spec.NodeName != "" {
102-
node, err := r.context.GetKubeCli().CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{})
103-
if err != nil {
104-
log.Warn().Err(err).Msg("Failed to get node for member")
105-
return maskAny(err)
106-
}
107-
if node.Spec.Unschedulable {
108-
agentDataWillBeGone = true
109-
}
110-
}
111-
112-
// Check PVC
113-
pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(apiObject.GetNamespace())
114-
pvc, err := pvcs.Get(memberStatus.PersistentVolumeClaimName, metav1.GetOptions{})
115-
if err != nil {
116-
log.Warn().Err(err).Msg("Failed to get PVC for member")
117-
return maskAny(err)
118-
}
119-
if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(pvc) {
120-
agentDataWillBeGone = true
121-
}
122-
123-
// Is this a simple pod restart?
124-
if !agentDataWillBeGone {
125-
log.Debug().Msg("Pod is just being restarted, safe to remove agency serving finalizer")
126-
return nil
127-
}
128-
129-
// Inspect agency state
130-
log.Debug().Msg("Agent data will be gone, so we will check agency serving status first")
131-
ctx = agency.WithAllowNoLeader(ctx) // The ID we're checking may be the leader, so ignore situations where all other agents are followers
132-
ctx, cancel := context.WithTimeout(ctx, time.Second*15) // Force a quick check
133-
defer cancel()
134-
agencyConns, err := r.context.GetAgencyClients(ctx, func(id string) bool { return id != memberStatus.ID })
135-
if err != nil {
136-
log.Debug().Err(err).Msg("Failed to create member client")
137-
return maskAny(err)
138-
}
139-
if len(agencyConns) == 0 {
140-
log.Debug().Err(err).Msg("No more remaining agents, we cannot delete this one")
141-
return maskAny(fmt.Errorf("No more remaining agents"))
142-
}
143-
if err := agency.AreAgentsHealthy(ctx, agencyConns); err != nil {
144-
log.Debug().Err(err).Msg("Remaining agents are not healthy")
86+
if err := r.prepareAgencyPodTermination(ctx, log, p, memberStatus); err != nil {
87+
// Pod cannot be terminated yet
14588
return maskAny(err)
14689
}
14790

14891
// Remaining agents are healthy, we can remove this one and trigger a delete of the PVC
92+
pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace())
14993
if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
15094
log.Warn().Err(err).Msg("Failed to delete PVC for member")
15195
return maskAny(err)
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package resources
24+
25+
import (
26+
"context"
27+
"fmt"
28+
"time"
29+
30+
"github.com/rs/zerolog"
31+
"k8s.io/api/core/v1"
32+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
33+
34+
"github.com/arangodb/go-driver/agency"
35+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
36+
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
37+
)
38+
39+
// preparePodTermination checks if the given pod is allowed to terminate and if so,
40+
// prepares it for termination.
41+
// It returns nil if the pod is allowed to terminate yet, an error otherwise.
42+
func (r *Resources) preparePodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
43+
var err error
44+
switch group {
45+
case api.ServerGroupAgents:
46+
err = r.prepareAgencyPodTermination(ctx, log, p, memberStatus)
47+
case api.ServerGroupDBServers:
48+
err = r.prepareDBServerPodTermination(ctx, log, p, memberStatus, updateMember)
49+
default:
50+
err = nil
51+
}
52+
return maskAny(err)
53+
}
54+
55+
// prepareAgencyPodTermination checks if the given agency pod is allowed to terminate
56+
// and if so, prepares it for termination.
57+
// It returns nil if the pod is allowed to terminate, an error otherwise.
58+
func (r *Resources) prepareAgencyPodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus) error {
59+
// Inspect member phase
60+
if memberStatus.Phase.IsFailed() {
61+
log.Debug().Msg("Pod is already failed, safe to remove agency serving finalizer")
62+
return nil
63+
}
64+
// Inspect deployment deletion state
65+
apiObject := r.context.GetAPIObject()
66+
if apiObject.GetDeletionTimestamp() != nil {
67+
log.Debug().Msg("Entire deployment is being deleted, safe to remove agency serving finalizer")
68+
return nil
69+
}
70+
71+
// Check node the pod is scheduled on
72+
agentDataWillBeGone := false
73+
if p.Spec.NodeName != "" {
74+
node, err := r.context.GetKubeCli().CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{})
75+
if err != nil {
76+
log.Warn().Err(err).Msg("Failed to get node for member")
77+
return maskAny(err)
78+
}
79+
if node.Spec.Unschedulable {
80+
agentDataWillBeGone = true
81+
}
82+
}
83+
84+
// Check PVC
85+
pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(apiObject.GetNamespace())
86+
pvc, err := pvcs.Get(memberStatus.PersistentVolumeClaimName, metav1.GetOptions{})
87+
if err != nil {
88+
log.Warn().Err(err).Msg("Failed to get PVC for member")
89+
return maskAny(err)
90+
}
91+
if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(pvc) {
92+
agentDataWillBeGone = true
93+
}
94+
95+
// Is this a simple pod restart?
96+
if !agentDataWillBeGone {
97+
log.Debug().Msg("Pod is just being restarted, safe to terminate agency pod")
98+
return nil
99+
}
100+
101+
// Inspect agency state
102+
log.Debug().Msg("Agent data will be gone, so we will check agency serving status first")
103+
ctx = agency.WithAllowNoLeader(ctx) // The ID we're checking may be the leader, so ignore situations where all other agents are followers
104+
ctx, cancel := context.WithTimeout(ctx, time.Second*15) // Force a quick check
105+
defer cancel()
106+
agencyConns, err := r.context.GetAgencyClients(ctx, func(id string) bool { return id != memberStatus.ID })
107+
if err != nil {
108+
log.Debug().Err(err).Msg("Failed to create member client")
109+
return maskAny(err)
110+
}
111+
if len(agencyConns) == 0 {
112+
log.Debug().Err(err).Msg("No more remaining agents, we cannot delete this one")
113+
return maskAny(fmt.Errorf("No more remaining agents"))
114+
}
115+
if err := agency.AreAgentsHealthy(ctx, agencyConns); err != nil {
116+
log.Debug().Err(err).Msg("Remaining agents are not healthy")
117+
return maskAny(err)
118+
}
119+
120+
return nil
121+
}
122+
123+
// prepareDBServerPodTermination checks if the given dbserver pod is allowed to terminate
124+
// and if so, prepares it for termination.
125+
// It returns nil if the pod is allowed to terminate, an error otherwise.
126+
func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
127+
// Inspect member phase
128+
if memberStatus.Phase.IsFailed() {
129+
log.Debug().Msg("Pod is already failed, safe to remove dbserver pod")
130+
return nil
131+
}
132+
// Inspect deployment deletion state
133+
apiObject := r.context.GetAPIObject()
134+
if apiObject.GetDeletionTimestamp() != nil {
135+
log.Debug().Msg("Entire deployment is being deleted, safe to remove dbserver pod")
136+
return nil
137+
}
138+
139+
// Check node the pod is scheduled on
140+
dbserverDataWillBeGone := false
141+
if p.Spec.NodeName != "" {
142+
node, err := r.context.GetKubeCli().CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{})
143+
if err != nil {
144+
log.Warn().Err(err).Msg("Failed to get node for member")
145+
return maskAny(err)
146+
}
147+
if node.Spec.Unschedulable {
148+
dbserverDataWillBeGone = true
149+
}
150+
}
151+
152+
// Check PVC
153+
pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(apiObject.GetNamespace())
154+
pvc, err := pvcs.Get(memberStatus.PersistentVolumeClaimName, metav1.GetOptions{})
155+
if err != nil {
156+
log.Warn().Err(err).Msg("Failed to get PVC for member")
157+
return maskAny(err)
158+
}
159+
if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(pvc) {
160+
dbserverDataWillBeGone = true
161+
}
162+
163+
// Is this a simple pod restart?
164+
if !dbserverDataWillBeGone {
165+
log.Debug().Msg("Pod is just being restarted, safe to remove dbserver pod")
166+
return nil
167+
}
168+
169+
// Inspect cleaned out state
170+
log.Debug().Msg("DBServer data is being deleted, so we will cleanout the dbserver first")
171+
c, err := r.context.GetDatabaseClient(ctx)
172+
if err != nil {
173+
log.Debug().Err(err).Msg("Failed to create member client")
174+
return maskAny(err)
175+
}
176+
cluster, err := c.Cluster(ctx)
177+
if err != nil {
178+
log.Debug().Err(err).Msg("Failed to access cluster")
179+
return maskAny(err)
180+
}
181+
cleanedOut, err := cluster.IsCleanedOut(ctx, memberStatus.ID)
182+
if err != nil {
183+
return maskAny(err)
184+
}
185+
if cleanedOut {
186+
// Cleanout completed
187+
if memberStatus.Conditions.Update(api.ConditionTypeCleanedOut, true, "CleanedOut", "") {
188+
if err := updateMember(memberStatus); err != nil {
189+
return maskAny(err)
190+
}
191+
}
192+
// Trigger PVC removal
193+
if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil {
194+
log.Warn().Err(err).Msg("Failed to delete PVC for member")
195+
return maskAny(err)
196+
}
197+
198+
log.Debug().Msg("Server is cleaned out. Save to remove drain dbserver finalizer")
199+
return nil
200+
}
201+
// Not cleaned out yet, check member status
202+
if memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
203+
log.Warn().Msg("Member is already terminated before it could be cleaned out. Not good, but removing dbserver pod because we cannot do anything further")
204+
return nil
205+
}
206+
// Ensure the cleanout is triggered
207+
log.Debug().Msg("Server is not yet clean out. Triggering a clean out now")
208+
if err := cluster.CleanOutServer(ctx, memberStatus.ID); err != nil {
209+
log.Debug().Err(err).Msg("Failed to clean out server")
210+
return maskAny(err)
211+
}
212+
return maskAny(fmt.Errorf("Server is not yet cleaned out"))
213+
}

pkg/deployment/resources/pvc_finalizers.go

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,14 @@ import (
3636
)
3737

3838
// runPVCFinalizers goes through the list of PVC finalizers to see if they can be removed.
39-
func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) error {
39+
func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
4040
log := r.log.With().Str("pvc-name", p.GetName()).Logger()
4141
var removalList []string
4242
for _, f := range p.ObjectMeta.GetFinalizers() {
4343
switch f {
4444
case constants.FinalizerPVCMemberExists:
4545
log.Debug().Msg("Inspecting member exists finalizer")
46-
if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus); err == nil {
46+
if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus, updateMember); err == nil {
4747
removalList = append(removalList, f)
4848
} else {
4949
log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove PVC finalizer yet")
@@ -66,7 +66,7 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume
6666

6767
// inspectFinalizerPVCMemberExists checks the finalizer condition for member-exists.
6868
// It returns nil if the finalizer can be removed.
69-
func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zerolog.Logger, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) error {
69+
func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zerolog.Logger, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
7070
// Inspect member phase
7171
if memberStatus.Phase.IsFailed() {
7272
log.Debug().Msg("Member is already failed, safe to remove member-exists finalizer")
@@ -93,10 +93,22 @@ func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zer
9393
}
9494
}
9595

96-
// Member still exists, let's trigger a delete of it
96+
// Member still exists, let's trigger a delete of it, if we're allowed to do so
9797
if memberStatus.PodName != "" {
98-
log.Info().Msg("Removing Pod of member, because PVC is being removed")
9998
pods := r.context.GetKubeCli().CoreV1().Pods(apiObject.GetNamespace())
99+
log.Info().Msg("Checking in Pod of member can be removed, because PVC is being removed")
100+
if pod, err := pods.Get(memberStatus.PodName, metav1.GetOptions{}); err != nil && !k8sutil.IsNotFound(err) {
101+
log.Debug().Err(err).Msg("Failed to get pod for PVC")
102+
return maskAny(err)
103+
} else if err == nil {
104+
// We've got the pod, check & prepare its termination
105+
if err := r.preparePodTermination(ctx, log, pod, group, memberStatus, updateMember); err != nil {
106+
log.Debug().Err(err).Msg("Not allowed to remove pod yet")
107+
return maskAny(err)
108+
}
109+
}
110+
111+
log.Info().Msg("Removing Pod of member, because PVC is being removed")
100112
if err := pods.Delete(memberStatus.PodName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
101113
log.Debug().Err(err).Msg("Failed to delete pod")
102114
return maskAny(err)

pkg/deployment/resources/pvc_inspector.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ package resources
2525
import (
2626
"context"
2727

28+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
2829
"github.com/arangodb/kube-arangodb/pkg/metrics"
2930
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3031
)
@@ -68,13 +69,23 @@ func (r *Resources) InspectPVCs(ctx context.Context) error {
6869
continue
6970
}
7071

72+
updateMemberStatusNeeded := false
7173
if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(&p) {
7274
// Process finalizers
73-
if err := r.runPVCFinalizers(ctx, &p, group, memberStatus); err != nil {
75+
if err := r.runPVCFinalizers(ctx, &p, group, memberStatus, func(m api.MemberStatus) error {
76+
updateMemberStatusNeeded = true
77+
memberStatus = m
78+
return nil
79+
}); err != nil {
7480
// Only log here, since we'll be called to try again.
7581
log.Warn().Err(err).Msg("Failed to run PVC finalizers")
7682
}
7783
}
84+
if updateMemberStatusNeeded {
85+
if err := status.Members.Update(memberStatus, group); err != nil {
86+
return maskAny(err)
87+
}
88+
}
7889
}
7990

8091
return nil

0 commit comments

Comments
 (0)