Skip to content

Commit 265397d

Browse files
authored
Merge pull request #153 from arangodb/bugfix/cleanup-long-terminating-stateful-pods
Cleanup long terminating stateful pods
2 parents 4a2f34b + 12ae458 commit 265397d

File tree

5 files changed

+146
-1
lines changed

5 files changed

+146
-1
lines changed

pkg/apis/deployment/v1alpha/conditions.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ const (
4646
// the deployment have changed. Once that is the case, the operator will no longer
4747
// touch the deployment, until the original secrets have been restored.
4848
ConditionTypeSecretsChanged ConditionType = "SecretsChanged"
49+
// ConditionTypeMemberOfCluster indicates that the member is a known member of the ArangoDB cluster.
50+
ConditionTypeMemberOfCluster ConditionType = "MemberOfCluster"
4951
)
5052

5153
// Condition represents one current condition of a deployment or deployment member.

pkg/apis/deployment/v1alpha/member_status.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ type MemberStatus struct {
5252
IsInitialized bool `json:"initialized"`
5353
}
5454

55+
// Age returns the duration since the creation timestamp of this member.
56+
func (s MemberStatus) Age() time.Duration {
57+
return time.Since(s.CreatedAt.Time)
58+
}
59+
5560
// RemoveTerminationsBefore removes all recent terminations before the given timestamp.
5661
// It returns the number of terminations that have been removed.
5762
func (s *MemberStatus) RemoveTerminationsBefore(timestamp time.Time) int {

pkg/deployment/deployment_inspector.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,14 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
135135
d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject))
136136
}
137137

138+
// Inspect deployment for obsolete members
139+
if err := d.resources.CleanupRemovedMembers(); err != nil {
140+
hasError = true
141+
d.CreateEvent(k8sutil.NewErrorEvent("Removed member cleanup failed", err, d.apiObject))
142+
}
143+
138144
// At the end of the inspect, we cleanup terminated pods.
139-
if d.resources.CleanupTerminatedPods(); err != nil {
145+
if err := d.resources.CleanupTerminatedPods(); err != nil {
140146
hasError = true
141147
d.CreateEvent(k8sutil.NewErrorEvent("Pod cleanup failed", err, d.apiObject))
142148
}

pkg/deployment/resources/context.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,12 @@ type Context interface {
7272
// CleanupPod deletes a given pod with force and explicit UID.
7373
// If the pod does not exist, the error is ignored.
7474
CleanupPod(p v1.Pod) error
75+
// DeletePod deletes a pod with given name in the namespace
76+
// of the deployment. If the pod does not exist, the error is ignored.
77+
DeletePod(podName string) error
78+
// DeletePvc deletes a persistent volume claim with given name in the namespace
79+
// of the deployment. If the pvc does not exist, the error is ignored.
80+
DeletePvc(pvcName string) error
7581
// GetAgencyClients returns a client connection for every agency member.
7682
GetAgencyClients(ctx context.Context, predicate func(memberID string) bool) ([]driver.Connection, error)
7783
// GetDatabaseClient returns a cached client for the entire database (cluster coordinators or single server),
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package resources
24+
25+
import (
26+
"context"
27+
"time"
28+
29+
driver "github.com/arangodb/go-driver"
30+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
31+
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
32+
)
33+
34+
const (
35+
// minMemberAge is the minimum duration we expect a member to be created before we remove it because
36+
// it is not part of a deployment.
37+
minMemberAge = time.Minute * 10
38+
)
39+
40+
// CleanupRemovedMembers removes all arangod members that are no longer part of ArangoDB deployment.
41+
func (r *Resources) CleanupRemovedMembers() error {
42+
// Decide what to do depending on cluster mode
43+
switch r.context.GetSpec().GetMode() {
44+
case api.DeploymentModeCluster:
45+
if err := r.cleanupRemovedClusterMembers(); err != nil {
46+
return maskAny(err)
47+
}
48+
return nil
49+
default:
50+
// Other mode have no concept of cluster in which members can be removed
51+
return nil
52+
}
53+
}
54+
55+
// cleanupRemovedClusterMembers removes all arangod members that are no longer part of the cluster.
56+
func (r *Resources) cleanupRemovedClusterMembers() error {
57+
log := r.log
58+
ctx := context.Background()
59+
60+
// Ask cluster for its health
61+
client, err := r.context.GetDatabaseClient(ctx)
62+
if err != nil {
63+
return maskAny(err)
64+
}
65+
c, err := client.Cluster(ctx)
66+
if err != nil {
67+
return maskAny(err)
68+
}
69+
h, err := c.Health(ctx)
70+
if err != nil {
71+
return maskAny(err)
72+
}
73+
74+
serverFound := func(id string) bool {
75+
_, found := h.Health[driver.ServerID(id)]
76+
return found
77+
}
78+
79+
// For over all members that can be removed
80+
status := r.context.GetStatus()
81+
status.Members.ForeachServerGroup(func(group api.ServerGroup, list *api.MemberStatusList) error {
82+
if group != api.ServerGroupCoordinators && group != api.ServerGroupDBServers {
83+
// We're not interested in these other groups
84+
return nil
85+
}
86+
for _, m := range *list {
87+
if serverFound(m.ID) {
88+
// Member is (still) found, skip it
89+
if m.Conditions.Update(api.ConditionTypeMemberOfCluster, true, "", "") {
90+
list.Update(m)
91+
if err := r.context.UpdateStatus(status); err != nil {
92+
return maskAny(err)
93+
}
94+
}
95+
continue
96+
} else if !m.Conditions.IsTrue(api.ConditionTypeMemberOfCluster) {
97+
// Member is not yet recorded as member of cluster
98+
if m.Age() < minMemberAge {
99+
continue
100+
}
101+
log.Info().Str("member", m.ID).Str("role", group.AsRole()).Msg("Member has never been part of the cluster for a long time. Removing it.")
102+
} else {
103+
// Member no longer part of cluster, remove it
104+
log.Info().Str("member", m.ID).Str("role", group.AsRole()).Msg("Member is no longer part of the ArangoDB cluster. Removing it.")
105+
}
106+
list.RemoveByID(m.ID)
107+
if err := r.context.UpdateStatus(status); err != nil {
108+
return maskAny(err)
109+
}
110+
// Remove Pod & PVC (if any)
111+
if m.PodName != "" {
112+
if err := r.context.DeletePod(m.PodName); err != nil && !k8sutil.IsNotFound(err) {
113+
log.Warn().Err(err).Str("pod", m.PodName).Msg("Failed to remove obsolete pod")
114+
}
115+
}
116+
if m.PersistentVolumeClaimName != "" {
117+
if err := r.context.DeletePvc(m.PersistentVolumeClaimName); err != nil && !k8sutil.IsNotFound(err) {
118+
log.Warn().Err(err).Str("pvc", m.PersistentVolumeClaimName).Msg("Failed to remove obsolete PVC")
119+
}
120+
}
121+
}
122+
return nil
123+
})
124+
125+
return nil
126+
}

0 commit comments

Comments
 (0)