Skip to content

Commit d1b2879

Browse files
authored
Merge pull request #425 from arangodb/bug-fix/unreliable-cluster-membership
Bug fix/unreliable cluster membership
2 parents 97d5b7b + 74ed1c1 commit d1b2879

File tree

3 files changed

+44
-5
lines changed

3 files changed

+44
-5
lines changed

pkg/apis/deployment/v1alpha/member_status_list.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ func (l MemberStatusList) Equal(other MemberStatusList) bool {
4141
}
4242

4343
for i := 0; i < len(l); i++ {
44-
o, found := l.ElementByID(l[i].ID)
44+
o, found := other.ElementByID(l[i].ID)
4545
if !found {
4646
return false
4747
}

pkg/apis/deployment/v1alpha/member_status_list_test.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,26 @@ func TestMemberStatusList(t *testing.T) {
6666
assert.Equal(t, m1.ID, (*list)[0].ID)
6767
assert.Equal(t, m2.ID, (*list)[1].ID)
6868
assert.Equal(t, m3.ID, (*list)[2].ID)
69+
70+
list2 := &MemberStatusList{m3, m2, m1}
71+
assert.True(t, list.Equal(*list2))
72+
assert.True(t, list2.Equal(*list))
73+
74+
list3 := &MemberStatusList{m3, m1}
75+
assert.False(t, list.Equal(*list3))
76+
assert.False(t, list3.Equal(*list))
77+
78+
list4 := MemberStatusList{m3, m2, m1}
79+
list4[1].Phase = "something-else"
80+
assert.False(t, list.Equal(list4))
81+
assert.False(t, list4.Equal(*list))
82+
83+
m4 := MemberStatus{ID: "m4"}
84+
list5 := &MemberStatusList{m1, m2, m4}
85+
assert.False(t, list.Equal(*list5))
86+
assert.False(t, list5.Equal(*list))
87+
88+
list6 := &MemberStatusList{m1, m2, m3, m4}
89+
assert.False(t, list.Equal(*list6))
90+
assert.False(t, list6.Equal(*list))
6991
}

pkg/deployment/resources/member_cleanup.go

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,9 @@ func (r *Resources) cleanupRemovedClusterMembers() error {
7171
r.health.mutex.Unlock()
7272

7373
// Only accept recent cluster health values
74-
if time.Since(ts) > maxClusterHealthAge {
74+
healthAge := time.Since(ts)
75+
if healthAge > maxClusterHealthAge {
76+
log.Info().Dur("age", healthAge).Msg("Cleanup longer than max cluster health. Exiting")
7577
return nil
7678
}
7779

@@ -90,23 +92,32 @@ func (r *Resources) cleanupRemovedClusterMembers() error {
9092
return nil
9193
}
9294
for _, m := range list {
95+
log := log.With().
96+
Str("member", m.ID).
97+
Str("role", group.AsRole()).
98+
Logger()
9399
if serverFound(m.ID) {
94100
// Member is (still) found, skip it
95101
if m.Conditions.Update(api.ConditionTypeMemberOfCluster, true, "", "") {
96-
status.Members.Update(m, group)
102+
if err := status.Members.Update(m, group); err != nil {
103+
log.Warn().Err(err).Msg("Failed to update member")
104+
}
97105
updateStatusNeeded = true
106+
log.Debug().Msg("Updating MemberOfCluster condition to true")
98107
}
99108
continue
100109
} else if !m.Conditions.IsTrue(api.ConditionTypeMemberOfCluster) {
101110
// Member is not yet recorded as member of cluster
102111
if m.Age() < minMemberAge {
112+
log.Debug().Dur("age", m.Age()).Msg("Member age is below minimum for removal")
103113
continue
104114
}
105-
log.Info().Str("member", m.ID).Str("role", group.AsRole()).Msg("Member has never been part of the cluster for a long time. Removing it.")
115+
log.Info().Msg("Member has never been part of the cluster for a long time. Removing it.")
106116
} else {
107117
// Member no longer part of cluster, remove it
108-
log.Info().Str("member", m.ID).Str("role", group.AsRole()).Msg("Member is no longer part of the ArangoDB cluster. Removing it.")
118+
log.Info().Msg("Member is no longer part of the ArangoDB cluster. Removing it.")
109119
}
120+
log.Info().Msg("Removing member")
110121
status.Members.RemoveByID(m.ID, group)
111122
updateStatusNeeded = true
112123
// Remove Pod & PVC (if any)
@@ -121,17 +132,23 @@ func (r *Resources) cleanupRemovedClusterMembers() error {
121132
})
122133

123134
if updateStatusNeeded {
135+
log.Debug().Msg("UpdateStatus needed")
136+
124137
if err := r.context.UpdateStatus(status, lastVersion); err != nil {
138+
log.Warn().Err(err).Msg("Failed to update deployment status")
125139
return maskAny(err)
126140
}
127141
}
128142

129143
for _, podName := range podNamesToRemove {
144+
log.Info().Str("pod", podName).Msg("Removing obsolete member pod")
130145
if err := r.context.DeletePod(podName); err != nil && !k8sutil.IsNotFound(err) {
131146
log.Warn().Err(err).Str("pod", podName).Msg("Failed to remove obsolete pod")
132147
}
133148
}
149+
134150
for _, pvcName := range pvcNamesToRemove {
151+
log.Info().Str("pvc", pvcName).Msg("Removing obsolete member PVC")
135152
if err := r.context.DeletePvc(pvcName); err != nil && !k8sutil.IsNotFound(err) {
136153
log.Warn().Err(err).Str("pvc", pvcName).Msg("Failed to remove obsolete PVC")
137154
}

0 commit comments

Comments
 (0)