Skip to content

Commit 5c9905b

Browse files
author
lamai93
committed
Delete all coordinator pods if all coordinators have failed. If a pod is in a strange phase but there is no plan, fail it.
1 parent eed8ea1 commit 5c9905b

File tree

3 files changed

+59
-3
lines changed

3 files changed

+59
-3
lines changed

pkg/apis/deployment/v1alpha/member_status_list.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ package v1alpha
2525
import (
2626
"math/rand"
2727
"sort"
28+
"time"
2829

2930
"github.com/pkg/errors"
31+
v1 "k8s.io/api/core/v1"
3032
)
3133

3234
// MemberStatusList is a list of MemberStatus entries
@@ -178,3 +180,28 @@ func (l MemberStatusList) MembersReady() int {
178180
func (l MemberStatusList) AllMembersReady() bool {
	// Every member is ready exactly when the ready count matches the list length.
	ready := l.MembersReady()
	return ready == len(l)
}
183+
184+
// AllConditionTrueSince returns true if all members satisfy the condition since the given period
185+
func (l MemberStatusList) AllConditionTrueSince(cond ConditionType, status v1.ConditionStatus, period time.Duration) bool {
186+
trueCount := 0
187+
for _, x := range l {
188+
if c, ok := x.Conditions.Get(cond); ok {
189+
if c.Status == status && c.LastTransitionTime.Time.Add(period).Before(time.Now()) {
190+
trueCount++
191+
}
192+
}
193+
}
194+
195+
return trueCount == len(l)
196+
}
197+
198+
// AllFailed returns true if all members are failed
199+
func (l MemberStatusList) AllFailed() bool {
200+
failedCount := 0
201+
for _, x := range l {
202+
if x.Phase.IsFailed() {
203+
failedCount++
204+
}
205+
}
206+
return failedCount == len(l)
207+
}

pkg/deployment/resilience/member_failure.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,20 @@ func (r *Resilience) CheckMemberFailure() error {
4949
Str("id", m.ID).
5050
Str("role", group.AsRole()).
5151
Logger()
52-
// Check current state
53-
if m.Phase != api.MemberPhaseCreated {
54-
// Phase is not Created, so we're not looking further.
52+
53+
// Check if there are members in phase Upgrading, Rotating or CleanOut but no plan
54+
switch m.Phase {
55+
case api.MemberPhaseNone:
5556
continue
57+
case api.MemberPhaseUpgrading, api.MemberPhaseRotating, api.MemberPhaseCleanOut:
58+
if len(status.Plan) == 0 {
59+
log.Error().Msgf("No plan but member is in phase %s - marking as failed", m.Phase)
60+
m.Phase = api.MemberPhaseFailed
61+
status.Members.Update(m, group)
62+
updateStatusNeeded = true
63+
}
5664
}
65+
5766
// Check if pod is ready
5867
if m.Conditions.IsTrue(api.ConditionTypeReady) {
5968
// Pod is now ready, so we're not looking further

pkg/deployment/resources/pod_inspector.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,13 +188,20 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
188188
case api.MemberPhaseShuttingDown, api.MemberPhaseRotating, api.MemberPhaseUpgrading, api.MemberPhaseFailed:
189189
// Shutdown was intended, so no need to do anything here.
190190
// Just mark terminated
191+
updateMemberNeeded := false
192+
if m.Conditions.Update(api.ConditionTypeReady, false, "Pod Does Not Exist", "") {
193+
updateMemberNeeded = true
194+
}
191195
wasTerminated := m.Conditions.IsTrue(api.ConditionTypeTerminated)
192196
if m.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Terminated", "") {
193197
if !wasTerminated {
194198
// Record termination time
195199
now := metav1.Now()
196200
m.RecentTerminations = append(m.RecentTerminations, now)
197201
}
202+
updateMemberNeeded = true
203+
}
204+
if updateMemberNeeded {
198205
// Save it
199206
if err := status.Members.Update(m, group); err != nil {
200207
return maskAny(err)
@@ -237,6 +244,19 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
237244
allMembersReady := status.Members.AllMembersReady(spec.GetMode(), spec.Sync.IsEnabled())
238245
status.Conditions.Update(api.ConditionTypeReady, allMembersReady, "", "")
239246

247+
if status.Members.Coordinators.AllFailed() {
248+
log.Error().Msg("All coordinators failed - reset")
249+
for _, m := range status.Members.Coordinators {
250+
if err := r.context.DeletePod(m.PodName); err != nil {
251+
log.Error().Err(err).Msg("Failed to delete pod")
252+
}
253+
m.Phase = api.MemberPhaseNone
254+
if err := status.Members.Update(m, api.ServerGroupCoordinators); err != nil {
255+
log.Error().Err(err).Msg("Failed to update member")
256+
}
257+
}
258+
}
259+
240260
// Update conditions
241261
if len(podNamesWithScheduleTimeout) > 0 {
242262
if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, true,

0 commit comments

Comments
 (0)