Skip to content

Commit edf5d2f

Browse files
authored
Merge pull request #339 from arangodb/bug-fix/no-coords-bug
No Coords
2 parents 5b720d2 + 722f88c commit edf5d2f

File tree

3 files changed

+51
-3
lines changed

3 files changed

+51
-3
lines changed

pkg/apis/deployment/v1alpha/member_status_list.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ package v1alpha
2525
import (
2626
"math/rand"
2727
"sort"
28+
"time"
2829

2930
"github.com/pkg/errors"
31+
v1 "k8s.io/api/core/v1"
3032
)
3133

3234
// MemberStatusList is a list of MemberStatus entries
@@ -178,3 +180,27 @@ func (l MemberStatusList) MembersReady() int {
178180
func (l MemberStatusList) AllMembersReady() bool {
179181
return len(l) == l.MembersReady()
180182
}
183+
184+
// AllConditionTrueSince returns true if all members satisfy the condition since the given period
185+
func (l MemberStatusList) AllConditionTrueSince(cond ConditionType, status v1.ConditionStatus, period time.Duration) bool {
186+
for _, x := range l {
187+
if c, ok := x.Conditions.Get(cond); ok {
188+
if c.Status == status && c.LastTransitionTime.Time.Add(period).Before(time.Now()) {
189+
continue
190+
}
191+
}
192+
return false
193+
}
194+
195+
return true
196+
}
197+
198+
// AllFailed returns true if all members are failed
199+
func (l MemberStatusList) AllFailed() bool {
200+
for _, x := range l {
201+
if !x.Phase.IsFailed() {
202+
return false
203+
}
204+
}
205+
return true
206+
}

pkg/deployment/resilience/member_failure.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,20 @@ func (r *Resilience) CheckMemberFailure() error {
4949
Str("id", m.ID).
5050
Str("role", group.AsRole()).
5151
Logger()
52-
// Check current state
53-
if m.Phase != api.MemberPhaseCreated {
54-
// Phase is not Created, so we're not looking further.
52+
53+
// Check if there are Members with Phase Upgrading, Rotating or CleanOut but no plan
54+
switch m.Phase {
55+
case api.MemberPhaseNone:
5556
continue
57+
case api.MemberPhaseUpgrading, api.MemberPhaseRotating, api.MemberPhaseCleanOut:
58+
if len(status.Plan) == 0 {
59+
log.Error().Msgf("No plan but member is in phase %s - marking as failed", m.Phase)
60+
m.Phase = api.MemberPhaseFailed
61+
status.Members.Update(m, group)
62+
updateStatusNeeded = true
63+
}
5664
}
65+
5766
// Check if pod is ready
5867
if m.Conditions.IsTrue(api.ConditionTypeReady) {
5968
// Pod is now ready, so we're not looking further

pkg/deployment/resources/pod_inspector.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,19 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
239239
allMembersReady := status.Members.AllMembersReady(spec.GetMode(), spec.Sync.IsEnabled())
240240
status.Conditions.Update(api.ConditionTypeReady, allMembersReady, "", "")
241241

242+
if spec.GetMode().HasCoordinators() && status.Members.Coordinators.AllFailed() {
243+
log.Error().Msg("All coordinators failed - reset")
244+
for _, m := range status.Members.Coordinators {
245+
if err := r.context.DeletePod(m.PodName); err != nil {
246+
log.Error().Err(err).Msg("Failed to delete pod")
247+
}
248+
m.Phase = api.MemberPhaseNone
249+
if err := status.Members.Update(m, api.ServerGroupCoordinators); err != nil {
250+
log.Error().Err(err).Msg("Failed to update member")
251+
}
252+
}
253+
}
254+
242255
// Update conditions
243256
if len(podNamesWithScheduleTimeout) > 0 {
244257
if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, true,

0 commit comments

Comments
 (0)