@@ -22,6 +22,7 @@ package reconcile
2222
2323import (
2424 "context"
25+ "strconv"
2526
2627 "github.com/arangodb/go-driver"
2728
@@ -31,6 +32,10 @@ import (
3132 "github.com/arangodb/kube-arangodb/pkg/util/globals"
3233)
3334
35+ const (
36+ actionResignLeadershipRebootID api.PlanLocalKey = "rebootID"
37+ )
38+
3439// newResignLeadershipAction creates a new Action that implements the given
3540// planned ResignLeadership action.
3641func newResignLeadershipAction (action api.Action , actionCtx ActionContext ) Action {
@@ -63,14 +68,14 @@ func (a *actionResignLeadership) Start(ctx context.Context) (bool, error) {
6368 client , err := a .actionCtx .GetMembersState ().State ().GetDatabaseClient ()
6469 if err != nil {
6570 a .log .Err (err ).Error ("Unable to get client" )
66- return true , errors .WithStack (err )
71+ return false , errors .WithStack (err )
6772 }
6873
6974 switch group {
7075 case api .ServerGroupDBServers :
7176 if agencyState , agencyOK := a .actionCtx .GetAgencyCache (); ! agencyOK {
72- a .log .Err ( err ). Warn ("Maintenance is enabled, skipping action " )
73- return true , errors . WithStack ( err )
77+ a .log .Warn ("AgencyCache is not ready " )
78+ return false , nil
7479 } else if agencyState .Supervision .Maintenance .Exists () {
7580 // We are done, action cannot be handled on maintenance mode
7681 a .log .Warn ("Maintenance is enabled, skipping action" )
@@ -82,7 +87,7 @@ func (a *actionResignLeadership) Start(ctx context.Context) (bool, error) {
8287 cluster , err := client .Cluster (ctxChild )
8388 if err != nil {
8489 a .log .Err (err ).Error ("Unable to get cluster client" )
85- return true , errors .WithStack (err )
90+ return false , errors .WithStack (err )
8691 }
8792
8893 var jobID string
@@ -92,13 +97,13 @@ func (a *actionResignLeadership) Start(ctx context.Context) (bool, error) {
9297 a .log .Debug ("Temporary shutdown, resign leadership" )
9398 if err := cluster .ResignServer (jobCtx , m .ID ); err != nil {
9499 a .log .Err (err ).Debug ("Failed to resign server" )
95- return true , errors .WithStack (err )
100+ return false , errors .WithStack (err )
96101 }
97102
98103 m .CleanoutJobID = jobID
99104
100105 if err := a .actionCtx .UpdateMember (ctx , m ); err != nil {
101- return true , errors .WithStack (err )
106+ return false , errors .WithStack (err )
102107 }
103108
104109 return false , nil
@@ -127,6 +132,8 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
127132 return false , false , errors .WithStack (err )
128133 }
129134 return true , false , nil
135+ } else if a .isServerRebooted (agencyState , driver .ServerID (m .ID )) {
136+ return true , false , nil
130137 }
131138
132139 _ , jobStatus := agencyState .Target .GetJob (state .JobID (m .CleanoutJobID ))
@@ -150,3 +157,31 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
150157 }
151158 return false , false , nil
152159}
160+
161+ // isServerRebooted returns true when a given server ID was rebooted during resignation of leadership.
162+ func (a * actionResignLeadership ) isServerRebooted (agencyState state.State , serverID driver.ServerID ) bool {
163+ rebootID , ok := agencyState .GetRebootID (serverID )
164+ if ! ok {
165+ return false
166+ }
167+
168+ v , ok := a .actionCtx .Get (a .action , actionResignLeadershipRebootID )
169+ if ! ok {
170+ a .log .Warn ("missing reboot ID in action's locals" , v )
171+ return false
172+ }
173+
174+ r , err := strconv .Atoi (v )
175+ if err != nil {
176+ a .log .Err (err ).Warn ("reboot ID '%s' supposed to be a number" , v )
177+ return false
178+ }
179+
180+ if rebootID <= r {
181+ // Server has not been restarted.
182+ return false
183+ }
184+
185+ a .log .Warn ("resign leadership aborted because rebootID has changed from %d to %d" , r , rebootID )
186+ return true
187+ }
0 commit comments