Skip to content

Commit 8949920

Browse files
authored
[Feature] Abort resignation of leadership when dbserver restarted (#1291)
1 parent 4f12875 commit 8949920

File tree

13 files changed

+187
-36
lines changed

13 files changed

+187
-36
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
- (Feature) Run secured containers as a feature
77
- (Feature) Expose core.PodSecurityContext Sysctl options
88
- (Bugfix) Skip Collection check for missing Database
9+
- (Feature) Abort resignation of leadership when DB server is restarted
910

1011
## [1.2.31](https://github.com/arangodb/kube-arangodb/tree/1.2.31) (2023-07-14)
1112
- (Improvement) Block traffic on the services if there is more than 1 active leader in ActiveFailover mode

pkg/apis/deployment/v1/plan_locals.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//
22
// DISCLAIMER
33
//
4-
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
4+
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
55
//
66
// Licensed under the Apache License, Version 2.0 (the "License");
77
// you may not use this file except in compliance with the License.
@@ -20,12 +20,18 @@
2020

2121
package v1
2222

23+
import "fmt"
24+
2325
type PlanLocalKey string
2426

2527
func (p PlanLocalKey) String() string {
2628
return string(p)
2729
}
2830

31+
func (p PlanLocalKey) Register(action Action, format string, args ...interface{}) Action {
32+
return action.AddParam(p.String(), fmt.Sprintf(format, args...))
33+
}
34+
2935
type PlanLocals map[PlanLocalKey]string
3036

3137
func (p *PlanLocals) Remove(key PlanLocalKey) bool {

pkg/apis/deployment/v2alpha1/plan_locals.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//
22
// DISCLAIMER
33
//
4-
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
4+
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
55
//
66
// Licensed under the Apache License, Version 2.0 (the "License");
77
// you may not use this file except in compliance with the License.
@@ -20,12 +20,18 @@
2020

2121
package v2alpha1
2222

23+
import "fmt"
24+
2325
type PlanLocalKey string
2426

2527
func (p PlanLocalKey) String() string {
2628
return string(p)
2729
}
2830

31+
func (p PlanLocalKey) Register(action Action, format string, args ...interface{}) Action {
32+
return action.AddParam(p.String(), fmt.Sprintf(format, args...))
33+
}
34+
2935
type PlanLocals map[PlanLocalKey]string
3036

3137
func (p *PlanLocals) Remove(key PlanLocalKey) bool {

pkg/deployment/agency/definitions.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ const (
3636
TargetKey = "Target"
3737

3838
CurrentMaintenanceServers = "MaintenanceServers"
39+
CurrentServersKnown = "ServersKnown"
3940

4041
TargetHotBackupKey = "HotBackup"
4142

@@ -64,10 +65,6 @@ func GetAgencyKey(parts ...string) string {
6465
return fmt.Sprintf("/%s", strings.Join(parts, "/"))
6566
}
6667

67-
func GetAgencyReadKey(elements ...string) []string {
68-
return elements
69-
}
70-
7168
func GetAgencyReadRequest(elements ...[]string) ReadRequest {
7269
return elements
7370
}
@@ -78,6 +75,7 @@ func GetAgencyReadRequestFields() ReadRequest {
7875
GetAgencyKey(ArangoKey, PlanKey, PlanCollectionsKey),
7976
GetAgencyKey(ArangoKey, PlanKey, PlanDatabasesKey),
8077
GetAgencyKey(ArangoKey, CurrentKey, PlanCollectionsKey),
78+
GetAgencyKey(ArangoKey, CurrentKey, CurrentServersKnown),
8179
GetAgencyKey(ArangoKey, CurrentKey, CurrentMaintenanceServers),
8280
GetAgencyKey(ArangoKey, TargetKey, TargetHotBackupKey),
8381
GetAgencyKey(ArangoKey, TargetKey, TargetJobToDoKey),

pkg/deployment/agency/state/state.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020

2121
package state
2222

23+
import "github.com/arangodb/go-driver"
24+
2325
type Root struct {
2426
Arango State `json:"arango"`
2527
ArangoDB DB `json:"arangodb,omitempty"`
@@ -40,9 +42,18 @@ type State struct {
4042
Target Target `json:"Target"`
4143
}
4244

45+
// ServerKnown stores information about a single ArangoDB server.
46+
type ServerKnown struct {
47+
// RebootID is an incremental value which describes how many times server was restarted.
48+
RebootID int `json:"rebootId"`
49+
}
50+
4351
type Current struct {
4452
MaintenanceServers CurrentMaintenanceServers `json:"MaintenanceServers,omitempty"`
4553
Collections CurrentCollections `json:"Collections"`
54+
55+
// ServersKnown stores information about ArangoDB servers.
56+
ServersKnown map[driver.ServerID]ServerKnown `json:"ServersKnown,omitempty"`
4657
}
4758

4859
type Plan struct {
@@ -371,3 +382,13 @@ func (s State) GetCollectionDatabaseByID(id string) (string, bool) {
371382

372383
return "", false
373384
}
385+
386+
// GetRebootID returns reboot ID for a given server ID.
387+
// returns false when a server ID does not exist in cache.
388+
func (s State) GetRebootID(id driver.ServerID) (int, bool) {
389+
if v, ok := s.Current.ServersKnown[id]; ok {
390+
return v.RebootID, true
391+
}
392+
393+
return 0, false
394+
}

pkg/deployment/agency/state/state_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,3 +373,20 @@ func Test_MissingDatabaseCase(t *testing.T) {
373373

374374
require.Len(t, GetDBServerBlockingRestartShards(s, "PRMR-1e4bxazq"), 0)
375375
}
376+
377+
func Test_GetRebootID(t *testing.T) {
378+
var s DumpState
379+
require.NoError(t, json.Unmarshal(agencyDump39, &s))
380+
381+
t.Run("Existing", func(t *testing.T) {
382+
id, ok := s.Agency.Arango.GetRebootID("PRMR-n92yizyp")
383+
require.True(t, ok)
384+
require.Equal(t, 1, id)
385+
})
386+
387+
t.Run("Missing", func(t *testing.T) {
388+
id, ok := s.Agency.Arango.GetRebootID("PRMR-n92yiz")
389+
require.False(t, ok)
390+
require.Equal(t, 0, id)
391+
})
392+
}

pkg/deployment/reconcile/action_resign_leadership.go

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ package reconcile
2222

2323
import (
2424
"context"
25+
"strconv"
2526

2627
"github.com/arangodb/go-driver"
2728

@@ -31,6 +32,10 @@ import (
3132
"github.com/arangodb/kube-arangodb/pkg/util/globals"
3233
)
3334

35+
const (
36+
actionResignLeadershipRebootID api.PlanLocalKey = "rebootID"
37+
)
38+
3439
// newResignLeadershipAction creates a new Action that implements the given
3540
// planned ResignLeadership action.
3641
func newResignLeadershipAction(action api.Action, actionCtx ActionContext) Action {
@@ -63,14 +68,14 @@ func (a *actionResignLeadership) Start(ctx context.Context) (bool, error) {
6368
client, err := a.actionCtx.GetMembersState().State().GetDatabaseClient()
6469
if err != nil {
6570
a.log.Err(err).Error("Unable to get client")
66-
return true, errors.WithStack(err)
71+
return false, errors.WithStack(err)
6772
}
6873

6974
switch group {
7075
case api.ServerGroupDBServers:
7176
if agencyState, agencyOK := a.actionCtx.GetAgencyCache(); !agencyOK {
72-
a.log.Err(err).Warn("Maintenance is enabled, skipping action")
73-
return true, errors.WithStack(err)
77+
a.log.Warn("AgencyCache is not ready")
78+
return false, nil
7479
} else if agencyState.Supervision.Maintenance.Exists() {
7580
// We are done, action cannot be handled on maintenance mode
7681
a.log.Warn("Maintenance is enabled, skipping action")
@@ -82,7 +87,7 @@ func (a *actionResignLeadership) Start(ctx context.Context) (bool, error) {
8287
cluster, err := client.Cluster(ctxChild)
8388
if err != nil {
8489
a.log.Err(err).Error("Unable to get cluster client")
85-
return true, errors.WithStack(err)
90+
return false, errors.WithStack(err)
8691
}
8792

8893
var jobID string
@@ -92,13 +97,13 @@ func (a *actionResignLeadership) Start(ctx context.Context) (bool, error) {
9297
a.log.Debug("Temporary shutdown, resign leadership")
9398
if err := cluster.ResignServer(jobCtx, m.ID); err != nil {
9499
a.log.Err(err).Debug("Failed to resign server")
95-
return true, errors.WithStack(err)
100+
return false, errors.WithStack(err)
96101
}
97102

98103
m.CleanoutJobID = jobID
99104

100105
if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
101-
return true, errors.WithStack(err)
106+
return false, errors.WithStack(err)
102107
}
103108

104109
return false, nil
@@ -127,6 +132,8 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
127132
return false, false, errors.WithStack(err)
128133
}
129134
return true, false, nil
135+
} else if a.isServerRebooted(agencyState, driver.ServerID(m.ID)) {
136+
return true, false, nil
130137
}
131138

132139
_, jobStatus := agencyState.Target.GetJob(state.JobID(m.CleanoutJobID))
@@ -150,3 +157,31 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
150157
}
151158
return false, false, nil
152159
}
160+
161+
// isServerRebooted returns true when a given server ID was rebooted during resignation of leadership.
162+
func (a *actionResignLeadership) isServerRebooted(agencyState state.State, serverID driver.ServerID) bool {
163+
rebootID, ok := agencyState.GetRebootID(serverID)
164+
if !ok {
165+
return false
166+
}
167+
168+
v, ok := a.actionCtx.Get(a.action, actionResignLeadershipRebootID)
169+
if !ok {
170+
a.log.Warn("missing reboot ID in action's locals", v)
171+
return false
172+
}
173+
174+
r, err := strconv.Atoi(v)
175+
if err != nil {
176+
a.log.Err(err).Warn("reboot ID '%s' supposed to be a number", v)
177+
return false
178+
}
179+
180+
if rebootID <= r {
181+
// Server has not been restarted.
182+
return false
183+
}
184+
185+
a.log.Warn("resign leadership aborted because rebootID has changed from %d to %d", r, rebootID)
186+
return true
187+
}

pkg/deployment/reconcile/helper_wrap.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//
22
// DISCLAIMER
33
//
4-
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
4+
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
55
//
66
// Licensed under the Apache License, Version 2.0 (the "License");
77
// you may not use this file except in compliance with the License.
@@ -59,12 +59,17 @@ func withMemberMaintenance(group api.ServerGroup, member api.MemberStatus, reaso
5959
actions.NewAction(api.ActionTypeDisableMemberMaintenance, group, member, reason))
6060
}
6161

62-
func withResignLeadership(group api.ServerGroup, member api.MemberStatus, reason string, plan api.Plan) api.Plan {
62+
func withResignLeadership(group api.ServerGroup, member api.MemberStatus, reason string, plan api.Plan, rebootID *int) api.Plan {
6363
if member.Image == nil {
6464
return plan
6565
}
6666

67-
return api.AsPlan(plan).Before(actions.NewAction(api.ActionTypeResignLeadership, group, member, reason))
67+
action := actions.NewAction(api.ActionTypeResignLeadership, group, member, reason)
68+
if rebootID != nil {
69+
action = actionResignLeadershipRebootID.Register(action, "%d", *rebootID)
70+
}
71+
72+
return api.AsPlan(plan).Before(action)
6873
}
6974

7075
func cleanOutMember(group api.ServerGroup, m api.MemberStatus) api.Plan {

0 commit comments

Comments
 (0)