Skip to content

Commit e09d35e

Browse files
authored
[Feature] Move member recovery to high plan (#1026)
1 parent 07d6e01 commit e09d35e

File tree

6 files changed

+143
-89
lines changed

6 files changed

+143
-89
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
- (Feature) Move PVC resize action to high-priority plan
77
- (Feature) Remove forgotten ArangoDB jobs during restart
88
- (Feature) Add support for managed services
9+
- (Feature) Recreate member in the high plan
910

1011
## [1.2.14](https://github.com/arangodb/kube-arangodb/tree/1.2.14) (2022-07-14)
1112
- (Feature) Add ArangoSync TLS based rotation

pkg/deployment/reconcile/plan_builder_high.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb
5757
ApplyIfEmptyWithBackOff(LicenseCheck, 30*time.Second, r.updateClusterLicense).
5858
ApplyIfEmpty(r.createTopologyMemberConditionPlan).
5959
ApplyIfEmpty(r.createRebalancerCheckPlan).
60+
ApplyIfEmpty(r.createMemberFailedRestoreHighPlan).
6061
ApplyWithBackOff(BackOffCheck, time.Minute, r.emptyPlanBuilder)).
6162
Apply(r.createBackupInProgressConditionPlan). // Discover backups always
6263
Apply(r.createMaintenanceConditionPlan). // Discover maintenance always
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
21+
package reconcile
22+
23+
import (
24+
"context"
25+
26+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
27+
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
28+
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
29+
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
30+
)
31+
32+
// createMemberFailedRestoreNormalPlan returns only actions which are not recreate member.
33+
func (r *Reconciler) createMemberFailedRestoreNormalPlan(ctx context.Context, apiObject k8sutil.APIObject,
34+
spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
35+
condition := func(a api.Action) bool {
36+
return a.Type != api.ActionTypeRecreateMember
37+
}
38+
39+
return r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context).Filter(condition)
40+
}
41+
42+
// createMemberFailedRestoreHighPlan returns only recreate member actions.
43+
func (r *Reconciler) createMemberFailedRestoreHighPlan(ctx context.Context, apiObject k8sutil.APIObject,
44+
spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
45+
condition := func(a api.Action) bool {
46+
return a.Type == api.ActionTypeRecreateMember
47+
}
48+
49+
return r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context).Filter(condition)
50+
}
51+
52+
func (r *Reconciler) createMemberFailedRestoreInternal(_ context.Context, _ k8sutil.APIObject, spec api.DeploymentSpec,
53+
status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
54+
var plan api.Plan
55+
56+
// Fetch agency plan.
57+
agencyState, agencyOK := context.GetAgencyCache()
58+
59+
// Check for members in failed state.
60+
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
61+
failed := 0
62+
for _, m := range members {
63+
if m.Phase == api.MemberPhaseFailed {
64+
failed++
65+
}
66+
}
67+
for _, m := range members {
68+
if m.Phase != api.MemberPhaseFailed || len(plan) > 0 {
69+
continue
70+
}
71+
72+
memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole())
73+
74+
if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster {
75+
if !agencyOK {
76+
// If agency is down DBServers should not be touched.
77+
memberLog.Info("Agency state is not present")
78+
continue
79+
}
80+
81+
if c := spec.DBServers.GetCount(); c <= len(members)-failed {
82+
// There are more or equal alive members than current count. A member should not be recreated.
83+
continue
84+
}
85+
86+
if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
87+
// DBServer still exists in agency plan! Will not be removed, but needs to be recreated.
88+
memberLog.Info("Recreating DBServer - it cannot be removed gracefully")
89+
plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m))
90+
91+
continue
92+
}
93+
// From here on, DBServer can be recreated.
94+
}
95+
96+
switch group {
97+
case api.ServerGroupAgents:
98+
// For agents just recreate member do not rotate ID, do not remove PVC or service.
99+
memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss")
100+
plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m))
101+
case api.ServerGroupSingle:
102+
// Do not remove data for single.
103+
memberLog.Info("Restoring old member. Rotation for single servers is not safe")
104+
plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m))
105+
default:
106+
if spec.GetAllowMemberRecreation(group) {
107+
memberLog.Info("Creating member replacement plan because member has failed")
108+
plan = append(plan,
109+
actions.NewAction(api.ActionTypeRemoveMember, group, m),
110+
actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")),
111+
actions.NewAction(api.ActionTypeWaitForMemberUp, group, withPredefinedMember(api.MemberIDPreviousAction)),
112+
)
113+
} else {
114+
memberLog.Info("Restoring old member. Recreation is disabled for group")
115+
plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m))
116+
}
117+
}
118+
}
119+
return nil
120+
})
121+
122+
if len(plan) == 0 && !agencyOK {
123+
r.log.Warn("unable to build further plan without access to agency")
124+
plan = append(plan, actions.NewClusterAction(api.ActionTypeIdle))
125+
}
126+
127+
return plan
128+
}

pkg/deployment/reconcile/plan_builder_normal.go

Lines changed: 1 addition & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ import (
2424
"context"
2525

2626
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
27-
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
28-
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
2927
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3028
)
3129

@@ -50,7 +48,7 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API
5048
// Check for scale up
5149
ApplyIfEmpty(r.createScaleUPMemberPlan).
5250
// Check for failed members
53-
ApplyIfEmpty(r.createMemberFailedRestorePlan).
51+
ApplyIfEmpty(r.createMemberFailedRestoreNormalPlan).
5452
// Check for scale up/down
5553
ApplyIfEmpty(r.createScaleMemberPlan).
5654
// Update status
@@ -86,90 +84,6 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API
8684
return q.Plan(), q.BackOff(), true
8785
}
8886

89-
func (r *Reconciler) createMemberFailedRestorePlan(ctx context.Context, apiObject k8sutil.APIObject,
90-
spec api.DeploymentSpec, status api.DeploymentStatus,
91-
context PlanBuilderContext) api.Plan {
92-
var plan api.Plan
93-
94-
// Fetch agency plan
95-
agencyState, agencyOK := context.GetAgencyCache()
96-
97-
// Check for members in failed state
98-
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
99-
failed := 0
100-
for _, m := range members {
101-
if m.Phase == api.MemberPhaseFailed {
102-
failed++
103-
}
104-
}
105-
for _, m := range members {
106-
if m.Phase != api.MemberPhaseFailed || len(plan) > 0 {
107-
continue
108-
}
109-
110-
memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole())
111-
112-
if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster {
113-
// Do pre check for DBServers. If agency is down DBServers should not be touch
114-
if !agencyOK {
115-
memberLog.Info("Agency state is not present")
116-
continue
117-
}
118-
119-
if c := spec.DBServers.GetCount(); c <= len(members)-failed {
120-
// We have more or equal alive members than current count, we should not recreate this member
121-
continue
122-
}
123-
124-
if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
125-
// DBServer still exists in agency plan! Will not be removed, but needs to be recreated
126-
memberLog.Info("Recreating DBServer - it cannot be removed gracefully")
127-
plan = append(plan,
128-
actions.NewAction(api.ActionTypeRecreateMember, group, m))
129-
continue
130-
}
131-
132-
// Everything is fine, proceed
133-
}
134-
135-
switch group {
136-
case api.ServerGroupAgents:
137-
// For agents just recreate member do not rotate ID, do not remove PVC or service
138-
memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss")
139-
plan = append(plan,
140-
actions.NewAction(api.ActionTypeRecreateMember, group, m))
141-
case api.ServerGroupSingle:
142-
// Do not remove data for singles
143-
memberLog.Info("Restoring old member. Rotation for single servers is not safe")
144-
plan = append(plan,
145-
actions.NewAction(api.ActionTypeRecreateMember, group, m))
146-
default:
147-
if spec.GetAllowMemberRecreation(group) {
148-
memberLog.Info("Creating member replacement plan because member has failed")
149-
plan = append(plan,
150-
actions.NewAction(api.ActionTypeRemoveMember, group, m),
151-
actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")),
152-
)
153-
} else {
154-
memberLog.Info("Restoring old member. Recreation is disabled for group")
155-
plan = append(plan,
156-
actions.NewAction(api.ActionTypeRecreateMember, group, m))
157-
}
158-
}
159-
}
160-
return nil
161-
})
162-
163-
// Ensure that we were able to get agency info
164-
if len(plan) == 0 && !agencyOK {
165-
r.log.Warn("unable to build further plan without access to agency")
166-
plan = append(plan,
167-
actions.NewClusterAction(api.ActionTypeIdle))
168-
}
169-
170-
return plan
171-
}
172-
17387
func (r *Reconciler) createRemoveCleanedDBServersPlan(ctx context.Context, apiObject k8sutil.APIObject,
17488
spec api.DeploymentSpec, status api.DeploymentStatus,
17589
context PlanBuilderContext) api.Plan {

pkg/deployment/reconcile/plan_builder_test.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1017,8 +1017,14 @@ func TestCreatePlan(t *testing.T) {
10171017
}
10181018
ad.Status.Members.Agents[0].Phase = api.MemberPhaseFailed
10191019
ad.Status.Members.Agents[0].ID = "id"
1020+
for i := range ad.Status.Members.Coordinators {
1021+
ad.Status.Members.Coordinators[i].Phase = api.MemberPhaseCreated
1022+
}
1023+
for i := range ad.Status.Members.DBServers {
1024+
ad.Status.Members.DBServers[i].Phase = api.MemberPhaseCreated
1025+
}
10201026
},
1021-
ExpectedPlan: []api.Action{
1027+
ExpectedHighPlan: []api.Action{
10221028
actions.NewAction(api.ActionTypeRecreateMember, api.ServerGroupAgents, withPredefinedMember("id")),
10231029
},
10241030
ExpectedLog: "Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss",
@@ -1038,6 +1044,8 @@ func TestCreatePlan(t *testing.T) {
10381044
ExpectedPlan: []api.Action{
10391045
actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupCoordinators, withPredefinedMember("id")),
10401046
actions.NewAction(api.ActionTypeAddMember, api.ServerGroupCoordinators, withPredefinedMember("")),
1047+
actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupCoordinators,
1048+
withPredefinedMember(api.MemberIDPreviousAction)),
10411049
},
10421050
ExpectedLog: "Creating member replacement plan because member has failed",
10431051
},
@@ -1056,6 +1064,8 @@ func TestCreatePlan(t *testing.T) {
10561064
ExpectedPlan: []api.Action{
10571065
actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, withPredefinedMember("id")),
10581066
actions.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, withPredefinedMember("")),
1067+
actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupDBServers,
1068+
withPredefinedMember(api.MemberIDPreviousAction)),
10591069
},
10601070
ExpectedLog: "Creating member replacement plan because member has failed",
10611071
},

pkg/deployment/reconcile/reconciler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ func (r *Reconciler) CheckDeployment(ctx context.Context) error {
7676
}
7777

7878
if err := cache.Client().Kubernetes().CoreV1().Secrets(cache.Namespace()).Delete(ctx, m.PodName, meta.DeleteOptions{}); err != nil {
79-
r.log.Err(err).Error("Failed to delete pod")
79+
r.log.Err(err).Error("Failed to delete secret")
8080
}
8181
m.Phase = api.MemberPhaseNone
8282

0 commit comments

Comments
 (0)