Skip to content

Commit 13a5cc8

Browse files
committed
Merge branch 'master' into ready-probe
2 parents 238f26d + 9abab5b commit 13a5cc8

36 files changed

+1149
-225
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package v1alpha
24+
25+
// MemberPhase is a strongly typed, string-backed lifetime phase of a deployment member.
26+
type MemberPhase string
27+
28+
const (
29+
// MemberPhaseNone indicates that the phase is not set yet (it is the empty string, i.e. the zero value of MemberPhase).
30+
MemberPhaseNone MemberPhase = ""
31+
// MemberPhaseCreated indicates that all resources needed for the member have been created.
32+
MemberPhaseCreated MemberPhase = "Created"
33+
// MemberPhaseFailed indicates that the member is gone beyond hope of recovery. It must be replaced with a new member.
34+
MemberPhaseFailed MemberPhase = "Failed"
35+
// MemberPhaseCleanOut indicates that a dbserver is in the process of being cleaned out.
36+
MemberPhaseCleanOut MemberPhase = "CleanOut"
37+
// MemberPhaseShuttingDown indicates that a member is shutting down.
38+
MemberPhaseShuttingDown MemberPhase = "ShuttingDown"
39+
// MemberPhaseRotating indicates that a member is being rotated.
40+
MemberPhaseRotating MemberPhase = "Rotating"
41+
// MemberPhaseUpgrading indicates that a member is in the process of upgrading its database data format.
42+
MemberPhaseUpgrading MemberPhase = "Upgrading"
43+
)
44+
45+
// IsFailed reports whether the phase equals MemberPhaseFailed ("Failed").
46+
func (p MemberPhase) IsFailed() bool {
47+
return p == MemberPhaseFailed
48+
}

pkg/apis/deployment/v1alpha/member_state.go

Lines changed: 0 additions & 41 deletions
This file was deleted.

pkg/apis/deployment/v1alpha/member_status.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ package v1alpha
2525
import (
2626
"time"
2727

28+
"k8s.io/api/core/v1"
2829
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2930
)
3031

@@ -33,8 +34,10 @@ type MemberStatus struct {
3334
// ID holds the unique ID of the member.
3435
// This id is also used within the ArangoDB cluster to identify this server.
3536
ID string `json:"id"`
36-
// State holds the current state of this member
37-
State MemberState `json:"state"`
37+
// Phase holds the current lifetime phase of this member
38+
Phase MemberPhase `json:"phase"`
39+
// CreatedAt holds the creation timestamp of this member.
40+
CreatedAt metav1.Time `json:"created-at"`
3841
// PersistentVolumeClaimName holds the name of the persistent volume claim used for this member (if any).
3942
PersistentVolumeClaimName string `json:"persistentVolumeClaimName,omitempty"`
4043
// PodName holds the name of the Pod that currently runs this member
@@ -44,6 +47,9 @@ type MemberStatus struct {
4447
// RecentTerminations holds the times when this member was recently terminated.
4548
// First entry is the oldest. (do not add omitempty, since we want to be able to switch from a list to an empty list)
4649
RecentTerminations []metav1.Time `json:"recent-terminations"`
50+
// IsInitialized is set after the very first time a pod was created for this member.
51+
// After that, DBServers must have a UUID field or fail.
52+
IsInitialized bool `json:"initialized"`
4753
}
4854

4955
// RemoveTerminationsBefore removes all recent terminations before the given timestamp.
@@ -78,3 +84,17 @@ func (s MemberStatus) RecentTerminationsSince(timestamp time.Time) int {
7884
}
7985
return count
8086
}
87+
88+
// IsNotReadySince returns true when the given member has not been ready since the given timestamp.
89+
// That means either:
90+
// - A) No Ready condition is recorded and the member was created (CreatedAt) before timestamp, or
91+
// - B) The Ready condition is not true, and its last transition is before timestamp.
92+
func (s MemberStatus) IsNotReadySince(timestamp time.Time) bool {
93+
cond, found := s.Conditions.Get(ConditionTypeReady)
94+
if found {
95+
// B: a Ready condition exists; it must be non-true and unchanged since before timestamp.
96+
return cond.Status != v1.ConditionTrue && cond.LastTransitionTime.Time.Before(timestamp)
97+
}
98+
// A: no Ready condition found, so fall back to the creation time.
99+
return s.CreatedAt.Time.Before(timestamp)
100+
}

pkg/apis/deployment/v1alpha/member_status_list.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,15 @@ func (l MemberStatusList) SelectMemberToRemove() (MemberStatus, error) {
108108
if len(l) > 0 {
109109
// Try to find a not ready member
110110
for _, m := range l {
111-
if m.State == MemberStateNone {
111+
if m.Phase == MemberPhaseNone {
112112
return m, nil
113113
}
114114
}
115115
// Pick a random member that is in created state
116116
perm := rand.Perm(len(l))
117117
for _, idx := range perm {
118118
m := l[idx]
119-
if m.State == MemberStateCreated {
119+
if m.Phase == MemberPhaseCreated {
120120
return m, nil
121121
}
122122
}

pkg/apis/deployment/v1alpha/member_status_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,24 @@ func TestMemberStatusRecentTerminations(t *testing.T) {
5151
assert.Equal(t, 2, s.RemoveTerminationsBefore(time.Now()))
5252
assert.Len(t, s.RecentTerminations, 1)
5353
}
54+
55+
// TestMemberStatusIsNotReadySince exercises MemberStatus.IsNotReadySince, both without a Ready condition (creation-time fallback) and with a Ready condition set to true and to false.
56+
func TestMemberStatusIsNotReadySince(t *testing.T) {
57+
s := MemberStatus{
58+
CreatedAt: metav1.Now(),
59+
}
60+
assert.False(t, s.IsNotReadySince(time.Now().Add(-time.Hour))) // no Ready condition; created now, which is not before "an hour ago"
61+
62+
s.CreatedAt.Time = time.Now().Add(-time.Hour)
63+
assert.False(t, s.IsNotReadySince(time.Now().Add(-2*time.Hour))) // created an hour ago, which is not before "two hours ago"
64+
assert.True(t, s.IsNotReadySince(time.Now().Add(-(time.Hour - time.Minute)))) // created an hour ago, which is before "59 minutes ago"
65+
66+
s.CreatedAt = metav1.Now()
67+
s.Conditions.Update(ConditionTypeReady, true, "", "")
68+
assert.False(t, s.IsNotReadySince(time.Now().Add(-time.Minute))) // Ready is true, so never "not ready"
69+
assert.False(t, s.IsNotReadySince(time.Now().Add(time.Minute))) // Ready is true, regardless of timestamp
70+
71+
s.Conditions.Update(ConditionTypeReady, false, "", "") // presumably stamps the last transition time with "now" (Update is not visible here; confirm)
72+
assert.False(t, s.IsNotReadySince(time.Now().Add(-time.Minute))) // Ready is false, but the transition is expected to be newer than "a minute ago"
73+
assert.True(t, s.IsNotReadySince(time.Now().Add(time.Minute))) // Ready is false and the transition is expected to be before "a minute from now"
74+
}

pkg/apis/deployment/v1alpha/zz_generated.deepcopy.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ func (in *ImageInfo) DeepCopy() *ImageInfo {
364364
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
365365
func (in *MemberStatus) DeepCopyInto(out *MemberStatus) {
366366
*out = *in
367+
in.CreatedAt.DeepCopyInto(&out.CreatedAt)
367368
if in.Conditions != nil {
368369
in, out := &in.Conditions, &out.Conditions
369370
*out = make(ConditionList, len(*in))

pkg/deployment/context_impl.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,14 @@ func (d *Deployment) GetServerClient(ctx context.Context, group api.ServerGroup,
9797
}
9898

9999
// GetAgencyClients returns a client connection for every agency member.
100-
func (d *Deployment) GetAgencyClients(ctx context.Context) ([]arangod.Agency, error) {
100+
// If the given predicate is not nil, only agents are included where the given predicate returns true.
101+
func (d *Deployment) GetAgencyClients(ctx context.Context, predicate func(id string) bool) ([]arangod.Agency, error) {
101102
agencyMembers := d.status.Members.Agents
102103
result := make([]arangod.Agency, 0, len(agencyMembers))
103104
for _, m := range agencyMembers {
105+
if predicate != nil && !predicate(m.ID) {
106+
continue
107+
}
104108
client, err := d.GetServerClient(ctx, api.ServerGroupAgents, m.ID)
105109
if err != nil {
106110
return nil, maskAny(err)
@@ -115,9 +119,11 @@ func (d *Deployment) GetAgencyClients(ctx context.Context) ([]arangod.Agency, er
115119
}
116120

117121
// CreateMember adds a new member to the given group.
118-
func (d *Deployment) CreateMember(group api.ServerGroup) error {
122+
// If ID is non-empty, it will be used, otherwise a new ID is created.
123+
func (d *Deployment) CreateMember(group api.ServerGroup, id string) error {
119124
log := d.deps.Log
120-
if err := d.createMember(group, d.apiObject); err != nil {
125+
id, err := d.createMember(group, id, d.apiObject)
126+
if err != nil {
121127
log.Debug().Err(err).Str("group", group.AsRole()).Msg("Failed to create member")
122128
return maskAny(err)
123129
}
@@ -126,6 +132,9 @@ func (d *Deployment) CreateMember(group api.ServerGroup) error {
126132
log.Debug().Err(err).Msg("Updating CR status failed")
127133
return maskAny(err)
128134
}
135+
// Create event about it
136+
d.CreateEvent(k8sutil.NewMemberAddEvent(id, group.AsRole(), d.apiObject))
137+
129138
return nil
130139
}
131140

pkg/deployment/deployment.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636

3737
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
3838
"github.com/arangodb/kube-arangodb/pkg/deployment/reconcile"
39+
"github.com/arangodb/kube-arangodb/pkg/deployment/resilience"
3940
"github.com/arangodb/kube-arangodb/pkg/deployment/resources"
4041
"github.com/arangodb/kube-arangodb/pkg/generated/clientset/versioned"
4142
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
@@ -92,6 +93,7 @@ type Deployment struct {
9293
recentInspectionErrors int
9394
clusterScalingIntegration *clusterScalingIntegration
9495
reconciler *reconcile.Reconciler
96+
resilience *resilience.Resilience
9597
resources *resources.Resources
9698
}
9799

@@ -111,6 +113,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
111113
clientCache: newClientCache(deps.KubeCli, apiObject),
112114
}
113115
d.reconciler = reconcile.NewReconciler(deps.Log, d)
116+
d.resilience = resilience.NewResilience(deps.Log, d)
114117
d.resources = resources.NewResources(deps.Log, d)
115118
if d.status.AcceptedSpec == nil {
116119
// We've validated the spec, so let's use it from now.

pkg/deployment/deployment_inspector.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,12 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
7676
d.CreateEvent(k8sutil.NewErrorEvent("Pod inspection failed", err, d.apiObject))
7777
}
7878

79+
// Check members for resilience
80+
if err := d.resilience.CheckMemberFailure(); err != nil {
81+
hasError = true
82+
d.CreateEvent(k8sutil.NewErrorEvent("Member failure detection failed", err, d.apiObject))
83+
}
84+
7985
// Create scale/update plan
8086
if err := d.reconciler.CreatePlan(); err != nil {
8187
hasError = true

pkg/deployment/images.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ func (ib *imagesBuilder) fetchArangoDBImageIDAndVersion(ctx context.Context, ima
166166
"--server.authentication=false",
167167
fmt.Sprintf("--server.endpoint=tcp://[::]:%d", k8sutil.ArangoPort),
168168
}
169-
if err := k8sutil.CreateArangodPod(ib.KubeCli, true, ib.APIObject, role, id, podName, "", image, ib.Spec.GetImagePullPolicy(), args, nil, nil, nil, "", ""); err != nil {
169+
if err := k8sutil.CreateArangodPod(ib.KubeCli, true, ib.APIObject, role, id, podName, "", image, ib.Spec.GetImagePullPolicy(), "", false, args, nil, nil, nil, "", ""); err != nil {
170170
log.Debug().Err(err).Msg("Failed to create image ID pod")
171171
return true, maskAny(err)
172172
}

0 commit comments

Comments
 (0)