Skip to content

Commit 56466c4

Browse files
committed
Fetch cluster health in go-routine
1 parent 38b91d3 commit 56466c4

File tree

5 files changed

+323
-16
lines changed

5 files changed

+323
-16
lines changed

pkg/deployment/deployment.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
141141
ci := newClusterScalingIntegration(d)
142142
d.clusterScalingIntegration = ci
143143
go ci.ListenForClusterEvents(d.stopCh)
144+
go d.resources.RunDeploymentHealthLoop(d.stopCh)
144145
}
145146
if config.AllowChaos {
146147
d.chaosMonkey = chaos.NewMonkey(deps.Log, d)
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package resources
24+
25+
import (
26+
"context"
27+
"time"
28+
29+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
30+
"github.com/arangodb/kube-arangodb/pkg/metrics"
31+
)
32+
33+
var (
34+
fetchDeploymentHealthCounters = metrics.MustRegisterCounterVec("deployment_resources", "fetchDeploymentHealth", "Number of times the health of the deployment was fetched", "deployment", "result")
35+
)
36+
37+
// RunDeploymentHealthLoop creates a loop to fetch the health of the deployment.
38+
// The loop ends when the given channel is closed.
39+
func (r *Resources) RunDeploymentHealthLoop(stopCh <-chan struct{}) {
40+
log := r.log
41+
deploymentName := r.context.GetAPIObject().GetName()
42+
43+
if r.context.GetSpec().GetMode() != api.DeploymentModeCluster {
44+
// Deployment health is currently only applicable for clusters
45+
return
46+
}
47+
48+
for {
49+
if err := r.fetchDeploymentHealth(); err != nil {
50+
log.Debug().Err(err).Msg("Failed to fetch deployment health")
51+
fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "failed").Inc()
52+
} else {
53+
fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "success").Inc()
54+
}
55+
select {
56+
case <-time.After(time.Second * 5):
57+
// Continue
58+
case <-stopCh:
59+
// We're done
60+
return
61+
}
62+
}
63+
}
64+
65+
// cleanupRemovedClusterMembers removes all arangod members that are no longer part of the cluster.
66+
func (r *Resources) fetchDeploymentHealth() error {
67+
// Ask cluster for its health
68+
ctx, cancel := context.WithTimeout(context.Background(), time.Second*15)
69+
defer cancel()
70+
client, err := r.context.GetDatabaseClient(ctx)
71+
if err != nil {
72+
return maskAny(err)
73+
}
74+
c, err := client.Cluster(ctx)
75+
if err != nil {
76+
return maskAny(err)
77+
}
78+
h, err := c.Health(ctx)
79+
if err != nil {
80+
return maskAny(err)
81+
}
82+
83+
// Save cluster health
84+
r.health.mutex.Lock()
85+
defer r.health.mutex.Unlock()
86+
r.health.clusterHealth = h
87+
r.health.timestamp = time.Now()
88+
return nil
89+
}

pkg/deployment/resources/member_cleanup.go

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,23 @@
2323
package resources
2424

2525
import (
26-
"context"
2726
"time"
2827

2928
driver "github.com/arangodb/go-driver"
3029
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
30+
"github.com/arangodb/kube-arangodb/pkg/metrics"
3131
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3232
)
3333

3434
const (
3535
// minMemberAge is the minimum duration we expect a member to be created before we remove it because
3636
// it is not part of a deployment.
37-
minMemberAge = time.Minute * 10
37+
minMemberAge = time.Minute * 10
38+
maxClusterHealthAge = time.Second * 20
39+
)
40+
41+
var (
42+
cleanupRemovedMembersCounters = metrics.MustRegisterCounterVec("deployment_resources", "cleanupRemovedMembers", "Number of cleanup-removed-members actions", "deployment", "result")
3843
)
3944

4045
// CleanupRemovedMembers removes all arangod members that are no longer part of ArangoDB deployment.
@@ -43,8 +48,10 @@ func (r *Resources) CleanupRemovedMembers() error {
4348
switch r.context.GetSpec().GetMode() {
4449
case api.DeploymentModeCluster:
4550
if err := r.cleanupRemovedClusterMembers(); err != nil {
51+
cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "failed").Inc()
4652
return maskAny(err)
4753
}
54+
cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "success").Inc()
4855
return nil
4956
default:
5057
// Other modes have no concept of a cluster from which members can be removed
@@ -55,20 +62,16 @@ func (r *Resources) CleanupRemovedMembers() error {
5562
// cleanupRemovedClusterMembers removes all arangod members that are no longer part of the cluster.
5663
func (r *Resources) cleanupRemovedClusterMembers() error {
5764
log := r.log
58-
ctx := context.Background()
5965

60-
// Ask cluster for its health
61-
client, err := r.context.GetDatabaseClient(ctx)
62-
if err != nil {
63-
return maskAny(err)
64-
}
65-
c, err := client.Cluster(ctx)
66-
if err != nil {
67-
return maskAny(err)
68-
}
69-
h, err := c.Health(ctx)
70-
if err != nil {
71-
return maskAny(err)
66+
// Fetch recent cluster health
67+
r.health.mutex.Lock()
68+
h := r.health.clusterHealth
69+
ts := r.health.timestamp
70+
r.health.mutex.Unlock()
71+
72+
// Only accept recent cluster health values
73+
if time.Since(ts) > maxClusterHealthAge {
74+
return nil
7275
}
7376

7477
serverFound := func(id string) bool {

pkg/deployment/resources/resources.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,24 @@
2222

2323
package resources
2424

25-
import "github.com/rs/zerolog"
25+
import (
26+
"sync"
27+
"time"
28+
29+
driver "github.com/arangodb/go-driver"
30+
"github.com/rs/zerolog"
31+
)
2632

2733
// Resources is a service that creates low level resources for members
2834
// and inspects low level resources, put the inspection result in members.
2935
type Resources struct {
3036
log zerolog.Logger
3137
context Context
38+
health struct {
39+
clusterHealth driver.ClusterHealth // Last fetched cluster health
40+
timestamp time.Time // Timestamp of last fetch of cluster health
41+
mutex sync.Mutex // Mutex guarding fields in this struct
42+
}
3243
}
3344

3445
// NewResources creates a new Resources service, used to

pkg/util/errors/errors.go

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package errors
24+
25+
import (
26+
"context"
27+
"fmt"
28+
"io"
29+
"net"
30+
"net/url"
31+
"os"
32+
"syscall"
33+
34+
driver "github.com/arangodb/go-driver"
35+
errs "github.com/pkg/errors"
36+
)
37+
38+
var (
39+
Cause = errs.Cause
40+
New = errs.New
41+
WithStack = errs.WithStack
42+
Wrap = errs.Wrap
43+
Wrapf = errs.Wrapf
44+
)
45+
46+
// WithMessage annotates err with a new message.
47+
// The messages of given error is hidden.
48+
// If err is nil, WithMessage returns nil.
49+
func WithMessage(err error, message string) error {
50+
if err == nil {
51+
return nil
52+
}
53+
return &withMessage{
54+
cause: err,
55+
msg: message,
56+
}
57+
}
58+
59+
// withMessage is an error that replaces the message of its cause.
type withMessage struct {
	cause error  // The wrapped error
	msg   string // The message reported instead of the cause's message
}

// Error returns the replacement message only; the message of the cause is hidden.
func (w *withMessage) Error() string {
	return w.msg
}

// Cause returns the wrapped error.
func (w *withMessage) Cause() error {
	return w.cause
}

// Format implements fmt.Formatter.
// With %+v the cause is written first (also formatted with %+v), followed by
// a newline and the message. With %v, %s and %q only the message is written.
// All other verbs write nothing.
func (w *withMessage) Format(s fmt.State, verb rune) {
	if verb == 'v' && s.Flag('+') {
		fmt.Fprintf(s, "%+v\n", w.Cause())
		io.WriteString(s, w.msg)
		return
	}
	if verb == 'v' || verb == 's' || verb == 'q' {
		io.WriteString(s, w.Error())
	}
}
80+
81+
type timeout interface {
82+
Timeout() bool
83+
}
84+
85+
// IsTimeout returns true if the given error is caused by a timeout error.
86+
func IsTimeout(err error) bool {
87+
if err == nil {
88+
return false
89+
}
90+
if t, ok := errs.Cause(err).(timeout); ok {
91+
return t.Timeout()
92+
}
93+
return false
94+
}
95+
96+
type temporary interface {
97+
Temporary() bool
98+
}
99+
100+
// IsTemporary returns true if the given error is caused by a temporary error.
101+
func IsTemporary(err error) bool {
102+
if err == nil {
103+
return false
104+
}
105+
if t, ok := errs.Cause(err).(temporary); ok {
106+
return t.Temporary()
107+
}
108+
return false
109+
}
110+
111+
// IsEOF returns true if the given error is caused by an EOF error.
112+
func IsEOF(err error) bool {
113+
err = errs.Cause(err)
114+
if err == io.EOF {
115+
return true
116+
}
117+
if ok, err := libCause(err); ok {
118+
return IsEOF(err)
119+
}
120+
return false
121+
}
122+
123+
// IsConnectionRefused returns true if the given error is caused by an "connection refused" error.
124+
func IsConnectionRefused(err error) bool {
125+
err = errs.Cause(err)
126+
if err, ok := err.(syscall.Errno); ok {
127+
return err == syscall.ECONNREFUSED
128+
}
129+
if ok, err := libCause(err); ok {
130+
return IsConnectionRefused(err)
131+
}
132+
return false
133+
}
134+
135+
// IsConnectionReset returns true if the given error is caused by an "connection reset by peer" error.
136+
func IsConnectionReset(err error) bool {
137+
err = errs.Cause(err)
138+
if err, ok := err.(syscall.Errno); ok {
139+
return err == syscall.ECONNRESET
140+
}
141+
if ok, err := libCause(err); ok {
142+
return IsConnectionReset(err)
143+
}
144+
return false
145+
}
146+
147+
// IsContextCanceled returns true if the given error is caused by a context cancelation.
148+
func IsContextCanceled(err error) bool {
149+
err = errs.Cause(err)
150+
if err == context.Canceled {
151+
return true
152+
}
153+
if ok, err := libCause(err); ok {
154+
return IsContextCanceled(err)
155+
}
156+
return false
157+
}
158+
159+
// IsContextDeadlineExpired returns true if the given error is caused by a context deadline expiration.
160+
func IsContextDeadlineExpired(err error) bool {
161+
err = errs.Cause(err)
162+
if err == context.DeadlineExceeded {
163+
return true
164+
}
165+
if ok, err := libCause(err); ok {
166+
return IsContextDeadlineExpired(err)
167+
}
168+
return false
169+
}
170+
171+
// IsContextCanceledOrExpired returns true if the given error is caused by a context cancelation
172+
// or deadline expiration.
173+
func IsContextCanceledOrExpired(err error) bool {
174+
err = errs.Cause(err)
175+
if err == context.Canceled || err == context.DeadlineExceeded {
176+
return true
177+
}
178+
if ok, err := libCause(err); ok {
179+
return IsContextCanceledOrExpired(err)
180+
}
181+
return false
182+
}
183+
184+
// libCause returns the Cause of well known go library errors.
185+
func libCause(err error) (bool, error) {
186+
original := err
187+
for {
188+
switch e := err.(type) {
189+
case *driver.ResponseError:
190+
err = e.Err
191+
case *net.DNSConfigError:
192+
err = e.Err
193+
case *net.OpError:
194+
err = e.Err
195+
case *os.SyscallError:
196+
err = e.Err
197+
case *url.Error:
198+
err = e.Err
199+
default:
200+
return err != original, err
201+
}
202+
}
203+
}

0 commit comments

Comments
 (0)