Skip to content

Commit 1c43b69

Browse files
authored
Merge pull request #370 from arangodb/bug-fix/less-health-calls
Health Call optimization
2 parents 6bb4457 + 285f83b commit 1c43b69

File tree

7 files changed

+47
-26
lines changed

7 files changed

+47
-26
lines changed

pkg/deployment/context_impl.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@ func (d *Deployment) GetSpec() api.DeploymentSpec {
8484
return d.apiObject.Spec
8585
}
8686

87+
// GetDeploymentHealth returns a copy of the latest known state of cluster health
88+
func (d *Deployment) GetDeploymentHealth() (driver.ClusterHealth, error) {
89+
return d.resources.GetDeploymentHealth()
90+
}
91+
8792
// GetStatus returns the current status of the deployment
8893
// together with the current version of that status.
8994
func (d *Deployment) GetStatus() (api.DeploymentStatus, int32) {

pkg/deployment/reconcile/action_context.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ type ActionContext interface {
8484
// SetCurrentImage changes the CurrentImage field in the deployment
8585
// status to the given image.
8686
SetCurrentImage(imageInfo api.ImageInfo) error
87+
// GetDeploymentHealth returns a copy of the latest known state of cluster health
88+
GetDeploymentHealth() (driver.ClusterHealth, error)
8789
// InvalidateSyncStatus resets the sync state to false and triggers an inspection
8890
InvalidateSyncStatus()
8991
}
@@ -107,6 +109,11 @@ func (ac *actionContext) GetMode() api.DeploymentMode {
107109
return ac.context.GetSpec().GetMode()
108110
}
109111

112+
// GetDeploymentHealth returns a copy of the latest known state of cluster health
113+
func (ac *actionContext) GetDeploymentHealth() (driver.ClusterHealth, error) {
114+
return ac.context.GetDeploymentHealth()
115+
}
116+
110117
// GetDatabaseClient returns a cached client for the entire database (cluster coordinators or single server),
111118
// creating one if needed.
112119
func (ac *actionContext) GetDatabaseClient(ctx context.Context) (driver.Client, error) {

pkg/deployment/reconcile/action_remove_member.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,9 @@ func (a *actionRemoveMember) Start(ctx context.Context) (bool, error) {
7272
a.log.Err(err).Str("member-id", m.ID).Msgf("Failed to remove server from cluster")
7373
// ignore this error, maybe all coordinators are failed and no connection to cluster is possible
7474
} else if driver.IsPreconditionFailed(err) {
75-
cluster, err := client.Cluster(ctx)
75+
health, err := a.actionCtx.GetDeploymentHealth()
7676
if err != nil {
77-
return false, maskAny(errors.Wrapf(err, "Failed to obtain cluster: %#v", err))
78-
}
79-
health, err := cluster.Health(ctx)
80-
if err != nil {
81-
return false, maskAny(errors.Wrapf(err, "Failed to obtain cluster health: %#v", err))
77+
return false, maskAny(errors.Wrapf(err, "failed to get cluster health"))
8278
}
8379
// We don't care if not found
8480
if record, ok := health.Health[driver.ServerID(m.ID)]; ok {

pkg/deployment/reconcile/action_wait_for_member_up.go

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828

2929
driver "github.com/arangodb/go-driver"
3030
"github.com/arangodb/go-driver/agency"
31+
"github.com/pkg/errors"
3132
"github.com/rs/zerolog"
3233

3334
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
@@ -143,20 +144,9 @@ func (a *actionWaitForMemberUp) checkProgressAgent(ctx context.Context) (bool, b
143144
// of a cluster deployment (coordinator/dbserver).
144145
func (a *actionWaitForMemberUp) checkProgressCluster(ctx context.Context) (bool, bool, error) {
145146
log := a.log
146-
c, err := a.actionCtx.GetDatabaseClient(ctx)
147-
if err != nil {
148-
log.Debug().Err(err).Msg("Failed to create database client")
149-
return false, false, maskAny(err)
150-
}
151-
cluster, err := c.Cluster(ctx)
147+
h, err := a.actionCtx.GetDeploymentHealth()
152148
if err != nil {
153-
log.Debug().Err(err).Msg("Failed to access cluster")
154-
return false, false, maskAny(err)
155-
}
156-
h, err := cluster.Health(ctx)
157-
if err != nil {
158-
log.Debug().Err(err).Msg("Failed to get cluster health")
159-
return false, false, maskAny(err)
149+
return false, false, maskAny(errors.Wrapf(err, "failed to get cluster health"))
160150
}
161151
sh, found := h.Health[driver.ServerID(a.action.MemberID)]
162152
if !found {

pkg/deployment/reconcile/context.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ import (
2828
"github.com/arangodb/arangosync/client"
2929
driver "github.com/arangodb/go-driver"
3030
"github.com/arangodb/go-driver/agency"
31-
"k8s.io/api/core/v1"
3231
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3332

3433
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
@@ -95,6 +94,8 @@ type Context interface {
9594
// GetExpectedPodArguments creates command line arguments for a server in the given group with given ID.
9695
GetExpectedPodArguments(apiObject metav1.Object, deplSpec api.DeploymentSpec, group api.ServerGroup,
9796
agents api.MemberStatusList, id string, version driver.Version) []string
97+
// GetDeploymentHealth returns a copy of the latest known state of cluster health
98+
GetDeploymentHealth() (driver.ClusterHealth, error)
9899
// GetShardSyncStatus returns true if all shards are in sync
99100
GetShardSyncStatus() bool
100101
// InvalidateSyncStatus resets the sync state to false and triggers an inspection

pkg/deployment/resources/deployment_health.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ package resources
2424

2525
import (
2626
"context"
27+
"fmt"
2728
"time"
2829

30+
driver "github.com/arangodb/go-driver"
2931
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
3032
"github.com/arangodb/kube-arangodb/pkg/metrics"
3133
)
@@ -91,6 +93,24 @@ func (r *Resources) fetchDeploymentHealth() error {
9193
return nil
9294
}
9395

96+
// GetDeploymentHealth returns a copy of the latest known state of cluster health
97+
func (r *Resources) GetDeploymentHealth() (driver.ClusterHealth, error) {
98+
99+
r.health.mutex.Lock()
100+
defer r.health.mutex.Unlock()
101+
if r.health.timestamp.IsZero() {
102+
return driver.ClusterHealth{}, fmt.Errorf("No cluster health available")
103+
}
104+
105+
newhealth := r.health.clusterHealth
106+
newhealth.Health = make(map[driver.ServerID]driver.ServerHealth)
107+
108+
for k, v := range r.health.clusterHealth.Health {
109+
newhealth.Health[k] = v
110+
}
111+
return newhealth, nil
112+
}
113+
94114
// RunDeploymentShardSyncLoop creates a loop to fetch the sync status of shards of the deployment.
95115
// The loop ends when the given channel is closed.
96116
func (r *Resources) RunDeploymentShardSyncLoop(stopCh <-chan struct{}) {

pkg/deployment/resources/pod_termination.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,12 @@ func (r *Resources) prepareAgencyPodTermination(ctx context.Context, log zerolog
5858
agentDataWillBeGone := false
5959
if p.Spec.NodeName != "" {
6060
node, err := r.context.GetKubeCli().CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{})
61-
if err != nil {
61+
if k8sutil.IsNotFound(err) {
62+
log.Warn().Msg("Node not found")
63+
} else if err != nil {
6264
log.Warn().Err(err).Msg("Failed to get node for member")
6365
return maskAny(err)
64-
}
65-
if node.Spec.Unschedulable {
66+
} else if node.Spec.Unschedulable {
6667
agentDataWillBeGone = true
6768
}
6869
}
@@ -140,11 +141,12 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
140141
dbserverDataWillBeGone := false
141142
if p.Spec.NodeName != "" {
142143
node, err := r.context.GetKubeCli().CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{})
143-
if err != nil {
144+
if k8sutil.IsNotFound(err) {
145+
log.Warn().Msg("Node not found")
146+
} else if err != nil {
144147
log.Warn().Err(err).Msg("Failed to get node for member")
145148
return maskAny(err)
146-
}
147-
if node.Spec.Unschedulable {
149+
} else if node.Spec.Unschedulable {
148150
dbserverDataWillBeGone = true
149151
}
150152
}

0 commit comments

Comments
 (0)