Skip to content

Commit 64201df

Browse files
authored
[Feature] [Metrics] Member restarts (#1047)
1 parent cde52bb commit 64201df

File tree

14 files changed

+517
-160
lines changed

14 files changed

+517
-160
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
- (Bugfix) Extend Agency HealthCheck for replace
2929
- (Bugfix) Allow to remove resources (CPU & Memory) on the managed pods
3030
- (Bugfix) Add DistributeShardsLike support
31+
- (Feature) Member restarts metric
3132

3233
## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07)
3334
- (Bugfix) Fix arangosync members state inspection

docs/generated/metrics/README.md

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,21 @@
22

33
## List
44

5-
| Name | Namespace | Group | Type | Description |
6-
|:---------------------------------------------------------------------------------------------------------------:|:-----------------:|:------------:|:-------:|:---------------------------------------------------|
7-
| [arangodb_operator_agency_errors](./arangodb_operator_agency_errors.md) | arangodb_operator | agency | Counter | Current count of agency cache fetch errors |
8-
| [arangodb_operator_agency_fetches](./arangodb_operator_agency_fetches.md) | arangodb_operator | agency | Counter | Current count of agency cache fetches |
9-
| [arangodb_operator_agency_index](./arangodb_operator_agency_index.md) | arangodb_operator | agency | Gauge | Current index of the agency cache |
10-
| [arangodb_operator_agency_cache_health_present](./arangodb_operator_agency_cache_health_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache health is present |
11-
| [arangodb_operator_agency_cache_healthy](./arangodb_operator_agency_cache_healthy.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is healthy |
12-
| [arangodb_operator_agency_cache_leaders](./arangodb_operator_agency_cache_leaders.md) | arangodb_operator | agency_cache | Gauge | Determines agency leader vote count |
13-
| [arangodb_operator_agency_cache_member_commit_offset](./arangodb_operator_agency_cache_member_commit_offset.md) | arangodb_operator | agency_cache | Gauge | Determines agency member commit offset |
14-
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
15-
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
16-
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
17-
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |
18-
| [arangodb_operator_rebalancer_moves_current](./arangodb_operator_rebalancer_moves_current.md) | arangodb_operator | rebalancer | Gauge | Define how many moves are currently in progress |
19-
| [arangodb_operator_rebalancer_moves_failed](./arangodb_operator_rebalancer_moves_failed.md) | arangodb_operator | rebalancer | Counter | Define how many moves failed |
20-
| [arangodb_operator_rebalancer_moves_generated](./arangodb_operator_rebalancer_moves_generated.md) | arangodb_operator | rebalancer | Counter | Define how many moves were generated |
21-
| [arangodb_operator_rebalancer_moves_succeeded](./arangodb_operator_rebalancer_moves_succeeded.md) | arangodb_operator | rebalancer | Counter | Define how many moves succeeded |
5+
| Name | Namespace | Group | Type | Description |
6+
|:---------------------------------------------------------------------------------------------------------------------------:|:-----------------:|:------------:|:-------:|:--------------------------------------------------------------------------------------|
7+
| [arangodb_operator_agency_errors](./arangodb_operator_agency_errors.md) | arangodb_operator | agency | Counter | Current count of agency cache fetch errors |
8+
| [arangodb_operator_agency_fetches](./arangodb_operator_agency_fetches.md) | arangodb_operator | agency | Counter | Current count of agency cache fetches |
9+
| [arangodb_operator_agency_index](./arangodb_operator_agency_index.md) | arangodb_operator | agency | Gauge | Current index of the agency cache |
10+
| [arangodb_operator_agency_cache_health_present](./arangodb_operator_agency_cache_health_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache health is present |
11+
| [arangodb_operator_agency_cache_healthy](./arangodb_operator_agency_cache_healthy.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is healthy |
12+
| [arangodb_operator_agency_cache_leaders](./arangodb_operator_agency_cache_leaders.md) | arangodb_operator | agency_cache | Gauge | Determines agency leader vote count |
13+
| [arangodb_operator_agency_cache_member_commit_offset](./arangodb_operator_agency_cache_member_commit_offset.md) | arangodb_operator | agency_cache | Gauge | Determines agency member commit offset |
14+
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
15+
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
16+
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
17+
| [arangodb_operator_members_unexpected_container_exit_codes](./arangodb_operator_members_unexpected_container_exit_codes.md) | arangodb_operator | members | Counter | Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers) |
18+
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |
19+
| [arangodb_operator_rebalancer_moves_current](./arangodb_operator_rebalancer_moves_current.md) | arangodb_operator | rebalancer | Gauge | Define how many moves are currently in progress |
20+
| [arangodb_operator_rebalancer_moves_failed](./arangodb_operator_rebalancer_moves_failed.md) | arangodb_operator | rebalancer | Counter | Define how many moves failed |
21+
| [arangodb_operator_rebalancer_moves_generated](./arangodb_operator_rebalancer_moves_generated.md) | arangodb_operator | rebalancer | Counter | Define how many moves were generated |
22+
| [arangodb_operator_rebalancer_moves_succeeded](./arangodb_operator_rebalancer_moves_succeeded.md) | arangodb_operator | rebalancer | Counter | Define how many moves succeeded |
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# arangodb_operator_members_unexpected_container_exit_codes (Counter)
2+
3+
## Description
4+
5+
Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)
6+
7+
## Labels
8+
9+
| Label | Description |
10+
|:--------------:|:-------------------------------------------|
11+
| namespace | Deployment Namespace |
12+
| name | Deployment Name |
13+
| member | Member ID |
14+
| container | Container Name |
15+
| container_type | Container/InitContainer/EphemeralContainer |
16+
| code | ExitCode |

internal/metrics.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,17 @@ func generateMetricsGO(root string, in MetricsDoc) error {
331331
keys = append(keys, "value")
332332

333333
for _, label := range details.Labels {
334-
k := strings.ToLower(label.Key)
334+
v := strings.Split(strings.ToLower(label.Key), "_")
335+
for id := range v {
336+
if id == 0 {
337+
continue
338+
}
339+
340+
v[id] = strings.Title(v[id])
341+
}
342+
343+
k := strings.Join(v, "")
344+
335345
keys = append(keys, k)
336346

337347
if t := label.Type; t != nil {

internal/metrics.yaml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,4 +148,22 @@ namespaces:
148148
- key: namespace
149149
description: "Deployment Namespace"
150150
- key: name
151-
description: "Deployment Name"
151+
description: "Deployment Name"
152+
members:
153+
unexpected_container_exit_codes:
154+
shortDescription: "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)"
155+
description: "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)"
156+
type: "Counter"
157+
labels:
158+
- key: namespace
159+
description: "Deployment Namespace"
160+
- key: name
161+
description: "Deployment Name"
162+
- key: member
163+
description: "Member ID"
164+
- key: container
165+
description: "Container Name"
166+
- key: container_type
167+
description: "Container/InitContainer/EphemeralContainer"
168+
- key: code
169+
description: "ExitCode"

pkg/deployment/deployment.go

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -137,13 +137,7 @@ type Deployment struct {
137137

138138
memberState memberState.StateInspector
139139

140-
metrics struct {
141-
agency struct {
142-
errors uint64
143-
fetches uint64
144-
index uint64
145-
}
146-
}
140+
metrics Metrics
147141
}
148142

149143
func (d *Deployment) WithArangoMember(cache inspectorInterface.Inspector, timeout time.Duration, name string) reconciler.ArangoMemberModContext {

pkg/deployment/deployment_inspector.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -254,12 +254,12 @@ func (d *Deployment) inspectDeploymentWithError(ctx context.Context, lastInterva
254254
nextInterval = interval
255255
}
256256

257-
d.metrics.agency.fetches++
257+
d.metrics.Agency.Fetches++
258258
if offset, err := d.RefreshAgencyCache(ctx); err != nil {
259-
d.metrics.agency.errors++
259+
d.metrics.Agency.Errors++
260260
d.log.Err(err).Error("Unable to refresh agency")
261261
} else {
262-
d.metrics.agency.index = offset
262+
d.metrics.Agency.Index = offset
263263
}
264264

265265
// Refresh maintenance lock

pkg/deployment/metrics.go

Lines changed: 14 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -21,145 +21,22 @@
2121
package deployment
2222

2323
import (
24-
"sync"
25-
26-
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
2724
"github.com/arangodb/kube-arangodb/pkg/generated/metric_descriptions"
28-
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/throttle"
2925
"github.com/arangodb/kube-arangodb/pkg/util/metrics"
30-
"github.com/prometheus/client_golang/prometheus"
31-
)
32-
33-
const (
34-
// Component name for metrics of this package
35-
metricsComponent = "deployment"
3626
)
3727

38-
func init() {
39-
localInventory = inventory{
40-
deployments: map[string]map[string]*Deployment{},
41-
deploymentsMetric: metrics.NewDescription("arangodb_operator_deployments", "Number of active deployments", []string{"namespace", "deployment"}, nil),
42-
deploymentMetricsMembersMetric: metrics.NewDescription("arango_operator_deployment_members", "List of members", []string{"namespace", "deployment", "role", "id"}, nil),
43-
deploymentAgencyStateMetric: metrics.NewDescription("arango_operator_deployment_agency_state", "Reachability of agency", []string{"namespace", "deployment"}, nil),
44-
deploymentShardLeadersMetric: metrics.NewDescription("arango_operator_deployment_shard_leaders", "Deployment leader shards distribution", []string{"namespace", "deployment", "database", "collection", "shard", "server"}, nil),
45-
deploymentShardsMetric: metrics.NewDescription("arango_operator_deployment_shards", "Deployment shards distribution", []string{"namespace", "deployment", "database", "collection", "shard", "server"}, nil),
46-
47-
operatorStateRefreshMetric: metrics.NewDescription("arango_operator_deployment_state_refresh_count", "Number of refreshes in deployment", []string{"namespace", "deployment", "type"}, nil),
48-
}
49-
50-
prometheus.MustRegister(&localInventory)
51-
}
52-
53-
var localInventory inventory
54-
55-
var _ prometheus.Collector = &inventory{}
56-
57-
type inventory struct {
58-
lock sync.Mutex
59-
deployments map[string]map[string]*Deployment
60-
61-
deploymentsMetric, deploymentMetricsMembersMetric, deploymentAgencyStateMetric, deploymentShardsMetric, deploymentShardLeadersMetric metrics.Description
62-
63-
operatorStateRefreshMetric metrics.Description
64-
}
65-
66-
func (i *inventory) Describe(descs chan<- *prometheus.Desc) {
67-
i.lock.Lock()
68-
defer i.lock.Unlock()
69-
70-
pd := metrics.NewPushDescription(descs)
71-
pd.Push(i.deploymentsMetric, i.deploymentMetricsMembersMetric, i.deploymentAgencyStateMetric, i.deploymentShardLeadersMetric, i.deploymentShardsMetric, i.operatorStateRefreshMetric)
72-
73-
metric_descriptions.Descriptions(pd)
74-
}
75-
76-
func (i *inventory) Collect(m chan<- prometheus.Metric) {
77-
i.lock.Lock()
78-
defer i.lock.Unlock()
79-
80-
p := metrics.NewPushMetric(m)
81-
for _, deployments := range i.deployments {
82-
for _, deployment := range deployments {
83-
p.Push(i.deploymentsMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName()))
84-
85-
deployment.CollectMetrics(p)
86-
87-
if state := deployment.acs.CurrentClusterCache(); state != nil {
88-
t := state.GetThrottles()
89-
90-
for _, c := range throttle.AllComponents() {
91-
p.Push(i.operatorStateRefreshMetric.Gauge(float64(t.Get(c).Count()), deployment.GetNamespace(), deployment.GetName(), string(c)))
92-
}
93-
}
94-
95-
spec := deployment.GetSpec()
96-
status, _ := deployment.GetStatus()
97-
98-
for _, member := range status.Members.AsList() {
99-
p.Push(i.deploymentMetricsMembersMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName(), member.Group.AsRole(), member.Member.ID))
100-
}
101-
102-
if spec.Mode.Get().HasAgents() {
103-
agency, agencyOk := deployment.GetAgencyCache()
104-
if !agencyOk {
105-
p.Push(i.deploymentAgencyStateMetric.Gauge(0, deployment.GetNamespace(), deployment.GetName()))
106-
continue
107-
}
108-
109-
p.Push(i.deploymentAgencyStateMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName()))
110-
111-
if spec.Mode.Get() == api.DeploymentModeCluster {
112-
for db, collections := range agency.Current.Collections {
113-
for collection, shards := range collections {
114-
for shard, details := range shards {
115-
for id, server := range details.Servers {
116-
name := "UNKNOWN"
117-
if _, ok := agency.Plan.Collections[db]; ok {
118-
if _, ok := agency.Plan.Collections[db][collection]; ok {
119-
name = agency.Plan.Collections[db][collection].GetName(name)
120-
}
121-
}
122-
123-
m := []string{
124-
deployment.GetNamespace(),
125-
deployment.GetName(),
126-
db,
127-
name,
128-
shard,
129-
string(server),
130-
}
131-
132-
if id == 0 {
133-
p.Push(i.deploymentShardLeadersMetric.Gauge(1, m...))
134-
}
135-
p.Push(i.deploymentShardsMetric.Gauge(1, m...))
136-
}
137-
}
138-
}
139-
}
140-
}
141-
}
142-
}
28+
// Metrics holds the per-deployment counters that CollectMetrics exports.
//
// Agency tracks the agency cache refreshes performed during deployment
// inspection:
//   - Errors:  number of refresh attempts that returned an error.
//   - Fetches: number of refresh attempts; incremented before each attempt.
//   - Index:   offset returned by the most recent successful refresh.
type Metrics struct {
29+
Agency struct {
30+
Errors uint64
31+
Fetches uint64
32+
Index uint64
14333
}
14434
}
14535

146-
func (i *inventory) Add(d *Deployment) {
147-
i.lock.Lock()
148-
defer i.lock.Unlock()
149-
150-
name, namespace := d.GetName(), d.GetNamespace()
151-
152-
if _, ok := i.deployments[namespace]; !ok {
153-
i.deployments[namespace] = map[string]*Deployment{}
154-
}
155-
156-
i.deployments[namespace][name] = d
157-
}
158-
15936
func (d *Deployment) CollectMetrics(m metrics.PushMetric) {
160-
m.Push(metric_descriptions.ArangodbOperatorAgencyErrorsCounter(float64(d.metrics.agency.errors), d.namespace, d.name))
161-
m.Push(metric_descriptions.ArangodbOperatorAgencyFetchesCounter(float64(d.metrics.agency.fetches), d.namespace, d.name))
162-
m.Push(metric_descriptions.ArangodbOperatorAgencyIndexGauge(float64(d.metrics.agency.index), d.namespace, d.name))
37+
m.Push(metric_descriptions.ArangodbOperatorAgencyErrorsCounter(float64(d.metrics.Agency.Errors), d.namespace, d.name))
38+
m.Push(metric_descriptions.ArangodbOperatorAgencyFetchesCounter(float64(d.metrics.Agency.Fetches), d.namespace, d.name))
39+
m.Push(metric_descriptions.ArangodbOperatorAgencyIndexGauge(float64(d.metrics.Agency.Index), d.namespace, d.name))
16340

16441
if c := d.agencyCache; c != nil {
16542
m.Push(metric_descriptions.ArangodbOperatorAgencyCachePresentGauge(1, d.namespace, d.name))
@@ -174,7 +51,13 @@ func (d *Deployment) CollectMetrics(m metrics.PushMetric) {
17451
m.Push(metric_descriptions.ArangodbOperatorAgencyCachePresentGauge(0, d.namespace, d.name))
17552
}
17653

54+
// Reconcile
17755
if c := d.reconciler; c != nil {
17856
c.CollectMetrics(m)
17957
}
58+
59+
// Resources
60+
if r := d.resources; r != nil {
61+
r.CollectMetrics(m)
62+
}
18063
}

0 commit comments

Comments
 (0)