Skip to content

Commit 0cc2d5d

Browse files
committed
Adding various metrics
1 parent 56466c4 commit 0cc2d5d

File tree

12 files changed

+724
-74
lines changed

12 files changed

+724
-74
lines changed

examples/metrics/dashboard.json

Lines changed: 558 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,22 @@
11
# This example shows how to integrate with the Prometheus Operator
22
# to bring metrics from kube-arangodb to Prometheus.
33

4-
apiVersion: v1
5-
kind: Service
6-
metadata:
7-
name: arango-deployment-operator
8-
labels:
9-
app: arango-deployment-operator
10-
spec:
11-
selector:
12-
app: arango-deployment-operator
13-
ports:
14-
- name: metrics
15-
port: 8528
16-
17-
---
18-
194
apiVersion: monitoring.coreos.com/v1
205
kind: ServiceMonitor
216
metadata:
227
name: arango-deployment-operator
8+
namespace: monitoring
239
labels:
24-
team: frontend
10+
prometheus: kube-prometheus
2511
spec:
2612
selector:
2713
matchLabels:
2814
app: arango-deployment-operator
15+
namespaceSelector:
16+
matchNames:
17+
- default
2918
endpoints:
30-
- port: metrics
19+
- port: server
3120
scheme: https
3221
tlsConfig:
3322
insecureSkipVerify: true
34-

pkg/deployment/deployment_inspector.go

Lines changed: 27 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,17 @@ import (
2727
"time"
2828

2929
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
30+
"github.com/arangodb/kube-arangodb/pkg/metrics"
3031
"github.com/arangodb/kube-arangodb/pkg/util"
3132
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3233
"github.com/arangodb/kube-arangodb/pkg/util/profiler"
3334
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3435
)
3536

37+
var (
38+
inspectDeploymentDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_deployment_duration", "Amount of time taken by a single inspection of a deployment (in sec)", metrics.DeploymentName)
39+
)
40+
3641
// inspectDeployment inspects the entire deployment, creates
3742
// a plan to update if needed and inspects underlying resources.
3843
// This function should be called when:
@@ -42,13 +47,16 @@ import (
4247
// Returns the delay until this function should be called again.
4348
func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval {
4449
log := d.deps.Log
50+
start := time.Now()
4551

4652
nextInterval := lastInterval
4753
hasError := false
4854
ctx := context.Background()
55+
deploymentName := d.apiObject.GetName()
56+
defer metrics.SetDuration(inspectDeploymentDurationGauges.WithLabelValues(deploymentName), start)
4957

5058
// Check deployment still exists
51-
updated, err := d.deps.DatabaseCRCli.DatabaseV1alpha().ArangoDeployments(d.apiObject.GetNamespace()).Get(d.apiObject.GetName(), metav1.GetOptions{})
59+
updated, err := d.deps.DatabaseCRCli.DatabaseV1alpha().ArangoDeployments(d.apiObject.GetNamespace()).Get(deploymentName, metav1.GetOptions{})
5260
if k8sutil.IsNotFound(err) {
5361
// Deployment is gone
5462
log.Info().Msg("Deployment is gone")
@@ -129,47 +137,27 @@ func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval
129137
}
130138

131139
// Ensure all resources are created
132-
{
133-
ps := profiler.Start()
134-
{
135-
ps := profiler.Start()
136-
if err := d.resources.EnsureSecrets(); err != nil {
137-
hasError = true
138-
d.CreateEvent(k8sutil.NewErrorEvent("Secret creation failed", err, d.apiObject))
139-
}
140-
ps.LogIf(log, time.Millisecond*10, "EnsureSecrets")
141-
}
142-
{
143-
ps := profiler.Start()
144-
if err := d.resources.EnsureServices(); err != nil {
145-
hasError = true
146-
d.CreateEvent(k8sutil.NewErrorEvent("Service creation failed", err, d.apiObject))
147-
}
148-
ps.LogIf(log, time.Millisecond*10, "EnsureServices")
149-
}
150-
if err := d.resources.EnsurePVCs(); err != nil {
151-
hasError = true
152-
d.CreateEvent(k8sutil.NewErrorEvent("PVC creation failed", err, d.apiObject))
153-
}
154-
{
155-
ps := profiler.Start()
156-
if err := d.resources.EnsurePods(); err != nil {
157-
hasError = true
158-
d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject))
159-
}
160-
ps.LogIf(log, time.Millisecond*10, "EnsurePods")
161-
}
162-
ps.Done(log, "ensure resources")
140+
if err := d.resources.EnsureSecrets(); err != nil {
141+
hasError = true
142+
d.CreateEvent(k8sutil.NewErrorEvent("Secret creation failed", err, d.apiObject))
143+
}
144+
if err := d.resources.EnsureServices(); err != nil {
145+
hasError = true
146+
d.CreateEvent(k8sutil.NewErrorEvent("Service creation failed", err, d.apiObject))
147+
}
148+
if err := d.resources.EnsurePVCs(); err != nil {
149+
hasError = true
150+
d.CreateEvent(k8sutil.NewErrorEvent("PVC creation failed", err, d.apiObject))
151+
}
152+
if err := d.resources.EnsurePods(); err != nil {
153+
hasError = true
154+
d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject))
163155
}
164156

165157
// Create access packages
166-
{
167-
ps := profiler.Start()
168-
if err := d.createAccessPackages(); err != nil {
169-
hasError = true
170-
d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject))
171-
}
172-
ps.Done(log, "createAccessPackages")
158+
if err := d.createAccessPackages(); err != nil {
159+
hasError = true
160+
d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject))
173161
}
174162

175163
// Inspect deployment for obsolete members

pkg/deployment/metrics.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package deployment
24+
25+
const (
26+
// Component name for metrics of this package
27+
metricsComponent = "deployment"
28+
)

pkg/deployment/resources/deployment_health.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ import (
3131
)
3232

3333
var (
34-
fetchDeploymentHealthCounters = metrics.MustRegisterCounterVec("deployment_resources", "fetchDeploymentHealth", "Number of times the health of the deployment was fetched", "deployment", "result")
34+
deploymentHealthFetchesCounters = metrics.MustRegisterCounterVec(metricsComponent, "deployment_health_fetches", "Number of times the health of the deployment was fetched", metrics.DeploymentName, metrics.Result)
3535
)
3636

3737
// RunDeploymentHealthLoop creates a loop to fetch the health of the deployment.
@@ -48,9 +48,9 @@ func (r *Resources) RunDeploymentHealthLoop(stopCh <-chan struct{}) {
4848
for {
4949
if err := r.fetchDeploymentHealth(); err != nil {
5050
log.Debug().Err(err).Msg("Failed to fetch deployment health")
51-
fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "failed").Inc()
51+
deploymentHealthFetchesCounters.WithLabelValues(deploymentName, metrics.Failed).Inc()
5252
} else {
53-
fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "success").Inc()
53+
deploymentHealthFetchesCounters.WithLabelValues(deploymentName, metrics.Success).Inc()
5454
}
5555
select {
5656
case <-time.After(time.Second * 5):

pkg/deployment/resources/member_cleanup.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,19 +39,20 @@ const (
3939
)
4040

4141
var (
42-
cleanupRemovedMembersCounters = metrics.MustRegisterCounterVec("deployment_resources", "cleanupRemovedMembers", "Number of cleanup-removed-members actions", "deployment", "result")
42+
cleanupRemovedMembersCounters = metrics.MustRegisterCounterVec(metricsComponent, "cleanup_removed_members", "Number of cleanup-removed-members actions", metrics.DeploymentName, metrics.Result)
4343
)
4444

4545
// CleanupRemovedMembers removes all arangod members that are no longer part of the ArangoDB deployment.
4646
func (r *Resources) CleanupRemovedMembers() error {
4747
// Decide what to do depending on cluster mode
4848
switch r.context.GetSpec().GetMode() {
4949
case api.DeploymentModeCluster:
50+
deploymentName := r.context.GetAPIObject().GetName()
5051
if err := r.cleanupRemovedClusterMembers(); err != nil {
51-
cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "failed").Inc()
52+
cleanupRemovedMembersCounters.WithLabelValues(deploymentName, metrics.Failed).Inc()
5253
return maskAny(err)
5354
}
54-
cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "success").Inc()
55+
cleanupRemovedMembersCounters.WithLabelValues(deploymentName, metrics.Success).Inc()
5556
return nil
5657
default:
5758
// Other modes have no concept of a cluster in which members can be removed
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package resources
24+
25+
const (
26+
// Component name for metrics of this package
27+
metricsComponent = "deployment_resources"
28+
)

pkg/deployment/resources/pod_inspector.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ import (
3636
)
3737

3838
var (
39-
inspectedPodCounter = metrics.MustRegisterCounter("deployment", "inspected_pods", "Number of pod inspections")
39+
inspectedPodsCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_pods", "Number of pod inspections per deployment", metrics.DeploymentName)
40+
inspectPodsDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_pods_duration", "Amount of time taken by a single inspection of all pods for a deployment (in sec)", metrics.DeploymentName)
4041
)
4142

4243
const (
@@ -50,8 +51,12 @@ const (
5051
// Returns: Interval_till_next_inspection, error
5152
func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
5253
log := r.log
54+
start := time.Now()
55+
apiObject := r.context.GetAPIObject()
56+
deploymentName := apiObject.GetName()
5357
var events []*k8sutil.Event
5458
nextInterval := maxPodInspectorInterval // Large by default, will be made smaller if needed in the rest of the function
59+
defer metrics.SetDuration(inspectPodsDurationGauges.WithLabelValues(deploymentName), start)
5560

5661
pods, err := r.context.GetOwnedPods()
5762
if err != nil {
@@ -61,7 +66,6 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
6166

6267
// Update member status from all pods found
6368
status, lastVersion := r.context.GetStatus()
64-
apiObject := r.context.GetAPIObject()
6569
var podNamesWithScheduleTimeout []string
6670
var unscheduledPodNames []string
6771
for _, p := range pods {
@@ -71,7 +75,7 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
7175
}
7276

7377
// Pod belongs to this deployment, update metric
74-
inspectedPodCounter.Inc()
78+
inspectedPodsCounters.WithLabelValues(deploymentName).Inc()
7579

7680
// Find member status
7781
memberStatus, group, found := status.Members.MemberStatusByPodName(p.GetName())

pkg/deployment/resources/pvc_inspector.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,22 @@ import (
3232
)
3333

3434
var (
35-
inspectedPVCCounter = metrics.MustRegisterCounter("deployment", "inspected_ppvcs", "Number of PVCs inspections")
35+
inspectedPVCsCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_pvcs", "Number of PVC inspections per deployment", metrics.DeploymentName)
36+
inspectPVCsDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_pvcs_duration", "Amount of time taken by a single inspection of all PVCs for a deployment (in sec)", metrics.DeploymentName)
37+
)
38+
39+
const (
3640
maxPVCInspectorInterval = util.Interval(time.Hour) // Maximum time between PVC inspection (if nothing else happens)
3741
)
3842

3943
// InspectPVCs lists all PVCs that belong to the given deployment and updates
4044
// the member status of the deployment accordingly.
4145
func (r *Resources) InspectPVCs(ctx context.Context) (util.Interval, error) {
4246
log := r.log
47+
start := time.Now()
4348
nextInterval := maxPVCInspectorInterval
49+
deploymentName := r.context.GetAPIObject().GetName()
50+
defer metrics.SetDuration(inspectPVCsDurationGauges.WithLabelValues(deploymentName), start)
4451

4552
pvcs, err := r.context.GetOwnedPVCs()
4653
if err != nil {
@@ -52,7 +59,7 @@ func (r *Resources) InspectPVCs(ctx context.Context) (util.Interval, error) {
5259
status, _ := r.context.GetStatus()
5360
for _, p := range pvcs {
5461
// PVC belongs to this deployment, update metric
55-
inspectedPVCCounter.Inc()
62+
inspectedPVCsCounters.WithLabelValues(deploymentName).Inc()
5663

5764
// Find member status
5865
memberStatus, group, found := status.Members.MemberStatusByPVCName(p.GetName())

pkg/deployment/resources/secrets.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,39 +25,57 @@ package resources
2525
import (
2626
"crypto/rand"
2727
"encoding/hex"
28+
"time"
2829

2930
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031

3132
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
33+
"github.com/arangodb/kube-arangodb/pkg/metrics"
3234
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3335
)
3436

37+
var (
38+
inspectedSecretsCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_secrets", "Number of Secret inspections per deployment", metrics.DeploymentName)
39+
inspectSecretsDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_secrets_duration", "Amount of time taken by a single inspection of all Secrets for a deployment (in sec)", metrics.DeploymentName)
40+
)
41+
3542
// EnsureSecrets creates all secrets needed to run the given deployment
3643
func (r *Resources) EnsureSecrets() error {
44+
start := time.Now()
3745
kubecli := r.context.GetKubeCli()
3846
ns := r.context.GetNamespace()
3947
secrets := k8sutil.NewSecretCache(kubecli.CoreV1().Secrets(ns))
4048
spec := r.context.GetSpec()
49+
deploymentName := r.context.GetAPIObject().GetName()
50+
defer metrics.SetDuration(inspectSecretsDurationGauges.WithLabelValues(deploymentName), start)
51+
counterMetric := inspectedSecretsCounters.WithLabelValues(deploymentName)
52+
4153
if spec.IsAuthenticated() {
54+
counterMetric.Inc()
4255
if err := r.ensureTokenSecret(secrets, spec.Authentication.GetJWTSecretName()); err != nil {
4356
return maskAny(err)
4457
}
4558
}
4659
if spec.IsSecure() {
60+
counterMetric.Inc()
4761
if err := r.ensureTLSCACertificateSecret(secrets, spec.TLS); err != nil {
4862
return maskAny(err)
4963
}
5064
}
5165
if spec.Sync.IsEnabled() {
66+
counterMetric.Inc()
5267
if err := r.ensureTokenSecret(secrets, spec.Sync.Authentication.GetJWTSecretName()); err != nil {
5368
return maskAny(err)
5469
}
70+
counterMetric.Inc()
5571
if err := r.ensureTokenSecret(secrets, spec.Sync.Monitoring.GetTokenSecretName()); err != nil {
5672
return maskAny(err)
5773
}
74+
counterMetric.Inc()
5875
if err := r.ensureTLSCACertificateSecret(secrets, spec.Sync.TLS); err != nil {
5976
return maskAny(err)
6077
}
78+
counterMetric.Inc()
6179
if err := r.ensureClientAuthCACertificateSecret(secrets, spec.Sync.Authentication); err != nil {
6280
return maskAny(err)
6381
}

0 commit comments

Comments
 (0)