Skip to content

Commit 42973e5

Browse files
alex5517 and Dasomeone authored
feat(opentelemetry-collector): Add new alerts and enhance configuration (#1486)
* feat: Add new alerts and enhance configuration
* feat: add alert summaries and update default datasource name
* jsonnetfmt

Signed-off-by: Alexander Soelberg Heidarsson <89837986+alex5517@users.noreply.github.com>
---------
Signed-off-by: Alexander Soelberg Heidarsson <89837986+alex5517@users.noreply.github.com>
Co-authored-by: Emily <1282515+Dasomeone@users.noreply.github.com>
1 parent 41bc6a4 commit 42973e5

File tree

5 files changed

+170
-17
lines changed

5 files changed

+170
-17
lines changed

opentelemetry-collector-mixin/alerts/alerts.libsonnet

Lines changed: 141 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,153 @@
55
name: 'otelcol',
66
rules: [
77
{
8-
alert: 'OtelcolSendingQueueFull',
8+
alert: 'ReceiverDroppedSpans',
99
expr: |||
10-
otelcol_exporter_queue_size >= otelcol_exporter_queue_capacity
10+
rate(otelcol_receiver_refused_spans_total[5m]) > 0
1111
|||,
12-
'for': '30m',
12+
'for': '2m',
13+
labels: {
14+
severity: 'critical',
15+
},
16+
annotations: {
17+
summary: 'Receiver is dropping spans.',
18+
description: 'The {{ $labels.receiver }} receiver is dropping spans at a rate of {{ humanize $value }} per second.',
19+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures',
20+
},
21+
},
22+
{
23+
alert: 'ReceiverDroppedMetrics',
24+
expr: |||
25+
rate(otelcol_receiver_refused_metric_points_total[5m]) > 0
26+
|||,
27+
'for': '2m',
28+
labels: {
29+
severity: 'critical',
30+
},
31+
annotations: {
32+
summary: 'Receiver is dropping metrics.',
33+
description: 'The {{ $labels.receiver }} receiver is dropping metrics at a rate of {{ humanize $value }} per second.',
34+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures',
35+
},
36+
},
37+
{
38+
alert: 'ReceiverDroppedLogs',
39+
expr: |||
40+
rate(otelcol_receiver_refused_log_records_total[5m]) > 0
41+
|||,
42+
'for': '5m',
43+
labels: {
44+
severity: 'critical',
45+
},
46+
annotations: {
47+
summary: 'Receiver is dropping logs.',
48+
description: 'The {{ $labels.receiver }} is dropping logs at a rate of {{ humanize $value }} per second.',
49+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures',
50+
},
51+
},
52+
{
53+
alert: 'ExporterDroppedSpans',
54+
expr: |||
55+
rate(otelcol_exporter_send_failed_spans_total[5m]) > 0
56+
|||,
57+
'for': '2m',
58+
labels: {
59+
severity: 'critical',
60+
},
61+
annotations: {
62+
summary: 'Exporter is dropping spans.',
63+
description: 'The {{ $labels.exporter }} exporter is dropping spans at a rate of {{ humanize $value }} per second.',
64+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures',
65+
},
66+
},
67+
{
68+
alert: 'ExporterDroppedMetrics',
69+
expr: |||
70+
rate(otelcol_exporter_send_failed_metric_points_total[5m]) > 0
71+
|||,
72+
'for': '2m',
73+
labels: {
74+
severity: 'critical',
75+
},
76+
annotations: {
77+
summary: 'Exporter is dropping metrics.',
78+
description: 'The {{ $labels.exporter }} exporter is dropping metrics at a rate of {{ humanize $value }} per second.',
79+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures',
80+
},
81+
},
82+
{
83+
alert: 'ExporterDroppedLogs',
84+
expr: |||
85+
rate(otelcol_exporter_send_failed_log_records_total[5m]) > 0
86+
|||,
87+
'for': '5m',
88+
labels: {
89+
severity: 'critical',
90+
},
91+
annotations: {
92+
summary: 'Exporter is dropping logs.',
93+
description: 'The {{ $labels.exporter }} is dropping logs at a rate of {{ humanize $value }} per second.',
94+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures',
95+
},
96+
},
97+
{
98+
alert: 'ExporterQueueSize',
99+
expr: |||
100+
otelcol_exporter_queue_size > otelcol_exporter_queue_capacity * 0.8
101+
|||,
102+
'for': '1m',
103+
labels: {
104+
severity: 'warning',
105+
},
106+
annotations: {
107+
summary: 'Exporter queue is filling up.',
108+
description: 'The {{ $labels.exporter }} queue has reached a size of {{ $value }}.',
109+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
110+
},
111+
},
112+
{
113+
alert: 'SendQueueFailedSpans',
114+
expr: |||
115+
rate(otelcol_exporter_enqueue_failed_spans_total[5m]) > 0
116+
|||,
117+
'for': '1m',
118+
labels: {
119+
severity: 'warning',
120+
},
121+
annotations: {
122+
summary: 'Exporter send queue failed to accept spans.',
123+
description: 'The {{ $labels.exporter }} sending queue failed to accept {{ $value }} spans.',
124+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
125+
},
126+
},
127+
{
128+
alert: 'SendQueueFailedMetricPoints',
129+
expr: |||
130+
rate(otelcol_exporter_enqueue_failed_metric_points_total[5m]) > 0
131+
|||,
132+
'for': '1m',
133+
labels: {
134+
severity: 'warning',
135+
},
136+
annotations: {
137+
summary: 'Exporter send queue failed to accept metric points.',
138+
description: 'The {{ $labels.exporter }} sending queue failed to accept {{ $value }} metric points.',
139+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
140+
},
141+
},
142+
{
143+
alert: 'SendQueueFailedLogRecords',
144+
expr: |||
145+
rate(otelcol_exporter_enqueue_failed_log_records_total[5m]) > 0
146+
|||,
147+
'for': '1m',
13148
labels: {
14149
severity: 'warning',
15150
},
16151
annotations: {
17-
summary: 'The sending queue has filled up.',
18-
description: 'The sending queue is full for {{ $labels.instance }}. The collector might start dropping data',
152+
summary: 'Exporter send queue failed to accept log records.',
153+
description: 'The {{ $labels.exporter }} sending queue failed to accept {{ $value }} log records.',
154+
runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length',
19155
},
20156
},
21157
],

opentelemetry-collector-mixin/config.libsonnet

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,30 @@
11
{
22
_config+:: {
3+
34
// Selector to apply to all dashboard variables, panel queries, alerts and recording rules.
45
// Can be used to filter metrics to specific OpenTelemetry Collector instances.
56
// Example: 'job="integrations/otel-collector"'
67
filteringSelector: '',
78

9+
// The label used to differentiate between different Kubernetes clusters.
10+
clusterLabel: 'cluster',
11+
namespaceLabel: 'namespace',
12+
jobLabel: 'job',
13+
14+
// Configuration for which group labels are enabled.
15+
labels: {
16+
cluster: false,
17+
namespace: false,
18+
job: true,
19+
},
20+
821
// Labels that represent a group of instances.
922
// Used in dashboard variables and alert aggregations.
10-
// Examples: ['job'] or ['environment', 'job', 'cluster']
11-
groupLabels: ['job'],
23+
groupLabels: [
24+
label
25+
for label in std.objectFields($._config.labels)
26+
if $._config.labels[label]
27+
],
1228

1329
// Labels that represent a single instance.
1430
// Used in dashboard variables and legend formats.
@@ -26,13 +42,15 @@
2642
refresh: '60s',
2743

2844
// Timezone for Grafana dashboards: UTC, browser, ...
29-
grafanaTimezone: 'UTC',
45+
grafanaTimezone: 'browser',
3046

3147
// Tags for Grafana dashboards
3248
dashboardTags: ['otelcol'],
49+
50+
dashboardNamePrefix: 'OpenTelemetry Collector / ',
3351
},
3452

3553
// Default datasource name
36-
datasourceName: 'default',
54+
datasourceName: 'datasource',
3755
},
3856
}

opentelemetry-collector-mixin/dashboards.jsonnet

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@ local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
22
local cfg = import 'config.libsonnet';
33

44
{
5-
[name]: dashboards[name] {
6-
timezone: cfg._config.grafana.grafanaTimezone,
7-
refresh: cfg._config.grafana.refresh,
8-
tags: cfg._config.grafana.dashboardTags,
9-
}
5+
[name]: dashboards[name]
106
for name in std.objectFields(dashboards)
117
}

opentelemetry-collector-mixin/dashboards/collector.libsonnet

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,15 @@ local cfg = import '../config.libsonnet';
99
grafanaDashboards+:: {
1010
'collector.json':
1111
g.dashboard.new(
12-
'OpenTelemetry collector health',
12+
cfg._config.grafana.dashboardNamePrefix + 'Operational',
1313
)
1414
+ g.dashboard.withDescription('A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.')
1515
+ g.dashboard.graphTooltip.withSharedCrosshair()
16-
+ g.dashboard.withVariables(variables.multiInstance)
16+
+ g.dashboard.withEditable(false)
1717
+ g.dashboard.withUid(cfg._config.grafanaDashboardIDs['collector.json'])
18+
+ g.dashboard.withTimezone(cfg._config.grafana.grafanaTimezone)
19+
+ g.dashboard.withTags(cfg._config.grafana.dashboardTags)
20+
+ g.dashboard.withVariables(variables.multiInstance)
1821
+ g.dashboard.withPanels(
1922
g.util.grid.wrapPanels([
2023
// Overview row

opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ commonlib.variables.new(
88
varMetric='otelcol_process_uptime',
99
enableLokiLogs=false,
1010
customAllValue='.*',
11-
prometheusDatasourceName='datasource',
11+
prometheusDatasourceName=cfg._config.datasourceName,
1212
prometheusDatasourceLabel='Data source',
1313
adHocEnabled=false,
1414
)

0 commit comments

Comments
 (0)