Skip to content
This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Commit 386613d

Browse files
committed
Improved alert messages with Cortex cluster
Signed-off-by: Marco Pracucci <marco@pracucci.com>
1 parent 47c4c25 commit 386613d

File tree

5 files changed

+87
-69
lines changed

5 files changed

+87
-69
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
* [CHANGE] Replace `CortexRulerFailedEvaluations` with two new alerts: `CortexRulerTooManyFailedPushes` and `CortexRulerTooManyFailedQueries`. #347
2121
* [CHANGE] Removed `CortexCacheRequestErrors` alert. This alert was not working because the legacy Cortex cache client instrumentation doesn't track errors. #346
2222
* [CHANGE] Removed `CortexQuerierCapacityFull` alert. #342
23+
* [CHANGE] Changed blocks storage alerts to group metrics by the configured `cluster_labels` (supporting the deprecated `alert_aggregation_labels`). #351
2324
* [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317
2425
* [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. #324
2526
* [ENHANCEMENT] Dashboards: defined container functions for common resources panels: containerDiskWritesPanel, containerDiskReadsPanel, containerDiskSpaceUtilization. #331

cortex-mixin/alerts/alerts.libsonnet

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
severity: 'critical',
1414
},
1515
annotations: {
16-
message: 'There are {{ printf "%f" $value }} unhealthy ingester(s).',
16+
message: 'Cortex cluster %(alert_aggregation_variables)s has {{ printf "%%f" $value }} unhealthy ingester(s).' % $._config,
1717
},
1818
},
1919
{
@@ -35,8 +35,8 @@
3535
},
3636
annotations: {
3737
message: |||
38-
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
39-
|||,
38+
The route {{ $labels.route }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors.
39+
||| % $._config,
4040
},
4141
},
4242
{
@@ -98,8 +98,8 @@
9898
},
9999
annotations: {
100100
message: |||
101-
Incorrect results for {{ printf "%.2f" $value }}% of queries.
102-
|||,
101+
The Cortex cluster %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% incorrect query results.
102+
||| % $._config,
103103
},
104104
},
105105
{
@@ -113,8 +113,8 @@
113113
},
114114
annotations: {
115115
message: |||
116-
An inconsistent runtime config file is used across cluster {{ $labels.job }}.
117-
|||,
116+
An inconsistent runtime config file is used across cluster %(alert_aggregation_variables)s.
117+
||| % $._config,
118118
},
119119
},
120120
{
@@ -145,8 +145,8 @@
145145
},
146146
annotations: {
147147
message: |||
148-
There are {{ $value }} queued up queries in query-frontend.
149-
|||,
148+
There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-frontend.
149+
||| % $._config,
150150
},
151151
},
152152
{
@@ -160,8 +160,8 @@
160160
},
161161
annotations: {
162162
message: |||
163-
There are {{ $value }} queued up queries in query-scheduler.
164-
|||,
163+
There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-scheduler.
164+
||| % $._config,
165165
},
166166
},
167167
{
@@ -178,8 +178,8 @@
178178
},
179179
annotations: {
180180
message: |||
181-
Memcached {{ $labels.name }} used by Cortex in {{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation.
182-
|||,
181+
Memcached {{ $labels.name }} used by Cortex %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors for {{ $labels.operation }} operation.
182+
||| % $._config,
183183
},
184184
},
185185
{
@@ -430,8 +430,8 @@
430430
},
431431
annotations: {
432432
message: |||
433-
Chunk memcached cluster is too small, should be at least {{ printf "%.2f" $value }}GB.
434-
|||,
433+
Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB.
434+
||| % $._config,
435435
},
436436
},
437437
{
@@ -448,8 +448,8 @@
448448
},
449449
annotations: {
450450
message: |||
451-
The number of in-memory series per ingester in {{ $labels.namespace }} is too high.
452-
|||,
451+
The number of in-memory series per ingester in %(alert_aggregation_variables)s is too high.
452+
||| % $._config,
453453
},
454454
},
455455
{
@@ -464,8 +464,8 @@
464464
},
465465
annotations: {
466466
message: |||
467-
Ingesters in {{ $labels.namespace }} ingest too many samples per second.
468-
|||,
467+
Ingesters in %(alert_aggregation_variables)s ingest too many samples per second.
468+
||| % $._config,
469469
},
470470
},
471471
{
@@ -483,8 +483,8 @@
483483
},
484484
annotations: {
485485
message: |||
486-
Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory.
487-
|||,
486+
Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory.
487+
||| % $._config,
488488
},
489489
},
490490
{
@@ -502,8 +502,8 @@
502502
},
503503
annotations: {
504504
message: |||
505-
Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory.
506-
|||,
505+
Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory.
506+
||| % $._config,
507507
},
508508
},
509509
],
@@ -526,8 +526,8 @@
526526
},
527527
annotations: {
528528
message: |||
529-
Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write (push) errors.
530-
|||,
529+
Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% write (push) errors.
530+
||| % $._config,
531531
},
532532
},
533533
{
@@ -545,8 +545,8 @@
545545
},
546546
annotations: {
547547
message: |||
548-
Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules.
549-
|||,
548+
Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors while evaluating rules.
549+
||| % $._config,
550550
},
551551
},
552552
{
@@ -563,8 +563,8 @@
563563
},
564564
annotations: {
565565
message: |||
566-
Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}.
567-
|||,
566+
Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% missed iterations for the rule group {{ $labels.rule_group }}.
567+
||| % $._config,
568568
},
569569
},
570570
{
@@ -579,8 +579,8 @@
579579
},
580580
annotations: {
581581
message: |||
582-
Cortex Rulers {{ $labels.job }} are experiencing errors when checking the ring for rule group ownership.
583-
|||,
582+
Cortex Rulers in %(alert_aggregation_variables)s are experiencing errors when checking the ring for rule group ownership.
583+
||| % $._config,
584584
},
585585
},
586586
],
@@ -600,7 +600,7 @@
600600
severity: 'warning',
601601
},
602602
annotations: {
603-
message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.',
603+
message: 'Cortex instance {{ $labels.instance }} in %(alert_aggregation_variables)s sees incorrect number of gossip members.' % $._config,
604604
},
605605
},
606606
],

cortex-mixin/alerts/blocks.libsonnet

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,24 @@
99
alert: 'CortexIngesterHasNotShippedBlocks',
1010
'for': '15m',
1111
expr: |||
12-
(min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
12+
(min by(%(alert_aggregation_labels)s, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
1313
and
14-
(max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
14+
(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
1515
and
1616
# Only if the ingester has ingested samples over the last 4h.
17-
(max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
17+
(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
1818
and
1919
# Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance
2020
# had ingested samples in the past, then no traffic was received for a long period and then it starts
2121
# receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
2222
# samples, while the a block shipping is expected within the next 4h.
23-
(max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
24-
|||,
23+
(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
24+
||| % $._config,
2525
labels: {
2626
severity: 'critical',
2727
},
2828
annotations: {
29-
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.',
29+
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
3030
},
3131
},
3232
{
@@ -35,15 +35,15 @@
3535
alert: 'CortexIngesterHasNotShippedBlocksSinceStart',
3636
'for': '4h',
3737
expr: |||
38-
(max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
38+
(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
3939
and
40-
(max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
41-
|||,
40+
(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
41+
||| % $._config,
4242
labels: {
4343
severity: 'critical',
4444
},
4545
annotations: {
46-
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.',
46+
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
4747
},
4848
},
4949
{
@@ -61,7 +61,7 @@
6161
severity: 'critical',
6262
},
6363
annotations: {
64-
message: "Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet.",
64+
message: "Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet." % $._config,
6565
},
6666
},
6767
{
@@ -77,7 +77,7 @@
7777
severity: 'critical',
7878
},
7979
annotations: {
80-
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.',
80+
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to compact TSDB head.' % $._config,
8181
},
8282
},
8383
{
@@ -89,7 +89,7 @@
8989
severity: 'critical',
9090
},
9191
annotations: {
92-
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB head.',
92+
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB head.' % $._config,
9393
},
9494
},
9595
{
@@ -101,7 +101,7 @@
101101
severity: 'critical',
102102
},
103103
annotations: {
104-
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to create TSDB checkpoint.',
104+
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to create TSDB checkpoint.' % $._config,
105105
},
106106
},
107107
{
@@ -113,7 +113,7 @@
113113
severity: 'critical',
114114
},
115115
annotations: {
116-
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete TSDB checkpoint.',
116+
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to delete TSDB checkpoint.' % $._config,
117117
},
118118
},
119119
{
@@ -125,7 +125,7 @@
125125
severity: 'warning',
126126
},
127127
annotations: {
128-
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB WAL.',
128+
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB WAL.' % $._config,
129129
},
130130
},
131131
{
@@ -137,7 +137,7 @@
137137
severity: 'critical',
138138
},
139139
annotations: {
140-
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} got a corrupted TSDB WAL.',
140+
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config,
141141
},
142142
},
143143
{
@@ -150,7 +150,7 @@
150150
severity: 'critical',
151151
},
152152
annotations: {
153-
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to write to TSDB WAL.',
153+
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to write to TSDB WAL.' % $._config,
154154
},
155155
},
156156
{
@@ -166,7 +166,7 @@
166166
severity: 'critical',
167167
},
168168
annotations: {
169-
message: 'Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.',
169+
message: 'Cortex Querier {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully scanned the bucket since {{ $value | humanizeDuration }}.' % $._config,
170170
},
171171
},
172172
{
@@ -177,20 +177,20 @@
177177
expr: |||
178178
100 * (
179179
(
180-
sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
180+
sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
181181
-
182-
sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m]))
182+
sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m]))
183183
)
184184
/
185-
sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
185+
sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
186186
)
187187
> 1
188-
|||,
188+
||| % $._config,
189189
labels: {
190190
severity: 'warning',
191191
},
192192
annotations: {
193-
message: 'Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.',
193+
message: 'Cortex Queries in %(alert_aggregation_variables)s are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%%.0f" $value }}%% of queries.' % $._config,
194194
},
195195
},
196196
{
@@ -206,34 +206,34 @@
206206
severity: 'critical',
207207
},
208208
annotations: {
209-
message: 'Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.',
209+
message: 'Cortex Store Gateway {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully synched the bucket since {{ $value | humanizeDuration }}.' % $._config,
210210
},
211211
},
212212
{
213213
// Alert if the bucket index has not been updated for a given user.
214214
alert: 'CortexBucketIndexNotUpdated',
215215
expr: |||
216-
min by(namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200
217-
|||,
216+
min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200
217+
||| % $._config,
218218
labels: {
219219
severity: 'critical',
220220
},
221221
annotations: {
222-
message: 'Cortex bucket index for tenant {{ $labels.user }} in {{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}.',
222+
message: 'Cortex bucket index for tenant {{ $labels.user }} in %(alert_aggregation_variables)s has not been updated since {{ $value | humanizeDuration }}.' % $._config,
223223
},
224224
},
225225
{
226226
// Alert if we consistently find partial blocks for a given tenant over a relatively large time range.
227227
alert: 'CortexTenantHasPartialBlocks',
228228
'for': '6h',
229229
expr: |||
230-
max by(namespace, user) (cortex_bucket_blocks_partials_count) > 0
231-
|||,
230+
max by(%(alert_aggregation_labels)s, user) (cortex_bucket_blocks_partials_count) > 0
231+
||| % $._config,
232232
labels: {
233233
severity: 'warning',
234234
},
235235
annotations: {
236-
message: 'Cortex tenant {{ $labels.user }} in {{ $labels.namespace }} has {{ $value }} partial blocks.',
236+
message: 'Cortex tenant {{ $labels.user }} in %(alert_aggregation_variables)s has {{ $value }} partial blocks.' % $._config,
237237
},
238238
},
239239
],

0 commit comments

Comments
 (0)