|
9 | 9 | alert: 'CortexIngesterHasNotShippedBlocks', |
10 | 10 | 'for': '15m', |
11 | 11 | expr: ||| |
12 | | - (min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4) |
| 12 | + (min by(%(alert_aggregation_labels)s, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4) |
13 | 13 | and |
14 | | - (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0) |
| 14 | + (max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0) |
15 | 15 | and |
16 | 16 | # Only if the ingester has ingested samples over the last 4h. |
17 | | - (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) |
| 17 | + (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) |
18 | 18 | and |
19 | 19 | # Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance |
20 | 20 | # had ingested samples in the past, then no traffic was received for a long period and then it starts |
21 | 21 | # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving |
22 | 22 | # samples, while a block shipping is expected within the next 4h. |
23 | | - (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0) |
24 | | - |||, |
| 23 | + (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0) |
| 24 | + ||| % $._config, |
25 | 25 | labels: { |
26 | 26 | severity: 'critical', |
27 | 27 | }, |
28 | 28 | annotations: { |
29 | | - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', |
| 29 | + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config, |
30 | 30 | }, |
31 | 31 | }, |
32 | 32 | { |
|
35 | 35 | alert: 'CortexIngesterHasNotShippedBlocksSinceStart', |
36 | 36 | 'for': '4h', |
37 | 37 | expr: ||| |
38 | | - (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0) |
| 38 | + (max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0) |
39 | 39 | and |
40 | | - (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) |
41 | | - |||, |
| 40 | + (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) |
| 41 | + ||| % $._config, |
42 | 42 | labels: { |
43 | 43 | severity: 'critical', |
44 | 44 | }, |
45 | 45 | annotations: { |
46 | | - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', |
| 46 | + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config, |
47 | 47 | }, |
48 | 48 | }, |
49 | 49 | { |
|
61 | 61 | severity: 'critical', |
62 | 62 | }, |
63 | 63 | annotations: { |
64 | | - message: "Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet.", |
| 64 | + message: "Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet." % $._config, |
65 | 65 | }, |
66 | 66 | }, |
67 | 67 | { |
|
77 | 77 | severity: 'critical', |
78 | 78 | }, |
79 | 79 | annotations: { |
80 | | - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.', |
| 80 | + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to compact TSDB head.' % $._config, |
81 | 81 | }, |
82 | 82 | }, |
83 | 83 | { |
|
89 | 89 | severity: 'critical', |
90 | 90 | }, |
91 | 91 | annotations: { |
92 | | - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB head.', |
| 92 | + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB head.' % $._config, |
93 | 93 | }, |
94 | 94 | }, |
95 | 95 | { |
|
101 | 101 | severity: 'critical', |
102 | 102 | }, |
103 | 103 | annotations: { |
104 | | - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to create TSDB checkpoint.', |
| 104 | + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to create TSDB checkpoint.' % $._config, |
105 | 105 | }, |
106 | 106 | }, |
107 | 107 | { |
|
113 | 113 | severity: 'critical', |
114 | 114 | }, |
115 | 115 | annotations: { |
116 | | - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete TSDB checkpoint.', |
| 116 | + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to delete TSDB checkpoint.' % $._config, |
117 | 117 | }, |
118 | 118 | }, |
119 | 119 | { |
|
125 | 125 | severity: 'warning', |
126 | 126 | }, |
127 | 127 | annotations: { |
128 | | - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB WAL.', |
| 128 | + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB WAL.' % $._config, |
129 | 129 | }, |
130 | 130 | }, |
131 | 131 | { |
|
137 | 137 | severity: 'critical', |
138 | 138 | }, |
139 | 139 | annotations: { |
140 | | - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} got a corrupted TSDB WAL.', |
| 140 | + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config, |
141 | 141 | }, |
142 | 142 | }, |
143 | 143 | { |
|
150 | 150 | severity: 'critical', |
151 | 151 | }, |
152 | 152 | annotations: { |
153 | | - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to write to TSDB WAL.', |
| 153 | + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to write to TSDB WAL.' % $._config, |
154 | 154 | }, |
155 | 155 | }, |
156 | 156 | { |
|
166 | 166 | severity: 'critical', |
167 | 167 | }, |
168 | 168 | annotations: { |
169 | | - message: 'Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.', |
| 169 | + message: 'Cortex Querier {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully scanned the bucket since {{ $value | humanizeDuration }}.' % $._config, |
170 | 170 | }, |
171 | 171 | }, |
172 | 172 | { |
|
177 | 177 | expr: ||| |
178 | 178 | 100 * ( |
179 | 179 | ( |
180 | | - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) |
| 180 | + sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) |
181 | 181 | - |
182 | | - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) |
| 182 | + sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) |
183 | 183 | ) |
184 | 184 | / |
185 | | - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) |
| 185 | + sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) |
186 | 186 | ) |
187 | 187 | > 1 |
188 | | - |||, |
| 188 | + ||| % $._config, |
189 | 189 | labels: { |
190 | 190 | severity: 'warning', |
191 | 191 | }, |
192 | 192 | annotations: { |
193 | | - message: 'Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.', |
| 193 | + message: 'Cortex Queries in %(alert_aggregation_variables)s are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%%.0f" $value }}%% of queries.' % $._config, |
194 | 194 | }, |
195 | 195 | }, |
196 | 196 | { |
|
206 | 206 | severity: 'critical', |
207 | 207 | }, |
208 | 208 | annotations: { |
209 | | - message: 'Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.', |
| 209 | + message: 'Cortex Store Gateway {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully synched the bucket since {{ $value | humanizeDuration }}.' % $._config, |
210 | 210 | }, |
211 | 211 | }, |
212 | 212 | { |
213 | 213 | // Alert if the bucket index has not been updated for a given user. |
214 | 214 | alert: 'CortexBucketIndexNotUpdated', |
215 | 215 | expr: ||| |
216 | | - min by(namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 |
217 | | - |||, |
| 216 | + min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 |
| 217 | + ||| % $._config, |
218 | 218 | labels: { |
219 | 219 | severity: 'critical', |
220 | 220 | }, |
221 | 221 | annotations: { |
222 | | - message: 'Cortex bucket index for tenant {{ $labels.user }} in {{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}.', |
| 222 | + message: 'Cortex bucket index for tenant {{ $labels.user }} in %(alert_aggregation_variables)s has not been updated since {{ $value | humanizeDuration }}.' % $._config, |
223 | 223 | }, |
224 | 224 | }, |
225 | 225 | { |
226 | 226 | // Alert if we consistently find partial blocks for a given tenant over a relatively large time range. |
227 | 227 | alert: 'CortexTenantHasPartialBlocks', |
228 | 228 | 'for': '6h', |
229 | 229 | expr: ||| |
230 | | - max by(namespace, user) (cortex_bucket_blocks_partials_count) > 0 |
231 | | - |||, |
| 230 | + max by(%(alert_aggregation_labels)s, user) (cortex_bucket_blocks_partials_count) > 0 |
| 231 | + ||| % $._config, |
232 | 232 | labels: { |
233 | 233 | severity: 'warning', |
234 | 234 | }, |
235 | 235 | annotations: { |
236 | | - message: 'Cortex tenant {{ $labels.user }} in {{ $labels.namespace }} has {{ $value }} partial blocks.', |
| 236 | + message: 'Cortex tenant {{ $labels.user }} in %(alert_aggregation_variables)s has {{ $value }} partial blocks.' % $._config, |
237 | 237 | }, |
238 | 238 | }, |
239 | 239 | ], |
|
0 commit comments