Skip to content
This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Commit de6f82f

Browse files
committed
Extend Alertmanager dashboard with currently unused metrics.
Metrics for general operation:
- Added "Tenants" stat panel using:
  `cortex_alertmanager_tenants_discovered`
- Added "Tenant Configuration Sync" row using:
  `cortex_alertmanager_sync_configs_failed_total`
  `cortex_alertmanager_sync_configs_total`
  `cortex_alertmanager_ring_check_errors_total`

Metrics specific to sharding operation:
- Added "Sharding Initial State Sync" row using:
  `cortex_alertmanager_state_initial_sync_total`
  `cortex_alertmanager_state_initial_sync_completed_total`
  `cortex_alertmanager_state_initial_sync_duration_seconds`
- Added "Sharding State Operations" row using:
  `cortex_alertmanager_state_fetch_replica_state_total`
  `cortex_alertmanager_state_fetch_replica_state_failed_total`
  `cortex_alertmanager_state_replication_total`
  `cortex_alertmanager_state_replication_failed_total`
  `cortex_alertmanager_partial_state_merges_total`
  `cortex_alertmanager_partial_state_merges_failed_total`
  `cortex_alertmanager_state_persist_total`
  `cortex_alertmanager_state_persist_failed_total`
1 parent a337270 commit de6f82f

File tree

1 file changed

+149
-0
lines changed

1 file changed

+149
-0
lines changed

cortex-mixin/dashboards/alertmanager.libsonnet

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
1717
$.panel('Total Silences') +
1818
$.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short')
1919
)
20+
.addPanel(
21+
$.panel('Tenants') +
22+
$.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager'), format='short')
23+
)
2024
)
2125
.addRow(
2226
$.row('Alerts Received')
@@ -86,5 +90,150 @@ local utils = import 'mixin-utils/utils.libsonnet';
8690
)
8791
.addRows(
8892
$.getObjectStoreRows('Alertmanager Configuration Object Store (Alertmanager accesses)', 'alertmanager-storage')
93+
)
94+
.addRow(
95+
$.row('Replication')
96+
.addPanel(
97+
$.panel('Tenants (By Instance)') +
98+
$.queryPanel(
99+
'sum by(pod) (cortex_alertmanager_tenants_owned{%s})' % $.jobMatcher('alertmanager'),
100+
'{{pod}}'
101+
) +
102+
$.stack
103+
)
104+
.addPanel(
105+
$.panel('Alerts (By Instance)') +
106+
$.queryPanel(
107+
'sum by(pod) (cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'),
108+
'{{pod}}'
109+
) +
110+
$.stack
111+
)
112+
.addPanel(
113+
$.panel('Silences (By Instance)') +
114+
$.queryPanel(
115+
'sum by(pod) (cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'),
116+
'{{pod}}'
117+
) +
118+
$.stack
119+
)
120+
)
121+
.addRow(
122+
$.row('Tenant Configuration Sync')
123+
.addPanel(
124+
$.panel('Syncs/sec') +
125+
$.queryPanel(
126+
[
127+
|||
128+
sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))
129+
-
130+
sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))
131+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
132+
'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
133+
],
134+
['success', 'failed']
135+
)
136+
)
137+
.addPanel(
138+
$.panel('Syncs/sec (By Reason)') +
139+
$.queryPanel(
140+
'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
141+
'{{reason}}'
142+
)
143+
)
144+
.addPanel(
145+
$.panel('Ring Check Errors/sec') +
146+
$.queryPanel(
147+
'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
148+
'errors'
149+
)
150+
)
151+
)
152+
.addRow(
153+
$.row('Sharding Initial State Sync')
154+
.addPanel(
155+
$.panel('Syncs/sec') +
156+
$.queryPanel(
157+
[
158+
|||
159+
sum(rate(cortex_alertmanager_state_initial_sync_total{%s}[$__rate_interval]))
160+
-
161+
sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))
162+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
163+
'sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
164+
],
165+
['success', 'failed']
166+
)
167+
)
168+
.addPanel(
169+
$.panel('Syncs/sec (By Outcome)') +
170+
$.queryPanel(
171+
'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
172+
'{{outcome}}'
173+
)
174+
)
175+
.addPanel(
176+
$.panel('Duration') +
177+
utils.latencyRecordingRulePanel('cortex_alertmanager_state_initial_sync_duration_seconds', $.jobSelector('alertmanager'))
178+
)
179+
)
180+
.addRow(
181+
$.row('Sharding State Operations')
182+
.addPanel(
183+
$.panel('Replica Fetches/sec') +
184+
$.queryPanel(
185+
[
186+
|||
187+
sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval]))
188+
-
189+
sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))
190+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
191+
'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
192+
],
193+
['success', 'failed']
194+
)
195+
)
196+
.addPanel(
197+
$.panel('Replica Updates/sec') +
198+
$.queryPanel(
199+
[
200+
|||
201+
sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval]))
202+
-
203+
sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))
204+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
205+
'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
206+
],
207+
['success', 'failed']
208+
)
209+
)
210+
.addPanel(
211+
$.panel('Partial Merges/sec') +
212+
$.queryPanel(
213+
[
214+
|||
215+
sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval]))
216+
-
217+
sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))
218+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
219+
'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
220+
],
221+
['success', 'failed']
222+
)
223+
)
224+
.addPanel(
225+
$.panel('Remote Storage Persists/sec') +
226+
$.queryPanel(
227+
[
228+
|||
229+
sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval]))
230+
-
231+
sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))
232+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
233+
'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
234+
],
235+
['success', 'failed']
236+
)
237+
)
89238
),
90239
}

0 commit comments

Comments
 (0)