@@ -17,6 +17,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
1717 $.panel('Total Silences' ) +
1818 $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager' ), format='short' )
1919 )
.addPanel(
  // Stat panel showing the maximum value of cortex_alertmanager_tenants_discovered
  // across the series selected by the alertmanager job matcher.
  local tenantsQuery = 'max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager');
  $.panel('Tenants') +
  $.statPanel(tenantsQuery, format='short')
)
2024 )
2125 .addRow(
2226 $.row('Alerts Received' )
@@ -86,5 +90,136 @@ local utils = import 'mixin-utils/utils.libsonnet';
8690 )
.addRows(
  // Rows are generated by the shared getObjectStoreRows helper for the
  // 'alertmanager-storage' component.
  // NOTE(review): the helper is defined elsewhere; the exact panels it emits
  // should be confirmed there.
  local rowTitle = 'Alertmanager Configuration Object Store (Alertmanager accesses)';
  $.getObjectStoreRows(rowTitle, 'alertmanager-storage')
)
.addRow(
  // Replication row: tenants, alerts and silences broken down by the
  // configured per-instance label. Each panel applies the $.stack mixin.
  local instanceLabel = $._config.per_instance_label;
  local matcher = $.jobMatcher('alertmanager');
  // Every panel in this row uses the instance label value as its legend.
  local instanceLegend = '{{%s}}' % instanceLabel;

  $.row('Replication')
  .addPanel(
    // Tenants owned per instance (max over series sharing the label).
    $.panel('Per %s Tenants' % instanceLabel) +
    $.queryPanel(
      'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [instanceLabel, matcher],
      instanceLegend
    ) +
    $.stack
  )
  .addPanel(
    // Alerts summed per instance.
    $.panel('Per %s Alerts' % instanceLabel) +
    $.queryPanel(
      'sum by(%s) (cortex_alertmanager_alerts{%s})' % [instanceLabel, matcher],
      instanceLegend
    ) +
    $.stack
  )
  .addPanel(
    // Silences summed per instance.
    $.panel('Per %s Silences' % instanceLabel) +
    $.queryPanel(
      'sum by(%s) (cortex_alertmanager_silences{%s})' % [instanceLabel, matcher],
      instanceLegend
    ) +
    $.stack
  )
)
.addRow(
  // Tenant configuration sync row: sync rate split by outcome, sync rate
  // split by reason, and the rate of ring check errors.
  local matcher = $.jobMatcher('alertmanager');

  $.row('Tenant Configuration Sync')
  .addPanel(
    // The "success" series is derived as total syncs minus failed syncs;
    // the "failed" series is the failed counter rate on its own.
    $.panel('Syncs/sec') +
    $.queryPanel(
      [
        |||
          sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))
          -
          sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))
        ||| % [matcher, matcher],
        'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % matcher,
      ],
      ['success', 'failed']
    )
  )
  .addPanel(
    // Same total counter, grouped by its `reason` label.
    $.panel('Syncs/sec (By Reason)') +
    $.queryPanel(
      'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % matcher,
      '{{reason}}'
    )
  )
  .addPanel(
    // Single aggregated series with the fixed legend 'errors'.
    $.panel('Ring Check Errors/sec') +
    $.queryPanel(
      'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % matcher,
      'errors'
    )
  )
)
.addRow(
  // Sharding initial state sync row: completed initial syncs by outcome,
  // initial sync duration, and replica state fetches split by success/failure.
  local matcher = $.jobMatcher('alertmanager');

  $.row('Sharding Initial State Sync')
  .addPanel(
    // Completed initial syncs, grouped by the `outcome` label.
    $.panel('Initial syncs/sec') +
    $.queryPanel(
      'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % matcher,
      '{{outcome}}'
    )
  )
  .addPanel(
    // Built with the shared latencyPanel helper from the duration metric
    // and a brace-wrapped selector.
    $.panel('Initial sync duration') +
    $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % matcher)
  )
  .addPanel(
    // "success" is total fetches minus failed fetches.
    $.panel('Fetch state from other alertmanagers /sec') +
    $.queryPanel(
      [
        |||
          sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval]))
          -
          sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))
        ||| % [matcher, matcher],
        'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % matcher,
      ],
      ['success', 'failed']
    )
  )
)
.addRow(
  // Sharding runtime state sync row: replication, partial state merges and
  // state persistence. Each panel plots a "success" series (total minus
  // failed) and a "failed" series.
  local matcher = $.jobMatcher('alertmanager');
  // Legends shared by all three panels, in the same order as their queries.
  local outcomeLegends = ['success', 'failed'];

  $.row('Sharding Runtime State Sync')
  .addPanel(
    $.panel('Replicate state to other alertmanagers /sec') +
    $.queryPanel(
      [
        |||
          sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval]))
          -
          sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))
        ||| % [matcher, matcher],
        'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % matcher,
      ],
      outcomeLegends
    )
  )
  .addPanel(
    $.panel('Merge state from other alertmanagers /sec') +
    $.queryPanel(
      [
        |||
          sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval]))
          -
          sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))
        ||| % [matcher, matcher],
        'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % matcher,
      ],
      outcomeLegends
    )
  )
  .addPanel(
    $.panel('Persist state to remote storage /sec') +
    $.queryPanel(
      [
        |||
          sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval]))
          -
          sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))
        ||| % [matcher, matcher],
        'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % matcher,
      ],
      outcomeLegends
    )
  )
)
89224 ),
90225}
0 commit comments