@@ -17,6 +17,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
1717 $.panel('Total Silences' ) +
1818 $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager' ), format='short' )
1919 )
20+ .addPanel(  // Adds a stat panel titled 'Tenants' to the row that is being built above.
21+ $.panel('Tenants' ) +
22+ $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager' ), format='short' )  // max() across all matching series; %s is filled with the alertmanager job matcher
23+ )
2024 )
2125 .addRow(
2226 $.row('Alerts Received' )
@@ -86,5 +90,150 @@ local utils = import 'mixin-utils/utils.libsonnet';
8690 )
8791 .addRows(
8892 $.getObjectStoreRows('Alertmanager Configuration Object Store (Alertmanager accesses)' , 'alertmanager-storage' )
93+ )
94+ .addRow(  // Row 'Replication': three panels, each summing one gauge per pod.
95+ $.row('Replication' )
96+ .addPanel(  // Panel 1: cortex_alertmanager_tenants_owned, summed by pod.
97+ $.panel('Tenants (By Instance)' ) +
98+ $.queryPanel(
99+ 'sum by(pod) (cortex_alertmanager_tenants_owned{%s})' % $.jobMatcher('alertmanager' ),  // %s is replaced by the alertmanager job matcher
100+ '{{pod}}'  // presumably the legend template (one entry per pod) -- confirm against $.queryPanel
101+ ) +
102+ $.stack  // mixes $.stack into the panel; presumably enables stacked rendering -- confirm in the dashboard utils
103+ )
104+ .addPanel(  // Panel 2: cortex_alertmanager_alerts, summed by pod.
105+ $.panel('Alerts (By Instance)' ) +
106+ $.queryPanel(
107+ 'sum by(pod) (cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager' ),
108+ '{{pod}}'
109+ ) +
110+ $.stack
111+ )
112+ .addPanel(  // Panel 3: cortex_alertmanager_silences, summed by pod.
113+ $.panel('Silences (By Instance)' ) +
114+ $.queryPanel(
115+ 'sum by(pod) (cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager' ),
116+ '{{pod}}'
117+ ) +
118+ $.stack
119+ )
120+ )
121+ .addRow(  // Row 'Tenant Configuration Sync': sync rate split by result, sync rate by reason, and ring check error rate.
122+ $.row('Tenant Configuration Sync' )
123+ .addPanel(  // Panel 1: two queries -- (total rate - failed rate) and failed rate.
124+ $.panel('Syncs/sec' ) +
125+ $.queryPanel(
126+ [  // first query: sync_configs_total rate minus sync_configs_failed_total rate
127+ |||
128+ sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))
129+ -
130+ sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))
131+ ||| % [$.jobMatcher('alertmanager' ), $.jobMatcher('alertmanager' )],
132+ 'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager' ),  // second query: failed rate on its own
133+ ],
134+ ['success' , 'failed' ]  // presumably legends paired positionally with the two queries above -- confirm against $.queryPanel
135+ )
136+ )
137+ .addPanel(  // Panel 2: sync_configs_total rate, one series per 'reason' label value.
138+ $.panel('Syncs/sec (By Reason)' ) +
139+ $.queryPanel(
140+ 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager' ),
141+ '{{reason}}'
142+ )
143+ )
144+ .addPanel(  // Panel 3: single aggregated rate of ring_check_errors_total.
145+ $.panel('Ring Check Errors/sec' ) +
146+ $.queryPanel(
147+ 'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager' ),
148+ 'errors'  // fixed string rather than a label template, since the query aggregates to one series
149+ )
150+ )
151+ )
152+ .addRow(  // Row 'Sharding Initial State Sync': sync rate split by result, sync rate by outcome, and a duration panel.
153+ $.row('Sharding Initial State Sync' )
154+ .addPanel(  // Panel 1: (initial_sync_total rate - completed{outcome="failed"} rate) and the failed rate.
155+ $.panel('Syncs/sec' ) +
156+ $.queryPanel(
157+ [  // NOTE(review): 'success' is total minus outcome="failed", so any other outcome value is counted as success -- confirm that is intended
158+ |||
159+ sum(rate(cortex_alertmanager_state_initial_sync_total{%s}[$__rate_interval]))
160+ -
161+ sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))
162+ ||| % [$.jobMatcher('alertmanager' ), $.jobMatcher('alertmanager' )],
163+ 'sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))' % $.jobMatcher('alertmanager' ),  // failed rate on its own
164+ ],
165+ ['success' , 'failed' ]  // presumably legends paired positionally with the two queries above -- confirm against $.queryPanel
166+ )
167+ )
168+ .addPanel(  // Panel 2: initial_sync_completed_total rate, one series per 'outcome' label value.
169+ $.panel('Syncs/sec (By Outcome)' ) +
170+ $.queryPanel(
171+ 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager' ),
172+ '{{outcome}}'
173+ )
174+ )
175+ .addPanel(  // Panel 3: built by utils.latencyRecordingRulePanel from the initial sync duration metric.
176+ $.panel('Duration' ) +
177+ utils.latencyRecordingRulePanel('cortex_alertmanager_state_initial_sync_duration_seconds' , $.jobSelector('alertmanager' ))  // passes $.jobSelector, not the $.jobMatcher string used by the other panels in this row
178+ )
179+ )
180+ .addRow(  // Row 'Sharding State Operations': four panels, each plotting (total rate - failed rate) and failed rate for one counter pair.
181+ $.row('Sharding State Operations' )
182+ .addPanel(  // Panel 1: state_fetch_replica_state_total / state_fetch_replica_state_failed_total.
183+ $.panel('Replica Fetches/sec' ) +
184+ $.queryPanel(
185+ [  // first query: total rate minus failed rate; both %s take the same job matcher
186+ |||
187+ sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval]))
188+ -
189+ sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))
190+ ||| % [$.jobMatcher('alertmanager' ), $.jobMatcher('alertmanager' )],
191+ 'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager' ),  // second query: failed rate on its own
192+ ],
193+ ['success' , 'failed' ]  // presumably legends paired positionally with the two queries above -- confirm against $.queryPanel
194+ )
195+ )
196+ .addPanel(  // Panel 2: state_replication_total / state_replication_failed_total.
197+ $.panel('Replica Updates/sec' ) +
198+ $.queryPanel(
199+ [
200+ |||
201+ sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval]))
202+ -
203+ sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))
204+ ||| % [$.jobMatcher('alertmanager' ), $.jobMatcher('alertmanager' )],
205+ 'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager' ),
206+ ],
207+ ['success' , 'failed' ]
208+ )
209+ )
210+ .addPanel(  // Panel 3: partial_state_merges_total / partial_state_merges_failed_total.
211+ $.panel('Partial Merges/sec' ) +
212+ $.queryPanel(
213+ [
214+ |||
215+ sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval]))
216+ -
217+ sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))
218+ ||| % [$.jobMatcher('alertmanager' ), $.jobMatcher('alertmanager' )],
219+ 'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager' ),
220+ ],
221+ ['success' , 'failed' ]
222+ )
223+ )
224+ .addPanel(  // Panel 4: state_persist_total / state_persist_failed_total.
225+ $.panel('Remote Storage Persists/sec' ) +
226+ $.queryPanel(
227+ [
228+ |||
229+ sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval]))
230+ -
231+ sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))
232+ ||| % [$.jobMatcher('alertmanager' ), $.jobMatcher('alertmanager' )],
233+ 'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager' ),
234+ ],
235+ ['success' , 'failed' ]
236+ )
237+ )
89238 ),
90239}
0 commit comments