@@ -34,6 +34,53 @@ cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 1
3434cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 1000
3535cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000
3636cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000
37+ # HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active.
38+ # TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge
39+ cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1
40+ cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10
41+ cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100
42+ # HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager.
43+ # TYPE cortex_prometheus_notifications_dropped_total counter
44+ cortex_prometheus_notifications_dropped_total{user="user1"} 1
45+ cortex_prometheus_notifications_dropped_total{user="user2"} 10
46+ cortex_prometheus_notifications_dropped_total{user="user3"} 100
47+ # HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications.
48+ # TYPE cortex_prometheus_notifications_errors_total counter
49+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1
50+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10
51+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100
52+ # HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications.
53+ # TYPE cortex_prometheus_notifications_latency_seconds summary
54+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1
55+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1
56+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1
57+ cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1
58+ cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1
59+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10
60+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10
61+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10
62+ cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10
63+ cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1
64+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100
65+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100
66+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100
67+ cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100
68+ cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1
69+ # HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue.
70+ # TYPE cortex_prometheus_notifications_queue_capacity gauge
71+ cortex_prometheus_notifications_queue_capacity{user="user1"} 1
72+ cortex_prometheus_notifications_queue_capacity{user="user2"} 10
73+ cortex_prometheus_notifications_queue_capacity{user="user3"} 100
74+ # HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue.
75+ # TYPE cortex_prometheus_notifications_queue_length gauge
76+ cortex_prometheus_notifications_queue_length{user="user1"} 1
77+ cortex_prometheus_notifications_queue_length{user="user2"} 10
78+ cortex_prometheus_notifications_queue_length{user="user3"} 100
79+ # HELP cortex_prometheus_notifications_sent_total Total number of alerts sent.
80+ # TYPE cortex_prometheus_notifications_sent_total counter
81+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1
82+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10
83+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100
3784# HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
3885# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
3986cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
@@ -153,6 +200,53 @@ func TestManagerMetricsWithoutRuleGroupLabel(t *testing.T) {
153200cortex_prometheus_last_evaluation_samples{user="user1"} 2000
154201cortex_prometheus_last_evaluation_samples{user="user2"} 20000
155202cortex_prometheus_last_evaluation_samples{user="user3"} 200000
203+ # HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active.
204+ # TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge
205+ cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1
206+ cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10
207+ cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100
208+ # HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager.
209+ # TYPE cortex_prometheus_notifications_dropped_total counter
210+ cortex_prometheus_notifications_dropped_total{user="user1"} 1
211+ cortex_prometheus_notifications_dropped_total{user="user2"} 10
212+ cortex_prometheus_notifications_dropped_total{user="user3"} 100
213+ # HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications.
214+ # TYPE cortex_prometheus_notifications_errors_total counter
215+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1
216+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10
217+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100
218+ # HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications.
219+ # TYPE cortex_prometheus_notifications_latency_seconds summary
220+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1
221+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1
222+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1
223+ cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1
224+ cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1
225+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10
226+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10
227+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10
228+ cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10
229+ cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1
230+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100
231+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100
232+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100
233+ cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100
234+ cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1
235+ # HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue.
236+ # TYPE cortex_prometheus_notifications_queue_capacity gauge
237+ cortex_prometheus_notifications_queue_capacity{user="user1"} 1
238+ cortex_prometheus_notifications_queue_capacity{user="user2"} 10
239+ cortex_prometheus_notifications_queue_capacity{user="user3"} 100
240+ # HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue.
241+ # TYPE cortex_prometheus_notifications_queue_length gauge
242+ cortex_prometheus_notifications_queue_length{user="user1"} 1
243+ cortex_prometheus_notifications_queue_length{user="user2"} 10
244+ cortex_prometheus_notifications_queue_length{user="user3"} 100
245+ # HELP cortex_prometheus_notifications_sent_total Total number of alerts sent.
246+ # TYPE cortex_prometheus_notifications_sent_total counter
247+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1
248+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10
249+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100
156250# HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
157251# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
158252cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
@@ -261,22 +355,37 @@ func populateManager(base float64) *prometheus.Registry {
261355 metrics .groupLastEvalSamples .WithLabelValues ("group_one" ).Add (base * 1000 )
262356 metrics .groupLastEvalSamples .WithLabelValues ("group_two" ).Add (base * 1000 )
263357
358+ metrics .notificationsLatency .WithLabelValues ("alertmanager_1" ).Observe (base )
359+ metrics .notificationsErrors .WithLabelValues ("alertmanager_1" ).Add (base )
360+ metrics .notificationsSent .WithLabelValues ("alertmanager_1" ).Add (base )
361+ metrics .notificationsDropped .Add (base )
362+ metrics .notificationsQueueLength .Set (base )
363+ metrics .notificationsQueueCapacity .Set (base )
364+ metrics .notificationsAlertmanagersDiscovered .Set (base )
264365 return r
265366}
266367
267368// Copied from github.com/prometheus/prometheus/rules/manager.go
369+ // and github.com/prometheus/prometheus/notifier/notifier.go
268370type groupMetrics struct {
269- evalDuration prometheus.Summary
270- iterationDuration prometheus.Summary
271- iterationsMissed * prometheus.CounterVec
272- iterationsScheduled * prometheus.CounterVec
273- evalTotal * prometheus.CounterVec
274- evalFailures * prometheus.CounterVec
275- groupInterval * prometheus.GaugeVec
276- groupLastEvalTime * prometheus.GaugeVec
277- groupLastDuration * prometheus.GaugeVec
278- groupRules * prometheus.GaugeVec
279- groupLastEvalSamples * prometheus.GaugeVec
371+ evalDuration prometheus.Summary
372+ iterationDuration prometheus.Summary
373+ iterationsMissed * prometheus.CounterVec
374+ iterationsScheduled * prometheus.CounterVec
375+ evalTotal * prometheus.CounterVec
376+ evalFailures * prometheus.CounterVec
377+ groupInterval * prometheus.GaugeVec
378+ groupLastEvalTime * prometheus.GaugeVec
379+ groupLastDuration * prometheus.GaugeVec
380+ groupRules * prometheus.GaugeVec
381+ groupLastEvalSamples * prometheus.GaugeVec
382+ notificationsLatency * prometheus.SummaryVec
383+ notificationsErrors * prometheus.CounterVec
384+ notificationsSent * prometheus.CounterVec
385+ notificationsDropped prometheus.Counter
386+ notificationsQueueLength prometheus.Gauge
387+ notificationsQueueCapacity prometheus.Gauge
388+ notificationsAlertmanagersDiscovered prometheus.Gauge
280389}
281390
282391func newGroupMetrics (r prometheus.Registerer ) * groupMetrics {
@@ -355,8 +464,53 @@ func newGroupMetrics(r prometheus.Registerer) *groupMetrics {
355464 },
356465 []string {"rule_group" },
357466 ),
467+ notificationsLatency : promauto .With (r ).NewSummaryVec (
468+ prometheus.SummaryOpts {
469+ Name : "prometheus_notifications_latency_seconds" ,
470+ Help : "Latency quantiles for sending alert notifications." ,
471+ Objectives : map [float64 ]float64 {0.5 : 0.05 , 0.9 : 0.01 , 0.99 : 0.001 },
472+ },
473+ []string {"alertmanager" },
474+ ),
475+ notificationsErrors : promauto .With (r ).NewCounterVec (
476+ prometheus.CounterOpts {
477+ Name : "prometheus_notifications_errors_total" ,
478+ Help : "Total number of errors sending alert notifications." ,
479+ },
480+ []string {"alertmanager" },
481+ ),
482+ notificationsSent : promauto .With (r ).NewCounterVec (
483+ prometheus.CounterOpts {
484+ Name : "prometheus_notifications_sent_total" ,
485+ Help : "Total number of alerts sent." ,
486+ },
487+ []string {"alertmanager" },
488+ ),
489+ notificationsDropped : promauto .With (r ).NewCounter (
490+ prometheus.CounterOpts {
491+ Name : "prometheus_notifications_dropped_total" ,
492+ Help : "Total number of alerts dropped due to errors when sending to Alertmanager." ,
493+ },
494+ ),
495+ notificationsQueueLength : promauto .With (r ).NewGauge (
496+ prometheus.GaugeOpts {
497+ Name : "prometheus_notifications_queue_length" ,
498+ Help : "The number of alert notifications in the queue." ,
499+ },
500+ ),
501+ notificationsQueueCapacity : promauto .With (r ).NewGauge (
502+ prometheus.GaugeOpts {
503+ Name : "prometheus_notifications_queue_capacity" ,
504+ Help : "The capacity of the alert notifications queue." ,
505+ },
506+ ),
507+ notificationsAlertmanagersDiscovered : promauto .With (r ).NewGauge (
508+ prometheus.GaugeOpts {
509+ Name : "prometheus_notifications_alertmanagers_discovered" ,
510+ Help : "The number of alertmanagers discovered and active." ,
511+ },
512+ ),
358513 }
359-
360514 return m
361515}
362516
0 commit comments