@@ -63,129 +63,7 @@ spec:
6363 - expr : count without(instance, pod, node) (up == 0)
6464 record : count:up0
6565 - name : alertmanager.rules
66- rules :
67- - alert : AlertmanagerFailedReload
68- annotations :
69- description : Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
70- runbook_url : https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerFailedReload.md
71- summary : Reloading an Alertmanager configuration has failed.
72- expr : |
73- # Without max_over_time, failed scrapes could create false negatives, see
74- # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
75- max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring-satellite"}[5m]) == 0
76- for : 10m
77- labels :
78- severity : critical
79- - alert : AlertmanagerMembersInconsistent
80- annotations :
81- description : Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
82- runbook_url : https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerMembersInconsistent.md
83- summary : A member of an Alertmanager cluster has not found all other cluster members.
84- expr : |
85- # Without max_over_time, failed scrapes could create false negatives, see
86- # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
87- max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring-satellite"}[5m])
88- < on (cluster) group_left
89- count by (cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring-satellite"}[5m]))
90- for : 15m
91- labels :
92- severity : critical
93- - alert : AlertmanagerFailedToSendAlerts
94- annotations :
95- description : Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
96- runbook_url : https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerFailedToSendAlerts.md
97- summary : An Alertmanager instance failed to send notifications.
98- expr : |
99- (
100- rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring-satellite"}[5m])
101- /
102- rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring-satellite"}[5m])
103- )
104- > 0.01
105- for : 5m
106- labels :
107- severity : warning
108- - alert : AlertmanagerClusterFailedToSendAlerts
109- annotations :
110- description : The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
111- runbook_url : https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterFailedToSendAlerts.md
112- summary : All Alertmanager instances in a cluster failed to send notifications to a critical integration.
113- expr : |
114- min by (cluster, integration) (
115- rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring-satellite", integration=~`slack|pagerduty`}[5m])
116- /
117- rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring-satellite", integration=~`slack|pagerduty`}[5m])
118- )
119- > 0.01
120- for : 5m
121- labels :
122- severity : critical
123- - alert : AlertmanagerClusterFailedToSendAlerts
124- annotations :
125- description : The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
126- runbook_url : https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterFailedToSendAlerts.md
127- summary : All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
128- expr : |
129- min by (cluster, integration) (
130- rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring-satellite", integration!~`slack|pagerduty`}[5m])
131- /
132- rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring-satellite", integration!~`slack|pagerduty`}[5m])
133- )
134- > 0.01
135- for : 5m
136- labels :
137- severity : warning
138- - alert : AlertmanagerConfigInconsistent
139- annotations :
140- description : Alertmanager instances within the {{$labels.job}} cluster have different configurations.
141- runbook_url : https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerConfigInconsistent.md
142- summary : Alertmanager instances within the same cluster have different configurations.
143- expr : |
144- count by (cluster) (
145- count_values by (cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring-satellite"})
146- )
147- != 1
148- for : 20m
149- labels :
150- severity : critical
151- - alert : AlertmanagerClusterDown
152- annotations :
153- description : ' {{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
154- runbook_url : https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterDown.md
155- summary : Half or more of the Alertmanager instances within the same cluster are down.
156- expr : |
157- (
158- count by (cluster) (
159- avg_over_time(up{job="alertmanager-main",namespace="monitoring-satellite"}[5m]) < 0.5
160- )
161- /
162- count by (cluster) (
163- up{job="alertmanager-main",namespace="monitoring-satellite"}
164- )
165- )
166- >= 0.5
167- for : 5m
168- labels :
169- severity : critical
170- - alert : AlertmanagerClusterCrashlooping
171- annotations :
172- description : ' {{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
173- runbook_url : https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterCrashlooping.md
174- summary : Half or more of the Alertmanager instances within the same cluster are crashlooping.
175- expr : |
176- (
177- count by (cluster) (
178- changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring-satellite"}[10m]) > 4
179- )
180- /
181- count by (cluster) (
182- up{job="alertmanager-main",namespace="monitoring-satellite"}
183- )
184- )
185- >= 0.5
186- for : 5m
187- labels :
188- severity : critical
66+ rules : []
18967 - name : kube-state-metrics
19068 rules : []
19169 - name : kubernetes-apps
0 commit comments