|
62 | 62 | }, |
63 | 63 | }, |
64 | 64 | { |
65 | | - alert: 'OpenStackPlacementHighMemoryUsageWarning', |
| 65 | + alert: 'OpenStackPlacementHighMemoryUsage', |
66 | 66 | expr: ||| |
67 | | - 100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"}) |
| 67 | + (100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"}) |
68 | 68 | / |
69 | | - (sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0) |
| 69 | + (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"} |
| 70 | + * |
| 71 | + openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0) |
70 | 72 | > %(alertsWarningPlacementHighMemoryUsage)s |
71 | 73 | ||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) }, |
72 | 74 | 'for': '5m', |
|
75 | 77 | severity: 'warning', |
76 | 78 | }, |
77 | 79 | annotations: { |
78 | | - summary: 'OpenStack is using a significant percentage of its allocated memory.', |
| 80 | + summary: 'OpenStack node is using a significant percentage of its allocated memory.', |
79 | 81 | description: ||| |
80 | | - OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated memory, |
| 82 | + OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated memory, |
81 | 83 | which is above the threshold of %(alertsWarningPlacementHighMemoryUsage)s percent. |
82 | | - ||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] }, |
| 84 | + ||| % this.config { nodeLabel: this.config.nodeLabel }, |
83 | 85 | }, |
84 | 86 | }, |
85 | 87 | { |
86 | | - alert: 'OpenStackNovaAgentDown', |
| 88 | + alert: 'OpenStackPlacementHighMemoryUsage', |
87 | 89 | expr: ||| |
88 | | - 100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"}) |
| 90 | + (100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"}) |
89 | 91 | / |
90 | | - (sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0) |
| 92 | + (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"} |
| 93 | + * |
| 94 | + openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0) |
91 | 95 | > %(alertsCriticalPlacementHighMemoryUsage)s |
92 | 96 | ||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) }, |
93 | 97 | 'for': '5m', |
|
96 | 100 | severity: 'critical', |
97 | 101 | }, |
98 | 102 | annotations: { |
99 | | - summary: 'OpenStack is using a large percentage of its allocated memory, consider allocating more resources.', |
| 103 | + summary: 'OpenStack node is using a large percentage of its allocated memory, consider allocating more resources.', |
100 | 104 | description: ||| |
101 | | - OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated memory, |
| 105 | + OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated memory, |
102 | 106 | which is above the threshold of %(alertsCriticalPlacementHighMemoryUsage)s percent. |
103 | | - ||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] }, |
| 107 | + ||| % this.config { nodeLabel: this.config.nodeLabel }, |
104 | 108 | }, |
105 | 109 | }, |
106 | 110 | { |
107 | | - alert: 'OpenStackPlacementHighVCPUUsageWarning', |
| 111 | + alert: 'OpenStackPlacementHighVCPUUsage', |
108 | 112 | expr: ||| |
109 | | - 100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"}) |
| 113 | + (100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"}) |
110 | 114 | / |
111 | | - (sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}) > 0) |
| 115 | + (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"} |
| 116 | + * |
| 117 | + openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="VCPU"}) > 0) |
112 | 118 | > %(alertsWarningPlacementHighVCPUUsage)s |
113 | 119 | ||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) }, |
114 | 120 | 'for': '5m', |
|
117 | 123 | severity: 'warning', |
118 | 124 | }, |
119 | 125 | annotations: { |
120 | | - summary: 'OpenStack is using a significant percentage of its allocated vCPU.', |
| 126 | + summary: 'OpenStack node is using a significant percentage of its allocated vCPU.', |
121 | 127 | description: ||| |
122 | | - OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU, |
| 128 | + OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU, |
123 | 129 | which is above the threshold of %(alertsWarningPlacementHighVCPUUsage)s percent. |
124 | | - ||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] }, |
| 130 | + ||| % this.config { nodeLabel: this.config.nodeLabel }, |
125 | 131 | }, |
126 | 132 | }, |
127 | 133 | { |
128 | | - alert: 'OpenStackPlacementHighVCPUUsageCritical', |
129 | | - |
| 134 | + alert: 'OpenStackPlacementHighVCPUUsage', |
130 | 135 | expr: ||| |
131 | | - 100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"}) |
| 136 | + (100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"}) |
132 | 137 | / |
133 | | - (sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}) > 0) |
| 138 | + (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"} |
| 139 | + * |
| 140 | + openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="VCPU"}) > 0) |
134 | 141 | > %(alertsCriticalPlacementHighVCPUUsage)s |
135 | 142 | ||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) }, |
136 | 143 | 'for': '5m', |
|
139 | 146 | severity: 'critical', |
140 | 147 | }, |
141 | 148 | annotations: { |
142 | | - summary: 'OpenStack is using a large percentage of its allocated vCPU, consider allocating more resources.', |
| 149 | + summary: 'OpenStack node is using a large percentage of its allocated vCPU, consider allocating more resources.', |
143 | 150 | description: ||| |
144 | | - OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU, |
| 151 | + OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU, |
145 | 152 | which is above the threshold of %(alertsCriticalPlacementHighVCPUUsage)s percent. |
146 | | - ||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] }, |
| 153 | + ||| % this.config { nodeLabel: this.config.nodeLabel }, |
147 | 154 | }, |
148 | 155 | }, |
149 | 156 | { |
150 | | - alert: 'OpenStackNeutronHighIPsUsageWarning', |
| 157 | + alert: 'OpenStackNeutronHighIPsUsage', |
151 | 158 | expr: ||| |
152 | 159 | 100 * |
153 | 160 | sum by (%(agg)s, network_name) (openstack_neutron_network_ip_availabilities_used{%(filteringSelector)s, network_name=~"%(alertsIPutilizationNetworksMatcher)s"}) |
|
171 | 178 | }, |
172 | 179 | }, |
173 | 180 | { |
174 | | - alert: 'OpenStackNeutronHighIPsUsageCritical', |
| 181 | + alert: 'OpenStackNeutronHighIPsUsage', |
175 | 182 | expr: ||| |
176 | 183 | 100 * |
177 | 184 | sum by (%(agg)s, network_name) (openstack_neutron_network_ip_availabilities_used{%(filteringSelector)s, network_name=~"%(alertsIPutilizationNetworksMatcher)s"}) |
|
265 | 272 | ||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] }, |
266 | 273 | }, |
267 | 274 | }, |
| 275 | + { |
| 276 | + alert: 'OpenStackNovaTooManyVMsNotRunning', |
| 277 | + expr: ||| |
| 278 | + count by (%(agg)s, hypervisor_hostname, availability_zone) (openstack_nova_server_status{%(filteringSelector)s, status=~"SHUTOFF|ERROR", hypervisor_hostname!=""})/ |
| 279 | + (count by (%(agg)s, hypervisor_hostname, availability_zone) (openstack_nova_server_status{%(filteringSelector)s}) > %(alertsCriticalVMsNotRunningInstanceMin)s) * 100 > %(alertsCriticalVMsNotRunningPercent)s |
| 280 | + ||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) }, |
| 281 | + 'for': '15m', |
| 282 | + labels: { |
| 283 | + severity: 'critical', |
| 284 | + }, |
| 285 | + annotations: { |
| 286 | + summary: 'Too many VMs are in SHUTOFF or ERROR states on the single hypervisor.', |
| 287 | + description: ||| |
| 288 | + There are too many VMs in `SHUTOFF` or `ERROR` states on the hypervisor {{ $labels.hypervisor_hostname }}: {{ printf "%%.0f" $value }} percent, |
| 289 | + which is above the threshold of %(alertsCriticalVMsNotRunningPercent)s percent. |
| 290 | +
|
| 291 | + Please check if the hypervisor was rebooted and if instances need to be started manually. |
| 292 | + ||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] }, |
| 293 | + }, |
| 294 | + }, |
268 | 295 | ], |
269 | 296 | }, |
270 | 297 | { |
|
0 commit comments