Skip to content

Commit d9709a2

Browse files
authored
fix(openstack): Update openstack alerts. (#1517)
* Update OpenStack alerts. To be meaningful, these alerts should be grouped per OpenStack node rather than at the cluster level. * Update alerts * Fix lint * Shorter alert names
1 parent 38e888e commit d9709a2

File tree

9 files changed

+140
-88
lines changed

9 files changed

+140
-88
lines changed

openstack-mixin/alerts.libsonnet

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,13 @@
6262
},
6363
},
6464
{
65-
alert: 'OpenStackPlacementHighMemoryUsageWarning',
65+
alert: 'OpenStackPlacementHighMemoryUsage',
6666
expr: |||
67-
100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"})
67+
(100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"})
6868
/
69-
(sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0)
69+
(openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}
70+
*
71+
openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0)
7072
> %(alertsWarningPlacementHighMemoryUsage)s
7173
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
7274
'for': '5m',
@@ -75,19 +77,21 @@
7577
severity: 'warning',
7678
},
7779
annotations: {
78-
summary: 'OpenStack is using a significant percentage of its allocated memory.',
80+
summary: 'OpenStack node is using a significant percentage of its allocated memory.',
7981
description: |||
80-
OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated memory,
82+
OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated memory,
8183
which is above the threshold of %(alertsWarningPlacementHighMemoryUsage)s percent.
82-
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
84+
||| % this.config { nodeLabel: this.config.nodeLabel },
8385
},
8486
},
8587
{
86-
alert: 'OpenStackPlacementHighMemoryUsageCritical',
88+
alert: 'OpenStackPlacementHighMemoryUsage',
8789
expr: |||
88-
100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"})
90+
(100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="MEMORY_MB"})
8991
/
90-
(sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0)
92+
(openstack_placement_resource_total{%(filteringSelector)s, resourcetype="MEMORY_MB"}
93+
*
94+
openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="MEMORY_MB"}) > 0)
9195
> %(alertsCriticalPlacementHighMemoryUsage)s
9296
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
9397
'for': '5m',
@@ -96,19 +100,21 @@
96100
severity: 'critical',
97101
},
98102
annotations: {
99-
summary: 'OpenStack is using a large percentage of its allocated memory, consider allocating more resources.',
103+
summary: 'OpenStack node is using a large percentage of its allocated memory, consider allocating more resources.',
100104
description: |||
101-
OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated memory,
105+
OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated memory,
102106
which is above the threshold of %(alertsCriticalPlacementHighMemoryUsage)s percent.
103-
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
107+
||| % this.config { nodeLabel: this.config.nodeLabel },
104108
},
105109
},
106110
{
107-
alert: 'OpenStackPlacementHighVCPUUsageWarning',
111+
alert: 'OpenStackPlacementHighVCPUUsage',
108112
expr: |||
109-
100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"})
113+
(100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"})
110114
/
111-
(sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}) > 0)
115+
(openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}
116+
*
117+
openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="VCPU"}) > 0)
112118
> %(alertsWarningPlacementHighVCPUUsage)s
113119
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
114120
'for': '5m',
@@ -117,20 +123,21 @@
117123
severity: 'warning',
118124
},
119125
annotations: {
120-
summary: 'OpenStack is using a significant percentage of its allocated vCPU.',
126+
summary: 'OpenStack node is using a significant percentage of its allocated vCPU.',
121127
description: |||
122-
OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU,
128+
OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU,
123129
which is above the threshold of %(alertsWarningPlacementHighVCPUUsage)s percent.
124-
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
130+
||| % this.config { nodeLabel: this.config.nodeLabel },
125131
},
126132
},
127133
{
128-
alert: 'OpenStackPlacementHighVCPUUsageCritical',
129-
134+
alert: 'OpenStackPlacementHighVCPUUsage',
130135
expr: |||
131-
100 * sum by (%(agg)s) (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"})
136+
(100 * (openstack_placement_resource_usage{%(filteringSelector)s, resourcetype="VCPU"})
132137
/
133-
(sum by (%(agg)s) (openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}) > 0)
138+
(openstack_placement_resource_total{%(filteringSelector)s, resourcetype="VCPU"}
139+
*
140+
openstack_placement_resource_allocation_ratio{%(filteringSelector)s, resourcetype="VCPU"}) > 0)
134141
> %(alertsCriticalPlacementHighVCPUUsage)s
135142
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
136143
'for': '5m',
@@ -139,15 +146,15 @@
139146
severity: 'critical',
140147
},
141148
annotations: {
142-
summary: 'OpenStack is using a large percentage of its allocated vCPU, consider allocating more resources.',
149+
summary: 'OpenStack node is using a large percentage of its allocated vCPU, consider allocating more resources.',
143150
description: |||
144-
OpenStack {{$labels.%(instanceFirstLabel)s}} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU,
151+
OpenStack node {{ $labels.%(nodeLabel)s }} is using {{ printf "%%.0f" $value }} percent of its allocated vCPU,
145152
which is above the threshold of %(alertsCriticalPlacementHighVCPUUsage)s percent.
146-
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
153+
||| % this.config { nodeLabel: this.config.nodeLabel },
147154
},
148155
},
149156
{
150-
alert: 'OpenStackNeutronHighIPsUsageWarning',
157+
alert: 'OpenStackNeutronHighIPsUsage',
151158
expr: |||
152159
100 *
153160
sum by (%(agg)s, network_name) (openstack_neutron_network_ip_availabilities_used{%(filteringSelector)s, network_name=~"%(alertsIPutilizationNetworksMatcher)s"})
@@ -171,7 +178,7 @@
171178
},
172179
},
173180
{
174-
alert: 'OpenStackNeutronHighIPsUsageCritical',
181+
alert: 'OpenStackNeutronHighIPsUsage',
175182
expr: |||
176183
100 *
177184
sum by (%(agg)s, network_name) (openstack_neutron_network_ip_availabilities_used{%(filteringSelector)s, network_name=~"%(alertsIPutilizationNetworksMatcher)s"})
@@ -265,6 +272,26 @@
265272
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
266273
},
267274
},
275+
{
276+
alert: 'OpenStackNovaTooManyVMsNotRunning',
277+
expr: |||
278+
count by (%(agg)s, hypervisor_hostname, availability_zone) (openstack_nova_server_status{%(filteringSelector)s, status=~"SHUTOFF|ERROR", hypervisor_hostname!=""})/
279+
(count by (%(agg)s, hypervisor_hostname, availability_zone) (openstack_nova_server_status{%(filteringSelector)s}) > %(alertsCriticalVMsNotRunningInstanceMin)s) * 100 > %(alertsCriticalVMsNotRunningPercent)s
280+
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
281+
'for': '15m',
282+
labels: {
283+
severity: 'critical',
284+
},
285+
annotations: {
286+
summary: 'Too many VMs are in SHUTOFF or ERROR states on the single hypervisor.',
287+
description: |||
288+
There are too many VMs in `SHUTOFF` or `ERROR` states on the hypervisor {{ $labels.hypervisor_hostname }}: {{ printf "%%.0f" $value }} percent,
289+
which is above the threshold of %(alertsCriticalVMsNotRunningPercent)s percent.
290+
291+
Please check if the hypervisor was rebooted and if instances need to be started manually.
292+
||| % this.config { instanceFirstLabel: this.config.instanceLabels[0] },
293+
},
294+
},
268295
],
269296
},
270297
{

openstack-mixin/config.libsonnet

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
groupLabels: ['job'],
44
// instance of openstack cluster
55
instanceLabels: ['instance'],
6-
6+
nodeLabel: 'hostname',
77
uid: 'openstack',
88
dashboardTags: [self.uid],
99
dashboardPeriod: 'now-30m',
@@ -23,7 +23,10 @@
2323
alertsWarningCinderHighBackupMemoryUsage: 80, // %
2424
alertsWarningCinderHighVolumeMemoryUsage: 80, // %
2525
alertsWarningCinderHighPoolCapacityUsage: 80, // %
26-
26+
// Alert when more than this percentage of VMs on a single hypervisor are not running (SHUTOFF or ERROR),
27+
// provided that hypervisor hosts more than this number of instances in total.
28+
alertsCriticalVMsNotRunningPercent: 75, // %
29+
alertsCriticalVMsNotRunningInstanceMin: 10,
2730

2831
// regex to match network names where we should track IP address utilization:
2932
alertsIPutilizationNetworksMatcher: '.+',

openstack-mixin/dashboards_out/cinder

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,7 @@
553553
"label": "Job",
554554
"multi": true,
555555
"name": "job",
556-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\"}, job)",
556+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\"}, job)",
557557
"refresh": 2,
558558
"sort": 1,
559559
"type": "query"
@@ -568,7 +568,7 @@
568568
"label": "Instance",
569569
"multi": true,
570570
"name": "instance",
571-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\",job=~\"$job\"}, instance)",
571+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\",job=~\"$job\"}, instance)",
572572
"refresh": 2,
573573
"sort": 1,
574574
"type": "query"

openstack-mixin/dashboards_out/logs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@
172172
"type": "loki",
173173
"uid": "${loki_datasource}"
174174
},
175-
"expr": "sum by (level) (count_over_time({job=~\"integrations/openstack\",job=~\"$job\",level=~\"$level\",service=~\"$service\"}\n|~ \"$regex_search\"\n\n[$__auto]))\n",
175+
"expr": "sum by (level) (count_over_time({job=\"integrations/openstack\",job=~\"$job\",level=~\"$level\",service=~\"$service\"}\n|~ \"$regex_search\"\n\n[$__auto]))\n",
176176
"legendFormat": "{{ level }}"
177177
}
178178
],
@@ -214,7 +214,7 @@
214214
"type": "loki",
215215
"uid": "${loki_datasource}"
216216
},
217-
"expr": "{job=~\"integrations/openstack\",job=~\"$job\",level=~\"$level\",service=~\"$service\"} \n|~ \"$regex_search\"\n\n\n"
217+
"expr": "{job=\"integrations/openstack\",job=~\"$job\",level=~\"$level\",service=~\"$service\"} \n|~ \"$regex_search\"\n\n\n"
218218
}
219219
],
220220
"title": "Logs",
@@ -245,7 +245,7 @@
245245
"label": "Job",
246246
"multi": true,
247247
"name": "job",
248-
"query": "label_values({job=~\"integrations/openstack\"}, job)",
248+
"query": "label_values({job=\"integrations/openstack\"}, job)",
249249
"refresh": 2,
250250
"sort": 1,
251251
"type": "query"
@@ -260,7 +260,7 @@
260260
"label": "Level",
261261
"multi": true,
262262
"name": "level",
263-
"query": "label_values({job=~\"integrations/openstack\",job=~\"$job\"}, level)",
263+
"query": "label_values({job=\"integrations/openstack\",job=~\"$job\"}, level)",
264264
"refresh": 2,
265265
"sort": 1,
266266
"type": "query"
@@ -275,7 +275,7 @@
275275
"label": "Service",
276276
"multi": true,
277277
"name": "service",
278-
"query": "label_values({job=~\"integrations/openstack\",job=~\"$job\",level=~\"$level\"}, service)",
278+
"query": "label_values({job=\"integrations/openstack\",job=~\"$job\",level=~\"$level\"}, service)",
279279
"refresh": 2,
280280
"sort": 1,
281281
"type": "query"

openstack-mixin/dashboards_out/neutron

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -870,7 +870,7 @@
870870
"label": "Job",
871871
"multi": true,
872872
"name": "job",
873-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\"}, job)",
873+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\"}, job)",
874874
"refresh": 2,
875875
"sort": 1,
876876
"type": "query"
@@ -885,7 +885,7 @@
885885
"label": "Instance",
886886
"multi": true,
887887
"name": "instance",
888-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\",job=~\"$job\"}, instance)",
888+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\",job=~\"$job\"}, instance)",
889889
"refresh": 2,
890890
"sort": 1,
891891
"type": "query"

openstack-mixin/dashboards_out/nova

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,7 @@
438438
"label": "Job",
439439
"multi": true,
440440
"name": "job",
441-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\"}, job)",
441+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\"}, job)",
442442
"refresh": 2,
443443
"sort": 1,
444444
"type": "query"
@@ -453,7 +453,7 @@
453453
"label": "Instance",
454454
"multi": true,
455455
"name": "instance",
456-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\",job=~\"$job\"}, instance)",
456+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\",job=~\"$job\"}, instance)",
457457
"refresh": 2,
458458
"sort": 1,
459459
"type": "query"

openstack-mixin/dashboards_out/overview

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,7 @@
370370
},
371371
"id": 7,
372372
"options": {
373-
"alertInstanceLabelFilter": "job=~\"integrations/openstack\"=~\"${job=~\"integrations/openstack\":regex}\",job=~\"${job:regex}\""
373+
"alertInstanceLabelFilter": "job=\"integrations/openstack\"=~\"${job=\"integrations/openstack\":regex}\",job=~\"${job:regex}\""
374374
},
375375
"pluginVersion": "v10.0.0",
376376
"title": "Alerts",
@@ -1229,7 +1229,7 @@
12291229
"label": "Job",
12301230
"multi": true,
12311231
"name": "job",
1232-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\"}, job)",
1232+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\"}, job)",
12331233
"refresh": 2,
12341234
"sort": 1,
12351235
"type": "query"
@@ -1244,7 +1244,7 @@
12441244
"label": "Instance",
12451245
"multi": true,
12461246
"name": "instance",
1247-
"query": "label_values(openstack_identity_up{job=~\"integrations/openstack\",job=~\"$job\"}, instance)",
1247+
"query": "label_values(openstack_identity_up{job=\"integrations/openstack\",job=~\"$job\"}, instance)",
12481248
"refresh": 2,
12491249
"sort": 1,
12501250
"type": "query"

openstack-mixin/mixin.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ local openstack =
44
openstacklib.new()
55
+ openstacklib.withConfigMixin(
66
{
7-
filteringSelector: 'job=~"integrations/openstack"',
7+
filteringSelector: 'job="integrations/openstack"',
88
uid: 'openstack',
99
groupLabels: ['job'],
1010
enableLokiLogs: true,

0 commit comments

Comments
 (0)