|
77 | 77 | ]; |
78 | 78 | }; |
79 | 79 |
|
| 80 | + ci-world-node-exporter = { |
| 81 | + datasource = "vm"; |
| 82 | + rules = [ |
| 83 | + { |
| 84 | + alert = "node_down"; |
| 85 | + expr = ''up == 0''; |
| 86 | + for = "5m"; |
| 87 | + labels.severity = "critical"; |
| 88 | + annotations = { |
| 89 | + description = "{{$labels.alias}} of instance {{$labels.instance}} in job {{$labels.job}} has been down for more than 5 minutes."; |
| 90 | + summary = "{{$labels.alias}}: Node is down."; |
| 91 | + }; |
| 92 | + } |
| 93 | + { |
| 94 | + alert = "node_filesystem_full_90percent"; |
| 95 | + expr = ''sort(node_filesystem_free_bytes{device!="ramfs",fstype!="apfs"} < node_filesystem_size_bytes{device!="ramfs",fstype!="apfs"} * 0.1) / 1024^3''; |
| 96 | + for = "5m"; |
| 97 | + labels.severity = "critical"; |
| 98 | + annotations = { |
| 99 | + description = "{{$labels.alias}} of instance {{$labels.instance}} and device {{$labels.device}} on {{$labels.mountpoint}} has less than 10% space left on its filesystem."; |
| 100 | + summary = "{{$labels.alias}}: Filesystem is running out of space soon."; |
| 101 | + }; |
| 102 | + } |
| 103 | + { |
| 104 | + alert = "node_filesystem_full_in_4h"; |
| 105 | + expr = ''predict_linear(node_filesystem_free_bytes{device!~"ramfs|tmpfs|none",fstype!~"apfs|autofs|ramfs|cd9660"}[4h], 4*3600) <= 0''; |
| 106 | + for = "5m"; |
| 107 | + labels.severity = "warning"; |
| 108 | + annotations = { |
| 109 | + description = "{{$labels.alias}} of instance {{$labels.instance}} and device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 4 hours"; |
| 110 | + summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours."; |
| 111 | + }; |
| 112 | + } |
| 113 | + { |
| 114 | + alert = "node_ram_using_90percent"; |
| 115 | + expr = ''node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.10''; |
| 116 | + for = "30m"; |
| 117 | + labels.severity = "critical"; |
| 118 | + annotations = { |
| 119 | + description = "{{$labels.alias}} of instance {{$labels.instance}} in job {{$labels.job}} is using at least 90% of its RAM for at least 30 minutes now."; |
| 120 | + summary = "{{$labels.alias}}: High RAM utilization."; |
| 121 | + }; |
| 122 | + } |
| 123 | + { |
| 124 | + alert = "node_swap_using_80percent"; |
| 125 | + expr = ''node_memory_SwapTotal_bytes - (node_memory_SwapFree_bytes + node_memory_SwapCached_bytes) > node_memory_SwapTotal_bytes * 0.8''; |
| 126 | + for = "10m"; |
| 127 | + labels.severity = "warning"; |
| 128 | + annotations = { |
| 129 | + description = "{{$labels.alias}} of instance {{$labels.instance}} in job {{$labels.job}} is using 80% of its swap space for at least 10 minutes now."; |
| 130 | + summary = "{{$labels.alias}}: Running out of swap soon."; |
| 131 | + }; |
| 132 | + } |
| 133 | + { |
| 134 | + alert = "node_time_unsync"; |
| 135 | + expr = ''abs(node_timex_offset_seconds) > 0.500 or node_timex_sync_status != 1''; |
| 136 | + for = "10m"; |
| 137 | + labels.severity = "warning"; |
| 138 | + annotations = { |
| 139 | + description = "{{$labels.alias}} of instance {{$labels.instance}} in job {{$labels.job}} has local clock offset too large or out of sync with NTP"; |
| 140 | + summary = "{{$labels.alias}}: Clock out of sync with NTP"; |
| 141 | + }; |
| 142 | + } |
| 143 | + ]; |
| 144 | + }; |
| 145 | + |
80 | 146 | # inherit (inputs.bitte-cells.bitte.alerts) |
81 | 147 | # ; |
82 | 148 | } |
0 commit comments