Skip to content

Commit f5e9b53

Browse files
committed
imp: adds darwin guest and host monitoring and alerts
1 parent 030d072 commit f5e9b53

File tree

6 files changed

+23321
-8
lines changed

6 files changed

+23321
-8
lines changed

nix/cloud/alerts.nix

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,72 @@
7777
];
7878
};
7979

80+
ci-world-node-exporter = {
81+
datasource = "vm";
82+
rules = [
83+
{
84+
alert = "node_down";
85+
expr = ''up == 0'';
86+
for = "5m";
87+
labels.severity = "critical";
88+
annotations = {
89+
description = "{{$labels.alias}} of instance {{$labels.instance}} in job {{$labels.job}} has been down for more than 5 minutes.";
90+
summary = "{{$labels.alias}}: Node is down.";
91+
};
92+
}
93+
{
94+
alert = "node_filesystem_full_90percent";
95+
expr = ''sort(node_filesystem_free_bytes{device!="ramfs",fstype!="apfs"} < node_filesystem_size_bytes{device!="ramfs",fstype!="apfs"} * 0.1) / 1024^3'';
96+
for = "5m";
97+
labels.severity = "critical";
98+
annotations = {
99+
description = "{{$labels.alias}} of instance {{$labels.instance}} and device {{$labels.device}} on {{$labels.mountpoint}} has less than 10% space left on its filesystem.";
100+
summary = "{{$labels.alias}}: Filesystem is running out of space soon.";
101+
};
102+
}
103+
{
104+
alert = "node_filesystem_full_in_4h";
105+
expr = ''predict_linear(node_filesystem_free_bytes{device!~"ramfs|tmpfs|none",fstype!~"apfs|autofs|ramfs|cd9660"}[4h], 4*3600) <= 0'';
106+
for = "5m";
107+
labels.severity = "warning";
108+
annotations = {
109+
description = "{{$labels.alias}} of instance {{$labels.instance}} and device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 4 hours";
110+
summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.";
111+
};
112+
}
113+
{
114+
alert = "node_ram_using_90percent";
115+
expr = ''node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.10'';
116+
for = "30m";
117+
labels.severity = "critical";
118+
annotations = {
119+
description = "{{$labels.alias}} of instance {{$labels.instance}} in job {{$labels.job}} is using at least 90% of its RAM for at least 30 minutes now.";
120+
summary = "{{$labels.alias}}: High RAM utilization.";
121+
};
122+
}
123+
{
124+
alert = "node_swap_using_80percent";
125+
expr = ''node_memory_SwapTotal_bytes - (node_memory_SwapFree_bytes + node_memory_SwapCached_bytes) > node_memory_SwapTotal_bytes * 0.8'';
126+
for = "10m";
127+
labels.severity = "warning";
128+
annotations = {
129+
description = "{{$labels.alias}} of instance {{$labels.instance}} in job {{$labels.job}} is using 80% of its swap space for at least 10 minutes now.";
130+
summary = "{{$labels.alias}}: Running out of swap soon.";
131+
};
132+
}
133+
{
134+
alert = "node_time_unsync";
135+
expr = ''abs(node_timex_offset_seconds) > 0.500 or node_timex_sync_status != 1'';
136+
for = "10m";
137+
labels.severity = "warning";
138+
annotations = {
139+
description = "{{$labels.alias}} of instance {{$labels.instance}} in job {{$labels.job}} has local clock offset too large or out of sync with NTP";
140+
summary = "{{$labels.alias}}: Clock out of sync with NTP";
141+
};
142+
}
143+
];
144+
};
145+
80146
# inherit (inputs.bitte-cells.bitte.alerts)
81147
# ;
82148
}

nix/cloud/dashboards.nix

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
# importGrafonnixToJson = ...;
1010
in {
1111
ci-world-spongix = importAsJson ./dashboards/spongix.json;
12+
ci-world-mac-mini-zfs = importAsJson ./dashboards/mac-mini-zfs.json;
13+
ci-world-node-exporter = importAsJson ./dashboards/node-exporter.json;
1214

1315
# Upstream dashboards can be imported here, instead of directly
1416
# imported in the hydrationProfile. This will allow easier

0 commit comments

Comments
 (0)