Use prometheus_client for writing out metrics

technowhizz · technowhizz · commit 44601cabc58a · 2025-05-01T19:24:50.000+01:00
diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py
@@ -4,7 +4,9 @@
 import json
 import re
 import datetime
+import os
 
+from prometheus_client import CollectorRegistry, Gauge, write_to_textfile
 from pySMART import DeviceList
 
 SMARTCTL_PATH = "/usr/sbin/smartctl"
@@ -110,21 +112,24 @@ def parse_device_info(device):
         "serial_number": serial_number,
         "firmware_version": device.firmware or "",
     }
-    label_str = ",".join(f'{k}="{v}"' for k, v in labels.items())
+    sorted_labels = sorted(labels.items())
+    label_str = ",".join(f'{k}="{v}"' for k, v in sorted_labels)
+
+    metric_labels = f'disk="{device.name}",serial_number="{serial_number}",type="{device.interface}"'
 
     metrics = [
-        f'device_info{{{label_str}}} 1',
-        f'device_smart_available{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {1 if device.smart_capable else 0}',
+        f'device_info{{{label_str}}} 1.0',
+        f'device_smart_available{{{metric_labels}}} {float(1) if device.smart_capable else float(0)}',
     ]
 
     if device.smart_capable:
         metrics.append(
-            f'device_smart_enabled{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {1 if device.smart_enabled else 0}'
+            f'device_smart_enabled{{{metric_labels}}} {float(1) if device.smart_enabled else float(0)}'
         )
         if device.assessment:
             is_healthy = 1 if device.assessment.upper() == "PASS" else 0
             metrics.append(
-                f'device_smart_healthy{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {is_healthy}'
+                f'device_smart_healthy{{{metric_labels}}} {float(is_healthy)}'
             )
 
     return metrics
@@ -143,7 +148,7 @@ def parse_if_attributes(device):
     disk = device.name
     disk_type = device.interface or ""
     serial_number = (device.serial or "").lower()
-    labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial_number}"'
+    labels = f'disk="{disk}",serial_number="{serial_number}",type="{disk_type}"'
 
     # Inspect all public attributes on device.if_attributes
     for attr_name in dir(device.if_attributes):
@@ -156,27 +161,48 @@ def parse_if_attributes(device):
         snake_name = camel_to_snake(attr_name)
 
         if snake_name in SMARTMON_ATTRS and isinstance(val, (int, float)):
-            metrics.append(f"{snake_name}{{{labels}}} {val}")
+            metrics.append(f"{snake_name}{{{labels}}} {float(val)}")
 
     return metrics
 
-def format_output(metrics):
+def write_metrics_to_textfile(metrics, output_path=None):
     """
-    Convert a list of lines like "some_metric{...} value"
-    into a Prometheus text output with # HELP / # TYPE lines.
+    Write metrics to a Prometheus textfile using prometheus_client.
+    Args:
+        metrics (List[str]): List of metric strings in 'name{labels} value' format.
+        output_path (Optional[str]): Path to write the metrics file. If None, defaults to /var/lib/node_exporter/textfile_collector/smartmon.prom.
     """
-    output = []
-    last_metric = ""
-    for metric in sorted(metrics):
-        metric_name = metric.split("{")[0]
-        if metric_name != last_metric:
-            output.append(f"# HELP smartmon_{metric_name} SMART metric {metric_name}")
-            output.append(f"# TYPE smartmon_{metric_name} gauge")
-            last_metric = metric_name
-        output.append(f"smartmon_{metric}")
-    return "\n".join(output)
-
-def main():
+    registry = CollectorRegistry()
+    metric_gauges = {}
+    for metric in metrics:
+        # Split metric into name, labels, and value
+        metric_name, rest = metric.split('{', 1)
+        label_str, value = rest.split('}', 1)
+        value = value.strip()
+        # Parse labels into a dictionary
+        labels = {}
+        label_keys = []
+        label_values = []
+        for label in label_str.split(','):
+            if '=' in label:
+                k, v = label.split('=', 1)
+                k = k.strip()
+                v = v.strip('"')
+                labels[k] = v
+                label_keys.append(k)
+                label_values.append(v)
+        help_str = f"SMART metric {metric_name}"
+        # Create Gauge if not already present
+        if metric_name not in metric_gauges:
+            metric_gauges[metric_name] = Gauge(metric_name, help_str, label_keys, registry=registry)
+        # Set metric value
+        gauge = metric_gauges[metric_name]
+        gauge.labels(*label_values).set(float(value))
+    if output_path is None:
+        output_path = '/var/lib/node_exporter/textfile_collector/smartmon.prom'
+    write_to_textfile(output_path, registry)  # Write all metrics to file
+
+def main(output_path=None):
     all_metrics = []
 
     try:
@@ -197,7 +223,7 @@ def main():
         disk_type = dev.interface or ""
         serial_number = (dev.serial or "").lower()
 
-        run_timestamp = int(datetime.datetime.now(datetime.UTC).timestamp())
+        run_timestamp = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
         all_metrics.append(f'smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}')
 
         active = 1
@@ -220,7 +246,11 @@ def main():
         all_metrics.extend(parse_device_info(dev))
         all_metrics.extend(parse_if_attributes(dev))
 
-    print(format_output(all_metrics))
+    write_metrics_to_textfile(all_metrics, output_path)
 
 if __name__ == "__main__":
-    main()
+    import argparse
+    parser = argparse.ArgumentParser(description="Export SMART metrics to Prometheus textfile format.")
+    parser.add_argument('--output', type=str, default=None, help='Output path for Prometheus textfile (default: /var/lib/node_exporter/textfile_collector/smartmon.prom)')
+    args = parser.parse_args()
+    main(args.output)
diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py
@@ -1,17 +1,19 @@
 import glob
 import json
 import os
-import re
 import unittest
+import tempfile
+import math
+from time import sleep
 
 from unittest.mock import patch, MagicMock
-
 from smartmon import (
     parse_device_info,
     parse_if_attributes,
     main,
     SMARTMON_ATTRS,
-    camel_to_snake
+    camel_to_snake,
+    write_metrics_to_textfile,
 )
 
 def load_json_fixture(filename):
@@ -75,7 +77,6 @@ def _test_parse_device_info(self, fixture_name):
         dev_serial = device_info["serial"].lower()
 
         # The device_info line should exist for every device
-        # e.g. device_info{disk="/dev/...",type="...",serial_number="..."} 1
         device_info_found = any(
             line.startswith("device_info{") and
             f'disk="{dev_name}"' in line and
@@ -94,32 +95,32 @@ def _test_parse_device_info(self, fixture_name):
                 line.startswith("device_smart_available{") and
                 f'disk="{dev_name}"' in line and
                 f'serial_number="{dev_serial}"' in line and
-                line.endswith(" 1")
+                line.endswith(" 1.0")
                 for line in metrics
             )
             self.assertTrue(
                 smart_available_found,
-                f"Expected device_smart_available=1 for {dev_name}, not found."
+                f"Expected device_smart_available=1.0 for {dev_name}, not found."
             )
 
         # If smart_enabled is true, we expect device_smart_enabled = 1
         if device_info.get("smart_enabled"):
             smart_enabled_found = any(
                 line.startswith("device_smart_enabled{") and
                 f'disk="{dev_name}"' in line and
-                line.endswith(" 1")
+                line.endswith(" 1.0")
                 for line in metrics
             )
             self.assertTrue(
                 smart_enabled_found,
-                f"Expected device_smart_enabled=1 for {dev_name}, not found."
+                f"Expected device_smart_enabled=1.0 for {dev_name}, not found."
             )
 
         # device_smart_healthy if assessment in [PASS, WARN, FAIL]
         # PASS => 1, otherwise => 0
         assessment = device_info.get("assessment", "").upper()
         if assessment in ["PASS", "WARN", "FAIL"]:
-            expected_val = 1 if assessment == "PASS" else 0
+            expected_val = float(1) if assessment == "PASS" else float(0)
             smart_healthy_found = any(
                 line.startswith("device_smart_healthy{") and
                 f'disk="{dev_name}"' in line and
@@ -162,9 +163,8 @@ def _test_parse_if_attributes(self, fixture_name):
             snake_key = camel_to_snake(attr_key)
 
             if isinstance(attr_val, (int, float)) and snake_key in SMARTMON_ATTRS:
-                # We expect e.g. critical_warning{disk="/dev/..."} <value>
                 expected_line = (
-                    f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}"
+                    f"{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}"
                 )
                 self.assertIn(
                     expected_line,
@@ -175,7 +175,7 @@ def _test_parse_if_attributes(self, fixture_name):
                 # If it's not in SMARTMON_ATTRS or not numeric,
                 # we do NOT expect a line with that name+value
                 unexpected_line = (
-                    f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}"
+                    f"{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}"
                 )
                 self.assertNotIn(
                     unexpected_line,
@@ -204,28 +204,32 @@ def test_parse_if_attributes(self):
 
     @patch("smartmon.run_command")
     @patch("smartmon.DeviceList")
-    def test_main(self, mock_devicelist_class, mock_run_cmd):
+    @patch("smartmon.write_metrics_to_textfile", wraps=write_metrics_to_textfile)
+    def test_main(self, mock_write_metrics, mock_devicelist_class, mock_run_cmd):
         """
         End-to-end test of main() for every JSON fixture in ./tests/.
         This ensures we can handle multiple disks (multiple fixture files).
+        Checks metrics written to a temp file, and that write_metrics_to_textfile is called once.
         """
+
+        # Patch run_command to return a version & "active" power_mode
+        def run_command_side_effect(cmd, parse_json=False):
+            if "--version" in cmd:
+                return "smartctl 7.3 5422 [x86_64-linux-5.15.0]\n..."
+            if "-n" in cmd and "standby" in cmd and parse_json:
+                return {"power_mode": "active"}
+            return ""
+
+        mock_run_cmd.side_effect = run_command_side_effect
+
         for fixture_path in self.fixture_files:
             fixture_name = os.path.basename(fixture_path)
             with self.subTest(msg=f"Testing main() with {fixture_name}"):
+                mock_write_metrics.reset_mock()
                 data = load_json_fixture(fixture_name)
                 device_info = data["device_info"]
                 if_attrs = data.get("if_attributes", {})
 
-                # Patch run_command to return a version & "active" power_mode
-                def run_command_side_effect(cmd, parse_json=False):
-                    if "--version" in cmd:
-                        return "smartctl 7.3 5422 [x86_64-linux-5.15.0]\n..."
-                    if "-n" in cmd and "standby" in cmd and parse_json:
-                        return {"power_mode": "active"}
-                    return ""
-
-                mock_run_cmd.side_effect = run_command_side_effect
-
                 # Mock a single device from the fixture
                 device_mock = self.create_mock_device_from_json(device_info, if_attrs)
 
@@ -234,41 +238,41 @@ def run_command_side_effect(cmd, parse_json=False):
                 mock_dev_list.devices = [device_mock]
                 mock_devicelist_class.return_value = mock_dev_list
 
-                with patch("builtins.print") as mock_print:
-                    main()
-
-                    printed_lines = []
-                    for call_args in mock_print.call_args_list:
-                        printed_lines.extend(call_args[0][0].split("\n"))
-                dev_name = device_info["name"]
-                dev_iface = device_info["interface"]
-                dev_serial = device_info["serial"].lower()
-
-                # We expect a line for the run timestamp, e.g.:
-                # smartmon_smartctl_run{disk="/dev/...",type="..."} 1671234567
-                run_line_found = any(
-                    line.startswith("smartmon_smartctl_run{") and
-                    f'disk="{dev_name}"' in line and
-                    f'type="{dev_iface}"' in line
-                    for line in printed_lines
-                )
-                self.assertTrue(
-                    run_line_found,
-                    f"Expected 'smartmon_smartctl_run' metric line for {dev_name} not found."
-                )
-
-                # Because we mocked "power_mode": "active", we expect device_active=1
-                active_line_found = any(
-                    line.startswith("smartmon_device_active{") and
-                    f'disk="{dev_name}"' in line and
-                    f'serial_number="{dev_serial}"' in line and
-                    line.endswith(" 1")
-                    for line in printed_lines
-                )
-                self.assertTrue(
-                    active_line_found,
-                    f"Expected 'device_active{{...}} 1' line for {dev_name} not found."
-                )
+                with tempfile.NamedTemporaryFile(mode="r+", delete_on_close=False) as tmpfile:
+                    path= tmpfile.name
+                    main(output_path=path)
+                    tmpfile.close()
+
+                    # Ensure write_metrics_to_textfile was called once
+                    self.assertEqual(mock_write_metrics.call_count, 1)
+
+                    with open(path, "r") as f:
+                        # Read the metrics from the file
+                        metrics_lines = [line.strip() for line in f.readlines() if line.strip() and not line.startswith('#')]
+                        print(f"Metrics lines: {metrics_lines}")
+
+                # Generate expected metrics using the parse functions
+                expected_metrics = []
+                expected_metrics.extend(parse_device_info(device_mock))
+                expected_metrics.extend(parse_if_attributes(device_mock))
+
+                # Check that all expected metrics are present in the file
+                for expected in expected_metrics:
+                    exp_metric, exp_val_str = expected.rsplit(" ", 1)
+                    exp_val = float(exp_val_str)
+                    found = any(
+                        (exp_metric in line) and
+                        math.isclose(float(line.rsplit(" ", 1)[1]), exp_val)
+                        for line in metrics_lines
+                    )
+                    self.assertTrue(found, f"Expected metric '{expected}' not found")
+
+                # Check that smartctl_version metric is present
+                version_found = any(line.startswith("smartctl_version{") for line in metrics_lines)
+                self.assertTrue(version_found, "Expected 'smartctl_version' metric not found in output file.")
+
+                # Check that the output file is not empty
+                self.assertTrue(metrics_lines, "Metrics output file is empty.")
 
 if __name__ == "__main__":
     unittest.main()