test: Add integration tests for hinting/reporting

JackThomson2 · JackThomson2 · commit 96a23822b8f1 · 2025-11-11T11:02:33.000Z
Add integration tests for free page hinting and reporting, both
functional and performance tests.

New functional tests ensure that hinting and reporting reduce the RSS
as expected in the guest. The existing RSS reduction test now touches
memory to reduce the chance of flakiness.

New performance tests for the balloon device. The first tracks the CPU
overhead of hinting and reporting. The second measures the page-fault
latency while reporting is running in the guest.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
@@ -3,6 +3,7 @@
 """Tests for guest-side operations on /balloon resources."""
 
 import logging
+import signal
 import time
 from subprocess import TimeoutExpired
 
@@ -293,7 +294,8 @@ def test_reinflate_balloon(uvm_plain_any):
 
 
 # pylint: disable=C0103
-def test_size_reduction(uvm_plain_any):
+@pytest.mark.parametrize("method", ["traditional", "hinting", "reporting"])
+def test_size_reduction(uvm_plain_any, method):
     """
     Verify that ballooning reduces RSS usage on a newly booted guest.
     """
@@ -302,30 +304,60 @@ def test_size_reduction(uvm_plain_any):
     test_microvm.basic_config()
     test_microvm.add_net_iface()
 
+    traditional_balloon = method == "traditional"
+    free_page_reporting = method == "reporting"
+    free_page_hinting = method == "hinting"
+
     # Add a memory balloon.
     test_microvm.api.balloon.put(
-        amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=0
+        amount_mib=0,
+        deflate_on_oom=True,
+        stats_polling_interval_s=0,
+        free_page_reporting=free_page_reporting,
+        free_page_hinting=free_page_hinting,
     )
 
     # Start the microvm.
     test_microvm.start()
     firecracker_pid = test_microvm.firecracker_pid
 
-    # Check memory usage.
+    get_stable_rss_mem_by_pid(firecracker_pid)
+
+    test_microvm.ssh.check_output(
+        "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
+    )
+
+    time.sleep(1)
+
     first_reading = get_stable_rss_mem_by_pid(firecracker_pid)
 
+    _, pid, _ = test_microvm.ssh.check_output("pidof fast_page_fault_helper")
+    # Kill the application which will free the held memory
+    test_microvm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")
+
+    # Sleep to allow guest to clean up
+    time.sleep(1)
     # Have the guest drop its caches.
     test_microvm.ssh.run("sync; echo 3 > /proc/sys/vm/drop_caches")
-    time.sleep(5)
+    time.sleep(2)
 
     # We take the initial reading of the RSS, then calculate the amount
     # we need to inflate the balloon with by subtracting it from the
     # VM size and adding an offset of 10 MiB in order to make sure we
     # get a lower reading than the initial one.
     inflate_size = 256 - int(first_reading / 1024) + 10
 
-    # Now inflate the balloon.
-    test_microvm.api.balloon.patch(amount_mib=inflate_size)
+    if traditional_balloon:
+        # Now inflate the balloon
+        test_microvm.api.balloon.patch(amount_mib=inflate_size)
+    elif free_page_hinting:
+        test_microvm.api.balloon_hinting_start.patch()
+
+    _ = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    if traditional_balloon:
+        # Deflate the balloon completely.
+        test_microvm.api.balloon.patch(amount_mib=0)
 
     # Check memory usage again.
     second_reading = get_stable_rss_mem_by_pid(firecracker_pid)
@@ -534,7 +566,91 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory):
     assert stats_after_snap["available_memory"] > latest_stats["available_memory"]
 
 
-def test_memory_scrub(uvm_plain_any):
+@pytest.mark.parametrize("method", ["reporting", "hinting"])
+def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method):
+    """
+    Test that balloon hinting and reporting work after snapshot/restore.
+    """
+    vm = uvm_plain_any
+    vm.spawn()
+    vm.basic_config(
+        vcpu_count=2,
+        mem_size_mib=256,
+    )
+    vm.add_net_iface()
+
+    free_page_reporting = method == "reporting"
+    free_page_hinting = method == "hinting"
+
+    # Add a memory balloon with stats enabled.
+    vm.api.balloon.put(
+        amount_mib=0,
+        deflate_on_oom=True,
+        stats_polling_interval_s=STATS_POLLING_INTERVAL_S,
+        free_page_reporting=free_page_reporting,
+        free_page_hinting=free_page_hinting,
+    )
+
+    vm.start()
+
+    vm.ssh.check_output(
+        "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
+    )
+
+    time.sleep(1)
+
+    firecracker_pid = vm.firecracker_pid
+
+    # Check memory usage.
+    first_reading = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    _, pid, _ = vm.ssh.check_output("pidof fast_page_fault_helper")
+    # Kill the application which will free the held memory
+    vm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")
+    time.sleep(2)
+
+    if free_page_hinting:
+        vm.api.balloon_hinting_start.patch()
+
+    # Check memory usage again.
+    second_reading = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    # There should be a reduction in RSS, but it's inconsistent.
+    # We only test that the reduction happens.
+    assert first_reading > second_reading
+
+    snapshot = vm.snapshot_full()
+    microvm = microvm_factory.build_from_snapshot(snapshot)
+
+    firecracker_pid = microvm.firecracker_pid
+
+    microvm.ssh.check_output(
+        "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
+    )
+
+    time.sleep(1)
+
+    # Check memory usage.
+    third_reading = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    _, pid, _ = microvm.ssh.check_output("pidof fast_page_fault_helper")
+    # Kill the application which will free the held memory
+    microvm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")
+    time.sleep(2)
+
+    if free_page_hinting:
+        microvm.api.balloon_hinting_start.patch()
+
+    # Check memory usage again.
+    fourth_reading = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    # There should be a reduction in RSS, but it's inconsistent.
+    # We only test that the reduction happens.
+    assert third_reading > fourth_reading
+
+
+@pytest.mark.parametrize("method", ["traditional", "hinting", "reporting"])
+def test_memory_scrub(uvm_plain_any, method):
     """
     Test that the memory is zeroed after deflate.
     """
@@ -543,29 +659,43 @@ def test_memory_scrub(uvm_plain_any):
     microvm.basic_config(vcpu_count=2, mem_size_mib=256)
     microvm.add_net_iface()
 
+    free_page_reporting = method == "reporting"
+    free_page_hinting = method == "hinting"
+
     # Add a memory balloon with stats enabled.
     microvm.api.balloon.put(
-        amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1
+        amount_mib=0,
+        deflate_on_oom=True,
+        stats_polling_interval_s=1,
+        free_page_reporting=free_page_reporting,
+        free_page_hinting=free_page_hinting,
     )
 
     microvm.start()
 
     # Dirty 60MB of pages.
     make_guest_dirty_memory(microvm.ssh, amount_mib=60)
 
-    # Now inflate the balloon with 60MB of pages.
-    microvm.api.balloon.patch(amount_mib=60)
+    if method == "traditional":
+        # Now inflate the balloon with 60MB of pages.
+        microvm.api.balloon.patch(amount_mib=60)
+    elif method == "hinting":
+        time.sleep(1)
+        microvm.api.balloon_hinting_start.patch()
+    elif method == "reporting":
+        # Reporting can take up to 2 seconds to complete
+        time.sleep(2)
 
     # Get the firecracker pid, and open an ssh connection.
     firecracker_pid = microvm.firecracker_pid
 
     # Wait for the inflate to complete.
     _ = get_stable_rss_mem_by_pid(firecracker_pid)
 
-    # Deflate the balloon completely.
-    microvm.api.balloon.patch(amount_mib=0)
-
-    # Wait for the deflate to complete.
-    _ = get_stable_rss_mem_by_pid(firecracker_pid)
+    if method == "traditional":
+        # Deflate the balloon completely.
+        microvm.api.balloon.patch(amount_mib=0)
+        # Wait for the deflate to complete.
+        _ = get_stable_rss_mem_by_pid(firecracker_pid)
 
     microvm.ssh.check_output("/usr/local/bin/readmem {} {}".format(60, 1))
diff --git a/tests/integration_tests/performance/test_balloon.py b/tests/integration_tests/performance/test_balloon.py
@@ -0,0 +1,149 @@
+# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for guest-side operations on /balloon resources."""
+
+import concurrent.futures
+import signal
+import time
+
+import pytest
+
+from framework.utils import track_cpu_utilization
+
+NS_IN_MSEC = 1_000_000
+
+
+def trigger_page_fault_run(vm):
+    """
+    Clears old data and starts the fast_page_fault_helper script
+    """
+    vm.ssh.check_output(
+        "rm -f /tmp/fast_page_fault_helper.out && /usr/local/bin/fast_page_fault_helper -s"
+    )
+
+
+def get_page_fault_duration(vm):
+    """
+    Waits for the performance data to be available and will read the duration
+    """
+    _, duration, _ = vm.ssh.check_output(
+        "while [ ! -f /tmp/fast_page_fault_helper.out ]; do sleep 1; done; cat /tmp/fast_page_fault_helper.out"
+    )
+    return duration
+
+
+@pytest.mark.parametrize("method", ["reporting", "hinting"])
+@pytest.mark.nonci
+def test_hinting_reporting_cpu(
+    microvm_factory, guest_kernel_linux_6_1, rootfs, method, metrics
+):
+    """
+    Measure the CPU usage when running free page reporting and hinting
+    """
+    test_microvm = microvm_factory.build(
+        guest_kernel_linux_6_1, rootfs, pci=True, monitor_memory=False
+    )
+    test_microvm.spawn(emit_metrics=False)
+    test_microvm.basic_config(vcpu_count=2, mem_size_mib=1024)
+    test_microvm.add_net_iface()
+
+    free_page_reporting = method == "reporting"
+    free_page_hinting = method == "hinting"
+    # Add a deflated memory balloon.
+    test_microvm.api.balloon.put(
+        amount_mib=0,
+        deflate_on_oom=False,
+        stats_polling_interval_s=0,
+        free_page_reporting=free_page_reporting,
+        free_page_hinting=free_page_hinting,
+    )
+    test_microvm.start()
+    test_microvm.pin_threads(0)
+
+    metrics.set_dimensions(
+        {
+            "performance_test": "test_balloon_cpu",
+            # "huge_pages": str(huge_pages),
+            "method": method,
+            **test_microvm.dimensions,
+        }
+    )
+
+    test_microvm.ssh.check_output(
+        "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
+    )
+
+    # Give helper time to initialize
+    time.sleep(5)
+    _, pid, _ = test_microvm.ssh.check_output("pidof fast_page_fault_helper")
+    test_microvm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")
+
+    cpu_util = None
+
+    if free_page_reporting:
+        cpu_util = track_cpu_utilization(test_microvm.firecracker_pid, 5, 0)
+    else:
+        test_microvm.ssh.check_output(
+            "while [ ! -f /tmp/fast_page_fault_helper.out ]; do sleep 1; done;"
+        )
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            cpu_load_future = executor.submit(
+                track_cpu_utilization,
+                test_microvm.firecracker_pid,
+                2,
+                omit=0,
+            )
+            test_microvm.api.balloon_hinting_start.patch()
+            cpu_util = cpu_load_future.result()
+
+    for thread_name, values in cpu_util.items():
+        for value in values:
+            metrics.put_metric(f"cpu_utilization_{thread_name}", value, "Percent")
+
+
+@pytest.mark.parametrize("sleep_duration", [0, 1, 30])
+@pytest.mark.nonci
+def test_hinting_fault_latency(
+    microvm_factory, guest_kernel_linux_6_1, rootfs, metrics, sleep_duration
+):
+    """
+    Measure the overhead of running free page reporting with allocation heavy
+    workloads.
+
+    Test with different sleep intervals to measure the effect
+    depending on frequency
+    """
+    runs = 5
+    test_microvm = microvm_factory.build(
+        guest_kernel_linux_6_1, rootfs, pci=True, monitor_memory=False
+    )
+    test_microvm.spawn(emit_metrics=False)
+    test_microvm.basic_config(vcpu_count=2, mem_size_mib=1024)
+    test_microvm.add_net_iface()
+
+    # Add a deflated memory balloon.
+    test_microvm.api.balloon.put(
+        amount_mib=0,
+        deflate_on_oom=False,
+        stats_polling_interval_s=0,
+        free_page_reporting=True,
+    )
+    test_microvm.start()
+    test_microvm.pin_threads(0)
+
+    metrics.set_dimensions(
+        {
+            "performance_test": "test_hinting_fault_latency",
+            "sleep_duration": str(sleep_duration),
+            **test_microvm.dimensions,
+        }
+    )
+
+    for i in range(runs):
+        trigger_page_fault_run(test_microvm)
+        reporting_duration = int(get_page_fault_duration(test_microvm)) / NS_IN_MSEC
+        metrics.put_metric("latency", reporting_duration, "Milliseconds")
+
+        if sleep_duration > 0 and (i + 1 < runs):
+            time.sleep(sleep_duration)