test: Add integration tests for hinting/reporting

JackThomson2 · JackThomson2 · commit 6f09bdfa95e3 · 2025-10-24T14:34:28.000Z
Add integration tests for free page hinting and reporting, both
functional and performance tests.

Update fast_page_fault_helper so it can run in a one-shot mode, which
does not wait for a signal before taking the measurement.

Add functional tests to ensure that hinting and reporting reduce the
RSS as expected in the guest. Update the size reduction test to touch
memory first, to reduce the chance of flakiness.

Add performance tests for the balloon device. The first tracks the CPU
overhead of hinting and reporting. The second measures the fault
latency while reporting is running in the guest.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
diff --git a/resources/overlay/usr/local/bin/fast_page_fault_helper.c b/resources/overlay/usr/local/bin/fast_page_fault_helper.c
@@ -16,6 +16,7 @@
 #include <sys/mman.h> // mmap
 #include <time.h>     // clock_gettime
 #include <fcntl.h>    // open
+#include <getopt.h>   // getopt
 
 #define MEM_SIZE_MIB (128 * 1024 * 1024)
 #define NANOS_PER_SEC 1000000000
@@ -30,20 +31,39 @@ void touch_memory(void *mem, size_t size, char val) {
 
 int main() {
     sigset_t set;
-    int signal;
+    int signal, character;
     void *ptr;
     struct timespec start, end;
     long duration_nanos;
     FILE *out_file;
 
-    sigemptyset(&set);
-    if (sigaddset(&set, SIGUSR1) == -1) {
-        perror("sigaddset");
-        return 1;
+    char *options = 0;
+    int longindex = 0;
+    int signal_wait = 1;
+
+    struct option longopts[] = {
+      {"nosignal", no_argument, NULL, 's'},
+      {NULL, 0, NULL, 0}
+    };
+
+    while((character = getopt_long(argc, argv, "s", longopts, &longindex)) != -1) {
+      switch (character) {
+        case 's':
+          signal_wait = 0;
+          break;
+      }
     }
-    if (sigprocmask(SIG_BLOCK, &set, NULL) == -1)  {
-        perror("sigprocmask");
-        return 1;
+
+    if (signal_wait) {
+      sigemptyset(&set);
+      if (sigaddset(&set, SIGUSR1) == -1) {
+          perror("sigaddset");
+          return 1;
+      }
+      if (sigprocmask(SIG_BLOCK, &set, NULL) == -1)  {
+          perror("sigprocmask");
+          return 1;
+      }
     }
 
     ptr = mmap(NULL, MEM_SIZE_MIB, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
@@ -53,9 +73,11 @@ int main() {
         return 1;
     }
 
-    touch_memory(ptr, MEM_SIZE_MIB, 1);
+    if (signal_wait) {
+      touch_memory(ptr, MEM_SIZE_MIB, 1);
 
-    sigwait(&set, &signal);
+      sigwait(&set, &signal);
+    }
 
     clock_gettime(CLOCK_BOOTTIME, &start);
     touch_memory(ptr, MEM_SIZE_MIB, 2);
@@ -76,4 +98,4 @@ int main() {
     }
 
     return 0;
-}
+}
diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
@@ -4,6 +4,7 @@
 
 import logging
 import time
+import signal
 from subprocess import TimeoutExpired
 
 import pytest
@@ -13,7 +14,6 @@
 
 STATS_POLLING_INTERVAL_S = 1
 
-
 def get_stable_rss_mem_by_pid(pid, percentage_delta=1):
     """
     Get the RSS memory that a guest uses, given the pid of the guest.
@@ -83,7 +83,6 @@ def make_guest_dirty_memory(ssh_connection, amount_mib=32):
 
     time.sleep(5)
 
-
 def _test_rss_memory_lower(test_microvm):
     """Check inflating the balloon makes guest use less rss memory."""
     # Get the firecracker pid, and open an ssh connection.
@@ -293,7 +292,8 @@ def test_reinflate_balloon(uvm_plain_any):
 
 
 # pylint: disable=C0103
-def test_size_reduction(uvm_plain_any):
+@pytest.mark.parametrize("method", ["traditional", "hinting", "reporting"])
+def test_size_reduction(uvm_plain_any, method):
     """
     Verify that ballooning reduces RSS usage on a newly booted guest.
     """
@@ -302,30 +302,57 @@ def test_size_reduction(uvm_plain_any):
     test_microvm.basic_config()
     test_microvm.add_net_iface()
 
+    traditional_balloon = method == "traditional"
+    free_page_reporting = method == "reporting"
+    free_page_hinting = method == "hinting"
+
     # Add a memory balloon.
     test_microvm.api.balloon.put(
-        amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=0
+        amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=0,
+        free_page_reporting=free_page_reporting, free_page_hinting=free_page_hinting
     )
 
     # Start the microvm.
     test_microvm.start()
     firecracker_pid = test_microvm.firecracker_pid
 
-    # Check memory usage.
+    get_stable_rss_mem_by_pid(firecracker_pid)
+
+    test_microvm.ssh.check_output(
+        "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
+    )
+
+    time.sleep(1)
+
     first_reading = get_stable_rss_mem_by_pid(firecracker_pid)
 
+    _, pid, _ = test_microvm.ssh.check_output("pidof fast_page_fault_helper")
+    # Signal the helper to proceed; it then exits, freeing the held memory
+    test_microvm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")
+
+    # Sleep to allow guest to clean up
+    time.sleep(1)
     # Have the guest drop its caches.
     test_microvm.ssh.run("sync; echo 3 > /proc/sys/vm/drop_caches")
-    time.sleep(5)
+    time.sleep(2)
 
     # We take the initial reading of the RSS, then calculate the amount
     # we need to inflate the balloon with by subtracting it from the
     # VM size and adding an offset of 10 MiB in order to make sure we
     # get a lower reading than the initial one.
     inflate_size = 256 - int(first_reading / 1024) + 10
 
-    # Now inflate the balloon.
-    test_microvm.api.balloon.patch(amount_mib=inflate_size)
+    if traditional_balloon:
+            # Now inflate the balloon
+            test_microvm.api.balloon.patch(amount_mib=inflate_size)
+    elif free_page_hinting:
+            test_microvm.api.balloon_hinting_start.patch()
+
+    _ = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    if traditional_balloon:
+        # Deflate the balloon completely.
+        test_microvm.api.balloon.patch(amount_mib=0)
 
     # Check memory usage again.
     second_reading = get_stable_rss_mem_by_pid(firecracker_pid)
@@ -534,7 +561,91 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory):
     assert stats_after_snap["available_memory"] > latest_stats["available_memory"]
 
 
-def test_memory_scrub(uvm_plain_any):
+@pytest.mark.parametrize("method", ["reporting", "hinting"])
+def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method):
+    """
+    Test that the balloon hinting and reporting works after pause/resume.
+    """
+    vm = uvm_plain_any
+    vm.spawn()
+    vm.basic_config(
+        vcpu_count=2,
+        mem_size_mib=256,
+    )
+    vm.add_net_iface()
+
+    free_page_reporting = method == "reporting"
+    free_page_hinting = method == "hinting"
+
+    # Add a memory balloon with stats enabled.
+    vm.api.balloon.put(
+        amount_mib=0,
+        deflate_on_oom=True,
+        stats_polling_interval_s=STATS_POLLING_INTERVAL_S,
+        free_page_reporting=free_page_reporting, free_page_hinting=free_page_hinting
+    )
+
+    vm.start()
+
+    vm.ssh.check_output(
+        "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
+    )
+
+    time.sleep(1)
+
+    # Get the firecracker pid, and open an ssh connection.
+    firecracker_pid = vm.firecracker_pid
+
+    # Check memory usage.
+    first_reading = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    _, pid, _ = vm.ssh.check_output("pidof fast_page_fault_helper")
+    # Signal the helper to proceed; it then exits, freeing the held memory
+    vm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")
+    time.sleep(2)
+
+    if free_page_hinting:
+        vm.api.balloon_hinting_start.patch()
+
+    # Check memory usage again.
+    second_reading = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    # There should be a reduction in RSS, but it's inconsistent.
+    # We only test that the reduction happens.
+    assert first_reading > second_reading
+
+    snapshot = vm.snapshot_full()
+    microvm = microvm_factory.build_from_snapshot(snapshot)
+
+    firecracker_pid = microvm.firecracker_pid
+
+    microvm.ssh.check_output(
+        "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
+    )
+
+    time.sleep(1)
+
+    # Check memory usage.
+    third_reading = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    _, pid, _ = microvm.ssh.check_output("pidof fast_page_fault_helper")
+    # Signal the helper to proceed; it then exits, freeing the held memory
+    microvm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")
+    time.sleep(2)
+
+    if free_page_hinting:
+        microvm.api.balloon_hinting_start.patch()
+
+    # Check memory usage again.
+    fourth_reading = get_stable_rss_mem_by_pid(firecracker_pid)
+
+    # There should be a reduction in RSS, but it's inconsistent.
+    # We only test that the reduction happens.
+    assert third_reading > fourth_reading
+
+
+@pytest.mark.parametrize("method", ["none", "hinting", "reporting"])
+def test_memory_scrub(uvm_plain_any, method):
     """
     Test that the memory is zeroed after deflate.
     """
@@ -543,29 +654,39 @@ def test_memory_scrub(uvm_plain_any):
     microvm.basic_config(vcpu_count=2, mem_size_mib=256)
     microvm.add_net_iface()
 
+    free_page_reporting = method == "reporting"
+    free_page_hinting = method == "hinting"
+
     # Add a memory balloon with stats enabled.
     microvm.api.balloon.put(
-        amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1
+        amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1,
+        free_page_reporting=free_page_reporting, free_page_hinting=free_page_hinting
     )
 
     microvm.start()
 
     # Dirty 60MB of pages.
     make_guest_dirty_memory(microvm.ssh, amount_mib=60)
 
-    # Now inflate the balloon with 60MB of pages.
-    microvm.api.balloon.patch(amount_mib=60)
+    if method == "none":
+            # Now inflate the balloon with 60MB of pages.
+            microvm.api.balloon.patch(amount_mib=60)
+    elif method == "hinting":
+            time.sleep(1)
+            microvm.api.balloon_hinting_start.patch()
+    elif method == "reporting":
+            time.sleep(2)
 
     # Get the firecracker pid, and open an ssh connection.
     firecracker_pid = microvm.firecracker_pid
 
     # Wait for the inflate to complete.
     _ = get_stable_rss_mem_by_pid(firecracker_pid)
 
-    # Deflate the balloon completely.
-    microvm.api.balloon.patch(amount_mib=0)
-
-    # Wait for the deflate to complete.
-    _ = get_stable_rss_mem_by_pid(firecracker_pid)
+    if method == "none":
+        # Deflate the balloon completely.
+        microvm.api.balloon.patch(amount_mib=0)
+        # Wait for the deflate to complete.
+        _ = get_stable_rss_mem_by_pid(firecracker_pid)
 
     microvm.ssh.check_output("/usr/local/bin/readmem {} {}".format(60, 1))
diff --git a/tests/integration_tests/performance/test_balloon.py b/tests/integration_tests/performance/test_balloon.py