Skip to content

Commit 96a2382

Browse files
committed
test: Add integration tests for hinting/reporting
Add integration tests for free page hinting and reporting, both functional and performance. The new functional tests ensure that hinting and reporting reduce the RSS as expected in the guest. The reduce-RSS test is updated to touch memory, reducing the chance of flakiness. Two new performance tests cover the balloon device: the first tracks the CPU overhead of hinting and reporting, and the second measures the faulting latency while reporting is running in the guest. Signed-off-by: Jack Thomson <jackabt@amazon.com>
1 parent 8fc6134 commit 96a2382

File tree

2 files changed

+294
-15
lines changed

2 files changed

+294
-15
lines changed

tests/integration_tests/functional/test_balloon.py

Lines changed: 145 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""Tests for guest-side operations on /balloon resources."""
44

55
import logging
6+
import signal
67
import time
78
from subprocess import TimeoutExpired
89

@@ -293,7 +294,8 @@ def test_reinflate_balloon(uvm_plain_any):
293294

294295

295296
# pylint: disable=C0103
296-
def test_size_reduction(uvm_plain_any):
297+
@pytest.mark.parametrize("method", ["traditional", "hinting", "reporting"])
298+
def test_size_reduction(uvm_plain_any, method):
297299
"""
298300
Verify that ballooning reduces RSS usage on a newly booted guest.
299301
"""
@@ -302,30 +304,60 @@ def test_size_reduction(uvm_plain_any):
302304
test_microvm.basic_config()
303305
test_microvm.add_net_iface()
304306

307+
traditional_balloon = method == "traditional"
308+
free_page_reporting = method == "reporting"
309+
free_page_hinting = method == "hinting"
310+
305311
# Add a memory balloon.
306312
test_microvm.api.balloon.put(
307-
amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=0
313+
amount_mib=0,
314+
deflate_on_oom=True,
315+
stats_polling_interval_s=0,
316+
free_page_reporting=free_page_reporting,
317+
free_page_hinting=free_page_hinting,
308318
)
309319

310320
# Start the microvm.
311321
test_microvm.start()
312322
firecracker_pid = test_microvm.firecracker_pid
313323

314-
# Check memory usage.
324+
get_stable_rss_mem_by_pid(firecracker_pid)
325+
326+
test_microvm.ssh.check_output(
327+
"nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
328+
)
329+
330+
time.sleep(1)
331+
315332
first_reading = get_stable_rss_mem_by_pid(firecracker_pid)
316333

334+
_, pid, _ = test_microvm.ssh.check_output("pidof fast_page_fault_helper")
335+
# Kill the application which will free the held memory
336+
test_microvm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")
337+
338+
# Sleep to allow guest to clean up
339+
time.sleep(1)
317340
# Have the guest drop its caches.
318341
test_microvm.ssh.run("sync; echo 3 > /proc/sys/vm/drop_caches")
319-
time.sleep(5)
342+
time.sleep(2)
320343

321344
# We take the initial reading of the RSS, then calculate the amount
322345
# we need to inflate the balloon with by subtracting it from the
323346
# VM size and adding an offset of 10 MiB in order to make sure we
324347
# get a lower reading than the initial one.
325348
inflate_size = 256 - int(first_reading / 1024) + 10
326349

327-
# Now inflate the balloon.
328-
test_microvm.api.balloon.patch(amount_mib=inflate_size)
350+
if traditional_balloon:
351+
# Now inflate the balloon
352+
test_microvm.api.balloon.patch(amount_mib=inflate_size)
353+
elif free_page_hinting:
354+
test_microvm.api.balloon_hinting_start.patch()
355+
356+
_ = get_stable_rss_mem_by_pid(firecracker_pid)
357+
358+
if traditional_balloon:
359+
# Deflate the balloon completely.
360+
test_microvm.api.balloon.patch(amount_mib=0)
329361

330362
# Check memory usage again.
331363
second_reading = get_stable_rss_mem_by_pid(firecracker_pid)
@@ -534,7 +566,91 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory):
534566
assert stats_after_snap["available_memory"] > latest_stats["available_memory"]
535567

536568

537-
def test_memory_scrub(uvm_plain_any):
569+
def _assert_rss_drops_after_helper_frees_memory(vm, start_hinting):
    """
    Assert that the Firecracker RSS shrinks once the guest helper frees memory.

    Starts fast_page_fault_helper in the guest, samples the stable RSS of the
    Firecracker process, signals the helper with SIGUSR1, optionally starts a
    hinting run through the API, and then asserts the new stable RSS is lower.

    :param vm: running microVM exposing `ssh`, `api` and `firecracker_pid`
    :param start_hinting: when true, issue a balloon hinting start request
        before taking the second RSS sample
    """
    vm.ssh.check_output(
        "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
    )

    # Give the helper a moment to come up before sampling memory usage.
    time.sleep(1)

    firecracker_pid = vm.firecracker_pid
    rss_before = get_stable_rss_mem_by_pid(firecracker_pid)

    _, pid, _ = vm.ssh.check_output("pidof fast_page_fault_helper")
    # Signal the application, which will free the held memory
    vm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")
    time.sleep(2)

    if start_hinting:
        vm.api.balloon_hinting_start.patch()

    rss_after = get_stable_rss_mem_by_pid(firecracker_pid)

    # There should be a reduction in RSS, but it's inconsistent.
    # We only test that the reduction happens.
    assert rss_before > rss_after


@pytest.mark.parametrize("method", ["reporting", "hinting"])
def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method):
    """
    Test that balloon hinting and reporting reduce RSS both before a full
    snapshot is taken and in the microVM built from that snapshot.
    """
    vm = uvm_plain_any
    vm.spawn()
    vm.basic_config(
        vcpu_count=2,
        mem_size_mib=256,
    )
    vm.add_net_iface()

    free_page_reporting = method == "reporting"
    free_page_hinting = method == "hinting"

    # Add a memory balloon with stats enabled.
    vm.api.balloon.put(
        amount_mib=0,
        deflate_on_oom=True,
        stats_polling_interval_s=STATS_POLLING_INTERVAL_S,
        free_page_reporting=free_page_reporting,
        free_page_hinting=free_page_hinting,
    )

    vm.start()

    # The RSS must drop on the original microVM...
    _assert_rss_drops_after_helper_frees_memory(vm, free_page_hinting)

    snapshot = vm.snapshot_full()
    microvm = microvm_factory.build_from_snapshot(snapshot)

    # ...and again on the microVM built from the snapshot.
    _assert_rss_drops_after_helper_frees_memory(microvm, free_page_hinting)
650+
651+
652+
@pytest.mark.parametrize("method", ["traditional", "hinting", "reporting"])
653+
def test_memory_scrub(uvm_plain_any, method):
538654
"""
539655
Test that the memory is zeroed after deflate.
540656
"""
@@ -543,29 +659,43 @@ def test_memory_scrub(uvm_plain_any):
543659
microvm.basic_config(vcpu_count=2, mem_size_mib=256)
544660
microvm.add_net_iface()
545661

662+
free_page_reporting = method == "reporting"
663+
free_page_hinting = method == "hinting"
664+
546665
# Add a memory balloon with stats enabled.
547666
microvm.api.balloon.put(
548-
amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1
667+
amount_mib=0,
668+
deflate_on_oom=True,
669+
stats_polling_interval_s=1,
670+
free_page_reporting=free_page_reporting,
671+
free_page_hinting=free_page_hinting,
549672
)
550673

551674
microvm.start()
552675

553676
# Dirty 60MB of pages.
554677
make_guest_dirty_memory(microvm.ssh, amount_mib=60)
555678

556-
# Now inflate the balloon with 60MB of pages.
557-
microvm.api.balloon.patch(amount_mib=60)
679+
if method == "traditional":
680+
# Now inflate the balloon with 60MB of pages.
681+
microvm.api.balloon.patch(amount_mib=60)
682+
elif method == "hinting":
683+
time.sleep(1)
684+
microvm.api.balloon_hinting_start.patch()
685+
elif method == "reporting":
686+
# Reporting can take up to 2 seconds to complete
687+
time.sleep(2)
558688

559689
# Get the firecracker pid, and open an ssh connection.
560690
firecracker_pid = microvm.firecracker_pid
561691

562692
# Wait for the inflate to complete.
563693
_ = get_stable_rss_mem_by_pid(firecracker_pid)
564694

565-
# Deflate the balloon completely.
566-
microvm.api.balloon.patch(amount_mib=0)
567-
568-
# Wait for the deflate to complete.
569-
_ = get_stable_rss_mem_by_pid(firecracker_pid)
695+
if method == "traditional":
696+
# Deflate the balloon completely.
697+
microvm.api.balloon.patch(amount_mib=0)
698+
# Wait for the deflate to complete.
699+
_ = get_stable_rss_mem_by_pid(firecracker_pid)
570700

571701
microvm.ssh.check_output("/usr/local/bin/readmem {} {}".format(60, 1))
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
"""Tests for guest-side operations on /balloon resources."""
4+
5+
import concurrent
6+
import signal
7+
import time
8+
9+
import pytest
10+
11+
from framework.utils import track_cpu_utilization
12+
13+
NS_IN_MSEC = 1_000_000
14+
15+
16+
def trigger_page_fault_run(vm):
    """
    Remove any previous result file in the guest, then run
    fast_page_fault_helper with the -s flag
    """
    command = (
        "rm -f /tmp/fast_page_fault_helper.out && /usr/local/bin/fast_page_fault_helper -s"
    )
    vm.ssh.check_output(command)
23+
24+
25+
def get_page_fault_duration(vm):
    """
    Poll once per second until the helper's output file exists in the guest,
    then return the file's contents as read over SSH
    """
    wait_and_read = "while [ ! -f /tmp/fast_page_fault_helper.out ]; do sleep 1; done; cat /tmp/fast_page_fault_helper.out"
    _, raw_duration, _ = vm.ssh.check_output(wait_and_read)
    return raw_duration
33+
34+
35+
@pytest.mark.parametrize("method", ["reporting", "hinting"])
@pytest.mark.nonci
def test_hinting_reporting_cpu(
    microvm_factory, guest_kernel_linux_6_1, rootfs, method, metrics
):
    """
    Measure the CPU usage when running free page reporting and hinting

    For reporting, CPU utilization is sampled right after the helper has been
    signalled. For hinting, the test first waits for the helper's output file
    to appear, then samples CPU utilization in a background thread while a
    hinting run is started through the API.
    """
    # A bare `import concurrent` does not load the `concurrent.futures`
    # submodule, so import the executor explicitly where it is used.
    # pylint: disable=import-outside-toplevel
    from concurrent.futures import ThreadPoolExecutor

    test_microvm = microvm_factory.build(
        guest_kernel_linux_6_1, rootfs, pci=True, monitor_memory=False
    )
    test_microvm.spawn(emit_metrics=False)
    test_microvm.basic_config(vcpu_count=2, mem_size_mib=1024)
    test_microvm.add_net_iface()

    free_page_reporting = method == "reporting"
    free_page_hinting = method == "hinting"
    # Add a deflated memory balloon.
    test_microvm.api.balloon.put(
        amount_mib=0,
        deflate_on_oom=False,
        stats_polling_interval_s=0,
        free_page_reporting=free_page_reporting,
        free_page_hinting=free_page_hinting,
    )
    test_microvm.start()
    test_microvm.pin_threads(0)

    metrics.set_dimensions(
        {
            "performance_test": "test_balloon_cpu",
            "method": method,
            **test_microvm.dimensions,
        }
    )

    test_microvm.ssh.check_output(
        "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 </dev/null &"
    )

    # Give helper time to initialize
    time.sleep(5)
    _, pid, _ = test_microvm.ssh.check_output("pidof fast_page_fault_helper")
    test_microvm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")

    if free_page_reporting:
        cpu_util = track_cpu_utilization(test_microvm.firecracker_pid, 5, 0)
    else:
        # Block until the helper has written its output file before
        # starting the hinting run.
        test_microvm.ssh.check_output(
            "while [ ! -f /tmp/fast_page_fault_helper.out ]; do sleep 1; done;"
        )

        # Sample CPU utilization in the background so the measurement
        # overlaps with the hinting run triggered below.
        with ThreadPoolExecutor() as executor:
            cpu_load_future = executor.submit(
                track_cpu_utilization,
                test_microvm.firecracker_pid,
                2,
                omit=0,
            )
            test_microvm.api.balloon_hinting_start.patch()
            cpu_util = cpu_load_future.result()

    for thread_name, values in cpu_util.items():
        for value in values:
            metrics.put_metric(f"cpu_utilization_{thread_name}", value, "Percent")
103+
104+
105+
@pytest.mark.parametrize("sleep_duration", [0, 1, 30])
@pytest.mark.nonci
def test_hinting_fault_latency(
    microvm_factory, guest_kernel_linux_6_1, rootfs, metrics, sleep_duration
):
    """
    Measure the page fault latency of an allocation heavy workload while
    free page reporting is enabled on the balloon device.

    The workload is run several times, pausing for `sleep_duration`
    seconds between runs, to see how the interval between runs affects
    the measured latency.
    """
    total_runs = 5

    vm = microvm_factory.build(
        guest_kernel_linux_6_1, rootfs, pci=True, monitor_memory=False
    )
    vm.spawn(emit_metrics=False)
    vm.basic_config(vcpu_count=2, mem_size_mib=1024)
    vm.add_net_iface()

    # Configure a deflated balloon with free page reporting switched on.
    vm.api.balloon.put(
        amount_mib=0,
        deflate_on_oom=False,
        stats_polling_interval_s=0,
        free_page_reporting=True,
    )
    vm.start()
    vm.pin_threads(0)

    dimensions = {
        "performance_test": "test_hinting_fault_latency",
        "sleep_duration": str(sleep_duration),
        **vm.dimensions,
    }
    metrics.set_dimensions(dimensions)

    for run_number in range(1, total_runs + 1):
        trigger_page_fault_run(vm)
        raw_duration = int(get_page_fault_duration(vm))
        metrics.put_metric("latency", raw_duration / NS_IN_MSEC, "Milliseconds")

        # Pause between runs only; nothing follows the final run.
        is_last_run = run_number == total_runs
        if sleep_duration > 0 and not is_last_run:
            time.sleep(sleep_duration)

0 commit comments

Comments
 (0)