Skip to content

Commit 7e54481

Browse files
committed
remove try catch block
1 parent 23fb7d9 commit 7e54481

File tree

1 file changed

+143
-157
lines changed

1 file changed

+143
-157
lines changed

lisa/microsoft/testsuites/cloud_hypervisor/ch_tests_tool.py

Lines changed: 143 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ class CloudHypervisorTests(Tool):
8585
repo_root: PurePath
8686

8787
# Perf-stable profile configuration. Set to "" to disable.
88-
perf_profile: str = "perf-stable"
88+
perf_profile: str = "perf-stable"
8989
perf_numa_node: int = 0
9090
perf_warmup_seconds: int = 30
9191
perf_mq_test_timeout: int = 90
@@ -1506,118 +1506,110 @@ def _setup_host_perf_policies(self) -> None:
15061506
- irqbalance → ON
15071507
- Reserve hugepages (1GB fallback to 2MB) on selected NUMA node
15081508
"""
1509-
try:
1510-
# CPU governor → performance
1511-
self.node.execute(
1512-
"for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do "
1513-
"echo performance | sudo tee $gov >/dev/null 2>&1 || true; done",
1514-
shell=True,
1515-
sudo=True,
1516-
)
1509+
# CPU governor → performance
1510+
self.node.execute(
1511+
"for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do "
1512+
"echo performance | sudo tee $gov >/dev/null 2>&1 || true; done",
1513+
shell=True,
1514+
sudo=True,
1515+
)
15171516

1518-
# Turbo → off (Intel + AMD)
1519-
# Intel: /sys/devices/system/cpu/intel_pstate/no_turbo
1520-
# AMD: /sys/devices/system/cpu/cpufreq/boost
1521-
self.node.execute(
1522-
"echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo "
1523-
">/dev/null 2>&1 || "
1524-
"echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost "
1525-
">/dev/null 2>&1 || true",
1526-
shell=True,
1527-
sudo=True,
1528-
)
1517+
# Turbo → off (Intel + AMD)
1518+
# Intel: /sys/devices/system/cpu/intel_pstate/no_turbo
1519+
# AMD: /sys/devices/system/cpu/cpufreq/boost
1520+
self.node.execute(
1521+
"echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo "
1522+
">/dev/null 2>&1 || "
1523+
"echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost "
1524+
">/dev/null 2>&1 || true",
1525+
shell=True,
1526+
sudo=True,
1527+
)
1528+
1529+
# C-states ≤ C1E (Intel-specific, best-effort)
1530+
self.node.execute(
1531+
"echo 1 | sudo tee /sys/module/intel_idle/parameters/max_cstate "
1532+
">/dev/null 2>&1 || true",
1533+
shell=True,
1534+
sudo=True,
1535+
)
1536+
1537+
# THP → never (host)
1538+
self.node.execute(
1539+
"echo never | sudo tee /sys/kernel/mm/transparent_hugepage/enabled "
1540+
">/dev/null 2>&1 || true",
1541+
shell=True,
1542+
sudo=True,
1543+
)
1544+
1545+
# irqbalance → ON
1546+
self.node.execute(
1547+
"sudo systemctl enable --now irqbalance 2>/dev/null || "
1548+
"sudo service irqbalance start 2>/dev/null || true",
1549+
shell=True,
1550+
sudo=True,
1551+
)
1552+
1553+
# Reserve hugepages (try 1GB first, fallback to 2MB)
1554+
hugepage_1g_path = (
1555+
f"/sys/devices/system/node/node{self._numa_node}/"
1556+
f"hugepages/hugepages-1048576kB/nr_hugepages"
1557+
)
1558+
hugepage_2m_path = (
1559+
f"/sys/devices/system/node/node{self._numa_node}/"
1560+
f"hugepages/hugepages-2048kB/nr_hugepages"
1561+
)
15291562

1530-
# C-states ≤ C1E (Intel-specific, best-effort)
1563+
# Check if 1GB hugepages are available
1564+
result = self.node.execute(
1565+
f"[ -f {hugepage_1g_path} ]",
1566+
shell=True,
1567+
)
1568+
1569+
if result.exit_code == 0:
1570+
# Try 1GB hugepages (16GB total)
15311571
self.node.execute(
1532-
"echo 1 | sudo tee /sys/module/intel_idle/parameters/max_cstate "
1533-
">/dev/null 2>&1 || true",
1572+
f"echo 16 | sudo tee {hugepage_1g_path} >/dev/null 2>&1 || true",
15341573
shell=True,
15351574
sudo=True,
15361575
)
1537-
1538-
# THP → never (host)
1539-
self.node.execute(
1540-
"echo never | sudo tee /sys/kernel/mm/transparent_hugepage/enabled "
1541-
">/dev/null 2>&1 || true",
1576+
# Verify allocation
1577+
verify = self.node.execute(
1578+
f"cat {hugepage_1g_path} 2>/dev/null || echo 0",
15421579
shell=True,
1543-
sudo=True,
15441580
)
1545-
1546-
# irqbalance → ON
1581+
allocated = int(verify.stdout.strip() or "0")
1582+
if allocated >= 16:
1583+
self._log.debug(f"Reserved 16GB (1GB pages) on node{self._numa_node}")
1584+
else:
1585+
self._log.debug(
1586+
f"Only {allocated}GB (1GB pages) allocated (requested 16GB)"
1587+
)
1588+
else:
1589+
# Fallback to 2MB hugepages (8192 pages = 16GB)
15471590
self.node.execute(
1548-
"sudo systemctl enable --now irqbalance 2>/dev/null || "
1549-
"sudo service irqbalance start 2>/dev/null || true",
1591+
f"echo 8192 | sudo tee {hugepage_2m_path} >/dev/null 2>&1 || true",
15501592
shell=True,
15511593
sudo=True,
15521594
)
1553-
1554-
# Reserve hugepages (try 1GB first, fallback to 2MB)
1555-
hugepage_1g_path = (
1556-
f"/sys/devices/system/node/node{self._numa_node}/"
1557-
f"hugepages/hugepages-1048576kB/nr_hugepages"
1558-
)
1559-
hugepage_2m_path = (
1560-
f"/sys/devices/system/node/node{self._numa_node}/"
1561-
f"hugepages/hugepages-2048kB/nr_hugepages"
1562-
)
1563-
1564-
# Check if 1GB hugepages are available
1565-
result = self.node.execute(
1566-
f"[ -f {hugepage_1g_path} ] && echo 'yes' || echo 'no'",
1595+
# Verify allocation
1596+
verify = self.node.execute(
1597+
f"cat {hugepage_2m_path} 2>/dev/null || echo 0",
15671598
shell=True,
15681599
)
1569-
1570-
if "yes" in result.stdout:
1571-
# Try 1GB hugepages (16GB total)
1572-
self.node.execute(
1573-
f"echo 16 | sudo tee {hugepage_1g_path} >/dev/null 2>&1 || true",
1574-
shell=True,
1575-
sudo=True,
1576-
)
1577-
# Verify allocation
1578-
verify = self.node.execute(
1579-
f"cat {hugepage_1g_path} 2>/dev/null || echo 0",
1580-
shell=True,
1581-
)
1582-
allocated = int(verify.stdout.strip() or "0")
1583-
if allocated >= 16:
1584-
self._log.debug(
1585-
f"Reserved 16GB (1GB pages) on node{self._numa_node}"
1586-
)
1587-
else:
1588-
self._log.debug(
1589-
f"Only {allocated}GB (1GB pages) allocated (requested 16GB)"
1590-
)
1600+
allocated = int(verify.stdout.strip() or "0")
1601+
# Convert 2MiB pages to GiB
1602+
allocated_gib = allocated * 2 / 1024
1603+
if allocated >= 8192:
1604+
self._log.debug(f"Reserved 16GB (2MB pages) on node{self._numa_node}")
15911605
else:
1592-
# Fallback to 2MB hugepages (8192 pages = 16GB)
1593-
self.node.execute(
1594-
f"echo 8192 | sudo tee {hugepage_2m_path} >/dev/null 2>&1 || true",
1595-
shell=True,
1596-
sudo=True,
1606+
self._log.debug(
1607+
f"Only {allocated_gib:.2f} GiB (2MiB pages) allocated "
1608+
f"(requested 16 GiB)"
15971609
)
1598-
# Verify allocation
1599-
verify = self.node.execute(
1600-
f"cat {hugepage_2m_path} 2>/dev/null || echo 0",
1601-
shell=True,
1602-
)
1603-
allocated = int(verify.stdout.strip() or "0")
1604-
# Convert 2MiB pages to GiB
1605-
allocated_gib = allocated * 2 / 1024
1606-
if allocated >= 8192:
1607-
self._log.debug(
1608-
f"Reserved 16GB (2MB pages) on node{self._numa_node}"
1609-
)
1610-
else:
1611-
self._log.debug(
1612-
f"Only {allocated_gib:.2f} GiB (2MiB pages) allocated "
1613-
f"(requested 16 GiB)"
1614-
)
16151610

1616-
# Export NUMA node for CH launcher
1617-
os.environ["CH_NUMA_NODE"] = str(self._numa_node)
1618-
1619-
except Exception as e:
1620-
self._log.debug(f"Host perf setup warning (non-fatal): {e}")
1611+
# Export NUMA node for CH launcher
1612+
os.environ["CH_NUMA_NODE"] = str(self._numa_node)
16211613

16221614
def _run_anchor_gate(self) -> None:
16231615
"""
@@ -1626,81 +1618,75 @@ def _run_anchor_gate(self) -> None:
16261618
Validates against EWMA baseline (±5%). Retries once on failure.
16271619
Uses exponential weighted moving average (alpha=0.3) for stability.
16281620
"""
1629-
try:
1630-
# 5-8s CPU/mem anchor using dd
1631-
start = time.time()
1632-
self.node.execute(
1633-
"dd if=/dev/zero of=/dev/null bs=1M count=2048 status=none",
1634-
shell=True,
1635-
timeout=15,
1636-
)
1637-
elapsed = time.time() - start
1621+
# 5-8s CPU/mem anchor using dd
1622+
start = time.time()
1623+
self.node.execute(
1624+
"dd if=/dev/zero of=/dev/null bs=1M count=2048 status=none",
1625+
shell=True,
1626+
timeout=15,
1627+
)
1628+
elapsed = time.time() - start
16381629

1639-
# Calculate throughput (GB/s) - 2048 MB = 2.048 GB
1640-
throughput = 2.048 / elapsed
1630+
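# Calculate throughput (GB/s) - 2048 MB = 2.048 GB
# NOTE(review): dd's bs=1M is 1 MiB (1048576 bytes), so 2048 blocks is
# 2 GiB (~2.147 GB); the 2.048 constant under-reports absolute throughput
# by ~5%. The EWMA deviation check is relative, so gating is unaffected.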
1631+
throughput = 2.048 / elapsed
16411632

1642-
# EWMA validation (skip first run to establish baseline)
1643-
if self._anchor_ewma_count > 0:
1644-
deviation = abs(throughput - self._anchor_ewma) / self._anchor_ewma
1633+
# EWMA validation (skip first run to establish baseline)
1634+
if self._anchor_ewma_count > 0:
1635+
deviation = abs(throughput - self._anchor_ewma) / self._anchor_ewma
16451636

1646-
if deviation > self.ANCHOR_DEVIATION_THRESHOLD:
1647-
self._log.debug(
1648-
f"Anchor gate FAILED: {deviation * 100:.1f}% deviation "
1649-
f"(expected: {self._anchor_ewma:.2f} GB/s, "
1650-
f"got: {throughput:.2f} GB/s)"
1651-
)
1637+
if deviation > self.ANCHOR_DEVIATION_THRESHOLD:
1638+
self._log.debug(
1639+
f"Anchor gate FAILED: {deviation * 100:.1f}% deviation "
1640+
f"(expected: {self._anchor_ewma:.2f} GB/s, "
1641+
f"got: {throughput:.2f} GB/s)"
1642+
)
16521643

1653-
# Retry once
1654-
self._log.debug("Retrying anchor gate...")
1655-
time.sleep(5)
1644+
# Retry once
1645+
self._log.debug("Retrying anchor gate...")
1646+
time.sleep(5)
16561647

1657-
start = time.time()
1658-
self.node.execute(
1659-
"dd if=/dev/zero of=/dev/null bs=1M count=2048 status=none",
1660-
shell=True,
1661-
timeout=15,
1662-
)
1663-
elapsed = time.time() - start
1664-
throughput = 2.048 / elapsed
1648+
start = time.time()
1649+
self.node.execute(
1650+
"dd if=/dev/zero of=/dev/null bs=1M count=2048 status=none",
1651+
shell=True,
1652+
timeout=15,
1653+
)
1654+
elapsed = time.time() - start
1655+
throughput = 2.048 / elapsed
16651656

1666-
deviation = abs(throughput - self._anchor_ewma) / self._anchor_ewma
1657+
deviation = abs(throughput - self._anchor_ewma) / self._anchor_ewma
16671658

1668-
if deviation > self.ANCHOR_DEVIATION_THRESHOLD:
1669-
self._log.debug(
1670-
f"Anchor gate FAILED on retry: "
1671-
f"{deviation * 100:.1f}% deviation"
1672-
)
1673-
self._log_thermal_health("anchor_failure")
1674-
# Don't fail the suite, but log prominently
1675-
else:
1676-
self._log.debug(
1677-
f"Anchor gate PASSED on retry: {throughput:.2f} GB/s"
1678-
)
1659+
if deviation > self.ANCHOR_DEVIATION_THRESHOLD:
1660+
self._log.debug(
1661+
f"Anchor gate FAILED on retry: "
1662+
f"{deviation * 100:.1f}% deviation"
1663+
)
1664+
self._log_thermal_health("anchor_failure")
1665+
# Don't fail the suite, but log prominently
16791666
else:
16801667
self._log.debug(
1681-
f"Anchor gate PASSED: {deviation * 100:.1f}% deviation"
1668+
f"Anchor gate PASSED on retry: {throughput:.2f} GB/s"
16821669
)
16831670
else:
1684-
self._log.debug("Anchor baseline established")
1685-
1686-
# Update EWMA (alpha=0.3 for responsiveness)
1687-
alpha = self.ANCHOR_EWMA_ALPHA
1688-
if self._anchor_ewma_count == 0:
1689-
self._anchor_ewma = throughput
1690-
else:
1691-
self._anchor_ewma = alpha * throughput + (1 - alpha) * self._anchor_ewma
1671+
self._log.debug(f"Anchor gate PASSED: {deviation * 100:.1f}% deviation")
1672+
else:
1673+
self._log.debug("Anchor baseline established")
16921674

1693-
self._anchor_ewma_count += 1
1675+
# Update EWMA (alpha=0.3 for responsiveness)
1676+
alpha = self.ANCHOR_EWMA_ALPHA
1677+
if self._anchor_ewma_count == 0:
1678+
self._anchor_ewma = throughput
1679+
else:
1680+
self._anchor_ewma = alpha * throughput + (1 - alpha) * self._anchor_ewma
16941681

1695-
# Log anchor measurements for correlation across runs
1696-
self._log.info(
1697-
f"[ANCHOR_METRICS] measurement={throughput:.3f} GB/s, "
1698-
f"ewma={self._anchor_ewma:.3f} GB/s, count={self._anchor_ewma_count}, "
1699-
f"alpha={alpha}"
1700-
)
1682+
self._anchor_ewma_count += 1
17011683

1702-
except Exception as e:
1703-
self._log.debug(f"Anchor gate error (non-fatal): {e}")
1684+
# Log anchor measurements for correlation across runs
1685+
self._log.info(
1686+
f"[ANCHOR_METRICS] measurement={throughput:.3f} GB/s, "
1687+
f"ewma={self._anchor_ewma:.3f} GB/s, count={self._anchor_ewma_count}, "
1688+
f"alpha={alpha}"
1689+
)
17041690

17051691
def _setup_storage_hygiene(self) -> None:
17061692
"""

0 commit comments

Comments
 (0)