@@ -85,7 +85,7 @@ class CloudHypervisorTests(Tool):
8585 repo_root : PurePath
8686
8787 # Perf-stable profile configuration. Set to "" to disable.
88- perf_profile : str = "perf-stable"
88+ perf_profile : str = "perf-stable"
8989 perf_numa_node : int = 0
9090 perf_warmup_seconds : int = 30
9191 perf_mq_test_timeout : int = 90
@@ -1506,118 +1506,110 @@ def _setup_host_perf_policies(self) -> None:
15061506 - irqbalance → ON
15071507 - Reserve hugepages (1GB fallback to 2MB) on selected NUMA node
15081508 """
1509- try :
1510- # CPU governor → performance
1511- self .node .execute (
1512- "for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do "
1513- "echo performance | sudo tee $gov >/dev/null 2>&1 || true; done" ,
1514- shell = True ,
1515- sudo = True ,
1516- )
1509+ # CPU governor → performance
1510+ self .node .execute (
1511+ "for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do "
1512+ "echo performance | sudo tee $gov >/dev/null 2>&1 || true; done" ,
1513+ shell = True ,
1514+ sudo = True ,
1515+ )
15171516
1518- # Turbo → off (Intel + AMD)
1519- # Intel: /sys/devices/system/cpu/intel_pstate/no_turbo
1520- # AMD: /sys/devices/system/cpu/cpufreq/boost
1521- self .node .execute (
1522- "echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo "
1523- ">/dev/null 2>&1 || "
1524- "echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost "
1525- ">/dev/null 2>&1 || true" ,
1526- shell = True ,
1527- sudo = True ,
1528- )
1517+ # Turbo → off (Intel + AMD)
1518+ # Intel: /sys/devices/system/cpu/intel_pstate/no_turbo
1519+ # AMD: /sys/devices/system/cpu/cpufreq/boost
1520+ self .node .execute (
1521+ "echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo "
1522+ ">/dev/null 2>&1 || "
1523+ "echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost "
1524+ ">/dev/null 2>&1 || true" ,
1525+ shell = True ,
1526+ sudo = True ,
1527+ )
1528+
1529+ # C-states ≤ C1E (Intel-specific, best-effort)
1530+ self .node .execute (
1531+ "echo 1 | sudo tee /sys/module/intel_idle/parameters/max_cstate "
1532+ ">/dev/null 2>&1 || true" ,
1533+ shell = True ,
1534+ sudo = True ,
1535+ )
1536+
1537+ # THP → never (host)
1538+ self .node .execute (
1539+ "echo never | sudo tee /sys/kernel/mm/transparent_hugepage/enabled "
1540+ ">/dev/null 2>&1 || true" ,
1541+ shell = True ,
1542+ sudo = True ,
1543+ )
1544+
1545+ # irqbalance → ON
1546+ self .node .execute (
1547+ "sudo systemctl enable --now irqbalance 2>/dev/null || "
1548+ "sudo service irqbalance start 2>/dev/null || true" ,
1549+ shell = True ,
1550+ sudo = True ,
1551+ )
1552+
1553+ # Reserve hugepages (try 1GB first, fallback to 2MB)
1554+ hugepage_1g_path = (
1555+ f"/sys/devices/system/node/node{ self ._numa_node } /"
1556+ f"hugepages/hugepages-1048576kB/nr_hugepages"
1557+ )
1558+ hugepage_2m_path = (
1559+ f"/sys/devices/system/node/node{ self ._numa_node } /"
1560+ f"hugepages/hugepages-2048kB/nr_hugepages"
1561+ )
15291562
1530- # C-states ≤ C1E (Intel-specific, best-effort)
1563+ # Check if 1GB hugepages are available
1564+ result = self .node .execute (
1565+ f"[ -f { hugepage_1g_path } ]" ,
1566+ shell = True ,
1567+ )
1568+
1569+ if result .exit_code == 0 :
1570+ # Try 1GB hugepages (16GB total)
15311571 self .node .execute (
1532- "echo 1 | sudo tee /sys/module/intel_idle/parameters/max_cstate "
1533- ">/dev/null 2>&1 || true" ,
1572+ f"echo 16 | sudo tee { hugepage_1g_path } >/dev/null 2>&1 || true" ,
15341573 shell = True ,
15351574 sudo = True ,
15361575 )
1537-
1538- # THP → never (host)
1539- self .node .execute (
1540- "echo never | sudo tee /sys/kernel/mm/transparent_hugepage/enabled "
1541- ">/dev/null 2>&1 || true" ,
1576+ # Verify allocation
1577+ verify = self .node .execute (
1578+ f"cat { hugepage_1g_path } 2>/dev/null || echo 0" ,
15421579 shell = True ,
1543- sudo = True ,
15441580 )
1545-
1546- # irqbalance → ON
1581+ allocated = int (verify .stdout .strip () or "0" )
1582+ if allocated >= 16 :
1583+ self ._log .debug (f"Reserved 16GB (1GB pages) on node{ self ._numa_node } " )
1584+ else :
1585+ self ._log .debug (
1586+ f"Only { allocated } GB (1GB pages) allocated (requested 16GB)"
1587+ )
1588+ else :
1589+ # Fallback to 2MB hugepages (8192 pages = 16GB)
15471590 self .node .execute (
1548- "sudo systemctl enable --now irqbalance 2>/dev/null || "
1549- "sudo service irqbalance start 2>/dev/null || true" ,
1591+ f"echo 8192 | sudo tee { hugepage_2m_path } >/dev/null 2>&1 || true" ,
15501592 shell = True ,
15511593 sudo = True ,
15521594 )
1553-
1554- # Reserve hugepages (try 1GB first, fallback to 2MB)
1555- hugepage_1g_path = (
1556- f"/sys/devices/system/node/node{ self ._numa_node } /"
1557- f"hugepages/hugepages-1048576kB/nr_hugepages"
1558- )
1559- hugepage_2m_path = (
1560- f"/sys/devices/system/node/node{ self ._numa_node } /"
1561- f"hugepages/hugepages-2048kB/nr_hugepages"
1562- )
1563-
1564- # Check if 1GB hugepages are available
1565- result = self .node .execute (
1566- f"[ -f { hugepage_1g_path } ] && echo 'yes' || echo 'no'" ,
1595+ # Verify allocation
1596+ verify = self .node .execute (
1597+ f"cat { hugepage_2m_path } 2>/dev/null || echo 0" ,
15671598 shell = True ,
15681599 )
1569-
1570- if "yes" in result .stdout :
1571- # Try 1GB hugepages (16GB total)
1572- self .node .execute (
1573- f"echo 16 | sudo tee { hugepage_1g_path } >/dev/null 2>&1 || true" ,
1574- shell = True ,
1575- sudo = True ,
1576- )
1577- # Verify allocation
1578- verify = self .node .execute (
1579- f"cat { hugepage_1g_path } 2>/dev/null || echo 0" ,
1580- shell = True ,
1581- )
1582- allocated = int (verify .stdout .strip () or "0" )
1583- if allocated >= 16 :
1584- self ._log .debug (
1585- f"Reserved 16GB (1GB pages) on node{ self ._numa_node } "
1586- )
1587- else :
1588- self ._log .debug (
1589- f"Only { allocated } GB (1GB pages) allocated (requested 16GB)"
1590- )
1600+ allocated = int (verify .stdout .strip () or "0" )
1601+ # Convert 2MiB pages to GiB
1602+ allocated_gib = allocated * 2 / 1024
1603+ if allocated >= 8192 :
1604+ self ._log .debug (f"Reserved 16GB (2MB pages) on node{ self ._numa_node } " )
15911605 else :
1592- # Fallback to 2MB hugepages (8192 pages = 16GB)
1593- self .node .execute (
1594- f"echo 8192 | sudo tee { hugepage_2m_path } >/dev/null 2>&1 || true" ,
1595- shell = True ,
1596- sudo = True ,
1606+ self ._log .debug (
1607+ f"Only { allocated_gib :.2f} GiB (2MiB pages) allocated "
1608+ f"(requested 16 GiB)"
15971609 )
1598- # Verify allocation
1599- verify = self .node .execute (
1600- f"cat { hugepage_2m_path } 2>/dev/null || echo 0" ,
1601- shell = True ,
1602- )
1603- allocated = int (verify .stdout .strip () or "0" )
1604- # Convert 2MiB pages to GiB
1605- allocated_gib = allocated * 2 / 1024
1606- if allocated >= 8192 :
1607- self ._log .debug (
1608- f"Reserved 16GB (2MB pages) on node{ self ._numa_node } "
1609- )
1610- else :
1611- self ._log .debug (
1612- f"Only { allocated_gib :.2f} GiB (2MiB pages) allocated "
1613- f"(requested 16 GiB)"
1614- )
16151610
1616- # Export NUMA node for CH launcher
1617- os .environ ["CH_NUMA_NODE" ] = str (self ._numa_node )
1618-
1619- except Exception as e :
1620- self ._log .debug (f"Host perf setup warning (non-fatal): { e } " )
1611+ # Export NUMA node for CH launcher
1612+ os .environ ["CH_NUMA_NODE" ] = str (self ._numa_node )
16211613
16221614 def _run_anchor_gate (self ) -> None :
16231615 """
@@ -1626,81 +1618,75 @@ def _run_anchor_gate(self) -> None:
16261618 Validates against EWMA baseline (±5%). Retries once on failure.
16271619 Uses exponential weighted moving average (alpha=0.3) for stability.
16281620 """
1629- try :
1630- # 5-8s CPU/mem anchor using dd
1631- start = time .time ()
1632- self .node .execute (
1633- "dd if=/dev/zero of=/dev/null bs=1M count=2048 status=none" ,
1634- shell = True ,
1635- timeout = 15 ,
1636- )
1637- elapsed = time .time () - start
1621+ # 5-8s CPU/mem anchor using dd
1622+ start = time .time ()
1623+ self .node .execute (
1624+ "dd if=/dev/zero of=/dev/null bs=1M count=2048 status=none" ,
1625+ shell = True ,
1626+ timeout = 15 ,
1627+ )
1628+ elapsed = time .time () - start
16381629
1639- # Calculate throughput (GB/s) - 2048 MB = 2.048 GB
1640- throughput = 2.048 / elapsed
1630+ # Calculate throughput (GB/s) - 2048 MB = 2.048 GB
1631+ throughput = 2.048 / elapsed
16411632
1642- # EWMA validation (skip first run to establish baseline)
1643- if self ._anchor_ewma_count > 0 :
1644- deviation = abs (throughput - self ._anchor_ewma ) / self ._anchor_ewma
1633+ # EWMA validation (skip first run to establish baseline)
1634+ if self ._anchor_ewma_count > 0 :
1635+ deviation = abs (throughput - self ._anchor_ewma ) / self ._anchor_ewma
16451636
1646- if deviation > self .ANCHOR_DEVIATION_THRESHOLD :
1647- self ._log .debug (
1648- f"Anchor gate FAILED: { deviation * 100 :.1f} % deviation "
1649- f"(expected: { self ._anchor_ewma :.2f} GB/s, "
1650- f"got: { throughput :.2f} GB/s)"
1651- )
1637+ if deviation > self .ANCHOR_DEVIATION_THRESHOLD :
1638+ self ._log .debug (
1639+ f"Anchor gate FAILED: { deviation * 100 :.1f} % deviation "
1640+ f"(expected: { self ._anchor_ewma :.2f} GB/s, "
1641+ f"got: { throughput :.2f} GB/s)"
1642+ )
16521643
1653- # Retry once
1654- self ._log .debug ("Retrying anchor gate..." )
1655- time .sleep (5 )
1644+ # Retry once
1645+ self ._log .debug ("Retrying anchor gate..." )
1646+ time .sleep (5 )
16561647
1657- start = time .time ()
1658- self .node .execute (
1659- "dd if=/dev/zero of=/dev/null bs=1M count=2048 status=none" ,
1660- shell = True ,
1661- timeout = 15 ,
1662- )
1663- elapsed = time .time () - start
1664- throughput = 2.048 / elapsed
1648+ start = time .time ()
1649+ self .node .execute (
1650+ "dd if=/dev/zero of=/dev/null bs=1M count=2048 status=none" ,
1651+ shell = True ,
1652+ timeout = 15 ,
1653+ )
1654+ elapsed = time .time () - start
1655+ throughput = 2.048 / elapsed
16651656
1666- deviation = abs (throughput - self ._anchor_ewma ) / self ._anchor_ewma
1657+ deviation = abs (throughput - self ._anchor_ewma ) / self ._anchor_ewma
16671658
1668- if deviation > self .ANCHOR_DEVIATION_THRESHOLD :
1669- self ._log .debug (
1670- f"Anchor gate FAILED on retry: "
1671- f"{ deviation * 100 :.1f} % deviation"
1672- )
1673- self ._log_thermal_health ("anchor_failure" )
1674- # Don't fail the suite, but log prominently
1675- else :
1676- self ._log .debug (
1677- f"Anchor gate PASSED on retry: { throughput :.2f} GB/s"
1678- )
1659+ if deviation > self .ANCHOR_DEVIATION_THRESHOLD :
1660+ self ._log .debug (
1661+ f"Anchor gate FAILED on retry: "
1662+ f"{ deviation * 100 :.1f} % deviation"
1663+ )
1664+ self ._log_thermal_health ("anchor_failure" )
1665+ # Don't fail the suite, but log prominently
16791666 else :
16801667 self ._log .debug (
1681- f"Anchor gate PASSED: { deviation * 100 :.1f } % deviation "
1668+ f"Anchor gate PASSED on retry: { throughput :.2f } GB/s "
16821669 )
16831670 else :
1684- self ._log .debug ("Anchor baseline established" )
1685-
1686- # Update EWMA (alpha=0.3 for responsiveness)
1687- alpha = self .ANCHOR_EWMA_ALPHA
1688- if self ._anchor_ewma_count == 0 :
1689- self ._anchor_ewma = throughput
1690- else :
1691- self ._anchor_ewma = alpha * throughput + (1 - alpha ) * self ._anchor_ewma
1671+ self ._log .debug (f"Anchor gate PASSED: { deviation * 100 :.1f} % deviation" )
1672+ else :
1673+ self ._log .debug ("Anchor baseline established" )
16921674
1693- self ._anchor_ewma_count += 1
1675+ # Update EWMA (alpha=0.3 for responsiveness)
1676+ alpha = self .ANCHOR_EWMA_ALPHA
1677+ if self ._anchor_ewma_count == 0 :
1678+ self ._anchor_ewma = throughput
1679+ else :
1680+ self ._anchor_ewma = alpha * throughput + (1 - alpha ) * self ._anchor_ewma
16941681
1695- # Log anchor measurements for correlation across runs
1696- self ._log .info (
1697- f"[ANCHOR_METRICS] measurement={ throughput :.3f} GB/s, "
1698- f"ewma={ self ._anchor_ewma :.3f} GB/s, count={ self ._anchor_ewma_count } , "
1699- f"alpha={ alpha } "
1700- )
1682+ self ._anchor_ewma_count += 1
17011683
1702- except Exception as e :
1703- self ._log .debug (f"Anchor gate error (non-fatal): { e } " )
1684+ # Log anchor measurements for correlation across runs
1685+ self ._log .info (
1686+ f"[ANCHOR_METRICS] measurement={ throughput :.3f} GB/s, "
1687+ f"ewma={ self ._anchor_ewma :.3f} GB/s, count={ self ._anchor_ewma_count } , "
1688+ f"alpha={ alpha } "
1689+ )
17041690
17051691 def _setup_storage_hygiene (self ) -> None :
17061692 """
0 commit comments