Skip to content

Commit d400d18

Browse files
: logging: default disable log forwarding (#1878)
Summary: this diff sets the following defaults for hyperactor-mesh global configuration values: - `MESH_ENABLE_LOG_FORWARDING=false` - `MESH_ENABLE_FILE_CAPTURE=false` - `MESH_TAIL_LOG_LINES=0` the effect of this is to disable log forwarding, prevent allocating resources for log forwarding, no file capture at the hyperactor mesh level (including no "exit tail" capture), in fact, under these defaults there is no interception of child process stdio at all. a [workplace post is planned to announce this change](https://fb.workplace.com/groups/1399849971389924/permalink/1602002844507968/) in default configuration. this diff is built on: D85783397 provide config for enabling/disabling hyperactor-mesh logging interception features, D85919326 for avoiding spinning up `LogForwardActor` meshes when log forwarding is enabled and D85969320 for some detailed testing. Reviewed By: zdevito, vidhyav Differential Revision: D86994420
1 parent adcdb8e commit d400d18

File tree

6 files changed

+170
-10
lines changed

6 files changed

+170
-10
lines changed

hyperactor_mesh/src/bootstrap.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ declare_attrs! {
9999
env_name: Some("HYPERACTOR_MESH_ENABLE_LOG_FORWARDING".to_string()),
100100
py_name: None,
101101
})
102-
pub attr MESH_ENABLE_LOG_FORWARDING: bool = true;
102+
pub attr MESH_ENABLE_LOG_FORWARDING: bool = false;
103103

104104
/// When `true`: if stdio is piped, each child's `StreamFwder`
105105
/// also forwards lines to a host-scoped `FileAppender` managed by
@@ -124,7 +124,7 @@ declare_attrs! {
124124
env_name: Some("HYPERACTOR_MESH_ENABLE_FILE_CAPTURE".to_string()),
125125
py_name: None,
126126
})
127-
pub attr MESH_ENABLE_FILE_CAPTURE: bool = true;
127+
pub attr MESH_ENABLE_FILE_CAPTURE: bool = false;
128128

129129
/// Maximum number of log lines retained in a proc's stderr/stdout
130130
/// tail buffer. Used by [`StreamFwder`] when wiring child
@@ -133,7 +133,7 @@ declare_attrs! {
133133
env_name: Some("HYPERACTOR_MESH_TAIL_LOG_LINES".to_string()),
134134
py_name: None,
135135
})
136-
pub attr MESH_TAIL_LOG_LINES: usize = 100;
136+
pub attr MESH_TAIL_LOG_LINES: usize = 0;
137137

138138
/// If enabled (default), bootstrap child processes install
139139
/// `PR_SET_PDEATHSIG(SIGKILL)` so the kernel reaps them if the
@@ -3692,6 +3692,10 @@ mod tests {
36923692
#[tokio::test]
36933693
async fn exit_tail_is_attached_and_logged() {
36943694
hyperactor_telemetry::initialize_logging_for_test();
3695+
3696+
let lock = hyperactor::config::global::lock();
3697+
let _guard = lock.override_key(MESH_TAIL_LOG_LINES, 100);
3698+
36953699
// Spawn a child that writes to stderr then exits 7.
36963700
let mut cmd = Command::new("sh");
36973701
cmd.arg("-c")

hyperactor_mesh/src/v1/host_mesh.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1125,6 +1125,7 @@ mod tests {
11251125

11261126
use super::*;
11271127
use crate::Bootstrap;
1128+
use crate::bootstrap::MESH_TAIL_LOG_LINES;
11281129
use crate::resource::Status;
11291130
use crate::v1::ActorMesh;
11301131
use crate::v1::testactor;
@@ -1321,6 +1322,9 @@ mod tests {
13211322
#[tokio::test]
13221323
#[cfg(fbcode_build)]
13231324
async fn test_failing_proc_allocation() {
1325+
let lock = hyperactor::config::global::lock();
1326+
let _guard = lock.override_key(MESH_TAIL_LOG_LINES, 100);
1327+
13241328
let program = crate::testresource::get("monarch/hyperactor_mesh/bootstrap");
13251329

13261330
let hosts = vec![free_localhost_addr(), free_localhost_addr()];

monarch_hyperactor/src/v1/logging.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -310,9 +310,9 @@ impl LoggingMeshClient {
310310
// re-spawning infra, which we deliberately don't do at
311311
// runtime.
312312
(None, true) => {
313-
return Err(PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(
314-
"log forwarding disabled by config at startup; cannot enable streaming_to_client",
315-
));
313+
// return Err(PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(
314+
// "log forwarding disabled by config at startup; cannot enable streaming_to_client",
315+
// ));
316316
}
317317
}
318318

@@ -592,6 +592,9 @@ mod tests {
592592
);
593593
}
594594

595+
/*
596+
// Update (SF: 2025, 11, 13): We now ignore stream to client requests if
597+
// log forwarding is enabled.
595598
// (c) stream_to_client = true when forwarding was
596599
// never spawned -> Err
597600
let res = client_ref.set_mode(&py_instance, true, None, 10);
@@ -606,6 +609,7 @@ mod tests {
606609
"unexpected err when enabling streaming with no forwarders: {msg}"
607610
);
608611
}
612+
*/
609613
});
610614

611615
drop(client_py); // See note "NOTE ON LIFECYCLE / CLEANUP"

python/monarch/_src/actor/v1/proc_mesh.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,8 +365,8 @@ def rank_tensors(self) -> Dict[str, "Tensor"]:
365365

366366
async def logging_option(
367367
self,
368-
stream_to_client: bool = True,
369-
aggregate_window_sec: int | None = 3,
368+
stream_to_client: bool = False,
369+
aggregate_window_sec: int | None = None,
370370
level: int = logging.INFO,
371371
) -> None:
372372
"""

python/tests/test_allocator.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,9 +254,19 @@ async def test_allocate_failure_message(self) -> None:
254254
r"exited with code 1: Traceback \(most recent call last\).*",
255255
):
256256
with remote_process_allocator(
257-
envs={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}
257+
envs={
258+
"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1",
259+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
260+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
261+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
262+
}
258263
) as host1, remote_process_allocator(
259-
envs={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}
264+
envs={
265+
"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1",
266+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
267+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
268+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
269+
}
260270
) as host2:
261271
allocator = RemoteAllocator(
262272
world_id="test_remote_allocator",

python/tests/test_python_actors.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,16 @@ def _handle_undeliverable_message(
547547

548548
@pytest.mark.timeout(60)
549549
async def test_actor_log_streaming() -> None:
550+
old_env = {}
551+
env_vars = {
552+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
553+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
554+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
555+
}
556+
for key, value in env_vars.items():
557+
old_env[key] = os.environ.get(key)
558+
os.environ[key] = value
559+
550560
# Save original file descriptors
551561
original_stdout_fd = os.dup(1) # stdout
552562
original_stderr_fd = os.dup(2) # stderr
@@ -684,6 +694,12 @@ async def test_actor_log_streaming() -> None:
684694
), stderr_content
685695

686696
finally:
697+
for key, value in old_env.items():
698+
if value is None:
699+
os.environ.pop(key, None)
700+
else:
701+
os.environ[key] = value
702+
687703
# Ensure file descriptors are restored even if something goes wrong
688704
try:
689705
os.dup2(original_stdout_fd, 1)
@@ -699,6 +715,16 @@ async def test_alloc_based_log_streaming() -> None:
699715
"""Test both AllocHandle.stream_logs = False and True cases."""
700716

701717
async def test_stream_logs_case(stream_logs: bool, test_name: str) -> None:
718+
old_env = {}
719+
env_vars = {
720+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
721+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
722+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
723+
}
724+
for key, value in env_vars.items():
725+
old_env[key] = os.environ.get(key)
726+
os.environ[key] = value
727+
702728
# Save original file descriptors
703729
original_stdout_fd = os.dup(1) # stdout
704730

@@ -778,6 +804,11 @@ def _stream_logs(self) -> bool:
778804
), f"stream_logs=True case: {stdout_content}"
779805

780806
finally:
807+
for key, value in old_env.items():
808+
if value is None:
809+
os.environ.pop(key, None)
810+
else:
811+
os.environ[key] = value
781812
# Ensure file descriptors are restored even if something goes wrong
782813
try:
783814
os.dup2(original_stdout_fd, 1)
@@ -792,6 +823,16 @@ def _stream_logs(self) -> bool:
792823

793824
@pytest.mark.timeout(60)
794825
async def test_logging_option_defaults() -> None:
826+
old_env = {}
827+
env_vars = {
828+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
829+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
830+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
831+
}
832+
for key, value in env_vars.items():
833+
old_env[key] = os.environ.get(key)
834+
os.environ[key] = value
835+
795836
# Save original file descriptors
796837
original_stdout_fd = os.dup(1) # stdout
797838
original_stderr_fd = os.dup(2) # stderr
@@ -870,6 +911,12 @@ async def test_logging_option_defaults() -> None:
870911
), stderr_content
871912

872913
finally:
914+
for key, value in old_env.items():
915+
if value is None:
916+
os.environ.pop(key, None)
917+
else:
918+
os.environ[key] = value
919+
873920
# Ensure file descriptors are restored even if something goes wrong
874921
try:
875922
os.dup2(original_stdout_fd, 1)
@@ -906,6 +953,15 @@ def __init__(self):
906953
@pytest.mark.oss_skip
907954
async def test_flush_called_only_once() -> None:
908955
"""Test that flush is called only once when ending an ipython cell"""
956+
old_env = {}
957+
env_vars = {
958+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
959+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
960+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
961+
}
962+
for key, value in env_vars.items():
963+
old_env[key] = os.environ.get(key)
964+
os.environ[key] = value
909965
mock_ipython = MockIPython()
910966
with unittest.mock.patch(
911967
"monarch._src.actor.logging.get_ipython",
@@ -926,14 +982,29 @@ async def test_flush_called_only_once() -> None:
926982

927983
# now, flush should be called only once
928984
mock_ipython.events.trigger("post_run_cell", unittest.mock.MagicMock())
985+
929986
assert mock_flush.call_count == 1
987+
for key, value in old_env.items():
988+
if value is None:
989+
os.environ.pop(key, None)
990+
else:
991+
os.environ[key] = value
930992

931993

932994
# oss_skip: pytest keeps complaining about mocking get_ipython module
933995
@pytest.mark.oss_skip
934996
@pytest.mark.timeout(180)
935997
async def test_flush_logs_ipython() -> None:
936998
"""Test that logs are flushed when get_ipython is available and post_run_cell event is triggered."""
999+
old_env = {}
1000+
env_vars = {
1001+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
1002+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
1003+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
1004+
}
1005+
for key, value in env_vars.items():
1006+
old_env[key] = os.environ.get(key)
1007+
os.environ[key] = value
9371008
# Save original file descriptors
9381009
original_stdout_fd = os.dup(1) # stdout
9391010

@@ -1025,6 +1096,11 @@ async def test_flush_logs_ipython() -> None:
10251096
), stdout_content
10261097

10271098
finally:
1099+
for key, value in old_env.items():
1100+
if value is None:
1101+
os.environ.pop(key, None)
1102+
else:
1103+
os.environ[key] = value
10281104
# Ensure file descriptors are restored even if something goes wrong
10291105
try:
10301106
os.dup2(original_stdout_fd, 1)
@@ -1036,6 +1112,15 @@ async def test_flush_logs_ipython() -> None:
10361112
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
10371113
@pytest.mark.oss_skip
10381114
async def test_flush_logs_fast_exit() -> None:
1115+
old_env = {}
1116+
env_vars = {
1117+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
1118+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
1119+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
1120+
}
1121+
for key, value in env_vars.items():
1122+
old_env[key] = os.environ.get(key)
1123+
os.environ[key] = value
10391124
# We use a subprocess to run the test so we can handle the flushed logs at the end.
10401125
# Otherwise, it is hard to restore the original stdout/stderr.
10411126

@@ -1062,13 +1147,29 @@ async def test_flush_logs_fast_exit() -> None:
10621147
== 1
10631148
), process.stdout
10641149

1150+
for key, value in old_env.items():
1151+
if value is None:
1152+
os.environ.pop(key, None)
1153+
else:
1154+
os.environ[key] = value
1155+
10651156

10661157
@pytest.mark.timeout(60)
10671158
async def test_flush_on_disable_aggregation() -> None:
10681159
"""Test that logs are flushed when disabling aggregation.
10691160
10701161
This tests the corner case: "Make sure we flush whatever in the aggregators before disabling aggregation."
10711162
"""
1163+
old_env = {}
1164+
env_vars = {
1165+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
1166+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
1167+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
1168+
}
1169+
for key, value in env_vars.items():
1170+
old_env[key] = os.environ.get(key)
1171+
os.environ[key] = value
1172+
10721173
# Save original file descriptors
10731174
original_stdout_fd = os.dup(1) # stdout
10741175

@@ -1148,6 +1249,12 @@ async def test_flush_on_disable_aggregation() -> None:
11481249
), f"Expected 10 single log lines, got {total_single} from {stdout_content}"
11491250

11501251
finally:
1252+
for key, value in old_env.items():
1253+
if value is None:
1254+
os.environ.pop(key, None)
1255+
else:
1256+
os.environ[key] = value
1257+
11511258
# Ensure file descriptors are restored even if something goes wrong
11521259
try:
11531260
os.dup2(original_stdout_fd, 1)
@@ -1163,6 +1270,15 @@ async def test_multiple_ongoing_flushes_no_deadlock() -> None:
11631270
Because now a flush call is purely sync, it is very easy to get into a deadlock.
11641271
So we assert the last flush call will not get into such a state.
11651272
"""
1273+
old_env = {}
1274+
env_vars = {
1275+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
1276+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
1277+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
1278+
}
1279+
for key, value in env_vars.items():
1280+
old_env[key] = os.environ.get(key)
1281+
os.environ[key] = value
11661282
pm = this_host().spawn_procs(per_host={"gpus": 4})
11671283
am = pm.spawn("printer", Printer)
11681284

@@ -1185,13 +1301,29 @@ async def test_multiple_ongoing_flushes_no_deadlock() -> None:
11851301
# The last flush should not block
11861302
futures[-1].get()
11871303

1304+
for key, value in old_env.items():
1305+
if value is None:
1306+
os.environ.pop(key, None)
1307+
else:
1308+
os.environ[key] = value
1309+
11881310

11891311
@pytest.mark.timeout(60)
11901312
async def test_adjust_aggregation_window() -> None:
11911313
"""Test that the flush deadline is updated when the aggregation window is adjusted.
11921314
11931315
This tests the corner case: "This can happen if the user has adjusted the aggregation window."
11941316
"""
1317+
old_env = {}
1318+
env_vars = {
1319+
"HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
1320+
"HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
1321+
"HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
1322+
}
1323+
for key, value in env_vars.items():
1324+
old_env[key] = os.environ.get(key)
1325+
os.environ[key] = value
1326+
11951327
# Save original file descriptors
11961328
original_stdout_fd = os.dup(1) # stdout
11971329

@@ -1258,6 +1390,12 @@ async def test_adjust_aggregation_window() -> None:
12581390
), stdout_content
12591391

12601392
finally:
1393+
for key, value in old_env.items():
1394+
if value is None:
1395+
os.environ.pop(key, None)
1396+
else:
1397+
os.environ[key] = value
1398+
12611399
# Ensure file descriptors are restored even if something goes wrong
12621400
try:
12631401
os.dup2(original_stdout_fd, 1)

0 commit comments

Comments
 (0)