Skip to content

Commit ec1e389

Browse files
vidhyavmeta-codesync[bot]
authored and committed
Added endpoint metrics on the actor side (#1781)
Summary: Pull Request resolved: #1781. Added endpoint metrics on the actor side. I may need to add two more things: (1) message size; (2) we already have a log message, but I will spruce it up even further in a following diff. Reviewed By: pzhan9. Differential Revision: D86424930. fbshipit-source-id: f7ba5d82ba5891836417a9b06a58880106872610
1 parent ab94fdb commit ec1e389

File tree

3 files changed

+60
-2
lines changed

3 files changed

+60
-2
lines changed

monarch_hyperactor/src/actor.rs

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ use crate::local_state_broker::LocalStateBrokerMessage;
6060
use crate::mailbox::EitherPortRef;
6161
use crate::mailbox::PyMailbox;
6262
use crate::mailbox::PythonUndeliverableMessageEnvelope;
63+
use crate::metrics::ENDPOINT_ACTOR_COUNT;
64+
use crate::metrics::ENDPOINT_ACTOR_ERROR;
65+
use crate::metrics::ENDPOINT_ACTOR_LATENCY_US_HISTOGRAM;
66+
use crate::metrics::ENDPOINT_ACTOR_PANIC;
6367
use crate::proc::InstanceWrapper;
6468
use crate::proc::PyActorId;
6569
use crate::proc::PyProc;
@@ -759,11 +763,17 @@ impl Handler<PythonMessage> for PythonActor {
759763
cx.port(),
760764
PythonTask::new(future)?,
761765
receiver,
766+
cx.self_id().to_string(),
767+
endpoint.clone(),
762768
)
763769
.instrument(
764770
tracing::info_span!(
765-
"Calling endpoint on PythonActor", actor = %cx.self_id(), rank = rank, endpoint = endpoint
766-
).or_current()
771+
"PythonActor endpoint",
772+
actor_id = %cx.self_id(),
773+
%rank,
774+
%endpoint
775+
)
776+
.or_current()
767777
.follows_from(tracing::Span::current().id())
768778
.clone(),
769779
),
@@ -860,7 +870,19 @@ async fn handle_async_endpoint_panic(
860870
panic_sender: PortHandle<PanicFromPy>,
861871
task: PythonTask,
862872
side_channel: oneshot::Receiver<PyObject>,
873+
actor_id: String,
874+
endpoint: String,
863875
) {
876+
// Create attributes for metrics with actor_id and endpoint
877+
let attributes =
878+
hyperactor_telemetry::kv_pairs!("actor_id" => actor_id, "endpoint" => endpoint);
879+
880+
// Record the start time for latency measurement
881+
let start_time = std::time::Instant::now();
882+
883+
// Increment throughput counter
884+
ENDPOINT_ACTOR_COUNT.add(1, attributes);
885+
864886
let err_or_never = async {
865887
// The side channel will resolve with a value if a panic occurred during
866888
// processing of the async endpoint, see [Panics in async endpoints].
@@ -871,6 +893,7 @@ async fn handle_async_endpoint_panic(
871893
.unwrap()
872894
.clone()
873895
.into();
896+
ENDPOINT_ACTOR_PANIC.add(1, attributes);
874897
Some(err.into())
875898
}),
876899
// An Err means that the sender has been dropped without sending.
@@ -892,10 +915,17 @@ async fn handle_async_endpoint_panic(
892915
result
893916
}
894917
} {
918+
// Record the error metric
919+
ENDPOINT_ACTOR_ERROR.add(1, attributes);
920+
895921
panic_sender
896922
.send(PanicFromPy(panic))
897923
.expect("Unable to send panic message");
898924
}
925+
926+
// Record latency in microseconds
927+
let elapsed_micros = start_time.elapsed().as_micros() as f64;
928+
ENDPOINT_ACTOR_LATENCY_US_HISTOGRAM.record(elapsed_micros, attributes);
899929
}
900930

901931
#[pyclass(module = "monarch._rust_bindings.monarch_hyperactor.actor")]

monarch_hyperactor/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ pub mod context;
2323
pub mod local_state_broker;
2424
pub mod logging;
2525
pub mod mailbox;
26+
pub mod metrics;
2627
pub mod ndslice;
2728
pub mod proc;
2829
pub mod proc_mesh;

monarch_hyperactor/src/metrics.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
//! Metrics for Python actor endpoints.
10+
//!
11+
//! This module contains metrics definitions for tracking Python actor endpoint performance.
12+
13+
use hyperactor_telemetry::declare_static_counter;
14+
use hyperactor_telemetry::declare_static_histogram;
15+
16+
// ENDPOINT METRICS
17+
// Tracks latency of endpoint calls in microseconds
18+
declare_static_histogram!(
19+
ENDPOINT_ACTOR_LATENCY_US_HISTOGRAM,
20+
"endpoint_actor_latency_us_histogram"
21+
);
22+
// Tracks the total number of endpoint calls
23+
declare_static_counter!(ENDPOINT_ACTOR_COUNT, "endpoint_actor_count");
24+
// Tracks errors that occur during endpoint execution
25+
declare_static_counter!(ENDPOINT_ACTOR_ERROR, "endpoint_actor_error");
26+
// Tracks panics that occur during endpoint execution
27+
declare_static_counter!(ENDPOINT_ACTOR_PANIC, "endpoint_actor_panic");

0 commit comments

Comments
 (0)