Skip to content

Commit dddd73f

Browse files
benjipelletierfacebook-github-bot
authored andcommitted
Add channel pair metrics (#1809)
Summary: We want to track throughput, error, latency, etc for channel pair metrics. Add those to `net.rs` Attributes for channel metrics: ``` "error_type" => human readable error type if an error occured "source" => source address (populated when available) "dest" => destination address ``` Reviewed By: vidhyav Differential Revision: D86425779
1 parent 6d14685 commit dddd73f

File tree

3 files changed

+163
-24
lines changed

3 files changed

+163
-24
lines changed

hyperactor/src/channel/net/client.rs

Lines changed: 70 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ use super::framed::FrameReader;
3333
use super::framed::FrameWrite;
3434
use super::framed::WriteState;
3535
use crate::RemoteMessage;
36+
use crate::channel::ChannelAddr;
3637
use crate::channel::ChannelError;
3738
use crate::channel::SendError;
3839
use crate::channel::TxStatus;
@@ -203,14 +204,18 @@ struct Outbox<'a, M: RemoteMessage> {
203204
next_seq: u64,
204205
deque: MessageDeque<M>,
205206
log_id: &'a str,
207+
dest_addr: &'a ChannelAddr,
208+
session_id: u64,
206209
}
207210

208211
impl<'a, M: RemoteMessage> Outbox<'a, M> {
209-
fn new(log_id: &'a str) -> Self {
212+
fn new(log_id: &'a str, dest_addr: &'a ChannelAddr, session_id: u64) -> Self {
210213
Self {
211214
next_seq: 0,
212215
deque: MessageDeque(VecDeque::new()),
213216
log_id,
217+
dest_addr,
218+
session_id,
214219
}
215220
}
216221

@@ -255,7 +260,24 @@ impl<'a, M: RemoteMessage> Outbox<'a, M> {
255260

256261
let frame = Frame::Message(self.next_seq, message);
257262
let message = serialize_bincode(&frame).map_err(|e| format!("serialization error: {e}"))?;
258-
metrics::REMOTE_MESSAGE_SEND_SIZE.record(message.frame_len() as f64, &[]);
263+
let message_size = message.frame_len();
264+
metrics::REMOTE_MESSAGE_SEND_SIZE.record(message_size as f64, &[]);
265+
266+
// Track throughput for this channel pair
267+
metrics::CHANNEL_THROUGHPUT_BYTES.add(
268+
message_size as u64,
269+
hyperactor_telemetry::kv_pairs!(
270+
"dest" => self.dest_addr.to_string(),
271+
"session_id" => self.session_id.to_string(),
272+
),
273+
);
274+
metrics::CHANNEL_THROUGHPUT_MESSAGES.add(
275+
1,
276+
hyperactor_telemetry::kv_pairs!(
277+
"dest" => self.dest_addr.to_string(),
278+
"session_id" => self.session_id.to_string(),
279+
),
280+
);
259281

260282
self.deque.push_back(QueuedMessage {
261283
seq: self.next_seq,
@@ -371,7 +393,7 @@ impl<'a, M: RemoteMessage> Unacked<'a, M> {
371393
}
372394

373395
/// Remove acked messages from the deque.
374-
fn prune(&mut self, acked: u64, acked_at: Instant) {
396+
fn prune(&mut self, acked: u64, acked_at: Instant, dest_addr: &ChannelAddr, session_id: u64) {
375397
assert!(
376398
self.largest_acked.as_ref().map_or(0, |i| i.0) <= acked,
377399
"{}: received out-of-order ack; received: {}; stored largest: {}",
@@ -386,7 +408,16 @@ impl<'a, M: RemoteMessage> Unacked<'a, M> {
386408
let deque = &mut self.deque;
387409
while let Some(msg) = deque.front() {
388410
if msg.seq <= acked {
389-
deque.pop_front();
411+
let msg: QueuedMessage<M> = deque.pop_front().unwrap();
412+
// Track latency: time from when message was first received to when it was acked
413+
let latency_micros = msg.received_at.elapsed().as_micros() as i64;
414+
metrics::CHANNEL_LATENCY_MICROS.record(
415+
latency_micros as f64,
416+
hyperactor_telemetry::kv_pairs!(
417+
"dest" => dest_addr.to_string(),
418+
"session_id" => session_id.to_string(),
419+
),
420+
);
390421
} else {
391422
// Messages in the deque are orderd by seq in ascending
392423
// order. So we could return early once we encounter
@@ -461,9 +492,9 @@ enum State<'a, M: RemoteMessage> {
461492
}
462493

463494
impl<'a, M: RemoteMessage> State<'a, M> {
464-
fn init(log_id: &'a str) -> Self {
495+
fn init(log_id: &'a str, dest_addr: &'a ChannelAddr, session_id: u64) -> Self {
465496
Self::Running(Deliveries {
466-
outbox: Outbox::new(log_id),
497+
outbox: Outbox::new(log_id, dest_addr, session_id),
467498
unacked: Unacked::new(None, log_id),
468499
})
469500
}
@@ -543,7 +574,8 @@ async fn run<M: RemoteMessage>(
543574

544575
let session_id = rand::random();
545576
let log_id = format!("session {}.{}", link.dest(), session_id);
546-
let mut state = State::init(&log_id);
577+
let dest = link.dest();
578+
let mut state = State::init(&log_id, &dest, session_id);
547579
let mut conn = Conn::reconnect_with_default();
548580

549581
let (state, conn) = loop {
@@ -859,7 +891,7 @@ where
859891
Ok(response) => {
860892
match response {
861893
NetRxResponse::Ack(ack) => {
862-
unacked.prune(ack, RealClock.now());
894+
unacked.prune(ack, RealClock.now(), &link.dest(), session_id);
863895
(State::Running(Deliveries { outbox, unacked }), Conn::Connected { reader, write_state })
864896
}
865897
NetRxResponse::Reject => {
@@ -934,6 +966,15 @@ where
934966
"{log_id}: outbox send error: {err}; message size: {}",
935967
outbox.front_size().expect("outbox should not be empty"),
936968
);
969+
// Track error for this channel pair
970+
metrics::CHANNEL_ERRORS.add(
971+
1,
972+
hyperactor_telemetry::kv_pairs!(
973+
"dest" => link.dest().to_string(),
974+
"session_id" => session_id.to_string(),
975+
"error_type" => metrics::ChannelErrorType::SendError.as_str(),
976+
),
977+
);
937978
(State::Running(Deliveries { outbox, unacked }), Conn::reconnect_with_default())
938979
}
939980
}
@@ -1030,6 +1071,18 @@ where
10301071

10311072
// Need to resend unacked after reconnecting.
10321073
let largest_acked = unacked.largest_acked;
1074+
let num_retries = unacked.deque.len();
1075+
if num_retries > 0 {
1076+
// Track reconnection for this channel pair
1077+
metrics::CHANNEL_RECONNECTIONS.add(
1078+
1,
1079+
hyperactor_telemetry::kv_pairs!(
1080+
"dest" => link.dest().to_string(),
1081+
"transport" => link.dest().transport().to_string(),
1082+
"reason" => "reconnect_with_unacked",
1083+
),
1084+
);
1085+
}
10331086
outbox.requeue_unacked(unacked.deque);
10341087
(
10351088
State::Running(Deliveries {
@@ -1061,6 +1114,15 @@ where
10611114
session_id,
10621115
err
10631116
);
1117+
// Track connection error for this channel pair
1118+
metrics::CHANNEL_ERRORS.add(
1119+
1,
1120+
hyperactor_telemetry::kv_pairs!(
1121+
"dest" => link.dest().to_string(),
1122+
"session_id" => session_id.to_string(),
1123+
"error_type" => metrics::ChannelErrorType::ConnectionError.as_str(),
1124+
),
1125+
);
10641126
(
10651127
State::Running(Deliveries { outbox, unacked }),
10661128
Conn::reconnect(backoff),

hyperactor/src/channel/net/server.rs

Lines changed: 59 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,21 @@ impl<S: AsyncRead + AsyncWrite + Send + 'static + Unpin> ServerConn<S> {
181181
};
182182

183183
// De-frame the multi-part message.
184+
let bytes_len = bytes.len();
184185
let message = match serde_multipart::Message::from_framed(bytes) {
185186
Ok(message) => message,
186-
Err(err) => break (
187+
Err(err) => {
188+
// Track deframing error for this channel pair
189+
metrics::CHANNEL_ERRORS.add(
190+
1,
191+
hyperactor_telemetry::kv_pairs!(
192+
"source" => self.source.to_string(),
193+
"dest" => self.dest.to_string(),
194+
"session_id" => session_id.to_string(),
195+
"error_type" => metrics::ChannelErrorType::DeframeError.as_str(),
196+
),
197+
);
198+
break (
187199
next,
188200
Err::<(), anyhow::Error>(err.into()).context(
189201
format!(
@@ -192,7 +204,8 @@ impl<S: AsyncRead + AsyncWrite + Send + 'static + Unpin> ServerConn<S> {
192204
)
193205
),
194206
false
195-
),
207+
)
208+
},
196209
};
197210

198211
// Finally decode the message. This assembles the M-typed message
@@ -220,6 +233,23 @@ impl<S: AsyncRead + AsyncWrite + Send + 'static + Unpin> ServerConn<S> {
220233
}
221234
match self.send_with_buffer_metric(&log_id, &tx, message).await {
222235
Ok(()) => {
236+
// Track throughput for this channel pair
237+
metrics::CHANNEL_THROUGHPUT_BYTES.add(
238+
bytes_len as u64,
239+
hyperactor_telemetry::kv_pairs!(
240+
"source" => self.source.to_string(),
241+
"dest" => self.dest.to_string(),
242+
"session_id" => session_id.to_string(),
243+
),
244+
);
245+
metrics::CHANNEL_THROUGHPUT_MESSAGES.add(
246+
1,
247+
hyperactor_telemetry::kv_pairs!(
248+
"source" => self.source.to_string(),
249+
"dest" => self.dest.to_string(),
250+
"session_id" => session_id.to_string(),
251+
),
252+
);
223253
// In channel's contract, "delivered" means the message
224254
// is sent to the NetRx object. Therefore, we could bump
225255
// `next_seq` as far as the message is put on the mpsc
@@ -236,16 +266,28 @@ impl<S: AsyncRead + AsyncWrite + Send + 'static + Unpin> ServerConn<S> {
236266
}
237267
}
238268
},
239-
Err(err) => break (
240-
next,
241-
Err::<(), anyhow::Error>(err.into()).context(
242-
format!(
243-
"{log_id}: deserialize message with M = {}",
244-
type_name::<M>(),
245-
)
246-
),
247-
false
248-
),
269+
Err(err) => {
270+
// Track deserialization error for this channel pair
271+
metrics::CHANNEL_ERRORS.add(
272+
1,
273+
hyperactor_telemetry::kv_pairs!(
274+
"source" => self.source.to_string(),
275+
"dest" => self.dest.to_string(),
276+
"session_id" => session_id.to_string(),
277+
"error_type" => metrics::ChannelErrorType::DeserializeError.as_str(),
278+
),
279+
);
280+
break (
281+
next,
282+
Err::<(), anyhow::Error>(err.into()).context(
283+
format!(
284+
"{log_id}: deserialize message with M = {}",
285+
type_name::<M>(),
286+
)
287+
),
288+
false
289+
)
290+
},
249291
}
250292
},
251293
}
@@ -490,11 +532,13 @@ where
490532
};
491533

492534
if let Err(ref err) = res {
493-
metrics::CHANNEL_CONNECTION_ERRORS.add(
535+
metrics::CHANNEL_ERRORS.add(
494536
1,
495537
hyperactor_telemetry::kv_pairs!(
496538
"transport" => dest.transport().to_string(),
497539
"error" => err.to_string(),
540+
"error_type" => metrics::ChannelErrorType::ConnectionError.as_str(),
541+
"dest" => dest.to_string(),
498542
),
499543
);
500544

@@ -513,12 +557,13 @@ where
513557
});
514558
}
515559
Err(err) => {
516-
metrics::CHANNEL_CONNECTION_ERRORS.add(
560+
metrics::CHANNEL_ERRORS.add(
517561
1,
518562
hyperactor_telemetry::kv_pairs!(
519563
"transport" => listener_channel_addr.transport().to_string(),
520564
"operation" => "accept",
521565
"error" => err.to_string(),
566+
"error_type" => metrics::ChannelErrorType::ConnectionError.as_str(),
522567
),
523568
);
524569

hyperactor/src/metrics.rs

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,31 @@ use hyperactor_telemetry::declare_static_histogram;
1515
use hyperactor_telemetry::declare_static_timer;
1616
use hyperactor_telemetry::declare_static_up_down_counter;
1717

18+
/// Error types for channel-related errors. Only used for telemetry.
19+
#[derive(Debug, Clone, Copy)]
20+
pub enum ChannelErrorType {
21+
/// Error occurred while sending a message.
22+
SendError,
23+
/// Error occurred while connecting to a channel.
24+
ConnectionError,
25+
/// Error occurred while deframing a message.
26+
DeframeError,
27+
/// Error occurred while deserializing a message.
28+
DeserializeError,
29+
}
30+
31+
impl ChannelErrorType {
32+
/// Returns the string representation of the error type.
33+
pub fn as_str(&self) -> &'static str {
34+
match self {
35+
ChannelErrorType::SendError => "send_error",
36+
ChannelErrorType::ConnectionError => "connection_error",
37+
ChannelErrorType::DeframeError => "deframe_error",
38+
ChannelErrorType::DeserializeError => "deserialize_error",
39+
}
40+
}
41+
}
42+
1843
// MAILBOX
1944
// Tracks messages that couldn't be delivered to their destination and were returned as undeliverable
2045
declare_static_counter!(
@@ -44,16 +69,23 @@ declare_static_timer!(
4469
declare_static_histogram!(REMOTE_MESSAGE_SEND_SIZE, "channel.remote_message_send_size");
4570
// Tracks the number of new channel connections established (client and server)
4671
declare_static_counter!(CHANNEL_CONNECTIONS, "channel.connections");
47-
// Tracks errors that occur when establishing channel connections
48-
declare_static_counter!(CHANNEL_CONNECTION_ERRORS, "channel.connection_errors");
4972
// Tracks the number of channel reconnection attempts
5073
declare_static_counter!(CHANNEL_RECONNECTIONS, "channel.reconnections");
74+
// Tracks errors for each channel pair
75+
declare_static_counter!(CHANNEL_ERRORS, "channel.errors");
5176
// Tracks the number of NetRx encountering full buffer, i.e. its mpsc channel.
5277

5378
// This metric counts how often the NetRx→client mpsc channel remains full,
5479
// incrementing once per CHANNEL_NET_RX_BUFFER_FULL_CHECK_INTERVAL while blocked.
5580
declare_static_counter!(CHANNEL_NET_RX_BUFFER_FULL, "channel.net_rx_buffer_full");
5681

82+
// Tracks throughput (bytes sent)
83+
declare_static_counter!(CHANNEL_THROUGHPUT_BYTES, "channel.throughput.bytes");
84+
// Tracks throughput (message count)
85+
declare_static_counter!(CHANNEL_THROUGHPUT_MESSAGES, "channel.throughput.messages");
86+
// Tracks message latency for each channel pair in microseconds
87+
declare_static_histogram!(CHANNEL_LATENCY_MICROS, "channel.latency.us");
88+
5789
// PROC MESH
5890
// Tracks the number of active processes in the process mesh
5991
declare_static_counter!(PROC_MESH_ALLOCATION, "proc_mesh.active_procs");

0 commit comments

Comments
 (0)