Skip to content

Commit 8798bdf

Browse files
committed
netstacklat: Exclude TCP reads for HOL blocked segments
The 'tcp-socket-read' currently reports the latency for the skb containing the last TCP segment read from the socket. However, this segment might have been head of line (HOL) blocked by a previous segment missing. In this case, netstacklat's reported latency will include HOL blocking periods that are dependent on external factors (such as network packet loss and network latency, which impact retransmission time). As netstacklat is primarily intended to identify issues within the local host (in the network stack or receiving applications), filter out any socket reads where the last read SKB might have experienced HOL-blocking. To exclude HOL-blocked reads, track whenever a TCP segment arrives out-of-order (ooo), i.e. ahead of the expected next sequence. These segments are kept in a separate ooo-queue and later merged back into the socket receive queue once the missing segment gap has been filled. The ooo-segments are therefore the only segments that may experience HOL blocking (as non ooo-segments are immediately added to the socket receive queue). Specifically, track the right edge (the maximum) of the ooo sequence range. If the final byte of any socket read is lower than the right edge of the ooo sequence range, filter it out as potentially HOL blocked. If the last read byte is ahead of the ooo-range, the last byte must have been read from a segment that arrived in-order and therefore hasn't experienced HOL-blocking, so it's safe to include the latency measurement. Furthermore, also invalidate the ooo-range when the last read byte is ahead to prevent an old ooo-range value sticking around until the sequence number wraps around. There are some scenarios that may cause this ooo-filtering to fail. - If the program is started in the middle of an ongoing flow, there may be ooo segments that arrived before we could record them in the tracked ooo-range. This may cause the latency for some initial HOL-blocked reads to be reported. 
- If multiple reads are done to the socket concurrently, we may not correctly track the last read byte. The kernel does not keep a lock on the TCP socket at the time our hooked function tcp_recv_timestamp() runs. If two reads are done in parallel, it's therefore possible that for both reads we will check the last read byte (tcp_sock.copied_seq) after the second read has updated it. We may then incorrectly conclude that the first read was ahead of the ooo-range when it was not, and record its latency when we should have excluded it. In practice I believe this issue should be quite rare, as most applications will probably not attempt to perform multiple concurrent reads to a single connected TCP socket in parallel (as then you cannot know which part of the payload the parallel reads will return). - The tracked ooo-range may concurrently be updated at tcp_data_queue_ofo() and tcp_recv_timestamp(), leading to inconsistent state. The last of these issues will be addressed in a later commit that adds bpf spin locks. Signed-off-by: Simon Sundberg <simon.sundberg@kau.se>
1 parent cc17c68 commit 8798bdf

File tree

3 files changed

+175
-1
lines changed

3 files changed

+175
-1
lines changed

headers/vmlinux/vmlinux_net.h

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,35 @@ struct sk_buff {
161161
struct skb_ext *extensions;
162162
};
163163

164+
/*
 * Minimal mirror of the kernel's struct tcp_skb_cb (the TCP control block
 * stored in skb->cb). Only the members this project accesses need to exist;
 * with BPF CO-RE, field offsets are relocated against the running kernel at
 * load time, so this does not have to replicate the kernel's exact layout.
 */
struct tcp_skb_cb {
	__u32 seq;	/* first sequence number covered by this segment */
	/* sequence number one past this segment's data (used as the
	 * right edge when tracking out-of-order segments) */
	__u32 end_seq;
	union {
		struct {
			u16 tcp_gso_segs;
			u16 tcp_gso_size;
		};
	};
	__u8 tcp_flags;
	__u8 sacked;
	__u8 ip_dsfield;
	__u8 txstamp_ack : 1;
	__u8 eor : 1;
	__u8 has_rxtstamp : 1;
	__u8 unused : 5;
	__u32 ack_seq;
	union {
		struct {
			__u32 is_app_limited : 1;
			__u32 delivered_ce : 20;
			__u32 unused : 11;
			__u32 delivered;
			u64 first_tx_mstamp;
			u64 delivered_mstamp;
		} tx;
	};
};
192+
164193
struct nf_conn {
165194
unsigned long status;
166195
};
@@ -202,4 +231,51 @@ struct sock {
202231
u32 sk_rx_dst_cookie;
203232
};
204233

234+
/*
 * Minimal mirror of the kernel's struct inet_sock; only the embedded
 * struct sock is needed so container_of() in tcp_sk() resolves.
 */
struct inet_sock {
	struct sock sk;
};
237+
238+
/*
 * Minimal mirror of the kernel's struct inet_connection_sock; provides the
 * sock -> tcp_sock containment chain (inet_conn.icsk_inet.sk).
 */
struct inet_connection_sock {
	struct inet_sock icsk_inet;
};
241+
242+
/*
 * Minimal mirror of the kernel's struct tcp_sock. CO-RE relocates the field
 * offsets at load time, so only the members read by this project (notably
 * copied_seq) strictly need to match the kernel's names; the zero-length
 * __cacheline_group_* markers are kept to match the kernel definition.
 */
struct tcp_sock {
	struct inet_connection_sock inet_conn;
	__u8 __cacheline_group_begin__tcp_sock_read_tx[0];
	u32 max_window;
	u32 rcv_ssthresh;
	u32 reordering;
	u32 notsent_lowat;
	u16 gso_segs;
	struct sk_buff *lost_skb_hint;
	struct sk_buff *retransmit_skb_hint;
	__u8 __cacheline_group_end__tcp_sock_read_tx[0];
	__u8 __cacheline_group_begin__tcp_sock_read_txrx[0];
	u32 tsoffset;
	u32 snd_wnd;
	u32 mss_cache;
	u32 snd_cwnd;
	u32 prr_out;
	u32 lost_out;
	u32 sacked_out;
	u16 tcp_header_len;
	u8 scaling_ratio;
	u8 chrono_type : 2;
	u8 repair : 1;
	u8 tcp_usec_ts : 1;
	u8 is_sack_reneg : 1;
	u8 is_cwnd_limited : 1;
	__u8 __cacheline_group_end__tcp_sock_read_txrx[0];
	__u8 __cacheline_group_begin__tcp_sock_read_rx[0];
	/* sequence number of the next byte the application will read;
	 * read via bpf_core_read() to find the last byte already read */
	u32 copied_seq;
	u32 rcv_tstamp;
	u32 snd_wl1;
	u32 tlp_high_seq;
	u32 rttvar_us;
	u32 retrans_out;
	u16 advmss;
	u16 urg_data;
	u32 lost;
};
280+
205281
#endif /* __VMLINUX_NET_H__ */

netstacklat/netstacklat.bpf.c

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111

1212
#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
1313

14+
// Mimic macros from /include/net/tcp.h
// tcp_sk(): recover the enclosing tcp_sock from a pointer to its embedded
// struct sock (sock -> inet_sock -> inet_connection_sock -> tcp_sock)
#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
// TCP_SKB_CB(): view an skb's control-block scratch area as a tcp_skb_cb
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
17+
1418
char LICENSE[] SEC("license") = "GPL";
1519

1620

@@ -38,6 +42,12 @@ struct sk_buff___old {
3842
__u8 mono_delivery_time: 1;
3943
} __attribute__((preserve_access_index));
4044

45+
/*
 * Per-socket record of the right edge (maximum end_seq) seen among TCP
 * segments that arrived out-of-order. Reads ending at or before this edge
 * may have been HOL-blocked and are excluded from latency reporting.
 */
struct tcp_sock_ooo_range {
	u32 ooo_seq_end;
	/* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */
	bool active;
};
50+
4151
struct {
4252
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
4353
__uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64);
@@ -66,6 +76,22 @@ struct {
6676
__type(value, u64);
6777
} netstack_cgroupfilter SEC(".maps");
6878

79+
/* Socket-local storage: one tcp_sock_ooo_range per tracked TCP socket */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tcp_sock_ooo_range);
} netstack_tcp_ooo_range SEC(".maps");
85+
86+
/*
87+
* Is a < b considering u32 wrap around?
88+
* Based on the before() function in /include/net/tcp.h
89+
*/
90+
static bool u32_lt(u32 a, u32 b)
91+
{
92+
return (s32)(a - b) < 0;
93+
}
94+
6995
static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
7096
{
7197
u64 zero = 0;
@@ -328,6 +354,60 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
328354
record_latency_since(tstamp, &key);
329355
}
330356

357+
static void tcp_update_ooo_range(struct sock *sk, struct sk_buff *skb)
358+
{
359+
struct tcp_sock_ooo_range *tp_ooo_range;
360+
361+
tp_ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL,
362+
BPF_SK_STORAGE_GET_F_CREATE);
363+
if (!tp_ooo_range)
364+
return;
365+
366+
if (tp_ooo_range->active) {
367+
if (u32_lt(tp_ooo_range->ooo_seq_end, TCP_SKB_CB(skb)->end_seq))
368+
tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq;
369+
} else {
370+
tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq;
371+
tp_ooo_range->active = true;
372+
}
373+
374+
}
375+
376+
static bool tcp_read_in_ooo_range(struct sock *sk)
377+
{
378+
struct tcp_sock_ooo_range *tp_ooo_range;
379+
struct tcp_sock *tp = tcp_sk(sk);
380+
u32 last_read_seq;
381+
int err;
382+
383+
tp_ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, 0);
384+
if (!tp_ooo_range)
385+
/* no recorded ooo-range for sock, so cannot be in ooo-range */
386+
return false;
387+
388+
if (!tp_ooo_range->active)
389+
return false;
390+
391+
err = bpf_core_read(&last_read_seq, sizeof(last_read_seq), &tp->copied_seq);
392+
if (err) {
393+
/*
394+
* Shouldn't happen.
395+
* Should probably emit some warning if reading copied_seq
396+
* unexpectedly fails. Assume not in ooo-range to avoid
397+
* systematically filtering out ALL values if this does happen.
398+
*/
399+
bpf_printk("failed to read tcp_sock->copied_seq: err=%d", err);
400+
return false;
401+
}
402+
403+
if (u32_lt(tp_ooo_range->ooo_seq_end, last_read_seq)) {
404+
tp_ooo_range->active = false;
405+
return false;
406+
} else {
407+
return true;
408+
}
409+
}
410+
331411
SEC("fentry/ip_rcv_core")
332412
int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block,
333413
void *tp, void *res, bool compat_mode)
@@ -393,6 +473,11 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk,
393473
struct scm_timestamping_internal *tss)
394474
{
395475
struct timespec64 *ts = &tss->ts[0];
476+
477+
/* skip if preceding sock read ended in ooo-range */
478+
if (tcp_read_in_ooo_range(sk))
479+
return 0;
480+
396481
record_socket_latency(sk, NULL,
397482
(ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec,
398483
NETSTACKLAT_HOOK_TCP_SOCK_READ);
@@ -407,3 +492,15 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb,
407492
NETSTACKLAT_HOOK_UDP_SOCK_READ);
408493
return 0;
409494
}
495+
496+
/* This program should also be disabled if tcp-socket-read is disabled */
497+
SEC("fentry/tcp_data_queue_ofo")
498+
int BPF_PROG(netstacklat_tcp_data_queue_ofo, struct sock *sk,
499+
struct sk_buff *skb)
500+
{
501+
if (!filter_network(skb, sk))
502+
return 0;
503+
504+
tcp_update_ooo_range(sk, skb);
505+
return 0;
506+
}

netstacklat/netstacklat.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,8 @@ static void hook_to_progs(struct hook_prog_collection *progs,
258258
break;
259259
case NETSTACKLAT_HOOK_TCP_SOCK_READ:
260260
progs->progs[0] = obj->progs.netstacklat_tcp_recv_timestamp;
261-
progs->nprogs = 1;
261+
progs->progs[1] = obj->progs.netstacklat_tcp_data_queue_ofo;
262+
progs->nprogs = 2;
262263
break;
263264
case NETSTACKLAT_HOOK_UDP_SOCK_READ:
264265
progs->progs[0] = obj->progs.netstacklat_skb_consume_udp;

0 commit comments

Comments
 (0)