
Commit 59e923d

netstacklat: Add sanity checks for TCP HoL blocking filter
The logic for excluding samples from TCP reads that may have been delayed by HoL blocking relies on reading a number of fields from the TCP socket outside of the socket lock. This may be prone to errors due to the socket state being updated elsewhere in the kernel while our eBPF program is running. To reduce the risk that data races cause issues for our HoL detection logic, add a number of sanity checks on the read values.

The most problematic of the read fields is ooo_last_skb, as it is a pointer to another skb. This pointer is only valid as long as the out_of_order_queue is non-empty. Due to a data race, we may check the ooo-queue while there are still SKBs in it, then the kernel clears the ooo-queue, and then we attempt to read the contents of the ooo_last_skb SKB, which may at that point have been freed and/or recycled. This may result in an incorrect value being used for the sequence limit that excludes future reads of ooo-segments. A faulty sequence limit may either let reads of HoL-blocked segments be included or exclude an unnecessarily large amount of future reads (up to 2 GB).

To reduce the risk that garbage data from an invalid SKB is used, introduce two sanity checks for end_seq in ooo_last_skb. First, check if the sequence number is zero; if so, assume it is invalid (even though zero can be a valid sequence number). While we will get an error code if reading the data from this SKB fails altogether, we may still successfully read from a no-longer-valid SKB, in which case there is a high risk the data will have been zeroed. If it is non-zero, also check that it is within the current receive window, and clamp it to the receive window if not.

Also introduce sanity checks for rcv_nxt and copied_seq in the tcp_sock, ensuring that they increase monotonically. To enable this, the last read (sane) value is tracked together with the ooo-state in the socket storage. For these fields, do not consider sequence 0 invalid, as they should be valid (although possibly updated in between) as long as reading them succeeds; failure to read is detected through the error code returned by bpf_core_read().

Skip adding similar monotonic-growth validity checks for the rcv_wup field, which may now need to be probed to compute the receive window, as a compromise to avoid unconditionally probing and updating its state every time. For the rcv_wnd field, also needed to calculate the receive window, I am not aware of any simple validity checks to perform.

Signed-off-by: Simon Sundberg <simon.sundberg@kau.se>
Parent: dff6bc5
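Editor's note: all of the sequence checks in this change compare 32-bit TCP sequence numbers that wrap around, so a plain < comparison would misbehave near the wrap point. The diff calls a u32_lt() helper whose definition is not part of these hunks; a minimal sketch of what such a helper presumably looks like, following the serial-number arithmetic of the kernel's own before()/after() macros:

/* Hypothetical sketch of the u32_lt() helper the diff relies on; the
 * actual definition lives elsewhere in netstacklat.
 */
static inline bool u32_lt(u32 a, u32 b)
{
        /* Signed difference: negative means a precedes b in sequence
         * space, even across a 32-bit wraparound.
         */
        return (s32)(a - b) < 0;
}

With such a definition, checks like u32_lt(window, rcv_nxt) and u32_lt(rcv_nxt, ooo_range->last_rcv_nxt) stay correct when sequence numbers wrap past 2^32.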

netstacklat/netstacklat.bpf.c

Lines changed: 123 additions & 33 deletions
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 #include "vmlinux_local.h"
 #include <linux/bpf.h>
+#include <linux/errno.h>

 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
@@ -46,6 +47,8 @@ struct sk_buff___old {
 struct tcp_sock_ooo_range {
         u32 prev_n_ooopkts;
         u32 ooo_seq_end;
+        u32 last_rcv_nxt;
+        u32 last_copied_seq;
         /* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */
         bool active;
 };
@@ -330,10 +333,43 @@ static bool filter_min_sockqueue_len(struct sock *sk)
         return false;
 }

-static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
+/* Get the current receive window end sequence for tp.
+ * In the kernel, receive window checks are done against
+ * tp->rcv_nxt + tcp_receive_window(tp). This function should give a comparable
+ * result, i.e. rcv_wup + rcv_wnd or rcv_nxt, whichever is higher.
+ */
+static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq)
 {
+        u32 rcv_wup, rcv_wnd, window = 0;
+        int err;
+
+        err = bpf_core_read(&rcv_wup, sizeof(rcv_wup), &tp->rcv_wup);
+        if (err) {
+                bpf_printk("failed to read tcp_sock->rcv_wup, err=%d", err);
+                goto exit;
+        }
+
+        err = bpf_core_read(&rcv_wnd, sizeof(rcv_wnd), &tp->rcv_wnd);
+        if (err) {
+                bpf_printk("failed to read tcp_sock->rcv_wnd, err=%d", err);
+                goto exit;
+        }
+
+        window = rcv_wup + rcv_wnd;
+        if (u32_lt(window, rcv_nxt))
+                window = rcv_nxt;
+
+exit:
+        *seq = window;
+        return err;
+}
+
+static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 rcv_nxt,
+                                        struct tcp_sock_ooo_range *ooo_range,
+                                        u32 *seq)
+{
+        u32 cur_rcv_window, max_seq = 0;
         struct tcp_skb_cb cb;
-        u32 max_seq = 0;
         int err = 0;

         if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) {
@@ -342,57 +378,99 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
                 * receive queue. Current rcv_nxt must therefore be ahead
                 * of all ooo-segments that have arrived until now.
                 */
-                err = bpf_core_read(&max_seq, sizeof(max_seq), &tp->rcv_nxt);
-                if (err)
-                        bpf_printk("failed to read tcp_sock->rcv_nxt, err=%d",
-                                   err);
-        } else {
-                /*
-                 * Some ooo-segments currently in ooo-queue
-                 * Max out-of-order seq is given by the seq_end of the tail
-                 * skb in the ooo-queue.
-                 */
-                err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb);
-                if (err)
-                        bpf_printk(
-                                "failed to read tcp_sock->ooo_last_skb->cb, err=%d",
-                                err);
-                max_seq = cb.end_seq;
+                max_seq = rcv_nxt;
+                goto exit;
+        }
+
+        /*
+         * Some ooo-segments currently in ooo-queue
+         * Max out-of-order seq is given by the seq_end of the tail
+         * skb in the ooo-queue.
+         */
+        err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb);
+        if (err) {
+                bpf_printk("failed to read tcp_sock->ooo_last_skb->cb, err=%d",
+                           err);
+                goto exit;
         }

+        // Sanity check - ooo_last_skb->cb.end_seq within the receive window?
+        err = get_current_rcv_wnd_seq(tp, rcv_nxt, &cur_rcv_window);
+        if (err)
+                goto exit;
+
+        if (cb.end_seq == 0 || u32_lt(cur_rcv_window, cb.end_seq))
+                max_seq = cur_rcv_window;
+        else
+                max_seq = cb.end_seq;
+
+exit:
         *seq = max_seq;
         return err;
 }

-static bool tcp_read_in_ooo_range(struct tcp_sock *tp,
+static bool tcp_read_in_ooo_range(struct tcp_sock *tp, u32 copied_seq,
                                   struct tcp_sock_ooo_range *ooo_range)
 {
-        u32 read_seq;
-        int err;
-
         if (!ooo_range->active)
                 return false;

-        err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq);
-        if (err) {
-                bpf_printk("failed to read tcp_sock->copied_seq, err=%d", err);
-                return true; // Assume we may be in ooo-range
-        }
-
-        if (u32_lt(ooo_range->ooo_seq_end, read_seq)) {
+        if (u32_lt(ooo_range->ooo_seq_end, copied_seq)) {
                 ooo_range->active = false;
                 return false;
         } else {
                 return true;
         }
 }

+static int get_and_validate_rcvnxt(struct tcp_sock *tp,
+                                   struct tcp_sock_ooo_range *ooo_range,
+                                   u32 *rcvnxt)
+{
+        u32 rcv_nxt = 0;
+        int err;
+
+        err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt);
+        if (err || (ooo_range->last_rcv_nxt &&
+                    u32_lt(rcv_nxt, ooo_range->last_rcv_nxt))) {
+                bpf_printk("failed to read valid tcp_sock->rcv_nxt, err=%d",
+                           err);
+                err = err ?: -ERANGE;
+        } else {
+                ooo_range->last_rcv_nxt = rcv_nxt;
+        }
+
+        *rcvnxt = rcv_nxt;
+        return err;
+}
+
+static int get_and_validate_copiedseq(struct tcp_sock *tp,
+                                      struct tcp_sock_ooo_range *ooo_range,
+                                      u32 *copiedseq)
+{
+        u32 copied_seq = 0;
+        int err;
+
+        err = bpf_core_read(&copied_seq, sizeof(copied_seq), &tp->copied_seq);
+        if (err || (ooo_range->last_copied_seq &&
+                    u32_lt(copied_seq, ooo_range->last_copied_seq))) {
+                bpf_printk("failed to read valid tcp_sock->copied_seq, err=%d",
+                           err);
+                err = err ?: -ERANGE;
+        } else {
+                ooo_range->last_copied_seq = copied_seq;
+        }
+
+        *copiedseq = copied_seq;
+        return err;
+}
+
 static bool tcp_read_maybe_holblocked(struct sock *sk)
 {
+        u32 n_ooopkts, rcv_nxt, copied_seq, nxt_seq;
         struct tcp_sock_ooo_range *ooo_range;
+        int err, err_rcvnxt, err_copiedseq;
         struct tcp_sock *tp = tcp_sk(sk);
-        u32 n_ooopkts, nxt_seq;
-        int err;

         err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack);
         if (err) {
@@ -412,18 +490,30 @@ static bool tcp_read_maybe_holblocked(struct sock *sk)
                 return true; // Assume we may be in ooo-range
         }

+        /* rcv_nxt and copied_seq may not be needed, but to ensure we always
+         * update our tracked state for them, read, sanity check and update
+         * both their values here. Errors are only checked for in the paths
+         * where the values are actually needed.
+         */
+        err_rcvnxt = get_and_validate_rcvnxt(tp, ooo_range, &rcv_nxt);
+        err_copiedseq = get_and_validate_copiedseq(tp, ooo_range, &copied_seq);
+
         // Increase in ooo-packets since last - figure out next safe seq
         if (n_ooopkts > ooo_range->prev_n_ooopkts) {
                 ooo_range->prev_n_ooopkts = n_ooopkts;
-                err = current_max_possible_ooo_seq(tp, &nxt_seq);
+                err = err_rcvnxt ?:
+                        current_max_possible_ooo_seq(tp, rcv_nxt,
+                                                     ooo_range, &nxt_seq);
                 if (!err) {
                         ooo_range->ooo_seq_end = nxt_seq;
                         ooo_range->active = true;
                 }
+
                 return true;
         }

-        return tcp_read_in_ooo_range(tp, ooo_range);
+        return err_copiedseq ? true :
+                tcp_read_in_ooo_range(tp, copied_seq, ooo_range);
 }

 static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
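Editor's note on the "up to 2 GB" figure from the commit message: under serial-number comparison, any value less than 2^31 bytes ahead of copied_seq counts as "not yet passed", so a garbage ooo_seq_end can keep the filter discarding samples until copied_seq has advanced by nearly 2 GiB. A small user-space sketch (hypothetical, using the u32_lt definition assumed above) illustrates the worst case:

#include <stdint.h>
#include <stdio.h>

/* Same wraparound-aware comparison as assumed above */
static int u32_lt(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;
}

int main(void)
{
        uint32_t copied_seq = 1000;
        /* Garbage end_seq at the worst case: 2^31 - 1 bytes ahead */
        uint32_t bogus_end = copied_seq + 0x7fffffffu;

        /* tcp_read_in_ooo_range() stays active until copied_seq passes
         * ooo_seq_end, i.e. for almost 2 GiB of future reads.
         * Prints 1: the read is still (wrongly) flagged as possibly
         * HoL-blocked. */
        printf("read still flagged as ooo: %d\n",
               !u32_lt(bogus_end, copied_seq));
        return 0;
}

This is exactly the failure mode that the zero check and receive-window clamp on cb.end_seq are meant to bound.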
