
Commit e08ca1e

bpf: udp: Implement batching for sockets iterator
JIRA: https://issues.redhat.com/browse/RHEL-65787

commit c96dac8
Author: Aditi Ghag <aditi.ghag@isovalent.com>
Date:   Fri May 19 22:51:53 2023 +0000

    bpf: udp: Implement batching for sockets iterator

    Batch UDP sockets from BPF iterator that allows for overlapping locking
    semantics in BPF/kernel helpers executed in BPF programs. This facilitates
    BPF socket destroy kfunc (introduced by follow-up patches) to execute from
    BPF iterator programs.

    Previously, BPF iterators acquired the sock lock and sockets hash table
    bucket lock while executing BPF programs. This prevented BPF helpers that
    again acquire these locks from being executed from BPF iterators. With the
    batching approach, we acquire a bucket lock, batch all the bucket sockets,
    and then release the bucket lock. This enables BPF or kernel helpers to
    skip sock locking when invoked in the supported BPF contexts.

    The batching logic is similar to the logic implemented in TCP iterator:
    https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@fb.com/.

    Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
    Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
    Link: https://lore.kernel.org/r/20230519225157.760788-6-aditi.ghag@isovalent.com
    Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
1 parent cde5da2 commit e08ca1e
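
For reference, here is a minimal, hypothetical sketch (not part of this commit) of the kind of BPF iterator program this batching enables: it runs per socket in the seq_show context and calls the bpf_sock_destroy() kfunc that the follow-up patches mentioned above introduce. The program name, the port-9999 filter, and the exact kfunc prototype are assumptions for illustration only.

/* Hypothetical BPF-side sketch (not from this commit). Assumes the
 * bpf_sock_destroy() kfunc added by follow-up patches; the prototype
 * below is illustrative.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

int bpf_sock_destroy(struct sock_common *sk) __ksym;

SEC("iter/udp")
int destroy_udp_port_9999(struct bpf_iter__udp *ctx)
{
	struct udp_sock *udp_sk = ctx->udp_sk;
	struct sock *sk = (struct sock *)udp_sk;

	if (!sk)
		return 0;

	/* Hypothetical filter: destroy sockets bound to local port 9999
	 * (skc_num is the local port in host byte order).
	 */
	if (sk->__sk_common.skc_num == 9999)
		bpf_sock_destroy((struct sock_common *)sk);

	return 0;
}

char _license[] SEC("license") = "GPL";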

1 file changed: +199 −6 lines

net/ipv4/udp.c

Lines changed: 199 additions & 6 deletions
@@ -3163,6 +3163,143 @@ struct bpf_iter__udp {
 	int bucket __aligned(8);
 };
 
+struct bpf_udp_iter_state {
+	struct udp_iter_state state;
+	unsigned int cur_sk;
+	unsigned int end_sk;
+	unsigned int max_sk;
+	int offset;
+	struct sock **batch;
+	bool st_bucket_done;
+};
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz);
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct udp_iter_state *state = &iter->state;
+	struct net *net = seq_file_net(seq);
+	struct udp_table *udptable;
+	unsigned int batch_sks = 0;
+	bool resized = false;
+	struct sock *sk;
+
+	/* The current batch is done, so advance the bucket. */
+	if (iter->st_bucket_done) {
+		state->bucket++;
+		iter->offset = 0;
+	}
+
+	udptable = udp_get_table_seq(seq, net);
+
+again:
+	/* New batch for the next bucket.
+	 * Iterate over the hash table to find a bucket with sockets matching
+	 * the iterator attributes, and return the first matching socket from
+	 * the bucket. The remaining matched sockets from the bucket are batched
+	 * before releasing the bucket lock. This allows BPF programs that are
+	 * called in seq_show to acquire the bucket lock if needed.
+	 */
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+	iter->st_bucket_done = false;
+	batch_sks = 0;
+
+	for (; state->bucket <= udptable->mask; state->bucket++) {
+		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
+
+		if (hlist_empty(&hslot2->head)) {
+			iter->offset = 0;
+			continue;
+		}
+
+		spin_lock_bh(&hslot2->lock);
+		udp_portaddr_for_each_entry(sk, &hslot2->head) {
+			if (seq_sk_match(seq, sk)) {
+				/* Resume from the last iterated socket at the
+				 * offset in the bucket before iterator was stopped.
+				 */
+				if (iter->offset) {
+					--iter->offset;
+					continue;
+				}
+				if (iter->end_sk < iter->max_sk) {
+					sock_hold(sk);
+					iter->batch[iter->end_sk++] = sk;
+				}
+				batch_sks++;
+			}
+		}
+		spin_unlock_bh(&hslot2->lock);
+
+		if (iter->end_sk)
+			break;
+
+		/* Reset the current bucket's offset before moving to the next bucket. */
+		iter->offset = 0;
+	}
+
+	/* All done: no batch made. */
+	if (!iter->end_sk)
+		return NULL;
+
+	if (iter->end_sk == batch_sks) {
+		/* Batching is done for the current bucket; return the first
+		 * socket to be iterated from the batch.
+		 */
+		iter->st_bucket_done = true;
+		goto done;
+	}
+	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
+		resized = true;
+		/* After allocating a larger batch, retry one more time to grab
+		 * the whole bucket.
+		 */
+		state->bucket--;
+		goto again;
+	}
+done:
+	return iter->batch[0];
+}
+
+static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct sock *sk;
+
+	/* Whenever seq_next() is called, the iter->cur_sk is
+	 * done with seq_show(), so unref the iter->cur_sk.
+	 */
+	if (iter->cur_sk < iter->end_sk) {
+		sock_put(iter->batch[iter->cur_sk++]);
+		++iter->offset;
+	}
+
+	/* After updating iter->cur_sk, check if there are more sockets
+	 * available in the current bucket batch.
+	 */
+	if (iter->cur_sk < iter->end_sk)
+		sk = iter->batch[iter->cur_sk];
+	else
+		/* Prepare a new batch. */
+		sk = bpf_iter_udp_batch(seq);
+
+	++*pos;
+	return sk;
+}
+
+static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* bpf iter does not support lseek, so it always
+	 * continue from where it was stop()-ped.
+	 */
+	if (*pos)
+		return bpf_iter_udp_batch(seq);
+
+	return SEQ_START_TOKEN;
+}
+
 static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
 			     struct udp_sock *udp_sk, uid_t uid, int bucket)
 {
@@ -3183,18 +3320,37 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
 	struct bpf_prog *prog;
 	struct sock *sk = v;
 	uid_t uid;
+	int ret;
 
 	if (v == SEQ_START_TOKEN)
 		return 0;
 
+	lock_sock(sk);
+
+	if (unlikely(sk_unhashed(sk))) {
+		ret = SEQ_SKIP;
+		goto unlock;
+	}
+
 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
 	meta.seq = seq;
 	prog = bpf_iter_get_info(&meta, false);
-	return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+	ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+
+unlock:
+	release_sock(sk);
+	return ret;
+}
+
+static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
+{
+	while (iter->cur_sk < iter->end_sk)
+		sock_put(iter->batch[iter->cur_sk++]);
 }
 
 static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 {
+	struct bpf_udp_iter_state *iter = seq->private;
 	struct bpf_iter_meta meta;
 	struct bpf_prog *prog;
 
@@ -3205,12 +3361,15 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 		(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
 	}
 
-	udp_seq_stop(seq, v);
+	if (iter->cur_sk < iter->end_sk) {
+		bpf_iter_udp_put_batch(iter);
+		iter->st_bucket_done = false;
+	}
 }
 
 static const struct seq_operations bpf_iter_udp_seq_ops = {
-	.start		= udp_seq_start,
-	.next		= udp_seq_next,
+	.start		= bpf_iter_udp_seq_start,
+	.next		= bpf_iter_udp_seq_next,
 	.stop		= bpf_iter_udp_seq_stop,
 	.show		= bpf_iter_udp_seq_show,
 };
@@ -3350,21 +3509,55 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = {
 DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
 		     struct udp_sock *udp_sk, uid_t uid, int bucket)
 
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz)
+{
+	struct sock **new_batch;
+
+	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+				   GFP_USER | __GFP_NOWARN);
+	if (!new_batch)
+		return -ENOMEM;
+
+	bpf_iter_udp_put_batch(iter);
+	kvfree(iter->batch);
+	iter->batch = new_batch;
+	iter->max_sk = new_batch_sz;
+
+	return 0;
+}
+
+#define INIT_BATCH_SZ 16
+
 static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
 {
-	return bpf_iter_init_seq_net(priv_data, aux);
+	struct bpf_udp_iter_state *iter = priv_data;
+	int ret;
+
+	ret = bpf_iter_init_seq_net(priv_data, aux);
+	if (ret)
+		return ret;
+
+	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
+	if (ret)
+		bpf_iter_fini_seq_net(priv_data);
+
+	return ret;
 }
 
 static void bpf_iter_fini_udp(void *priv_data)
 {
+	struct bpf_udp_iter_state *iter = priv_data;
+
 	bpf_iter_fini_seq_net(priv_data);
+	kvfree(iter->batch);
 }
 
 static const struct bpf_iter_seq_info udp_seq_info = {
 	.seq_ops		= &bpf_iter_udp_seq_ops,
 	.init_seq_private	= bpf_iter_init_udp,
 	.fini_seq_private	= bpf_iter_fini_udp,
-	.seq_priv_size		= sizeof(struct udp_iter_state),
+	.seq_priv_size		= sizeof(struct bpf_udp_iter_state),
 };
 
 static struct bpf_iter_reg udp_reg_info = {
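
As a usage note, here is a hedged userspace sketch (also not part of this commit) of how such an iterator is typically driven with libbpf: every read() on the iterator fd walks the seq_file, i.e. the bpf_iter_udp_seq_start()/_next()/_show() path added above. The object and program names are hypothetical.

/* Hypothetical libbpf loader sketch; object/program names are made up. */
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;
	char buf[4096];
	int iter_fd;
	ssize_t len;

	obj = bpf_object__open_file("udp_iter.bpf.o", NULL);
	if (!obj || bpf_object__load(obj))
		return 1;

	prog = bpf_object__find_program_by_name(obj, "destroy_udp_port_9999");
	if (!prog)
		return 1;

	link = bpf_program__attach_iter(prog, NULL);
	if (!link)
		return 1;

	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0)
		return 1;

	/* Each read() advances the seq_file iteration: the kernel batches one
	 * bucket's sockets and runs the BPF program once per batched socket.
	 */
	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, len, stdout);

	close(iter_fd);
	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
}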
