
Commit 06006de

bpf: Add verifier support for timed may_goto
JIRA: https://issues.redhat.com/browse/RHEL-78202

commit e723608
Author: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon Mar 3 16:32:38 2025 -0800

bpf: Add verifier support for timed may_goto

Implement support in the verifier for replacing the may_goto implementation, moving from a counter-based approach to one which samples time on the local CPU to allow a bigger loop bound. We implement this by maintaining 16 bytes per stack frame: 8 bytes for the count that amortizes time sampling, and 8 bytes for the starting timestamp. To minimize overhead, we need to avoid spilling and filling of registers around this sequence, so we push this cost into the time sampling function 'arch_bpf_timed_may_goto'. This is a JIT-specific wrapper around bpf_check_timed_may_goto which returns the count to store into the stack through BPF_REG_AX. All caller-saved registers (r0-r5) are guaranteed to remain untouched.

The loop can be broken by returning a count of 0; otherwise we dispatch into the function when the count drops to 0, and the runtime chooses either to refresh it (by returning BPF_MAX_TIMED_LOOPS) or to return 0 and abort the loop on the next iteration. Since the check for 0 is done right after loading the count from the stack, all subsequent cond_break sequences should immediately break as well, whether of the same loop or of subsequent loops in the program.

We pass in the stack_depth of the count (and thus of the timestamp, by adding 8 to it) to the arch_bpf_timed_may_goto call so that it can be passed on to bpf_check_timed_may_goto as an argument after r1 is saved, by adding the offset to r10/fp. This adjustment is arch specific, and the next patch introduces support for x86.

Note that depending on loop complexity, time spent in the loop can exceed the current limit (250 ms), but imposing an upper bound on program runtime is an orthogonal problem which will be addressed when program cancellations are supported. The time currently afforded by cond_break may not be enough for cases where BPF programs want to implement locking algorithms inline and use cond_break as a promise to the verifier that they will eventually terminate.

Below are some benchmarking numbers on the time taken per iteration for an empty loop that counts the number of iterations until cond_break fires. For comparison, we compare it against bpf_for/bpf_repeat, which is another way to achieve the same number of spins (BPF_MAX_LOOPS). The hardware used for benchmarking was a Sapphire Rapids Intel server with the performance governor enabled; mitigations were enabled.

+-----------------------------+--------------+--------------+------------------+
| Loop type                   | Iterations   | Time (ms)    | Time/iter (ns)   |
+-----------------------------+--------------+--------------+------------------+
| may_goto                    | 8388608      | 3            | 0.36             |
| timed_may_goto (count=65535)| 589674932    | 250          | 0.42             |
| bpf_for                     | 8388608      | 10           | 1.19             |
+-----------------------------+--------------+--------------+------------------+

This gives a good approximation at low overhead while staying close to the current implementation.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250304003239.2390751-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Gregory Bell <grbell@redhat.com>
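As a concrete illustration of the cond_break usage being benchmarked, a minimal BPF program in the style of the kernel selftests might look as follows. This is a sketch, not part of the commit; it assumes the cond_break macro and bpf_experimental.h header from tools/testing/selftests/bpf.

/* Empty loop that spins until cond_break (may_goto) fires. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

u64 iters;

SEC("syscall")
int count_until_break(void *ctx)
{
	for (;;) {
		cond_break;	/* expands to a may_goto check; breaks once the budget is spent */
		iters++;
	}
	return 0;
}

char _license[] SEC("license") = "GPL";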
1 parent: 95082e6

4 files changed (+96, -8 lines)

include/linux/bpf.h

Lines changed: 1 addition & 0 deletions

@@ -2002,6 +2002,7 @@ struct bpf_array {
  */
 enum {
 	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+	BPF_MAX_TIMED_LOOPS = 0xffff,
 };
 
 #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \
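BPF_MAX_TIMED_LOOPS = 0xffff is the per-refresh iteration budget: the clock is read only once every 65535 cond_break checks. A quick back-of-envelope check against the benchmark table in the commit message (a standalone sketch, not part of the patch):

#include <stdio.h>

int main(void)
{
	double iters = 589674932, ms = 250;

	/* ~0.42 ns per iteration, matching the table */
	printf("%.2f ns/iter\n", ms * 1e6 / iters);
	/* ~9000 clock samples over the whole 250 ms slice */
	printf("%.0f refreshes of the 0xffff count\n", iters / 65535);
	return 0;
}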

include/linux/filter.h

Lines changed: 8 additions & 0 deletions

@@ -671,6 +671,11 @@ struct bpf_prog_stats {
 	struct u64_stats_sync syncp;
 } __aligned(2 * sizeof(u64));
 
+struct bpf_timed_may_goto {
+	u64 count;
+	u64 timestamp;
+};
+
 struct sk_filter {
 	refcount_t refcnt;
 	struct rcu_head rcu;
@@ -1132,8 +1137,11 @@ bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
 bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
 bool bpf_jit_supports_private_stack(void);
+bool bpf_jit_supports_timed_may_goto(void);
 u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
+u64 arch_bpf_timed_may_goto(void);
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *);
 bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)

kernel/bpf/core.c

Lines changed: 26 additions & 0 deletions

@@ -3129,6 +3129,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp,
 {
 }
 
+bool __weak bpf_jit_supports_timed_may_goto(void)
+{
+	return false;
+}
+
+u64 __weak arch_bpf_timed_may_goto(void)
+{
+	return 0;
+}
+
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
+{
+	u64 time = ktime_get_mono_fast_ns();
+
+	/* Populate the timestamp for this stack frame, and refresh count. */
+	if (!p->timestamp) {
+		p->timestamp = time;
+		return BPF_MAX_TIMED_LOOPS;
+	}
+	/* Check if we've exhausted our time slice, and zero count. */
+	if (time - p->timestamp >= (NSEC_PER_SEC / 4))
+		return 0;
+	/* Refresh the count for the stack frame. */
+	return BPF_MAX_TIMED_LOOPS;
+}
+
 /* for configs without MMU or 32-bit */
 __weak const struct bpf_map_ops arena_map_ops;
 __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
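The helper samples ktime_get_mono_fast_ns(), the NMI-safe monotonic clock, so it is safe from any BPF execution context. Its policy is also self-contained enough to exercise in userspace; below is a minimal mirror of bpf_check_timed_may_goto() with clock_gettime() swapped in for the kernel clock (illustrative only, not kernel code).

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define BPF_MAX_TIMED_LOOPS 0xffffULL
#define NSEC_PER_SEC 1000000000ULL

struct timed_may_goto { uint64_t count, timestamp; };

static uint64_t mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

/* Same shape as bpf_check_timed_may_goto(): stamp the frame lazily,
 * refresh the count until 250 ms have elapsed, then return 0. */
static uint64_t check(struct timed_may_goto *p)
{
	uint64_t now = mono_ns();

	if (!p->timestamp) {
		p->timestamp = now;
		return BPF_MAX_TIMED_LOOPS;
	}
	if (now - p->timestamp >= NSEC_PER_SEC / 4)
		return 0;
	return BPF_MAX_TIMED_LOOPS;
}

int main(void)
{
	struct timed_may_goto s = {};
	uint64_t refreshes = 0;

	while (check(&s))	/* each call models one count expiry */
		refreshes++;
	printf("%llu refreshes before timeout\n",
	       (unsigned long long)refreshes);
	return 0;
}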

kernel/bpf/verifier.c

Lines changed: 61 additions & 8 deletions

@@ -21600,7 +21600,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
-		if (is_may_goto_insn(insn)) {
+		if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+			int stack_off_cnt = -stack_depth - 16;
+
+			/*
+			 * Two 8 byte slots, depth-16 stores the count, and
+			 * depth-8 stores the start timestamp of the loop.
+			 *
+			 * The starting value of count is BPF_MAX_TIMED_LOOPS
+			 * (0xffff). Every iteration loads it and subs it by 1,
+			 * until the value becomes 0 in AX (thus, 1 in stack),
+			 * after which we call arch_bpf_timed_may_goto, which
+			 * either sets AX to 0xffff to keep looping, or to 0
+			 * upon timeout. AX is then stored into the stack. In
+			 * the next iteration, we either see 0 and break out, or
+			 * continue iterating until the next time value is 0
+			 * after subtraction, rinse and repeat.
+			 */
+			stack_depth_extra = 16;
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+			if (insn->off >= 0)
+				insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+			else
+				insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+			insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+			insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+			/*
+			 * AX is used as an argument to pass in stack_off_cnt
+			 * (to add to r10/fp), and also as the return value of
+			 * the call to arch_bpf_timed_may_goto.
+			 */
+			insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+			insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+			insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+			cnt = 7;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto next_insn;
+		} else if (is_may_goto_insn(insn)) {
 			int stack_off = -stack_depth - 8;
 
 			stack_depth_extra = 8;
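Rendered as C, the seven patched instructions behave as below. This is an illustrative sketch with a hypothetical helper name; count_slot stands in for r10 + stack_off_cnt, which the real sequence passes to arch_bpf_timed_may_goto via BPF_REG_AX.

#include <linux/filter.h>

/* One timed may_goto check; returns true when the loop should break. */
static bool timed_may_goto_check(u64 *count_slot)
{
	u64 ax = *count_slot;			/* insn_buf[0]: load count */

	if (ax == 0)				/* insn_buf[1]: 0 is sticky, break */
		return true;
	ax -= 1;				/* insn_buf[2] */
	if (ax == 0)				/* insn_buf[3..5]: count expired, ask the */
		ax = arch_bpf_timed_may_goto();	/* runtime: 0xffff (refresh) or 0 (abort) */
	*count_slot = ax;			/* insn_buf[6]: store back */
	return false;
}

Note that a 0 returned by arch_bpf_timed_may_goto() only takes effect on the next check, which is why the commit message says the loop aborts on the next iteration.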
@@ -22141,23 +22184,33 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 
 	env->prog->aux->stack_depth = subprogs[0].stack_depth;
 	for (i = 0; i < env->subprog_cnt; i++) {
+		int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
 		int subprog_start = subprogs[i].start;
 		int stack_slots = subprogs[i].stack_extra / 8;
+		int slots = delta, cnt = 0;
 
 		if (!stack_slots)
 			continue;
-		if (stack_slots > 1) {
+		/* We need two slots in case timed may_goto is supported. */
+		if (stack_slots > slots) {
 			verbose(env, "verifier bug: stack_slots supports may_goto only\n");
 			return -EFAULT;
 		}
 
-		/* Add ST insn to subprog prologue to init extra stack */
-		insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
-					 -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+		stack_depth = subprogs[i].stack_depth;
+		if (bpf_jit_supports_timed_may_goto()) {
+			insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+						     BPF_MAX_TIMED_LOOPS);
+			insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+		} else {
+			/* Add ST insn to subprog prologue to init extra stack */
+			insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+						     BPF_MAX_LOOPS);
+		}
 		/* Copy first actual insn to preserve it */
-		insn_buf[1] = env->prog->insnsi[subprog_start];
+		insn_buf[cnt++] = env->prog->insnsi[subprog_start];
 
-		new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+		new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
 		if (!new_prog)
 			return -ENOMEM;
 		env->prog = prog = new_prog;
@@ -22167,7 +22220,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		 * to insn after BPF_ST that inits may_goto count.
 		 * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
 		 */
-		WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));
+		WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
 	}
 
 	/* Since poke tab is now finalized, publish aux to tracker. */
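In struct terms, the two prologue stores initialize the slot pair declared as struct bpf_timed_may_goto in include/linux/filter.h above. A hypothetical helper mirroring them (illustrative; the verifier actually emits BPF_ST_MEM instructions):

#include <linux/bpf.h>
#include <linux/filter.h>

/* The count lives at fp - stack_depth, the timestamp 8 bytes above it;
 * the timestamp stays 0 until bpf_check_timed_may_goto() stamps it on
 * the first count expiry. */
static void init_timed_slots(struct bpf_timed_may_goto *slots)
{
	slots->count = BPF_MAX_TIMED_LOOPS;	/* BPF_ST_MEM at -stack_depth */
	slots->timestamp = 0;			/* BPF_ST_MEM at -stack_depth + 8 */
}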
