
[bpf-next,v3,4/7] bpf: Introduce support for bpf_local_irq_{save,restore}

Message ID 20241127165846.2001009-5-memxor@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series IRQ save/restore

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 8 maintainers not CCed: kpsingh@kernel.org jolsa@kernel.org song@kernel.org haoluo@google.com john.fastabend@gmail.com yonghong.song@linux.dev martin.lau@linux.dev sdf@fomichev.me
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 122 this patch: 124
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Please don't use multiple blank lines WARNING: 'unecessary' may be misspelled - perhaps 'unnecessary'? WARNING: 'unitialized' may be misspelled - perhaps 'uninitialized'? WARNING: 'unsinged' may be misspelled - perhaps 'unsigned'? WARNING: else is not generally useful after a break or return WARNING: line length of 100 exceeds 80 columns WARNING: line length of 106 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 11 this patch: 11
netdev/source_inline success Was 0 now: 0

Commit Message

Kumar Kartikeya Dwivedi Nov. 27, 2024, 4:58 p.m. UTC
Teach the verifier about IRQ-disabled sections through the introduction
of two new kfuncs: bpf_local_irq_save, to save the IRQ state and disable
IRQs, and bpf_local_irq_restore, to restore the IRQ state and enable
IRQs again.

For the purposes of tracking the saved IRQ state, the verifier is taught
about a new special object on the stack of type STACK_IRQ_FLAG. This is
an 8-byte value which stores the IRQ flags to be passed back to the IRQ
restore kfunc.

Renumber the REF_TYPE_* enums to simplify the check in find_lock_state;
filtering out non-lock types will become cumbersome as they grow, and is
unnecessary.

To track a dynamic number of IRQ-disabled regions and their associated
saved states, a new resource type REF_TYPE_IRQ is introduced, along with
its state management functions, acquire_irq_state and release_irq_state,
taking advantage of the refactoring and clean-ups made in earlier
commits.

One notable requirement of the kernel's IRQ save and restore API is that
the calls cannot happen out of order. To enforce this, when releasing a
reference we keep track of the prev_id we saw with REF_TYPE_IRQ. Since
reference states are inserted in increasing order of their index, this is
used to remember the order in which IRQ saved states were acquired, so
that we maintain a logical stack of resource identities in acquisition
order and can enforce LIFO ordering when restoring IRQ state. The top of
the stack is maintained using bpf_verifier_state's active_irq_id.

The logic to detect initialized and uninitialized irq flag slots, and to
mark and unmark them, is similar to how it's done for iterators. No
additional checks are needed in refsafe for REF_TYPE_IRQ, apart from the
usual check_ids satisfiability check on refs[i].id. We have to perform
the same check_ids check on state->active_irq_id as well.

The kfuncs themselves are plain wrappers over local_irq_save and
local_irq_restore macros.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 include/linux/bpf_verifier.h |   8 +-
 kernel/bpf/helpers.c         |  17 +++
 kernel/bpf/log.c             |   1 +
 kernel/bpf/verifier.c        | 279 ++++++++++++++++++++++++++++++++++-
 4 files changed, 302 insertions(+), 3 deletions(-)
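
For illustration only (not part of this patch; the series adds its own selftests later), a minimal sketch of how a BPF program could use the two kfuncs. The includes, the __ksym extern declarations, the section name and the program shape are assumptions, not taken from the series:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Hypothetical declarations for the kfuncs added by this patch; actual
 * programs would typically get them from vmlinux.h or a shared header.
 */
extern void bpf_local_irq_save(unsigned long *flags__irq_flag) __ksym;
extern void bpf_local_irq_restore(unsigned long *flags__irq_flag) __ksym;

char LICENSE[] SEC("license") = "GPL";

SEC("tc")
int irq_save_example(struct __sk_buff *ctx)
{
	unsigned long flags;           /* 8-byte stack slot, tracked as STACK_IRQ_FLAG */

	bpf_local_irq_save(&flags);    /* acquires a REF_TYPE_IRQ reference */
	/* IRQ-disabled region: sleepable helpers/kfuncs and global function
	 * calls are rejected by the verifier here.
	 */
	bpf_local_irq_restore(&flags); /* releases it; must happen before program exit */
	return 0;
}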

Comments

Eduard Zingerman Nov. 28, 2024, 4:31 a.m. UTC | #1
On Wed, 2024-11-27 at 08:58 -0800, Kumar Kartikeya Dwivedi wrote:
> [...]

Sorry, two more nits below.

[...]

> +static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> +{
> +	struct bpf_func_state *state = func(env, reg);
> +	struct bpf_stack_state *slot;
> +	struct bpf_reg_state *st;
> +	int spi, i, err;
> +
> +	spi = irq_flag_get_spi(env, reg);
> +	if (spi < 0)
> +		return spi;
> +
> +	slot = &state->stack[spi];
> +	st = &slot->spilled_ptr;
> +
> +	err = release_irq_state(env->cur_state, st->ref_obj_id);
> +	WARN_ON_ONCE(err && err != -EACCES);
> +	if (err) {
> +		verbose(env, "cannot restore irq state out of order\n");

Nit: maybe also print acquire_irq_id and an instruction where it was acquired?

> +		return err;
> +	}
> +
> +	__mark_reg_not_init(env, st);
> +
> +	/* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
> +	st->live |= REG_LIVE_WRITTEN;
> +
> +	for (i = 0; i < BPF_REG_SIZE; i++)
> +		slot->slot_type[i] = STACK_INVALID;
> +
> +	mark_stack_slot_scratched(env, spi);
> +	return 0;
> +}
> +
> +static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> +{
> +	struct bpf_func_state *state = func(env, reg);
> +	struct bpf_stack_state *slot;
> +	int spi, i;
> +
> +	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
> +	 * will do check_mem_access to check and update stack bounds later, so
> +	 * return true for that case.
> +	 */
> +	spi = irq_flag_get_spi(env, reg);
> +	if (spi == -ERANGE)
> +		return true;

Nit: is it possible to swap is_irq_flag_reg_valid_uninit() and
     check_mem_access(), so that ERANGE special case would be not needed?

> +	if (spi < 0)
> +		return false;
> +
> +	slot = &state->stack[spi];
> +
> +	for (i = 0; i < BPF_REG_SIZE; i++)
> +		if (slot->slot_type[i] == STACK_IRQ_FLAG)
> +			return false;
> +	return true;
> +}

[...]
Kumar Kartikeya Dwivedi Nov. 28, 2024, 4:39 a.m. UTC | #2
On Thu, 28 Nov 2024 at 05:31, Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Wed, 2024-11-27 at 08:58 -0800, Kumar Kartikeya Dwivedi wrote:
> > [...]
>
> Sorry, two more nits below.
>
> [...]
>
> > +static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> > +{
> > +     struct bpf_func_state *state = func(env, reg);
> > +     struct bpf_stack_state *slot;
> > +     struct bpf_reg_state *st;
> > +     int spi, i, err;
> > +
> > +     spi = irq_flag_get_spi(env, reg);
> > +     if (spi < 0)
> > +             return spi;
> > +
> > +     slot = &state->stack[spi];
> > +     st = &slot->spilled_ptr;
> > +
> > +     err = release_irq_state(env->cur_state, st->ref_obj_id);
> > +     WARN_ON_ONCE(err && err != -EACCES);
> > +     if (err) {
> > +             verbose(env, "cannot restore irq state out of order\n");
>
> Nit: maybe also print acquire_irq_id and an instruction where it was acquired?

Ack. For printing the insn_idx, I guess just search in the refs array?

>
> > +             return err;
> > +     }
> > +
> > +     __mark_reg_not_init(env, st);
> > +
> > +     /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
> > +     st->live |= REG_LIVE_WRITTEN;
> > +
> > +     for (i = 0; i < BPF_REG_SIZE; i++)
> > +             slot->slot_type[i] = STACK_INVALID;
> > +
> > +     mark_stack_slot_scratched(env, spi);
> > +     return 0;
> > +}
> > +
> > +static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> > +{
> > +     struct bpf_func_state *state = func(env, reg);
> > +     struct bpf_stack_state *slot;
> > +     int spi, i;
> > +
> > +     /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
> > +      * will do check_mem_access to check and update stack bounds later, so
> > +      * return true for that case.
> > +      */
> > +     spi = irq_flag_get_spi(env, reg);
> > +     if (spi == -ERANGE)
> > +             return true;
>
> Nit: is it possible to swap is_irq_flag_reg_valid_uninit() and
>      check_mem_access(), so that ERANGE special case would be not needed?
>

I don't think so. For dynptr, iter, irq, ERANGE indicates stack needs
to be grown, so check_mem_access will naturally do that when writing.
When not ERANGE, we need to catch cases where we have a bad slot_type.
If we overwrote it with check_mem_access, then it would scrub the slot
type as well.

When I fixed this stuff for dynptr, we had to additionally
destroy_if_dynptr_stack_slot because it wasn't required to 'release' a
dynptr when overwriting it.
Andrii made sure this was necessary for iters so now slot_type ==
STACK_ITER is just rejected instead of overwrite without a destroy
operation.
Similar idea is followed for irq flag.

Just paging in context for all this, but I may be missing if you have
something in mind.

> > +     if (spi < 0)
> > +             return false;
> > +
> > +     slot = &state->stack[spi];
> > +
> > +     for (i = 0; i < BPF_REG_SIZE; i++)
> > +             if (slot->slot_type[i] == STACK_IRQ_FLAG)
> > +                     return false;
> > +     return true;
> > +}
>
> [...]
>
Eduard Zingerman Nov. 28, 2024, 7:26 a.m. UTC | #3
On Thu, 2024-11-28 at 05:39 +0100, Kumar Kartikeya Dwivedi wrote:

[...]

> > > +static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> > > +{
> > > +     struct bpf_func_state *state = func(env, reg);
> > > +     struct bpf_stack_state *slot;
> > > +     int spi, i;
> > > +
> > > +     /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
> > > +      * will do check_mem_access to check and update stack bounds later, so
> > > +      * return true for that case.
> > > +      */
> > > +     spi = irq_flag_get_spi(env, reg);
> > > +     if (spi == -ERANGE)
> > > +             return true;
> > 
> > Nit: is it possible to swap is_irq_flag_reg_valid_uninit() and
> >      check_mem_access(), so that ERANGE special case would be not needed?
> > 
> 
> I don't think so. For dynptr, iter, irq, ERANGE indicates stack needs
> to be grown, so check_mem_access will naturally do that when writing.
> When not ERANGE, we need to catch cases where we have a bad slot_type.
> If we overwrote it with check_mem_access, then it would scrub the slot
> type as well.
> 
> When I fixed this stuff for dynptr, we had to additionally
> destroy_if_dynptr_stack_slot because it wasn't required to 'release' a
> dynptr when overwriting it.
> Andrii made sure this was necessary for iters so now slot_type ==
> STACK_ITER is just rejected instead of overwrite without a destroy
> operation.
> Similar idea is followed for irq flag.
> 
> Just paging in context for all this, but I may be missing if you have
> something in mind.

I see, makes sense. And is_dynptr_reg_valid_uninit() has the same check.
Thank you for explaining.

> > > +     if (spi < 0)
> > > +             return false;
> > > +
> > > +     slot = &state->stack[spi];
> > > +
> > > +     for (i = 0; i < BPF_REG_SIZE; i++)
> > > +             if (slot->slot_type[i] == STACK_IRQ_FLAG)
> > > +                     return false;
> > > +     return true;
> > > +}
> > 
> > [...]
> >
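
To make the LIFO ordering requirement discussed in this thread concrete, a hypothetical sketch (reusing the assumed declarations from the earlier example, and not taken from the series' selftests) of a program the verifier would reject: the third call hits release_irq_state() returning -EACCES, producing the "cannot restore irq state out of order" error.

SEC("tc")
int irq_restore_out_of_order(struct __sk_buff *ctx)
{
	unsigned long f1, f2;

	bpf_local_irq_save(&f1);    /* active_irq_id now tracks f1's id            */
	bpf_local_irq_save(&f2);    /* f2's id becomes the new top of the stack    */
	bpf_local_irq_restore(&f1); /* f1's id != active_irq_id: rejected          */
	bpf_local_irq_restore(&f2);
	return 0;
}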

Patch

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index af64b5415df8..3da7ae6c7bba 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -233,6 +233,7 @@  enum bpf_stack_slot_type {
 	 */
 	STACK_DYNPTR,
 	STACK_ITER,
+	STACK_IRQ_FLAG,
 };
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
@@ -254,8 +255,10 @@  struct bpf_reference_state {
 	 * default to pointer reference on zero initialization of a state.
 	 */
 	enum ref_state_type {
-		REF_TYPE_PTR = 0,
-		REF_TYPE_LOCK,
+		REF_TYPE_PTR	= 1,
+		REF_TYPE_IRQ	= 2,
+
+		REF_TYPE_LOCK	= 3,
 	} type;
 	/* Track each reference created with a unique id, even if the same
 	 * instruction creates the reference multiple times (eg, via CALL).
@@ -420,6 +423,7 @@  struct bpf_verifier_state {
 	u32 acquired_refs;
 	u32 active_locks;
 	u32 active_preempt_locks;
+	u32 active_irq_id;
 	bool active_rcu_lock;
 
 	bool speculative;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 751c150f9e1c..532ea74d4850 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3057,6 +3057,21 @@  __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
 	return ret + 1;
 }
 
+/* Keep unsigned long in prototype so that kfunc is usable when emitted to
+ * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
+ * unsigned long always points to 8-byte region on stack, the kernel may only
+ * read and write the 4-bytes on 32-bit.
+ */
+__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
+{
+	local_irq_save(*flags__irq_flag);
+}
+
+__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
+{
+	local_irq_restore(*flags__irq_flag);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3149,6 +3164,8 @@  BTF_ID_FLAGS(func, bpf_get_kmem_cache)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_local_irq_save)
+BTF_ID_FLAGS(func, bpf_local_irq_restore)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 8b52e5b7504c..434fc320ba1d 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -537,6 +537,7 @@  static char slot_type_char[] = {
 	[STACK_ZERO]	= '0',
 	[STACK_DYNPTR]	= 'd',
 	[STACK_ITER]	= 'i',
+	[STACK_IRQ_FLAG] = 'f'
 };
 
 static void print_liveness(struct bpf_verifier_env *env,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index be2365a9794a..c6b40da49835 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -661,6 +661,11 @@  static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 	return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
 }
 
+static int irq_flag_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	return stack_slot_obj_get_spi(env, reg, "irq_flag", 1);
+}
+
 static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 {
 	switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
@@ -1156,10 +1161,126 @@  static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s
 	return 0;
 }
 
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_irq_state(struct bpf_verifier_state *state, int id);
+
+static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
+				     struct bpf_kfunc_call_arg_meta *meta,
+				     struct bpf_reg_state *reg, int insn_idx)
+{
+	struct bpf_func_state *state = func(env, reg);
+	struct bpf_stack_state *slot;
+	struct bpf_reg_state *st;
+	int spi, i, id;
+
+	spi = irq_flag_get_spi(env, reg);
+	if (spi < 0)
+		return spi;
+
+	id = acquire_irq_state(env, insn_idx);
+	if (id < 0)
+		return id;
+
+	slot = &state->stack[spi];
+	st = &slot->spilled_ptr;
+
+	__mark_reg_known_zero(st);
+	st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+	st->live |= REG_LIVE_WRITTEN;
+	st->ref_obj_id = id;
+
+	for (i = 0; i < BPF_REG_SIZE; i++)
+		slot->slot_type[i] = STACK_IRQ_FLAG;
+
+	mark_stack_slot_scratched(env, spi);
+	return 0;
+}
+
+static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	struct bpf_stack_state *slot;
+	struct bpf_reg_state *st;
+	int spi, i, err;
+
+	spi = irq_flag_get_spi(env, reg);
+	if (spi < 0)
+		return spi;
+
+	slot = &state->stack[spi];
+	st = &slot->spilled_ptr;
+
+	err = release_irq_state(env->cur_state, st->ref_obj_id);
+	WARN_ON_ONCE(err && err != -EACCES);
+	if (err) {
+		verbose(env, "cannot restore irq state out of order\n");
+		return err;
+	}
+
+	__mark_reg_not_init(env, st);
+
+	/* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+	st->live |= REG_LIVE_WRITTEN;
+
+	for (i = 0; i < BPF_REG_SIZE; i++)
+		slot->slot_type[i] = STACK_INVALID;
+
+	mark_stack_slot_scratched(env, spi);
+	return 0;
+}
+
+static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	struct bpf_stack_state *slot;
+	int spi, i;
+
+	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+	 * will do check_mem_access to check and update stack bounds later, so
+	 * return true for that case.
+	 */
+	spi = irq_flag_get_spi(env, reg);
+	if (spi == -ERANGE)
+		return true;
+	if (spi < 0)
+		return false;
+
+	slot = &state->stack[spi];
+
+	for (i = 0; i < BPF_REG_SIZE; i++)
+		if (slot->slot_type[i] == STACK_IRQ_FLAG)
+			return false;
+	return true;
+}
+
+static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	struct bpf_stack_state *slot;
+	struct bpf_reg_state *st;
+	int spi, i;
+
+	spi = irq_flag_get_spi(env, reg);
+	if (spi < 0)
+		return -EINVAL;
+
+	slot = &state->stack[spi];
+	st = &slot->spilled_ptr;
+
+	if (!st->ref_obj_id)
+		return -EINVAL;
+
+	for (i = 0; i < BPF_REG_SIZE; i++)
+		if (slot->slot_type[i] != STACK_IRQ_FLAG)
+			return -EINVAL;
+	return 0;
+}
+
 /* Check if given stack slot is "special":
  *   - spilled register state (STACK_SPILL);
  *   - dynptr state (STACK_DYNPTR);
  *   - iter state (STACK_ITER).
+ *   - irq flag state (STACK_IRQ_FLAG)
  */
 static bool is_stack_slot_special(const struct bpf_stack_state *stack)
 {
@@ -1169,6 +1290,7 @@  static bool is_stack_slot_special(const struct bpf_stack_state *stack)
 	case STACK_SPILL:
 	case STACK_DYNPTR:
 	case STACK_ITER:
+	case STACK_IRQ_FLAG:
 		return true;
 	case STACK_INVALID:
 	case STACK_MISC:
@@ -1291,6 +1413,7 @@  static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf
 	dst->active_locks = src->active_locks;
 	dst->active_preempt_locks = src->active_preempt_locks;
 	dst->active_rcu_lock = src->active_rcu_lock;
+	dst->active_irq_id = src->active_irq_id;
 	return 0;
 }
 
@@ -1392,6 +1515,20 @@  static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum r
 	return 0;
 }
 
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx)
+{
+	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_reference_state *s;
+
+	s = acquire_reference_state(env, insn_idx, true);
+	if (!s)
+		return -ENOMEM;
+	s->type = REF_TYPE_IRQ;
+
+	state->active_irq_id = s->id;
+	return s->id;
+}
+
 static void release_reference_state(struct bpf_verifier_state *state, int idx)
 {
 	int last_idx;
@@ -1420,6 +1557,28 @@  static int release_lock_state(struct bpf_verifier_state *state, int type, int id
 	return -EINVAL;
 }
 
+static int release_irq_state(struct bpf_verifier_state *state, int id)
+{
+	u32 prev_id = 0;
+	int i;
+
+	if (id != state->active_irq_id)
+		return -EACCES;
+
+	for (i = 0; i < state->acquired_refs; i++) {
+		if (state->refs[i].type != REF_TYPE_IRQ)
+			continue;
+		if (state->refs[i].id == id) {
+			release_reference_state(state, i);
+			state->active_irq_id = prev_id;
+			return 0;
+		} else {
+			prev_id = state->refs[i].id;
+		}
+	}
+	return -EINVAL;
+}
+
 static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *state, enum ref_state_type type,
 						   int id, void *ptr)
 {
@@ -1428,7 +1587,7 @@  static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st
 	for (i = 0; i < state->acquired_refs; i++) {
 		struct bpf_reference_state *s = &state->refs[i];
 
-		if (s->type == REF_TYPE_PTR || s->type != type)
+		if (s->type != type)
 			continue;
 
 		if (s->id == id && s->ptr == ptr)
@@ -3236,6 +3395,16 @@  static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *re
 	return mark_stack_slot_obj_read(env, reg, spi, nr_slots);
 }
 
+static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	int spi;
+
+	spi = irq_flag_get_spi(env, reg);
+	if (spi < 0)
+		return spi;
+	return mark_stack_slot_obj_read(env, reg, spi, 1);
+}
+
 /* This function is supposed to be used by the following 32-bit optimization
  * code only. It returns TRUE if the source or destination register operates
  * on 64-bit, otherwise return FALSE.
@@ -10012,6 +10181,12 @@  static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			return -EINVAL;
 		}
 
+		if (env->cur_state->active_irq_id) {
+			verbose(env, "global function calls are not allowed with IRQs disabled,\n"
+				     "use static function instead\n");
+			return -EINVAL;
+		}
+
 		if (err) {
 			verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
 				subprog, sub_name);
@@ -10536,6 +10711,11 @@  static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit
 		return err;
 	}
 
+	if (check_lock && env->cur_state->active_irq_id) {
+		verbose(env, "%s cannot be used inside bpf_local_irq_save-ed region\n", prefix);
+		return -EINVAL;
+	}
+
 	if (check_lock && env->cur_state->active_rcu_lock) {
 		verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
 		return -EINVAL;
@@ -10740,6 +10920,17 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
 	}
 
+	if (env->cur_state->active_irq_id) {
+		if (fn->might_sleep) {
+			verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n",
+				func_id_name(func_id), func_id);
+			return -EINVAL;
+		}
+
+		if (in_sleepable(env) && is_storage_get_function(func_id))
+			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+	}
+
 	meta.func_id = func_id;
 	/* check args */
 	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -11301,6 +11492,11 @@  static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param
 	return btf_param_match_suffix(btf, arg, "__str");
 }
 
+static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param *arg)
+{
+	return btf_param_match_suffix(btf, arg, "__irq_flag");
+}
+
 static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
 					  const struct btf_param *arg,
 					  const char *name)
@@ -11454,6 +11650,7 @@  enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_CONST_STR,
 	KF_ARG_PTR_TO_MAP,
 	KF_ARG_PTR_TO_WORKQUEUE,
+	KF_ARG_PTR_TO_IRQ_FLAG,
 };
 
 enum special_kfunc_type {
@@ -11485,6 +11682,8 @@  enum special_kfunc_type {
 	KF_bpf_iter_css_task_new,
 	KF_bpf_session_cookie,
 	KF_bpf_get_kmem_cache,
+	KF_bpf_local_irq_save,
+	KF_bpf_local_irq_restore,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -11551,6 +11750,8 @@  BTF_ID(func, bpf_session_cookie)
 BTF_ID_UNUSED
 #endif
 BTF_ID(func, bpf_get_kmem_cache)
+BTF_ID(func, bpf_local_irq_save)
+BTF_ID(func, bpf_local_irq_restore)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -11641,6 +11842,9 @@  get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_wq(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_WORKQUEUE;
 
+	if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
+		return KF_ARG_PTR_TO_IRQ_FLAG;
+
 	if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
 		if (!btf_type_is_struct(ref_t)) {
 			verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -11744,6 +11948,54 @@  static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int process_irq_flag(struct bpf_verifier_env *env, int regno,
+			     struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	bool irq_save;
+	int err;
+
+	if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) {
+		irq_save = true;
+	} else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) {
+		irq_save = false;
+	} else {
+		verbose(env, "verifier internal error: unknown irq flags kfunc\n");
+		return -EFAULT;
+	}
+
+	if (irq_save) {
+		if (!is_irq_flag_reg_valid_uninit(env, reg)) {
+			verbose(env, "expected uninitialized irq flag as arg#%d\n", regno);
+			return -EINVAL;
+		}
+
+		err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false);
+		if (err)
+			return err;
+
+		err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx);
+		if (err)
+			return err;
+	} else {
+		err = is_irq_flag_reg_valid_init(env, reg);
+		if (err) {
+			verbose(env, "expected an initialized irq flag as arg#%d\n", regno);
+			return err;
+		}
+
+		err = mark_irq_flag_read(env, reg);
+		if (err)
+			return err;
+
+		err = unmark_stack_slot_irq_flag(env, reg);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+
 static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct btf_record *rec = reg_btf_record(reg);
@@ -12332,6 +12584,7 @@  static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
 		case KF_ARG_PTR_TO_CONST_STR:
 		case KF_ARG_PTR_TO_WORKQUEUE:
+		case KF_ARG_PTR_TO_IRQ_FLAG:
 			break;
 		default:
 			WARN_ON_ONCE(1);
@@ -12626,6 +12879,15 @@  static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (ret < 0)
 				return ret;
 			break;
+		case KF_ARG_PTR_TO_IRQ_FLAG:
+			if (reg->type != PTR_TO_STACK) {
+				verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
+				return -EINVAL;
+			}
+			ret = process_irq_flag(env, regno, meta);
+			if (ret < 0)
+				return ret;
+			break;
 		}
 	}
 
@@ -12806,6 +13068,11 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EINVAL;
 	}
 
+	if (env->cur_state->active_irq_id && sleepable) {
+		verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name);
+		return -EACCES;
+	}
+
 	/* In case of release function, we get register number of refcounted
 	 * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
 	 */
@@ -17739,6 +18006,12 @@  static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
 				return false;
 			break;
+		case STACK_IRQ_FLAG:
+			old_reg = &old->stack[spi].spilled_ptr;
+			cur_reg = &cur->stack[spi].spilled_ptr;
+			if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+				return false;
+			break;
 		case STACK_MISC:
 		case STACK_ZERO:
 		case STACK_INVALID:
@@ -17768,12 +18041,16 @@  static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c
 	if (old->active_rcu_lock != cur->active_rcu_lock)
 		return false;
 
+	if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
+		return false;
+
 	for (i = 0; i < old->acquired_refs; i++) {
 		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
 		    old->refs[i].type != cur->refs[i].type)
 			return false;
 		switch (old->refs[i].type) {
 		case REF_TYPE_PTR:
+		case REF_TYPE_IRQ:
 			break;
 		case REF_TYPE_LOCK:
 			if (old->refs[i].ptr != cur->refs[i].ptr)