[bpf-next,1/3] bpf: Call free_htab_elem() after htab_unlock_bucket()

Message ID 20241106063542.357743-2-houtao@huaweicloud.com (mailing list archive)
State Accepted
Commit 81eef03cbf70972f90b63525ae51617c812e2953
Delegated to: BPF
Headers show
Series Fix lockdep warning for htab of map

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next-0
bpf/vmtest-bpf-net-PR success PR summary
bpf/vmtest-bpf-net-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-net-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-net-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-net-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-net-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-net-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-net-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-net-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-net-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-net-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-net-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-net-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-net-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-net-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-net-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-net-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-net-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-net-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-net-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-net-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-net-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-net-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-net-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-net-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-net-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-net-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-net-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-net-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-net-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-net-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-net-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-net-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-net-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-net-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-net-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-net-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-net-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-net-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-net-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-net-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-net-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-net-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17

Commit Message

Hou Tao Nov. 6, 2024, 6:35 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

For htab of maps, when an inner map is removed from the htab, the htab
may hold the last reference to that map. bpf_map_fd_put_ptr() will then
invoke bpf_map_free_id() to free the id of the removed inner map.
However, bpf_map_fd_put_ptr() is invoked while holding a bucket lock
(raw_spinlock_t), and bpf_map_free_id() attempts to acquire map_idr_lock
(spinlock_t), triggering the following lockdep warning:

  =============================
  [ BUG: Invalid wait context ]
  6.11.0-rc4+ #49 Not tainted
  -----------------------------
  test_maps/4881 is trying to lock:
  ffffffff84884578 (map_idr_lock){+...}-{3:3}, at: bpf_map_free_id.part.0+0x21/0x70
  other info that might help us debug this:
  context-{5:5}
  2 locks held by test_maps/4881:
   #0: ffffffff846caf60 (rcu_read_lock){....}-{1:3}, at: bpf_fd_htab_map_update_elem+0xf9/0x270
   #1: ffff888149ced148 (&htab->lockdep_key#2){....}-{2:2}, at: htab_map_update_elem+0x178/0xa80
  stack backtrace:
  CPU: 0 UID: 0 PID: 4881 Comm: test_maps Not tainted 6.11.0-rc4+ #49
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
  Call Trace:
   <TASK>
   dump_stack_lvl+0x6e/0xb0
   dump_stack+0x10/0x20
   __lock_acquire+0x73e/0x36c0
   lock_acquire+0x182/0x450
   _raw_spin_lock_irqsave+0x43/0x70
   bpf_map_free_id.part.0+0x21/0x70
   bpf_map_put+0xcf/0x110
   bpf_map_fd_put_ptr+0x9a/0xb0
   free_htab_elem+0x69/0xe0
   htab_map_update_elem+0x50f/0xa80
   bpf_fd_htab_map_update_elem+0x131/0x270
   htab_map_update_elem+0x50f/0xa80
   bpf_fd_htab_map_update_elem+0x131/0x270
   bpf_map_update_value+0x266/0x380
   __sys_bpf+0x21bb/0x36b0
   __x64_sys_bpf+0x45/0x60
   x64_sys_call+0x1b2a/0x20d0
   do_syscall_64+0x5d/0x100
   entry_SYSCALL_64_after_hwframe+0x76/0x7e

One way to fix the lockdep warning would be to use raw_spinlock_t for
map_idr_lock as well. However, bpf_map_alloc_id() invokes
idr_alloc_cyclic() after acquiring map_idr_lock, which would trigger a
similar lockdep warning, because the slab's lock (s->cpu_slab->lock) is
still a spinlock_t.
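
To make the invalid nesting concrete, below is a minimal sketch (not
taken from the kernel; the lock names are made up and only mirror the
roles of the htab bucket lock and map_idr_lock) of what lockdep's
"invalid wait context" check rejects: acquiring a spinlock_t, which
becomes a sleeping lock on PREEMPT_RT, inside a raw_spinlock_t critical
section:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(bucket_lock);	/* role of the htab bucket lock */
static DEFINE_SPINLOCK(idr_lock);		/* role of map_idr_lock */

static void invalid_nesting(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&bucket_lock, flags);
	/* spinlock_t may sleep on PREEMPT_RT, so lockdep reports
	 * "BUG: Invalid wait context" for this acquisition
	 */
	spin_lock(&idr_lock);
	spin_unlock(&idr_lock);
	raw_spin_unlock_irqrestore(&bucket_lock, flags);
}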

Instead of changing map_idr_lock's type, fix the issue by invoking
htab_put_fd_value() after htab_unlock_bucket(). However, deferring only
the invocation of htab_put_fd_value() is not enough, because the old map
pointers in an htab of maps cannot be saved during batched deletion.
Therefore, also defer the invocation of free_htab_elem(), so the
to-be-freed elements can be linked together, just as the LRU map does.

There are four callers of ->map_fd_put_ptr():

(1) alloc_htab_elem() (through htab_put_fd_value())
It invokes ->map_fd_put_ptr() under a raw_spinlock_t. The invocation of
htab_put_fd_value() cannot simply be moved after htab_unlock_bucket(),
because the old element has already been stashed in htab->extra_elems.
It may be reused immediately after htab_unlock_bucket(), and invoking
htab_put_fd_value() after htab_unlock_bucket() could then release the
newly-added element incorrectly. Therefore, save the map pointer of the
old element for an htab of maps before unlocking the bucket and release
it after the unlock. Besides the map pointer, the special fields in the
old element need the same treatment.

(2) free_htab_elem() (through htab_put_fd_value())
Its callers include __htab_map_lookup_and_delete_elem(),
htab_map_delete_elem() and __htab_map_lookup_and_delete_batch().

For htab_map_delete_elem(), simply invoke free_htab_elem() after
htab_unlock_bucket(). For __htab_map_lookup_and_delete_batch(), just
like the LRU map, link the to-be-freed elements into the node_to_free
list and invoke free_htab_elem() for these elements after the unlock.
It is safe to reuse batch_flink as the link for node_to_free, because
these elements have already been removed from the hash llist.

Because htab of maps doesn't support the lookup_and_delete operation,
__htab_map_lookup_and_delete_elem() doesn't have this problem, so it is
kept as-is.

(3) fd_htab_map_free()
It invokes ->map_fd_put_ptr() without holding a raw_spinlock_t.

(4) bpf_fd_htab_map_update_elem()
It invokes ->map_fd_put_ptr() without holding a raw_spinlock_t.

After moving free_htab_elem() outside the htab bucket lock scope, use
pcpu_freelist_push() instead of __pcpu_freelist_push() to disable irqs
before pushing freed elements back onto the free list, and protect the
invocations of bpf_mem_cache_free() with a migrate_{disable|enable}
pair.

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/hashtab.c | 56 ++++++++++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 17 deletions(-)

Comments

Hou Tao Nov. 6, 2024, 9:53 a.m. UTC | #1
On 11/6/2024 2:35 PM, Hou Tao wrote:
> From: Hou Tao <houtao1@huawei.com>
>
> For htab of maps, when the map is removed from the htab, it may hold the
> last reference of the map. bpf_map_fd_put_ptr() will invoke
> bpf_map_free_id() to free the id of the removed map element. However,
> bpf_map_fd_put_ptr() is invoked while holding a bucket lock
> (raw_spin_lock_t), and bpf_map_free_id() attempts to acquire map_idr_lock
> (spinlock_t), triggering the following lockdep warning:
>
>   =============================
>   [ BUG: Invalid wait context ]
>   6.11.0-rc4+ #49 Not tainted
>   -----------------------------
>   test_maps/4881 is trying to lock:
>   ffffffff84884578 (map_idr_lock){+...}-{3:3}, at: bpf_map_free_id.part.0+0x21/0x70
>   other info that might help us debug this:
>   context-{5:5}
>   2 locks held by test_maps/4881:
>    #0: ffffffff846caf60 (rcu_read_lock){....}-{1:3}, at: bpf_fd_htab_map_update_elem+0xf9/0x270
>    #1: ffff888149ced148 (&htab->lockdep_key#2){....}-{2:2}, at: htab_map_update_elem+0x178/0xa80
>   stack backtrace:
>   CPU: 0 UID: 0 PID: 4881 Comm: test_maps Not tainted 6.11.0-rc4+ #49
>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
>   Call Trace:
>    <TASK>
>    dump_stack_lvl+0x6e/0xb0
>    dump_stack+0x10/0x20
>    __lock_acquire+0x73e/0x36c0
>    lock_acquire+0x182/0x450
>    _raw_spin_lock_irqsave+0x43/0x70
>    bpf_map_free_id.part.0+0x21/0x70
>    bpf_map_put+0xcf/0x110
>    bpf_map_fd_put_ptr+0x9a/0xb0
>    free_htab_elem+0x69/0xe0
>    htab_map_update_elem+0x50f/0xa80
>    bpf_fd_htab_map_update_elem+0x131/0x270
>    htab_map_update_elem+0x50f/0xa80
>    bpf_fd_htab_map_update_elem+0x131/0x270
>    bpf_map_update_value+0x266/0x380
>    __sys_bpf+0x21bb/0x36b0
>    __x64_sys_bpf+0x45/0x60
>    x64_sys_call+0x1b2a/0x20d0
>    do_syscall_64+0x5d/0x100
>    entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> One way to fix the lockdep warning is using raw_spinlock_t for
> map_idr_lock as well. However, bpf_map_alloc_id() invokes
> idr_alloc_cyclic() after acquiring map_idr_lock, it will trigger a
> similar lockdep warning because the slab's lock (s->cpu_slab->lock) is
> still a spinlock.

Would it be OK to move the call to bpf_map_free_id() from bpf_map_put()
to bpf_map_free_deferred()? That would also fix the lockdep warning for
htab of maps. Its downside is that freeing the map id would be delayed,
but I think that would have no visible effect on users: the refcnt is
already 0, so trying to get the map fd through the map id will return
-ENOENT.
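
For illustration, the suggested alternative would look roughly like the
sketch below. It is simplified and only based on the current shape of
bpf_map_put() and bpf_map_free_deferred(); it is not part of this patch:

/* today: the id is freed as soon as the last reference is dropped */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic64_dec_and_test(&map->refcnt)) {
		bpf_map_free_id(map);
		/* ... other puts elided ... */
		INIT_WORK(&map->work, bpf_map_free_deferred);
		queue_work(system_unbound_wq, &map->work);
	}
}

/* alternative: defer the id removal to the workqueue context, where no
 * raw_spinlock_t is held, at the cost of the id outliving refcnt == 0
 * for a short while
 */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	bpf_map_free_id(map);
	/* ... the rest of the deferred free ... */
}
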
> Instead of changing map_idr_lock's type, fix the issue by invoking
> htab_put_fd_value() after htab_unlock_bucket(). However, only deferring
> the invocation of htab_put_fd_value() is not enough, because the old map
> pointers in htab of maps can not be saved during batched deletion.
> Therefore, also defer the invocation of free_htab_elem(), so these
> to-be-freed elements could be linked together similar to lru map.
>
> There are four callers for ->map_fd_put_ptr:
>
> (1) alloc_htab_elem() (through htab_put_fd_value())
> It invokes ->map_fd_put_ptr() under a raw_spinlock_t. The invocation of
> htab_put_fd_value() can not simply move after htab_unlock_bucket(),
> because the old element has already been stashed in htab->extra_elems.
> It may be reused immediately after htab_unlock_bucket() and the
> invocation of htab_put_fd_value() after htab_unlock_bucket() may release
> the newly-added element incorrectly. Therefore, saving the map pointer
> of the old element for htab of maps before unlocking the bucket and
> releasing the map_ptr after unlock. Beside the map pointer in the old
> element, should do the same thing for the special fields in the old
> element as well.
>
> (2) free_htab_elem() (through htab_put_fd_value())
> Its caller includes __htab_map_lookup_and_delete_elem(),
> htab_map_delete_elem() and __htab_map_lookup_and_delete_batch().
>
> For htab_map_delete_elem(), simply invoke free_htab_elem() after
> htab_unlock_bucket(). For __htab_map_lookup_and_delete_batch(), just
> like lru map, linking the to-be-freed element into node_to_free list
> and invoking free_htab_elem() for these element after unlock. It is safe
> to reuse batch_flink as the link for node_to_free, because these
> elements have been removed from the hash llist.
>
> Because htab of maps doesn't support lookup_and_delete operation,
> __htab_map_lookup_and_delete_elem() doesn't have the problem, so kept
> it as is.
>
> (3) fd_htab_map_free()
> It invokes ->map_fd_put_ptr without raw_spinlock_t.
>
> (4) bpf_fd_htab_map_update_elem()
> It invokes ->map_fd_put_ptr without raw_spinlock_t.
>
> After moving free_htab_elem() outside htab bucket lock scope, using
> pcpu_freelist_push() instead of __pcpu_freelist_push() to disable
> the irq before freeing elements, and protecting the invocations of
> bpf_mem_cache_free() with migrate_{disable|enable} pair.
>
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>  kernel/bpf/hashtab.c | 56 ++++++++++++++++++++++++++++++--------------
>  1 file changed, 39 insertions(+), 17 deletions(-)
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index b14b87463ee0..3ec941a0ea41 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -896,9 +896,12 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>  static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
>  {
>  	check_and_free_fields(htab, l);
> +
> +	migrate_disable();
>  	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
>  		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
>  	bpf_mem_cache_free(&htab->ma, l);
> +	migrate_enable();
>  }
>  
>  static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
> @@ -948,7 +951,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  	if (htab_is_prealloc(htab)) {
>  		bpf_map_dec_elem_count(&htab->map);
>  		check_and_free_fields(htab, l);
> -		__pcpu_freelist_push(&htab->freelist, &l->fnode);
> +		pcpu_freelist_push(&htab->freelist, &l->fnode);
>  	} else {
>  		dec_elem_count(htab);
>  		htab_elem_free(htab, l);
> @@ -1018,7 +1021,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
>  			 */
>  			pl_new = this_cpu_ptr(htab->extra_elems);
>  			l_new = *pl_new;
> -			htab_put_fd_value(htab, old_elem);
>  			*pl_new = old_elem;
>  		} else {
>  			struct pcpu_freelist_node *l;
> @@ -1105,6 +1107,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	struct htab_elem *l_new = NULL, *l_old;
>  	struct hlist_nulls_head *head;
>  	unsigned long flags;
> +	void *old_map_ptr;
>  	struct bucket *b;
>  	u32 key_size, hash;
>  	int ret;
> @@ -1183,12 +1186,27 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
>  	if (l_old) {
>  		hlist_nulls_del_rcu(&l_old->hash_node);
> +
> +		/* l_old has already been stashed in htab->extra_elems, free
> +		 * its special fields before it is available for reuse. Also
> +		 * save the old map pointer in htab of maps before unlock
> +		 * and release it after unlock.
> +		 */
> +		old_map_ptr = NULL;
> +		if (htab_is_prealloc(htab)) {
> +			if (map->ops->map_fd_put_ptr)
> +				old_map_ptr = fd_htab_map_get_ptr(map, l_old);
> +			check_and_free_fields(htab, l_old);
> +		}
> +	}
> +	htab_unlock_bucket(htab, b, hash, flags);
> +	if (l_old) {
> +		if (old_map_ptr)
> +			map->ops->map_fd_put_ptr(map, old_map_ptr, true);
>  		if (!htab_is_prealloc(htab))
>  			free_htab_elem(htab, l_old);
> -		else
> -			check_and_free_fields(htab, l_old);
>  	}
> -	ret = 0;
> +	return 0;
>  err:
>  	htab_unlock_bucket(htab, b, hash, flags);
>  	return ret;
> @@ -1432,15 +1450,15 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
>  		return ret;
>  
>  	l = lookup_elem_raw(head, hash, key, key_size);
> -
> -	if (l) {
> +	if (l)
>  		hlist_nulls_del_rcu(&l->hash_node);
> -		free_htab_elem(htab, l);
> -	} else {
> +	else
>  		ret = -ENOENT;
> -	}
>  
>  	htab_unlock_bucket(htab, b, hash, flags);
> +
> +	if (l)
> +		free_htab_elem(htab, l);
>  	return ret;
>  }
>  
> @@ -1853,13 +1871,14 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>  			 * may cause deadlock. See comments in function
>  			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
>  			 * after releasing the bucket lock.
> +			 *
> +			 * For htab of maps, htab_put_fd_value() in
> +			 * free_htab_elem() may acquire a spinlock with bucket
> +			 * lock being held and it violates the lock rule, so
> +			 * invoke free_htab_elem() after unlock as well.
>  			 */
> -			if (is_lru_map) {
> -				l->batch_flink = node_to_free;
> -				node_to_free = l;
> -			} else {
> -				free_htab_elem(htab, l);
> -			}
> +			l->batch_flink = node_to_free;
> +			node_to_free = l;
>  		}
>  		dst_key += key_size;
>  		dst_val += value_size;
> @@ -1871,7 +1890,10 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>  	while (node_to_free) {
>  		l = node_to_free;
>  		node_to_free = node_to_free->batch_flink;
> -		htab_lru_push_free(htab, l);
> +		if (is_lru_map)
> +			htab_lru_push_free(htab, l);
> +		else
> +			free_htab_elem(htab, l);
>  	}
>  
>  next_batch:
Alexei Starovoitov Nov. 8, 2024, 7:55 p.m. UTC | #2
On Wed, Nov 6, 2024 at 1:53 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
>
>
> On 11/6/2024 2:35 PM, Hou Tao wrote:
> > From: Hou Tao <houtao1@huawei.com>
> >
> > For htab of maps, when the map is removed from the htab, it may hold the
> > last reference of the map. bpf_map_fd_put_ptr() will invoke
> > bpf_map_free_id() to free the id of the removed map element. However,
> > bpf_map_fd_put_ptr() is invoked while holding a bucket lock
> > (raw_spin_lock_t), and bpf_map_free_id() attempts to acquire map_idr_lock
> > (spinlock_t), triggering the following lockdep warning:
> >
> >   =============================
> >   [ BUG: Invalid wait context ]
> >   6.11.0-rc4+ #49 Not tainted
> >   -----------------------------
> >   test_maps/4881 is trying to lock:
> >   ffffffff84884578 (map_idr_lock){+...}-{3:3}, at: bpf_map_free_id.part.0+0x21/0x70
> >   other info that might help us debug this:
> >   context-{5:5}
> >   2 locks held by test_maps/4881:
> >    #0: ffffffff846caf60 (rcu_read_lock){....}-{1:3}, at: bpf_fd_htab_map_update_elem+0xf9/0x270
> >    #1: ffff888149ced148 (&htab->lockdep_key#2){....}-{2:2}, at: htab_map_update_elem+0x178/0xa80
> >   stack backtrace:
> >   CPU: 0 UID: 0 PID: 4881 Comm: test_maps Not tainted 6.11.0-rc4+ #49
> >   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
> >   Call Trace:
> >    <TASK>
> >    dump_stack_lvl+0x6e/0xb0
> >    dump_stack+0x10/0x20
> >    __lock_acquire+0x73e/0x36c0
> >    lock_acquire+0x182/0x450
> >    _raw_spin_lock_irqsave+0x43/0x70
> >    bpf_map_free_id.part.0+0x21/0x70
> >    bpf_map_put+0xcf/0x110
> >    bpf_map_fd_put_ptr+0x9a/0xb0
> >    free_htab_elem+0x69/0xe0
> >    htab_map_update_elem+0x50f/0xa80
> >    bpf_fd_htab_map_update_elem+0x131/0x270
> >    htab_map_update_elem+0x50f/0xa80
> >    bpf_fd_htab_map_update_elem+0x131/0x270
> >    bpf_map_update_value+0x266/0x380
> >    __sys_bpf+0x21bb/0x36b0
> >    __x64_sys_bpf+0x45/0x60
> >    x64_sys_call+0x1b2a/0x20d0
> >    do_syscall_64+0x5d/0x100
> >    entry_SYSCALL_64_after_hwframe+0x76/0x7e
> >
> > One way to fix the lockdep warning is using raw_spinlock_t for
> > map_idr_lock as well. However, bpf_map_alloc_id() invokes
> > idr_alloc_cyclic() after acquiring map_idr_lock, it will trigger a
> > similar lockdep warning because the slab's lock (s->cpu_slab->lock) is
> > still a spinlock.
>
> Is it OK to move the calling of bpf_map_free_id() from bpf_map_put() to
> bpf_map_free_deferred() ? It could fix the lockdep warning for htab of
> maps. Its downside is that the free of map id will be delayed, but I
> think it will not make a visible effect to the user, because the refcnt
> is already 0, trying to get the map fd through map id will return -ENOENT.

I've applied the current patch, since doing the free outside of the
bucket lock is a good thing to do anyway.
With that, there is no need to move bpf_map_free_id(), right?
At least the offload case relies on the id being removed immediately.
I don't remember what else might care.
Hou Tao Nov. 9, 2024, 1:29 a.m. UTC | #3
Hi,

On 11/9/2024 3:55 AM, Alexei Starovoitov wrote:
> On Wed, Nov 6, 2024 at 1:53 AM Hou Tao <houtao@huaweicloud.com> wrote:
>>
>>
>> On 11/6/2024 2:35 PM, Hou Tao wrote:
>>> From: Hou Tao <houtao1@huawei.com>
>>>
>>> For htab of maps, when the map is removed from the htab, it may hold the
>>> last reference of the map. bpf_map_fd_put_ptr() will invoke
>>> bpf_map_free_id() to free the id of the removed map element. However,
>>> bpf_map_fd_put_ptr() is invoked while holding a bucket lock
>>> (raw_spin_lock_t), and bpf_map_free_id() attempts to acquire map_idr_lock
>>> (spinlock_t), triggering the following lockdep warning:
>>>
>>>   =============================
>>>   [ BUG: Invalid wait context ]
>>>   6.11.0-rc4+ #49 Not tainted
>>>   -----------------------------
>>>   test_maps/4881 is trying to lock:
>>>   ffffffff84884578 (map_idr_lock){+...}-{3:3}, at: bpf_map_free_id.part.0+0x21/0x70
>>>   other info that might help us debug this:
>>>   context-{5:5}
>>>   2 locks held by test_maps/4881:
>>>    #0: ffffffff846caf60 (rcu_read_lock){....}-{1:3}, at: bpf_fd_htab_map_update_elem+0xf9/0x270
>>>    #1: ffff888149ced148 (&htab->lockdep_key#2){....}-{2:2}, at: htab_map_update_elem+0x178/0xa80
>>>   stack backtrace:
>>>   CPU: 0 UID: 0 PID: 4881 Comm: test_maps Not tainted 6.11.0-rc4+ #49
>>>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
>>>   Call Trace:
>>>    <TASK>
>>>    dump_stack_lvl+0x6e/0xb0
>>>    dump_stack+0x10/0x20
>>>    __lock_acquire+0x73e/0x36c0
>>>    lock_acquire+0x182/0x450
>>>    _raw_spin_lock_irqsave+0x43/0x70
>>>    bpf_map_free_id.part.0+0x21/0x70
>>>    bpf_map_put+0xcf/0x110
>>>    bpf_map_fd_put_ptr+0x9a/0xb0
>>>    free_htab_elem+0x69/0xe0
>>>    htab_map_update_elem+0x50f/0xa80
>>>    bpf_fd_htab_map_update_elem+0x131/0x270
>>>    htab_map_update_elem+0x50f/0xa80
>>>    bpf_fd_htab_map_update_elem+0x131/0x270
>>>    bpf_map_update_value+0x266/0x380
>>>    __sys_bpf+0x21bb/0x36b0
>>>    __x64_sys_bpf+0x45/0x60
>>>    x64_sys_call+0x1b2a/0x20d0
>>>    do_syscall_64+0x5d/0x100
>>>    entry_SYSCALL_64_after_hwframe+0x76/0x7e
>>>
>>> One way to fix the lockdep warning is using raw_spinlock_t for
>>> map_idr_lock as well. However, bpf_map_alloc_id() invokes
>>> idr_alloc_cyclic() after acquiring map_idr_lock, it will trigger a
>>> similar lockdep warning because the slab's lock (s->cpu_slab->lock) is
>>> still a spinlock.
>> Is it OK to move the calling of bpf_map_free_id() from bpf_map_put() to
>> bpf_map_free_deferred() ? It could fix the lockdep warning for htab of
>> maps. Its downside is that the free of map id will be delayed, but I
>> think it will not make a visible effect to the user, because the refcnt
>> is already 0, trying to get the map fd through map id will return -ENOENT.
> I've applied the current patch, since doing free outside of bucket lock
> is a good thing to do anyway.
> With that no need to move bpf_map_free_id(), right?
> At least offload case relies on id being removed immediately.
> I don't remember what else might care.
> .

Yes. After applying the patch there is no need to move
bpf_map_free_id(). Thanks for the information about the offload map.
According to the implementation of __bpf_map_offload_destroy(), it seems
it may need to call bpf_map_free_id() twice, so bpf_map_free_id() frees
the id immediately and sets map->id to 0.
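
For reference, bpf_map_free_id() looks roughly like the following
(paraphrased from kernel/bpf/syscall.c, so treat it as a sketch rather
than an exact copy); the early return on map->id == 0 is what makes a
second call harmless:

void bpf_map_free_id(struct bpf_map *map)
{
	unsigned long flags;

	/* already freed (e.g. by the offload teardown path) */
	if (!map->id)
		return;

	spin_lock_irqsave(&map_idr_lock, flags);
	idr_remove(&map_idr, map->id);
	map->id = 0;
	spin_unlock_irqrestore(&map_idr_lock, flags);
}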

Patch

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b14b87463ee0..3ec941a0ea41 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -896,9 +896,12 @@  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
 	check_and_free_fields(htab, l);
+
+	migrate_disable();
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
 	bpf_mem_cache_free(&htab->ma, l);
+	migrate_enable();
 }
 
 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -948,7 +951,7 @@  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	if (htab_is_prealloc(htab)) {
 		bpf_map_dec_elem_count(&htab->map);
 		check_and_free_fields(htab, l);
-		__pcpu_freelist_push(&htab->freelist, &l->fnode);
+		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		dec_elem_count(htab);
 		htab_elem_free(htab, l);
@@ -1018,7 +1021,6 @@  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			 */
 			pl_new = this_cpu_ptr(htab->extra_elems);
 			l_new = *pl_new;
-			htab_put_fd_value(htab, old_elem);
 			*pl_new = old_elem;
 		} else {
 			struct pcpu_freelist_node *l;
@@ -1105,6 +1107,7 @@  static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	struct htab_elem *l_new = NULL, *l_old;
 	struct hlist_nulls_head *head;
 	unsigned long flags;
+	void *old_map_ptr;
 	struct bucket *b;
 	u32 key_size, hash;
 	int ret;
@@ -1183,12 +1186,27 @@  static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		hlist_nulls_del_rcu(&l_old->hash_node);
+
+		/* l_old has already been stashed in htab->extra_elems, free
+		 * its special fields before it is available for reuse. Also
+		 * save the old map pointer in htab of maps before unlock
+		 * and release it after unlock.
+		 */
+		old_map_ptr = NULL;
+		if (htab_is_prealloc(htab)) {
+			if (map->ops->map_fd_put_ptr)
+				old_map_ptr = fd_htab_map_get_ptr(map, l_old);
+			check_and_free_fields(htab, l_old);
+		}
+	}
+	htab_unlock_bucket(htab, b, hash, flags);
+	if (l_old) {
+		if (old_map_ptr)
+			map->ops->map_fd_put_ptr(map, old_map_ptr, true);
 		if (!htab_is_prealloc(htab))
 			free_htab_elem(htab, l_old);
-		else
-			check_and_free_fields(htab, l_old);
 	}
-	ret = 0;
+	return 0;
 err:
 	htab_unlock_bucket(htab, b, hash, flags);
 	return ret;
@@ -1432,15 +1450,15 @@  static long htab_map_delete_elem(struct bpf_map *map, void *key)
 		return ret;
 
 	l = lookup_elem_raw(head, hash, key, key_size);
-
-	if (l) {
+	if (l)
 		hlist_nulls_del_rcu(&l->hash_node);
-		free_htab_elem(htab, l);
-	} else {
+	else
 		ret = -ENOENT;
-	}
 
 	htab_unlock_bucket(htab, b, hash, flags);
+
+	if (l)
+		free_htab_elem(htab, l);
 	return ret;
 }
 
@@ -1853,13 +1871,14 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 			 * may cause deadlock. See comments in function
 			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
 			 * after releasing the bucket lock.
+			 *
+			 * For htab of maps, htab_put_fd_value() in
+			 * free_htab_elem() may acquire a spinlock with bucket
+			 * lock being held and it violates the lock rule, so
+			 * invoke free_htab_elem() after unlock as well.
 			 */
-			if (is_lru_map) {
-				l->batch_flink = node_to_free;
-				node_to_free = l;
-			} else {
-				free_htab_elem(htab, l);
-			}
+			l->batch_flink = node_to_free;
+			node_to_free = l;
 		}
 		dst_key += key_size;
 		dst_val += value_size;
@@ -1871,7 +1890,10 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	while (node_to_free) {
 		l = node_to_free;
 		node_to_free = node_to_free->batch_flink;
-		htab_lru_push_free(htab, l);
+		if (is_lru_map)
+			htab_lru_push_free(htab, l);
+		else
+			free_htab_elem(htab, l);
 	}
 
 next_batch: