diff mbox series

btrfs: fix a race in encoded read

Message ID 20241212075303.2538880-1-neelx@suse.com (mailing list archive)
State New
Headers show
Series btrfs: fix a race in encoded read | expand

Commit Message

Daniel Vacek Dec. 12, 2024, 7:53 a.m. UTC
While testing the encoded read feature the following crash was observed
and it can be reliably reproduced:

[ 2916.441731] Oops: general protection fault, probably for non-canonical address 0xa3f64e06d5eee2c7: 0000 [#1] PREEMPT_RT SMP NOPTI
[ 2916.441736] CPU: 5 UID: 0 PID: 592 Comm: kworker/u38:4 Kdump: loaded Not tainted 6.13.0-rc1+ #4
[ 2916.441739] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 2916.441740] Workqueue: btrfs-endio btrfs_end_bio_work [btrfs]
[ 2916.441777] RIP: 0010:__wake_up_common+0x29/0xa0
[ 2916.441808] RSP: 0018:ffffaaec0128fd80 EFLAGS: 00010216
[ 2916.441810] RAX: 0000000000000001 RBX: ffff95a6429cf020 RCX: 0000000000000000
[ 2916.441811] RDX: a3f64e06d5eee2c7 RSI: 0000000000000003 RDI: ffff95a6429cf000
                    ^^^^^^^^^^^^^^^^
                    This comes from `priv->wait.head.next`

[ 2916.441823] Call Trace:
[ 2916.441833]  <TASK>
[ 2916.441881]  ? __wake_up_common+0x29/0xa0
[ 2916.441883]  __wake_up_common_lock+0x37/0x60
[ 2916.441887]  btrfs_encoded_read_endio+0x73/0x90 [btrfs]  <<< UAF of `priv` object,
[ 2916.441921]  btrfs_check_read_bio+0x321/0x500 [btrfs]        details below.
[ 2916.441947]  process_scheduled_works+0xc1/0x410
[ 2916.441960]  worker_thread+0x105/0x240

crash> btrfs_encoded_read_private.wait.head ffff95a6429cf000	# `priv` from RDI ^^
  wait.head = {
    next = 0xa3f64e06d5eee2c7,	# Corrupted as the object was already freed/reused.
    prev = 0xffff95a6429cf020	# Stale data still point to itself (`&priv->wait.head`
  }				  also in RBX ^^) ie. the list was free.

Possibly, this is easier (or even only?) reproducible on a preemptible kernel.
I just happened to build an RT kernel for additional testing coverage.
Enabling slab debug gives us further related details, mostly confirming
what's expected:

[11:23:07] =============================================================================
[11:23:07] BUG kmalloc-64 (Not tainted): Poison overwritten
[11:23:07] -----------------------------------------------------------------------------

[11:23:07] 0xffff8fc7c5b6b542-0xffff8fc7c5b6b543 @offset=5442. First byte 0x4 instead of 0x6b
                            ^
     That makes two bytes into the `priv->wait.lock`

[11:23:07] FIX kmalloc-64: Restoring Poison 0xffff8fc7c5b6b542-0xffff8fc7c5b6b543=0x6b
[11:23:07] Allocated in btrfs_encoded_read_regular_fill_pages+0x5e/0x260 [btrfs] age=4 cpu=0 pid=18295
[11:23:07]  __kmalloc_cache_noprof+0x81/0x2a0
[11:23:07]  btrfs_encoded_read_regular_fill_pages+0x5e/0x260 [btrfs]
[11:23:07]  btrfs_encoded_read_regular+0xee/0x200 [btrfs]
[11:23:07]  btrfs_ioctl_encoded_read+0x477/0x600 [btrfs]
[11:23:07]  btrfs_ioctl+0xefe/0x2a00 [btrfs]
[11:23:07]  __x64_sys_ioctl+0xa3/0xc0
[11:23:07]  do_syscall_64+0x74/0x180
[11:23:07]  entry_SYSCALL_64_after_hwframe+0x76/0x7e

  9121  	unsigned long i = 0;
  9122  	struct btrfs_bio *bbio;
  9123  	int ret;
  9124
* 9125  	priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
  9126  	if (!priv)
  9127  		return -ENOMEM;
  9128
  9129  	init_waitqueue_head(&priv->wait);

[11:23:07] Freed in btrfs_encoded_read_regular_fill_pages+0x1f9/0x260 [btrfs] age=4 cpu=0 pid=18295
[11:23:07]  btrfs_encoded_read_regular_fill_pages+0x1f9/0x260 [btrfs]
[11:23:07]  btrfs_encoded_read_regular+0xee/0x200 [btrfs]
[11:23:07]  btrfs_ioctl_encoded_read+0x477/0x600 [btrfs]
[11:23:07]  btrfs_ioctl+0xefe/0x2a00 [btrfs]
[11:23:07]  __x64_sys_ioctl+0xa3/0xc0
[11:23:07]  do_syscall_64+0x74/0x180
[11:23:07]  entry_SYSCALL_64_after_hwframe+0x76/0x7e

  9171  		if (atomic_dec_return(&priv->pending) != 0)
  9172  			io_wait_event(priv->wait, !atomic_read(&priv->pending));
  9173  		/* See btrfs_encoded_read_endio() for ordering. */
  9174  		ret = blk_status_to_errno(READ_ONCE(priv->status));
* 9175  		kfree(priv);
  9176  		return ret;
  9177  	}
  9178  }

`priv` was freed here but then after that it was further used. The report
is coming soon after, see below. Note that the report is a few seconds
delayed by the RCU stall timeout. (It is the same example as with the
GPF crash above, just that one was reported right away without any delay).

Due to the poison this time instead of the GPF exception as observed above
the UAF caused a CPU hard lockup (reported by the RCU stall check as this
was a VM):

[11:23:28] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[11:23:28] rcu:         0-...!: (1 GPs behind) idle=48b4/1/0x4000000000000000 softirq=0/0 fqs=5 rcuc=5254 jiffies(starved)
[11:23:28] rcu:         (detected by 1, t=5252 jiffies, g=1631241, q=250054 ncpus=8)
[11:23:28] Sending NMI from CPU 1 to CPUs 0:
[11:23:28] NMI backtrace for cpu 0
[11:23:28] CPU: 0 UID: 0 PID: 21445 Comm: kworker/u33:3 Kdump: loaded Tainted: G    B              6.13.0-rc1+ #4
[11:23:28] Tainted: [B]=BAD_PAGE
[11:23:28] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[11:23:28] Workqueue: btrfs-endio btrfs_end_bio_work [btrfs]
[11:23:28] RIP: 0010:native_halt+0xa/0x10
[11:23:28] RSP: 0018:ffffb42ec277bc48 EFLAGS: 00000046
[11:23:28] Call Trace:
[11:23:28]  <TASK>
[11:23:28]  kvm_wait+0x53/0x60
[11:23:28]  __pv_queued_spin_lock_slowpath+0x2ea/0x350
[11:23:28]  _raw_spin_lock_irq+0x2b/0x40
[11:23:28]  rtlock_slowlock_locked+0x1f3/0xce0
[11:23:28]  rt_spin_lock+0x7b/0xb0
[11:23:28]  __wake_up_common_lock+0x23/0x60
[11:23:28]  btrfs_encoded_read_endio+0x73/0x90 [btrfs]  <<< UAF of `priv` object.
[11:23:28]  btrfs_check_read_bio+0x321/0x500 [btrfs]
[11:23:28]  process_scheduled_works+0xc1/0x410
[11:23:28]  worker_thread+0x105/0x240

  9105  		if (priv->uring_ctx) {
  9106  			int err = blk_status_to_errno(READ_ONCE(priv->status));
  9107  			btrfs_uring_read_extent_endio(priv->uring_ctx, err);
  9108  			kfree(priv);
  9109  		} else {
* 9110  			wake_up(&priv->wait);	<<< So we know UAF/GPF happens here.
  9111  		}
  9112  	}
  9113  	bio_put(&bbio->bio);

Now, the wait queue here does not really guarantee a proper
synchronization between `btrfs_encoded_read_regular_fill_pages()`
and `btrfs_encoded_read_endio()` which eventually results in various
use-after-free effects like general protection fault or CPU hard lockup.

Using a plain wait queue without additional instrumentation on top of the
`pending` counter is simply insufficient in this context. The reason the
wait queue fails here is that the lifespan of that structure is only within
the `btrfs_encoded_read_regular_fill_pages()` function. In such a case a
plain wait queue cannot be used to synchronize for its own destruction.

Fix this by correctly using completion instead.

Also, while the lifespan of the structures in sync case is strictly
limited within the `..._fill_pages()` function, there is no need to
allocate from slab. Stack can be safely used instead.

Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl")
CC: stable@vger.kernel.org # 5.18+
Signed-off-by: Daniel Vacek <neelx@suse.com>
---
 fs/btrfs/inode.c | 62 ++++++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

Comments

Johannes Thumshirn Dec. 12, 2024, 8 a.m. UTC | #1
On 12.12.24 08:54, Daniel Vacek wrote:
> While testing the encoded read feature the following crash was observed
> and it can be reliably reproduced:
> 


Hi Daniel,

This suspiciously looks like '05b36b04d74a ("btrfs: fix use-after-free 
in btrfs_encoded_read_endio()")'. Do you have this patch applied to your 
kernel? IIRC it went upstream with 6.13-rc2.

Byte,
	Johannes

> [ 2916.441731] Oops: general protection fault, probably for non-canonical address 0xa3f64e06d5eee2c7: 0000 [#1] PREEMPT_RT SMP NOPTI
> [ 2916.441736] CPU: 5 UID: 0 PID: 592 Comm: kworker/u38:4 Kdump: loaded Not tainted 6.13.0-rc1+ #4
> [ 2916.441739] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [ 2916.441740] Workqueue: btrfs-endio btrfs_end_bio_work [btrfs]
> [ 2916.441777] RIP: 0010:__wake_up_common+0x29/0xa0
> [ 2916.441808] RSP: 0018:ffffaaec0128fd80 EFLAGS: 00010216
> [ 2916.441810] RAX: 0000000000000001 RBX: ffff95a6429cf020 RCX: 0000000000000000
> [ 2916.441811] RDX: a3f64e06d5eee2c7 RSI: 0000000000000003 RDI: ffff95a6429cf000
>                      ^^^^^^^^^^^^^^^^
>                      This comes from `priv->wait.head.next`
> 
> [ 2916.441823] Call Trace:
> [ 2916.441833]  <TASK>
> [ 2916.441881]  ? __wake_up_common+0x29/0xa0
> [ 2916.441883]  __wake_up_common_lock+0x37/0x60
> [ 2916.441887]  btrfs_encoded_read_endio+0x73/0x90 [btrfs]  <<< UAF of `priv` object,
> [ 2916.441921]  btrfs_check_read_bio+0x321/0x500 [btrfs]        details below.
> [ 2916.441947]  process_scheduled_works+0xc1/0x410
> [ 2916.441960]  worker_thread+0x105/0x240
> 
> crash> btrfs_encoded_read_private.wait.head ffff95a6429cf000	# `priv` from RDI ^^
>    wait.head = {
>      next = 0xa3f64e06d5eee2c7,	# Corrupted as the object was already freed/reused.
>      prev = 0xffff95a6429cf020	# Stale data still point to itself (`&priv->wait.head`
>    }				  also in RBX ^^) ie. the list was free.
> 
> Possibly, this is easier (or even only?) reproducible on preemptible kernel.
> It just happened to build an RT kernel for additional testing coverage.
> Enabling slab debug gives us further related details, mostly confirming
> what's expected:
> 
> [11:23:07] =============================================================================
> [11:23:07] BUG kmalloc-64 (Not tainted): Poison overwritten
> [11:23:07] -----------------------------------------------------------------------------
> 
> [11:23:07] 0xffff8fc7c5b6b542-0xffff8fc7c5b6b543 @offset=5442. First byte 0x4 instead of 0x6b
>                              ^
>       That makes two bytes into the `priv->wait.lock`
> 
> [11:23:07] FIX kmalloc-64: Restoring Poison 0xffff8fc7c5b6b542-0xffff8fc7c5b6b543=0x6b
> [11:23:07] Allocated in btrfs_encoded_read_regular_fill_pages+0x5e/0x260 [btrfs] age=4 cpu=0 pid=18295
> [11:23:07]  __kmalloc_cache_noprof+0x81/0x2a0
> [11:23:07]  btrfs_encoded_read_regular_fill_pages+0x5e/0x260 [btrfs]
> [11:23:07]  btrfs_encoded_read_regular+0xee/0x200 [btrfs]
> [11:23:07]  btrfs_ioctl_encoded_read+0x477/0x600 [btrfs]
> [11:23:07]  btrfs_ioctl+0xefe/0x2a00 [btrfs]
> [11:23:07]  __x64_sys_ioctl+0xa3/0xc0
> [11:23:07]  do_syscall_64+0x74/0x180
> [11:23:07]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
>    9121  	unsigned long i = 0;
>    9122  	struct btrfs_bio *bbio;
>    9123  	int ret;
>    9124
> * 9125  	priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
>    9126  	if (!priv)
>    9127  		return -ENOMEM;
>    9128
>    9129  	init_waitqueue_head(&priv->wait);
> 
> [11:23:07] Freed in btrfs_encoded_read_regular_fill_pages+0x1f9/0x260 [btrfs] age=4 cpu=0 pid=18295
> [11:23:07]  btrfs_encoded_read_regular_fill_pages+0x1f9/0x260 [btrfs]
> [11:23:07]  btrfs_encoded_read_regular+0xee/0x200 [btrfs]
> [11:23:07]  btrfs_ioctl_encoded_read+0x477/0x600 [btrfs]
> [11:23:07]  btrfs_ioctl+0xefe/0x2a00 [btrfs]
> [11:23:07]  __x64_sys_ioctl+0xa3/0xc0
> [11:23:07]  do_syscall_64+0x74/0x180
> [11:23:07]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
>    9171  		if (atomic_dec_return(&priv->pending) != 0)
>    9172  			io_wait_event(priv->wait, !atomic_read(&priv->pending));
>    9173  		/* See btrfs_encoded_read_endio() for ordering. */
>    9174  		ret = blk_status_to_errno(READ_ONCE(priv->status));
> * 9175  		kfree(priv);
>    9176  		return ret;
>    9177  	}
>    9178  }
> 
> `priv` was freed here but then after that it was further used. The report
> is comming soon after, see below. Note that the report is a few seconds
> delayed by the RCU stall timeout. (It is the same example as with the
> GPF crash above, just that one was reported right away without any delay).
> 
> Due to the poison this time instead of the GPF exception as observed above
> the UAF caused a CPU hard lockup (reported by the RCU stall check as this
> was a VM):
> 
> [11:23:28] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
> [11:23:28] rcu:         0-...!: (1 GPs behind) idle=48b4/1/0x4000000000000000 softirq=0/0 fqs=5 rcuc=5254 jiffies(starved)
> [11:23:28] rcu:         (detected by 1, t=5252 jiffies, g=1631241, q=250054 ncpus=8)
> [11:23:28] Sending NMI from CPU 1 to CPUs 0:
> [11:23:28] NMI backtrace for cpu 0
> [11:23:28] CPU: 0 UID: 0 PID: 21445 Comm: kworker/u33:3 Kdump: loaded Tainted: G    B              6.13.0-rc1+ #4
> [11:23:28] Tainted: [B]=BAD_PAGE
> [11:23:28] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [11:23:28] Workqueue: btrfs-endio btrfs_end_bio_work [btrfs]
> [11:23:28] RIP: 0010:native_halt+0xa/0x10
> [11:23:28] RSP: 0018:ffffb42ec277bc48 EFLAGS: 00000046
> [11:23:28] Call Trace:
> [11:23:28]  <TASK>
> [11:23:28]  kvm_wait+0x53/0x60
> [11:23:28]  __pv_queued_spin_lock_slowpath+0x2ea/0x350
> [11:23:28]  _raw_spin_lock_irq+0x2b/0x40
> [11:23:28]  rtlock_slowlock_locked+0x1f3/0xce0
> [11:23:28]  rt_spin_lock+0x7b/0xb0
> [11:23:28]  __wake_up_common_lock+0x23/0x60
> [11:23:28]  btrfs_encoded_read_endio+0x73/0x90 [btrfs]  <<< UAF of `priv` object.
> [11:23:28]  btrfs_check_read_bio+0x321/0x500 [btrfs]
> [11:23:28]  process_scheduled_works+0xc1/0x410
> [11:23:28]  worker_thread+0x105/0x240
> 
>    9105  		if (priv->uring_ctx) {
>    9106  			int err = blk_status_to_errno(READ_ONCE(priv->status));
>    9107  			btrfs_uring_read_extent_endio(priv->uring_ctx, err);
>    9108  			kfree(priv);
>    9109  		} else {
> * 9110  			wake_up(&priv->wait);	<<< So we know UAF/GPF happens here.
>    9111  		}
>    9112  	}
>    9113  	bio_put(&bbio->bio);
> 
> Now, the wait queue here does not really guarantee a proper
> synchronization between `btrfs_encoded_read_regular_fill_pages()`
> and `btrfs_encoded_read_endio()` which eventually results in various
> use-afer-free effects like general protection fault or CPU hard lockup.
> 
> Using plain wait queue without additional instrumentation on top of the
> `pending` counter is simply insufficient in this context. The reason wait
> queue fails here is because the lifespan of that structure is only within
> the `btrfs_encoded_read_regular_fill_pages()` function. In such a case
> plain wait queue cannot be used to synchronize for it's own destruction.
> 
> Fix this by correctly using completion instead.
> 
> Also, while the lifespan of the structures in sync case is strictly
> limited within the `..._fill_pages()` function, there is no need to
> allocate from slab. Stack can be safely used instead.
> 
> Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl")
> CC: stable@vger.kernel.org # 5.18+
> Signed-off-by: Daniel Vacek <neelx@suse.com>
> ---
>   fs/btrfs/inode.c | 62 ++++++++++++++++++++++++++----------------------
>   1 file changed, 33 insertions(+), 29 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index fa648ab6fe806..61e0fd5c6a15f 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9078,7 +9078,7 @@ static ssize_t btrfs_encoded_read_inline(
>   }
>   
>   struct btrfs_encoded_read_private {
> -	wait_queue_head_t wait;
> +	struct completion *sync_read;
>   	void *uring_ctx;
>   	atomic_t pending;
>   	blk_status_t status;
> @@ -9090,23 +9090,22 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
>   
>   	if (bbio->bio.bi_status) {
>   		/*
> -		 * The memory barrier implied by the atomic_dec_return() here
> -		 * pairs with the memory barrier implied by the
> -		 * atomic_dec_return() or io_wait_event() in
> -		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
> -		 * write is observed before the load of status in
> -		 * btrfs_encoded_read_regular_fill_pages().
> +		 * The memory barrier implied by the
> +		 * atomic_dec_and_test() here pairs with the memory
> +		 * barrier implied by the atomic_dec_and_test() in
> +		 * btrfs_encoded_read_regular_fill_pages() to ensure
> +		 * that this write is observed before the load of
> +		 * status in btrfs_encoded_read_regular_fill_pages().
>   		 */
>   		WRITE_ONCE(priv->status, bbio->bio.bi_status);
>   	}
>   	if (atomic_dec_and_test(&priv->pending)) {
> -		int err = blk_status_to_errno(READ_ONCE(priv->status));
> -
>   		if (priv->uring_ctx) {
> +			int err = blk_status_to_errno(READ_ONCE(priv->status));
>   			btrfs_uring_read_extent_endio(priv->uring_ctx, err);
>   			kfree(priv);
>   		} else {
> -			wake_up(&priv->wait);
> +			complete(priv->sync_read);
>   		}
>   	}
>   	bio_put(&bbio->bio);
> @@ -9117,16 +9116,21 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
>   					  struct page **pages, void *uring_ctx)
>   {
>   	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> -	struct btrfs_encoded_read_private *priv;
> +	struct completion sync_read;
> +	struct btrfs_encoded_read_private sync_priv, *priv;
>   	unsigned long i = 0;
>   	struct btrfs_bio *bbio;
> -	int ret;
>   
> -	priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
> -	if (!priv)
> -		return -ENOMEM;
> +	if (uring_ctx) {
> +		priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
> +		if (!priv)
> +			return -ENOMEM;
> +	} else {
> +		priv = &sync_priv;
> +		init_completion(&sync_read);
> +		priv->sync_read = &sync_read;
> +	}
>   
> -	init_waitqueue_head(&priv->wait);
>   	atomic_set(&priv->pending, 1);
>   	priv->status = 0;
>   	priv->uring_ctx = uring_ctx;
> @@ -9158,23 +9162,23 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
>   	atomic_inc(&priv->pending);
>   	btrfs_submit_bbio(bbio, 0);
>   
> -	if (uring_ctx) {
> -		if (atomic_dec_return(&priv->pending) == 0) {
> -			ret = blk_status_to_errno(READ_ONCE(priv->status));
> -			btrfs_uring_read_extent_endio(uring_ctx, ret);
> +	if (atomic_dec_and_test(&priv->pending)) {
> +		if (uring_ctx) {
> +			int err = blk_status_to_errno(READ_ONCE(priv->status));
> +			btrfs_uring_read_extent_endio(uring_ctx, err);
>   			kfree(priv);
> -			return ret;
> +			return err;
> +		} else {
> +			complete(&sync_read);
>   		}
> +	}
>   
> +	if (uring_ctx)
>   		return -EIOCBQUEUED;
> -	} else {
> -		if (atomic_dec_return(&priv->pending) != 0)
> -			io_wait_event(priv->wait, !atomic_read(&priv->pending));
> -		/* See btrfs_encoded_read_endio() for ordering. */
> -		ret = blk_status_to_errno(READ_ONCE(priv->status));
> -		kfree(priv);
> -		return ret;
> -	}
> +
> +	wait_for_completion_io(&sync_read);
> +	/* See btrfs_encoded_read_endio() for ordering. */
> +	return blk_status_to_errno(READ_ONCE(priv->status));
>   }
>   
>   ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
Daniel Vacek Dec. 12, 2024, 8:16 a.m. UTC | #2
Hi Johannes,

On Thu, Dec 12, 2024 at 9:00 AM Johannes Thumshirn
<Johannes.Thumshirn@wdc.com> wrote:
>
> On 12.12.24 08:54, Daniel Vacek wrote:
> > While testing the encoded read feature the following crash was observed
> > and it can be reliably reproduced:
> >
>
>
> Hi Daniel,
>
> This suspiciously looks like '05b36b04d74a ("btrfs: fix use-after-free
> in btrfs_encoded_read_endio()")'. Do you have this patch applied to your
> kernel? IIRC it went upstream with 6.13-rc2.

Yes, I do. This one is on top of it. The crash happens with
`05b36b04d74a` applied. All the crashes were reproduced with a
build of `feffde684ac2`.

Honestly, `05b36b04d74a` looks a bit suspicious to me as it really
does not look to deal correctly with the issue to me. I was a bit
surprised/puzzled.

Anyways, I could reproduce the crash in a matter of half an hour. With
this fix the torture is surviving for 22 hours atm.

--nX
Daniel Vacek Dec. 12, 2024, 8:53 a.m. UTC | #3
On Thu, Dec 12, 2024 at 9:35 AM Johannes Thumshirn
<Johannes.Thumshirn@wdc.com> wrote:
>
> On 12.12.24 09:09, Daniel Vacek wrote:
> > Hi Johannes,
> >
> > On Thu, Dec 12, 2024 at 9:00 AM Johannes Thumshirn
> > <Johannes.Thumshirn@wdc.com> wrote:
> >>
> >> On 12.12.24 08:54, Daniel Vacek wrote:
> >>> While testing the encoded read feature the following crash was observed
> >>> and it can be reliably reproduced:
> >>>
> >>
> >>
> >> Hi Daniel,
> >>
> >> This suspiciously looks like '05b36b04d74a ("btrfs: fix use-after-free
> >> in btrfs_encoded_read_endio()")'. Do you have this patch applied to your
> >> kernel? IIRC it went upstream with 6.13-rc2.
> >
> > Yes, I do. This one is on top of it. The crash happens with
> > `05b36b04d74a` applied. All the crashes were reproduced with
> > `feffde684ac2`.
> >
> > Honestly, `05b36b04d74a` looks a bit suspicious to me as it really
> > does not look to deal correctly with the issue to me. I was a bit
> > surprised/puzzled.
>
> Can you elaborate why?

As it only touches one of those four atomic_dec_... lines. In theory
the issue can happen also on the two async places, IIUC. It's only a
matter of race probability.

> > Anyways, I could reproduce the crash in a matter of half an hour. With
> > this fix the torture is surviving for 22 hours atm.
>
> Do you also have '3ff867828e93 ("btrfs: simplify waiting for encoded
> read endios")'? Looking at the diff it doesn't seems so.

I cannot find that one. Am I missing something? Which repo are you using?

--nX
Johannes Thumshirn Dec. 12, 2024, 9:02 a.m. UTC | #4
On 12.12.24 09:53, Daniel Vacek wrote:
> On Thu, Dec 12, 2024 at 9:35 AM Johannes Thumshirn
> <Johannes.Thumshirn@wdc.com> wrote:
>>
>> On 12.12.24 09:09, Daniel Vacek wrote:
>>> Hi Johannes,
>>>
>>> On Thu, Dec 12, 2024 at 9:00 AM Johannes Thumshirn
>>> <Johannes.Thumshirn@wdc.com> wrote:
>>>>
>>>> On 12.12.24 08:54, Daniel Vacek wrote:
>>>>> While testing the encoded read feature the following crash was observed
>>>>> and it can be reliably reproduced:
>>>>>
>>>>
>>>>
>>>> Hi Daniel,
>>>>
>>>> This suspiciously looks like '05b36b04d74a ("btrfs: fix use-after-free
>>>> in btrfs_encoded_read_endio()")'. Do you have this patch applied to your
>>>> kernel? IIRC it went upstream with 6.13-rc2.
>>>
>>> Yes, I do. This one is on top of it. The crash happens with
>>> `05b36b04d74a` applied. All the crashes were reproduced with
>>> `feffde684ac2`.
>>>
>>> Honestly, `05b36b04d74a` looks a bit suspicious to me as it really
>>> does not look to deal correctly with the issue to me. I was a bit
>>> surprised/puzzled.
>>
>> Can you elaborate why?
> 
> As it only touches one of those four atomic_dec_... lines. In theory
> the issue can happen also on the two async places, IIUC. It's only a
> matter of race probability.
> 
>>> Anyways, I could reproduce the crash in a matter of half an hour. With
>>> this fix the torture is surviving for 22 hours atm.
>>
>> Do you also have '3ff867828e93 ("btrfs: simplify waiting for encoded
>> read endios")'? Looking at the diff it doesn't seems so.
> 
> I cannot find that one. Am I missing something? Which repo are you using?

The for-next branch for btrfs [1], which is what ppl developing against 
btrfs should use. Can you please re-test with it and if needed re-base 
your patch on top of it?

[1] https://github.com/btrfs/linux for-next

Thanks,
	Johannes
Daniel Vacek Dec. 12, 2024, 9:08 a.m. UTC | #5
On Thu, Dec 12, 2024 at 10:02 AM Johannes Thumshirn
<Johannes.Thumshirn@wdc.com> wrote:
>
> On 12.12.24 09:53, Daniel Vacek wrote:
> > On Thu, Dec 12, 2024 at 9:35 AM Johannes Thumshirn
> > <Johannes.Thumshirn@wdc.com> wrote:
> >>
> >> On 12.12.24 09:09, Daniel Vacek wrote:
> >>> Hi Johannes,
> >>>
> >>> On Thu, Dec 12, 2024 at 9:00 AM Johannes Thumshirn
> >>> <Johannes.Thumshirn@wdc.com> wrote:
> >>>>
> >>>> On 12.12.24 08:54, Daniel Vacek wrote:
> >>>>> While testing the encoded read feature the following crash was observed
> >>>>> and it can be reliably reproduced:
> >>>>>
> >>>>
> >>>>
> >>>> Hi Daniel,
> >>>>
> >>>> This suspiciously looks like '05b36b04d74a ("btrfs: fix use-after-free
> >>>> in btrfs_encoded_read_endio()")'. Do you have this patch applied to your
> >>>> kernel? IIRC it went upstream with 6.13-rc2.
> >>>
> >>> Yes, I do. This one is on top of it. The crash happens with
> >>> `05b36b04d74a` applied. All the crashes were reproduced with
> >>> `feffde684ac2`.
> >>>
> >>> Honestly, `05b36b04d74a` looks a bit suspicious to me as it really
> >>> does not look to deal correctly with the issue to me. I was a bit
> >>> surprised/puzzled.
> >>
> >> Can you elaborate why?
> >
> > As it only touches one of those four atomic_dec_... lines. In theory
> > the issue can happen also on the two async places, IIUC. It's only a
> > matter of race probability.
> >
> >>> Anyways, I could reproduce the crash in a matter of half an hour. With
> >>> this fix the torture is surviving for 22 hours atm.
> >>
> >> Do you also have '3ff867828e93 ("btrfs: simplify waiting for encoded
> >> read endios")'? Looking at the diff it doesn't seems so.
> >
> > I cannot find that one. Am I missing something? Which repo are you using?
>
> The for-next branch for btrfs [1], which is what ppl developing against
> btrfs should use. Can you please re-test with it and if needed re-base
> your patch on top of it?
>
> [1] https://github.com/btrfs/linux for-next

I did check here and I don't really see the commit.

$ git remote -v
origin    https://github.com/btrfs/linux.git (fetch)
origin    https://github.com/btrfs/linux.git (push)
$ git fetch
$ git show 3ff867828e93 --
fatal: bad revision '3ff867828e93'

Note, I was testing v6.13-rc1. This is a fix not a feature development.

--nX

> Thanks,
>         Johannes
Johannes Thumshirn Dec. 12, 2024, 9:13 a.m. UTC | #6
On 12.12.24 10:08, Daniel Vacek wrote:
> On Thu, Dec 12, 2024 at 10:02 AM Johannes Thumshirn
> <Johannes.Thumshirn@wdc.com> wrote:
>>
>> On 12.12.24 09:53, Daniel Vacek wrote:
>>> On Thu, Dec 12, 2024 at 9:35 AM Johannes Thumshirn
>>> <Johannes.Thumshirn@wdc.com> wrote:
>>>>
>>>> On 12.12.24 09:09, Daniel Vacek wrote:
>>>>> Hi Johannes,
>>>>>
>>>>> On Thu, Dec 12, 2024 at 9:00 AM Johannes Thumshirn
>>>>> <Johannes.Thumshirn@wdc.com> wrote:
>>>>>>
>>>>>> On 12.12.24 08:54, Daniel Vacek wrote:
>>>>>>> While testing the encoded read feature the following crash was observed
>>>>>>> and it can be reliably reproduced:
>>>>>>>
>>>>>>
>>>>>>
>>>>>> Hi Daniel,
>>>>>>
>>>>>> This suspiciously looks like '05b36b04d74a ("btrfs: fix use-after-free
>>>>>> in btrfs_encoded_read_endio()")'. Do you have this patch applied to your
>>>>>> kernel? IIRC it went upstream with 6.13-rc2.
>>>>>
>>>>> Yes, I do. This one is on top of it. The crash happens with
>>>>> `05b36b04d74a` applied. All the crashes were reproduced with
>>>>> `feffde684ac2`.
>>>>>
>>>>> Honestly, `05b36b04d74a` looks a bit suspicious to me as it really
>>>>> does not look to deal correctly with the issue to me. I was a bit
>>>>> surprised/puzzled.
>>>>
>>>> Can you elaborate why?
>>>
>>> As it only touches one of those four atomic_dec_... lines. In theory
>>> the issue can happen also on the two async places, IIUC. It's only a
>>> matter of race probability.
>>>
>>>>> Anyways, I could reproduce the crash in a matter of half an hour. With
>>>>> this fix the torture is surviving for 22 hours atm.
>>>>
>>>> Do you also have '3ff867828e93 ("btrfs: simplify waiting for encoded
>>>> read endios")'? Looking at the diff it doesn't seems so.
>>>
>>> I cannot find that one. Am I missing something? Which repo are you using?
>>
>> The for-next branch for btrfs [1], which is what ppl developing against
>> btrfs should use. Can you please re-test with it and if needed re-base
>> your patch on top of it?
>>
>> [1] https://github.com/btrfs/linux for-next
> 
> I did check here and I don't really see the commit.
> 
> $ git remote -v
> origin    https://github.com/btrfs/linux.git (fetch)
> origin    https://github.com/btrfs/linux.git (push)
> $ git fetch
> $ git show 3ff867828e93 --
> fatal: bad revision '3ff867828e93'
> 
> Note, I was testing v6.13-rc1. This is a fix not a feature development.

It got recently force pushed, 34725028ec5500018f1cb5bfd55c669c7bbf1346 
it is now, sorry.
Daniel Vacek Dec. 12, 2024, 9:34 a.m. UTC | #7
On Thu, Dec 12, 2024 at 10:14 AM Johannes Thumshirn
<Johannes.Thumshirn@wdc.com> wrote:
> It got recently force pushed, 34725028ec5500018f1cb5bfd55c669c7bbf1346
> it is now, sorry.

Yeah, this looks very similar and it should fix the bug as well. In
fact the fix part looks exactly the same, I just also changed the
slab/stack allocation while you changed the atomic/refcount. But these
are unrelated, IIUC. I actually planned to split it into two patches
but David told me it's not necessary and I should send it as it is.

Just nitpicking about your patch, the subject says simplify while I
don't really see any simplification.
Also it does not mention the UAF bug leading to crashes it fixes,
missing the Fixes: and CC: stable tags.

What do we do now?

--nX
Johannes Thumshirn Dec. 12, 2024, 10:10 a.m. UTC | #8
On 12.12.24 10:35, Daniel Vacek wrote:
> On Thu, Dec 12, 2024 at 10:14 AM Johannes Thumshirn
> <Johannes.Thumshirn@wdc.com> wrote:
>> It got recently force pushed, 34725028ec5500018f1cb5bfd55c669c7bbf1346
>> it is now, sorry.
> 
> Yeah, this looks very similar and it should fix the bug as well. In
> fact the fix part looks exactly the same, I just also changed the
> slab/stack allocation while you changed the atomic/refcount. But these
> are unrelated, IIUC. I actually planned to split it into two patches
> but David told me it's not necessary and I should send it as it is.
> 
> Just nitpicking about your patch, the subject says simplify while I
> don't really see any simplification.
> Also it does not mention the UAF bug leading to crashes it fixes,
> missing the Fixes: and CC: stable tags.
> 
> What do we do now?

I think it's up to David if he wants to send the patch for this rc or 
not. In my test environment the part that went upstream was sufficient 
to fix the UAF, so this was the part that actually went to Linus first.

@Dave can you send '34725028ec55 ("btrfs: simplify waiting for encoded 
read endios")' in the next PR? I can update the Fixes tag.
Daniel Vacek Dec. 12, 2024, 10:26 a.m. UTC | #9
On Thu, Dec 12, 2024 at 11:10 AM Johannes Thumshirn
<Johannes.Thumshirn@wdc.com> wrote:
>
> On 12.12.24 10:35, Daniel Vacek wrote:
> > On Thu, Dec 12, 2024 at 10:14 AM Johannes Thumshirn
> > <Johannes.Thumshirn@wdc.com> wrote:
> >> It got recently force pushed, 34725028ec5500018f1cb5bfd55c669c7bbf1346
> >> it is now, sorry.
> >
> > Yeah, this looks very similar and it should fix the bug as well. In
> > fact the fix part looks exactly the same, I just also changed the
> > slab/stack allocation while you changed the atomic/refcount. But these
> > are unrelated, IIUC. I actually planned to split it into two patches
> > but David told me it's not necessary and I should send it as it is.
> >
> > Just nitpicking about your patch, the subject says simplify while I
> > don't really see any simplification.
> > Also it does not mention the UAF bug leading to crashes it fixes,
> > missing the Fixes: and CC: stable tags.
> >
> > What do we do now?
>
> I think it's up to David if he want's to send the patch for this rc or
> not. In my test environment the part that went upstream was sufficient
> to fix the UAF, so this was the part that actually went to Linus first.

But it (I assume you are referring to `05b36b04d74a`) does not really
fix the UAF. I'm still able to get the same crashes even with this
commit applied. That was actually where I originally started testing.

> @Dave can you send '34725028ec55 ("btrfs: simplify waiting for encoded
> read endios")' in the next PR? I can update the Fixes tag.

The commit message definitely needs to be updated mentioning that this
actually fixes the UAF which `05b36b04d74a` does not really address.

--nX
diff mbox series

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fa648ab6fe806..61e0fd5c6a15f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9078,7 +9078,7 @@  static ssize_t btrfs_encoded_read_inline(
 }
 
 struct btrfs_encoded_read_private {
-	wait_queue_head_t wait;
+	struct completion *sync_read;
 	void *uring_ctx;
 	atomic_t pending;
 	blk_status_t status;
@@ -9090,23 +9090,22 @@  static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
 
 	if (bbio->bio.bi_status) {
 		/*
-		 * The memory barrier implied by the atomic_dec_return() here
-		 * pairs with the memory barrier implied by the
-		 * atomic_dec_return() or io_wait_event() in
-		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
-		 * write is observed before the load of status in
-		 * btrfs_encoded_read_regular_fill_pages().
+		 * The memory barrier implied by the
+		 * atomic_dec_and_test() here pairs with the memory
+		 * barrier implied by the atomic_dec_and_test() in
+		 * btrfs_encoded_read_regular_fill_pages() to ensure
+		 * that this write is observed before the load of
+		 * status in btrfs_encoded_read_regular_fill_pages().
 		 */
 		WRITE_ONCE(priv->status, bbio->bio.bi_status);
 	}
 	if (atomic_dec_and_test(&priv->pending)) {
-		int err = blk_status_to_errno(READ_ONCE(priv->status));
-
 		if (priv->uring_ctx) {
+			int err = blk_status_to_errno(READ_ONCE(priv->status));
 			btrfs_uring_read_extent_endio(priv->uring_ctx, err);
 			kfree(priv);
 		} else {
-			wake_up(&priv->wait);
+			complete(priv->sync_read);
 		}
 	}
 	bio_put(&bbio->bio);
@@ -9117,16 +9116,21 @@  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 					  struct page **pages, void *uring_ctx)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct btrfs_encoded_read_private *priv;
+	struct completion sync_read;
+	struct btrfs_encoded_read_private sync_priv, *priv;
 	unsigned long i = 0;
 	struct btrfs_bio *bbio;
-	int ret;
 
-	priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
-	if (!priv)
-		return -ENOMEM;
+	if (uring_ctx) {
+		priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+		if (!priv)
+			return -ENOMEM;
+	} else {
+		priv = &sync_priv;
+		init_completion(&sync_read);
+		priv->sync_read = &sync_read;
+	}
 
-	init_waitqueue_head(&priv->wait);
 	atomic_set(&priv->pending, 1);
 	priv->status = 0;
 	priv->uring_ctx = uring_ctx;
@@ -9158,23 +9162,23 @@  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 	atomic_inc(&priv->pending);
 	btrfs_submit_bbio(bbio, 0);
 
-	if (uring_ctx) {
-		if (atomic_dec_return(&priv->pending) == 0) {
-			ret = blk_status_to_errno(READ_ONCE(priv->status));
-			btrfs_uring_read_extent_endio(uring_ctx, ret);
+	if (atomic_dec_and_test(&priv->pending)) {
+		if (uring_ctx) {
+			int err = blk_status_to_errno(READ_ONCE(priv->status));
+			btrfs_uring_read_extent_endio(uring_ctx, err);
 			kfree(priv);
-			return ret;
+			return err;
+		} else {
+			complete(&sync_read);
 		}
+	}
 
+	if (uring_ctx)
 		return -EIOCBQUEUED;
-	} else {
-		if (atomic_dec_return(&priv->pending) != 0)
-			io_wait_event(priv->wait, !atomic_read(&priv->pending));
-		/* See btrfs_encoded_read_endio() for ordering. */
-		ret = blk_status_to_errno(READ_ONCE(priv->status));
-		kfree(priv);
-		return ret;
-	}
+
+	wait_for_completion_io(&sync_read);
+	/* See btrfs_encoded_read_endio() for ordering. */
+	return blk_status_to_errno(READ_ONCE(priv->status));
 }
 
 ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,