diff mbox series

[RFC] writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs

Message ID 20230406140247.1936541-1-libaokun1@huawei.com (mailing list archive)
State Mainlined, archived
Headers show
Series [RFC] writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs | expand

Commit Message

Baokun Li April 6, 2023, 2:02 p.m. UTC
KASAN report null-ptr-deref:
==================================================================
BUG: KASAN: null-ptr-deref in bdi_split_work_to_wbs+0x6ce/0x6e0
Write of size 8 at addr 0000000000000000 by task syz-executor.3/3514

CPU: 3 PID: 3514 Comm: syz-executor.3 Not tainted 5.10.0-dirty #1
Call Trace:
 dump_stack+0xbe/0xfd
 __kasan_report.cold+0x34/0x84
 kasan_report+0x3a/0x50
 check_memory_region+0xfd/0x1f0
 bdi_split_work_to_wbs+0x6ce/0x6e0
 __writeback_inodes_sb_nr+0x184/0x1f0
 try_to_writeback_inodes_sb+0x7f/0xa0
 ext4_nonda_switch+0x125/0x130
 ext4_da_write_begin+0x126/0x6e0
 generic_perform_write+0x199/0x3a0
 ext4_buffered_write_iter+0x16d/0x2b0
 ext4_file_write_iter+0xea/0x140
 new_sync_write+0x2fa/0x430
 vfs_write+0x4a1/0x570
 ksys_write+0xf6/0x1f0
 do_syscall_64+0x30/0x40
 entry_SYSCALL_64_after_hwframe+0x61/0xc6
RIP: 0033:0x45513d
[...]
==================================================================

Above issue may happen as follows:

            cpu1                        cpu2
----------------------------|----------------------------
ext4_nonda_switch
 try_to_writeback_inodes_sb
  __writeback_inodes_sb_nr
   bdi_split_work_to_wbs
    kmalloc(sizeof(*work), GFP_ATOMIC)  ---> alloc mem failed
                                inode_switch_wbs
                                 inode_switch_wbs_work_fn
                                  wb_put_many
                                   percpu_ref_put_many
                                    ref->data->release(ref)
                                     cgwb_release
                                      &wb->release_work
                                       cgwb_release_workfn
                                        percpu_ref_exit
                                         ref->data = NULL
                                         kfree(data)
    wb_get(wb)
     percpu_ref_get(&wb->refcnt)
      percpu_ref_get_many(ref, 1)
       atomic_long_add(nr, &ref->data->count) ---> ref->data = NULL
        atomic64_add(i, v) ---> trigger null-ptr-deref

bdi_split_work_to_wbs() traverses &bdi->wb_list to split work into all wbs.
If the allocation of new work fails, the on-stack fallback will be used and
the reference count of the current wb is increased afterwards. If cgroup
writeback membership switches occur before getting the reference count and
the current wb is released as old_wd, then calling wb_get() or wb_put()
will trigger the null pointer dereference above.

A similar problem is fixed in commit 7fc5854f8c6e ("writeback: synchronize
sync(2) against cgroup writeback membership switches"), but the patch only
adds locks to sync_inodes_sb() and not to the __writeback_inodes_sb_nr()
function that also calls bdi_split_work_to_wbs() function. So avoid the
above race by adding the same lock to __writeback_inodes_sb_nr() and
expanding the range of wb_switch_rwsem held in inode_switch_wbs_work_fn().

Fixes: b817525a4a80 ("writeback: bdi_writeback iteration must not skip dying ones")
Cc: stable@vger.kernel.org
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 fs/fs-writeback.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

Comments

Baokun Li April 8, 2023, 3:19 a.m. UTC | #1
Sorry for the noise, my understanding of this issue is wrong.

Please ignore this patch.

I will send a v2 patch.


On 2023/4/6 22:02, Baokun Li wrote:
> KASAN report null-ptr-deref:
> ==================================================================
> BUG: KASAN: null-ptr-deref in bdi_split_work_to_wbs+0x6ce/0x6e0
> Write of size 8 at addr 0000000000000000 by task syz-executor.3/3514
>
> CPU: 3 PID: 3514 Comm: syz-executor.3 Not tainted 5.10.0-dirty #1
> Call Trace:
>   dump_stack+0xbe/0xfd
>   __kasan_report.cold+0x34/0x84
>   kasan_report+0x3a/0x50
>   check_memory_region+0xfd/0x1f0
>   bdi_split_work_to_wbs+0x6ce/0x6e0
>   __writeback_inodes_sb_nr+0x184/0x1f0
>   try_to_writeback_inodes_sb+0x7f/0xa0
>   ext4_nonda_switch+0x125/0x130
>   ext4_da_write_begin+0x126/0x6e0
>   generic_perform_write+0x199/0x3a0
>   ext4_buffered_write_iter+0x16d/0x2b0
>   ext4_file_write_iter+0xea/0x140
>   new_sync_write+0x2fa/0x430
>   vfs_write+0x4a1/0x570
>   ksys_write+0xf6/0x1f0
>   do_syscall_64+0x30/0x40
>   entry_SYSCALL_64_after_hwframe+0x61/0xc6
> RIP: 0033:0x45513d
> [...]
> ==================================================================
>
> Above issue may happen as follows:
>
>              cpu1                        cpu2
> ----------------------------|----------------------------
> ext4_nonda_switch
>   try_to_writeback_inodes_sb
>    __writeback_inodes_sb_nr
>     bdi_split_work_to_wbs
>      kmalloc(sizeof(*work), GFP_ATOMIC)  ---> alloc mem failed
>                                  inode_switch_wbs
>                                   inode_switch_wbs_work_fn
>                                    wb_put_many
>                                     percpu_ref_put_many
>                                      ref->data->release(ref)
>                                       cgwb_release
>                                        &wb->release_work
>                                         cgwb_release_workfn
>                                          percpu_ref_exit
>                                           ref->data = NULL
>                                           kfree(data)
>      wb_get(wb)
>       percpu_ref_get(&wb->refcnt)
>        percpu_ref_get_many(ref, 1)
>         atomic_long_add(nr, &ref->data->count) ---> ref->data = NULL
>          atomic64_add(i, v) ---> trigger null-ptr-deref
>
> bdi_split_work_to_wbs() traverses &bdi->wb_list to split work into all wbs.
> If the allocation of new work fails, the on-stack fallback will be used and
> the reference count of the current wb is increased afterwards. If cgroup
> writeback membership switches occur before getting the reference count and
> the current wb is released as old_wd, then calling wb_get() or wb_put()
> will trigger the null pointer dereference above.
>
> A similar problem is fixed in commit 7fc5854f8c6e ("writeback: synchronize
> sync(2) against cgroup writeback membership switches"), but the patch only
> adds locks to sync_inodes_sb() and not to the __writeback_inodes_sb_nr()
> function that also calls bdi_split_work_to_wbs() function. So avoid the
> above race by adding the same lock to __writeback_inodes_sb_nr() and
> expanding the range of wb_switch_rwsem held in inode_switch_wbs_work_fn().
>
> Fixes: b817525a4a80 ("writeback: bdi_writeback iteration must not skip dying ones")
> Cc: stable@vger.kernel.org
> Signed-off-by: Baokun Li <libaokun1@huawei.com>
> ---
>   fs/fs-writeback.c | 12 ++++++++++--
>   1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 195dc23e0d83..52825aaf549b 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -506,13 +506,13 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
>   	spin_unlock(&new_wb->list_lock);
>   	spin_unlock(&old_wb->list_lock);
>   
> -	up_read(&bdi->wb_switch_rwsem);
> -
>   	if (nr_switched) {
>   		wb_wakeup(new_wb);
>   		wb_put_many(old_wb, nr_switched);
>   	}
>   
> +	up_read(&bdi->wb_switch_rwsem);
> +
>   	for (inodep = isw->inodes; *inodep; inodep++)
>   		iput(*inodep);
>   	wb_put(new_wb);
> @@ -936,6 +936,11 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
>    * have dirty inodes.  If @base_work->nr_page isn't %LONG_MAX, it's
>    * distributed to the busy wbs according to each wb's proportion in the
>    * total active write bandwidth of @bdi.
> + *
> + * Called under &bdi->wb_switch_rwsem, otherwise bdi_split_work_to_wbs()
> + * may race against cgwb (cgroup writeback) membership switches, which may
> + * cause some inodes to fail to write back, or even trigger a null pointer
> + * dereference using a freed wb.
>    */
>   static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
>   				  struct wb_writeback_work *base_work,
> @@ -2637,8 +2642,11 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
>   		return;
>   	WARN_ON(!rwsem_is_locked(&sb->s_umount));
>   
> +	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
> +	bdi_down_write_wb_switch_rwsem(bdi);
>   	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
>   	wb_wait_for_completion(&done);
> +	bdi_up_write_wb_switch_rwsem(bdi);
>   }
>   
>   /**
diff mbox series

Patch

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 195dc23e0d83..52825aaf549b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -506,13 +506,13 @@  static void inode_switch_wbs_work_fn(struct work_struct *work)
 	spin_unlock(&new_wb->list_lock);
 	spin_unlock(&old_wb->list_lock);
 
-	up_read(&bdi->wb_switch_rwsem);
-
 	if (nr_switched) {
 		wb_wakeup(new_wb);
 		wb_put_many(old_wb, nr_switched);
 	}
 
+	up_read(&bdi->wb_switch_rwsem);
+
 	for (inodep = isw->inodes; *inodep; inodep++)
 		iput(*inodep);
 	wb_put(new_wb);
@@ -936,6 +936,11 @@  static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
  * have dirty inodes.  If @base_work->nr_page isn't %LONG_MAX, it's
  * distributed to the busy wbs according to each wb's proportion in the
  * total active write bandwidth of @bdi.
+ *
+ * Called under &bdi->wb_switch_rwsem, otherwise bdi_split_work_to_wbs()
+ * may race against cgwb (cgroup writeback) membership switches, which may
+ * cause some inodes to fail to write back, or even trigger a null pointer
+ * dereference using a freed wb.
  */
 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 				  struct wb_writeback_work *base_work,
@@ -2637,8 +2642,11 @@  static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
+	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
+	bdi_down_write_wb_switch_rwsem(bdi);
 	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
 	wb_wait_for_completion(&done);
+	bdi_up_write_wb_switch_rwsem(bdi);
 }
 
 /**