diff mbox series

brd: fix null pointer when modprobe brd

Message ID 20241025070511.932879-1-yangerkun@huaweicloud.com (mailing list archive)
State New, archived
Headers show
Series brd: fix null pointer when modprobe brd | expand

Commit Message

yangerkun Oct. 25, 2024, 7:05 a.m. UTC
From: Yang Erkun <yangerkun@huawei.com>

My colleague Wupeng found the following problems during fault injection:

BUG: unable to handle page fault for address: fffffbfff809d073
PGD 6e648067 P4D 123ec8067 PUD 123ec4067 PMD 100e38067 PTE 0
Oops: Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI
CPU: 5 UID: 0 PID: 755 Comm: modprobe Not tainted 6.12.0-rc3+ #17
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.16.1-2.fc37 04/01/2014
RIP: 0010:__asan_load8+0x4c/0xa0
...
Call Trace:
 <TASK>
 blkdev_put_whole+0x41/0x70
 bdev_release+0x1a3/0x250
 blkdev_release+0x11/0x20
 __fput+0x1d7/0x4a0
 task_work_run+0xfc/0x180
 syscall_exit_to_user_mode+0x1de/0x1f0
 do_syscall_64+0x6b/0x170
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

The error path of modprobe will ignores the refcnt for the module, and
directly releases everything about this all associated resources. For
the brd module, it can be brd_fops. The brd_init function attempts to
create 16 ramdisks; each time add_disk is called, a file for the block
device is created to help do partition scanning and remains alive util
we return to userspace(Also we can open it with another user thread).
Let's say brd_init has successfully create the first ramdisk, but fail
to create the second, the error handling path will release code segment.
Consequently, bdev_release for file of first ramdisk will trigger null
pointer since brd_fops has been removed.

To resolve issue, implement a solution similar to loop_init, and
reintroduce brd_devices_mutex to help serialize modifications to
brd_list.

Fixes: 7f9b348cb5e9 ("brd: convert to blk_alloc_disk/blk_cleanup_disk")
Reported-by: Wupeng Ma <mawupeng1@huawei.com>
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
 drivers/block/brd.c | 70 ++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 39 deletions(-)

Comments

Yu Kuai Oct. 25, 2024, 7:59 a.m. UTC | #1
Hi,

在 2024/10/25 15:05, Yang Erkun 写道:
> From: Yang Erkun <yangerkun@huawei.com>
> 
> My colleague Wupeng found the following problems during fault injection:
> 
> BUG: unable to handle page fault for address: fffffbfff809d073
> PGD 6e648067 P4D 123ec8067 PUD 123ec4067 PMD 100e38067 PTE 0
> Oops: Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI
> CPU: 5 UID: 0 PID: 755 Comm: modprobe Not tainted 6.12.0-rc3+ #17
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> 1.16.1-2.fc37 04/01/2014
> RIP: 0010:__asan_load8+0x4c/0xa0
> ...
> Call Trace:
>   <TASK>
>   blkdev_put_whole+0x41/0x70
>   bdev_release+0x1a3/0x250
>   blkdev_release+0x11/0x20
>   __fput+0x1d7/0x4a0
>   task_work_run+0xfc/0x180
>   syscall_exit_to_user_mode+0x1de/0x1f0
>   do_syscall_64+0x6b/0x170
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
> The error path of modprobe will ignores the refcnt for the module, and
> directly releases everything about this all associated resources. For
> the brd module, it can be brd_fops. The brd_init function attempts to
> create 16 ramdisks; each time add_disk is called, a file for the block
> device is created to help do partition scanning and remains alive util
> we return to userspace(Also we can open it with another user thread).
> Let's say brd_init has successfully create the first ramdisk, but fail
> to create the second, the error handling path will release code segment.
> Consequently, bdev_release for file of first ramdisk will trigger null
> pointer since brd_fops has been removed.
> 
> To resolve issue, implement a solution similar to loop_init, and
> reintroduce brd_devices_mutex to help serialize modifications to
> brd_list.
> 

This patch looks good to me, just some nits below:

> Fixes: 7f9b348cb5e9 ("brd: convert to blk_alloc_disk/blk_cleanup_disk")
> Reported-by: Wupeng Ma <mawupeng1@huawei.com>
> Signed-off-by: Yang Erkun <yangerkun@huawei.com>
> ---
>   drivers/block/brd.c | 70 ++++++++++++++++++++-------------------------
>   1 file changed, 31 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/block/brd.c b/drivers/block/brd.c
> index 2fd1ed101748..f86dd0795d3c 100644
> --- a/drivers/block/brd.c
> +++ b/drivers/block/brd.c
> @@ -316,6 +316,7 @@ __setup("ramdisk_size=", ramdisk_size);
>    * (should share code eventually).
>    */
>   static LIST_HEAD(brd_devices);
> +static DEFINE_MUTEX(brd_devices_mutex);
>   static struct dentry *brd_debugfs_dir;
>   
>   static int brd_alloc(int i)
> @@ -340,14 +341,21 @@ static int brd_alloc(int i)
>   					  BLK_FEAT_NOWAIT,
>   	};
>   
> -	list_for_each_entry(brd, &brd_devices, brd_list)
> -		if (brd->brd_number == i)
> +	mutex_lock(&brd_devices_mutex);
> +	list_for_each_entry(brd, &brd_devices, brd_list) {
> +		if (brd->brd_number == i) {
> +			mutex_unlock(&brd_devices_mutex);
>   			return -EEXIST;
> +		}
> +	}
>   	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
> -	if (!brd)
> +	if (!brd) {
> +		mutex_unlock(&brd_devices_mutex);
>   		return -ENOMEM;
> +	}
>   	brd->brd_number		= i;
>   	list_add_tail(&brd->brd_list, &brd_devices);
> +	mutex_unlock(&brd_devices_mutex);
>   
>   	xa_init(&brd->brd_pages);

The global mutex is used to protect brd_alloc() from module init and
brd_alloc() from creating on open? If so, I think just use the lock in
the caller is cleaner.

>   
> @@ -378,7 +386,9 @@ static int brd_alloc(int i)
>   out_cleanup_disk:
>   	put_disk(disk);
>   out_free_dev:
> +	mutex_lock(&brd_devices_mutex);
>   	list_del(&brd->brd_list);
> +	mutex_unlock(&brd_devices_mutex);
>   	kfree(brd);
>   	return err;
>   }
> @@ -388,21 +398,6 @@ static void brd_probe(dev_t dev)
>   	brd_alloc(MINOR(dev) / max_part);
>   }
>   
> -static void brd_cleanup(void)
> -{
> -	struct brd_device *brd, *next;
> -
> -	debugfs_remove_recursive(brd_debugfs_dir);
> -
> -	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
> -		del_gendisk(brd->brd_disk);
> -		put_disk(brd->brd_disk);
> -		brd_free_pages(brd);
> -		list_del(&brd->brd_list);
> -		kfree(brd);
> -	}
> -}
> -
>   static inline void brd_check_and_reset_par(void)
>   {
>   	if (unlikely(!max_part))
> @@ -424,17 +419,7 @@ static inline void brd_check_and_reset_par(void)
>   
>   static int __init brd_init(void)
>   {
> -	int err, i;
> -
> -	brd_check_and_reset_par();
> -
> -	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
> -
> -	for (i = 0; i < rd_nr; i++) {
> -		err = brd_alloc(i);
> -		if (err)
> -			goto out_free;
> -	}
> +	int i;
>   
>   	/*
>   	 * brd module now has a feature to instantiate underlying device
> @@ -450,28 +435,35 @@ static int __init brd_init(void)
>   	 *	If (X / max_part) was not already created it will be created
>   	 *	dynamically.
>   	 */
> +	brd_check_and_reset_par();
> +	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
>   
>   	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
> -		err = -EIO;
> -		goto out_free;
> +		pr_info("brd: module NOT loaded !!!\n");
> +		debugfs_remove_recursive(brd_debugfs_dir);
> +		return -EIO;
>   	}
>   
> +	for (i = 0; i < rd_nr; i++)
> +		brd_alloc(i);

Now that brd_alloc() can fail while loding the module will succed,
please also add err log if brd_alloc() failed.
> +
>   	pr_info("brd: module loaded\n");
>   	return 0;
> -
> -out_free:
> -	brd_cleanup();
> -
> -	pr_info("brd: module NOT loaded !!!\n");
> -	return err;
>   }
>   
>   static void __exit brd_exit(void)
>   {
> +	struct brd_device *brd, *next;
>   
>   	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
> -	brd_cleanup();
> -
> +	debugfs_remove_recursive(brd_debugfs_dir);
> +	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
> +		del_gendisk(brd->brd_disk);
> +		put_disk(brd->brd_disk);
> +		brd_free_pages(brd);
> +		list_del(&brd->brd_list);
> +		kfree(brd);
> +	}

Why the global mutex is not used here? It took a while for me to figure
out that this is safe, because unregister_blkdev() make sure no new
brd_alloc() can be called from open path, and in progress open will be
synchronized with del_gendisk().

However, please add some comments or just hold the global mutex.

Thanks,
Kuai

>   	pr_info("brd: module unloaded\n");
>   }
>   
>
Tetsuo Handa Oct. 25, 2024, 10:40 a.m. UTC | #2
On 2024/10/25 16:05, Yang Erkun wrote:
> From: Yang Erkun <yangerkun@huawei.com>
> 
> My colleague Wupeng found the following problems during fault injection:
> 
> BUG: unable to handle page fault for address: fffffbfff809d073

Excuse me, but subject says "null pointer" whereas dmesg says
"not a null pointer dereference". Is this a use-after-free bug?
Also, what verb comes after "when modprobe brd" ?

Is this problem happening with parallel execution? If yes, parallelly
running what and what?

Is this problem happening with what fault injection?
What function (exact location in source code with call trace) has
failed due to fault injection?

> Call Trace:
>  <TASK>
>  blkdev_put_whole+0x41/0x70
>  bdev_release+0x1a3/0x250
>  blkdev_release+0x11/0x20
>  __fput+0x1d7/0x4a0
>  task_work_run+0xfc/0x180
>  syscall_exit_to_user_mode+0x1de/0x1f0
>  do_syscall_64+0x6b/0x170
>  entry_SYSCALL_64_after_hwframe+0x76/0x7e

This suggests that a userspace process has open()ed the device
before brd_init() from modprobe completed?

Please show more context including execution flow until crash.

  CPU0: (or Process1)    CPU1: (or Process2)
    does what?
                           does what?
    does what?
                           does what and is wrong?

Also, you don't need to embed brd_cleanup() into the caller
just because the caller becomes 1 by this change.
Yu Kuai Oct. 26, 2024, 1:21 a.m. UTC | #3
Hi,

在 2024/10/25 18:40, Tetsuo Handa 写道:
> On 2024/10/25 16:05, Yang Erkun wrote:
>> From: Yang Erkun <yangerkun@huawei.com>
>>
>> My colleague Wupeng found the following problems during fault injection:
>>
>> BUG: unable to handle page fault for address: fffffbfff809d073
> 
> Excuse me, but subject says "null pointer" whereas dmesg says
> "not a null pointer dereference". Is this a use-after-free bug?
> Also, what verb comes after "when modprobe brd" ?
> 
> Is this problem happening with parallel execution? If yes, parallelly
> running what and what?

The problem is straightforward, to be short,

T1: morprobe brd
brd_init
  brd_alloc
   add_disk
		T2: open brd
		bdev_open
		 try_module_get
   // err path
   brd_cleanup
  		 // dereference brd_fops() while module is freed.

Thanks,
Kuai

> 
> Is this problem happening with what fault injection?
> What function (exact location in source code with call trace) has
> failed due to fault injection?
> 
>> Call Trace:
>>   <TASK>
>>   blkdev_put_whole+0x41/0x70
>>   bdev_release+0x1a3/0x250
>>   blkdev_release+0x11/0x20
>>   __fput+0x1d7/0x4a0
>>   task_work_run+0xfc/0x180
>>   syscall_exit_to_user_mode+0x1de/0x1f0
>>   do_syscall_64+0x6b/0x170
>>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
> This suggests that a userspace process has open()ed the device
> before brd_init() from modprobe completed?
> 
> Please show more context including execution flow until crash.
> 
>    CPU0: (or Process1)    CPU1: (or Process2)
>      does what?
>                             does what?
>      does what?
>                             does what and is wrong?
> 
> Also, you don't need to embed brd_cleanup() into the caller
> just because the caller becomes 1 by this change.
> 
> .
>
Tetsuo Handa Oct. 26, 2024, 5:55 a.m. UTC | #4
On 2024/10/26 10:21, Yu Kuai wrote:
> Hi,
> 
> 在 2024/10/25 18:40, Tetsuo Handa 写道:
>> On 2024/10/25 16:05, Yang Erkun wrote:
>>> From: Yang Erkun <yangerkun@huawei.com>
>>>
>>> My colleague Wupeng found the following problems during fault injection:
>>>
>>> BUG: unable to handle page fault for address: fffffbfff809d073
>>
>> Excuse me, but subject says "null pointer" whereas dmesg says
>> "not a null pointer dereference". Is this a use-after-free bug?
>> Also, what verb comes after "when modprobe brd" ?
>>
>> Is this problem happening with parallel execution? If yes, parallelly
>> running what and what?
> 
> The problem is straightforward, to be short,
> 
> T1: morprobe brd
> brd_init
>  brd_alloc
>   add_disk
>         T2: open brd
>         bdev_open
>          try_module_get
>   // err path
>   brd_cleanup
>           // dereference brd_fops() while module is freed.

Then, fault injection is irrelevant, isn't it?

If bdev_open() can grab a reference before module's initialization phase
completes is a problem, I think that we can fix the problem with just

 int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
 	      const struct blk_holder_ops *hops, struct file *bdev_file)
 {
 (...snipped...)
 	ret = -ENXIO;
 	if (!disk_live(disk))
 		goto abort_claiming;
-	if (!try_module_get(disk->fops->owner))
+	if ((disk->fops->owner && module_is_coming(disk->fops->owner)) || !try_module_get(disk->fops->owner))
 		goto abort_claiming;
 	ret = -EBUSY;
 	if (!bdev_may_open(bdev, mode))
 (...snipped...)
 }

change. It would be cleaner if we can do

 bool try_module_get(struct module *module)
 {
 	bool ret = true;
 
 	if (module) {
 		/* Note: here, we can fail to get a reference */
-		if (likely(module_is_live(module) &&
+		if (likely(module_is_live(module) && !module_is_coming(module) &&
 			   atomic_inc_not_zero(&module->refcnt) != 0))
 			trace_module_get(module, _RET_IP_);
 		else
 			ret = false;
 	}
 	return ret;
 }

but I don't know if this change breaks something.
Maybe we can introduce a variant like

bool try_module_get_safe(struct module *module)
{
	bool ret = true;

	if (module) {
		/* Note: here, we can fail to get a reference */
		if (likely(module_is_live(module) && !module_is_coming(module) &&
			   atomic_inc_not_zero(&module->refcnt) != 0))
			trace_module_get(module, _RET_IP_);
		else
			ret = false;
	}
	return ret;
}

and use it from bdev_open().
Yu Kuai Oct. 26, 2024, 6:28 a.m. UTC | #5
Hi,

在 2024/10/26 13:55, Tetsuo Handa 写道:
> On 2024/10/26 10:21, Yu Kuai wrote:
>> Hi,
>>
>> 在 2024/10/25 18:40, Tetsuo Handa 写道:
>>> On 2024/10/25 16:05, Yang Erkun wrote:
>>>> From: Yang Erkun <yangerkun@huawei.com>
>>>>
>>>> My colleague Wupeng found the following problems during fault injection:
>>>>
>>>> BUG: unable to handle page fault for address: fffffbfff809d073
>>>
>>> Excuse me, but subject says "null pointer" whereas dmesg says
>>> "not a null pointer dereference". Is this a use-after-free bug?
>>> Also, what verb comes after "when modprobe brd" ?
>>>
>>> Is this problem happening with parallel execution? If yes, parallelly
>>> running what and what?
>>
>> The problem is straightforward, to be short,
>>
>> T1: morprobe brd
>> brd_init
>>   brd_alloc
>>    add_disk
>>          T2: open brd
>>          bdev_open
>>           try_module_get
>>    // err path
>>    brd_cleanup
>>            // dereference brd_fops() while module is freed.
> 
> Then, fault injection is irrelevant, isn't it?

Fault injection must involved in the test, brd_init() is unlikely to
fail.
> 
> If bdev_open() can grab a reference before module's initialization phase
> completes is a problem, I think that we can fix the problem with just

Yes, and root cause is that stuff inside module can be freed if module
initialization failed, it's not safe to deference disk->fops in this
case.
> 
>   int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
>   	      const struct blk_holder_ops *hops, struct file *bdev_file)
>   {
>   (...snipped...)
>   	ret = -ENXIO;
>   	if (!disk_live(disk))
>   		goto abort_claiming;
> -	if (!try_module_get(disk->fops->owner))
> +	if ((disk->fops->owner && module_is_coming(disk->fops->owner)) || !try_module_get(disk->fops->owner))

This is not a common issue, do you see similiar behaviour for other
drivers? If not, I'll suggest to fix this in brd.

Thanks,
Kuai
>   		goto abort_claiming;
>   	ret = -EBUSY;
>   	if (!bdev_may_open(bdev, mode))
>   (...snipped...)
>   }
> 
> change. It would be cleaner if we can do
> 
>   bool try_module_get(struct module *module)
>   {
>   	bool ret = true;
>   
>   	if (module) {
>   		/* Note: here, we can fail to get a reference */
> -		if (likely(module_is_live(module) &&
> +		if (likely(module_is_live(module) && !module_is_coming(module) &&
>   			   atomic_inc_not_zero(&module->refcnt) != 0))
>   			trace_module_get(module, _RET_IP_);
>   		else
>   			ret = false;
>   	}
>   	return ret;
>   }
> 
> but I don't know if this change breaks something.
> Maybe we can introduce a variant like
> 
> bool try_module_get_safe(struct module *module)
> {
> 	bool ret = true;
> 
> 	if (module) {
> 		/* Note: here, we can fail to get a reference */
> 		if (likely(module_is_live(module) && !module_is_coming(module) &&
> 			   atomic_inc_not_zero(&module->refcnt) != 0))
> 			trace_module_get(module, _RET_IP_);
> 		else
> 			ret = false;
> 	}
> 	return ret;
> }
> 
> and use it from bdev_open().
> 
> 
> .
>
Tetsuo Handa Oct. 26, 2024, 8:06 a.m. UTC | #6
On 2024/10/26 15:28, Yu Kuai wrote:
> Hi,
> 
> 在 2024/10/26 13:55, Tetsuo Handa 写道:
>> On 2024/10/26 10:21, Yu Kuai wrote:
>>> Hi,
>>>
>>> 在 2024/10/25 18:40, Tetsuo Handa 写道:
>>>> On 2024/10/25 16:05, Yang Erkun wrote:
>>>>> From: Yang Erkun <yangerkun@huawei.com>
>>>>>
>>>>> My colleague Wupeng found the following problems during fault injection:
>>>>>
>>>>> BUG: unable to handle page fault for address: fffffbfff809d073
>>>>
>>>> Excuse me, but subject says "null pointer" whereas dmesg says
>>>> "not a null pointer dereference". Is this a use-after-free bug?
>>>> Also, what verb comes after "when modprobe brd" ?
>>>>
>>>> Is this problem happening with parallel execution? If yes, parallelly
>>>> running what and what?
>>>
>>> The problem is straightforward, to be short,
>>>
>>> T1: morprobe brd
>>> brd_init
>>>   brd_alloc
>>>    add_disk
>>>          T2: open brd
>>>          bdev_open
>>>           try_module_get
>>>    // err path
>>>    brd_cleanup
>>>            // dereference brd_fops() while module is freed.
>>
>> Then, fault injection is irrelevant, isn't it?
> 
> Fault injection must involved in the test, brd_init() is unlikely to
> fail.
>>
>> If bdev_open() can grab a reference before module's initialization phase
>> completes is a problem, I think that we can fix the problem with just
> 
> Yes, and root cause is that stuff inside module can be freed if module
> initialization failed, it's not safe to deference disk->fops in this
> case.

Too bad. Then, we have to defer disk_alloc() until module initialization phase
is guaranteed to return success like loop.c does. Please update patch title and
description to something like below.

Subject: brd: defer automatic disk creation until module initialization succeeds

loop_init() is calling loop_add() after __register_blkdev() succeeds and is
ignoring disk_add() failure from loop_add(), for loop_add() failure is not
fatal and successfully created disks are already visible to bdev_open().

brd_init() is currently calling brd_alloc() before __register_blkdev()
succeeds and is releasing successfully created disks when brd_init()
returns an error. This can cause UAF when brd_init() failure raced with
bdev_open(), for successfully created disks are already visible to
bdev_open(). Fix this problem by following what loop_init() does.
yangerkun Oct. 28, 2024, 2:10 a.m. UTC | #7
在 2024/10/25 15:59, Yu Kuai 写道:
> Hi,
> 
> 在 2024/10/25 15:05, Yang Erkun 写道:
>> From: Yang Erkun <yangerkun@huawei.com>
>>
>> My colleague Wupeng found the following problems during fault injection:
>>
>> BUG: unable to handle page fault for address: fffffbfff809d073
>> PGD 6e648067 P4D 123ec8067 PUD 123ec4067 PMD 100e38067 PTE 0
>> Oops: Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI
>> CPU: 5 UID: 0 PID: 755 Comm: modprobe Not tainted 6.12.0-rc3+ #17
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
>> 1.16.1-2.fc37 04/01/2014
>> RIP: 0010:__asan_load8+0x4c/0xa0
>> ...
>> Call Trace:
>>   <TASK>
>>   blkdev_put_whole+0x41/0x70
>>   bdev_release+0x1a3/0x250
>>   blkdev_release+0x11/0x20
>>   __fput+0x1d7/0x4a0
>>   task_work_run+0xfc/0x180
>>   syscall_exit_to_user_mode+0x1de/0x1f0
>>   do_syscall_64+0x6b/0x170
>>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>>
>> The error path of modprobe will ignores the refcnt for the module, and
>> directly releases everything about this all associated resources. For
>> the brd module, it can be brd_fops. The brd_init function attempts to
>> create 16 ramdisks; each time add_disk is called, a file for the block
>> device is created to help do partition scanning and remains alive util
>> we return to userspace(Also we can open it with another user thread).
>> Let's say brd_init has successfully create the first ramdisk, but fail
>> to create the second, the error handling path will release code segment.
>> Consequently, bdev_release for file of first ramdisk will trigger null
>> pointer since brd_fops has been removed.
>>
>> To resolve issue, implement a solution similar to loop_init, and
>> reintroduce brd_devices_mutex to help serialize modifications to
>> brd_list.
>>
> 
> This patch looks good to me, just some nits below:
> 
>> Fixes: 7f9b348cb5e9 ("brd: convert to blk_alloc_disk/blk_cleanup_disk")
>> Reported-by: Wupeng Ma <mawupeng1@huawei.com>
>> Signed-off-by: Yang Erkun <yangerkun@huawei.com>
>> ---
>>   drivers/block/brd.c | 70 ++++++++++++++++++++-------------------------
>>   1 file changed, 31 insertions(+), 39 deletions(-)
>>
>> diff --git a/drivers/block/brd.c b/drivers/block/brd.c
>> index 2fd1ed101748..f86dd0795d3c 100644
>> --- a/drivers/block/brd.c
>> +++ b/drivers/block/brd.c
>> @@ -316,6 +316,7 @@ __setup("ramdisk_size=", ramdisk_size);
>>    * (should share code eventually).
>>    */
>>   static LIST_HEAD(brd_devices);
>> +static DEFINE_MUTEX(brd_devices_mutex);
>>   static struct dentry *brd_debugfs_dir;
>>   static int brd_alloc(int i)
>> @@ -340,14 +341,21 @@ static int brd_alloc(int i)
>>                         BLK_FEAT_NOWAIT,
>>       };
>> -    list_for_each_entry(brd, &brd_devices, brd_list)
>> -        if (brd->brd_number == i)
>> +    mutex_lock(&brd_devices_mutex);
>> +    list_for_each_entry(brd, &brd_devices, brd_list) {
>> +        if (brd->brd_number == i) {
>> +            mutex_unlock(&brd_devices_mutex);
>>               return -EEXIST;
>> +        }
>> +    }
>>       brd = kzalloc(sizeof(*brd), GFP_KERNEL);
>> -    if (!brd)
>> +    if (!brd) {
>> +        mutex_unlock(&brd_devices_mutex);
>>           return -ENOMEM;
>> +    }
>>       brd->brd_number        = i;
>>       list_add_tail(&brd->brd_list, &brd_devices);
>> +    mutex_unlock(&brd_devices_mutex);
>>       xa_init(&brd->brd_pages);
> 
> The global mutex is used to protect brd_alloc() from module init and
> brd_alloc() from creating on open? If so, I think just use the lock in
> the caller is cleaner.

Yes, it appears cleaner. However, is it a little too long to hold the
mutex lock, considering that brd_alloc in brd_init can occur
concurrently with brd_prbe? See f7bf35862477 ("brd: reduce the
brd_devices_mutex scope")

> 
>> @@ -378,7 +386,9 @@ static int brd_alloc(int i)
>>   out_cleanup_disk:
>>       put_disk(disk);
>>   out_free_dev:
>> +    mutex_lock(&brd_devices_mutex);
>>       list_del(&brd->brd_list);
>> +    mutex_unlock(&brd_devices_mutex);
>>       kfree(brd);
>>       return err;
>>   }
>> @@ -388,21 +398,6 @@ static void brd_probe(dev_t dev)
>>       brd_alloc(MINOR(dev) / max_part);
>>   }
>> -static void brd_cleanup(void)
>> -{
>> -    struct brd_device *brd, *next;
>> -
>> -    debugfs_remove_recursive(brd_debugfs_dir);
>> -
>> -    list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
>> -        del_gendisk(brd->brd_disk);
>> -        put_disk(brd->brd_disk);
>> -        brd_free_pages(brd);
>> -        list_del(&brd->brd_list);
>> -        kfree(brd);
>> -    }
>> -}
>> -
>>   static inline void brd_check_and_reset_par(void)
>>   {
>>       if (unlikely(!max_part))
>> @@ -424,17 +419,7 @@ static inline void brd_check_and_reset_par(void)
>>   static int __init brd_init(void)
>>   {
>> -    int err, i;
>> -
>> -    brd_check_and_reset_par();
>> -
>> -    brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
>> -
>> -    for (i = 0; i < rd_nr; i++) {
>> -        err = brd_alloc(i);
>> -        if (err)
>> -            goto out_free;
>> -    }
>> +    int i;
>>       /*
>>        * brd module now has a feature to instantiate underlying device
>> @@ -450,28 +435,35 @@ static int __init brd_init(void)
>>        *    If (X / max_part) was not already created it will be created
>>        *    dynamically.
>>        */
>> +    brd_check_and_reset_par();
>> +    brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
>>       if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
>> -        err = -EIO;
>> -        goto out_free;
>> +        pr_info("brd: module NOT loaded !!!\n");
>> +        debugfs_remove_recursive(brd_debugfs_dir);
>> +        return -EIO;
>>       }
>> +    for (i = 0; i < rd_nr; i++)
>> +        brd_alloc(i);
> 
> Now that brd_alloc() can fail while loding the module will succed,
> please also add err log if brd_alloc() failed.

OK

>> +
>>       pr_info("brd: module loaded\n");
>>       return 0;
>> -
>> -out_free:
>> -    brd_cleanup();
>> -
>> -    pr_info("brd: module NOT loaded !!!\n");
>> -    return err;
>>   }
>>   static void __exit brd_exit(void)
>>   {
>> +    struct brd_device *brd, *next;
>>       unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
>> -    brd_cleanup();
>> -
>> +    debugfs_remove_recursive(brd_debugfs_dir);
>> +    list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
>> +        del_gendisk(brd->brd_disk);
>> +        put_disk(brd->brd_disk);
>> +        brd_free_pages(brd);
>> +        list_del(&brd->brd_list);
>> +        kfree(brd);
>> +    }
> 
> Why the global mutex is not used here? It took a while for me to figure
> out that this is safe, because unregister_blkdev() make sure no new
> brd_alloc() can be called from open path, and in progress open will be
> synchronized with del_gendisk().
Yes

> 
> However, please add some comments or just hold the global mutex.

OK

> 
> Thanks,
> Kuai
> 
>>       pr_info("brd: module unloaded\n");
>>   }
>>
yangerkun Oct. 28, 2024, 2:15 a.m. UTC | #8
在 2024/10/26 16:06, Tetsuo Handa 写道:
> On 2024/10/26 15:28, Yu Kuai wrote:
>> Hi,
>>
>> 在 2024/10/26 13:55, Tetsuo Handa 写道:
>>> On 2024/10/26 10:21, Yu Kuai wrote:
>>>> Hi,
>>>>
>>>> 在 2024/10/25 18:40, Tetsuo Handa 写道:
>>>>> On 2024/10/25 16:05, Yang Erkun wrote:
>>>>>> From: Yang Erkun <yangerkun@huawei.com>
>>>>>>
>>>>>> My colleague Wupeng found the following problems during fault injection:
>>>>>>
>>>>>> BUG: unable to handle page fault for address: fffffbfff809d073
>>>>>
>>>>> Excuse me, but subject says "null pointer" whereas dmesg says
>>>>> "not a null pointer dereference". Is this a use-after-free bug?
>>>>> Also, what verb comes after "when modprobe brd" ?
>>>>>
>>>>> Is this problem happening with parallel execution? If yes, parallelly
>>>>> running what and what?
>>>>
>>>> The problem is straightforward, to be short,
>>>>
>>>> T1: morprobe brd
>>>> brd_init
>>>>    brd_alloc
>>>>     add_disk
>>>>           T2: open brd
>>>>           bdev_open
>>>>            try_module_get
>>>>     // err path
>>>>     brd_cleanup
>>>>             // dereference brd_fops() while module is freed.
>>>
>>> Then, fault injection is irrelevant, isn't it?
>>
>> Fault injection must involved in the test, brd_init() is unlikely to
>> fail.
>>>
>>> If bdev_open() can grab a reference before module's initialization phase
>>> completes is a problem, I think that we can fix the problem with just
>>
>> Yes, and root cause is that stuff inside module can be freed if module
>> initialization failed, it's not safe to deference disk->fops in this
>> case.
> 
> Too bad. Then, we have to defer disk_alloc() until module initialization phase
> is guaranteed to return success like loop.c does. Please update patch title and
> description to something like below.
> 
> Subject: brd: defer automatic disk creation until module initialization succeeds
> 
> loop_init() is calling loop_add() after __register_blkdev() succeeds and is
> ignoring disk_add() failure from loop_add(), for loop_add() failure is not
> fatal and successfully created disks are already visible to bdev_open().
> 
> brd_init() is currently calling brd_alloc() before __register_blkdev()
> succeeds and is releasing successfully created disks when brd_init()
> returns an error. This can cause UAF when brd_init() failure raced with
> bdev_open(), for successfully created disks are already visible to
> bdev_open(). Fix this problem by following what loop_init() does.

Tetsuo and Kuai,

This commit msg looks good to me. I couldn't reply to some emails the
other day. Thanks a lot for your discussion and review to make this
problem clear!

Thanks,
Erkun.
Luis R. Rodriguez Nov. 1, 2024, 12:31 a.m. UTC | #9
On Sat, Oct 26, 2024 at 02:55:59PM +0900, Tetsuo Handa wrote:
> If bdev_open() can grab a reference before module's initialization phase
> completes is a problem,

Yes, that would indicate there's a bug and alas we have a regression.
Commit d1909c0221739 ("module: Don't ignore errors from
set_memory_XX()") merged on v6.9 introduced a regression, allowing
module init to start and later us failing module initialization to
complete. So to be clear, there's a possible transition from live to
not live right away.

This was discussed in this thread:

https://lore.kernel.org/all/Zuv0nmFblHUwuT8v@bombadil.infradead.org/T/#u

> I think that we can fix the problem with just
> 
>  int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
>  	      const struct blk_holder_ops *hops, struct file *bdev_file)
>  {
>  (...snipped...)
>  	ret = -ENXIO;
>  	if (!disk_live(disk))
>  		goto abort_claiming;
> -	if (!try_module_get(disk->fops->owner))
> +	if ((disk->fops->owner && module_is_coming(disk->fops->owner)) || !try_module_get(disk->fops->owner))
>  		goto abort_claiming;
>  	ret = -EBUSY;
>  	if (!bdev_may_open(bdev, mode))
>  (...snipped...)
>  }
> 
> change. It would be cleaner if we can do
> 
>  bool try_module_get(struct module *module)
>  {
>  	bool ret = true;
>  
>  	if (module) {
>  		/* Note: here, we can fail to get a reference */
> -		if (likely(module_is_live(module) &&
> +		if (likely(module_is_live(module) && !module_is_coming(module) &&
>  			   atomic_inc_not_zero(&module->refcnt) != 0))
>  			trace_module_get(module, _RET_IP_);
>  		else
>  			ret = false;
>  	}
>  	return ret;
>  }
> 
> but I don't know if this change breaks something.

As I see it, if we fix the above regression I can't see how a module
being live can transition into coming other than the regression above.

  Luis
diff mbox series

Patch

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 2fd1ed101748..f86dd0795d3c 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -316,6 +316,7 @@  __setup("ramdisk_size=", ramdisk_size);
  * (should share code eventually).
  */
 static LIST_HEAD(brd_devices);
+static DEFINE_MUTEX(brd_devices_mutex);
 static struct dentry *brd_debugfs_dir;
 
 static int brd_alloc(int i)
@@ -340,14 +341,21 @@  static int brd_alloc(int i)
 					  BLK_FEAT_NOWAIT,
 	};
 
-	list_for_each_entry(brd, &brd_devices, brd_list)
-		if (brd->brd_number == i)
+	mutex_lock(&brd_devices_mutex);
+	list_for_each_entry(brd, &brd_devices, brd_list) {
+		if (brd->brd_number == i) {
+			mutex_unlock(&brd_devices_mutex);
 			return -EEXIST;
+		}
+	}
 	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
-	if (!brd)
+	if (!brd) {
+		mutex_unlock(&brd_devices_mutex);
 		return -ENOMEM;
+	}
 	brd->brd_number		= i;
 	list_add_tail(&brd->brd_list, &brd_devices);
+	mutex_unlock(&brd_devices_mutex);
 
 	xa_init(&brd->brd_pages);
 
@@ -378,7 +386,9 @@  static int brd_alloc(int i)
 out_cleanup_disk:
 	put_disk(disk);
 out_free_dev:
+	mutex_lock(&brd_devices_mutex);
 	list_del(&brd->brd_list);
+	mutex_unlock(&brd_devices_mutex);
 	kfree(brd);
 	return err;
 }
@@ -388,21 +398,6 @@  static void brd_probe(dev_t dev)
 	brd_alloc(MINOR(dev) / max_part);
 }
 
-static void brd_cleanup(void)
-{
-	struct brd_device *brd, *next;
-
-	debugfs_remove_recursive(brd_debugfs_dir);
-
-	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
-		del_gendisk(brd->brd_disk);
-		put_disk(brd->brd_disk);
-		brd_free_pages(brd);
-		list_del(&brd->brd_list);
-		kfree(brd);
-	}
-}
-
 static inline void brd_check_and_reset_par(void)
 {
 	if (unlikely(!max_part))
@@ -424,17 +419,7 @@  static inline void brd_check_and_reset_par(void)
 
 static int __init brd_init(void)
 {
-	int err, i;
-
-	brd_check_and_reset_par();
-
-	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
-
-	for (i = 0; i < rd_nr; i++) {
-		err = brd_alloc(i);
-		if (err)
-			goto out_free;
-	}
+	int i;
 
 	/*
 	 * brd module now has a feature to instantiate underlying device
@@ -450,28 +435,35 @@  static int __init brd_init(void)
 	 *	If (X / max_part) was not already created it will be created
 	 *	dynamically.
 	 */
+	brd_check_and_reset_par();
+	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
 
 	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
-		err = -EIO;
-		goto out_free;
+		pr_info("brd: module NOT loaded !!!\n");
+		debugfs_remove_recursive(brd_debugfs_dir);
+		return -EIO;
 	}
 
+	for (i = 0; i < rd_nr; i++)
+		brd_alloc(i);
+
 	pr_info("brd: module loaded\n");
 	return 0;
-
-out_free:
-	brd_cleanup();
-
-	pr_info("brd: module NOT loaded !!!\n");
-	return err;
 }
 
 static void __exit brd_exit(void)
 {
+	struct brd_device *brd, *next;
 
 	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
-	brd_cleanup();
-
+	debugfs_remove_recursive(brd_debugfs_dir);
+	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
+		del_gendisk(brd->brd_disk);
+		put_disk(brd->brd_disk);
+		brd_free_pages(brd);
+		list_del(&brd->brd_list);
+		kfree(brd);
+	}
 	pr_info("brd: module unloaded\n");
 }