Message ID | 20241025070511.932879-1-yangerkun@huaweicloud.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | brd: fix null pointer when modprobe brd | expand |
Hi, 在 2024/10/25 15:05, Yang Erkun 写道: > From: Yang Erkun <yangerkun@huawei.com> > > My colleague Wupeng found the following problems during fault injection: > > BUG: unable to handle page fault for address: fffffbfff809d073 > PGD 6e648067 P4D 123ec8067 PUD 123ec4067 PMD 100e38067 PTE 0 > Oops: Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI > CPU: 5 UID: 0 PID: 755 Comm: modprobe Not tainted 6.12.0-rc3+ #17 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS > 1.16.1-2.fc37 04/01/2014 > RIP: 0010:__asan_load8+0x4c/0xa0 > ... > Call Trace: > <TASK> > blkdev_put_whole+0x41/0x70 > bdev_release+0x1a3/0x250 > blkdev_release+0x11/0x20 > __fput+0x1d7/0x4a0 > task_work_run+0xfc/0x180 > syscall_exit_to_user_mode+0x1de/0x1f0 > do_syscall_64+0x6b/0x170 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > The error path of modprobe will ignores the refcnt for the module, and > directly releases everything about this all associated resources. For > the brd module, it can be brd_fops. The brd_init function attempts to > create 16 ramdisks; each time add_disk is called, a file for the block > device is created to help do partition scanning and remains alive util > we return to userspace(Also we can open it with another user thread). > Let's say brd_init has successfully create the first ramdisk, but fail > to create the second, the error handling path will release code segment. > Consequently, bdev_release for file of first ramdisk will trigger null > pointer since brd_fops has been removed. > > To resolve issue, implement a solution similar to loop_init, and > reintroduce brd_devices_mutex to help serialize modifications to > brd_list. > This patch looks good to me, just some nits below: > Fixes: 7f9b348cb5e9 ("brd: convert to blk_alloc_disk/blk_cleanup_disk") > Reported-by: Wupeng Ma <mawupeng1@huawei.com> > Signed-off-by: Yang Erkun <yangerkun@huawei.com> > --- > drivers/block/brd.c | 70 ++++++++++++++++++++------------------------- > 1 file changed, 31 insertions(+), 39 deletions(-) > > diff --git a/drivers/block/brd.c b/drivers/block/brd.c > index 2fd1ed101748..f86dd0795d3c 100644 > --- a/drivers/block/brd.c > +++ b/drivers/block/brd.c > @@ -316,6 +316,7 @@ __setup("ramdisk_size=", ramdisk_size); > * (should share code eventually). > */ > static LIST_HEAD(brd_devices); > +static DEFINE_MUTEX(brd_devices_mutex); > static struct dentry *brd_debugfs_dir; > > static int brd_alloc(int i) > @@ -340,14 +341,21 @@ static int brd_alloc(int i) > BLK_FEAT_NOWAIT, > }; > > - list_for_each_entry(brd, &brd_devices, brd_list) > - if (brd->brd_number == i) > + mutex_lock(&brd_devices_mutex); > + list_for_each_entry(brd, &brd_devices, brd_list) { > + if (brd->brd_number == i) { > + mutex_unlock(&brd_devices_mutex); > return -EEXIST; > + } > + } > brd = kzalloc(sizeof(*brd), GFP_KERNEL); > - if (!brd) > + if (!brd) { > + mutex_unlock(&brd_devices_mutex); > return -ENOMEM; > + } > brd->brd_number = i; > list_add_tail(&brd->brd_list, &brd_devices); > + mutex_unlock(&brd_devices_mutex); > > xa_init(&brd->brd_pages); The global mutex is used to protect brd_alloc() from module init and brd_alloc() from creating on open? If so, I think just use the lock in the caller is cleaner. > > @@ -378,7 +386,9 @@ static int brd_alloc(int i) > out_cleanup_disk: > put_disk(disk); > out_free_dev: > + mutex_lock(&brd_devices_mutex); > list_del(&brd->brd_list); > + mutex_unlock(&brd_devices_mutex); > kfree(brd); > return err; > } > @@ -388,21 +398,6 @@ static void brd_probe(dev_t dev) > brd_alloc(MINOR(dev) / max_part); > } > > -static void brd_cleanup(void) > -{ > - struct brd_device *brd, *next; > - > - debugfs_remove_recursive(brd_debugfs_dir); > - > - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { > - del_gendisk(brd->brd_disk); > - put_disk(brd->brd_disk); > - brd_free_pages(brd); > - list_del(&brd->brd_list); > - kfree(brd); > - } > -} > - > static inline void brd_check_and_reset_par(void) > { > if (unlikely(!max_part)) > @@ -424,17 +419,7 @@ static inline void brd_check_and_reset_par(void) > > static int __init brd_init(void) > { > - int err, i; > - > - brd_check_and_reset_par(); > - > - brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); > - > - for (i = 0; i < rd_nr; i++) { > - err = brd_alloc(i); > - if (err) > - goto out_free; > - } > + int i; > > /* > * brd module now has a feature to instantiate underlying device > @@ -450,28 +435,35 @@ static int __init brd_init(void) > * If (X / max_part) was not already created it will be created > * dynamically. > */ > + brd_check_and_reset_par(); > + brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); > > if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { > - err = -EIO; > - goto out_free; > + pr_info("brd: module NOT loaded !!!\n"); > + debugfs_remove_recursive(brd_debugfs_dir); > + return -EIO; > } > > + for (i = 0; i < rd_nr; i++) > + brd_alloc(i); Now that brd_alloc() can fail while loding the module will succed, please also add err log if brd_alloc() failed. > + > pr_info("brd: module loaded\n"); > return 0; > - > -out_free: > - brd_cleanup(); > - > - pr_info("brd: module NOT loaded !!!\n"); > - return err; > } > > static void __exit brd_exit(void) > { > + struct brd_device *brd, *next; > > unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); > - brd_cleanup(); > - > + debugfs_remove_recursive(brd_debugfs_dir); > + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { > + del_gendisk(brd->brd_disk); > + put_disk(brd->brd_disk); > + brd_free_pages(brd); > + list_del(&brd->brd_list); > + kfree(brd); > + } Why the global mutex is not used here? It took a while for me to figure out that this is safe, because unregister_blkdev() make sure no new brd_alloc() can be called from open path, and in progress open will be synchronized with del_gendisk(). However, please add some comments or just hold the global mutex. Thanks, Kuai > pr_info("brd: module unloaded\n"); > } > >
On 2024/10/25 16:05, Yang Erkun wrote: > From: Yang Erkun <yangerkun@huawei.com> > > My colleague Wupeng found the following problems during fault injection: > > BUG: unable to handle page fault for address: fffffbfff809d073 Excuse me, but subject says "null pointer" whereas dmesg says "not a null pointer dereference". Is this a use-after-free bug? Also, what verb comes after "when modprobe brd" ? Is this problem happening with parallel execution? If yes, parallelly running what and what? Is this problem happening with what fault injection? What function (exact location in source code with call trace) has failed due to fault injection? > Call Trace: > <TASK> > blkdev_put_whole+0x41/0x70 > bdev_release+0x1a3/0x250 > blkdev_release+0x11/0x20 > __fput+0x1d7/0x4a0 > task_work_run+0xfc/0x180 > syscall_exit_to_user_mode+0x1de/0x1f0 > do_syscall_64+0x6b/0x170 > entry_SYSCALL_64_after_hwframe+0x76/0x7e This suggests that a userspace process has open()ed the device before brd_init() from modprobe completed? Please show more context including execution flow until crash. CPU0: (or Process1) CPU1: (or Process2) does what? does what? does what? does what and is wrong? Also, you don't need to embed brd_cleanup() into the caller just because the caller becomes 1 by this change.
Hi, 在 2024/10/25 18:40, Tetsuo Handa 写道: > On 2024/10/25 16:05, Yang Erkun wrote: >> From: Yang Erkun <yangerkun@huawei.com> >> >> My colleague Wupeng found the following problems during fault injection: >> >> BUG: unable to handle page fault for address: fffffbfff809d073 > > Excuse me, but subject says "null pointer" whereas dmesg says > "not a null pointer dereference". Is this a use-after-free bug? > Also, what verb comes after "when modprobe brd" ? > > Is this problem happening with parallel execution? If yes, parallelly > running what and what? The problem is straightforward, to be short, T1: morprobe brd brd_init brd_alloc add_disk T2: open brd bdev_open try_module_get // err path brd_cleanup // dereference brd_fops() while module is freed. Thanks, Kuai > > Is this problem happening with what fault injection? > What function (exact location in source code with call trace) has > failed due to fault injection? > >> Call Trace: >> <TASK> >> blkdev_put_whole+0x41/0x70 >> bdev_release+0x1a3/0x250 >> blkdev_release+0x11/0x20 >> __fput+0x1d7/0x4a0 >> task_work_run+0xfc/0x180 >> syscall_exit_to_user_mode+0x1de/0x1f0 >> do_syscall_64+0x6b/0x170 >> entry_SYSCALL_64_after_hwframe+0x76/0x7e > > This suggests that a userspace process has open()ed the device > before brd_init() from modprobe completed? > > Please show more context including execution flow until crash. > > CPU0: (or Process1) CPU1: (or Process2) > does what? > does what? > does what? > does what and is wrong? > > Also, you don't need to embed brd_cleanup() into the caller > just because the caller becomes 1 by this change. > > . >
On 2024/10/26 10:21, Yu Kuai wrote: > Hi, > > 在 2024/10/25 18:40, Tetsuo Handa 写道: >> On 2024/10/25 16:05, Yang Erkun wrote: >>> From: Yang Erkun <yangerkun@huawei.com> >>> >>> My colleague Wupeng found the following problems during fault injection: >>> >>> BUG: unable to handle page fault for address: fffffbfff809d073 >> >> Excuse me, but subject says "null pointer" whereas dmesg says >> "not a null pointer dereference". Is this a use-after-free bug? >> Also, what verb comes after "when modprobe brd" ? >> >> Is this problem happening with parallel execution? If yes, parallelly >> running what and what? > > The problem is straightforward, to be short, > > T1: morprobe brd > brd_init > brd_alloc > add_disk > T2: open brd > bdev_open > try_module_get > // err path > brd_cleanup > // dereference brd_fops() while module is freed. Then, fault injection is irrelevant, isn't it? If bdev_open() can grab a reference before module's initialization phase completes is a problem, I think that we can fix the problem with just int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file) { (...snipped...) ret = -ENXIO; if (!disk_live(disk)) goto abort_claiming; - if (!try_module_get(disk->fops->owner)) + if ((disk->fops->owner && module_is_coming(disk->fops->owner)) || !try_module_get(disk->fops->owner)) goto abort_claiming; ret = -EBUSY; if (!bdev_may_open(bdev, mode)) (...snipped...) } change. It would be cleaner if we can do bool try_module_get(struct module *module) { bool ret = true; if (module) { /* Note: here, we can fail to get a reference */ - if (likely(module_is_live(module) && + if (likely(module_is_live(module) && !module_is_coming(module) && atomic_inc_not_zero(&module->refcnt) != 0)) trace_module_get(module, _RET_IP_); else ret = false; } return ret; } but I don't know if this change breaks something. Maybe we can introduce a variant like bool try_module_get_safe(struct module *module) { bool ret = true; if (module) { /* Note: here, we can fail to get a reference */ if (likely(module_is_live(module) && !module_is_coming(module) && atomic_inc_not_zero(&module->refcnt) != 0)) trace_module_get(module, _RET_IP_); else ret = false; } return ret; } and use it from bdev_open().
Hi, 在 2024/10/26 13:55, Tetsuo Handa 写道: > On 2024/10/26 10:21, Yu Kuai wrote: >> Hi, >> >> 在 2024/10/25 18:40, Tetsuo Handa 写道: >>> On 2024/10/25 16:05, Yang Erkun wrote: >>>> From: Yang Erkun <yangerkun@huawei.com> >>>> >>>> My colleague Wupeng found the following problems during fault injection: >>>> >>>> BUG: unable to handle page fault for address: fffffbfff809d073 >>> >>> Excuse me, but subject says "null pointer" whereas dmesg says >>> "not a null pointer dereference". Is this a use-after-free bug? >>> Also, what verb comes after "when modprobe brd" ? >>> >>> Is this problem happening with parallel execution? If yes, parallelly >>> running what and what? >> >> The problem is straightforward, to be short, >> >> T1: morprobe brd >> brd_init >> brd_alloc >> add_disk >> T2: open brd >> bdev_open >> try_module_get >> // err path >> brd_cleanup >> // dereference brd_fops() while module is freed. > > Then, fault injection is irrelevant, isn't it? Fault injection must involved in the test, brd_init() is unlikely to fail. > > If bdev_open() can grab a reference before module's initialization phase > completes is a problem, I think that we can fix the problem with just Yes, and root cause is that stuff inside module can be freed if module initialization failed, it's not safe to deference disk->fops in this case. > > int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, > const struct blk_holder_ops *hops, struct file *bdev_file) > { > (...snipped...) > ret = -ENXIO; > if (!disk_live(disk)) > goto abort_claiming; > - if (!try_module_get(disk->fops->owner)) > + if ((disk->fops->owner && module_is_coming(disk->fops->owner)) || !try_module_get(disk->fops->owner)) This is not a common issue, do you see similiar behaviour for other drivers? If not, I'll suggest to fix this in brd. Thanks, Kuai > goto abort_claiming; > ret = -EBUSY; > if (!bdev_may_open(bdev, mode)) > (...snipped...) > } > > change. It would be cleaner if we can do > > bool try_module_get(struct module *module) > { > bool ret = true; > > if (module) { > /* Note: here, we can fail to get a reference */ > - if (likely(module_is_live(module) && > + if (likely(module_is_live(module) && !module_is_coming(module) && > atomic_inc_not_zero(&module->refcnt) != 0)) > trace_module_get(module, _RET_IP_); > else > ret = false; > } > return ret; > } > > but I don't know if this change breaks something. > Maybe we can introduce a variant like > > bool try_module_get_safe(struct module *module) > { > bool ret = true; > > if (module) { > /* Note: here, we can fail to get a reference */ > if (likely(module_is_live(module) && !module_is_coming(module) && > atomic_inc_not_zero(&module->refcnt) != 0)) > trace_module_get(module, _RET_IP_); > else > ret = false; > } > return ret; > } > > and use it from bdev_open(). > > > . >
On 2024/10/26 15:28, Yu Kuai wrote: > Hi, > > 在 2024/10/26 13:55, Tetsuo Handa 写道: >> On 2024/10/26 10:21, Yu Kuai wrote: >>> Hi, >>> >>> 在 2024/10/25 18:40, Tetsuo Handa 写道: >>>> On 2024/10/25 16:05, Yang Erkun wrote: >>>>> From: Yang Erkun <yangerkun@huawei.com> >>>>> >>>>> My colleague Wupeng found the following problems during fault injection: >>>>> >>>>> BUG: unable to handle page fault for address: fffffbfff809d073 >>>> >>>> Excuse me, but subject says "null pointer" whereas dmesg says >>>> "not a null pointer dereference". Is this a use-after-free bug? >>>> Also, what verb comes after "when modprobe brd" ? >>>> >>>> Is this problem happening with parallel execution? If yes, parallelly >>>> running what and what? >>> >>> The problem is straightforward, to be short, >>> >>> T1: morprobe brd >>> brd_init >>> brd_alloc >>> add_disk >>> T2: open brd >>> bdev_open >>> try_module_get >>> // err path >>> brd_cleanup >>> // dereference brd_fops() while module is freed. >> >> Then, fault injection is irrelevant, isn't it? > > Fault injection must involved in the test, brd_init() is unlikely to > fail. >> >> If bdev_open() can grab a reference before module's initialization phase >> completes is a problem, I think that we can fix the problem with just > > Yes, and root cause is that stuff inside module can be freed if module > initialization failed, it's not safe to deference disk->fops in this > case. Too bad. Then, we have to defer disk_alloc() until module initialization phase is guaranteed to return success like loop.c does. Please update patch title and description to something like below. Subject: brd: defer automatic disk creation until module initialization succeeds loop_init() is calling loop_add() after __register_blkdev() succeeds and is ignoring disk_add() failure from loop_add(), for loop_add() failure is not fatal and successfully created disks are already visible to bdev_open(). brd_init() is currently calling brd_alloc() before __register_blkdev() succeeds and is releasing successfully created disks when brd_init() returns an error. This can cause UAF when brd_init() failure raced with bdev_open(), for successfully created disks are already visible to bdev_open(). Fix this problem by following what loop_init() does.
在 2024/10/25 15:59, Yu Kuai 写道: > Hi, > > 在 2024/10/25 15:05, Yang Erkun 写道: >> From: Yang Erkun <yangerkun@huawei.com> >> >> My colleague Wupeng found the following problems during fault injection: >> >> BUG: unable to handle page fault for address: fffffbfff809d073 >> PGD 6e648067 P4D 123ec8067 PUD 123ec4067 PMD 100e38067 PTE 0 >> Oops: Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI >> CPU: 5 UID: 0 PID: 755 Comm: modprobe Not tainted 6.12.0-rc3+ #17 >> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS >> 1.16.1-2.fc37 04/01/2014 >> RIP: 0010:__asan_load8+0x4c/0xa0 >> ... >> Call Trace: >> <TASK> >> blkdev_put_whole+0x41/0x70 >> bdev_release+0x1a3/0x250 >> blkdev_release+0x11/0x20 >> __fput+0x1d7/0x4a0 >> task_work_run+0xfc/0x180 >> syscall_exit_to_user_mode+0x1de/0x1f0 >> do_syscall_64+0x6b/0x170 >> entry_SYSCALL_64_after_hwframe+0x76/0x7e >> >> The error path of modprobe will ignores the refcnt for the module, and >> directly releases everything about this all associated resources. For >> the brd module, it can be brd_fops. The brd_init function attempts to >> create 16 ramdisks; each time add_disk is called, a file for the block >> device is created to help do partition scanning and remains alive util >> we return to userspace(Also we can open it with another user thread). >> Let's say brd_init has successfully create the first ramdisk, but fail >> to create the second, the error handling path will release code segment. >> Consequently, bdev_release for file of first ramdisk will trigger null >> pointer since brd_fops has been removed. >> >> To resolve issue, implement a solution similar to loop_init, and >> reintroduce brd_devices_mutex to help serialize modifications to >> brd_list. >> > > This patch looks good to me, just some nits below: > >> Fixes: 7f9b348cb5e9 ("brd: convert to blk_alloc_disk/blk_cleanup_disk") >> Reported-by: Wupeng Ma <mawupeng1@huawei.com> >> Signed-off-by: Yang Erkun <yangerkun@huawei.com> >> --- >> drivers/block/brd.c | 70 ++++++++++++++++++++------------------------- >> 1 file changed, 31 insertions(+), 39 deletions(-) >> >> diff --git a/drivers/block/brd.c b/drivers/block/brd.c >> index 2fd1ed101748..f86dd0795d3c 100644 >> --- a/drivers/block/brd.c >> +++ b/drivers/block/brd.c >> @@ -316,6 +316,7 @@ __setup("ramdisk_size=", ramdisk_size); >> * (should share code eventually). >> */ >> static LIST_HEAD(brd_devices); >> +static DEFINE_MUTEX(brd_devices_mutex); >> static struct dentry *brd_debugfs_dir; >> static int brd_alloc(int i) >> @@ -340,14 +341,21 @@ static int brd_alloc(int i) >> BLK_FEAT_NOWAIT, >> }; >> - list_for_each_entry(brd, &brd_devices, brd_list) >> - if (brd->brd_number == i) >> + mutex_lock(&brd_devices_mutex); >> + list_for_each_entry(brd, &brd_devices, brd_list) { >> + if (brd->brd_number == i) { >> + mutex_unlock(&brd_devices_mutex); >> return -EEXIST; >> + } >> + } >> brd = kzalloc(sizeof(*brd), GFP_KERNEL); >> - if (!brd) >> + if (!brd) { >> + mutex_unlock(&brd_devices_mutex); >> return -ENOMEM; >> + } >> brd->brd_number = i; >> list_add_tail(&brd->brd_list, &brd_devices); >> + mutex_unlock(&brd_devices_mutex); >> xa_init(&brd->brd_pages); > > The global mutex is used to protect brd_alloc() from module init and > brd_alloc() from creating on open? If so, I think just use the lock in > the caller is cleaner. Yes, it appears cleaner. However, is it a little too long to hold the mutex lock, considering that brd_alloc in brd_init can occur concurrently with brd_prbe? See f7bf35862477 ("brd: reduce the brd_devices_mutex scope") > >> @@ -378,7 +386,9 @@ static int brd_alloc(int i) >> out_cleanup_disk: >> put_disk(disk); >> out_free_dev: >> + mutex_lock(&brd_devices_mutex); >> list_del(&brd->brd_list); >> + mutex_unlock(&brd_devices_mutex); >> kfree(brd); >> return err; >> } >> @@ -388,21 +398,6 @@ static void brd_probe(dev_t dev) >> brd_alloc(MINOR(dev) / max_part); >> } >> -static void brd_cleanup(void) >> -{ >> - struct brd_device *brd, *next; >> - >> - debugfs_remove_recursive(brd_debugfs_dir); >> - >> - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { >> - del_gendisk(brd->brd_disk); >> - put_disk(brd->brd_disk); >> - brd_free_pages(brd); >> - list_del(&brd->brd_list); >> - kfree(brd); >> - } >> -} >> - >> static inline void brd_check_and_reset_par(void) >> { >> if (unlikely(!max_part)) >> @@ -424,17 +419,7 @@ static inline void brd_check_and_reset_par(void) >> static int __init brd_init(void) >> { >> - int err, i; >> - >> - brd_check_and_reset_par(); >> - >> - brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); >> - >> - for (i = 0; i < rd_nr; i++) { >> - err = brd_alloc(i); >> - if (err) >> - goto out_free; >> - } >> + int i; >> /* >> * brd module now has a feature to instantiate underlying device >> @@ -450,28 +435,35 @@ static int __init brd_init(void) >> * If (X / max_part) was not already created it will be created >> * dynamically. >> */ >> + brd_check_and_reset_par(); >> + brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); >> if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { >> - err = -EIO; >> - goto out_free; >> + pr_info("brd: module NOT loaded !!!\n"); >> + debugfs_remove_recursive(brd_debugfs_dir); >> + return -EIO; >> } >> + for (i = 0; i < rd_nr; i++) >> + brd_alloc(i); > > Now that brd_alloc() can fail while loding the module will succed, > please also add err log if brd_alloc() failed. OK >> + >> pr_info("brd: module loaded\n"); >> return 0; >> - >> -out_free: >> - brd_cleanup(); >> - >> - pr_info("brd: module NOT loaded !!!\n"); >> - return err; >> } >> static void __exit brd_exit(void) >> { >> + struct brd_device *brd, *next; >> unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); >> - brd_cleanup(); >> - >> + debugfs_remove_recursive(brd_debugfs_dir); >> + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { >> + del_gendisk(brd->brd_disk); >> + put_disk(brd->brd_disk); >> + brd_free_pages(brd); >> + list_del(&brd->brd_list); >> + kfree(brd); >> + } > > Why the global mutex is not used here? It took a while for me to figure > out that this is safe, because unregister_blkdev() make sure no new > brd_alloc() can be called from open path, and in progress open will be > synchronized with del_gendisk(). Yes > > However, please add some comments or just hold the global mutex. OK > > Thanks, > Kuai > >> pr_info("brd: module unloaded\n"); >> } >>
在 2024/10/26 16:06, Tetsuo Handa 写道: > On 2024/10/26 15:28, Yu Kuai wrote: >> Hi, >> >> 在 2024/10/26 13:55, Tetsuo Handa 写道: >>> On 2024/10/26 10:21, Yu Kuai wrote: >>>> Hi, >>>> >>>> 在 2024/10/25 18:40, Tetsuo Handa 写道: >>>>> On 2024/10/25 16:05, Yang Erkun wrote: >>>>>> From: Yang Erkun <yangerkun@huawei.com> >>>>>> >>>>>> My colleague Wupeng found the following problems during fault injection: >>>>>> >>>>>> BUG: unable to handle page fault for address: fffffbfff809d073 >>>>> >>>>> Excuse me, but subject says "null pointer" whereas dmesg says >>>>> "not a null pointer dereference". Is this a use-after-free bug? >>>>> Also, what verb comes after "when modprobe brd" ? >>>>> >>>>> Is this problem happening with parallel execution? If yes, parallelly >>>>> running what and what? >>>> >>>> The problem is straightforward, to be short, >>>> >>>> T1: morprobe brd >>>> brd_init >>>> brd_alloc >>>> add_disk >>>> T2: open brd >>>> bdev_open >>>> try_module_get >>>> // err path >>>> brd_cleanup >>>> // dereference brd_fops() while module is freed. >>> >>> Then, fault injection is irrelevant, isn't it? >> >> Fault injection must involved in the test, brd_init() is unlikely to >> fail. >>> >>> If bdev_open() can grab a reference before module's initialization phase >>> completes is a problem, I think that we can fix the problem with just >> >> Yes, and root cause is that stuff inside module can be freed if module >> initialization failed, it's not safe to deference disk->fops in this >> case. > > Too bad. Then, we have to defer disk_alloc() until module initialization phase > is guaranteed to return success like loop.c does. Please update patch title and > description to something like below. > > Subject: brd: defer automatic disk creation until module initialization succeeds > > loop_init() is calling loop_add() after __register_blkdev() succeeds and is > ignoring disk_add() failure from loop_add(), for loop_add() failure is not > fatal and successfully created disks are already visible to bdev_open(). > > brd_init() is currently calling brd_alloc() before __register_blkdev() > succeeds and is releasing successfully created disks when brd_init() > returns an error. This can cause UAF when brd_init() failure raced with > bdev_open(), for successfully created disks are already visible to > bdev_open(). Fix this problem by following what loop_init() does. Tetsuo and Kuai, This commit msg looks good to me. I couldn't reply to some emails the other day. Thanks a lot for your discussion and review to make this problem clear! Thanks, Erkun.
On Sat, Oct 26, 2024 at 02:55:59PM +0900, Tetsuo Handa wrote: > If bdev_open() can grab a reference before module's initialization phase > completes is a problem, Yes, that would indicate there's a bug and alas we have a regression. Commit d1909c0221739 ("module: Don't ignore errors from set_memory_XX()") merged on v6.9 introduced a regression, allowing module init to start and later us failing module initialization to complete. So to be clear, there's a possible transition from live to not live right away. This was discussed in this thread: https://lore.kernel.org/all/Zuv0nmFblHUwuT8v@bombadil.infradead.org/T/#u > I think that we can fix the problem with just > > int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, > const struct blk_holder_ops *hops, struct file *bdev_file) > { > (...snipped...) > ret = -ENXIO; > if (!disk_live(disk)) > goto abort_claiming; > - if (!try_module_get(disk->fops->owner)) > + if ((disk->fops->owner && module_is_coming(disk->fops->owner)) || !try_module_get(disk->fops->owner)) > goto abort_claiming; > ret = -EBUSY; > if (!bdev_may_open(bdev, mode)) > (...snipped...) > } > > change. It would be cleaner if we can do > > bool try_module_get(struct module *module) > { > bool ret = true; > > if (module) { > /* Note: here, we can fail to get a reference */ > - if (likely(module_is_live(module) && > + if (likely(module_is_live(module) && !module_is_coming(module) && > atomic_inc_not_zero(&module->refcnt) != 0)) > trace_module_get(module, _RET_IP_); > else > ret = false; > } > return ret; > } > > but I don't know if this change breaks something. As I see it, if we fix the above regression I can't see how a module being live can transition into coming other than the regression above. Luis
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 2fd1ed101748..f86dd0795d3c 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -316,6 +316,7 @@ __setup("ramdisk_size=", ramdisk_size); * (should share code eventually). */ static LIST_HEAD(brd_devices); +static DEFINE_MUTEX(brd_devices_mutex); static struct dentry *brd_debugfs_dir; static int brd_alloc(int i) @@ -340,14 +341,21 @@ static int brd_alloc(int i) BLK_FEAT_NOWAIT, }; - list_for_each_entry(brd, &brd_devices, brd_list) - if (brd->brd_number == i) + mutex_lock(&brd_devices_mutex); + list_for_each_entry(brd, &brd_devices, brd_list) { + if (brd->brd_number == i) { + mutex_unlock(&brd_devices_mutex); return -EEXIST; + } + } brd = kzalloc(sizeof(*brd), GFP_KERNEL); - if (!brd) + if (!brd) { + mutex_unlock(&brd_devices_mutex); return -ENOMEM; + } brd->brd_number = i; list_add_tail(&brd->brd_list, &brd_devices); + mutex_unlock(&brd_devices_mutex); xa_init(&brd->brd_pages); @@ -378,7 +386,9 @@ static int brd_alloc(int i) out_cleanup_disk: put_disk(disk); out_free_dev: + mutex_lock(&brd_devices_mutex); list_del(&brd->brd_list); + mutex_unlock(&brd_devices_mutex); kfree(brd); return err; } @@ -388,21 +398,6 @@ static void brd_probe(dev_t dev) brd_alloc(MINOR(dev) / max_part); } -static void brd_cleanup(void) -{ - struct brd_device *brd, *next; - - debugfs_remove_recursive(brd_debugfs_dir); - - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { - del_gendisk(brd->brd_disk); - put_disk(brd->brd_disk); - brd_free_pages(brd); - list_del(&brd->brd_list); - kfree(brd); - } -} - static inline void brd_check_and_reset_par(void) { if (unlikely(!max_part)) @@ -424,17 +419,7 @@ static inline void brd_check_and_reset_par(void) static int __init brd_init(void) { - int err, i; - - brd_check_and_reset_par(); - - brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); - - for (i = 0; i < rd_nr; i++) { - err = brd_alloc(i); - if (err) - goto out_free; - } + int i; /* * brd module now has a feature to instantiate underlying device @@ -450,28 +435,35 @@ static int __init brd_init(void) * If (X / max_part) was not already created it will be created * dynamically. */ + brd_check_and_reset_par(); + brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { - err = -EIO; - goto out_free; + pr_info("brd: module NOT loaded !!!\n"); + debugfs_remove_recursive(brd_debugfs_dir); + return -EIO; } + for (i = 0; i < rd_nr; i++) + brd_alloc(i); + pr_info("brd: module loaded\n"); return 0; - -out_free: - brd_cleanup(); - - pr_info("brd: module NOT loaded !!!\n"); - return err; } static void __exit brd_exit(void) { + struct brd_device *brd, *next; unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); - brd_cleanup(); - + debugfs_remove_recursive(brd_debugfs_dir); + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { + del_gendisk(brd->brd_disk); + put_disk(brd->brd_disk); + brd_free_pages(brd); + list_del(&brd->brd_list); + kfree(brd); + } pr_info("brd: module unloaded\n"); }