Message ID | 50c3ed995fd81c45876c86304c8a00bf3e396cfd.1722251717.git.lorenzo.stoakes@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Make core VMA operations internal and testable | expand |
Hi Lorenzo Stoakes, Greetings! I used syzkaller and found KASAN: slab-use-after-free Read in userfaultfd_set_ctx in next-20240805. Bisected the first bad commit: 4651ba8201cf userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240806_122723_userfaultfd_set_ctx Syzkaller repro code: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/repro.c Syzkaller repro syscall steps: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/repro.prog Syzkaller analysis report: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/repro.report Kconfig(make olddefconfig): https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/kconfig_origin Bisect info: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/bisect_info.log Dmesg: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/d6dbc9f56c3a70e915625b6f1887882c23dc5c91_dmesg.log bzImage: https://github.com/xupengfe/syzkaller_logs/raw/main/240806_122723_userfaultfd_set_ctx/bzImage_d6dbc9f56c3a70e915625b6f1887882c23dc5c91.tar.gz " [ 29.675551] ================================================================== [ 29.676133] BUG: KASAN: slab-use-after-free in userfaultfd_set_ctx+0x31c/0x360 [ 29.676716] Read of size 8 at addr ffff888027c5f100 by task repro/1498 [ 29.677218] [ 29.677358] CPU: 0 UID: 0 PID: 1498 Comm: repro Not tainted 6.11.0-rc2-next-20240805-d6dbc9f56c3a #1 [ 29.678053] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 29.678910] Call Trace: [ 29.679117] <TASK> [ 29.679296] dump_stack_lvl+0xea/0x150 [ 29.679622] print_report+0xce/0x610 [ 29.679924] ? userfaultfd_set_ctx+0x31c/0x360 [ 29.680289] ? kasan_complete_mode_report_info+0x80/0x200 [ 29.680716] ? userfaultfd_set_ctx+0x31c/0x360 [ 29.681077] kasan_report+0xcc/0x110 [ 29.681372] ? userfaultfd_set_ctx+0x31c/0x360 [ 29.681729] __asan_report_load8_noabort+0x18/0x20 [ 29.682118] userfaultfd_set_ctx+0x31c/0x360 [ 29.682465] userfaultfd_clear_vma+0x104/0x190 [ 29.682826] userfaultfd_release_all+0x294/0x4a0 [ 29.683201] ? __pfx_userfaultfd_release_all+0x10/0x10 [ 29.683615] ? __this_cpu_preempt_check+0x21/0x30 [ 29.684003] ? __pfx_userfaultfd_release+0x10/0x10 [ 29.684389] userfaultfd_release+0x112/0x1e0 [ 29.684735] ? __pfx_userfaultfd_release+0x10/0x10 [ 29.685114] ? evm_file_release+0x193/0x1f0 [ 29.685454] __fput+0x426/0xbc0 [ 29.685719] ? __sanitizer_cov_trace_const_cmp2+0x1c/0x30 [ 29.686153] __fput_sync+0x58/0x70 [ 29.686435] __x64_sys_close+0x93/0x120 [ 29.686744] x64_sys_call+0x189a/0x20d0 [ 29.687066] do_syscall_64+0x6d/0x140 [ 29.687371] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 29.687779] RIP: 0033:0x7f3b67b3f247 [ 29.688078] Code: ff e8 cd e3 01 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 c3 c9 f5 ff [ 29.689519] RSP: 002b:00007ffd1f0ac7d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 [ 29.690140] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3b67b3f247 [ 29.690717] RDX: 0000000020000100 RSI: 000000008010aa01 RDI: 0000000000000003 [ 29.691270] RBP: 00007ffd1f0ac7f0 R08: 00007ffd1f0ac7f0 R09: 00007ffd1f0ac7f0 [ 29.691822] R10: 00007ffd1f0ac7f0 R11: 0000000000000246 R12: 00007ffd1f0ac968 [ 29.692375] R13: 0000000000401bf9 R14: 0000000000403e08 R15: 00007f3b67c72000 [ 29.692939] </TASK> [ 29.693124] [ 29.693258] Allocated by task 1498: [ 29.693545] kasan_save_stack+0x2c/0x60 [ 29.693875] kasan_save_track+0x18/0x40 [ 29.694205] kasan_save_alloc_info+0x3c/0x50 [ 29.694576] __kasan_slab_alloc+0x62/0x80 [ 29.694921] kmem_cache_alloc_noprof+0x114/0x370 [ 29.695319] vm_area_dup+0x2a/0x1b0 [ 29.695630] __split_vma+0x188/0x1020 [ 29.695952] vma_modify+0x1fc/0x390 [ 29.696250] userfaultfd_clear_vma+0xd4/0x190 [ 29.696609] userfaultfd_ioctl+0x3c0b/0x4560 [ 29.696964] __x64_sys_ioctl+0x1b9/0x230 [ 29.697295] x64_sys_call+0x1209/0x20d0 [ 29.697620] do_syscall_64+0x6d/0x140 [ 29.697937] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 29.698364] [ 29.698508] Freed by task 1505: [ 29.698779] kasan_save_stack+0x2c/0x60 [ 29.699110] kasan_save_track+0x18/0x40 [ 29.699441] kasan_save_free_info+0x3f/0x60 [ 29.699797] __kasan_slab_free+0x47/0x60 [ 29.700137] kmem_cache_free+0x2f2/0x4b0 [ 29.700471] vm_area_free_rcu_cb+0x7f/0xa0 [ 29.700819] rcu_core+0x877/0x18f0 [ 29.701123] rcu_core_si+0x12/0x20 [ 29.701421] handle_softirqs+0x1c7/0x870 [ 29.701760] __irq_exit_rcu+0xa9/0x120 [ 29.702082] irq_exit_rcu+0x12/0x30 [ 29.702386] sysvec_apic_timer_interrupt+0xa5/0xc0 [ 29.702802] asm_sysvec_apic_timer_interrupt+0x1f/0x30 [ 29.703237] [ 29.703377] Last potentially related work creation: [ 29.703782] kasan_save_stack+0x2c/0x60 [ 29.704114] __kasan_record_aux_stack+0x93/0xb0 [ 29.704503] kasan_record_aux_stack_noalloc+0xf/0x20 [ 29.704923] __call_rcu_common.constprop.0+0x72/0x6b0 [ 29.705349] call_rcu+0x12/0x20 [ 29.705625] vm_area_free+0x26/0x30 [ 29.705928] vma_complete+0x57e/0xf60 [ 29.706245] vma_merge+0x166b/0x3540 [ 29.706555] vma_modify+0x9f/0x390 [ 29.706853] userfaultfd_clear_vma+0xd4/0x190 [ 29.707227] userfaultfd_release_all+0x294/0x4a0 [ 29.707621] userfaultfd_release+0x112/0x1e0 [ 29.707991] __fput+0x426/0xbc0 [ 29.708267] __fput_sync+0x58/0x70 [ 29.708563] __x64_sys_close+0x93/0x120 [ 29.708891] x64_sys_call+0x189a/0x20d0 [ 29.709220] do_syscall_64+0x6d/0x140 [ 29.709538] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 29.709965] [ 29.710105] The buggy address belongs to the object at ffff888027c5f0f0 [ 29.710105] which belongs to the cache vm_area_struct of size 176 [ 29.711130] The buggy address is located 16 bytes inside of [ 29.711130] freed 176-byte region [ffff888027c5f0f0, ffff888027c5f1a0) [ 29.712104] [ 29.712245] The buggy address belongs to the physical page: [ 29.712703] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x27c5f [ 29.713349] memcg:ffff8880198eaa01 [ 29.713639] flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) [ 29.714173] page_type: 0xfdffffff(slab) [ 29.714507] raw: 000fffffc0000000 ffff88800d319dc0 dead000000000122 0000000000000000 [ 29.715137] raw: 0000000000000000 0000000000110011 00000001fdffffff ffff8880198eaa01 [ 29.715765] page dumped because: kasan: bad access detected [ 29.716220] [ 29.716360] Memory state around the buggy address: [ 29.716756] ffff888027c5f000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 29.717349] ffff888027c5f080: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fa fb [ 29.717940] >ffff888027c5f100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 29.718521] ^ [ 29.718796] ffff888027c5f180: fb fb fb fb fc fc fc fc fc fc fc fc 00 00 00 00 [ 29.719388] ffff888027c5f200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 29.720025] ================================================================== [ 29.720671] Disabling lock debugging due to kernel taint " Thanks! --- If you don't need the following environment to reproduce the problem or if you already have one reproduced environment, please ignore the following information. How to reproduce: git clone https://gitlab.com/xupengfe/repro_vm_env.git cd repro_vm_env tar -xvf repro_vm_env.tar.gz cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0 // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel // You could change the bzImage_xxx as you want // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version You could use below command to log in, there is no password for root. ssh -p 10023 root@localhost After login vm(virtual machine) successfully, you could transfer reproduced binary to the vm by below way, and reproduce the problem in vm: gcc -pthread -o repro repro.c scp -P 10023 repro root@localhost:/root/ Get the bzImage for target kernel: Please use target kconfig and copy it to kernel_src/.config make olddefconfig make -jx bzImage //x should equal or less than cpu num your pc has Fill the bzImage file into above start3.sh to load the target kernel in vm. Tips: If you already have qemu-system-x86_64, please ignore below info. If you want to install qemu v7.1.0 version: git clone https://github.com/qemu/qemu.git cd qemu git checkout -f v7.1.0 mkdir build cd build yum install -y ninja-build.x86_64 yum -y install libslirp-devel.x86_64 ../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp make make install Best Regards, Thanks! On 2024-07-29 at 12:50:35 +0100, Lorenzo Stoakes wrote: > This patch forms part of a patch series intending to separate out VMA logic > and render it testable from userspace, which requires that core > manipulation functions be exposed in an mm/-internal header file. > > In order to do this, we must abstract APIs we wish to test, in this > instance functions which ultimately invoke vma_modify(). > > This patch therefore moves all logic which ultimately invokes vma_modify() > to mm/userfaultfd.c, trying to transfer code at a functional granularity > where possible. > > Reviewed-by: Vlastimil Babka <vbabka@suse.cz> > Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > fs/userfaultfd.c | 160 +++----------------------------- > include/linux/userfaultfd_k.h | 19 ++++ > mm/userfaultfd.c | 168 ++++++++++++++++++++++++++++++++++ > 3 files changed, 198 insertions(+), 149 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index 27a3e9285fbf..b3ed7207df7e 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -104,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) > return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; > } > > -static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, > - vm_flags_t flags) > -{ > - const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; > - > - vm_flags_reset(vma, flags); > - /* > - * For shared mappings, we want to enable writenotify while > - * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply > - * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. > - */ > - if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) > - vma_set_page_prot(vma); > -} > - > static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, > int wake_flags, void *key) > { > @@ -615,22 +600,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, > spin_unlock_irq(&ctx->event_wqh.lock); > > if (release_new_ctx) { > - struct vm_area_struct *vma; > - struct mm_struct *mm = release_new_ctx->mm; > - VMA_ITERATOR(vmi, mm, 0); > - > - /* the various vma->vm_userfaultfd_ctx still points to it */ > - mmap_write_lock(mm); > - for_each_vma(vmi, vma) { > - if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { > - vma_start_write(vma); > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > - userfaultfd_set_vm_flags(vma, > - vma->vm_flags & ~__VM_UFFD_FLAGS); > - } > - } > - mmap_write_unlock(mm); > - > + userfaultfd_release_new(release_new_ctx); > userfaultfd_ctx_put(release_new_ctx); > } > > @@ -662,9 +632,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) > return 0; > > if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { > - vma_start_write(vma); > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); > + userfaultfd_reset_ctx(vma); > return 0; > } > > @@ -749,9 +717,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, > up_write(&ctx->map_changing_lock); > } else { > /* Drop uffd context if remap feature not enabled */ > - vma_start_write(vma); > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); > + userfaultfd_reset_ctx(vma); > } > } > > @@ -870,53 +836,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) > { > struct userfaultfd_ctx *ctx = file->private_data; > struct mm_struct *mm = ctx->mm; > - struct vm_area_struct *vma, *prev; > /* len == 0 means wake all */ > struct userfaultfd_wake_range range = { .len = 0, }; > - unsigned long new_flags; > - VMA_ITERATOR(vmi, mm, 0); > > WRITE_ONCE(ctx->released, true); > > - if (!mmget_not_zero(mm)) > - goto wakeup; > - > - /* > - * Flush page faults out of all CPUs. NOTE: all page faults > - * must be retried without returning VM_FAULT_SIGBUS if > - * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx > - * changes while handle_userfault released the mmap_lock. So > - * it's critical that released is set to true (above), before > - * taking the mmap_lock for writing. > - */ > - mmap_write_lock(mm); > - prev = NULL; > - for_each_vma(vmi, vma) { > - cond_resched(); > - BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ > - !!(vma->vm_flags & __VM_UFFD_FLAGS)); > - if (vma->vm_userfaultfd_ctx.ctx != ctx) { > - prev = vma; > - continue; > - } > - /* Reset ptes for the whole vma range if wr-protected */ > - if (userfaultfd_wp(vma)) > - uffd_wp_range(vma, vma->vm_start, > - vma->vm_end - vma->vm_start, false); > - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; > - vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, > - vma->vm_end, new_flags, > - NULL_VM_UFFD_CTX); > - > - vma_start_write(vma); > - userfaultfd_set_vm_flags(vma, new_flags); > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > + userfaultfd_release_all(mm, ctx); > > - prev = vma; > - } > - mmap_write_unlock(mm); > - mmput(mm); > -wakeup: > /* > * After no new page faults can wait on this fault_*wqh, flush > * the last page faults that may have been already waiting on > @@ -1293,14 +1219,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > unsigned long arg) > { > struct mm_struct *mm = ctx->mm; > - struct vm_area_struct *vma, *prev, *cur; > + struct vm_area_struct *vma, *cur; > int ret; > struct uffdio_register uffdio_register; > struct uffdio_register __user *user_uffdio_register; > - unsigned long vm_flags, new_flags; > + unsigned long vm_flags; > bool found; > bool basic_ioctls; > - unsigned long start, end, vma_end; > + unsigned long start, end; > struct vma_iterator vmi; > bool wp_async = userfaultfd_wp_async_ctx(ctx); > > @@ -1428,57 +1354,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > } for_each_vma_range(vmi, cur, end); > BUG_ON(!found); > > - vma_iter_set(&vmi, start); > - prev = vma_prev(&vmi); > - if (vma->vm_start < start) > - prev = vma; > - > - ret = 0; > - for_each_vma_range(vmi, vma, end) { > - cond_resched(); > - > - BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); > - BUG_ON(vma->vm_userfaultfd_ctx.ctx && > - vma->vm_userfaultfd_ctx.ctx != ctx); > - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); > - > - /* > - * Nothing to do: this vma is already registered into this > - * userfaultfd and with the right tracking mode too. > - */ > - if (vma->vm_userfaultfd_ctx.ctx == ctx && > - (vma->vm_flags & vm_flags) == vm_flags) > - goto skip; > - > - if (vma->vm_start > start) > - start = vma->vm_start; > - vma_end = min(end, vma->vm_end); > - > - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; > - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, > - new_flags, > - (struct vm_userfaultfd_ctx){ctx}); > - if (IS_ERR(vma)) { > - ret = PTR_ERR(vma); > - break; > - } > - > - /* > - * In the vma_merge() successful mprotect-like case 8: > - * the next vma was merged into the current one and > - * the current one has not been updated yet. > - */ > - vma_start_write(vma); > - userfaultfd_set_vm_flags(vma, new_flags); > - vma->vm_userfaultfd_ctx.ctx = ctx; > - > - if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) > - hugetlb_unshare_all_pmds(vma); > - > - skip: > - prev = vma; > - start = vma->vm_end; > - } > + ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, > + wp_async); > > out_unlock: > mmap_write_unlock(mm); > @@ -1519,7 +1396,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, > struct vm_area_struct *vma, *prev, *cur; > int ret; > struct uffdio_range uffdio_unregister; > - unsigned long new_flags; > bool found; > unsigned long start, end, vma_end; > const void __user *buf = (void __user *)arg; > @@ -1622,27 +1498,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, > wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); > } > > - /* Reset ptes for the whole vma range if wr-protected */ > - if (userfaultfd_wp(vma)) > - uffd_wp_range(vma, start, vma_end - start, false); > - > - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; > - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, > - new_flags, NULL_VM_UFFD_CTX); > + vma = userfaultfd_clear_vma(&vmi, prev, vma, > + start, vma_end); > if (IS_ERR(vma)) { > ret = PTR_ERR(vma); > break; > } > > - /* > - * In the vma_merge() successful mprotect-like case 8: > - * the next vma was merged into the current one and > - * the current one has not been updated yet. > - */ > - vma_start_write(vma); > - userfaultfd_set_vm_flags(vma, new_flags); > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > - > skip: > prev = vma; > start = vma->vm_end; > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > index a12bcf042551..9fc6ce15c499 100644 > --- a/include/linux/userfaultfd_k.h > +++ b/include/linux/userfaultfd_k.h > @@ -267,6 +267,25 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, > extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); > extern bool userfaultfd_wp_async(struct vm_area_struct *vma); > > +void userfaultfd_reset_ctx(struct vm_area_struct *vma); > + > +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, > + struct vm_area_struct *prev, > + struct vm_area_struct *vma, > + unsigned long start, > + unsigned long end); > + > +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, > + struct vm_area_struct *vma, > + unsigned long vm_flags, > + unsigned long start, unsigned long end, > + bool wp_async); > + > +void userfaultfd_release_new(struct userfaultfd_ctx *ctx); > + > +void userfaultfd_release_all(struct mm_struct *mm, > + struct userfaultfd_ctx *ctx); > + > #else /* CONFIG_USERFAULTFD */ > > /* mm helpers */ > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index e54e5c8907fa..3b7715ecf292 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -1760,3 +1760,171 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, > VM_WARN_ON(!moved && !err); > return moved ? moved : err; > } > + > +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, > + vm_flags_t flags) > +{ > + const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; > + > + vm_flags_reset(vma, flags); > + /* > + * For shared mappings, we want to enable writenotify while > + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply > + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. > + */ > + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) > + vma_set_page_prot(vma); > +} > + > +static void userfaultfd_set_ctx(struct vm_area_struct *vma, > + struct userfaultfd_ctx *ctx, > + unsigned long flags) > +{ > + vma_start_write(vma); > + vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; > + userfaultfd_set_vm_flags(vma, > + (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); > +} > + > +void userfaultfd_reset_ctx(struct vm_area_struct *vma) > +{ > + userfaultfd_set_ctx(vma, NULL, 0); > +} > + > +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, > + struct vm_area_struct *prev, > + struct vm_area_struct *vma, > + unsigned long start, > + unsigned long end) > +{ > + struct vm_area_struct *ret; > + > + /* Reset ptes for the whole vma range if wr-protected */ > + if (userfaultfd_wp(vma)) > + uffd_wp_range(vma, start, end - start, false); > + > + ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, > + vma->vm_flags & ~__VM_UFFD_FLAGS, > + NULL_VM_UFFD_CTX); > + > + /* > + * In the vma_merge() successful mprotect-like case 8: > + * the next vma was merged into the current one and > + * the current one has not been updated yet. > + */ > + if (!IS_ERR(ret)) > + userfaultfd_reset_ctx(vma); > + > + return ret; > +} > + > +/* Assumes mmap write lock taken, and mm_struct pinned. */ > +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, > + struct vm_area_struct *vma, > + unsigned long vm_flags, > + unsigned long start, unsigned long end, > + bool wp_async) > +{ > + VMA_ITERATOR(vmi, ctx->mm, start); > + struct vm_area_struct *prev = vma_prev(&vmi); > + unsigned long vma_end; > + unsigned long new_flags; > + > + if (vma->vm_start < start) > + prev = vma; > + > + for_each_vma_range(vmi, vma, end) { > + cond_resched(); > + > + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); > + BUG_ON(vma->vm_userfaultfd_ctx.ctx && > + vma->vm_userfaultfd_ctx.ctx != ctx); > + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); > + > + /* > + * Nothing to do: this vma is already registered into this > + * userfaultfd and with the right tracking mode too. > + */ > + if (vma->vm_userfaultfd_ctx.ctx == ctx && > + (vma->vm_flags & vm_flags) == vm_flags) > + goto skip; > + > + if (vma->vm_start > start) > + start = vma->vm_start; > + vma_end = min(end, vma->vm_end); > + > + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; > + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, > + new_flags, > + (struct vm_userfaultfd_ctx){ctx}); > + if (IS_ERR(vma)) > + return PTR_ERR(vma); > + > + /* > + * In the vma_merge() successful mprotect-like case 8: > + * the next vma was merged into the current one and > + * the current one has not been updated yet. > + */ > + userfaultfd_set_ctx(vma, ctx, vm_flags); > + > + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) > + hugetlb_unshare_all_pmds(vma); > + > +skip: > + prev = vma; > + start = vma->vm_end; > + } > + > + return 0; > +} > + > +void userfaultfd_release_new(struct userfaultfd_ctx *ctx) > +{ > + struct mm_struct *mm = ctx->mm; > + struct vm_area_struct *vma; > + VMA_ITERATOR(vmi, mm, 0); > + > + /* the various vma->vm_userfaultfd_ctx still points to it */ > + mmap_write_lock(mm); > + for_each_vma(vmi, vma) { > + if (vma->vm_userfaultfd_ctx.ctx == ctx) > + userfaultfd_reset_ctx(vma); > + } > + mmap_write_unlock(mm); > +} > + > +void userfaultfd_release_all(struct mm_struct *mm, > + struct userfaultfd_ctx *ctx) > +{ > + struct vm_area_struct *vma, *prev; > + VMA_ITERATOR(vmi, mm, 0); > + > + if (!mmget_not_zero(mm)) > + return; > + > + /* > + * Flush page faults out of all CPUs. NOTE: all page faults > + * must be retried without returning VM_FAULT_SIGBUS if > + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx > + * changes while handle_userfault released the mmap_lock. So > + * it's critical that released is set to true (above), before > + * taking the mmap_lock for writing. > + */ > + mmap_write_lock(mm); > + prev = NULL; > + for_each_vma(vmi, vma) { > + cond_resched(); > + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ > + !!(vma->vm_flags & __VM_UFFD_FLAGS)); > + if (vma->vm_userfaultfd_ctx.ctx != ctx) { > + prev = vma; > + continue; > + } > + > + vma = userfaultfd_clear_vma(&vmi, prev, vma, > + vma->vm_start, vma->vm_end); > + prev = vma; > + } > + mmap_write_unlock(mm); > + mmput(mm); > +} > -- > 2.45.2 >
On Wed, Aug 07, 2024 at 11:45:56AM GMT, Pengfei Xu wrote: > Hi Lorenzo Stoakes, > > Greetings! > > I used syzkaller and found > KASAN: slab-use-after-free Read in userfaultfd_set_ctx in next-20240805. > > Bisected the first bad commit: > 4651ba8201cf userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c Hi, Thanks for this, I will investigate as a priority and work up a fix. As it's simply a refactor I suspect this should be relatively straightforward. > > All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240806_122723_userfaultfd_set_ctx > Syzkaller repro code: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/repro.c > Syzkaller repro syscall steps: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/repro.prog > Syzkaller analysis report: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/repro.report > Kconfig(make olddefconfig): https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/kconfig_origin > Bisect info: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/bisect_info.log > Dmesg: https://github.com/xupengfe/syzkaller_logs/blob/main/240806_122723_userfaultfd_set_ctx/d6dbc9f56c3a70e915625b6f1887882c23dc5c91_dmesg.log > bzImage: https://github.com/xupengfe/syzkaller_logs/raw/main/240806_122723_userfaultfd_set_ctx/bzImage_d6dbc9f56c3a70e915625b6f1887882c23dc5c91.tar.gz > > " > [ 29.675551] ================================================================== > [ 29.676133] BUG: KASAN: slab-use-after-free in userfaultfd_set_ctx+0x31c/0x360 > [ 29.676716] Read of size 8 at addr ffff888027c5f100 by task repro/1498 > [ 29.677218] > [ 29.677358] CPU: 0 UID: 0 PID: 1498 Comm: repro Not tainted 6.11.0-rc2-next-20240805-d6dbc9f56c3a #1 > [ 29.678053] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 > [ 29.678910] Call Trace: > [ 29.679117] <TASK> > [ 29.679296] dump_stack_lvl+0xea/0x150 > [ 29.679622] print_report+0xce/0x610 > [ 29.679924] ? userfaultfd_set_ctx+0x31c/0x360 > [ 29.680289] ? kasan_complete_mode_report_info+0x80/0x200 > [ 29.680716] ? userfaultfd_set_ctx+0x31c/0x360 > [ 29.681077] kasan_report+0xcc/0x110 > [ 29.681372] ? userfaultfd_set_ctx+0x31c/0x360 > [ 29.681729] __asan_report_load8_noabort+0x18/0x20 > [ 29.682118] userfaultfd_set_ctx+0x31c/0x360 > [ 29.682465] userfaultfd_clear_vma+0x104/0x190 > [ 29.682826] userfaultfd_release_all+0x294/0x4a0 > [ 29.683201] ? __pfx_userfaultfd_release_all+0x10/0x10 > [ 29.683615] ? __this_cpu_preempt_check+0x21/0x30 > [ 29.684003] ? __pfx_userfaultfd_release+0x10/0x10 > [ 29.684389] userfaultfd_release+0x112/0x1e0 > [ 29.684735] ? __pfx_userfaultfd_release+0x10/0x10 > [ 29.685114] ? evm_file_release+0x193/0x1f0 > [ 29.685454] __fput+0x426/0xbc0 > [ 29.685719] ? __sanitizer_cov_trace_const_cmp2+0x1c/0x30 > [ 29.686153] __fput_sync+0x58/0x70 > [ 29.686435] __x64_sys_close+0x93/0x120 > [ 29.686744] x64_sys_call+0x189a/0x20d0 > [ 29.687066] do_syscall_64+0x6d/0x140 > [ 29.687371] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 29.687779] RIP: 0033:0x7f3b67b3f247 > [ 29.688078] Code: ff e8 cd e3 01 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 c3 c9 f5 ff > [ 29.689519] RSP: 002b:00007ffd1f0ac7d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 > [ 29.690140] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3b67b3f247 > [ 29.690717] RDX: 0000000020000100 RSI: 000000008010aa01 RDI: 0000000000000003 > [ 29.691270] RBP: 00007ffd1f0ac7f0 R08: 00007ffd1f0ac7f0 R09: 00007ffd1f0ac7f0 > [ 29.691822] R10: 00007ffd1f0ac7f0 R11: 0000000000000246 R12: 00007ffd1f0ac968 > [ 29.692375] R13: 0000000000401bf9 R14: 0000000000403e08 R15: 00007f3b67c72000 > [ 29.692939] </TASK> > [ 29.693124] > [ 29.693258] Allocated by task 1498: > [ 29.693545] kasan_save_stack+0x2c/0x60 > [ 29.693875] kasan_save_track+0x18/0x40 > [ 29.694205] kasan_save_alloc_info+0x3c/0x50 > [ 29.694576] __kasan_slab_alloc+0x62/0x80 > [ 29.694921] kmem_cache_alloc_noprof+0x114/0x370 > [ 29.695319] vm_area_dup+0x2a/0x1b0 > [ 29.695630] __split_vma+0x188/0x1020 > [ 29.695952] vma_modify+0x1fc/0x390 > [ 29.696250] userfaultfd_clear_vma+0xd4/0x190 > [ 29.696609] userfaultfd_ioctl+0x3c0b/0x4560 > [ 29.696964] __x64_sys_ioctl+0x1b9/0x230 > [ 29.697295] x64_sys_call+0x1209/0x20d0 > [ 29.697620] do_syscall_64+0x6d/0x140 > [ 29.697937] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 29.698364] > [ 29.698508] Freed by task 1505: > [ 29.698779] kasan_save_stack+0x2c/0x60 > [ 29.699110] kasan_save_track+0x18/0x40 > [ 29.699441] kasan_save_free_info+0x3f/0x60 > [ 29.699797] __kasan_slab_free+0x47/0x60 > [ 29.700137] kmem_cache_free+0x2f2/0x4b0 > [ 29.700471] vm_area_free_rcu_cb+0x7f/0xa0 > [ 29.700819] rcu_core+0x877/0x18f0 > [ 29.701123] rcu_core_si+0x12/0x20 > [ 29.701421] handle_softirqs+0x1c7/0x870 > [ 29.701760] __irq_exit_rcu+0xa9/0x120 > [ 29.702082] irq_exit_rcu+0x12/0x30 > [ 29.702386] sysvec_apic_timer_interrupt+0xa5/0xc0 > [ 29.702802] asm_sysvec_apic_timer_interrupt+0x1f/0x30 > [ 29.703237] > [ 29.703377] Last potentially related work creation: > [ 29.703782] kasan_save_stack+0x2c/0x60 > [ 29.704114] __kasan_record_aux_stack+0x93/0xb0 > [ 29.704503] kasan_record_aux_stack_noalloc+0xf/0x20 > [ 29.704923] __call_rcu_common.constprop.0+0x72/0x6b0 > [ 29.705349] call_rcu+0x12/0x20 > [ 29.705625] vm_area_free+0x26/0x30 > [ 29.705928] vma_complete+0x57e/0xf60 > [ 29.706245] vma_merge+0x166b/0x3540 > [ 29.706555] vma_modify+0x9f/0x390 > [ 29.706853] userfaultfd_clear_vma+0xd4/0x190 > [ 29.707227] userfaultfd_release_all+0x294/0x4a0 > [ 29.707621] userfaultfd_release+0x112/0x1e0 > [ 29.707991] __fput+0x426/0xbc0 > [ 29.708267] __fput_sync+0x58/0x70 > [ 29.708563] __x64_sys_close+0x93/0x120 > [ 29.708891] x64_sys_call+0x189a/0x20d0 > [ 29.709220] do_syscall_64+0x6d/0x140 > [ 29.709538] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 29.709965] > [ 29.710105] The buggy address belongs to the object at ffff888027c5f0f0 > [ 29.710105] which belongs to the cache vm_area_struct of size 176 > [ 29.711130] The buggy address is located 16 bytes inside of > [ 29.711130] freed 176-byte region [ffff888027c5f0f0, ffff888027c5f1a0) > [ 29.712104] > [ 29.712245] The buggy address belongs to the physical page: > [ 29.712703] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x27c5f > [ 29.713349] memcg:ffff8880198eaa01 > [ 29.713639] flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) > [ 29.714173] page_type: 0xfdffffff(slab) > [ 29.714507] raw: 000fffffc0000000 ffff88800d319dc0 dead000000000122 0000000000000000 > [ 29.715137] raw: 0000000000000000 0000000000110011 00000001fdffffff ffff8880198eaa01 > [ 29.715765] page dumped because: kasan: bad access detected > [ 29.716220] > [ 29.716360] Memory state around the buggy address: > [ 29.716756] ffff888027c5f000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > [ 29.717349] ffff888027c5f080: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fa fb > [ 29.717940] >ffff888027c5f100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > [ 29.718521] ^ > [ 29.718796] ffff888027c5f180: fb fb fb fb fc fc fc fc fc fc fc fc 00 00 00 00 > [ 29.719388] ffff888027c5f200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > [ 29.720025] ================================================================== > [ 29.720671] Disabling lock debugging due to kernel taint > " > > Thanks! > > --- > > If you don't need the following environment to reproduce the problem or if you > already have one reproduced environment, please ignore the following information. > > How to reproduce: > git clone https://gitlab.com/xupengfe/repro_vm_env.git > cd repro_vm_env > tar -xvf repro_vm_env.tar.gz > cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0 > // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel > // You could change the bzImage_xxx as you want > // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version > You could use below command to log in, there is no password for root. > ssh -p 10023 root@localhost > > After login vm(virtual machine) successfully, you could transfer reproduced > binary to the vm by below way, and reproduce the problem in vm: > gcc -pthread -o repro repro.c > scp -P 10023 repro root@localhost:/root/ > > Get the bzImage for target kernel: > Please use target kconfig and copy it to kernel_src/.config > make olddefconfig > make -jx bzImage //x should equal or less than cpu num your pc has > > Fill the bzImage file into above start3.sh to load the target kernel in vm. > > > Tips: > If you already have qemu-system-x86_64, please ignore below info. > If you want to install qemu v7.1.0 version: > git clone https://github.com/qemu/qemu.git > cd qemu > git checkout -f v7.1.0 > mkdir build > cd build > yum install -y ninja-build.x86_64 > yum -y install libslirp-devel.x86_64 > ../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp > make > make install > > Best Regards, > Thanks! > > > On 2024-07-29 at 12:50:35 +0100, Lorenzo Stoakes wrote: > > This patch forms part of a patch series intending to separate out VMA logic > > and render it testable from userspace, which requires that core > > manipulation functions be exposed in an mm/-internal header file. > > > > In order to do this, we must abstract APIs we wish to test, in this > > instance functions which ultimately invoke vma_modify(). > > > > This patch therefore moves all logic which ultimately invokes vma_modify() > > to mm/userfaultfd.c, trying to transfer code at a functional granularity > > where possible. > > > > Reviewed-by: Vlastimil Babka <vbabka@suse.cz> > > Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > --- > > fs/userfaultfd.c | 160 +++----------------------------- > > include/linux/userfaultfd_k.h | 19 ++++ > > mm/userfaultfd.c | 168 ++++++++++++++++++++++++++++++++++ > > 3 files changed, 198 insertions(+), 149 deletions(-) > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > index 27a3e9285fbf..b3ed7207df7e 100644 > > --- a/fs/userfaultfd.c > > +++ b/fs/userfaultfd.c > > @@ -104,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) > > return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; > > } > > > > -static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, > > - vm_flags_t flags) > > -{ > > - const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; > > - > > - vm_flags_reset(vma, flags); > > - /* > > - * For shared mappings, we want to enable writenotify while > > - * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply > > - * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. > > - */ > > - if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) > > - vma_set_page_prot(vma); > > -} > > - > > static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, > > int wake_flags, void *key) > > { > > @@ -615,22 +600,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, > > spin_unlock_irq(&ctx->event_wqh.lock); > > > > if (release_new_ctx) { > > - struct vm_area_struct *vma; > > - struct mm_struct *mm = release_new_ctx->mm; > > - VMA_ITERATOR(vmi, mm, 0); > > - > > - /* the various vma->vm_userfaultfd_ctx still points to it */ > > - mmap_write_lock(mm); > > - for_each_vma(vmi, vma) { > > - if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { > > - vma_start_write(vma); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > - userfaultfd_set_vm_flags(vma, > > - vma->vm_flags & ~__VM_UFFD_FLAGS); > > - } > > - } > > - mmap_write_unlock(mm); > > - > > + userfaultfd_release_new(release_new_ctx); > > userfaultfd_ctx_put(release_new_ctx); > > } > > > > @@ -662,9 +632,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) > > return 0; > > > > if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { > > - vma_start_write(vma); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); > > + userfaultfd_reset_ctx(vma); > > return 0; > > } > > > > @@ -749,9 +717,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, > > up_write(&ctx->map_changing_lock); > > } else { > > /* Drop uffd context if remap feature not enabled */ > > - vma_start_write(vma); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); > > + userfaultfd_reset_ctx(vma); > > } > > } > > > > @@ -870,53 +836,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) > > { > > struct userfaultfd_ctx *ctx = file->private_data; > > struct mm_struct *mm = ctx->mm; > > - struct vm_area_struct *vma, *prev; > > /* len == 0 means wake all */ > > struct userfaultfd_wake_range range = { .len = 0, }; > > - unsigned long new_flags; > > - VMA_ITERATOR(vmi, mm, 0); > > > > WRITE_ONCE(ctx->released, true); > > > > - if (!mmget_not_zero(mm)) > > - goto wakeup; > > - > > - /* > > - * Flush page faults out of all CPUs. NOTE: all page faults > > - * must be retried without returning VM_FAULT_SIGBUS if > > - * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx > > - * changes while handle_userfault released the mmap_lock. So > > - * it's critical that released is set to true (above), before > > - * taking the mmap_lock for writing. > > - */ > > - mmap_write_lock(mm); > > - prev = NULL; > > - for_each_vma(vmi, vma) { > > - cond_resched(); > > - BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ > > - !!(vma->vm_flags & __VM_UFFD_FLAGS)); > > - if (vma->vm_userfaultfd_ctx.ctx != ctx) { > > - prev = vma; > > - continue; > > - } > > - /* Reset ptes for the whole vma range if wr-protected */ > > - if (userfaultfd_wp(vma)) > > - uffd_wp_range(vma, vma->vm_start, > > - vma->vm_end - vma->vm_start, false); > > - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; > > - vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, > > - vma->vm_end, new_flags, > > - NULL_VM_UFFD_CTX); > > - > > - vma_start_write(vma); > > - userfaultfd_set_vm_flags(vma, new_flags); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > + userfaultfd_release_all(mm, ctx); > > > > - prev = vma; > > - } > > - mmap_write_unlock(mm); > > - mmput(mm); > > -wakeup: > > /* > > * After no new page faults can wait on this fault_*wqh, flush > > * the last page faults that may have been already waiting on > > @@ -1293,14 +1219,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > > unsigned long arg) > > { > > struct mm_struct *mm = ctx->mm; > > - struct vm_area_struct *vma, *prev, *cur; > > + struct vm_area_struct *vma, *cur; > > int ret; > > struct uffdio_register uffdio_register; > > struct uffdio_register __user *user_uffdio_register; > > - unsigned long vm_flags, new_flags; > > + unsigned long vm_flags; > > bool found; > > bool basic_ioctls; > > - unsigned long start, end, vma_end; > > + unsigned long start, end; > > struct vma_iterator vmi; > > bool wp_async = userfaultfd_wp_async_ctx(ctx); > > > > @@ -1428,57 +1354,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, > > } for_each_vma_range(vmi, cur, end); > > BUG_ON(!found); > > > > - vma_iter_set(&vmi, start); > > - prev = vma_prev(&vmi); > > - if (vma->vm_start < start) > > - prev = vma; > > - > > - ret = 0; > > - for_each_vma_range(vmi, vma, end) { > > - cond_resched(); > > - > > - BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); > > - BUG_ON(vma->vm_userfaultfd_ctx.ctx && > > - vma->vm_userfaultfd_ctx.ctx != ctx); > > - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); > > - > > - /* > > - * Nothing to do: this vma is already registered into this > > - * userfaultfd and with the right tracking mode too. > > - */ > > - if (vma->vm_userfaultfd_ctx.ctx == ctx && > > - (vma->vm_flags & vm_flags) == vm_flags) > > - goto skip; > > - > > - if (vma->vm_start > start) > > - start = vma->vm_start; > > - vma_end = min(end, vma->vm_end); > > - > > - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; > > - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, > > - new_flags, > > - (struct vm_userfaultfd_ctx){ctx}); > > - if (IS_ERR(vma)) { > > - ret = PTR_ERR(vma); > > - break; > > - } > > - > > - /* > > - * In the vma_merge() successful mprotect-like case 8: > > - * the next vma was merged into the current one and > > - * the current one has not been updated yet. > > - */ > > - vma_start_write(vma); > > - userfaultfd_set_vm_flags(vma, new_flags); > > - vma->vm_userfaultfd_ctx.ctx = ctx; > > - > > - if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) > > - hugetlb_unshare_all_pmds(vma); > > - > > - skip: > > - prev = vma; > > - start = vma->vm_end; > > - } > > + ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, > > + wp_async); > > > > out_unlock: > > mmap_write_unlock(mm); > > @@ -1519,7 +1396,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, > > struct vm_area_struct *vma, *prev, *cur; > > int ret; > > struct uffdio_range uffdio_unregister; > > - unsigned long new_flags; > > bool found; > > unsigned long start, end, vma_end; > > const void __user *buf = (void __user *)arg; > > @@ -1622,27 +1498,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, > > wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); > > } > > > > - /* Reset ptes for the whole vma range if wr-protected */ > > - if (userfaultfd_wp(vma)) > > - uffd_wp_range(vma, start, vma_end - start, false); > > - > > - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; > > - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, > > - new_flags, NULL_VM_UFFD_CTX); > > + vma = userfaultfd_clear_vma(&vmi, prev, vma, > > + start, vma_end); > > if (IS_ERR(vma)) { > > ret = PTR_ERR(vma); > > break; > > } > > > > - /* > > - * In the vma_merge() successful mprotect-like case 8: > > - * the next vma was merged into the current one and > > - * the current one has not been updated yet. > > - */ > > - vma_start_write(vma); > > - userfaultfd_set_vm_flags(vma, new_flags); > > - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; > > - > > skip: > > prev = vma; > > start = vma->vm_end; > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > > index a12bcf042551..9fc6ce15c499 100644 > > --- a/include/linux/userfaultfd_k.h > > +++ b/include/linux/userfaultfd_k.h > > @@ -267,6 +267,25 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, > > extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); > > extern bool userfaultfd_wp_async(struct vm_area_struct *vma); > > > > +void userfaultfd_reset_ctx(struct vm_area_struct *vma); > > + > > +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, > > + unsigned long end); > > + > > +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, > > + struct vm_area_struct *vma, > > + unsigned long vm_flags, > > + unsigned long start, unsigned long end, > > + bool wp_async); > > + > > +void userfaultfd_release_new(struct userfaultfd_ctx *ctx); > > + > > +void userfaultfd_release_all(struct mm_struct *mm, > > + struct userfaultfd_ctx *ctx); > > + > > #else /* CONFIG_USERFAULTFD */ > > > > /* mm helpers */ > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > > index e54e5c8907fa..3b7715ecf292 100644 > > --- a/mm/userfaultfd.c > > +++ b/mm/userfaultfd.c > > @@ -1760,3 +1760,171 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, > > VM_WARN_ON(!moved && !err); > > return moved ? moved : err; > > } > > + > > +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, > > + vm_flags_t flags) > > +{ > > + const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; > > + > > + vm_flags_reset(vma, flags); > > + /* > > + * For shared mappings, we want to enable writenotify while > > + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply > > + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. > > + */ > > + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) > > + vma_set_page_prot(vma); > > +} > > + > > +static void userfaultfd_set_ctx(struct vm_area_struct *vma, > > + struct userfaultfd_ctx *ctx, > > + unsigned long flags) > > +{ > > + vma_start_write(vma); > > + vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; > > + userfaultfd_set_vm_flags(vma, > > + (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); > > +} > > + > > +void userfaultfd_reset_ctx(struct vm_area_struct *vma) > > +{ > > + userfaultfd_set_ctx(vma, NULL, 0); > > +} > > + > > +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, > > + unsigned long end) > > +{ > > + struct vm_area_struct *ret; > > + > > + /* Reset ptes for the whole vma range if wr-protected */ > > + if (userfaultfd_wp(vma)) > > + uffd_wp_range(vma, start, end - start, false); > > + > > + ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, > > + vma->vm_flags & ~__VM_UFFD_FLAGS, > > + NULL_VM_UFFD_CTX); > > + > > + /* > > + * In the vma_merge() successful mprotect-like case 8: > > + * the next vma was merged into the current one and > > + * the current one has not been updated yet. > > + */ > > + if (!IS_ERR(ret)) > > + userfaultfd_reset_ctx(vma); > > + > > + return ret; > > +} > > + > > +/* Assumes mmap write lock taken, and mm_struct pinned. */ > > +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, > > + struct vm_area_struct *vma, > > + unsigned long vm_flags, > > + unsigned long start, unsigned long end, > > + bool wp_async) > > +{ > > + VMA_ITERATOR(vmi, ctx->mm, start); > > + struct vm_area_struct *prev = vma_prev(&vmi); > > + unsigned long vma_end; > > + unsigned long new_flags; > > + > > + if (vma->vm_start < start) > > + prev = vma; > > + > > + for_each_vma_range(vmi, vma, end) { > > + cond_resched(); > > + > > + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); > > + BUG_ON(vma->vm_userfaultfd_ctx.ctx && > > + vma->vm_userfaultfd_ctx.ctx != ctx); > > + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); > > + > > + /* > > + * Nothing to do: this vma is already registered into this > > + * userfaultfd and with the right tracking mode too. > > + */ > > + if (vma->vm_userfaultfd_ctx.ctx == ctx && > > + (vma->vm_flags & vm_flags) == vm_flags) > > + goto skip; > > + > > + if (vma->vm_start > start) > > + start = vma->vm_start; > > + vma_end = min(end, vma->vm_end); > > + > > + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; > > + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, > > + new_flags, > > + (struct vm_userfaultfd_ctx){ctx}); > > + if (IS_ERR(vma)) > > + return PTR_ERR(vma); > > + > > + /* > > + * In the vma_merge() successful mprotect-like case 8: > > + * the next vma was merged into the current one and > > + * the current one has not been updated yet. > > + */ > > + userfaultfd_set_ctx(vma, ctx, vm_flags); > > + > > + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) > > + hugetlb_unshare_all_pmds(vma); > > + > > +skip: > > + prev = vma; > > + start = vma->vm_end; > > + } > > + > > + return 0; > > +} > > + > > +void userfaultfd_release_new(struct userfaultfd_ctx *ctx) > > +{ > > + struct mm_struct *mm = ctx->mm; > > + struct vm_area_struct *vma; > > + VMA_ITERATOR(vmi, mm, 0); > > + > > + /* the various vma->vm_userfaultfd_ctx still points to it */ > > + mmap_write_lock(mm); > > + for_each_vma(vmi, vma) { > > + if (vma->vm_userfaultfd_ctx.ctx == ctx) > > + userfaultfd_reset_ctx(vma); > > + } > > + mmap_write_unlock(mm); > > +} > > + > > +void userfaultfd_release_all(struct mm_struct *mm, > > + struct userfaultfd_ctx *ctx) > > +{ > > + struct vm_area_struct *vma, *prev; > > + VMA_ITERATOR(vmi, mm, 0); > > + > > + if (!mmget_not_zero(mm)) > > + return; > > + > > + /* > > + * Flush page faults out of all CPUs. NOTE: all page faults > > + * must be retried without returning VM_FAULT_SIGBUS if > > + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx > > + * changes while handle_userfault released the mmap_lock. So > > + * it's critical that released is set to true (above), before > > + * taking the mmap_lock for writing. > > + */ > > + mmap_write_lock(mm); > > + prev = NULL; > > + for_each_vma(vmi, vma) { > > + cond_resched(); > > + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ > > + !!(vma->vm_flags & __VM_UFFD_FLAGS)); > > + if (vma->vm_userfaultfd_ctx.ctx != ctx) { > > + prev = vma; > > + continue; > > + } > > + > > + vma = userfaultfd_clear_vma(&vmi, prev, vma, > > + vma->vm_start, vma->vm_end); > > + prev = vma; > > + } > > + mmap_write_unlock(mm); > > + mmput(mm); > > +} > > -- > > 2.45.2 > >
On Wed, Aug 07, 2024 at 11:45:56AM GMT, Pengfei Xu wrote: > Hi Lorenzo Stoakes, > > Greetings! > > I used syzkaller and found > KASAN: slab-use-after-free Read in userfaultfd_set_ctx in next-20240805. > > Bisected the first bad commit: > 4651ba8201cf userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c > > All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240806_122723_userfaultfd_set_ct [snip] Andrew - As this is so small, could you take this as a fix-patch? The fix is enclosed below. Pengfei - Sorry for the delay on getting this resolved, I was struggling to repro with my usual dev setup, after trying a lot of things I ended up using the supplied repro env and was able to do so there. (I suspect that VMAs are laid out slightly differently in my usual arch base image perhaps based on tunables, and this was the delta on that!) Regardless, I was able to identify the cause - we incorrectly pass a stale pointer to userfaultfd_reset_ctx() if a merge is performed in userfaultfd_clear_vma(). This was a subtle mistake on my part, I don't see any other instances like this in the patch. Syzkaller managed to get this merge to happen and kasan picked up on it, so thank you very much for supplying the infra! The fix itself is very simple, a one-liner, enclosed below. ----8<---- From 193abd1c3a51e6bf1d85ddfe01845e9713336970 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Date: Wed, 7 Aug 2024 12:44:27 +0100 Subject: [PATCH] mm: userfaultfd: fix user-after-free in userfaultfd_clear_vma() After invoking vma_modify_flags_uffd() in userfaultfd_clear_vma(), we may have merged the vma, and depending on the kind of merge, deleted the vma, rendering the vma pointer invalid. The code incorrectly referenced this now possibly invalid vma pointer when invoking userfaultfd_reset_ctx(). If no merge is possible, vma_modify_flags_uffd() performs a split and returns the original vma. Therefore the correct approach is to simply pass the ret pointer to userfaultfd_ret_ctx(). Reported-by: Pengfei Xu <pengfei.xu@intel.com> Fixes: e310f2b78a77 ("userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c") Closes: https://lore.kernel.org/all/ZrLt9HIxV9QiZotn@xpf.sh.intel.com/ Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> --- mm/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3b7715ecf292..966e6c81a685 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1813,7 +1813,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, * the current one has not been updated yet. */ if (!IS_ERR(ret)) - userfaultfd_reset_ctx(vma); + userfaultfd_reset_ctx(ret); return ret; } -- 2.45.2
Hi Lorenzo Stoakes, On 2024-08-07 at 13:03:52 +0100, Lorenzo Stoakes wrote: > On Wed, Aug 07, 2024 at 11:45:56AM GMT, Pengfei Xu wrote: > > Hi Lorenzo Stoakes, > > > > Greetings! > > > > I used syzkaller and found > > KASAN: slab-use-after-free Read in userfaultfd_set_ctx in next-20240805. > > > > Bisected the first bad commit: > > 4651ba8201cf userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c > > > > All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240806_122723_userfaultfd_set_ct > > [snip] > > Andrew - As this is so small, could you take this as a fix-patch? The fix > is enclosed below. > > > Pengfei - Sorry for the delay on getting this resolved, I was struggling to > repro with my usual dev setup, after trying a lot of things I ended up > using the supplied repro env and was able to do so there. > > (I suspect that VMAs are laid out slightly differently in my usual arch base > image perhaps based on tunables, and this was the delta on that!) > > Regardless, I was able to identify the cause - we incorrectly pass a stale > pointer to userfaultfd_reset_ctx() if a merge is performed in > userfaultfd_clear_vma(). > > This was a subtle mistake on my part, I don't see any other instances like > this in the patch. > > Syzkaller managed to get this merge to happen and kasan picked up on it, so > thank you very much for supplying the infra! > > The fix itself is very simple, a one-liner, enclosed below. > Thanks for your patch! I tested the below patch on top of next-20240805, this issue could not be reproduced. So it's fixed. Best Regrads, Thanks! > ----8<---- > From 193abd1c3a51e6bf1d85ddfe01845e9713336970 Mon Sep 17 00:00:00 2001 > From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > Date: Wed, 7 Aug 2024 12:44:27 +0100 > Subject: [PATCH] mm: userfaultfd: fix user-after-free in > userfaultfd_clear_vma() > > After invoking vma_modify_flags_uffd() in userfaultfd_clear_vma(), we may > have merged the vma, and depending on the kind of merge, deleted the vma, > rendering the vma pointer invalid. > > The code incorrectly referenced this now possibly invalid vma pointer when > invoking userfaultfd_reset_ctx(). > > If no merge is possible, vma_modify_flags_uffd() performs a split and > returns the original vma. Therefore the correct approach is to simply pass > the ret pointer to userfaultfd_ret_ctx(). > > Reported-by: Pengfei Xu <pengfei.xu@intel.com> > Fixes: e310f2b78a77 ("userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c") > Closes: https://lore.kernel.org/all/ZrLt9HIxV9QiZotn@xpf.sh.intel.com/ > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > mm/userfaultfd.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index 3b7715ecf292..966e6c81a685 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -1813,7 +1813,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, > * the current one has not been updated yet. > */ > if (!IS_ERR(ret)) > - userfaultfd_reset_ctx(vma); > + userfaultfd_reset_ctx(ret); > > return ret; > } > -- > 2.45.2
On 8/7/24 14:03, Lorenzo Stoakes wrote: > On Wed, Aug 07, 2024 at 11:45:56AM GMT, Pengfei Xu wrote: >> Hi Lorenzo Stoakes, >> >> Greetings! >> >> I used syzkaller and found >> KASAN: slab-use-after-free Read in userfaultfd_set_ctx in next-20240805. >> >> Bisected the first bad commit: >> 4651ba8201cf userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c >> >> All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240806_122723_userfaultfd_set_ct > > [snip] > > Andrew - As this is so small, could you take this as a fix-patch? The fix > is enclosed below. > > > Pengfei - Sorry for the delay on getting this resolved, I was struggling to > repro with my usual dev setup, after trying a lot of things I ended up > using the supplied repro env and was able to do so there. > > (I suspect that VMAs are laid out slightly differently in my usual arch base > image perhaps based on tunables, and this was the delta on that!) > > Regardless, I was able to identify the cause - we incorrectly pass a stale > pointer to userfaultfd_reset_ctx() if a merge is performed in > userfaultfd_clear_vma(). > > This was a subtle mistake on my part, I don't see any other instances like > this in the patch. > > Syzkaller managed to get this merge to happen and kasan picked up on it, so > thank you very much for supplying the infra! > > The fix itself is very simple, a one-liner, enclosed below. > > ----8<---- > From 193abd1c3a51e6bf1d85ddfe01845e9713336970 Mon Sep 17 00:00:00 2001 > From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > Date: Wed, 7 Aug 2024 12:44:27 +0100 > Subject: [PATCH] mm: userfaultfd: fix user-after-free in > userfaultfd_clear_vma() > > After invoking vma_modify_flags_uffd() in userfaultfd_clear_vma(), we may > have merged the vma, and depending on the kind of merge, deleted the vma, > rendering the vma pointer invalid. > > The code incorrectly referenced this now possibly invalid vma pointer when > invoking userfaultfd_reset_ctx(). > > If no merge is possible, vma_modify_flags_uffd() performs a split and > returns the original vma. Therefore the correct approach is to simply pass > the ret pointer to userfaultfd_ret_ctx(). > > Reported-by: Pengfei Xu <pengfei.xu@intel.com> > Fixes: e310f2b78a77 ("userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c") > Closes: https://lore.kernel.org/all/ZrLt9HIxV9QiZotn@xpf.sh.intel.com/ > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> > --- > mm/userfaultfd.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index 3b7715ecf292..966e6c81a685 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -1813,7 +1813,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, > * the current one has not been updated yet. > */ > if (!IS_ERR(ret)) > - userfaultfd_reset_ctx(vma); > + userfaultfd_reset_ctx(ret); > > return ret; > } > -- > 2.45.2
Hi Lorenzo Stoakes, On 2024-08-07 at 13:03:52 +0100, Lorenzo Stoakes wrote: > On Wed, Aug 07, 2024 at 11:45:56AM GMT, Pengfei Xu wrote: > > Hi Lorenzo Stoakes, > > > > Greetings! > > > > I used syzkaller and found > > KASAN: slab-use-after-free Read in userfaultfd_set_ctx in next-20240805. > > > > Bisected the first bad commit: > > 4651ba8201cf userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c > > > > All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240806_122723_userfaultfd_set_ct > > [snip] > > Andrew - As this is so small, could you take this as a fix-patch? The fix > is enclosed below. > > > Pengfei - Sorry for the delay on getting this resolved, I was struggling to > repro with my usual dev setup, after trying a lot of things I ended up > using the supplied repro env and was able to do so there. Glad to know the repro environment is helpful. Thank you for your patch(I verified it's fixed) and it helps me learn more. > > (I suspect that VMAs are laid out slightly differently in my usual arch base > image perhaps based on tunables, and this was the delta on that!) > > Regardless, I was able to identify the cause - we incorrectly pass a stale > pointer to userfaultfd_reset_ctx() if a merge is performed in > userfaultfd_clear_vma(). > > This was a subtle mistake on my part, I don't see any other instances like > this in the patch. > > Syzkaller managed to get this merge to happen and kasan picked up on it, so > thank you very much for supplying the infra! You are welcome. :) Best Regards, Thank you! > > The fix itself is very simple, a one-liner, enclosed below. > > ----8<---- > From 193abd1c3a51e6bf1d85ddfe01845e9713336970 Mon Sep 17 00:00:00 2001 > From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > Date: Wed, 7 Aug 2024 12:44:27 +0100 > Subject: [PATCH] mm: userfaultfd: fix user-after-free in > userfaultfd_clear_vma() > > After invoking vma_modify_flags_uffd() in userfaultfd_clear_vma(), we may > have merged the vma, and depending on the kind of merge, deleted the vma, > rendering the vma pointer invalid. > > The code incorrectly referenced this now possibly invalid vma pointer when > invoking userfaultfd_reset_ctx(). > > If no merge is possible, vma_modify_flags_uffd() performs a split and > returns the original vma. Therefore the correct approach is to simply pass > the ret pointer to userfaultfd_ret_ctx(). > > Reported-by: Pengfei Xu <pengfei.xu@intel.com> > Fixes: e310f2b78a77 ("userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c") > Closes: https://lore.kernel.org/all/ZrLt9HIxV9QiZotn@xpf.sh.intel.com/ > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > mm/userfaultfd.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index 3b7715ecf292..966e6c81a685 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -1813,7 +1813,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, > * the current one has not been updated yet. > */ > if (!IS_ERR(ret)) > - userfaultfd_reset_ctx(vma); > + userfaultfd_reset_ctx(ret); > > return ret; > } > -- > 2.45.2
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 27a3e9285fbf..b3ed7207df7e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -104,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; } -static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, - vm_flags_t flags) -{ - const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; - - vm_flags_reset(vma, flags); - /* - * For shared mappings, we want to enable writenotify while - * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply - * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. - */ - if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) - vma_set_page_prot(vma); -} - static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -615,22 +600,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, spin_unlock_irq(&ctx->event_wqh.lock); if (release_new_ctx) { - struct vm_area_struct *vma; - struct mm_struct *mm = release_new_ctx->mm; - VMA_ITERATOR(vmi, mm, 0); - - /* the various vma->vm_userfaultfd_ctx still points to it */ - mmap_write_lock(mm); - for_each_vma(vmi, vma) { - if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { - vma_start_write(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - userfaultfd_set_vm_flags(vma, - vma->vm_flags & ~__VM_UFFD_FLAGS); - } - } - mmap_write_unlock(mm); - + userfaultfd_release_new(release_new_ctx); userfaultfd_ctx_put(release_new_ctx); } @@ -662,9 +632,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) return 0; if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { - vma_start_write(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); + userfaultfd_reset_ctx(vma); return 0; } @@ -749,9 +717,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, up_write(&ctx->map_changing_lock); } else { /* Drop uffd context if remap feature not enabled */ - vma_start_write(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); + userfaultfd_reset_ctx(vma); } } @@ -870,53 +836,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; struct mm_struct *mm = ctx->mm; - struct vm_area_struct *vma, *prev; /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; - unsigned long new_flags; - VMA_ITERATOR(vmi, mm, 0); WRITE_ONCE(ctx->released, true); - if (!mmget_not_zero(mm)) - goto wakeup; - - /* - * Flush page faults out of all CPUs. NOTE: all page faults - * must be retried without returning VM_FAULT_SIGBUS if - * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx - * changes while handle_userfault released the mmap_lock. So - * it's critical that released is set to true (above), before - * taking the mmap_lock for writing. - */ - mmap_write_lock(mm); - prev = NULL; - for_each_vma(vmi, vma) { - cond_resched(); - BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ - !!(vma->vm_flags & __VM_UFFD_FLAGS)); - if (vma->vm_userfaultfd_ctx.ctx != ctx) { - prev = vma; - continue; - } - /* Reset ptes for the whole vma range if wr-protected */ - if (userfaultfd_wp(vma)) - uffd_wp_range(vma, vma->vm_start, - vma->vm_end - vma->vm_start, false); - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, - vma->vm_end, new_flags, - NULL_VM_UFFD_CTX); - - vma_start_write(vma); - userfaultfd_set_vm_flags(vma, new_flags); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + userfaultfd_release_all(mm, ctx); - prev = vma; - } - mmap_write_unlock(mm); - mmput(mm); -wakeup: /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on @@ -1293,14 +1219,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { struct mm_struct *mm = ctx->mm; - struct vm_area_struct *vma, *prev, *cur; + struct vm_area_struct *vma, *cur; int ret; struct uffdio_register uffdio_register; struct uffdio_register __user *user_uffdio_register; - unsigned long vm_flags, new_flags; + unsigned long vm_flags; bool found; bool basic_ioctls; - unsigned long start, end, vma_end; + unsigned long start, end; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); @@ -1428,57 +1354,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - vma_iter_set(&vmi, start); - prev = vma_prev(&vmi); - if (vma->vm_start < start) - prev = vma; - - ret = 0; - for_each_vma_range(vmi, vma, end) { - cond_resched(); - - BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); - BUG_ON(vma->vm_userfaultfd_ctx.ctx && - vma->vm_userfaultfd_ctx.ctx != ctx); - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); - - /* - * Nothing to do: this vma is already registered into this - * userfaultfd and with the right tracking mode too. - */ - if (vma->vm_userfaultfd_ctx.ctx == ctx && - (vma->vm_flags & vm_flags) == vm_flags) - goto skip; - - if (vma->vm_start > start) - start = vma->vm_start; - vma_end = min(end, vma->vm_end); - - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, - (struct vm_userfaultfd_ctx){ctx}); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - break; - } - - /* - * In the vma_merge() successful mprotect-like case 8: - * the next vma was merged into the current one and - * the current one has not been updated yet. - */ - vma_start_write(vma); - userfaultfd_set_vm_flags(vma, new_flags); - vma->vm_userfaultfd_ctx.ctx = ctx; - - if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) - hugetlb_unshare_all_pmds(vma); - - skip: - prev = vma; - start = vma->vm_end; - } + ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, + wp_async); out_unlock: mmap_write_unlock(mm); @@ -1519,7 +1396,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, *prev, *cur; int ret; struct uffdio_range uffdio_unregister; - unsigned long new_flags; bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; @@ -1622,27 +1498,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } - /* Reset ptes for the whole vma range if wr-protected */ - if (userfaultfd_wp(vma)) - uffd_wp_range(vma, start, vma_end - start, false); - - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, NULL_VM_UFFD_CTX); + vma = userfaultfd_clear_vma(&vmi, prev, vma, + start, vma_end); if (IS_ERR(vma)) { ret = PTR_ERR(vma); break; } - /* - * In the vma_merge() successful mprotect-like case 8: - * the next vma was merged into the current one and - * the current one has not been updated yet. - */ - vma_start_write(vma); - userfaultfd_set_vm_flags(vma, new_flags); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - skip: prev = vma; start = vma->vm_end; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a12bcf042551..9fc6ce15c499 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -267,6 +267,25 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); extern bool userfaultfd_wp_async(struct vm_area_struct *vma); +void userfaultfd_reset_ctx(struct vm_area_struct *vma); + +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end); + +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, + unsigned long vm_flags, + unsigned long start, unsigned long end, + bool wp_async); + +void userfaultfd_release_new(struct userfaultfd_ctx *ctx); + +void userfaultfd_release_all(struct mm_struct *mm, + struct userfaultfd_ctx *ctx); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e54e5c8907fa..3b7715ecf292 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1760,3 +1760,171 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, VM_WARN_ON(!moved && !err); return moved ? moved : err; } + +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t flags) +{ + const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; + + vm_flags_reset(vma, flags); + /* + * For shared mappings, we want to enable writenotify while + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. + */ + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) + vma_set_page_prot(vma); +} + +static void userfaultfd_set_ctx(struct vm_area_struct *vma, + struct userfaultfd_ctx *ctx, + unsigned long flags) +{ + vma_start_write(vma); + vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; + userfaultfd_set_vm_flags(vma, + (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); +} + +void userfaultfd_reset_ctx(struct vm_area_struct *vma) +{ + userfaultfd_set_ctx(vma, NULL, 0); +} + +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + struct vm_area_struct *ret; + + /* Reset ptes for the whole vma range if wr-protected */ + if (userfaultfd_wp(vma)) + uffd_wp_range(vma, start, end - start, false); + + ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, + vma->vm_flags & ~__VM_UFFD_FLAGS, + NULL_VM_UFFD_CTX); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + if (!IS_ERR(ret)) + userfaultfd_reset_ctx(vma); + + return ret; +} + +/* Assumes mmap write lock taken, and mm_struct pinned. */ +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, + unsigned long vm_flags, + unsigned long start, unsigned long end, + bool wp_async) +{ + VMA_ITERATOR(vmi, ctx->mm, start); + struct vm_area_struct *prev = vma_prev(&vmi); + unsigned long vma_end; + unsigned long new_flags; + + if (vma->vm_start < start) + prev = vma; + + for_each_vma_range(vmi, vma, end) { + cond_resched(); + + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); + BUG_ON(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. + */ + if (vma->vm_userfaultfd_ctx.ctx == ctx && + (vma->vm_flags & vm_flags) == vm_flags) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, + (struct vm_userfaultfd_ctx){ctx}); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + userfaultfd_set_ctx(vma, ctx, vm_flags); + + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) + hugetlb_unshare_all_pmds(vma); + +skip: + prev = vma; + start = vma->vm_end; + } + + return 0; +} + +void userfaultfd_release_new(struct userfaultfd_ctx *ctx) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + /* the various vma->vm_userfaultfd_ctx still points to it */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + if (vma->vm_userfaultfd_ctx.ctx == ctx) + userfaultfd_reset_ctx(vma); + } + mmap_write_unlock(mm); +} + +void userfaultfd_release_all(struct mm_struct *mm, + struct userfaultfd_ctx *ctx) +{ + struct vm_area_struct *vma, *prev; + VMA_ITERATOR(vmi, mm, 0); + + if (!mmget_not_zero(mm)) + return; + + /* + * Flush page faults out of all CPUs. NOTE: all page faults + * must be retried without returning VM_FAULT_SIGBUS if + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx + * changes while handle_userfault released the mmap_lock. So + * it's critical that released is set to true (above), before + * taking the mmap_lock for writing. + */ + mmap_write_lock(mm); + prev = NULL; + for_each_vma(vmi, vma) { + cond_resched(); + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & __VM_UFFD_FLAGS)); + if (vma->vm_userfaultfd_ctx.ctx != ctx) { + prev = vma; + continue; + } + + vma = userfaultfd_clear_vma(&vmi, prev, vma, + vma->vm_start, vma->vm_end); + prev = vma; + } + mmap_write_unlock(mm); + mmput(mm); +}