Message ID | 20190924171518.26682-1-hannes@cmpxchg.org (mailing list archive)
---|---
State | New, archived
Series | mm: drop mmap_sem before calling balance_dirty_pages() in write fault
On Tue, Sep 24, 2019 at 01:15:18PM -0400, Johannes Weiner wrote:
> +static int fault_dirty_shared_page(struct vm_fault *vmf)

vm_fault_t, shirley?

> @@ -2239,16 +2241,36 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
> 	mapping = page_rmapping(page);
> 	unlock_page(page);
>
> +	if (!page_mkwrite)
> +		file_update_time(vma->vm_file);
> +
> +	/*
> +	 * Throttle page dirtying rate down to writeback speed.
> +	 *
> +	 * mapping may be NULL here because some device drivers do not
> +	 * set page.mapping but still dirty their pages
> +	 *
> +	 * Drop the mmap_sem before waiting on IO, if we can. The file
> +	 * is pinning the mapping, as per above.
> +	 */
> 	if ((dirtied || page_mkwrite) && mapping) {
> -		/*
> -		 * Some device drivers do not set page.mapping
> -		 * but still dirty their pages
> -		 */
> +		struct file *fpin = NULL;
> +
> +		if ((vmf->flags &
> +		     (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
> +		    FAULT_FLAG_ALLOW_RETRY) {
> +			fpin = get_file(vma->vm_file);
> +			up_read(&vma->vm_mm->mmap_sem);
> +			ret = VM_FAULT_RETRY;
> +		}
> +
> 		balance_dirty_pages_ratelimited(mapping);
> +
> +		if (fpin)
> +			fput(fpin);
> 	}
>
> -	if (!page_mkwrite)
> -		file_update_time(vma->vm_file);
> +	return ret;
> }

I'm not a fan of moving file_update_time() to _before_ the
balance_dirty_pages call.

Also, this is now the third place that needs maybe_unlock_mmap_for_io, see
https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/

How about:

+	struct file *fpin = NULL;

	if ((dirtied || page_mkwrite) && mapping) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		balance_dirty_pages_ratelimited(mapping);
	}
+
+	if (fpin) {
+		file_update_time(fpin);
+		fput(fpin);
+		return VM_FAULT_RETRY;
+	}

	if (!page_mkwrite)
		file_update_time(vma->vm_file);
+	return 0;
	}

> /*
> @@ -2491,6 +2513,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
> 	__releases(vmf->ptl)
> {
> 	struct vm_area_struct *vma = vmf->vma;
> +	int ret = VM_FAULT_WRITE;

vm_fault_t again.

> @@ -3576,7 +3599,6 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> static vm_fault_t do_fault(struct vm_fault *vmf)
> {
> 	struct vm_area_struct *vma = vmf->vma;
> -	struct mm_struct *vm_mm = vma->vm_mm;
> 	vm_fault_t ret;
>
> 	/*
> @@ -3617,7 +3639,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
>
> 	/* preallocated pagetable is unused: free it */
> 	if (vmf->prealloc_pte) {
> -		pte_free(vm_mm, vmf->prealloc_pte);
> +		/*
> +		 * XXX: Accessing vma->vm_mm now is not safe. The page
> +		 * fault handler may have dropped the mmap_sem a long
> +		 * time ago. Only s390 derefs that parameter.
> +		 */
> +		pte_free(vma->vm_mm, vmf->prealloc_pte);

I'm confused.  This code looks like it was safe before (as it was caching
vma->vm_mm in a local variable), and now you've made it unsafe ... ?
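[Editor's aside: the masked comparison in the hunk above is the subtle part. A minimal standalone sketch, using made-up flag values rather than the real ones from include/linux/mm.h, shows that the test is true only when FAULT_FLAG_ALLOW_RETRY is set and FAULT_FLAG_RETRY_NOWAIT is clear - i.e. the caller permits a retry and has not asked the fault path to avoid blocking:]

/*
 * Standalone sketch of the flag test above. Flag values are
 * illustrative, not taken from any particular kernel version; only
 * the masked comparison matters: it requires ALLOW_RETRY to be set
 * AND RETRY_NOWAIT to be clear.
 */
#include <stdbool.h>
#include <stdio.h>

#define FAULT_FLAG_ALLOW_RETRY	0x04	/* illustrative value */
#define FAULT_FLAG_RETRY_NOWAIT	0x08	/* illustrative value */

static bool may_drop_mmap_sem(unsigned int flags)
{
	return (flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
		FAULT_FLAG_ALLOW_RETRY;
}

int main(void)
{
	printf("%d\n", may_drop_mmap_sem(FAULT_FLAG_ALLOW_RETRY));	/* 1 */
	printf("%d\n", may_drop_mmap_sem(FAULT_FLAG_ALLOW_RETRY |
					 FAULT_FLAG_RETRY_NOWAIT));	/* 0 */
	printf("%d\n", may_drop_mmap_sem(0));				/* 0 */
	return 0;
}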
On Tue, Sep 24, 2019 at 10:48:09AM -0700, Matthew Wilcox wrote:
> On Tue, Sep 24, 2019 at 01:15:18PM -0400, Johannes Weiner wrote:
> > +static int fault_dirty_shared_page(struct vm_fault *vmf)
>
> vm_fault_t, shirley?

Ah yes, I changed it.

> > @@ -2239,16 +2241,36 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
> > 	mapping = page_rmapping(page);
> > 	unlock_page(page);
> >
> > +	if (!page_mkwrite)
> > +		file_update_time(vma->vm_file);
> > +
> > +	/*
> > +	 * Throttle page dirtying rate down to writeback speed.
> > +	 *
> > +	 * mapping may be NULL here because some device drivers do not
> > +	 * set page.mapping but still dirty their pages
> > +	 *
> > +	 * Drop the mmap_sem before waiting on IO, if we can. The file
> > +	 * is pinning the mapping, as per above.
> > +	 */
> > 	if ((dirtied || page_mkwrite) && mapping) {
> > -		/*
> > -		 * Some device drivers do not set page.mapping
> > -		 * but still dirty their pages
> > -		 */
> > +		struct file *fpin = NULL;
> > +
> > +		if ((vmf->flags &
> > +		     (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
> > +		    FAULT_FLAG_ALLOW_RETRY) {
> > +			fpin = get_file(vma->vm_file);
> > +			up_read(&vma->vm_mm->mmap_sem);
> > +			ret = VM_FAULT_RETRY;
> > +		}
> > +
> > 		balance_dirty_pages_ratelimited(mapping);
> > +
> > +		if (fpin)
> > +			fput(fpin);
> > 	}
> >
> > -	if (!page_mkwrite)
> > -		file_update_time(vma->vm_file);
> > +	return ret;
> > }
>
> I'm not a fan of moving file_update_time() to _before_ the
> balance_dirty_pages call.

Can you elaborate why? If the filesystem has a page_mkwrite op, it
will have already called file_update_time() before this function is
entered. If anything, this change makes the sequence more consistent.

> Also, this is now the third place that needs maybe_unlock_mmap_for_io, see
> https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/

Good idea, I moved the helper to internal.h and converted to it.

I left the shmem site alone, though. It doesn't require the file
pinning, so it shouldn't pointlessly bump the file refcount and
suggest such a dependency - that could cost somebody later quite a bit
of time trying to understand the code.

> How about:
>
> +	struct file *fpin = NULL;
>
> 	if ((dirtied || page_mkwrite) && mapping) {
> 		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> 		balance_dirty_pages_ratelimited(mapping);
> 	}
> +
> +	if (fpin) {
> +		file_update_time(fpin);

This isn't an equivalent change and could update the time twice if the
fs has a page_mkwrite op.

> +		fput(fpin);
> +		return VM_FAULT_RETRY;
> +	}
>
> 	if (!page_mkwrite)
> 		file_update_time(vma->vm_file);
> +	return 0;
> 	}

> > /*
> > @@ -2491,6 +2513,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
> > 	__releases(vmf->ptl)
> > {
> > 	struct vm_area_struct *vma = vmf->vma;
> > +	int ret = VM_FAULT_WRITE;
>
> vm_fault_t again.

Done.

> > @@ -3576,7 +3599,6 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> > static vm_fault_t do_fault(struct vm_fault *vmf)
> > {
> > 	struct vm_area_struct *vma = vmf->vma;
> > -	struct mm_struct *vm_mm = vma->vm_mm;
> > 	vm_fault_t ret;
> >
> > 	/*
> > @@ -3617,7 +3639,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
> >
> > 	/* preallocated pagetable is unused: free it */
> > 	if (vmf->prealloc_pte) {
> > -		pte_free(vm_mm, vmf->prealloc_pte);
> > +		/*
> > +		 * XXX: Accessing vma->vm_mm now is not safe. The page
> > +		 * fault handler may have dropped the mmap_sem a long
> > +		 * time ago. Only s390 derefs that parameter.
> > +		 */
> > +		pte_free(vma->vm_mm, vmf->prealloc_pte);
>
> I'm confused.  This code looks like it was safe before (as it was caching
> vma->vm_mm in a local variable), and now you've made it unsafe ... ?

You're right. After looking at it again, this was a total brainfart:
the mm itself is of course safe to access. I removed these two hunks.

Thanks! Updated patch:

---
From 60c1493bd3b5e0800147165a1dc36b77cf3306e5 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jweiner@fb.com>
Date: Wed, 8 May 2019 13:53:38 -0700
Subject: [PATCH v2] mm: drop mmap_sem before calling balance_dirty_pages()
 in write fault

One of our services is observing hanging ps/top/etc under heavy write
IO, and the task states show this is an mmap_sem priority inversion:

A write fault is holding the mmap_sem in read-mode and waiting for
(heavily cgroup-limited) IO in balance_dirty_pages():

[<0>] balance_dirty_pages+0x724/0x905
[<0>] balance_dirty_pages_ratelimited+0x254/0x390
[<0>] fault_dirty_shared_page.isra.96+0x4a/0x90
[<0>] do_wp_page+0x33e/0x400
[<0>] __handle_mm_fault+0x6f0/0xfa0
[<0>] handle_mm_fault+0xe4/0x200
[<0>] __do_page_fault+0x22b/0x4a0
[<0>] page_fault+0x45/0x50
[<0>] 0xffffffffffffffff

Somebody tries to change the address space, contending for the
mmap_sem in write-mode:

[<0>] call_rwsem_down_write_failed_killable+0x13/0x20
[<0>] do_mprotect_pkey+0xa8/0x330
[<0>] SyS_mprotect+0xf/0x20
[<0>] do_syscall_64+0x5b/0x100
[<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[<0>] 0xffffffffffffffff

The waiting writer locks out all subsequent readers to avoid lock
starvation, and several threads can be seen hanging like this:

[<0>] call_rwsem_down_read_failed+0x14/0x30
[<0>] proc_pid_cmdline_read+0xa0/0x480
[<0>] __vfs_read+0x23/0x140
[<0>] vfs_read+0x87/0x130
[<0>] SyS_read+0x42/0x90
[<0>] do_syscall_64+0x5b/0x100
[<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[<0>] 0xffffffffffffffff

To fix this, do what we do for cache read faults already: drop the
mmap_sem before calling into anything IO bound, in this case the
balance_dirty_pages() function, and return VM_FAULT_RETRY.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/filemap.c  | 21 ---------------------
 mm/internal.h | 21 +++++++++++++++++++++
 mm/memory.c   | 38 +++++++++++++++++++++++++++-----------
 3 files changed, 48 insertions(+), 32 deletions(-)

version 2:
- use maybe_unlock_mmap_for_io(), as per Willy
- remove vma dereference bug, as per Willy
- use vm_fault_t, as per Willy

diff --git a/mm/filemap.c b/mm/filemap.c
index 7f99d53005bb..9072c3fbf837 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2362,27 +2362,6 @@ EXPORT_SYMBOL(generic_file_read_iter);
 #ifdef CONFIG_MMU
 #define MMAP_LOTSAMISS  (100)
 
-static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
-					     struct file *fpin)
-{
-	int flags = vmf->flags;
-
-	if (fpin)
-		return fpin;
-
-	/*
-	 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
-	 * anything, so we only pin the file and drop the mmap_sem if only
-	 * FAULT_FLAG_ALLOW_RETRY is set.
-	 */
-	if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
-	    FAULT_FLAG_ALLOW_RETRY) {
-		fpin = get_file(vmf->vma->vm_file);
-		up_read(&vmf->vma->vm_mm->mmap_sem);
-	}
-	return fpin;
-}
-
 /*
  * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
  * @vmf - the vm_fault for this fault.
diff --git a/mm/internal.h b/mm/internal.h
index 0d5f720c75ab..7dd7fbb577a9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -362,6 +362,27 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	return max(start, vma->vm_start);
 }
 
+static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+						    struct file *fpin)
+{
+	int flags = vmf->flags;
+
+	if (fpin)
+		return fpin;
+
+	/*
+	 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+	 * anything, so we only pin the file and drop the mmap_sem if only
+	 * FAULT_FLAG_ALLOW_RETRY is set.
+	 */
+	if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+	    FAULT_FLAG_ALLOW_RETRY) {
+		fpin = get_file(vmf->vma->vm_file);
+		up_read(&vmf->vma->vm_mm->mmap_sem);
+	}
+	return fpin;
+}
+
 #else /* !CONFIG_MMU */
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }

diff --git a/mm/memory.c b/mm/memory.c
index 2e796372927f..088b6d3b6f88 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2221,10 +2221,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
  *
  * The function expects the page to be locked and unlocks it.
  */
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
-				    struct page *page)
+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping;
+	struct page *page = vmf->page;
 	bool dirtied;
 	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
 
@@ -2239,16 +2240,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
 	mapping = page_rmapping(page);
 	unlock_page(page);
 
+	if (!page_mkwrite)
+		file_update_time(vma->vm_file);
+
+	/*
+	 * Throttle page dirtying rate down to writeback speed.
+	 *
+	 * mapping may be NULL here because some device drivers do not
+	 * set page.mapping but still dirty their pages
+	 *
+	 * Drop the mmap_sem before waiting on IO, if we can. The file
+	 * is pinning the mapping, as per above.
+	 */
 	if ((dirtied || page_mkwrite) && mapping) {
-		/*
-		 * Some device drivers do not set page.mapping
-		 * but still dirty their pages
-		 */
+		struct file *fpin;
+
+		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
 		balance_dirty_pages_ratelimited(mapping);
+		if (fpin) {
+			fput(fpin);
+			return VM_FAULT_RETRY;
+		}
 	}
 
-	if (!page_mkwrite)
-		file_update_time(vma->vm_file);
+	return 0;
 }
 
 /*
@@ -2491,6 +2506,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 	__releases(vmf->ptl)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = VM_FAULT_WRITE;
 
 	get_page(vmf->page);
 
@@ -2514,10 +2530,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 		wp_page_reuse(vmf);
 		lock_page(vmf->page);
 	}
-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
 	put_page(vmf->page);
 
-	return VM_FAULT_WRITE;
+	return ret;
 }
 
 /*
@@ -3561,7 +3577,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 		return ret;
 	}
 
-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
 
 	return ret;
 }
On Tue, Sep 24, 2019 at 03:42:38PM -0400, Johannes Weiner wrote:
> > I'm not a fan of moving file_update_time() to _before_ the
> > balance_dirty_pages call.
>
> Can you elaborate why? If the filesystem has a page_mkwrite op, it
> will have already called file_update_time() before this function is
> entered. If anything, this change makes the sequence more consistent.

Oh, that makes sense. I thought it should be updated after all the data
was written, but it probably doesn't make much difference.

> > Also, this is now the third place that needs maybe_unlock_mmap_for_io, see
> > https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/
>
> Good idea, I moved the helper to internal.h and converted to it.
>
> I left the shmem site alone, though. It doesn't require the file
> pinning, so it shouldn't pointlessly bump the file refcount and
> suggest such a dependency - that could cost somebody later quite a bit
> of time trying to understand the code.

The problem for shmem is this:

	spin_unlock(&inode->i_lock);
	schedule();

	spin_lock(&inode->i_lock);
	finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
	spin_unlock(&inode->i_lock);

While scheduled, the VMA can go away and the inode be reclaimed, making
this a use-after-free. The initial suggestion was an increment on
the inode refcount, but since we already have a pattern which involves
pinning the file, I thought that was a better way to go.

> From: Johannes Weiner <jweiner@fb.com>
> Date: Wed, 8 May 2019 13:53:38 -0700
> Subject: [PATCH v2] mm: drop mmap_sem before calling balance_dirty_pages()
>  in write fault
>
> One of our services is observing hanging ps/top/etc under heavy write
> IO, and the task states show this is an mmap_sem priority inversion:
>
> A write fault is holding the mmap_sem in read-mode and waiting for
> (heavily cgroup-limited) IO in balance_dirty_pages():
>
> [<0>] balance_dirty_pages+0x724/0x905
> [<0>] balance_dirty_pages_ratelimited+0x254/0x390
> [<0>] fault_dirty_shared_page.isra.96+0x4a/0x90
> [<0>] do_wp_page+0x33e/0x400
> [<0>] __handle_mm_fault+0x6f0/0xfa0
> [<0>] handle_mm_fault+0xe4/0x200
> [<0>] __do_page_fault+0x22b/0x4a0
> [<0>] page_fault+0x45/0x50
> [<0>] 0xffffffffffffffff
>
> Somebody tries to change the address space, contending for the
> mmap_sem in write-mode:
>
> [<0>] call_rwsem_down_write_failed_killable+0x13/0x20
> [<0>] do_mprotect_pkey+0xa8/0x330
> [<0>] SyS_mprotect+0xf/0x20
> [<0>] do_syscall_64+0x5b/0x100
> [<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> [<0>] 0xffffffffffffffff
>
> The waiting writer locks out all subsequent readers to avoid lock
> starvation, and several threads can be seen hanging like this:
>
> [<0>] call_rwsem_down_read_failed+0x14/0x30
> [<0>] proc_pid_cmdline_read+0xa0/0x480
> [<0>] __vfs_read+0x23/0x140
> [<0>] vfs_read+0x87/0x130
> [<0>] SyS_read+0x42/0x90
> [<0>] do_syscall_64+0x5b/0x100
> [<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> [<0>] 0xffffffffffffffff
>
> To fix this, do what we do for cache read faults already: drop the
> mmap_sem before calling into anything IO bound, in this case the
> balance_dirty_pages() function, and return VM_FAULT_RETRY.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
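[Editor's aside: the lifetime rule Willy describes generalizes beyond shmem. The following is a minimal userspace sketch of the same pattern, assuming a toy refcounted object standing in for the file/inode; sleeping while holding only a borrowed pointer is the use-after-free window, and taking a reference first (the analogue of get_file()/fput()) closes it:]

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;
	int data;
};

static struct obj *obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcount, 1);
	return o;
}

static void obj_put(struct obj *o)
{
	/* last reference gone: the "inode" is reclaimed */
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);
}

/*
 * Safe variant of "drop the lock, sleep, then touch the object":
 * pin it with our own reference first, so a concurrent obj_put()
 * by the owner cannot free it under us while we sleep.
 */
static int wait_and_read(struct obj *borrowed)
{
	struct obj *pinned = obj_get(borrowed);	/* get_file() analogue */
	int val;

	/* ... the lock protecting `borrowed` may be dropped here and
	 * the owner may drop its reference while we are scheduled ... */
	val = pinned->data;	/* still valid: we hold a reference */
	obj_put(pinned);	/* fput() analogue */
	return val;
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->refcount, 1);
	o->data = 42;
	printf("%d\n", wait_and_read(o));	/* prints 42 */
	obj_put(o);				/* owner drops its reference */
	return 0;
}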
On Tue, Sep 24, 2019 at 01:46:08PM -0700, Matthew Wilcox wrote:
> On Tue, Sep 24, 2019 at 03:42:38PM -0400, Johannes Weiner wrote:
> > > I'm not a fan of moving file_update_time() to _before_ the
> > > balance_dirty_pages call.
> >
> > Can you elaborate why? If the filesystem has a page_mkwrite op, it
> > will have already called file_update_time() before this function is
> > entered. If anything, this change makes the sequence more consistent.
>
> Oh, that makes sense. I thought it should be updated after all the data
> was written, but it probably doesn't make much difference.
>
> > > Also, this is now the third place that needs maybe_unlock_mmap_for_io, see
> > > https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/
> >
> > Good idea, I moved the helper to internal.h and converted to it.
> >
> > I left the shmem site alone, though. It doesn't require the file
> > pinning, so it shouldn't pointlessly bump the file refcount and
> > suggest such a dependency - that could cost somebody later quite a bit
> > of time trying to understand the code.
>
> The problem for shmem is this:
>
> 	spin_unlock(&inode->i_lock);
> 	schedule();
>
> 	spin_lock(&inode->i_lock);
> 	finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
> 	spin_unlock(&inode->i_lock);
>
> While scheduled, the VMA can go away and the inode be reclaimed, making
> this a use-after-free. The initial suggestion was an increment on
> the inode refcount, but since we already have a pattern which involves
> pinning the file, I thought that was a better way to go.

I completely read over the context of that email you linked - that
there is a bug in the existing code - and looked at it as a mere
refactoring patch. My apologies.

Switching that shmem site to maybe_unlock_mmap_for_io() to indirectly
pin the inode (in a separate bug fix patch) indeed makes sense to me.

> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
>
> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Thanks, Matthew.
On Tue, Sep 24, 2019 at 05:43:37PM -0400, Johannes Weiner wrote:
> On Tue, Sep 24, 2019 at 01:46:08PM -0700, Matthew Wilcox wrote:
> > On Tue, Sep 24, 2019 at 03:42:38PM -0400, Johannes Weiner wrote:
> > > > I'm not a fan of moving file_update_time() to _before_ the
> > > > balance_dirty_pages call.
> > >
> > > Can you elaborate why? If the filesystem has a page_mkwrite op, it
> > > will have already called file_update_time() before this function is
> > > entered. If anything, this change makes the sequence more consistent.
> >
> > Oh, that makes sense. I thought it should be updated after all the data
> > was written, but it probably doesn't make much difference.
> >
> > > > Also, this is now the third place that needs maybe_unlock_mmap_for_io, see
> > > > https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/
> > >
> > > Good idea, I moved the helper to internal.h and converted to it.
> > >
> > > I left the shmem site alone, though. It doesn't require the file
> > > pinning, so it shouldn't pointlessly bump the file refcount and
> > > suggest such a dependency - that could cost somebody later quite a bit
> > > of time trying to understand the code.
> >
> > The problem for shmem is this:
> >
> > 	spin_unlock(&inode->i_lock);
> > 	schedule();
> >
> > 	spin_lock(&inode->i_lock);
> > 	finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
> > 	spin_unlock(&inode->i_lock);
> >
> > While scheduled, the VMA can go away and the inode be reclaimed, making
> > this a use-after-free. The initial suggestion was an increment on
> > the inode refcount, but since we already have a pattern which involves
> > pinning the file, I thought that was a better way to go.
>
> I completely read over the context of that email you linked - that
> there is a bug in the existing code - and looked at it as a mere
> refactoring patch. My apologies.
>
> Switching that shmem site to maybe_unlock_mmap_for_io() to indirectly
> pin the inode (in a separate bug fix patch) indeed makes sense to me.

The patch on top of this one is below. Please post them together if you
are going to resend yours.

> > > > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > >
> > > Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

From bdf96fe9e3c1a319e9fd131efbe0118ea41a41b1 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 26 Sep 2019 16:34:26 +0300
Subject: [PATCH] shmem: Pin the file in shmem_fault() if mmap_sem is dropped

syzbot found the following crash:

 BUG: KASAN: use-after-free in perf_trace_lock_acquire+0x401/0x530
 include/trace/events/lock.h:13
 Read of size 8 at addr ffff8880a5cf2c50 by task syz-executor.0/26173

 CPU: 0 PID: 26173 Comm: syz-executor.0 Not tainted 5.3.0-rc6 #146
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
 Google 01/01/2011
 Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x172/0x1f0 lib/dump_stack.c:113
  print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351
  __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482
  kasan_report+0x12/0x17 mm/kasan/common.c:618
  __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
  perf_trace_lock_acquire+0x401/0x530 include/trace/events/lock.h:13
  trace_lock_acquire include/trace/events/lock.h:13 [inline]
  lock_acquire+0x2de/0x410 kernel/locking/lockdep.c:4411
  __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
  _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151
  spin_lock include/linux/spinlock.h:338 [inline]
  shmem_fault+0x5ec/0x7b0 mm/shmem.c:2034
  __do_fault+0x111/0x540 mm/memory.c:3083
  do_shared_fault mm/memory.c:3535 [inline]
  do_fault mm/memory.c:3613 [inline]
  handle_pte_fault mm/memory.c:3840 [inline]
  __handle_mm_fault+0x2adf/0x3f20 mm/memory.c:3964
  handle_mm_fault+0x1b5/0x6b0 mm/memory.c:4001
  do_user_addr_fault arch/x86/mm/fault.c:1441 [inline]
  __do_page_fault+0x536/0xdd0 arch/x86/mm/fault.c:1506
  do_page_fault+0x38/0x590 arch/x86/mm/fault.c:1530
  page_fault+0x39/0x40 arch/x86/entry/entry_64.S:1202

It happens if the VMA got unmapped under us while we dropped mmap_sem
and inode got freed.

Pinning the file if we drop mmap_sem fixes the issue.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: syzbot+03ee87124ee05af991bd@syzkaller.appspotmail.com
Cc: Hillf Danton <hdanton@sina.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
---
 mm/shmem.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 30ce722c23fa..f672e4145cfd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2022,16 +2022,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 			   shmem_falloc->waitq &&
 			   vmf->pgoff >= shmem_falloc->start &&
 			   vmf->pgoff < shmem_falloc->next) {
+			struct file *fpin = NULL;
 			wait_queue_head_t *shmem_falloc_waitq;
 			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
 
 			ret = VM_FAULT_NOPAGE;
-			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
-			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
-				/* It's polite to up mmap_sem if we can */
-				up_read(&vma->vm_mm->mmap_sem);
+			fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+			if (fpin)
 				ret = VM_FAULT_RETRY;
-			}
 
 			shmem_falloc_waitq = shmem_falloc->waitq;
 			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2049,6 +2047,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 			spin_lock(&inode->i_lock);
 			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
 			spin_unlock(&inode->i_lock);
+
+			if (fpin)
+				fput(fpin);
 			return ret;
 		}
 		spin_unlock(&inode->i_lock);
On Thu, Sep 26, 2019 at 04:49:23PM +0300, Kirill A. Shutemov wrote:
> It happens if the VMA got unmapped under us while we dropped mmap_sem
> and inode got freed.
>
> Pinning the file if we drop mmap_sem fixes the issue.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Reported-by: syzbot+03ee87124ee05af991bd@syzkaller.appspotmail.com
> Cc: Hillf Danton <hdanton@sina.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Hugh Dickins <hughd@google.com>

Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
On Thu, Sep 26, 2019 at 04:49:23PM +0300, Kirill A. Shutemov wrote:
> On Tue, Sep 24, 2019 at 05:43:37PM -0400, Johannes Weiner wrote:
> > On Tue, Sep 24, 2019 at 01:46:08PM -0700, Matthew Wilcox wrote:
> > > On Tue, Sep 24, 2019 at 03:42:38PM -0400, Johannes Weiner wrote:
> > > > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > >
> > > Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>
> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

Thanks!

> From bdf96fe9e3c1a319e9fd131efbe0118ea41a41b1 Mon Sep 17 00:00:00 2001
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> Date: Thu, 26 Sep 2019 16:34:26 +0300
> Subject: [PATCH] shmem: Pin the file in shmem_fault() if mmap_sem is dropped
>
> syzbot found the following crash:
>
>  BUG: KASAN: use-after-free in perf_trace_lock_acquire+0x401/0x530
>  include/trace/events/lock.h:13
>  Read of size 8 at addr ffff8880a5cf2c50 by task syz-executor.0/26173
>
>  CPU: 0 PID: 26173 Comm: syz-executor.0 Not tainted 5.3.0-rc6 #146
>  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
>  Google 01/01/2011
>  Call Trace:
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x172/0x1f0 lib/dump_stack.c:113
>   print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351
>   __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482
>   kasan_report+0x12/0x17 mm/kasan/common.c:618
>   __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
>   perf_trace_lock_acquire+0x401/0x530 include/trace/events/lock.h:13
>   trace_lock_acquire include/trace/events/lock.h:13 [inline]
>   lock_acquire+0x2de/0x410 kernel/locking/lockdep.c:4411
>   __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
>   _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151
>   spin_lock include/linux/spinlock.h:338 [inline]
>   shmem_fault+0x5ec/0x7b0 mm/shmem.c:2034
>   __do_fault+0x111/0x540 mm/memory.c:3083
>   do_shared_fault mm/memory.c:3535 [inline]
>   do_fault mm/memory.c:3613 [inline]
>   handle_pte_fault mm/memory.c:3840 [inline]
>   __handle_mm_fault+0x2adf/0x3f20 mm/memory.c:3964
>   handle_mm_fault+0x1b5/0x6b0 mm/memory.c:4001
>   do_user_addr_fault arch/x86/mm/fault.c:1441 [inline]
>   __do_page_fault+0x536/0xdd0 arch/x86/mm/fault.c:1506
>   do_page_fault+0x38/0x590 arch/x86/mm/fault.c:1530
>   page_fault+0x39/0x40 arch/x86/entry/entry_64.S:1202
>
> It happens if the VMA got unmapped under us while we dropped mmap_sem
> and inode got freed.
>
> Pinning the file if we drop mmap_sem fixes the issue.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Reported-by: syzbot+03ee87124ee05af991bd@syzkaller.appspotmail.com
> Cc: Hillf Danton <hdanton@sina.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Hugh Dickins <hughd@google.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

I have just one nitpick:

> @@ -2022,16 +2022,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
> 			   shmem_falloc->waitq &&
> 			   vmf->pgoff >= shmem_falloc->start &&
> 			   vmf->pgoff < shmem_falloc->next) {
> +			struct file *fpin = NULL;

That initialization seems unnecessary, as the fpin assignment below is
unconditional in the variable's scope. The second argument to
maybe_unlock_mmap_for_io() is for tracking state when the function is
called multiple times in the filemap fault goto maze; we shouldn't need
that here for a simple invocation.

> 			wait_queue_head_t *shmem_falloc_waitq;
> 			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
>
> 			ret = VM_FAULT_NOPAGE;
> -			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
> -			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
> -				/* It's polite to up mmap_sem if we can */
> -				up_read(&vma->vm_mm->mmap_sem);
> +			fpin = maybe_unlock_mmap_for_io(vmf, fpin);

I.e. this:

	fpin = maybe_unlock_mmap_for_io(vmf, NULL);

> +			if (fpin)
> 				ret = VM_FAULT_RETRY;
> -			}
>
> 			shmem_falloc_waitq = shmem_falloc->waitq;
> 			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
> @@ -2049,6 +2047,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
> 			spin_lock(&inode->i_lock);
> 			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
> 			spin_unlock(&inode->i_lock);
> +
> +			if (fpin)
> +				fput(fpin);
> 			return ret;
> 		}
> 		spin_unlock(&inode->i_lock);
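[Editor's aside: the fpin-threading idiom Johannes refers to can be modeled in a few lines of userspace C. This is a simplified sketch, not the kernel implementation; `maybe_unlock_for_io`, `struct fault`, and the refcount field are made up for illustration. Passing the previous return value back in makes repeated calls idempotent, so on a multi-step path only the first call pins the file and drops the lock, while a single-call site like shmem can just pass NULL:]

#include <stdio.h>

struct file { int refcount; };
struct fault { struct file *file; int lock_held; };

/* Pin the file and drop the lock only on the first call; later calls
 * see fpin != NULL and return it unchanged. */
static struct file *maybe_unlock_for_io(struct fault *f, struct file *fpin)
{
	if (fpin)
		return fpin;		/* already pinned and unlocked */
	f->file->refcount++;		/* get_file() analogue */
	f->lock_held = 0;		/* up_read(mmap_sem) analogue */
	return f->file;
}

int main(void)
{
	struct file file = { .refcount = 1 };
	struct fault f = { .file = &file, .lock_held = 1 };
	struct file *fpin = NULL;

	/* Two call sites on one fault path, as in the filemap goto maze: */
	fpin = maybe_unlock_for_io(&f, fpin);
	fpin = maybe_unlock_for_io(&f, fpin);	/* no double pin/unlock */

	/* prints refcount=2 lock_held=0: pinned once, unlocked once */
	printf("refcount=%d lock_held=%d\n", file.refcount, f.lock_held);
	if (fpin)
		fpin->refcount--;		/* fput() analogue */
	return 0;
}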
On Thu, Sep 26, 2019 at 03:26:24PM -0400, Johannes Weiner wrote:
> I have just one nitpick:

Here's the addressed one:

From 22a9d58a79a3ebb727d9e909c8f833cd0a751c08 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 26 Sep 2019 16:34:26 +0300
Subject: [PATCH] shmem: Pin the file in shmem_fault() if mmap_sem is dropped

syzbot found the following crash:

 BUG: KASAN: use-after-free in perf_trace_lock_acquire+0x401/0x530
 include/trace/events/lock.h:13
 Read of size 8 at addr ffff8880a5cf2c50 by task syz-executor.0/26173

 CPU: 0 PID: 26173 Comm: syz-executor.0 Not tainted 5.3.0-rc6 #146
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
 Google 01/01/2011
 Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x172/0x1f0 lib/dump_stack.c:113
  print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351
  __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482
  kasan_report+0x12/0x17 mm/kasan/common.c:618
  __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
  perf_trace_lock_acquire+0x401/0x530 include/trace/events/lock.h:13
  trace_lock_acquire include/trace/events/lock.h:13 [inline]
  lock_acquire+0x2de/0x410 kernel/locking/lockdep.c:4411
  __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
  _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151
  spin_lock include/linux/spinlock.h:338 [inline]
  shmem_fault+0x5ec/0x7b0 mm/shmem.c:2034
  __do_fault+0x111/0x540 mm/memory.c:3083
  do_shared_fault mm/memory.c:3535 [inline]
  do_fault mm/memory.c:3613 [inline]
  handle_pte_fault mm/memory.c:3840 [inline]
  __handle_mm_fault+0x2adf/0x3f20 mm/memory.c:3964
  handle_mm_fault+0x1b5/0x6b0 mm/memory.c:4001
  do_user_addr_fault arch/x86/mm/fault.c:1441 [inline]
  __do_page_fault+0x536/0xdd0 arch/x86/mm/fault.c:1506
  do_page_fault+0x38/0x590 arch/x86/mm/fault.c:1530
  page_fault+0x39/0x40 arch/x86/entry/entry_64.S:1202

It happens if the VMA got unmapped under us while we dropped mmap_sem
and inode got freed.

Pinning the file if we drop mmap_sem fixes the issue.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: syzbot+03ee87124ee05af991bd@syzkaller.appspotmail.com
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
---
 mm/shmem.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 30ce722c23fa..0601ad615ccb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2022,16 +2022,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 			   shmem_falloc->waitq &&
 			   vmf->pgoff >= shmem_falloc->start &&
 			   vmf->pgoff < shmem_falloc->next) {
+			struct file *fpin;
 			wait_queue_head_t *shmem_falloc_waitq;
 			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
 
 			ret = VM_FAULT_NOPAGE;
-			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
-			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
-				/* It's polite to up mmap_sem if we can */
-				up_read(&vma->vm_mm->mmap_sem);
+			fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+			if (fpin)
 				ret = VM_FAULT_RETRY;
-			}
 
 			shmem_falloc_waitq = shmem_falloc->waitq;
 			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2049,6 +2047,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 			spin_lock(&inode->i_lock);
 			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
 			spin_unlock(&inode->i_lock);
+
+			if (fpin)
+				fput(fpin);
 			return ret;
 		}
 		spin_unlock(&inode->i_lock);
diff --git a/mm/memory.c b/mm/memory.c
index 2e796372927f..da5eb1d67447 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2221,12 +2221,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
  *
  * The function expects the page to be locked and unlocks it.
  */
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
-				    struct page *page)
+static int fault_dirty_shared_page(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping;
+	struct page *page = vmf->page;
 	bool dirtied;
 	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+	int ret = 0;
 
 	dirtied = set_page_dirty(page);
 	VM_BUG_ON_PAGE(PageAnon(page), page);
@@ -2239,16 +2241,36 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
 	mapping = page_rmapping(page);
 	unlock_page(page);
 
+	if (!page_mkwrite)
+		file_update_time(vma->vm_file);
+
+	/*
+	 * Throttle page dirtying rate down to writeback speed.
+	 *
+	 * mapping may be NULL here because some device drivers do not
+	 * set page.mapping but still dirty their pages
+	 *
+	 * Drop the mmap_sem before waiting on IO, if we can. The file
+	 * is pinning the mapping, as per above.
+	 */
 	if ((dirtied || page_mkwrite) && mapping) {
-		/*
-		 * Some device drivers do not set page.mapping
-		 * but still dirty their pages
-		 */
+		struct file *fpin = NULL;
+
+		if ((vmf->flags &
+		     (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+		    FAULT_FLAG_ALLOW_RETRY) {
+			fpin = get_file(vma->vm_file);
+			up_read(&vma->vm_mm->mmap_sem);
+			ret = VM_FAULT_RETRY;
+		}
+
 		balance_dirty_pages_ratelimited(mapping);
+
+		if (fpin)
+			fput(fpin);
 	}
 
-	if (!page_mkwrite)
-		file_update_time(vma->vm_file);
+	return ret;
 }
 
 /*
@@ -2491,6 +2513,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 	__releases(vmf->ptl)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	int ret = VM_FAULT_WRITE;
 
 	get_page(vmf->page);
 
@@ -2514,10 +2537,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 		wp_page_reuse(vmf);
 		lock_page(vmf->page);
 	}
-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
 	put_page(vmf->page);
 
-	return VM_FAULT_WRITE;
+	return ret;
 }
 
 /*
@@ -3561,7 +3584,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 		return ret;
 	}
 
-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
 
 	return ret;
 }
@@ -3576,7 +3599,6 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 static vm_fault_t do_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct mm_struct *vm_mm = vma->vm_mm;
 	vm_fault_t ret;
 
 	/*
@@ -3617,7 +3639,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 
 	/* preallocated pagetable is unused: free it */
 	if (vmf->prealloc_pte) {
-		pte_free(vm_mm, vmf->prealloc_pte);
+		/*
+		 * XXX: Accessing vma->vm_mm now is not safe. The page
+		 * fault handler may have dropped the mmap_sem a long
+		 * time ago. Only s390 derefs that parameter.
+		 */
+		pte_free(vma->vm_mm, vmf->prealloc_pte);
 		vmf->prealloc_pte = NULL;
 	}
 	return ret;