Message ID | 20230201034137.2463113-1-stevensd@google.com (mailing list archive) |
---|---|
State | New |
Series | mm/khugepaged: skip shmem with armed userfaultfd |
On Tue, Jan 31, 2023 at 7:42 PM David Stevens <stevensd@chromium.org> wrote: > > From: David Stevens <stevensd@chromium.org> > > Collapsing memory in a vma that has an armed userfaultfd results in > zero-filling any missing pages, which breaks user-space paging for those > filled pages. Avoid khugepage bypassing userfaultfd by not collapsing > pages in shmem reached via scanning a vma with an armed userfaultfd if > doing so would zero-fill any pages. > > Signed-off-by: David Stevens <stevensd@chromium.org> > --- > mm/khugepaged.c | 35 ++++++++++++++++++++++++----------- > 1 file changed, 24 insertions(+), 11 deletions(-) > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index 79be13133322..48e944fb8972 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, > * + restore gaps in the page cache; > * + unlock and free huge page; > */ > -static int collapse_file(struct mm_struct *mm, unsigned long addr, > - struct file *file, pgoff_t start, > +static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma, > + unsigned long addr, struct file *file, pgoff_t start, > struct collapse_control *cc) > { > struct address_space *mapping = file->f_mapping; > @@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > * be able to map it or use it in another way until we unlock it. > */ > > + if (is_shmem) > + mmap_read_lock(mm); If you release mmap_lock before then reacquire it here, the vma is not trusted anymore. It is not safe to use the vma anymore. Since you already read uffd_was_armed before releasing mmap_lock, so you could pass it directly to collapse_file w/o dereferencing vma again. The problem may be false positive (not userfaultfd armed anymore), but it should be fine. Khugepaged could collapse this area in the next round. Also +userfaultfd folks. 
> + > xas_set(&xas, start); > for (index = start; index < end; index++) { > struct page *page = xas_next(&xas); > @@ -1792,6 +1795,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > VM_BUG_ON(index != xas.xa_index); > if (is_shmem) { > if (!page) { > + if (userfaultfd_armed(vma)) { > + result = SCAN_EXCEED_NONE_PTE; > + goto xa_locked; > + } > /* > * Stop if extent has been truncated or > * hole-punched, and is now completely > @@ -2095,6 +2102,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > hpage->mapping = NULL; > } > > + if (is_shmem) > + mmap_read_unlock(mm); > if (hpage) > unlock_page(hpage); > out: > @@ -2108,8 +2117,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > return result; > } > > -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > - struct file *file, pgoff_t start, > +static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma, > + unsigned long addr, struct file *file, pgoff_t start, > struct collapse_control *cc) > { > struct page *page = NULL; > @@ -2118,6 +2127,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > int present, swap; > int node = NUMA_NO_NODE; > int result = SCAN_SUCCEED; > + bool uffd_was_armed = userfaultfd_armed(vma); > + > + mmap_read_unlock(mm); > > present = 0; > swap = 0; > @@ -2193,13 +2205,16 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > } > rcu_read_unlock(); > > + if (uffd_was_armed && present < HPAGE_PMD_NR) > + result = SCAN_EXCEED_SWAP_PTE; > + > if (result == SCAN_SUCCEED) { > if (cc->is_khugepaged && > present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { > result = SCAN_EXCEED_NONE_PTE; > count_vm_event(THP_SCAN_EXCEED_NONE_PTE); > } else { > - result = collapse_file(mm, addr, file, start, cc); > + result = collapse_file(mm, vma, addr, file, start, cc); > } > } > > @@ -2207,8 +2222,8 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > return result; > } > #else > -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > - struct file *file, pgoff_t start, > +static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma, > + unsigned long addr, struct file *file, pgoff_t start, > struct collapse_control *cc) > { > BUILD_BUG(); > @@ -2304,8 +2319,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, > pgoff_t pgoff = linear_page_index(vma, > khugepaged_scan.address); > > - mmap_read_unlock(mm); > - *result = hpage_collapse_scan_file(mm, > + *result = hpage_collapse_scan_file(mm, vma, > khugepaged_scan.address, > file, pgoff, cc); > mmap_locked = false; > @@ -2656,9 +2670,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, > struct file *file = get_file(vma->vm_file); > pgoff_t pgoff = linear_page_index(vma, addr); > > - mmap_read_unlock(mm); > mmap_locked = false; > - result = hpage_collapse_scan_file(mm, addr, file, pgoff, > + result = hpage_collapse_scan_file(mm, vma, addr, file, pgoff, > cc); > fput(file); > } else { > -- > 2.39.1.456.gfc5497dd1b-goog > >
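For illustration, a minimal sketch of the alternative Yang suggests above — sample the flag while mmap_lock is still held and pass a plain boolean down, so collapse_file() never dereferences the possibly-stale vma. The changed collapse_file() signature is hypothetical, not an actual follow-up patch:

    /* in hpage_collapse_scan_file(), mmap_lock still held on entry */
    bool uffd_was_armed = userfaultfd_armed(vma);   /* vma is still valid here */

    mmap_read_unlock(mm);                           /* vma must not be used below this point */
    ...
    /* hypothetical signature: a bool instead of struct vm_area_struct * */
    result = collapse_file(mm, uffd_was_armed, addr, file, start, cc);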
On Wed, Feb 01, 2023 at 09:36:37AM -0800, Yang Shi wrote: > On Tue, Jan 31, 2023 at 7:42 PM David Stevens <stevensd@chromium.org> wrote: > > > > From: David Stevens <stevensd@chromium.org> > > > > Collapsing memory in a vma that has an armed userfaultfd results in > > zero-filling any missing pages, which breaks user-space paging for those > > filled pages. Avoid khugepage bypassing userfaultfd by not collapsing > > pages in shmem reached via scanning a vma with an armed userfaultfd if > > doing so would zero-fill any pages. > > > > Signed-off-by: David Stevens <stevensd@chromium.org> > > --- > > mm/khugepaged.c | 35 ++++++++++++++++++++++++----------- > > 1 file changed, 24 insertions(+), 11 deletions(-) > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > > index 79be13133322..48e944fb8972 100644 > > --- a/mm/khugepaged.c > > +++ b/mm/khugepaged.c > > @@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, > > * + restore gaps in the page cache; > > * + unlock and free huge page; > > */ > > -static int collapse_file(struct mm_struct *mm, unsigned long addr, > > - struct file *file, pgoff_t start, > > +static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma, > > + unsigned long addr, struct file *file, pgoff_t start, > > struct collapse_control *cc) > > { > > struct address_space *mapping = file->f_mapping; > > @@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > > * be able to map it or use it in another way until we unlock it. > > */ > > > > + if (is_shmem) > > + mmap_read_lock(mm); > > If you release mmap_lock before then reacquire it here, the vma is not > trusted anymore. It is not safe to use the vma anymore. > > Since you already read uffd_was_armed before releasing mmap_lock, so > you could pass it directly to collapse_file w/o dereferencing vma > again. The problem may be false positive (not userfaultfd armed > anymore), but it should be fine. Khugepaged could collapse this area > in the next round. Unfortunately that may not be enough.. because it's also possible that it reads uffd_armed==false, released mmap_sem, passed it over to the scanner, but then when scanning the file uffd got armed in parallel. There's another problem where the current vma may not have uffd armed, khugepaged may think it has nothing to do with uffd and moved on with collapsing, but actually it's armed in another vma of either the current mm or just another mm's. It seems non-trivial too to safely check this across all the vmas, let's say, by a reverse walk - the only safe way is to walk all the vmas and take the write lock for every mm, but that's not only too heavy but also merely impossible to always make it right because of deadlock issues and on the order of mmap write lock to take.. So far what I can still think of is, if we can extend shmem_inode_info and have a counter showing how many uffd has been armed. It can be a generic counter too (e.g. shmem_inode_info.collapse_guard_counter) just to avoid the page cache being collapsed under the hood, but I am also not aware of whether it can be reused by other things besides uffd. Then when we do the real collapsing, say, when: xas_set_order(&xas, start, HPAGE_PMD_ORDER); xas_store(&xas, hpage); xas_unlock_irq(&xas); We may need to make sure that counter keeps static (probably by holding some locks during the process) and we only do that last phase collapse if counter==0. 
Similar checks in this patch can still be done, but they would only serve as a fast-fail path before the ultimate check on the uffd_armed counter. Otherwise I just don't quickly see how to avoid race conditions. It'll be great if someone can come up with something better than the above. Copying Hugh too. Thanks,
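For illustration, a rough sketch of the guard-counter idea described above — the shmem_inode_info field, the arming hook, and the unwind label are all hypothetical names, and the locking that keeps the counter stable across the final switch-over is left out:

    /* hypothetical: atomic_t uffd_armed_count added to struct shmem_inode_info,
     * incremented when a shmem vma is registered with uffd, decremented on
     * unregister/release */
    atomic_inc(&SHMEM_I(inode)->uffd_armed_count);

    /* collapse side, right before the page cache is switched to the huge page */
    if (atomic_read(&SHMEM_I(file_inode(file))->uffd_armed_count)) {
            result = SCAN_FAIL;     /* back out and keep the small pages */
            goto rollback;          /* hypothetical unwind path */
    }
    xas_set_order(&xas, start, HPAGE_PMD_ORDER);
    xas_store(&xas, hpage);
    xas_unlock_irq(&xas);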
On Wed, Feb 01, 2023 at 12:41:37PM +0900, David Stevens wrote: > From: David Stevens <stevensd@chromium.org> > > Collapsing memory in a vma that has an armed userfaultfd results in > zero-filling any missing pages, which breaks user-space paging for those > filled pages. Avoid khugepage bypassing userfaultfd by not collapsing > pages in shmem reached via scanning a vma with an armed userfaultfd if > doing so would zero-fill any pages. Could you elaborate on the failure? Will zero-filling the page prevent userfaultfd from catching future accesses? A test case would help a lot. And what prevents the same pages from being filled (with zeros or otherwise) via write(2), bypassing the VMA checks? I cannot immediately see it. BTW, there's already a check that prevents establishing a PMD in place if VM_UFFD_WP is set. Maybe just an update of the check in retract_page_tables() from userfaultfd_wp() to userfaultfd_armed() would be enough? I have a very limited understanding of userfaultfd(). Sorry in advance for stupid questions.
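For reference, the change floated here would look roughly like the following one-liner in retract_page_tables(), widening the existing uffd-wp guard to any armed userfaultfd mode — an untested sketch, not the actual code:

    vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
            ...
    -       if (userfaultfd_wp(vma))
    +       if (userfaultfd_armed(vma))     /* skip any uffd-registered vma, not just uffd-wp */
                    continue;               /* leave this vma's page table in place */
            ...
    }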
On Wed, Feb 1, 2023 at 12:52 PM Peter Xu <peterx@redhat.com> wrote: > > On Wed, Feb 01, 2023 at 09:36:37AM -0800, Yang Shi wrote: > > On Tue, Jan 31, 2023 at 7:42 PM David Stevens <stevensd@chromium.org> wrote: > > > > > > From: David Stevens <stevensd@chromium.org> > > > > > > Collapsing memory in a vma that has an armed userfaultfd results in > > > zero-filling any missing pages, which breaks user-space paging for those > > > filled pages. Avoid khugepage bypassing userfaultfd by not collapsing > > > pages in shmem reached via scanning a vma with an armed userfaultfd if > > > doing so would zero-fill any pages. > > > > > > Signed-off-by: David Stevens <stevensd@chromium.org> > > > --- > > > mm/khugepaged.c | 35 ++++++++++++++++++++++++----------- > > > 1 file changed, 24 insertions(+), 11 deletions(-) > > > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > > > index 79be13133322..48e944fb8972 100644 > > > --- a/mm/khugepaged.c > > > +++ b/mm/khugepaged.c > > > @@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, > > > * + restore gaps in the page cache; > > > * + unlock and free huge page; > > > */ > > > -static int collapse_file(struct mm_struct *mm, unsigned long addr, > > > - struct file *file, pgoff_t start, > > > +static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma, > > > + unsigned long addr, struct file *file, pgoff_t start, > > > struct collapse_control *cc) > > > { > > > struct address_space *mapping = file->f_mapping; > > > @@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > > > * be able to map it or use it in another way until we unlock it. > > > */ > > > > > > + if (is_shmem) > > > + mmap_read_lock(mm); > > > > If you release mmap_lock before then reacquire it here, the vma is not > > trusted anymore. It is not safe to use the vma anymore. > > > > Since you already read uffd_was_armed before releasing mmap_lock, so > > you could pass it directly to collapse_file w/o dereferencing vma > > again. The problem may be false positive (not userfaultfd armed > > anymore), but it should be fine. Khugepaged could collapse this area > > in the next round. > > Unfortunately that may not be enough.. because it's also possible that it > reads uffd_armed==false, released mmap_sem, passed it over to the scanner, > but then when scanning the file uffd got armed in parallel. Aha, yeah, I missed that part... thanks for pointing it out. > > There's another problem where the current vma may not have uffd armed, > khugepaged may think it has nothing to do with uffd and moved on with > collapsing, but actually it's armed in another vma of either the current mm > or just another mm's. Out of curiosity, could you please elaborate how another vma armed with userfaultfd could have an impact on the vmas that are not armed? > > It seems non-trivial too to safely check this across all the vmas, let's > say, by a reverse walk - the only safe way is to walk all the vmas and take > the write lock for every mm, but that's not only too heavy but also merely > impossible to always make it right because of deadlock issues and on the > order of mmap write lock to take.. > > So far what I can still think of is, if we can extend shmem_inode_info and > have a counter showing how many uffd has been armed. It can be a generic > counter too (e.g. 
shmem_inode_info.collapse_guard_counter) just to avoid > the page cache being collapsed under the hood, but I am also not aware of > whether it can be reused by other things besides uffd. > > Then when we do the real collapsing, say, when: > > xas_set_order(&xas, start, HPAGE_PMD_ORDER); > xas_store(&xas, hpage); > xas_unlock_irq(&xas); > > We may need to make sure that counter keeps static (probably by holding > some locks during the process) and we only do that last phase collapse if > counter==0. > > Similar checks in this patch can still be done, but that'll only service as > a role of failing faster before the ultimate check on the uffd_armed > counter. Otherwise I just don't quickly see how to avoid race conditions. > > It'll be great if someone can come up with something better than above.. > Copy Hugh too. > > Thanks, > > -- > Peter Xu >
On Thu, Feb 2, 2023 at 8:09 AM Kirill A. Shutemov <kirill@shutemov.name> wrote: > > On Wed, Feb 01, 2023 at 12:41:37PM +0900, David Stevens wrote: > > From: David Stevens <stevensd@chromium.org> > > > > Collapsing memory in a vma that has an armed userfaultfd results in > > zero-filling any missing pages, which breaks user-space paging for those > > filled pages. Avoid khugepage bypassing userfaultfd by not collapsing > > pages in shmem reached via scanning a vma with an armed userfaultfd if > > doing so would zero-fill any pages. > > Could you elaborate on the failure? Will zero-filling the page prevent > userfaultfd from catching future access? Yes, zero-filling the page causes future major faults to be lost, since it populates the pages in the backing shmem. The path for anonymous memory in khugepaged does properly handle userfaultfd_armed, but the path for shmem does not. > A test-case would help a lot. Here's a sample program that demonstrates the issue. On a v6.1 kernel, no major fault is observed by the monitor thread. I used MADV_COLLAPSE to exercise khugepaged_scan_file, but you would get the same effect by replacing the madvise with a sleep and waiting for khugepaged to scan the test process.

#define _GNU_SOURCE
#include <inttypes.h>
#include <stdio.h>
#include <linux/userfaultfd.h>
#include <threads.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <assert.h>

int had_fault;

int monitor_thread(void *arg)
{
	int ret, uffd = (int) (uintptr_t) arg;
	struct uffd_msg msg;

	ret = read(uffd, &msg, sizeof(msg));
	assert(ret > 0);
	assert(msg.event == UFFD_EVENT_PAGEFAULT);
	had_fault = 1;

	struct uffdio_zeropage zeropage;
	zeropage.range.start = msg.arg.pagefault.address & ~0xfff;
	zeropage.range.len = 4096;
	zeropage.mode = 0;
	ret = ioctl(uffd, UFFDIO_ZEROPAGE, &zeropage);
	assert(ret >= 0);
}

int main()
{
	int ret;

	int uffd = syscall(__NR_userfaultfd, 0);
	assert(uffd >= 0);

	struct uffdio_api uffdio_api;
	uffdio_api.api = UFFD_API;
	uffdio_api.features = UFFD_FEATURE_MISSING_SHMEM;
	ret = ioctl(uffd, UFFDIO_API, &uffdio_api);
	assert(ret >= 0);

	int memfd = memfd_create("memfd", MFD_CLOEXEC);
	assert(memfd >= 0);
	ret = ftruncate(memfd, 2 * 1024 * 1024);
	assert(ret >= 0);

	uint8_t *addr = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);
	assert(addr != MAP_FAILED);
	addr[0] = 0xff;

	struct uffdio_register uffdio_register;
	uffdio_register.range.start = (unsigned long) addr;
	uffdio_register.range.len = 2 * 1024 * 1024;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	ret = ioctl(uffd, UFFDIO_REGISTER, &uffdio_register);
	assert(ret >= 0);

	thrd_t t;
	ret = thrd_create(&t, monitor_thread, (void *) (uintptr_t) uffd);
	assert(ret >= 0);

	ret = madvise(addr, 2 * 1024 * 1024, 25 /* MADV_COLLAPSE */);
	printf("madvise ret %d\n", ret);

	addr[4096] = 0xff;
	printf("%s major fault\n", had_fault ? "had" : "no");

	return 0;
}

> And what prevents the same pages be filled (with zeros or otherwise) via > write(2) bypassing VMA checks? I cannot immediately see it. There isn't any such check. You can bypass userfaultfd on a shmem with write syscalls, or simply by writing to the shmem through a different vma. However, it is definitely possible for userspace to use shmem plus userfaultfd in a safe way. And the kernel shouldn't come along and break a setup that should be safe from the perspective of userspace. > BTW, there's already a check that prevent establishing PMD in the place if > VM_UFFD_WP is set.
> > Maybe just an update of the check in retract_page_tables() from > userfaultfd_wp() to userfaultfd_armed() would be enough? It seems like it will be a little more complicated than that, because the target VM having an armed userfaultfd is a hard error condition. However, it might not be that difficult to modify collapse_file to rollback if necessary. I'll consider this approach for v2 of this patch. -David > I have very limited understanding of userfaultfd(). Sorry in advance for > stupid questions. > > -- > Kiryl Shutsemau / Kirill A. Shutemov
On Thu, Feb 2, 2023 at 5:52 AM Peter Xu <peterx@redhat.com> wrote: > > On Wed, Feb 01, 2023 at 09:36:37AM -0800, Yang Shi wrote: > > On Tue, Jan 31, 2023 at 7:42 PM David Stevens <stevensd@chromium.org> wrote: > > > > > > From: David Stevens <stevensd@chromium.org> > > > > > > Collapsing memory in a vma that has an armed userfaultfd results in > > > zero-filling any missing pages, which breaks user-space paging for those > > > filled pages. Avoid khugepage bypassing userfaultfd by not collapsing > > > pages in shmem reached via scanning a vma with an armed userfaultfd if > > > doing so would zero-fill any pages. > > > > > > Signed-off-by: David Stevens <stevensd@chromium.org> > > > --- > > > mm/khugepaged.c | 35 ++++++++++++++++++++++++----------- > > > 1 file changed, 24 insertions(+), 11 deletions(-) > > > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > > > index 79be13133322..48e944fb8972 100644 > > > --- a/mm/khugepaged.c > > > +++ b/mm/khugepaged.c > > > @@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, > > > * + restore gaps in the page cache; > > > * + unlock and free huge page; > > > */ > > > -static int collapse_file(struct mm_struct *mm, unsigned long addr, > > > - struct file *file, pgoff_t start, > > > +static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma, > > > + unsigned long addr, struct file *file, pgoff_t start, > > > struct collapse_control *cc) > > > { > > > struct address_space *mapping = file->f_mapping; > > > @@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > > > * be able to map it or use it in another way until we unlock it. > > > */ > > > > > > + if (is_shmem) > > > + mmap_read_lock(mm); > > > > If you release mmap_lock before then reacquire it here, the vma is not > > trusted anymore. It is not safe to use the vma anymore. > > > > Since you already read uffd_was_armed before releasing mmap_lock, so > > you could pass it directly to collapse_file w/o dereferencing vma > > again. The problem may be false positive (not userfaultfd armed > > anymore), but it should be fine. Khugepaged could collapse this area > > in the next round. I didn't notice this race condition. It should be possible to adapt hugepage_vma_revalidate for this situation, or at least to create an analogous situation. > Unfortunately that may not be enough.. because it's also possible that it > reads uffd_armed==false, released mmap_sem, passed it over to the scanner, > but then when scanning the file uffd got armed in parallel. > > There's another problem where the current vma may not have uffd armed, > khugepaged may think it has nothing to do with uffd and moved on with > collapsing, but actually it's armed in another vma of either the current mm > or just another mm's. > > It seems non-trivial too to safely check this across all the vmas, let's > say, by a reverse walk - the only safe way is to walk all the vmas and take > the write lock for every mm, but that's not only too heavy but also merely > impossible to always make it right because of deadlock issues and on the > order of mmap write lock to take.. > > So far what I can still think of is, if we can extend shmem_inode_info and > have a counter showing how many uffd has been armed. It can be a generic > counter too (e.g. shmem_inode_info.collapse_guard_counter) just to avoid > the page cache being collapsed under the hood, but I am also not aware of > whether it can be reused by other things besides uffd. 
> > Then when we do the real collapsing, say, when: > > xas_set_order(&xas, start, HPAGE_PMD_ORDER); > xas_store(&xas, hpage); > xas_unlock_irq(&xas); > > We may need to make sure that counter keeps static (probably by holding > some locks during the process) and we only do that last phase collapse if > counter==0. > > Similar checks in this patch can still be done, but that'll only service as > a role of failing faster before the ultimate check on the uffd_armed > counter. Otherwise I just don't quickly see how to avoid race conditions. I don't know if it's necessary to go that far. Userfaultfd plus shmem is inherently brittle. It's possible for userspace to bypass userfaultfd on a shmem mapping by accessing the shmem through a different mapping or simply by using the write syscall. It might be sufficient to say that the kernel won't directly bypass a VMA's userfaultfd to collapse the underlying shmem's pages. Although on the other hand, I guess it's not great for the presence of an unused shmem mapping lying around to cause khugepaged to have user-visible side effects. -David > It'll be great if someone can come up with something better than above.. > Copy Hugh too. > > Thanks, > > -- > Peter Xu >
On Thu, Feb 2, 2023 at 1:56 AM David Stevens <stevensd@chromium.org> wrote: > > On Thu, Feb 2, 2023 at 5:52 AM Peter Xu <peterx@redhat.com> wrote: > > > > On Wed, Feb 01, 2023 at 09:36:37AM -0800, Yang Shi wrote: > > > On Tue, Jan 31, 2023 at 7:42 PM David Stevens <stevensd@chromium.org> wrote: > > > > > > > > From: David Stevens <stevensd@chromium.org> > > > > > > > > Collapsing memory in a vma that has an armed userfaultfd results in > > > > zero-filling any missing pages, which breaks user-space paging for those > > > > filled pages. Avoid khugepage bypassing userfaultfd by not collapsing > > > > pages in shmem reached via scanning a vma with an armed userfaultfd if > > > > doing so would zero-fill any pages. > > > > > > > > Signed-off-by: David Stevens <stevensd@chromium.org> > > > > --- > > > > mm/khugepaged.c | 35 ++++++++++++++++++++++++----------- > > > > 1 file changed, 24 insertions(+), 11 deletions(-) > > > > > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > > > > index 79be13133322..48e944fb8972 100644 > > > > --- a/mm/khugepaged.c > > > > +++ b/mm/khugepaged.c > > > > @@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, > > > > * + restore gaps in the page cache; > > > > * + unlock and free huge page; > > > > */ > > > > -static int collapse_file(struct mm_struct *mm, unsigned long addr, > > > > - struct file *file, pgoff_t start, > > > > +static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma, > > > > + unsigned long addr, struct file *file, pgoff_t start, > > > > struct collapse_control *cc) > > > > { > > > > struct address_space *mapping = file->f_mapping; > > > > @@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > > > > * be able to map it or use it in another way until we unlock it. > > > > */ > > > > > > > > + if (is_shmem) > > > > + mmap_read_lock(mm); > > > > > > If you release mmap_lock before then reacquire it here, the vma is not > > > trusted anymore. It is not safe to use the vma anymore. > > > > > > Since you already read uffd_was_armed before releasing mmap_lock, so > > > you could pass it directly to collapse_file w/o dereferencing vma > > > again. The problem may be false positive (not userfaultfd armed > > > anymore), but it should be fine. Khugepaged could collapse this area > > > in the next round. > > I didn't notice this race condition. It should be possible to adapt > hugepage_vma_revalidate for this situation, or at least to create an > analogous situation. But once you release mmap_lock, the vma still may be changed, revalidation just can guarantee the vma is valid when you hold the mmap_lock unless mmap_lock is held for the whole collapse or at some point that the collapse doesn't have impact on userfaultfd anymore. We definitely don't want to hold mmap_lock for the whole collapse, but I don't know whether we could release it earlier or not due to my limited knowledge of userfaultfd. > > > Unfortunately that may not be enough.. because it's also possible that it > > reads uffd_armed==false, released mmap_sem, passed it over to the scanner, > > but then when scanning the file uffd got armed in parallel. > > > > There's another problem where the current vma may not have uffd armed, > > khugepaged may think it has nothing to do with uffd and moved on with > > collapsing, but actually it's armed in another vma of either the current mm > > or just another mm's. 
> > > > It seems non-trivial too to safely check this across all the vmas, let's > > say, by a reverse walk - the only safe way is to walk all the vmas and take > > the write lock for every mm, but that's not only too heavy but also merely > > impossible to always make it right because of deadlock issues and on the > > order of mmap write lock to take.. > > > > So far what I can still think of is, if we can extend shmem_inode_info and > > have a counter showing how many uffd has been armed. It can be a generic > > counter too (e.g. shmem_inode_info.collapse_guard_counter) just to avoid > > the page cache being collapsed under the hood, but I am also not aware of > > whether it can be reused by other things besides uffd. > > > > Then when we do the real collapsing, say, when: > > > > xas_set_order(&xas, start, HPAGE_PMD_ORDER); > > xas_store(&xas, hpage); > > xas_unlock_irq(&xas); > > > > We may need to make sure that counter keeps static (probably by holding > > some locks during the process) and we only do that last phase collapse if > > counter==0. > > > > Similar checks in this patch can still be done, but that'll only service as > > a role of failing faster before the ultimate check on the uffd_armed > > counter. Otherwise I just don't quickly see how to avoid race conditions. > > I don't know if it's necessary to go that far. Userfaultfd plus shmem > is inherently brittle. It's possible for userspace to bypass > userfaultfd on a shmem mapping by accessing the shmem through a > different mapping or simply by using the write syscall. It might be > sufficient to say that the kernel won't directly bypass a VMA's > userfaultfd to collapse the underlying shmem's pages. Although on the > other hand, I guess it's not great for the presence of an unused shmem > mapping lying around to cause khugepaged to have user-visible side > effects. > > -David > > > It'll be great if someone can come up with something better than above.. > > Copy Hugh too. > > > > Thanks, > > > > -- > > Peter Xu > >
On Wed, Feb 01, 2023 at 03:57:33PM -0800, Yang Shi wrote: > > There's another problem where the current vma may not have uffd armed, > > khugepaged may think it has nothing to do with uffd and moved on with > > collapsing, but actually it's armed in another vma of either the current mm > > or just another mm's. > > Out of curiosity, could you please elaborate how another vma armed > with userfaultfd could have an impact on the vmas that are not armed? It's e.g. when more than one vma maps the same range of the same shmem file, one registered with uffd missing mode, the others not. Then the others can populate the page cache without generating any message for the vma that has uffd missing mode registered, so there'll be the same issue as when khugepaged accidentally does thp collapses. Thanks,
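For illustration, the scenario described above in userspace terms — abbreviated, with error handling and the userfaultfd setup boilerplate omitted, and all names hypothetical:

    int memfd = memfd_create("demo", MFD_CLOEXEC);
    ftruncate(memfd, 4096);

    /* two mappings of the same shmem page */
    char *armed = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);
    char *other = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);

    /* ... UFFDIO_REGISTER with UFFDIO_REGISTER_MODE_MISSING on 'armed' only ... */

    other[0] = 1;   /* populates the shared page cache page, no uffd message is sent */
    /* a later access through 'armed' now finds the page present, so the MISSING
     * event the monitor was waiting for never arrives - the same end state as a
     * khugepaged collapse filling the hole */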
On Thu, Feb 02, 2023 at 09:40:12AM -0800, Yang Shi wrote: > On Thu, Feb 2, 2023 at 1:56 AM David Stevens <stevensd@chromium.org> wrote: > > > > On Thu, Feb 2, 2023 at 5:52 AM Peter Xu <peterx@redhat.com> wrote: > > > > > > On Wed, Feb 01, 2023 at 09:36:37AM -0800, Yang Shi wrote: > > > > On Tue, Jan 31, 2023 at 7:42 PM David Stevens <stevensd@chromium.org> wrote: > > > > > > > > > > From: David Stevens <stevensd@chromium.org> > > > > > > > > > > Collapsing memory in a vma that has an armed userfaultfd results in > > > > > zero-filling any missing pages, which breaks user-space paging for those > > > > > filled pages. Avoid khugepage bypassing userfaultfd by not collapsing > > > > > pages in shmem reached via scanning a vma with an armed userfaultfd if > > > > > doing so would zero-fill any pages. > > > > > > > > > > Signed-off-by: David Stevens <stevensd@chromium.org> > > > > > --- > > > > > mm/khugepaged.c | 35 ++++++++++++++++++++++++----------- > > > > > 1 file changed, 24 insertions(+), 11 deletions(-) > > > > > > > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > > > > > index 79be13133322..48e944fb8972 100644 > > > > > --- a/mm/khugepaged.c > > > > > +++ b/mm/khugepaged.c > > > > > @@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, > > > > > * + restore gaps in the page cache; > > > > > * + unlock and free huge page; > > > > > */ > > > > > -static int collapse_file(struct mm_struct *mm, unsigned long addr, > > > > > - struct file *file, pgoff_t start, > > > > > +static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma, > > > > > + unsigned long addr, struct file *file, pgoff_t start, > > > > > struct collapse_control *cc) > > > > > { > > > > > struct address_space *mapping = file->f_mapping; > > > > > @@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > > > > > * be able to map it or use it in another way until we unlock it. > > > > > */ > > > > > > > > > > + if (is_shmem) > > > > > + mmap_read_lock(mm); > > > > > > > > If you release mmap_lock before then reacquire it here, the vma is not > > > > trusted anymore. It is not safe to use the vma anymore. > > > > > > > > Since you already read uffd_was_armed before releasing mmap_lock, so > > > > you could pass it directly to collapse_file w/o dereferencing vma > > > > again. The problem may be false positive (not userfaultfd armed > > > > anymore), but it should be fine. Khugepaged could collapse this area > > > > in the next round. > > > > I didn't notice this race condition. It should be possible to adapt > > hugepage_vma_revalidate for this situation, or at least to create an > > analogous situation. > > But once you release mmap_lock, the vma still may be changed, > revalidation just can guarantee the vma is valid when you hold the > mmap_lock unless mmap_lock is held for the whole collapse or at some > point that the collapse doesn't have impact on userfaultfd anymore. We > definitely don't want to hold mmap_lock for the whole collapse, but I > don't know whether we could release it earlier or not due to my > limited knowledge of userfaultfd. I agree with Yang; I don't quickly see how that'll resolve the issue. > > > > > > Unfortunately that may not be enough.. because it's also possible that it > > > reads uffd_armed==false, released mmap_sem, passed it over to the scanner, > > > but then when scanning the file uffd got armed in parallel. 
> > > > > > There's another problem where the current vma may not have uffd armed, > > > khugepaged may think it has nothing to do with uffd and moved on with > > > collapsing, but actually it's armed in another vma of either the current mm > > > or just another mm's. > > > > > > It seems non-trivial too to safely check this across all the vmas, let's > > > say, by a reverse walk - the only safe way is to walk all the vmas and take > > > the write lock for every mm, but that's not only too heavy but also merely > > > impossible to always make it right because of deadlock issues and on the > > > order of mmap write lock to take.. > > > > > > So far what I can still think of is, if we can extend shmem_inode_info and > > > have a counter showing how many uffd has been armed. It can be a generic > > > counter too (e.g. shmem_inode_info.collapse_guard_counter) just to avoid > > > the page cache being collapsed under the hood, but I am also not aware of > > > whether it can be reused by other things besides uffd. > > > > > > Then when we do the real collapsing, say, when: > > > > > > xas_set_order(&xas, start, HPAGE_PMD_ORDER); > > > xas_store(&xas, hpage); > > > xas_unlock_irq(&xas); > > > > > > We may need to make sure that counter keeps static (probably by holding > > > some locks during the process) and we only do that last phase collapse if > > > counter==0. > > > > > > Similar checks in this patch can still be done, but that'll only service as > > > a role of failing faster before the ultimate check on the uffd_armed > > > counter. Otherwise I just don't quickly see how to avoid race conditions. > > > > I don't know if it's necessary to go that far. Userfaultfd plus shmem > > is inherently brittle. It's possible for userspace to bypass > > userfaultfd on a shmem mapping by accessing the shmem through a > > different mapping or simply by using the write syscall. Yes this is possible, but this is user-visible operation - no matter it was a read()/write() from another process, or mmap()ed memory accesses. Khugepaged merges ptes in a way that is out of control of users. That's something the user can hardly control. AFAICT currently file-based uffd missing mode all works in that way. IOW the user should have full control of the file/inode under the hood to make sure there will be nothing surprising. Otherwise I don't really see how the missing mode can work solidly since it's page cache based. > > It might be sufficient to say that the kernel won't directly bypass a > > VMA's userfaultfd to collapse the underlying shmem's pages. Although on > > the other hand, I guess it's not great for the presence of an unused > > shmem mapping lying around to cause khugepaged to have user-visible > > side effects. Maybe it works for your use case already, for example, if in your app the shmem is only and always be mapped once? However that doesn't seem like a complete solution to me. There's nothing that will prevent another mapping being established, and right after that happens it'll stop working, because khugepaged can notice that new mm/vma which doesn't register with uffd at all, and thinks it a good idea to collapse the shmem page cache again. Uffd will silently fail in another case even if not immediately in your current app/reproducer. Again, I don't think what I propose above is anything close to good.. It'll literally disable any collapsing possibility for a shmem node as long as any small portion of the inode mapping address space got registered by any process with uffd. 
I just don't see any easier approach so far. Thanks,
On Thu, Feb 2, 2023 at 12:04 PM Peter Xu <peterx@redhat.com> wrote: > > On Wed, Feb 01, 2023 at 03:57:33PM -0800, Yang Shi wrote: > > > There's another problem where the current vma may not have uffd armed, > > > khugepaged may think it has nothing to do with uffd and moved on with > > > collapsing, but actually it's armed in another vma of either the current mm > > > or just another mm's. > > > > Out of curiosity, could you please elaborate how another vma armed > > with userfaultfd could have an impact on the vmas that are not armed? > > It's e.g. when >1 vmas mapped to the same shmem file on the same range, one > registered with uffd missing, others not. Then others can cause page cache > populated without generating message to the vma that got uffd missing mode > registered, so there'll be the same issue as when khugepaged accidentally > does thp collapsings. Thanks, Got it, thank you. > > -- > Peter Xu >
> > > I don't know if it's necessary to go that far. Userfaultfd plus shmem > > > is inherently brittle. It's possible for userspace to bypass > > > userfaultfd on a shmem mapping by accessing the shmem through a > > > different mapping or simply by using the write syscall. > > Yes this is possible, but this is user-visible operation - no matter it was > a read()/write() from another process, or mmap()ed memory accesses. > Khugepaged merges ptes in a way that is out of control of users. That's > something the user can hardly control. > > AFAICT currently file-based uffd missing mode all works in that way. IOW > the user should have full control of the file/inode under the hood to make > sure there will be nothing surprising. Otherwise I don't really see how > the missing mode can work solidly since it's page cache based. > > > > It might be sufficient to say that the kernel won't directly bypass a > > > VMA's userfaultfd to collapse the underlying shmem's pages. Although on > > > the other hand, I guess it's not great for the presence of an unused > > > shmem mapping lying around to cause khugepaged to have user-visible > > > side effects. > > Maybe it works for your use case already, for example, if in your app the > shmem is only and always be mapped once? However that doesn't seem like a > complete solution to me. We're using userfaultfd for guest memory for a VM. We do have sandboxed device processes. However, thinking about it a bit more, this approach would probably cause issues with device hotplug. > There's nothing that will prevent another mapping being established, and > right after that happens it'll stop working, because khugepaged can notice > that new mm/vma which doesn't register with uffd at all, and thinks it a > good idea to collapse the shmem page cache again. Uffd will silently fail > in another case even if not immediately in your current app/reproducer. > > Again, I don't think what I propose above is anything close to good.. It'll > literally disable any collapsing possibility for a shmem node as long as > any small portion of the inode mapping address space got registered by any > process with uffd. I just don't see any easier approach so far. Maybe we can make things easier by being more precise about what bug we're trying to fix. Strictly speaking, I don't think what we're concerned about is whether or not userfaultfd is registered on a particular VMA at a particular point in time. I think what we're actually concerned about is that when userspace has a page with an armed userfaultfd that it knows is missing, that page should not be filled by khugepaged. If userspace doesn't know that a userfaultfd armed page is missing, then even if khugepaged fills that page, as far as userspace is concerned, the page was filled by khugepaged before userfaultfd was armed. If that's a valid way to look at it, then I think the fact that collapse_file locks hpage provides most of the necessary locking. From there, we need to check whether there are any VMAs with armed userfaultfds that might have observed a missing page. I think that can be done while iterating over VMAs in retract_page_tables without acquiring any mmap_lock by adding some memory barriers to userfaultfd_set_vm_flags and userfaultfd_armed. It is possible that a userfaultfd gets registered on a particular VMA after we check its flags but before the collapse finishes. I think the only observability hole left would be operations on the shmem file descriptor that don't actually lock pages (e.g. 
SEEK_DATA/SEEK_HOLE), which are hopefully solvable with some more thought. -David > Thanks, > > -- > Peter Xu >
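Purely to illustrate the shape of this proposal, a sketch of the check David describes — the barrier pairing, the rollback label, and the exact place it would sit in retract_page_tables() are all assumptions, not existing code:

    /* with the new huge page locked, before the collapse becomes visible */
    i_mmap_lock_read(mapping);
    vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
            /* hypothetical: pairs with a write barrier added in
             * userfaultfd_set_vm_flags() after the uffd vm_flags are set, so a
             * registration racing with the collapse is either observed here or
             * observes the huge page instead of a hole */
            smp_rmb();
            if (userfaultfd_armed(vma)) {
                    i_mmap_unlock_read(mapping);
                    goto rollback;  /* hypothetical: undo and keep the small pages */
            }
    }
    i_mmap_unlock_read(mapping);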
On Fri, Feb 03, 2023 at 03:09:55PM +0900, David Stevens wrote: > > > > I don't know if it's necessary to go that far. Userfaultfd plus shmem > > > > is inherently brittle. It's possible for userspace to bypass > > > > userfaultfd on a shmem mapping by accessing the shmem through a > > > > different mapping or simply by using the write syscall. > > > > Yes this is possible, but this is user-visible operation - no matter it was > > a read()/write() from another process, or mmap()ed memory accesses. > > Khugepaged merges ptes in a way that is out of control of users. That's > > something the user can hardly control. > > > > AFAICT currently file-based uffd missing mode all works in that way. IOW > > the user should have full control of the file/inode under the hood to make > > sure there will be nothing surprising. Otherwise I don't really see how > > the missing mode can work solidly since it's page cache based. > > > > > > It might be sufficient to say that the kernel won't directly bypass a > > > > VMA's userfaultfd to collapse the underlying shmem's pages. Although on > > > > the other hand, I guess it's not great for the presence of an unused > > > > shmem mapping lying around to cause khugepaged to have user-visible > > > > side effects. > > > > Maybe it works for your use case already, for example, if in your app the > > shmem is only and always be mapped once? However that doesn't seem like a > > complete solution to me. > > We're using userfaultfd for guest memory for a VM. We do have > sandboxed device processes. However, thinking about it a bit more, > this approach would probably cause issues with device hotplug. > > > There's nothing that will prevent another mapping being established, and > > right after that happens it'll stop working, because khugepaged can notice > > that new mm/vma which doesn't register with uffd at all, and thinks it a > > good idea to collapse the shmem page cache again. Uffd will silently fail > > in another case even if not immediately in your current app/reproducer. > > > > Again, I don't think what I propose above is anything close to good.. It'll > > literally disable any collapsing possibility for a shmem node as long as > > any small portion of the inode mapping address space got registered by any > > process with uffd. I just don't see any easier approach so far. > > Maybe we can make things easier by being more precise about what bug > we're trying to fix. Strictly speaking, I don't think what we're > concerned about is whether or not userfaultfd is registered on a > particular VMA at a particular point in time. I think what we're actually > concerned about is that when userspace has a page with an armed > userfaultfd that it knows is missing, that page should not be filled by > khugepaged. If userspace doesn't know that a userfaultfd armed page is > missing, then even if khugepaged fills that page, as far as userspace is > concerned, the page was filled by khugepaged before userfaultfd was > armed. IMHO that's a common issue to solve with registrations on uffd missing mode, and what I'm aware of as a common solution to it is, for shmem, punching holes where we do hope to receive a message. It should only happen _after_ the missing mode registration is successful. At least that's what we do with QEMU's postcopy. > > If that's a valid way to look at it, then I think the fact that > collapse_file locks hpage provides most of the necessary locking. 
The hpage is still not visible to the page cache at all, not until it is used to replace the small pages, right? Do you mean "when holding the page lock of the existing small pages"? > From there, we need to check whether there are any VMAs with armed > userfaultfds that might have observed a missing page. I think that can be > done while iterating over VMAs in retract_page_tables without acquiring I am not 100% sure how this works. AFAICT we retract pgtables only if we have already collapsed the page. I don't see how it can be undone? > any mmap_lock by adding some memory barriers to userfaultfd_set_vm_flags > and userfaultfd_armed. It is possible that a userfaultfd gets registered > on a particular VMA after we check its flags but before the collapse > finishes. I think the only observability hole left would be operations on > the shmem file descriptor that don't actually lock pages > (e.g. SEEK_DATA/SEEK_HOLE), which are hopefully solvable with some more > thought. So it's possible that I just didn't grasp the fundamental idea of the new proposal on how it works at all. If you're confident with the idea I'd be also glad to read the patch directly; maybe that'll help to explain stuff. Thanks,
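For context, a sketch of the register-then-punch ordering Peter refers to (as used by QEMU's postcopy for shmem) — uffd, memfd, addr and len are placeholders:

    struct uffdio_register reg = {
            .range = { .start = (unsigned long)addr, .len = len },
            .mode  = UFFDIO_REGISTER_MODE_MISSING,
    };
    ioctl(uffd, UFFDIO_REGISTER, &reg);     /* 1. arm MISSING mode first */

    /* 2. then punch out the range, so anything populated before registration is
     *    dropped and every later first access raises a MISSING event */
    fallocate(memfd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, len);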
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 79be13133322..48e944fb8972 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
  * + restore gaps in the page cache;
  * + unlock and free huge page;
  */
-static int collapse_file(struct mm_struct *mm, unsigned long addr,
-			 struct file *file, pgoff_t start,
+static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma,
+			 unsigned long addr, struct file *file, pgoff_t start,
 			 struct collapse_control *cc)
 {
 	struct address_space *mapping = file->f_mapping;
@@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 	 * be able to map it or use it in another way until we unlock it.
 	 */
 
+	if (is_shmem)
+		mmap_read_lock(mm);
+
 	xas_set(&xas, start);
 	for (index = start; index < end; index++) {
 		struct page *page = xas_next(&xas);
@@ -1792,6 +1795,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 		VM_BUG_ON(index != xas.xa_index);
 		if (is_shmem) {
 			if (!page) {
+				if (userfaultfd_armed(vma)) {
+					result = SCAN_EXCEED_NONE_PTE;
+					goto xa_locked;
+				}
 				/*
 				 * Stop if extent has been truncated or
 				 * hole-punched, and is now completely
@@ -2095,6 +2102,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 		hpage->mapping = NULL;
 	}
 
+	if (is_shmem)
+		mmap_read_unlock(mm);
 	if (hpage)
 		unlock_page(hpage);
 out:
@@ -2108,8 +2117,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 	return result;
 }
 
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
-				    struct file *file, pgoff_t start,
+static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma,
+				    unsigned long addr, struct file *file, pgoff_t start,
 				    struct collapse_control *cc)
 {
 	struct page *page = NULL;
@@ -2118,6 +2127,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 	int present, swap;
 	int node = NUMA_NO_NODE;
 	int result = SCAN_SUCCEED;
+	bool uffd_was_armed = userfaultfd_armed(vma);
+
+	mmap_read_unlock(mm);
 
 	present = 0;
 	swap = 0;
@@ -2193,13 +2205,16 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 	}
 	rcu_read_unlock();
 
+	if (uffd_was_armed && present < HPAGE_PMD_NR)
+		result = SCAN_EXCEED_SWAP_PTE;
+
 	if (result == SCAN_SUCCEED) {
 		if (cc->is_khugepaged &&
 		    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
-			result = collapse_file(mm, addr, file, start, cc);
+			result = collapse_file(mm, vma, addr, file, start, cc);
 		}
 	}
 
@@ -2207,8 +2222,8 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 	return result;
 }
 #else
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
-				    struct file *file, pgoff_t start,
+static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma,
+				    unsigned long addr, struct file *file, pgoff_t start,
 				    struct collapse_control *cc)
 {
 	BUILD_BUG();
@@ -2304,8 +2319,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 				pgoff_t pgoff = linear_page_index(vma,
 						khugepaged_scan.address);
 
-				mmap_read_unlock(mm);
-				*result = hpage_collapse_scan_file(mm,
+				*result = hpage_collapse_scan_file(mm, vma,
 						khugepaged_scan.address,
 						file, pgoff, cc);
 				mmap_locked = false;
@@ -2656,9 +2670,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		struct file *file = get_file(vma->vm_file);
 		pgoff_t pgoff = linear_page_index(vma, addr);
 
-		mmap_read_unlock(mm);
 		mmap_locked = false;
-		result = hpage_collapse_scan_file(mm, addr, file, pgoff,
+		result = hpage_collapse_scan_file(mm, vma, addr, file, pgoff,
 						  cc);
 		fput(file);
 	} else {