Message ID | 20220226022655.350562-1-namit@vmware.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [v3] userfaultfd: provide unmasked address on page-fault | expand |
On Sat, Feb 26, 2022 at 02:26:55AM +0000, Nadav Amit wrote: > From: Nadav Amit <namit@vmware.com> > > Userfaultfd is supposed to provide the full address (i.e., unmasked) of > the faulting access back to userspace. However, that is not the case for > quite some time. > > Even running "userfaultfd_demo" from the userfaultfd man page provides > the wrong output (and contradicts the man page). Notice that > "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) > and not the first read address (0x7fc5e30b300f). > > Address returned by mmap() = 0x7fc5e30b3000 > > fault_handler_thread(): > poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 > UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 > (uffdio_copy.copy returned 4096) > Read address 0x7fc5e30b300f in main(): A > Read address 0x7fc5e30b340f in main(): A > Read address 0x7fc5e30b380f in main(): A > Read address 0x7fc5e30b3c0f in main(): A > > The exact address is useful for various reasons and specifically for > prefetching decisions. If it is known that the memory is populated by > certain objects whose size is not page-aligned, then based on the > faulting address, the uffd-monitor can decide whether to prefetch and > prefault the adjacent page. > > This bug has been for quite some time in the kernel: since commit > 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") > vmf->virtual_address"), which dates back to 2016. A concern has been > raised that existing userspace application might rely on the old/wrong > behavior in which the address is masked. Therefore, it was suggested to > provide the masked address unless the user explicitly asks for the exact > address. > > Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct > userfaultfd to provide the exact address. Add a new "real_address" field > to vmf to hold the unmasked address. Provide the address to userspace > accordingly. > > Initialize real_address in various code-paths to be consistent with > address, even when it is not used, to be on the safe side. > > Acked-by: Peter Xu <peterx@redhat.com> > Reviewed-by: David Hildenbrand <david@redhat.com> > Cc: Andrea Arcangeli <aarcange@redhat.com> > Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> > Cc: Jan Kara <jack@suse.cz> > Signed-off-by: Nadav Amit <namit@vmware.com> Acked-by: Mike Rapoport <rppt@linux.ibm.com> > > --- > > v2->v3: > * Initialize real_address on all code paths [Jan] > > v1->v2: > * Add uffd feature to selectively enable [David, Andrea] > --- > fs/userfaultfd.c | 5 ++++- > include/linux/mm.h | 3 ++- > include/uapi/linux/userfaultfd.h | 8 +++++++- > mm/hugetlb.c | 6 ++++-- > mm/memory.c | 1 + > mm/swapfile.c | 1 + > 6 files changed, 19 insertions(+), 5 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index e26b10132d47..826927026fe7 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, > struct uffd_msg msg; > msg_init(&msg); > msg.event = UFFD_EVENT_PAGEFAULT; > + > + if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) > + address &= PAGE_MASK; > msg.arg.pagefault.address = address; > /* > * These flags indicate why the userfault occurred: > @@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) > > init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); > uwq.wq.private = current; > - uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, > + uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, > ctx->features); > uwq.ctx = ctx; > uwq.waken = false; > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 213cc569b192..27df0ca0a36a 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -478,7 +478,8 @@ struct vm_fault { > struct vm_area_struct *vma; /* Target VMA */ > gfp_t gfp_mask; /* gfp mask to be used for allocations */ > pgoff_t pgoff; /* Logical page offset based on vma */ > - unsigned long address; /* Faulting virtual address */ > + unsigned long address; /* Faulting virtual address - masked */ > + unsigned long real_address; /* Faulting virtual address - unmaked */ > }; > enum fault_flag flags; /* FAULT_FLAG_xxx flags > * XXX: should really be 'const' */ > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > index 05b31d60acf6..ef739054cb1c 100644 > --- a/include/uapi/linux/userfaultfd.h > +++ b/include/uapi/linux/userfaultfd.h > @@ -32,7 +32,8 @@ > UFFD_FEATURE_SIGBUS | \ > UFFD_FEATURE_THREAD_ID | \ > UFFD_FEATURE_MINOR_HUGETLBFS | \ > - UFFD_FEATURE_MINOR_SHMEM) > + UFFD_FEATURE_MINOR_SHMEM | \ > + UFFD_FEATURE_EXACT_ADDRESS) > #define UFFD_API_IOCTLS \ > ((__u64)1 << _UFFDIO_REGISTER | \ > (__u64)1 << _UFFDIO_UNREGISTER | \ > @@ -189,6 +190,10 @@ struct uffdio_api { > * > * UFFD_FEATURE_MINOR_SHMEM indicates the same support as > * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. > + * > + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page > + * faults would be provided and the offset within the page would not be > + * masked. > */ > #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) > #define UFFD_FEATURE_EVENT_FORK (1<<1) > @@ -201,6 +206,7 @@ struct uffdio_api { > #define UFFD_FEATURE_THREAD_ID (1<<8) > #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) > #define UFFD_FEATURE_MINOR_SHMEM (1<<10) > +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) > __u64 features; > > __u64 ioctls; > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index 61895cc01d09..16017f90568b 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -5342,6 +5342,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, > pgoff_t idx, > unsigned int flags, > unsigned long haddr, > + unsigned long addr, > unsigned long reason) > { > vm_fault_t ret; > @@ -5349,6 +5350,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, > struct vm_fault vmf = { > .vma = vma, > .address = haddr, > + .real_address = addr, > .flags = flags, > > /* > @@ -5417,7 +5419,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, > /* Check for page in userfault range */ > if (userfaultfd_missing(vma)) { > ret = hugetlb_handle_userfault(vma, mapping, idx, > - flags, haddr, > + flags, haddr, address, > VM_UFFD_MISSING); > goto out; > } > @@ -5481,7 +5483,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, > unlock_page(page); > put_page(page); > ret = hugetlb_handle_userfault(vma, mapping, idx, > - flags, haddr, > + flags, haddr, address, > VM_UFFD_MINOR); > goto out; > } > diff --git a/mm/memory.c b/mm/memory.c > index c125c4969913..aae53fde13d9 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4622,6 +4622,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, > struct vm_fault vmf = { > .vma = vma, > .address = address & PAGE_MASK, > + .real_address = address, > .flags = flags, > .pgoff = linear_page_index(vma, address), > .gfp_mask = __get_fault_gfp_mask(vma), > diff --git a/mm/swapfile.c b/mm/swapfile.c > index bf0df7aa7158..33c7abb16610 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -1951,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, > struct vm_fault vmf = { > .vma = vma, > .address = addr, > + .real_address = addr, > .pmd = pmd, > }; > > -- > 2.25.1 >
On Sat 26-02-22 02:26:55, Nadav Amit wrote: > From: Nadav Amit <namit@vmware.com> > > Userfaultfd is supposed to provide the full address (i.e., unmasked) of > the faulting access back to userspace. However, that is not the case for > quite some time. > > Even running "userfaultfd_demo" from the userfaultfd man page provides > the wrong output (and contradicts the man page). Notice that > "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) > and not the first read address (0x7fc5e30b300f). > > Address returned by mmap() = 0x7fc5e30b3000 > > fault_handler_thread(): > poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 > UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 > (uffdio_copy.copy returned 4096) > Read address 0x7fc5e30b300f in main(): A > Read address 0x7fc5e30b340f in main(): A > Read address 0x7fc5e30b380f in main(): A > Read address 0x7fc5e30b3c0f in main(): A > > The exact address is useful for various reasons and specifically for > prefetching decisions. If it is known that the memory is populated by > certain objects whose size is not page-aligned, then based on the > faulting address, the uffd-monitor can decide whether to prefetch and > prefault the adjacent page. > > This bug has been for quite some time in the kernel: since commit > 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") > vmf->virtual_address"), which dates back to 2016. A concern has been > raised that existing userspace application might rely on the old/wrong > behavior in which the address is masked. Therefore, it was suggested to > provide the masked address unless the user explicitly asks for the exact > address. > > Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct > userfaultfd to provide the exact address. Add a new "real_address" field > to vmf to hold the unmasked address. Provide the address to userspace > accordingly. > > Initialize real_address in various code-paths to be consistent with > address, even when it is not used, to be on the safe side. > > Acked-by: Peter Xu <peterx@redhat.com> > Reviewed-by: David Hildenbrand <david@redhat.com> > Cc: Andrea Arcangeli <aarcange@redhat.com> > Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> > Cc: Jan Kara <jack@suse.cz> > Signed-off-by: Nadav Amit <namit@vmware.com> > > --- > > v2->v3: > * Initialize real_address on all code paths [Jan] > > v1->v2: > * Add uffd feature to selectively enable [David, Andrea] I've just noticed one typo below. Otherwise the patch looks good to me. Feel free to add: Reviewed-by: Jan Kara <jack@suse.cz> > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 213cc569b192..27df0ca0a36a 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -478,7 +478,8 @@ struct vm_fault { > struct vm_area_struct *vma; /* Target VMA */ > gfp_t gfp_mask; /* gfp mask to be used for allocations */ > pgoff_t pgoff; /* Logical page offset based on vma */ > - unsigned long address; /* Faulting virtual address */ > + unsigned long address; /* Faulting virtual address - masked */ > + unsigned long real_address; /* Faulting virtual address - unmaked */ ^^ typo here Honza
On Sat, Feb 26, 2022 at 02:26:55AM +0000, Nadav Amit wrote: > From: Nadav Amit <namit@vmware.com> > > Userfaultfd is supposed to provide the full address (i.e., unmasked) of > the faulting access back to userspace. However, that is not the case for > quite some time. > > Even running "userfaultfd_demo" from the userfaultfd man page provides > the wrong output (and contradicts the man page). Notice that > "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) > and not the first read address (0x7fc5e30b300f). > > Address returned by mmap() = 0x7fc5e30b3000 > > fault_handler_thread(): > poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 > UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 > (uffdio_copy.copy returned 4096) > Read address 0x7fc5e30b300f in main(): A > Read address 0x7fc5e30b340f in main(): A > Read address 0x7fc5e30b380f in main(): A > Read address 0x7fc5e30b3c0f in main(): A > > The exact address is useful for various reasons and specifically for > prefetching decisions. If it is known that the memory is populated by > certain objects whose size is not page-aligned, then based on the > faulting address, the uffd-monitor can decide whether to prefetch and > prefault the adjacent page. > > This bug has been for quite some time in the kernel: since commit > 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") > vmf->virtual_address"), which dates back to 2016. A concern has been > raised that existing userspace application might rely on the old/wrong > behavior in which the address is masked. Therefore, it was suggested to > provide the masked address unless the user explicitly asks for the exact > address. > > Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct > userfaultfd to provide the exact address. Add a new "real_address" field > to vmf to hold the unmasked address. Provide the address to userspace > accordingly. > > Initialize real_address in various code-paths to be consistent with > address, even when it is not used, to be on the safe side. > > Acked-by: Peter Xu <peterx@redhat.com> > Reviewed-by: David Hildenbrand <david@redhat.com> > Cc: Andrea Arcangeli <aarcange@redhat.com> > Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> > Cc: Jan Kara <jack@suse.cz> > Signed-off-by: Nadav Amit <namit@vmware.com> Hi, Andrew, Just a heads-up that this version has not yet been updated in -mm I think, while the queued one is the old version. IOW, uffd is currently broken on latest linux-next on hugetlb. Thanks,
> On Mar 3, 2022, at 12:03 AM, Peter Xu <peterx@redhat.com> wrote: > > On Sat, Feb 26, 2022 at 02:26:55AM +0000, Nadav Amit wrote: >> From: Nadav Amit <namit@vmware.com> >> >> Userfaultfd is supposed to provide the full address (i.e., unmasked) of >> the faulting access back to userspace. However, that is not the case for >> quite some time. >> >> Even running "userfaultfd_demo" from the userfaultfd man page provides >> the wrong output (and contradicts the man page). Notice that >> "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) >> and not the first read address (0x7fc5e30b300f). >> >> Address returned by mmap() = 0x7fc5e30b3000 >> >> fault_handler_thread(): >> poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 >> UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 >> (uffdio_copy.copy returned 4096) >> Read address 0x7fc5e30b300f in main(): A >> Read address 0x7fc5e30b340f in main(): A >> Read address 0x7fc5e30b380f in main(): A >> Read address 0x7fc5e30b3c0f in main(): A >> >> The exact address is useful for various reasons and specifically for >> prefetching decisions. If it is known that the memory is populated by >> certain objects whose size is not page-aligned, then based on the >> faulting address, the uffd-monitor can decide whether to prefetch and >> prefault the adjacent page. >> >> This bug has been for quite some time in the kernel: since commit >> 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") >> vmf->virtual_address"), which dates back to 2016. A concern has been >> raised that existing userspace application might rely on the old/wrong >> behavior in which the address is masked. Therefore, it was suggested to >> provide the masked address unless the user explicitly asks for the exact >> address. >> >> Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct >> userfaultfd to provide the exact address. Add a new "real_address" field >> to vmf to hold the unmasked address. Provide the address to userspace >> accordingly. >> >> Initialize real_address in various code-paths to be consistent with >> address, even when it is not used, to be on the safe side. >> >> Acked-by: Peter Xu <peterx@redhat.com> >> Reviewed-by: David Hildenbrand <david@redhat.com> >> Cc: Andrea Arcangeli <aarcange@redhat.com> >> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> >> Cc: Jan Kara <jack@suse.cz> >> Signed-off-by: Nadav Amit <namit@vmware.com> > > Hi, Andrew, > > Just a heads-up that this version has not yet been updated in -mm I think, > while the queued one is the old version. > > IOW, uffd is currently broken on latest linux-next on hugetlb. Thanks Peter for reminding Andrew. Andrew, please acknowledge it would be queue for the next version and I will submit a patch to the man pages.
> On Mar 3, 2022, at 11:05 AM, Nadav Amit <namit@vmware.com> wrote: > > > >> On Mar 3, 2022, at 12:03 AM, Peter Xu <peterx@redhat.com> wrote: >> >> On Sat, Feb 26, 2022 at 02:26:55AM +0000, Nadav Amit wrote: >>> From: Nadav Amit <namit@vmware.com> >>> >>> Userfaultfd is supposed to provide the full address (i.e., unmasked) of >>> the faulting access back to userspace. However, that is not the case for >>> quite some time. >>> >>> Even running "userfaultfd_demo" from the userfaultfd man page provides >>> the wrong output (and contradicts the man page). Notice that >>> "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) >>> and not the first read address (0x7fc5e30b300f). >>> >>> Address returned by mmap() = 0x7fc5e30b3000 >>> >>> fault_handler_thread(): >>> poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 >>> UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 >>> (uffdio_copy.copy returned 4096) >>> Read address 0x7fc5e30b300f in main(): A >>> Read address 0x7fc5e30b340f in main(): A >>> Read address 0x7fc5e30b380f in main(): A >>> Read address 0x7fc5e30b3c0f in main(): A >>> >>> The exact address is useful for various reasons and specifically for >>> prefetching decisions. If it is known that the memory is populated by >>> certain objects whose size is not page-aligned, then based on the >>> faulting address, the uffd-monitor can decide whether to prefetch and >>> prefault the adjacent page. >>> >>> This bug has been for quite some time in the kernel: since commit >>> 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") >>> vmf->virtual_address"), which dates back to 2016. A concern has been >>> raised that existing userspace application might rely on the old/wrong >>> behavior in which the address is masked. Therefore, it was suggested to >>> provide the masked address unless the user explicitly asks for the exact >>> address. >>> >>> Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct >>> userfaultfd to provide the exact address. Add a new "real_address" field >>> to vmf to hold the unmasked address. Provide the address to userspace >>> accordingly. >>> >>> Initialize real_address in various code-paths to be consistent with >>> address, even when it is not used, to be on the safe side. >>> >>> Acked-by: Peter Xu <peterx@redhat.com> >>> Reviewed-by: David Hildenbrand <david@redhat.com> >>> Cc: Andrea Arcangeli <aarcange@redhat.com> >>> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> >>> Cc: Jan Kara <jack@suse.cz> >>> Signed-off-by: Nadav Amit <namit@vmware.com> >> >> Hi, Andrew, >> >> Just a heads-up that this version has not yet been updated in -mm I think, >> while the queued one is the old version. >> >> IOW, uffd is currently broken on latest linux-next on hugetlb. > > Thanks Peter for reminding Andrew. > > Andrew, please acknowledge it would be queue for the next version and > I will submit a patch to the man pages. Peter (et. al), I’ll send it in a more orderly fashion later, but let me know if I got something completely wrong for the man page change: [ Thanks as usual; sorry - limited experience changing man pages ] -- >8 -- From: Nadav Amit <namit@vmware.com> Date: Thu, 3 Mar 2022 19:44:37 +0000 Subject: [PATCH] ioctl_userfaultfd: add UFFD_FEATURE_EXACT_ADDRESS Describe the new UFFD_FEATURE_EXACT_ADDRESS API feature. Signed-off-by: Nadav Amit <namit@vmware.com> --- man2/ioctl_userfaultfd.2 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/man2/ioctl_userfaultfd.2 b/man2/ioctl_userfaultfd.2 index 504f61d4b..2d065504e 100644 --- a/man2/ioctl_userfaultfd.2 +++ b/man2/ioctl_userfaultfd.2 @@ -214,6 +214,12 @@ memory accesses to the regions registered with userfaultfd. If this feature bit is set, .I uffd_msg.pagefault.feat.ptid will be set to the faulted thread ID for each page-fault message. +.TP +.BR UFFD_FEATURE_EXACT_ADDRESS " (since Linux 5.18)" +If this feature bit is set, +.I uffd_msg.pagefault.address +will be set to the exact page-fault address that was reported by the hardware, +and will not mask the offset within the page. .PP The returned .I ioctls
On Thu, 3 Mar 2022 19:05:36 +0000 Nadav Amit <namit@vmware.com> wrote: > > > >> Reviewed-by: David Hildenbrand <david@redhat.com> > >> Cc: Andrea Arcangeli <aarcange@redhat.com> > >> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> > >> Cc: Jan Kara <jack@suse.cz> > >> Signed-off-by: Nadav Amit <namit@vmware.com> > > > > Hi, Andrew, > > > > Just a heads-up that this version has not yet been updated in -mm I think, > > while the queued one is the old version. > > > > IOW, uffd is currently broken on latest linux-next on hugetlb. > > Thanks Peter for reminding Andrew. > > Andrew, please acknowledge it would be queue for the next version and > I will submit a patch to the man pages. Queued now, thanks. Also a fix for Jan's comment typo.
On Thu, Mar 03, 2022 at 07:51:26PM +0000, Nadav Amit wrote: > Peter (et. al), > > I’ll send it in a more orderly fashion later, but let me know if I got > something completely wrong for the man page change: > > [ Thanks as usual; sorry - limited experience changing man pages ] > > -- >8 -- > > From: Nadav Amit <namit@vmware.com> > Date: Thu, 3 Mar 2022 19:44:37 +0000 > Subject: [PATCH] ioctl_userfaultfd: add UFFD_FEATURE_EXACT_ADDRESS > > Describe the new UFFD_FEATURE_EXACT_ADDRESS API feature. > > Signed-off-by: Nadav Amit <namit@vmware.com> > --- > man2/ioctl_userfaultfd.2 | 6 ++++++ > 1 file changed, 6 insertions(+) > > diff --git a/man2/ioctl_userfaultfd.2 b/man2/ioctl_userfaultfd.2 > index 504f61d4b..2d065504e 100644 > --- a/man2/ioctl_userfaultfd.2 > +++ b/man2/ioctl_userfaultfd.2 > @@ -214,6 +214,12 @@ memory accesses to the regions registered with userfaultfd. > If this feature bit is set, > .I uffd_msg.pagefault.feat.ptid > will be set to the faulted thread ID for each page-fault message. > +.TP > +.BR UFFD_FEATURE_EXACT_ADDRESS " (since Linux 5.18)" > +If this feature bit is set, > +.I uffd_msg.pagefault.address > +will be set to the exact page-fault address that was reported by the hardware, > +and will not mask the offset within the page. > .PP > The returned > .I ioctls > -- > 2.25.1 Looks good to me, thanks! Acked-by: Peter Xu <peterx@redhat.com>
On 03.03.22 20:51, Nadav Amit wrote: > > >> On Mar 3, 2022, at 11:05 AM, Nadav Amit <namit@vmware.com> wrote: >> >> >> >>> On Mar 3, 2022, at 12:03 AM, Peter Xu <peterx@redhat.com> wrote: >>> >>> On Sat, Feb 26, 2022 at 02:26:55AM +0000, Nadav Amit wrote: >>>> From: Nadav Amit <namit@vmware.com> >>>> >>>> Userfaultfd is supposed to provide the full address (i.e., unmasked) of >>>> the faulting access back to userspace. However, that is not the case for >>>> quite some time. >>>> >>>> Even running "userfaultfd_demo" from the userfaultfd man page provides >>>> the wrong output (and contradicts the man page). Notice that >>>> "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) >>>> and not the first read address (0x7fc5e30b300f). >>>> >>>> Address returned by mmap() = 0x7fc5e30b3000 >>>> >>>> fault_handler_thread(): >>>> poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 >>>> UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 >>>> (uffdio_copy.copy returned 4096) >>>> Read address 0x7fc5e30b300f in main(): A >>>> Read address 0x7fc5e30b340f in main(): A >>>> Read address 0x7fc5e30b380f in main(): A >>>> Read address 0x7fc5e30b3c0f in main(): A >>>> >>>> The exact address is useful for various reasons and specifically for >>>> prefetching decisions. If it is known that the memory is populated by >>>> certain objects whose size is not page-aligned, then based on the >>>> faulting address, the uffd-monitor can decide whether to prefetch and >>>> prefault the adjacent page. >>>> >>>> This bug has been for quite some time in the kernel: since commit >>>> 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") >>>> vmf->virtual_address"), which dates back to 2016. A concern has been >>>> raised that existing userspace application might rely on the old/wrong >>>> behavior in which the address is masked. Therefore, it was suggested to >>>> provide the masked address unless the user explicitly asks for the exact >>>> address. >>>> >>>> Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct >>>> userfaultfd to provide the exact address. Add a new "real_address" field >>>> to vmf to hold the unmasked address. Provide the address to userspace >>>> accordingly. >>>> >>>> Initialize real_address in various code-paths to be consistent with >>>> address, even when it is not used, to be on the safe side. >>>> >>>> Acked-by: Peter Xu <peterx@redhat.com> >>>> Reviewed-by: David Hildenbrand <david@redhat.com> >>>> Cc: Andrea Arcangeli <aarcange@redhat.com> >>>> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> >>>> Cc: Jan Kara <jack@suse.cz> >>>> Signed-off-by: Nadav Amit <namit@vmware.com> >>> >>> Hi, Andrew, >>> >>> Just a heads-up that this version has not yet been updated in -mm I think, >>> while the queued one is the old version. >>> >>> IOW, uffd is currently broken on latest linux-next on hugetlb. >> >> Thanks Peter for reminding Andrew. >> >> Andrew, please acknowledge it would be queue for the next version and >> I will submit a patch to the man pages. > > Peter (et. al), > > I’ll send it in a more orderly fashion later, but let me know if I got > something completely wrong for the man page change: > > [ Thanks as usual; sorry - limited experience changing man pages ] > > -- >8 -- > > From: Nadav Amit <namit@vmware.com> > Date: Thu, 3 Mar 2022 19:44:37 +0000 > Subject: [PATCH] ioctl_userfaultfd: add UFFD_FEATURE_EXACT_ADDRESS > > Describe the new UFFD_FEATURE_EXACT_ADDRESS API feature. > > Signed-off-by: Nadav Amit <namit@vmware.com> > --- > man2/ioctl_userfaultfd.2 | 6 ++++++ > 1 file changed, 6 insertions(+) > > diff --git a/man2/ioctl_userfaultfd.2 b/man2/ioctl_userfaultfd.2 > index 504f61d4b..2d065504e 100644 > --- a/man2/ioctl_userfaultfd.2 > +++ b/man2/ioctl_userfaultfd.2 > @@ -214,6 +214,12 @@ memory accesses to the regions registered with userfaultfd. > If this feature bit is set, > .I uffd_msg.pagefault.feat.ptid > will be set to the faulted thread ID for each page-fault message. > +.TP > +.BR UFFD_FEATURE_EXACT_ADDRESS " (since Linux 5.18)" > +If this feature bit is set, > +.I uffd_msg.pagefault.address > +will be set to the exact page-fault address that was reported by the hardware, > +and will not mask the offset within the page. > .PP > The returned > .I ioctls Do we want to add a comment about early uffd code that did this as well? "Note that old Linux versions might indicate the exact address as well, even though the feature bit is not set."
> On Mar 4, 2022, at 2:38 AM, David Hildenbrand <david@redhat.com> wrote: > > Do we want to add a comment about early uffd code that did this as well? > > "Note that old Linux versions might indicate the exact address as well, > even though the feature bit is not set." Thanks! I thought how to phrase something non-committing as you did and gave up. I will go with yours.
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e26b10132d47..826927026fe7 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, struct uffd_msg msg; msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; + + if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) + address &= PAGE_MASK; msg.arg.pagefault.address = address; /* * These flags indicate why the userfault occurred: @@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, + uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..27df0ca0a36a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -478,7 +478,8 @@ struct vm_fault { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ - unsigned long address; /* Faulting virtual address */ + unsigned long address; /* Faulting virtual address - masked */ + unsigned long real_address; /* Faulting virtual address - unmaked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 05b31d60acf6..ef739054cb1c 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -32,7 +32,8 @@ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ UFFD_FEATURE_MINOR_HUGETLBFS | \ - UFFD_FEATURE_MINOR_SHMEM) + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -189,6 +190,10 @@ struct uffdio_api { * * UFFD_FEATURE_MINOR_SHMEM indicates the same support as * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -201,6 +206,7 @@ struct uffdio_api { #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) #define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) __u64 features; __u64 ioctls; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61895cc01d09..16017f90568b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5342,6 +5342,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, pgoff_t idx, unsigned int flags, unsigned long haddr, + unsigned long addr, unsigned long reason) { vm_fault_t ret; @@ -5349,6 +5350,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = haddr, + .real_address = addr, .flags = flags, /* @@ -5417,7 +5419,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MISSING); goto out; } @@ -5481,7 +5483,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, unlock_page(page); put_page(page); ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MINOR); goto out; } diff --git a/mm/memory.c b/mm/memory.c index c125c4969913..aae53fde13d9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4622,6 +4622,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, + .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), diff --git a/mm/swapfile.c b/mm/swapfile.c index bf0df7aa7158..33c7abb16610 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1951,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct vm_fault vmf = { .vma = vma, .address = addr, + .real_address = addr, .pmd = pmd, };