Message ID | 20210916234100.122368-20-logang@deltatee.com (mailing list archive)
---|---
State | New
Series | Userspace P2PDMA with O_DIRECT NVMe devices
On Thu, Sep 16, 2021 at 05:40:59PM -0600, Logan Gunthorpe wrote: > Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap > a hunk of p2pmem into userspace. > > Pages are allocated from the genalloc in bulk and their reference count > incremented. They are returned to the genalloc when the page is put. > > The VMA does not take a reference to the pages when they are inserted > with vmf_insert_mixed() (which is necessary for zone device pages) so > the backing P2P memory is stored in a structures in vm_private_data. > > A pseudo mount is used to allocate an inode for each PCI device. The > inode's address_space is used in the file doing the mmap so that all > VMAs are collected and can be unmapped if the PCI device is unbound. > After unmapping, the VMAs are iterated through and their pages are > put so the device can continue to be unbound. An active flag is used > to signal to VMAs not to allocate any further P2P memory once the > removal process starts. The flag is synchronized with concurrent > access with an RCU lock. > > The VMAs and inode will survive after the unbind of the device, but no > pages will be present in the VMA and a subsequent access will result > in a SIGBUS error. > > Signed-off-by: Logan Gunthorpe <logang@deltatee.com> Acked-by: Bjorn Helgaas <bhelgaas@google.com> I would capitalize "Introduce" in the subject line. > --- > drivers/pci/p2pdma.c | 263 ++++++++++++++++++++++++++++++++++++- > include/linux/pci-p2pdma.h | 11 ++ > include/uapi/linux/magic.h | 1 + > 3 files changed, 273 insertions(+), 2 deletions(-) > > diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c > index 2422af5a529c..a5adf57af53a 100644 > --- a/drivers/pci/p2pdma.c > +++ b/drivers/pci/p2pdma.c > @@ -16,14 +16,19 @@ > #include <linux/genalloc.h> > #include <linux/memremap.h> > #include <linux/percpu-refcount.h> > +#include <linux/pfn_t.h> > +#include <linux/pseudo_fs.h> > #include <linux/random.h> > #include <linux/seq_buf.h> > #include <linux/xarray.h> > +#include <uapi/linux/magic.h> > > struct pci_p2pdma { > struct gen_pool *pool; > bool p2pmem_published; > struct xarray map_types; > + struct inode *inode; > + bool active; > }; > > struct pci_p2pdma_pagemap { > @@ -32,6 +37,14 @@ struct pci_p2pdma_pagemap { > u64 bus_offset; > }; > > +struct pci_p2pdma_map { > + struct kref ref; > + struct pci_dev *pdev; > + struct inode *inode; > + void *kaddr; > + size_t len; > +}; > + > static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) > { > return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap); > @@ -100,6 +113,26 @@ static const struct attribute_group p2pmem_group = { > .name = "p2pmem", > }; > > +/* > + * P2PDMA internal mount > + * Fake an internal VFS mount-point in order to allocate struct address_space > + * mappings to remove VMAs on unbind events. > + */ > +static int pci_p2pdma_fs_cnt; > +static struct vfsmount *pci_p2pdma_fs_mnt; > + > +static int pci_p2pdma_fs_init_fs_context(struct fs_context *fc) > +{ > + return init_pseudo(fc, P2PDMA_MAGIC) ? 
0 : -ENOMEM; > +} > + > +static struct file_system_type pci_p2pdma_fs_type = { > + .name = "p2dma", > + .owner = THIS_MODULE, > + .init_fs_context = pci_p2pdma_fs_init_fs_context, > + .kill_sb = kill_anon_super, > +}; > + > static void p2pdma_page_free(struct page *page) > { > struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap); > @@ -128,6 +161,9 @@ static void pci_p2pdma_release(void *data) > gen_pool_destroy(p2pdma->pool); > sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); > xa_destroy(&p2pdma->map_types); > + > + iput(p2pdma->inode); > + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt); > } > > static int pci_p2pdma_setup(struct pci_dev *pdev) > @@ -145,17 +181,32 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) > if (!p2p->pool) > goto out; > > - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); > + error = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt, > + &pci_p2pdma_fs_cnt); > if (error) > goto out_pool_destroy; > > + p2p->inode = alloc_anon_inode(pci_p2pdma_fs_mnt->mnt_sb); > + if (IS_ERR(p2p->inode)) { > + error = -ENOMEM; > + goto out_unpin_fs; > + } > + > + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); > + if (error) > + goto out_put_inode; > + > error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); > if (error) > - goto out_pool_destroy; > + goto out_put_inode; > > rcu_assign_pointer(pdev->p2pdma, p2p); > return 0; > > +out_put_inode: > + iput(p2p->inode); > +out_unpin_fs: > + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt); > out_pool_destroy: > gen_pool_destroy(p2p->pool); > out: > @@ -163,6 +214,45 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) > return error; > } > > +static void pci_p2pdma_map_free_pages(struct pci_p2pdma_map *pmap) > +{ > + int i; > + > + if (!pmap->kaddr) > + return; > + > + for (i = 0; i < pmap->len; i += PAGE_SIZE) > + put_page(virt_to_page(pmap->kaddr + i)); > + > + pmap->kaddr = NULL; > +} > + > +static void pci_p2pdma_free_mappings(struct address_space *mapping) > +{ > + struct vm_area_struct *vma; > + > + i_mmap_lock_write(mapping); > + if (RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) > + goto out; > + > + vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, -1) > + pci_p2pdma_map_free_pages(vma->vm_private_data); > + > +out: > + i_mmap_unlock_write(mapping); > +} > + > +static void pci_p2pdma_unmap_mappings(void *data) > +{ > + struct pci_dev *pdev = data; > + struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); > + > + p2pdma->active = false; > + synchronize_rcu(); > + unmap_mapping_range(p2pdma->inode->i_mapping, 0, 0, 1); > + pci_p2pdma_free_mappings(p2pdma->inode->i_mapping); > +} > + > /** > * pci_p2pdma_add_resource - add memory for use as p2p memory > * @pdev: the device to add the memory to > @@ -221,6 +311,11 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, > goto pgmap_free; > } > > + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, > + pdev); > + if (error) > + goto pages_free; > + > p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); > error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr, > pci_bus_address(pdev, bar) + offset, > @@ -229,6 +324,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, > if (error) > goto pages_free; > > + p2pdma->active = true; > pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n", > pgmap->range.start, pgmap->range.end); > > @@ -1029,3 +1125,166 @@ ssize_t pci_p2pdma_enable_show(char *page, struct 
pci_dev *p2p_dev, > return sprintf(page, "%s\n", pci_name(p2p_dev)); > } > EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show); > + > +static struct pci_p2pdma_map *pci_p2pdma_map_alloc(struct pci_dev *pdev, > + size_t len) > +{ > + struct pci_p2pdma_map *pmap; > + > + pmap = kzalloc(sizeof(*pmap), GFP_KERNEL); > + if (!pmap) > + return NULL; > + > + kref_init(&pmap->ref); > + pmap->pdev = pci_dev_get(pdev); > + pmap->len = len; > + > + return pmap; > +} > + > +static void pci_p2pdma_map_free(struct kref *ref) > +{ > + struct pci_p2pdma_map *pmap = > + container_of(ref, struct pci_p2pdma_map, ref); > + > + pci_p2pdma_map_free_pages(pmap); > + pci_dev_put(pmap->pdev); > + iput(pmap->inode); > + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt); > + kfree(pmap); > +} > + > +static void pci_p2pdma_vma_open(struct vm_area_struct *vma) > +{ > + struct pci_p2pdma_map *pmap = vma->vm_private_data; > + > + kref_get(&pmap->ref); > +} > + > +static void pci_p2pdma_vma_close(struct vm_area_struct *vma) > +{ > + struct pci_p2pdma_map *pmap = vma->vm_private_data; > + > + kref_put(&pmap->ref, pci_p2pdma_map_free); > +} > + > +static vm_fault_t pci_p2pdma_vma_fault(struct vm_fault *vmf) > +{ > + struct pci_p2pdma_map *pmap = vmf->vma->vm_private_data; > + struct pci_p2pdma *p2pdma; > + void *vaddr; > + pfn_t pfn; > + int i; > + > + if (!pmap->kaddr) { > + rcu_read_lock(); > + p2pdma = rcu_dereference(pmap->pdev->p2pdma); > + if (!p2pdma) > + goto err_out; > + > + if (!p2pdma->active) > + goto err_out; > + > + pmap->kaddr = (void *)gen_pool_alloc(p2pdma->pool, pmap->len); > + if (!pmap->kaddr) > + goto err_out; > + > + for (i = 0; i < pmap->len; i += PAGE_SIZE) > + get_page(virt_to_page(pmap->kaddr + i)); > + > + rcu_read_unlock(); > + } > + > + vaddr = pmap->kaddr + (vmf->pgoff << PAGE_SHIFT); > + pfn = phys_to_pfn_t(virt_to_phys(vaddr), PFN_DEV | PFN_MAP); > + > + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); > + > +err_out: > + rcu_read_unlock(); > + return VM_FAULT_SIGBUS; > +} > +static const struct vm_operations_struct pci_p2pdma_vmops = { > + .open = pci_p2pdma_vma_open, > + .close = pci_p2pdma_vma_close, > + .fault = pci_p2pdma_vma_fault, > +}; > + > +/** > + * pci_p2pdma_mmap_file_open - setup file mapping to store P2PMEM VMAs > + * @pdev: the device to allocate memory from > + * @vma: the userspace vma to map the memory to > + * > + * Set f_mapping of the file to the p2pdma inode so that mappings > + * are can be torn down on device unbind. > + * > + * Returns 0 on success, or a negative error code on failure > + */ > +void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, struct file *file) > +{ > + struct pci_p2pdma *p2pdma; > + > + rcu_read_lock(); > + p2pdma = rcu_dereference(pdev->p2pdma); > + if (p2pdma) > + file->f_mapping = p2pdma->inode->i_mapping; > + rcu_read_unlock(); > +} > +EXPORT_SYMBOL_GPL(pci_p2pdma_mmap_file_open); > + > +/** > + * pci_mmap_p2pmem - setup an mmap region to be backed with P2PDMA memory > + * that was registered with pci_p2pdma_add_resource() > + * @pdev: the device to allocate memory from > + * @vma: the userspace vma to map the memory to > + * > + * The file must call pci_p2pdma_mmap_file_open() in its open() operation. 
> + * > + * Returns 0 on success, or a negative error code on failure > + */ > +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma) > +{ > + struct pci_p2pdma_map *pmap; > + struct pci_p2pdma *p2pdma; > + int ret; > + > + /* prevent private mappings from being established */ > + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { > + pci_info_ratelimited(pdev, > + "%s: fail, attempted private mapping\n", > + current->comm); > + return -EINVAL; > + } > + > + pmap = pci_p2pdma_map_alloc(pdev, vma->vm_end - vma->vm_start); > + if (!pmap) > + return -ENOMEM; > + > + rcu_read_lock(); > + p2pdma = rcu_dereference(pdev->p2pdma); > + if (!p2pdma) { > + ret = -ENODEV; > + goto out; > + } > + > + ret = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt, > + &pci_p2pdma_fs_cnt); > + if (ret) > + goto out; > + > + ihold(p2pdma->inode); > + pmap->inode = p2pdma->inode; > + rcu_read_unlock(); > + > + vma->vm_flags |= VM_MIXEDMAP; > + vma->vm_private_data = pmap; > + vma->vm_ops = &pci_p2pdma_vmops; > + > + return 0; > + > +out: > + rcu_read_unlock(); > + kfree(pmap); > + return ret; > +} > +EXPORT_SYMBOL_GPL(pci_mmap_p2pmem); > diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h > index 0c33a40a86e7..f9f19f3db676 100644 > --- a/include/linux/pci-p2pdma.h > +++ b/include/linux/pci-p2pdma.h > @@ -81,6 +81,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, > bool *use_p2pdma); > ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, > bool use_p2pdma); > +void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, struct file *file); > +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma); > #else /* CONFIG_PCI_P2PDMA */ > static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, > size_t size, u64 offset) > @@ -152,6 +154,15 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, > { > return sprintf(page, "none\n"); > } > +static inline void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, > + struct file *file) > +{ > +} > +static inline int pci_mmap_p2pmem(struct pci_dev *pdev, > + struct vm_area_struct *vma) > +{ > + return -EOPNOTSUPP; > +} > #endif /* CONFIG_PCI_P2PDMA */ > > > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h > index 35687dcb1a42..af737842c56f 100644 > --- a/include/uapi/linux/magic.h > +++ b/include/uapi/linux/magic.h > @@ -88,6 +88,7 @@ > #define BPF_FS_MAGIC 0xcafe4a11 > #define AAFS_MAGIC 0x5a3c69f0 > #define ZONEFS_MAGIC 0x5a4f4653 > +#define P2PDMA_MAGIC 0x70327064 > > /* Since UDF 2.01 is ISO 13346 based... */ > #define UDF_SUPER_MAGIC 0x15013346 > -- > 2.30.2 >
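For orientation, below is a minimal sketch of how a consumer driver might wire the two new helpers into a char device, as the kernel-doc above describes: pci_p2pdma_mmap_file_open() from .open and pci_mmap_p2pmem() from .mmap. The p2pmem_dev structure, the miscdevice wiring, the device node name and every other name besides those two pci_* calls are hypothetical and not part of the patch.

```c
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>

struct p2pmem_dev {
	struct miscdevice misc;
	struct pci_dev *pdev;
};

static int p2pmem_open(struct inode *inode, struct file *file)
{
	/* misc_open() stashed the miscdevice in file->private_data */
	struct p2pmem_dev *p = container_of(file->private_data,
					    struct p2pmem_dev, misc);

	file->private_data = p;

	/*
	 * Redirect f_mapping to the p2pdma inode so that an unbind of the
	 * PCI device can find and zap every VMA created on this file.
	 */
	pci_p2pdma_mmap_file_open(p->pdev, file);
	return 0;
}

static int p2pmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct p2pmem_dev *p = file->private_data;

	/* Back the VMA with p2pmem; pages are faulted in on first access */
	return pci_mmap_p2pmem(p->pdev, vma);
}

static const struct file_operations p2pmem_fops = {
	.owner = THIS_MODULE,
	.open = p2pmem_open,
	.mmap = p2pmem_mmap,
};

static int p2pmem_register(struct p2pmem_dev *p, struct pci_dev *pdev)
{
	p->pdev = pdev;
	p->misc.minor = MISC_DYNAMIC_MINOR;
	p->misc.name = "p2pmem0";	/* hypothetical device node name */
	p->misc.fops = &p2pmem_fops;
	return misc_register(&p->misc);
}
```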
On Thu, Sep 16, 2021 at 05:40:59PM -0600, Logan Gunthorpe wrote: > +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma) > +{ > + struct pci_p2pdma_map *pmap; > + struct pci_p2pdma *p2pdma; > + int ret; > + > + /* prevent private mappings from being established */ > + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { > + pci_info_ratelimited(pdev, > + "%s: fail, attempted private mapping\n", > + current->comm); > + return -EINVAL; > + } > + > + pmap = pci_p2pdma_map_alloc(pdev, vma->vm_end - vma->vm_start); > + if (!pmap) > + return -ENOMEM; > + > + rcu_read_lock(); > + p2pdma = rcu_dereference(pdev->p2pdma); > + if (!p2pdma) { > + ret = -ENODEV; > + goto out; > + } > + > + ret = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt, > + &pci_p2pdma_fs_cnt); > + if (ret) > + goto out; > + > + ihold(p2pdma->inode); > + pmap->inode = p2pdma->inode; > + rcu_read_unlock(); > + > + vma->vm_flags |= VM_MIXEDMAP; Why is this a VM_MIXEDMAP? Everything fault sticks in here has a struct page, right? Jason
On Thu, Sep 16, 2021 at 05:40:59PM -0600, Logan Gunthorpe wrote: > +static void pci_p2pdma_unmap_mappings(void *data) > +{ > + struct pci_dev *pdev = data; > + struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); > + > + p2pdma->active = false; > + synchronize_rcu(); > + unmap_mapping_range(p2pdma->inode->i_mapping, 0, 0, 1); > + pci_p2pdma_free_mappings(p2pdma->inode->i_mapping); > +} If this is going to rely on unmap_mapping_range then GUP should also reject this memory for FOLL_LONGTERM.. What along this control flow: > + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, > + pdev); Waits for all the page refcounts to go to zero? Jason
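For reference, the FOLL_LONGTERM rejection being asked for could take roughly the shape below. This is an illustrative helper, not code from the series; is_pci_p2pdma_page() and FOLL_LONGTERM are existing kernel symbols, but where such a check would actually be called from is left open here.

```c
#include <linux/mm.h>

/*
 * Sketch of a check a GUP path could apply: P2PDMA pages disappear when
 * the providing PCI device unbinds, so they cannot back a long-term pin.
 */
static inline bool p2pdma_longterm_pin_disallowed(struct page *page,
						  unsigned int gup_flags)
{
	return (gup_flags & FOLL_LONGTERM) && is_pci_p2pdma_page(page);
}
```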
On 2021-09-28 1:55 p.m., Jason Gunthorpe wrote: > On Thu, Sep 16, 2021 at 05:40:59PM -0600, Logan Gunthorpe wrote: >> +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma) >> +{ >> + struct pci_p2pdma_map *pmap; >> + struct pci_p2pdma *p2pdma; >> + int ret; >> + >> + /* prevent private mappings from being established */ >> + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { >> + pci_info_ratelimited(pdev, >> + "%s: fail, attempted private mapping\n", >> + current->comm); >> + return -EINVAL; >> + } >> + >> + pmap = pci_p2pdma_map_alloc(pdev, vma->vm_end - vma->vm_start); >> + if (!pmap) >> + return -ENOMEM; >> + >> + rcu_read_lock(); >> + p2pdma = rcu_dereference(pdev->p2pdma); >> + if (!p2pdma) { >> + ret = -ENODEV; >> + goto out; >> + } >> + >> + ret = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt, >> + &pci_p2pdma_fs_cnt); >> + if (ret) >> + goto out; >> + >> + ihold(p2pdma->inode); >> + pmap->inode = p2pdma->inode; >> + rcu_read_unlock(); >> + >> + vma->vm_flags |= VM_MIXEDMAP; > > Why is this a VM_MIXEDMAP? Everything fault sticks in here has a > struct page, right? Yes. This decision is not so simple, I tried a few variations before settling on this. The main reason is probably this: if we don't use VM_MIXEDMAP, then we can't set pte_devmap(). If we don't set pte_devmap(), then every single page that GUP processes needs to check if it's a ZONE_DEVICE page and also if it's a P2PDMA page (thus dereferencing pgmap) in order to satisfy the requirements of FOLL_PCI_P2PDMA. I didn't think other developers would go for that kind of performance hit. With VM_MIXEDMAP we hide the performance penalty behind the existing check. And with the current pgmap code as is, we only need to do that check once for every new pgmap pointer. Logan
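To make the performance argument concrete, the gup-fast PTE walk has roughly the shape sketched below. This is a simplified sketch, not a verbatim copy of mm/gup.c, and the FOLL_PCI_P2PDMA handling shown is only an approximation of what the series adds; the point is that the pgmap dereference, and with it the P2PDMA policy check, sits entirely behind pte_devmap(), so ordinary pages never pay for it.

```c
/*
 * Simplified sketch of a gup-fast PTE check (assumes the series'
 * FOLL_PCI_P2PDMA flag). Returns false if the walker must bail out
 * and fall back to the slow path.
 */
static bool gup_fast_pte_allowed(pte_t pte, unsigned int flags,
				 struct dev_pagemap **pgmap)
{
	if (pte_devmap(pte)) {
		/* ZONE_DEVICE PTE: look up its pgmap (cached across PTEs) */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (!*pgmap)
			return false;

		/* Only callers passing FOLL_PCI_P2PDMA may get P2P pages */
		if (!(flags & FOLL_PCI_P2PDMA) &&
		    (*pgmap)->type == MEMORY_DEVICE_PCI_P2PDMA)
			return false;

		return true;
	}

	/* Special PTEs have no struct page to take a reference on */
	return !pte_special(pte);
}
```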
On 2021-09-28 2:05 p.m., Jason Gunthorpe wrote: > On Thu, Sep 16, 2021 at 05:40:59PM -0600, Logan Gunthorpe wrote: > >> +static void pci_p2pdma_unmap_mappings(void *data) >> +{ >> + struct pci_dev *pdev = data; >> + struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); >> + >> + p2pdma->active = false; >> + synchronize_rcu(); >> + unmap_mapping_range(p2pdma->inode->i_mapping, 0, 0, 1); >> + pci_p2pdma_free_mappings(p2pdma->inode->i_mapping); >> +} > > If this is going to rely on unmap_mapping_range then GUP should also > reject this memory for FOLL_LONGTERM.. Right, makes sense. > > What along this control flow: > >> + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, >> + pdev); > > Waits for all the page refcounts to go to zero? That's already in the existing code as part of memunmap_pages() which puts the original reference to all the pages and then waits for the reference to go to zero. This new action unmaps all the VMAs so that the subsequent call to memunmap_pages() doesn't block on userspace processes. Logan
On Wed, Sep 29, 2021 at 03:42:00PM -0600, Logan Gunthorpe wrote: > The main reason is probably this: if we don't use VM_MIXEDMAP, then we > can't set pte_devmap(). I think that is an API limitation in the fault routines.. finish_fault() should set the pte_devmap - eg by passing the PFN_DEV|PFN_MAP somehow through the vma->vm_page_prot to mk_pte() or otherwise signaling do_set_pte() that it should set those PTE bits when it creates the entry. (or there should be a vmf_* helper for this special case, but using the vmf->page seems righter to me) > If we don't set pte_devmap(), then every single page that GUP > processes needs to check if it's a ZONE_DEVICE page and also if it's > a P2PDMA page (thus dereferencing pgmap) in order to satisfy the > requirements of FOLL_PCI_P2PDMA. Definately not suggesting not to set pte_devmap(), only that VM_MIXEDMAP should not be set on VMAs that only contain struct pages. That is an abuse of what it is intended for. At the very least there should be a big comment above the usage explaining that this is just working around a limitation in finish_fault() where it cannot set the PFN_DEV|PFN_MAP bits today. Jason
On 2021-09-29 5:05 p.m., Jason Gunthorpe wrote: > On Wed, Sep 29, 2021 at 03:42:00PM -0600, Logan Gunthorpe wrote: > >> The main reason is probably this: if we don't use VM_MIXEDMAP, then we >> can't set pte_devmap(). > > I think that is an API limitation in the fault routines.. > > finish_fault() should set the pte_devmap - eg by passing the > PFN_DEV|PFN_MAP somehow through the vma->vm_page_prot to mk_pte() or > otherwise signaling do_set_pte() that it should set those PTE bits > when it creates the entry. > > (or there should be a vmf_* helper for this special case, but using > the vmf->page seems righter to me) I'm not opposed to this. Though I'm not sure what's best here. >> If we don't set pte_devmap(), then every single page that GUP >> processes needs to check if it's a ZONE_DEVICE page and also if it's >> a P2PDMA page (thus dereferencing pgmap) in order to satisfy the >> requirements of FOLL_PCI_P2PDMA. > > Definately not suggesting not to set pte_devmap(), only that > VM_MIXEDMAP should not be set on VMAs that only contain struct > pages. That is an abuse of what it is intended for. > > At the very least there should be a big comment above the usage > explaining that this is just working around a limitation in > finish_fault() where it cannot set the PFN_DEV|PFN_MAP bits today. Is it? Documentation on vmf_insert_mixed() and VM_MIXEDMAP is not good and the intention is not clear. I got the impression that mm people wanted those interfaces used for users of pte_devmap(). device-dax uses these interfaces and as far as I can see it also only contains struct pages (or at least dev_dax_huge_fault() calls pfn_to_page() on every page when VM_FAULT_NOPAGE happens). So it would be nice to get some direction here from mm developers on what they'd prefer. Logan
On Wed, Sep 29, 2021 at 05:27:22PM -0600, Logan Gunthorpe wrote: > > finish_fault() should set the pte_devmap - eg by passing the > > PFN_DEV|PFN_MAP somehow through the vma->vm_page_prot to mk_pte() or > > otherwise signaling do_set_pte() that it should set those PTE bits > > when it creates the entry. > > > > (or there should be a vmf_* helper for this special case, but using > > the vmf->page seems righter to me) > > I'm not opposed to this. Though I'm not sure what's best here. > > >> If we don't set pte_devmap(), then every single page that GUP > >> processes needs to check if it's a ZONE_DEVICE page and also if it's > >> a P2PDMA page (thus dereferencing pgmap) in order to satisfy the > >> requirements of FOLL_PCI_P2PDMA. > > > > Definately not suggesting not to set pte_devmap(), only that > > VM_MIXEDMAP should not be set on VMAs that only contain struct > > pages. That is an abuse of what it is intended for. > > > > At the very least there should be a big comment above the usage > > explaining that this is just working around a limitation in > > finish_fault() where it cannot set the PFN_DEV|PFN_MAP bits today. > > Is it? Documentation on vmf_insert_mixed() and VM_MIXEDMAP is not good > and the intention is not clear. I got the impression that mm people > wanted those interfaces used for users of pte_devmap(). I thought VM_MIXEDMAP was quite clear: #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ This VMA does not include PFN pages, so it should not be tagged VM_MIXEDMAP. Aside from enabling the special vmf_ API, it only controls some special behavior in vm_normal_page: * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct * page (that is, those where pfn_valid is true) are refcounted and considered * normal pages by the VM. The disadvantage is that pages are refcounted * (which can be slower and simply not an option for some PFNMAP users). The * advantage is that we don't have to follow the strict linearity rule of * PFNMAP mappings in order to support COWable mappings. Which again does not describe this case. > device-dax uses these interfaces and as far as I can see it also only > contains struct pages (or at least dev_dax_huge_fault() calls > pfn_to_page() on every page when VM_FAULT_NOPAGE happens). hacky hacky :) I think DAX probably did it that way for the same reason you are doing it that way - no other choice without changing something Jason
On 2021-09-29 5:35 p.m., Jason Gunthorpe wrote: > On Wed, Sep 29, 2021 at 05:27:22PM -0600, Logan Gunthorpe wrote: > >>> finish_fault() should set the pte_devmap - eg by passing the >>> PFN_DEV|PFN_MAP somehow through the vma->vm_page_prot to mk_pte() or >>> otherwise signaling do_set_pte() that it should set those PTE bits >>> when it creates the entry. >>> >>> (or there should be a vmf_* helper for this special case, but using >>> the vmf->page seems righter to me) >> >> I'm not opposed to this. Though I'm not sure what's best here. >> >>>> If we don't set pte_devmap(), then every single page that GUP >>>> processes needs to check if it's a ZONE_DEVICE page and also if it's >>>> a P2PDMA page (thus dereferencing pgmap) in order to satisfy the >>>> requirements of FOLL_PCI_P2PDMA. >>> >>> Definately not suggesting not to set pte_devmap(), only that >>> VM_MIXEDMAP should not be set on VMAs that only contain struct >>> pages. That is an abuse of what it is intended for. >>> >>> At the very least there should be a big comment above the usage >>> explaining that this is just working around a limitation in >>> finish_fault() where it cannot set the PFN_DEV|PFN_MAP bits today. >> >> Is it? Documentation on vmf_insert_mixed() and VM_MIXEDMAP is not good >> and the intention is not clear. I got the impression that mm people >> wanted those interfaces used for users of pte_devmap(). > > I thought VM_MIXEDMAP was quite clear: > > #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ > > This VMA does not include PFN pages, so it should not be tagged > VM_MIXEDMAP. > > Aside from enabling the special vmf_ API, it only controls some > special behavior in vm_normal_page: > > * VM_MIXEDMAP mappings can likewise contain memory with or without "struct > * page" backing, however the difference is that _all_ pages with a struct > * page (that is, those where pfn_valid is true) are refcounted and considered > * normal pages by the VM. The disadvantage is that pages are refcounted > * (which can be slower and simply not an option for some PFNMAP users). The > * advantage is that we don't have to follow the strict linearity rule of > * PFNMAP mappings in order to support COWable mappings. > > Which again does not describe this case. Some of this seems out of date. Pretty sure the pages are not refcounted with vmf_insert_mixed() and vmf_insert_mixed() is currently the only way to use VM_MIXEDMAP mappings. >> device-dax uses these interfaces and as far as I can see it also only >> contains struct pages (or at least dev_dax_huge_fault() calls >> pfn_to_page() on every page when VM_FAULT_NOPAGE happens). > > hacky hacky :) > > I think DAX probably did it that way for the same reason you are > doing it that way - no other choice without changing something Sure but if you look at other vmf_insert_mixed() (of which there are few) you see similar patterns. Seems more like it was documented with one thing in mind but then used in a completely different manner. Which is why I suggested the documentation was not so good. Logan
On Wed, Sep 29, 2021 at 05:49:36PM -0600, Logan Gunthorpe wrote: > Some of this seems out of date. Pretty sure the pages are not refcounted > with vmf_insert_mixed() and vmf_insert_mixed() is currently the only way > to use VM_MIXEDMAP mappings. Hum. vmf_insert_mixed() boils down to insert_pfn() which always sets the special bit, so vm_normal_page() returns NULL and thus the pages are not freed during zap. So, if the pages are always special and not refcounted all the docs seem really out of date - or rather they describe the situation without the special bit, I think. Why would DAX want to do this in the first place?? This means the address space zap is much more important that just speeding up destruction, it is essential for correctness since the PTEs are not holding refcounts naturally... Sigh. Jason
On Wed, Sep 29, 2021 at 09:36:52PM -0300, Jason Gunthorpe wrote: > Why would DAX want to do this in the first place?? This means the > address space zap is much more important that just speeding up > destruction, it is essential for correctness since the PTEs are not > holding refcounts naturally... It is not really for this series to fix, but I think the whole thing is probably racy once you start allowing pte_special pages to be accessed by GUP. If we look at unmapping the PTE relative to GUP fast the important sequence is how the TLB flushing doesn't decrement the page refcount until after it knows any concurrent GUP fast is completed. This is arch specific, eg it could be done async through a call_rcu handler. This ensures that pages can't cross back into the free pool and be reallocated until we know for certain that nobody is walking the PTEs and could potentially take an additional reference on it. The scheme cannot rely on the page refcount being 0 because oce it goes into the free pool it could be immeidately reallocated back to a non-zero refcount. A DAX user that simply does an address space invalidation doesn't sequence itself with any of this mechanism. So we can race with a thread doing GUP fast and another thread re-cycling the page into another use - creating a leakage of the page from one security context to another. This seems to be made worse for the pgmap stuff due to the wonky refcount usage - at least if the refcount had dropped to zero gup fast would be blocked for a time, but even that doesn't happen. In short, I think using pg special for anything that can be returned by gup fast (and maybe even gup!) is racy/wrong. We must have the normal refcount mechanism work for correctness of the recycling flow. I don't know why DAX did this, I think we should be talking about udoing it all of it, not just the wonky refcounting Alistair and Felix are working on, but also the use of MIXEDMAP and pte special for struct page backed memory. Jason
On 2021-10-01 7:48 a.m., Jason Gunthorpe wrote: > On Wed, Sep 29, 2021 at 09:36:52PM -0300, Jason Gunthorpe wrote: > >> Why would DAX want to do this in the first place?? This means the >> address space zap is much more important that just speeding up >> destruction, it is essential for correctness since the PTEs are not >> holding refcounts naturally... > > It is not really for this series to fix, but I think the whole thing > is probably racy once you start allowing pte_special pages to be > accessed by GUP. > > If we look at unmapping the PTE relative to GUP fast the important > sequence is how the TLB flushing doesn't decrement the page refcount > until after it knows any concurrent GUP fast is completed. This is > arch specific, eg it could be done async through a call_rcu handler. > > This ensures that pages can't cross back into the free pool and be > reallocated until we know for certain that nobody is walking the PTEs > and could potentially take an additional reference on it. The scheme > cannot rely on the page refcount being 0 because oce it goes into the > free pool it could be immeidately reallocated back to a non-zero > refcount. > > A DAX user that simply does an address space invalidation doesn't > sequence itself with any of this mechanism. So we can race with a > thread doing GUP fast and another thread re-cycling the page into > another use - creating a leakage of the page from one security context > to another. > > This seems to be made worse for the pgmap stuff due to the wonky > refcount usage - at least if the refcount had dropped to zero gup fast > would be blocked for a time, but even that doesn't happen. > > In short, I think using pg special for anything that can be returned > by gup fast (and maybe even gup!) is racy/wrong. We must have the > normal refcount mechanism work for correctness of the recycling flow. I'm not quite following all of this. I'm not entirely sure how fs/dax works in this regard, but for device-dax (and similarly p2pdma) it doesn't seem as bad as you say. In device-dax, the refcount is only used to prevent the device, and therefore the pages, from going away on device unbind. Pages cannot be recycled, as you say, as they are mapped linearly within the device. The address space invalidation is done only when the device is unbound. Before the invalidation, an active flag is cleared to ensure no new mappings can be created while the unmap is proceeding. unmap_mapping_range() should sequence itself with the TLB flush and GUP-fast using the same mechanism it does for regular pages. As far as I can see, by the time unmap_mapping_range() returns, we should be confident that there are no pages left in any mapping (seeing no new pages could be added since before the call). Then before finishing the unbind, device-dax decrements the refcount of all pages and then waits for the refcount of all pages to go to zero. Thus, any pages that successfully were got with GUP, during or before unmap_mapping_range should hold a reference and once all those references are returned, unbind can finish. P2PDMA follows this pattern, except pages are not mapped linearly and are returned to the genalloc when their refcount falls to 1. This only happens after a VMA is closed which should imply the PTEs have already been unlinked from the pages. And the same situation occurs on unbind with a flag preventing new mappings from being created before unmap_mapping_range(), etc. Not to say that all this couldn't use a big conceptual cleanup. 
A similar question exists with the single find_special_page() user (xen/gntdev) and it's definitely not clear what the differences are between the find_special_page() and vmf_insert_mixed() techniques and when one should be used over the other. Or could they both be merged to use the same technique? Logan
On Fri, Oct 01, 2021 at 11:01:49AM -0600, Logan Gunthorpe wrote: > In device-dax, the refcount is only used to prevent the device, and > therefore the pages, from going away on device unbind. Pages cannot be > recycled, as you say, as they are mapped linearly within the device. The > address space invalidation is done only when the device is unbound. By address space invalidation I mean invalidation of the VMA that is pointing to those pages. device-dax may not have a issue with use-after-VMA-invalidation by it's very nature since every PFN always points to the same thing. fsdax and this p2p stuff are different though. > Before the invalidation, an active flag is cleared to ensure no new > mappings can be created while the unmap is proceeding. > unmap_mapping_range() should sequence itself with the TLB flush and AFIAK unmap_mapping_range() kicks off the TLB flush and then returns. It doesn't always wait for the flush to fully finish. Ie some cases use RCU to lock the page table against GUP fast and so the put_page() doesn't happen until the call_rcu completes - after a grace period. The unmap_mapping_range() does not wait for grace periods. This is why for normal memory the put_page is done after the TLB flush completes, not when unmap_mapping_range() finishes. This ensures that before the refcount reaches 0 no concurrent GUP fast can still observe the old PTEs. > GUP-fast using the same mechanism it does for regular pages. As far as I > can see, by the time unmap_mapping_range() returns, we should be > confident that there are no pages left in any mapping (seeing no new > pages could be added since before the call). When viewed under the page table locks this is true, but the 'fast' walkers like gup_fast and hmm_range_fault can continue to be working on old data in the ptes because they don't take the page table locks. They interact with unmap_mapping_range() via the IPI/rcu (gup fast) or mmu notifier sequence count (hmm_range_fault) > P2PDMA follows this pattern, except pages are not mapped linearly and > are returned to the genalloc when their refcount falls to 1. This only > happens after a VMA is closed which should imply the PTEs have already > been unlinked from the pages. And here is the problem, since the genalloc is being used we now care that a page should not continue to be accessed by userspace after it has be placed back into the genalloc. I suppose fsdax has the same basic issue too. > Not to say that all this couldn't use a big conceptual cleanup. A > similar question exists with the single find_special_page() user > (xen/gntdev) and it's definitely not clear what the differences are > between the find_special_page() and vmf_insert_mixed() techniques and > when one should be used over the other. Or could they both be merged to > use the same technique? Oh that gntdev stuff is just nonsense. IIRC is trying to delegate control over a PTE entry itself to the hypervisor. /* * gntdev takes the address of the PTE in find_grant_ptes() and * passes it to the hypervisor in gntdev_map_grant_pages(). The * purpose of the notifier is to prevent the hypervisor pointer * to the PTE from going stale. * * Since this vma's mappings can't be touched without the * mmap_lock, and we are holding it now, there is no need for * the notifier_range locking pattern. I vaugely recall it stuffs in a normal page then has the hypervisor overwrite the PTE. When it comes time to free the PTE it recovers the normal page via the 'find_special_page' hack and frees it. 
Somehow the hypervisor is also using the normal page for something. It is all very strange and one shouldn't think about it :| Jason
On 2021-10-01 11:45 a.m., Jason Gunthorpe wrote: >> Before the invalidation, an active flag is cleared to ensure no new >> mappings can be created while the unmap is proceeding. >> unmap_mapping_range() should sequence itself with the TLB flush and > > AFIAK unmap_mapping_range() kicks off the TLB flush and then > returns. It doesn't always wait for the flush to fully finish. Ie some > cases use RCU to lock the page table against GUP fast and so the > put_page() doesn't happen until the call_rcu completes - after a grace > period. The unmap_mapping_range() does not wait for grace periods. Admittedly, the tlb flush code isn't the easiest code to understand. But, yes it seems at least on some arches the pages are freed by call_rcu(). But can't this be fixed easily by adding a synchronize_rcu() call after calling unmap_mapping_range()? Certainly after a synchronize_rcu(), the TLB has been flushed and it is safe to free those pages. >> P2PDMA follows this pattern, except pages are not mapped linearly and >> are returned to the genalloc when their refcount falls to 1. This only >> happens after a VMA is closed which should imply the PTEs have already >> been unlinked from the pages. > > And here is the problem, since the genalloc is being used we now care > that a page should not continue to be accessed by userspace after it > has be placed back into the genalloc. I suppose fsdax has the same > basic issue too. Ok, similar question. Then if we call synchronize_rcu() in vm_close(), before the put_page() calls which return the pages to the genalloc, would that not guarantee the TLBs have been appropriately flushed? >> Not to say that all this couldn't use a big conceptual cleanup. A >> similar question exists with the single find_special_page() user >> (xen/gntdev) and it's definitely not clear what the differences are >> between the find_special_page() and vmf_insert_mixed() techniques and >> when one should be used over the other. Or could they both be merged to >> use the same technique? > > Oh that gntdev stuff is just nonsense. IIRC is trying to delegate > control over a PTE entry itself to the hypervisor. > > /* > * gntdev takes the address of the PTE in find_grant_ptes() and > * passes it to the hypervisor in gntdev_map_grant_pages(). The > * purpose of the notifier is to prevent the hypervisor pointer > * to the PTE from going stale. > * > * Since this vma's mappings can't be touched without the > * mmap_lock, and we are holding it now, there is no need for > * the notifier_range locking pattern. > > I vaugely recall it stuffs in a normal page then has the hypervisor > overwrite the PTE. When it comes time to free the PTE it recovers the > normal page via the 'find_special_page' hack and frees it. Somehow the > hypervisor is also using the normal page for something. > > It is all very strange and one shouldn't think about it :| Found this from an old commit message which seems to be a better explanation, though I still don't fully understand it: In a Xen PV guest, the PTEs contain MFNs so get_user_pages() (for example) must do an MFN to PFN (M2P) lookup before it can get the page. For foreign pages (those owned by another guest) the M2P lookup returns the PFN as seen by the foreign guest (which would be completely the wrong page for the local guest). This cannot be fixed up improving the M2P lookup since one MFN may be mapped onto two or more pages so getting the right page is impossible given just the MFN. Yes, all very strange. Logan
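Concretely, the fix Logan is suggesting would sit in the unbind path from the patch. Everything below is the patch's own pci_p2pdma_unmap_mappings() except the second synchronize_rcu(), which is the proposed addition, not patch code.

```c
static void pci_p2pdma_unmap_mappings(void *data)
{
	struct pci_dev *pdev = data;
	struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);

	p2pdma->active = false;
	synchronize_rcu();
	unmap_mapping_range(p2pdma->inode->i_mapping, 0, 0, 1);

	/*
	 * Proposed: wait out a grace period after the zap so that any
	 * gup-fast walker that raced with the PTE clear has finished
	 * before the pages are put back.
	 */
	synchronize_rcu();

	pci_p2pdma_free_mappings(p2pdma->inode->i_mapping);
}
```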
On Fri, Oct 01, 2021 at 02:13:14PM -0600, Logan Gunthorpe wrote: > > > On 2021-10-01 11:45 a.m., Jason Gunthorpe wrote: > >> Before the invalidation, an active flag is cleared to ensure no new > >> mappings can be created while the unmap is proceeding. > >> unmap_mapping_range() should sequence itself with the TLB flush and > > > > AFIAK unmap_mapping_range() kicks off the TLB flush and then > > returns. It doesn't always wait for the flush to fully finish. Ie some > > cases use RCU to lock the page table against GUP fast and so the > > put_page() doesn't happen until the call_rcu completes - after a grace > > period. The unmap_mapping_range() does not wait for grace periods. > > Admittedly, the tlb flush code isn't the easiest code to understand. > But, yes it seems at least on some arches the pages are freed by > call_rcu(). But can't this be fixed easily by adding a synchronize_rcu() > call after calling unmap_mapping_range()? Certainly after a > synchronize_rcu(), the TLB has been flushed and it is safe to free those > pages. It would close this issue, however synchronize_rcu() is very slow (think > 1second) in some cases and thus cannot be inserted here. I'm also not completely sure that rcu is the only case, I don't know how every arch handles its gather structure.. I have a feeling the general intention was for this to be asynchronous My preferences are to either remove devmap from gup_fast, or fix it to not use special pages - the latter being obviously better. Jason
On 2021-10-01 4:14 p.m., Jason Gunthorpe wrote: > On Fri, Oct 01, 2021 at 02:13:14PM -0600, Logan Gunthorpe wrote: >> >> >> On 2021-10-01 11:45 a.m., Jason Gunthorpe wrote: >>>> Before the invalidation, an active flag is cleared to ensure no new >>>> mappings can be created while the unmap is proceeding. >>>> unmap_mapping_range() should sequence itself with the TLB flush and >>> >>> AFIAK unmap_mapping_range() kicks off the TLB flush and then >>> returns. It doesn't always wait for the flush to fully finish. Ie some >>> cases use RCU to lock the page table against GUP fast and so the >>> put_page() doesn't happen until the call_rcu completes - after a grace >>> period. The unmap_mapping_range() does not wait for grace periods. >> >> Admittedly, the tlb flush code isn't the easiest code to understand. >> But, yes it seems at least on some arches the pages are freed by >> call_rcu(). But can't this be fixed easily by adding a synchronize_rcu() >> call after calling unmap_mapping_range()? Certainly after a >> synchronize_rcu(), the TLB has been flushed and it is safe to free those >> pages. > > It would close this issue, however synchronize_rcu() is very slow > (think > 1second) in some cases and thus cannot be inserted here. It shouldn't be *that* slow, at least not the vast majority of the time... it seems a bit unreasonable that a CPU wouldn't schedule for more than a second. But these aren't fast paths and synchronize_rcu() already gets called in the unbind path for p2pdma a of couple times. I'm sure it would also be fine to slow down the vma_close() path as well. > I'm also not completely sure that rcu is the only case, I don't know > how every arch handles its gather structure.. I have a feeling the > general intention was for this to be asynchronous Yeah, this is not clear to me either. > My preferences are to either remove devmap from gup_fast, or fix it to > not use special pages - the latter being obviously better. Yeah, I rather expect DAX users want the optimization provided by gup_fast. I don't think P2PDMA users would be happy about being stuck with slow gup either. Logan
On Fri, Oct 01, 2021 at 04:22:28PM -0600, Logan Gunthorpe wrote: > > It would close this issue, however synchronize_rcu() is very slow > > (think > 1second) in some cases and thus cannot be inserted here. > > It shouldn't be *that* slow, at least not the vast majority of the > time... it seems a bit unreasonable that a CPU wouldn't schedule for > more than a second. I've seen bug reports on exactly this, it is well known. Loaded big multi-cpu systems have high delays here, for whatever reason. > But these aren't fast paths and synchronize_rcu() already gets > called in the unbind path for p2pdma a of couple times. I'm sure it > would also be fine to slow down the vma_close() path as well. vma_close is done in a loop destroying vma's and if each synchronize costs > 1s it can take forever to close a process. We had to kill a similar use of synchronize_rcu in RDMA because users were complaining of > 40s process exit times. The driver unload path is fine to be slow, and is probably done on an unloaded system where synchronize_rcu is not so bad Anyway, it is not really something for this series to fix, just something we should all be aware of and probably ought to get fixed before we do much more with ZONE_DEVICE pages Jason
On 10/1/21 15:46, Jason Gunthorpe wrote: > On Fri, Oct 01, 2021 at 04:22:28PM -0600, Logan Gunthorpe wrote: > >>> It would close this issue, however synchronize_rcu() is very slow >>> (think > 1second) in some cases and thus cannot be inserted here. >> >> It shouldn't be *that* slow, at least not the vast majority of the >> time... it seems a bit unreasonable that a CPU wouldn't schedule for >> more than a second. > > I've seen bug reports on exactly this, it is well known. Loaded > big multi-cpu systems have high delays here, for whatever reason. > So have I. One reason is that synchronize_rcu() doesn't merely wait for a context switch on each CPU--it also waits for callbacks (such as those set up by call_rcu(), if I understand correctly) to run. These can really add up to something quite substantial. In fact, I don't think there is an upper limit on the running times, anywhere. thanks,
On 2021-10-01 4:46 p.m., Jason Gunthorpe wrote: > On Fri, Oct 01, 2021 at 04:22:28PM -0600, Logan Gunthorpe wrote: > >>> It would close this issue, however synchronize_rcu() is very slow >>> (think > 1second) in some cases and thus cannot be inserted here. >> >> It shouldn't be *that* slow, at least not the vast majority of the >> time... it seems a bit unreasonable that a CPU wouldn't schedule for >> more than a second. > > I've seen bug reports on exactly this, it is well known. Loaded > big multi-cpu systems have high delays here, for whatever reason. > >> But these aren't fast paths and synchronize_rcu() already gets >> called in the unbind path for p2pdma a of couple times. I'm sure it >> would also be fine to slow down the vma_close() path as well. > > vma_close is done in a loop destroying vma's and if each synchronize > costs > 1s it can take forever to close a process. We had to kill a > similar use of synchronize_rcu in RDMA because users were complaining > of > 40s process exit times. Ah, fair. This adds a bit of complexity, but we could do a call_rcu() in vma_close to do the page frees. Logan
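A sketch of that call_rcu() idea, layered on the patch's pci_p2pdma_map: the final kref put defers the page frees past an RCU grace period instead of doing them synchronously in the VMA close path. The rcu_head member is an assumed addition, and a real version would likely have to bounce the sleepable teardown (iput() and the fs release) to a workqueue, since RCU callbacks run in softirq context.

```c
/* Assumes struct pci_p2pdma_map grows a struct rcu_head member: */
struct pci_p2pdma_map {
	struct kref ref;
	struct rcu_head rcu;		/* assumed new member */
	struct pci_dev *pdev;
	struct inode *inode;
	void *kaddr;
	size_t len;
};

static void pci_p2pdma_map_rcu_free(struct rcu_head *rcu)
{
	struct pci_p2pdma_map *pmap =
		container_of(rcu, struct pci_p2pdma_map, rcu);

	/*
	 * Drop the page references; the genalloc gets the memory back
	 * through the pgmap page_free hook once the refs are gone.
	 */
	pci_p2pdma_map_free_pages(pmap);

	/* Sleepable teardown; would likely need a workqueue in practice */
	pci_dev_put(pmap->pdev);
	iput(pmap->inode);
	simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt);
	kfree(pmap);
}

static void pci_p2pdma_map_free(struct kref *ref)
{
	struct pci_p2pdma_map *pmap =
		container_of(ref, struct pci_p2pdma_map, ref);

	call_rcu(&pmap->rcu, pci_p2pdma_map_rcu_free);
}
```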
I'm not following this discussion to closely, but try to look into it from time to time. Am 01.10.21 um 19:45 schrieb Jason Gunthorpe: > On Fri, Oct 01, 2021 at 11:01:49AM -0600, Logan Gunthorpe wrote: > >> In device-dax, the refcount is only used to prevent the device, and >> therefore the pages, from going away on device unbind. Pages cannot be >> recycled, as you say, as they are mapped linearly within the device. The >> address space invalidation is done only when the device is unbound. > By address space invalidation I mean invalidation of the VMA that is > pointing to those pages. > > device-dax may not have a issue with use-after-VMA-invalidation by > it's very nature since every PFN always points to the same > thing. fsdax and this p2p stuff are different though. > >> Before the invalidation, an active flag is cleared to ensure no new >> mappings can be created while the unmap is proceeding. >> unmap_mapping_range() should sequence itself with the TLB flush and > AFIAK unmap_mapping_range() kicks off the TLB flush and then > returns. It doesn't always wait for the flush to fully finish. Ie some > cases use RCU to lock the page table against GUP fast and so the > put_page() doesn't happen until the call_rcu completes - after a grace > period. The unmap_mapping_range() does not wait for grace periods. Wow, wait a second. That is quite a boomer. At least in all GEM/TTM based graphics drivers that could potentially cause a lot of trouble. I've just double checked and we certainly have the assumption that when unmap_mapping_range() returns the pte is gone and the TLB flush completed in quite a number of places. Do you have more information when and why that can happen? Thanks, Christian.
On Mon, Oct 04, 2021 at 08:58:35AM +0200, Christian König wrote: > I'm not following this discussion to closely, but try to look into it from > time to time. > > Am 01.10.21 um 19:45 schrieb Jason Gunthorpe: > > On Fri, Oct 01, 2021 at 11:01:49AM -0600, Logan Gunthorpe wrote: > > > > > In device-dax, the refcount is only used to prevent the device, and > > > therefore the pages, from going away on device unbind. Pages cannot be > > > recycled, as you say, as they are mapped linearly within the device. The > > > address space invalidation is done only when the device is unbound. > > By address space invalidation I mean invalidation of the VMA that is > > pointing to those pages. > > > > device-dax may not have a issue with use-after-VMA-invalidation by > > it's very nature since every PFN always points to the same > > thing. fsdax and this p2p stuff are different though. > > > > > Before the invalidation, an active flag is cleared to ensure no new > > > mappings can be created while the unmap is proceeding. > > > unmap_mapping_range() should sequence itself with the TLB flush and > > AFIAK unmap_mapping_range() kicks off the TLB flush and then > > returns. It doesn't always wait for the flush to fully finish. Ie some > > cases use RCU to lock the page table against GUP fast and so the > > put_page() doesn't happen until the call_rcu completes - after a grace > > period. The unmap_mapping_range() does not wait for grace periods. > > Wow, wait a second. That is quite a boomer. At least in all GEM/TTM based > graphics drivers that could potentially cause a lot of trouble. > > I've just double checked and we certainly have the assumption that when > unmap_mapping_range() returns the pte is gone and the TLB flush completed in > quite a number of places. > > Do you have more information when and why that can happen? There are two things to keep in mind, flushing the PTEs from the HW and serializing against gup_fast. If you start at unmap_mapping_range() the page is eventually discovered in zap_pte_range() and the PTE cleared. It is then passed into __tlb_remove_page() which puts it on the batch->pages list The page free happens in tlb_batch_pages_flush() via free_pages_and_swap_cache() The tlb_batch_pages_flush() happens via zap_page_range() -> tlb_finish_mmu(), presumably after the HW has wiped the TLB's on all CPUs. On x86 this is done with an IPI and also serializes gup fast, so OK The interesting case is CONFIG_MMU_GATHER_RCU_TABLE_FREE which doesn't rely on IPIs anymore to synchronize with gup-fast. In this configuration it means when unmap_mapping_range() returns the TLB will have been flushed, but no serialization with GUP fast was done. This is OK if the GUP fast cannot return the page at all. I assume this generally describes the DRM caes? However, if the GUP fast can return the page then something, somewhere, needs to serialize the page free with the RCU as the GUP fast can be observing the old PTE before it was zap'd until the RCU grace expires. Relying on the page ref being !0 to protect GUP fast is not safe because the page ref can be incr'd immediately upon page re-use. Interestingly I looked around for this on PPC and I only found RCU delayed freeing of the page table level, not RCU delayed freeing of pages themselves.. I wonder if it was missed? There is a path on PPC (tlb_remove_table_sync_one) that triggers an IPI but it looks like an exception, and we wouldn't need the RCU at all if we used IPI to serialize GUP fast... 
It makes logical sense if the RCU also frees the pages on CONFIG_MMU_GATHER_RCU_TABLE_FREE so anything returnable by GUP fast must be refcounted and freed by tlb_batch_pages_flush(), not by the caller of unmap_mapping_range(). If we expect to allow the caller of unmap_mapping_range() to free then CONFIG_MMU_GATHER_RCU_TABLE_FREE can't really exist, we always need to trigger a serializing IPI during tlb_batch_pages_flush() AFAICT, at least Jason
Am 04.10.21 um 15:11 schrieb Jason Gunthorpe: > On Mon, Oct 04, 2021 at 08:58:35AM +0200, Christian König wrote: >> I'm not following this discussion to closely, but try to look into it from >> time to time. >> >> Am 01.10.21 um 19:45 schrieb Jason Gunthorpe: >>> On Fri, Oct 01, 2021 at 11:01:49AM -0600, Logan Gunthorpe wrote: >>> >>>> In device-dax, the refcount is only used to prevent the device, and >>>> therefore the pages, from going away on device unbind. Pages cannot be >>>> recycled, as you say, as they are mapped linearly within the device. The >>>> address space invalidation is done only when the device is unbound. >>> By address space invalidation I mean invalidation of the VMA that is >>> pointing to those pages. >>> >>> device-dax may not have a issue with use-after-VMA-invalidation by >>> it's very nature since every PFN always points to the same >>> thing. fsdax and this p2p stuff are different though. >>> >>>> Before the invalidation, an active flag is cleared to ensure no new >>>> mappings can be created while the unmap is proceeding. >>>> unmap_mapping_range() should sequence itself with the TLB flush and >>> AFIAK unmap_mapping_range() kicks off the TLB flush and then >>> returns. It doesn't always wait for the flush to fully finish. Ie some >>> cases use RCU to lock the page table against GUP fast and so the >>> put_page() doesn't happen until the call_rcu completes - after a grace >>> period. The unmap_mapping_range() does not wait for grace periods. >> Wow, wait a second. That is quite a boomer. At least in all GEM/TTM based >> graphics drivers that could potentially cause a lot of trouble. >> >> I've just double checked and we certainly have the assumption that when >> unmap_mapping_range() returns the pte is gone and the TLB flush completed in >> quite a number of places. >> >> Do you have more information when and why that can happen? > There are two things to keep in mind, flushing the PTEs from the HW > and serializing against gup_fast. > > If you start at unmap_mapping_range() the page is eventually > discovered in zap_pte_range() and the PTE cleared. It is then passed > into __tlb_remove_page() which puts it on the batch->pages list > > The page free happens in tlb_batch_pages_flush() via > free_pages_and_swap_cache() > > The tlb_batch_pages_flush() happens via zap_page_range() -> > tlb_finish_mmu(), presumably after the HW has wiped the TLB's on all > CPUs. On x86 this is done with an IPI and also serializes gup fast, so > OK > > The interesting case is CONFIG_MMU_GATHER_RCU_TABLE_FREE which doesn't > rely on IPIs anymore to synchronize with gup-fast. > > In this configuration it means when unmap_mapping_range() returns the > TLB will have been flushed, but no serialization with GUP fast was > done. > > This is OK if the GUP fast cannot return the page at all. I assume > this generally describes the DRM caes? Yes, exactly that. GUP is completely forbidden for such mappings. But what about accesses by other CPUs? In other words our use case is like the following: 1. We found that we need exclusive access to the higher level object a page belongs to. 2. The lock of the higher level object is taken. The lock is also taken in the fault handler for the VMA which inserts the PTE in the first place. 3. unmap_mapping_range() for the range of the object is called, the expectation is that when that function returns only the kernel can have a mapping of the pages backing the object. 4. 
The kernel has exclusive access to the pages and we know that userspace can't mess with them any more. That use case is completely unrelated to GUP and when this doesn't work we have quite a problem. I should probably note that we recently switched from VM_MIXEDMAP to using VM_PFNMAP because the former didn't prevented GUP on all architectures. Christian. > However, if the GUP fast can return the page then something, > somewhere, needs to serialize the page free with the RCU as the GUP > fast can be observing the old PTE before it was zap'd until the RCU > grace expires. > > Relying on the page ref being !0 to protect GUP fast is not safe > because the page ref can be incr'd immediately upon page re-use. > > Interestingly I looked around for this on PPC and I only found RCU > delayed freeing of the page table level, not RCU delayed freeing of > pages themselves.. I wonder if it was missed? > > There is a path on PPC (tlb_remove_table_sync_one) that triggers an > IPI but it looks like an exception, and we wouldn't need the RCU at > all if we used IPI to serialize GUP fast... > > It makes logical sense if the RCU also frees the pages on > CONFIG_MMU_GATHER_RCU_TABLE_FREE so anything returnable by GUP fast > must be refcounted and freed by tlb_batch_pages_flush(), not by the > caller of unmap_mapping_range(). > > If we expect to allow the caller of unmap_mapping_range() to free then > CONFIG_MMU_GATHER_RCU_TABLE_FREE can't really exist, we always need to > trigger a serializing IPI during tlb_batch_pages_flush() > > AFAICT, at least > > Jason
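For readers outside DRM, the GEM/TTM pattern Christian is describing looks roughly like the sketch below; the object type, lock and field names are illustrative, not actual DRM code. The whole scheme rests on the assumption being questioned above: that when unmap_mapping_range() returns, no CPU can still reach the pages through stale user PTEs or TLB entries.

```c
#include <linux/mm.h>
#include <linux/mutex.h>

struct evictable_object {
	struct mutex lock;		/* also taken by the CPU fault handler */
	struct address_space *mapping;	/* mapping the VMAs are linked on */
	loff_t mmap_offset;		/* offset of this object in the mapping */
	size_t size;
};

static void evict_object_user_mappings(struct evictable_object *obj)
{
	/* Step 2: take the object lock so no new faults can map it */
	mutex_lock(&obj->lock);

	/* Step 3: zap every userspace PTE covering the object */
	unmap_mapping_range(obj->mapping, obj->mmap_offset, obj->size, 1);

	/*
	 * Step 4: the driver now assumes only the kernel can touch the
	 * backing pages, i.e. the zap and TLB flush are fully done.
	 */
	mutex_unlock(&obj->lock);
}
```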
On Mon, Oct 04, 2021 at 03:22:22PM +0200, Christian König wrote:
> That use case is completely unrelated to GUP and when this doesn't work we
> have quite a problem.

My read is that unmap_mapping_range() guarantees the physical TLB hardware is serialized across all CPUs upon return.

It also guarantees GUP slow is serialized due to the page table spinlocks.

Jason
On 04.10.21 at 15:27, Jason Gunthorpe wrote:
> On Mon, Oct 04, 2021 at 03:22:22PM +0200, Christian König wrote:
>
>> That use case is completely unrelated to GUP and when this doesn't work we
>> have quite a problem.
> My read is that unmap_mapping_range() guarantees the physical TLB
> hardware is serialized across all CPUs upon return.

Thanks, that's what I wanted to make sure.

Christian.

>
> It also guarantees GUP slow is serialized due to the page table
> spinlocks.
>
> Jason
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 2422af5a529c..a5adf57af53a 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -16,14 +16,19 @@ #include <linux/genalloc.h> #include <linux/memremap.h> #include <linux/percpu-refcount.h> +#include <linux/pfn_t.h> +#include <linux/pseudo_fs.h> #include <linux/random.h> #include <linux/seq_buf.h> #include <linux/xarray.h> +#include <uapi/linux/magic.h> struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; struct xarray map_types; + struct inode *inode; + bool active; }; struct pci_p2pdma_pagemap { @@ -32,6 +37,14 @@ struct pci_p2pdma_pagemap { u64 bus_offset; }; +struct pci_p2pdma_map { + struct kref ref; + struct pci_dev *pdev; + struct inode *inode; + void *kaddr; + size_t len; +}; + static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) { return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap); @@ -100,6 +113,26 @@ static const struct attribute_group p2pmem_group = { .name = "p2pmem", }; +/* + * P2PDMA internal mount + * Fake an internal VFS mount-point in order to allocate struct address_space + * mappings to remove VMAs on unbind events. + */ +static int pci_p2pdma_fs_cnt; +static struct vfsmount *pci_p2pdma_fs_mnt; + +static int pci_p2pdma_fs_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, P2PDMA_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type pci_p2pdma_fs_type = { + .name = "p2dma", + .owner = THIS_MODULE, + .init_fs_context = pci_p2pdma_fs_init_fs_context, + .kill_sb = kill_anon_super, +}; + static void p2pdma_page_free(struct page *page) { struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap); @@ -128,6 +161,9 @@ static void pci_p2pdma_release(void *data) gen_pool_destroy(p2pdma->pool); sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); xa_destroy(&p2pdma->map_types); + + iput(p2pdma->inode); + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt); } static int pci_p2pdma_setup(struct pci_dev *pdev) @@ -145,17 +181,32 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) if (!p2p->pool) goto out; - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); + error = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt, + &pci_p2pdma_fs_cnt); if (error) goto out_pool_destroy; + p2p->inode = alloc_anon_inode(pci_p2pdma_fs_mnt->mnt_sb); + if (IS_ERR(p2p->inode)) { + error = -ENOMEM; + goto out_unpin_fs; + } + + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); + if (error) + goto out_put_inode; + error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); if (error) - goto out_pool_destroy; + goto out_put_inode; rcu_assign_pointer(pdev->p2pdma, p2p); return 0; +out_put_inode: + iput(p2p->inode); +out_unpin_fs: + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt); out_pool_destroy: gen_pool_destroy(p2p->pool); out: @@ -163,6 +214,45 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) return error; } +static void pci_p2pdma_map_free_pages(struct pci_p2pdma_map *pmap) +{ + int i; + + if (!pmap->kaddr) + return; + + for (i = 0; i < pmap->len; i += PAGE_SIZE) + put_page(virt_to_page(pmap->kaddr + i)); + + pmap->kaddr = NULL; +} + +static void pci_p2pdma_free_mappings(struct address_space *mapping) +{ + struct vm_area_struct *vma; + + i_mmap_lock_write(mapping); + if (RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) + goto out; + + vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, -1) + pci_p2pdma_map_free_pages(vma->vm_private_data); + +out: + i_mmap_unlock_write(mapping); +} + +static void 
pci_p2pdma_unmap_mappings(void *data) +{ + struct pci_dev *pdev = data; + struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); + + p2pdma->active = false; + synchronize_rcu(); + unmap_mapping_range(p2pdma->inode->i_mapping, 0, 0, 1); + pci_p2pdma_free_mappings(p2pdma->inode->i_mapping); +} + /** * pci_p2pdma_add_resource - add memory for use as p2p memory * @pdev: the device to add the memory to @@ -221,6 +311,11 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, goto pgmap_free; } + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, + pdev); + if (error) + goto pages_free; + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr, pci_bus_address(pdev, bar) + offset, @@ -229,6 +324,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, if (error) goto pages_free; + p2pdma->active = true; pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n", pgmap->range.start, pgmap->range.end); @@ -1029,3 +1125,166 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, return sprintf(page, "%s\n", pci_name(p2p_dev)); } EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show); + +static struct pci_p2pdma_map *pci_p2pdma_map_alloc(struct pci_dev *pdev, + size_t len) +{ + struct pci_p2pdma_map *pmap; + + pmap = kzalloc(sizeof(*pmap), GFP_KERNEL); + if (!pmap) + return NULL; + + kref_init(&pmap->ref); + pmap->pdev = pci_dev_get(pdev); + pmap->len = len; + + return pmap; +} + +static void pci_p2pdma_map_free(struct kref *ref) +{ + struct pci_p2pdma_map *pmap = + container_of(ref, struct pci_p2pdma_map, ref); + + pci_p2pdma_map_free_pages(pmap); + pci_dev_put(pmap->pdev); + iput(pmap->inode); + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt); + kfree(pmap); +} + +static void pci_p2pdma_vma_open(struct vm_area_struct *vma) +{ + struct pci_p2pdma_map *pmap = vma->vm_private_data; + + kref_get(&pmap->ref); +} + +static void pci_p2pdma_vma_close(struct vm_area_struct *vma) +{ + struct pci_p2pdma_map *pmap = vma->vm_private_data; + + kref_put(&pmap->ref, pci_p2pdma_map_free); +} + +static vm_fault_t pci_p2pdma_vma_fault(struct vm_fault *vmf) +{ + struct pci_p2pdma_map *pmap = vmf->vma->vm_private_data; + struct pci_p2pdma *p2pdma; + void *vaddr; + pfn_t pfn; + int i; + + if (!pmap->kaddr) { + rcu_read_lock(); + p2pdma = rcu_dereference(pmap->pdev->p2pdma); + if (!p2pdma) + goto err_out; + + if (!p2pdma->active) + goto err_out; + + pmap->kaddr = (void *)gen_pool_alloc(p2pdma->pool, pmap->len); + if (!pmap->kaddr) + goto err_out; + + for (i = 0; i < pmap->len; i += PAGE_SIZE) + get_page(virt_to_page(pmap->kaddr + i)); + + rcu_read_unlock(); + } + + vaddr = pmap->kaddr + (vmf->pgoff << PAGE_SHIFT); + pfn = phys_to_pfn_t(virt_to_phys(vaddr), PFN_DEV | PFN_MAP); + + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + +err_out: + rcu_read_unlock(); + return VM_FAULT_SIGBUS; +} +static const struct vm_operations_struct pci_p2pdma_vmops = { + .open = pci_p2pdma_vma_open, + .close = pci_p2pdma_vma_close, + .fault = pci_p2pdma_vma_fault, +}; + +/** + * pci_p2pdma_mmap_file_open - setup file mapping to store P2PMEM VMAs + * @pdev: the device to allocate memory from + * @vma: the userspace vma to map the memory to + * + * Set f_mapping of the file to the p2pdma inode so that mappings + * are can be torn down on device unbind. 
+ * + * Returns 0 on success, or a negative error code on failure + */ +void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, struct file *file) +{ + struct pci_p2pdma *p2pdma; + + rcu_read_lock(); + p2pdma = rcu_dereference(pdev->p2pdma); + if (p2pdma) + file->f_mapping = p2pdma->inode->i_mapping; + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(pci_p2pdma_mmap_file_open); + +/** + * pci_mmap_p2pmem - setup an mmap region to be backed with P2PDMA memory + * that was registered with pci_p2pdma_add_resource() + * @pdev: the device to allocate memory from + * @vma: the userspace vma to map the memory to + * + * The file must call pci_p2pdma_mmap_file_open() in its open() operation. + * + * Returns 0 on success, or a negative error code on failure + */ +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma) +{ + struct pci_p2pdma_map *pmap; + struct pci_p2pdma *p2pdma; + int ret; + + /* prevent private mappings from being established */ + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { + pci_info_ratelimited(pdev, + "%s: fail, attempted private mapping\n", + current->comm); + return -EINVAL; + } + + pmap = pci_p2pdma_map_alloc(pdev, vma->vm_end - vma->vm_start); + if (!pmap) + return -ENOMEM; + + rcu_read_lock(); + p2pdma = rcu_dereference(pdev->p2pdma); + if (!p2pdma) { + ret = -ENODEV; + goto out; + } + + ret = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt, + &pci_p2pdma_fs_cnt); + if (ret) + goto out; + + ihold(p2pdma->inode); + pmap->inode = p2pdma->inode; + rcu_read_unlock(); + + vma->vm_flags |= VM_MIXEDMAP; + vma->vm_private_data = pmap; + vma->vm_ops = &pci_p2pdma_vmops; + + return 0; + +out: + rcu_read_unlock(); + kfree(pmap); + return ret; +} +EXPORT_SYMBOL_GPL(pci_mmap_p2pmem); diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 0c33a40a86e7..f9f19f3db676 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -81,6 +81,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); +void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, struct file *file); +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma); #else /* CONFIG_PCI_P2PDMA */ static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) @@ -152,6 +154,15 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, { return sprintf(page, "none\n"); } +static inline void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, + struct file *file) +{ +} +static inline int pci_mmap_p2pmem(struct pci_dev *pdev, + struct vm_area_struct *vma) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_PCI_P2PDMA */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 35687dcb1a42..af737842c56f 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -88,6 +88,7 @@ #define BPF_FS_MAGIC 0xcafe4a11 #define AAFS_MAGIC 0x5a3c69f0 #define ZONEFS_MAGIC 0x5a4f4653 +#define P2PDMA_MAGIC 0x70327064 /* Since UDF 2.01 is ISO 13346 based... */ #define UDF_SUPER_MAGIC 0x15013346
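As a usage illustration (not part of the patch): a driver exposing this memory through a character device might wire the two new helpers up roughly as follows. The my_p2pmem_* names and the miscdevice plumbing are invented; only pci_p2pdma_mmap_file_open() and pci_mmap_p2pmem() come from the patch above, and the driver is assumed to have registered the BAR with pci_p2pdma_add_resource() at probe time.

/* Hypothetical consumer of the new mmap helpers. */
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/pci-p2pdma.h>

static struct pci_dev *my_pdev;	/* set by the driver at probe time */

static int my_p2pmem_open(struct inode *inode, struct file *file)
{
	/*
	 * Point f_mapping at the p2pdma inode so the PCI core can tear
	 * down all VMAs if the device is unbound.
	 */
	pci_p2pdma_mmap_file_open(my_pdev, file);
	return 0;
}

static int my_p2pmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * Back the (shared) mapping with P2PDMA memory; private mappings
	 * are rejected by pci_mmap_p2pmem().
	 */
	return pci_mmap_p2pmem(my_pdev, vma);
}

static const struct file_operations my_p2pmem_fops = {
	.owner = THIS_MODULE,
	.open = my_p2pmem_open,
	.mmap = my_p2pmem_mmap,
};

static struct miscdevice my_p2pmem_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "my_p2pmem",
	.fops = &my_p2pmem_fops,
};
module_misc_device(my_p2pmem_dev);
MODULE_LICENSE("GPL");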
Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap a hunk of p2pmem into userspace.

Pages are allocated from the genalloc in bulk and their reference count incremented. They are returned to the genalloc when the page is put.

The VMA does not take a reference to the pages when they are inserted with vmf_insert_mixed() (which is necessary for zone device pages), so the backing P2P memory is stored in a structure in vm_private_data.

A pseudo mount is used to allocate an inode for each PCI device. The inode's address_space is used in the file doing the mmap so that all VMAs are collected and can be unmapped if the PCI device is unbound. After unmapping, the VMAs are iterated through and their pages are put so the device can continue to be unbound. An active flag is used to signal to VMAs not to allocate any further P2P memory once the removal process starts. The flag is synchronized with concurrent access with an RCU lock.

The VMAs and inode will survive after the unbind of the device, but no pages will be present in the VMA and a subsequent access will result in a SIGBUS error.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/pci/p2pdma.c       | 263 ++++++++++++++++++++++++++++++++++++-
 include/linux/pci-p2pdma.h |  11 ++
 include/uapi/linux/magic.h |   1 +
 3 files changed, 273 insertions(+), 2 deletions(-)
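For context on how the series as a whole is meant to be used, a userspace program might map the P2P memory and hand it to an O_DIRECT read along the lines of the sketch below. The /dev/my_p2pmem node is made up; the character device that actually exposes pci_mmap_p2pmem(), and the block-layer support for such buffers, come from other patches in the series, not from this one.

/* Hypothetical userspace view: map P2P memory, use it as an O_DIRECT buffer. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	int p2p_fd, data_fd;
	void *buf;

	p2p_fd = open("/dev/my_p2pmem", O_RDWR);	/* hypothetical node */
	if (p2p_fd < 0)
		return 1;

	/* Must be MAP_SHARED: pci_mmap_p2pmem() rejects private mappings. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, p2p_fd, 0);
	if (buf == MAP_FAILED)
		return 1;

	data_fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);
	if (data_fd < 0)
		return 1;

	/* The O_DIRECT read targets the PCI BAR memory backing the mapping. */
	if (read(data_fd, buf, len) < 0)
		perror("read");

	munmap(buf, len);
	close(data_fd);
	close(p2p_fd);
	return 0;
}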