diff mbox series

[v9,7/8] PCI/P2PDMA: Allow userspace VMA allocations through sysfs

Message ID 20220825152425.6296-8-logang@deltatee.com (mailing list archive)
State Superseded
Headers show
Series Userspace P2PDMA with O_DIRECT NVMe devices | expand

Commit Message

Logan Gunthorpe Aug. 25, 2022, 3:24 p.m. UTC
Create a sysfs bin attribute called "allocate" under the existing
"p2pmem" group. The only allowable operation on this file is the mmap()
call.

When mmap() is called on this attribute, the kernel allocates a chunk of
memory from the genalloc and inserts the pages into the VMA. The
dev_pagemap .page_free callback will indicate when these pages are no
longer used and they will be put back into the genalloc.

On device unbind, remove the sysfs file before the memremap_pages are
cleaned up. This ensures unmap_mapping_range() is called on the file's
inode and no new mappings can be created.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/pci/p2pdma.c | 124 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)

Comments

Greg Kroah-Hartman Sept. 1, 2022, 4:20 p.m. UTC | #1
On Thu, Aug 25, 2022 at 09:24:24AM -0600, Logan Gunthorpe wrote:
> Create a sysfs bin attribute called "allocate" under the existing
> "p2pmem" group. The only allowable operation on this file is the mmap()
> call.
> 
> When mmap() is called on this attribute, the kernel allocates a chunk of
> memory from the genalloc and inserts the pages into the VMA. The
> dev_pagemap .page_free callback will indicate when these pages are no
> longer used and they will be put back into the genalloc.
> 
> On device unbind, remove the sysfs file before the memremap_pages are
> cleaned up. This ensures unmap_mapping_range() is called on the files
> inode and no new mappings can be created.
> 
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> ---
>  drivers/pci/p2pdma.c | 124 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 124 insertions(+)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 4496a7c5c478..a6ed6bbca214 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -89,6 +89,90 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
>  }
>  static DEVICE_ATTR_RO(published);
>  
> +static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
> +		struct bin_attribute *attr, struct vm_area_struct *vma)
> +{
> +	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
> +	size_t len = vma->vm_end - vma->vm_start;
> +	struct pci_p2pdma *p2pdma;
> +	struct percpu_ref *ref;
> +	unsigned long vaddr;
> +	void *kaddr;
> +	int ret;
> +
> +	/* prevent private mappings from being established */
> +	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
> +		pci_info_ratelimited(pdev,
> +				     "%s: fail, attempted private mapping\n",
> +				     current->comm);
> +		return -EINVAL;
> +	}
> +
> +	if (vma->vm_pgoff) {
> +		pci_info_ratelimited(pdev,
> +				     "%s: fail, attempted mapping with non-zero offset\n",
> +				     current->comm);
> +		return -EINVAL;
> +	}
> +
> +	rcu_read_lock();
> +	p2pdma = rcu_dereference(pdev->p2pdma);
> +	if (!p2pdma) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
> +	if (!kaddr) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	/*
> +	 * vm_insert_page() can sleep, so a reference is taken to mapping
> +	 * such that rcu_read_unlock() can be done before inserting the
> +	 * pages
> +	 */
> +	if (unlikely(!percpu_ref_tryget_live_rcu(ref))) {
> +		ret = -ENODEV;
> +		goto out_free_mem;
> +	}
> +	rcu_read_unlock();
> +
> +	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
> +		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
> +		if (ret) {
> +			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
> +			return ret;
> +		}
> +		percpu_ref_get(ref);
> +		put_page(virt_to_page(kaddr));
> +		kaddr += PAGE_SIZE;
> +		len -= PAGE_SIZE;
> +	}
> +
> +	percpu_ref_put(ref);
> +
> +	return 0;
> +out_free_mem:
> +	gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
> +out:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
> +static struct bin_attribute p2pmem_alloc_attr = {
> +	.attr = { .name = "allocate", .mode = 0660 },
> +	.mmap = p2pmem_alloc_mmap,
> +	/*
> +	 * Some places where we want to call mmap (ie. python) will check
> +	 * that the file size is greater than the mmap size before allowing
> +	 * the mmap to continue. To work around this, just set the size
> +	 * to be very large.
> +	 */
> +	.size = SZ_1T,
> +};
> +
>  static struct attribute *p2pmem_attrs[] = {
>  	&dev_attr_size.attr,
>  	&dev_attr_available.attr,
> @@ -96,11 +180,32 @@ static struct attribute *p2pmem_attrs[] = {
>  	NULL,
>  };
>  
> +static struct bin_attribute *p2pmem_bin_attrs[] = {
> +	&p2pmem_alloc_attr,
> +	NULL,
> +};
> +
>  static const struct attribute_group p2pmem_group = {
>  	.attrs = p2pmem_attrs,
> +	.bin_attrs = p2pmem_bin_attrs,
>  	.name = "p2pmem",
>  };
>  
> +static void p2pdma_page_free(struct page *page)
> +{
> +	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
> +	struct percpu_ref *ref;
> +
> +	gen_pool_free_owner(pgmap->provider->p2pdma->pool,
> +			    (uintptr_t)page_to_virt(page), PAGE_SIZE,
> +			    (void **)&ref);
> +	percpu_ref_put(ref);
> +}
> +
> +static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
> +	.page_free = p2pdma_page_free,
> +};
> +
>  static void pci_p2pdma_release(void *data)
>  {
>  	struct pci_dev *pdev = data;
> @@ -152,6 +257,19 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
>  	return error;
>  }
>  
> +static void pci_p2pdma_unmap_mappings(void *data)
> +{
> +	struct pci_dev *pdev = data;
> +
> +	/*
> +	 * Removing the alloc attribute from sysfs will call
> +	 * unmap_mapping_range() on the inode, teardown any existing userspace
> +	 * mappings and prevent new ones from being created.
> +	 */
> +	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
> +				     p2pmem_group.name);

Wait, why are you manually removing the sysfs file here?  It's part of
the group, if you do this then it is gone forever, right?  Why do this
manually?  The sysfs core should handle this for you if the device is
removed.

And worst case, just pass in the device, not the pci device.

thanks,

greg k-h
Logan Gunthorpe Sept. 1, 2022, 4:32 p.m. UTC | #2
On 2022-09-01 10:20, Greg Kroah-Hartman wrote:
> On Thu, Aug 25, 2022 at 09:24:24AM -0600, Logan Gunthorpe wrote:
>> Create a sysfs bin attribute called "allocate" under the existing
>> "p2pmem" group. The only allowable operation on this file is the mmap()
>> call.
>>
>> When mmap() is called on this attribute, the kernel allocates a chunk of
>> memory from the genalloc and inserts the pages into the VMA. The
>> dev_pagemap .page_free callback will indicate when these pages are no
>> longer used and they will be put back into the genalloc.
>>
>> On device unbind, remove the sysfs file before the memremap_pages are
>> cleaned up. This ensures unmap_mapping_range() is called on the files
>> inode and no new mappings can be created.
>>
>> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
>> ---
>>  drivers/pci/p2pdma.c | 124 +++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 124 insertions(+)
>>
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index 4496a7c5c478..a6ed6bbca214 100644
>> --- a/drivers/pci/p2pdma.c
>> +++ b/drivers/pci/p2pdma.c
>> @@ -89,6 +89,90 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
>>  }
>>  static DEVICE_ATTR_RO(published);
>>  
>> +static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
>> +		struct bin_attribute *attr, struct vm_area_struct *vma)
>> +{
>> +	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
>> +	size_t len = vma->vm_end - vma->vm_start;
>> +	struct pci_p2pdma *p2pdma;
>> +	struct percpu_ref *ref;
>> +	unsigned long vaddr;
>> +	void *kaddr;
>> +	int ret;
>> +
>> +	/* prevent private mappings from being established */
>> +	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
>> +		pci_info_ratelimited(pdev,
>> +				     "%s: fail, attempted private mapping\n",
>> +				     current->comm);
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (vma->vm_pgoff) {
>> +		pci_info_ratelimited(pdev,
>> +				     "%s: fail, attempted mapping with non-zero offset\n",
>> +				     current->comm);
>> +		return -EINVAL;
>> +	}
>> +
>> +	rcu_read_lock();
>> +	p2pdma = rcu_dereference(pdev->p2pdma);
>> +	if (!p2pdma) {
>> +		ret = -ENODEV;
>> +		goto out;
>> +	}
>> +
>> +	kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
>> +	if (!kaddr) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * vm_insert_page() can sleep, so a reference is taken to mapping
>> +	 * such that rcu_read_unlock() can be done before inserting the
>> +	 * pages
>> +	 */
>> +	if (unlikely(!percpu_ref_tryget_live_rcu(ref))) {
>> +		ret = -ENODEV;
>> +		goto out_free_mem;
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
>> +		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
>> +		if (ret) {
>> +			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
>> +			return ret;
>> +		}
>> +		percpu_ref_get(ref);
>> +		put_page(virt_to_page(kaddr));
>> +		kaddr += PAGE_SIZE;
>> +		len -= PAGE_SIZE;
>> +	}
>> +
>> +	percpu_ref_put(ref);
>> +
>> +	return 0;
>> +out_free_mem:
>> +	gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
>> +out:
>> +	rcu_read_unlock();
>> +	return ret;
>> +}
>> +
>> +static struct bin_attribute p2pmem_alloc_attr = {
>> +	.attr = { .name = "allocate", .mode = 0660 },
>> +	.mmap = p2pmem_alloc_mmap,
>> +	/*
>> +	 * Some places where we want to call mmap (ie. python) will check
>> +	 * that the file size is greater than the mmap size before allowing
>> +	 * the mmap to continue. To work around this, just set the size
>> +	 * to be very large.
>> +	 */
>> +	.size = SZ_1T,
>> +};
>> +
>>  static struct attribute *p2pmem_attrs[] = {
>>  	&dev_attr_size.attr,
>>  	&dev_attr_available.attr,
>> @@ -96,11 +180,32 @@ static struct attribute *p2pmem_attrs[] = {
>>  	NULL,
>>  };
>>  
>> +static struct bin_attribute *p2pmem_bin_attrs[] = {
>> +	&p2pmem_alloc_attr,
>> +	NULL,
>> +};
>> +
>>  static const struct attribute_group p2pmem_group = {
>>  	.attrs = p2pmem_attrs,
>> +	.bin_attrs = p2pmem_bin_attrs,
>>  	.name = "p2pmem",
>>  };
>>  
>> +static void p2pdma_page_free(struct page *page)
>> +{
>> +	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
>> +	struct percpu_ref *ref;
>> +
>> +	gen_pool_free_owner(pgmap->provider->p2pdma->pool,
>> +			    (uintptr_t)page_to_virt(page), PAGE_SIZE,
>> +			    (void **)&ref);
>> +	percpu_ref_put(ref);
>> +}
>> +
>> +static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
>> +	.page_free = p2pdma_page_free,
>> +};
>> +
>>  static void pci_p2pdma_release(void *data)
>>  {
>>  	struct pci_dev *pdev = data;
>> @@ -152,6 +257,19 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
>>  	return error;
>>  }
>>  
>> +static void pci_p2pdma_unmap_mappings(void *data)
>> +{
>> +	struct pci_dev *pdev = data;
>> +
>> +	/*
>> +	 * Removing the alloc attribute from sysfs will call
>> +	 * unmap_mapping_range() on the inode, teardown any existing userspace
>> +	 * mappings and prevent new ones from being created.
>> +	 */
>> +	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
>> +				     p2pmem_group.name);
> 
> Wait, why are you manually removing the sysfs file here?  It's part of
> the group, if you do this then it is gone for forever, right?  Why
> manually do this the sysfs core should handle this for you if the device
> is removed.

We have to make sure the mappings are all removed before the cleanup of
devm_memremap_pages() which will wait for all the pages to be freed. If
we don't do this any userspace mapping will hang the cleanup until those
uses are unmapped themselves.

> And worst case, just pass in the device, not the pci device.

Ok, I'll make that change for v10.

Logan
Greg Kroah-Hartman Sept. 1, 2022, 4:42 p.m. UTC | #3
On Thu, Sep 01, 2022 at 10:32:55AM -0600, Logan Gunthorpe wrote:
> 
> 
> 
> On 2022-09-01 10:20, Greg Kroah-Hartman wrote:
> > On Thu, Aug 25, 2022 at 09:24:24AM -0600, Logan Gunthorpe wrote:
> >> Create a sysfs bin attribute called "allocate" under the existing
> >> "p2pmem" group. The only allowable operation on this file is the mmap()
> >> call.
> >>
> >> When mmap() is called on this attribute, the kernel allocates a chunk of
> >> memory from the genalloc and inserts the pages into the VMA. The
> >> dev_pagemap .page_free callback will indicate when these pages are no
> >> longer used and they will be put back into the genalloc.
> >>
> >> On device unbind, remove the sysfs file before the memremap_pages are
> >> cleaned up. This ensures unmap_mapping_range() is called on the files
> >> inode and no new mappings can be created.
> >>
> >> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> >> ---
> >>  drivers/pci/p2pdma.c | 124 +++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 124 insertions(+)
> >>
> >> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> >> index 4496a7c5c478..a6ed6bbca214 100644
> >> --- a/drivers/pci/p2pdma.c
> >> +++ b/drivers/pci/p2pdma.c
> >> @@ -89,6 +89,90 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
> >>  }
> >>  static DEVICE_ATTR_RO(published);
> >>  
> >> +static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
> >> +		struct bin_attribute *attr, struct vm_area_struct *vma)
> >> +{
> >> +	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
> >> +	size_t len = vma->vm_end - vma->vm_start;
> >> +	struct pci_p2pdma *p2pdma;
> >> +	struct percpu_ref *ref;
> >> +	unsigned long vaddr;
> >> +	void *kaddr;
> >> +	int ret;
> >> +
> >> +	/* prevent private mappings from being established */
> >> +	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
> >> +		pci_info_ratelimited(pdev,
> >> +				     "%s: fail, attempted private mapping\n",
> >> +				     current->comm);
> >> +		return -EINVAL;
> >> +	}
> >> +
> >> +	if (vma->vm_pgoff) {
> >> +		pci_info_ratelimited(pdev,
> >> +				     "%s: fail, attempted mapping with non-zero offset\n",
> >> +				     current->comm);
> >> +		return -EINVAL;
> >> +	}
> >> +
> >> +	rcu_read_lock();
> >> +	p2pdma = rcu_dereference(pdev->p2pdma);
> >> +	if (!p2pdma) {
> >> +		ret = -ENODEV;
> >> +		goto out;
> >> +	}
> >> +
> >> +	kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
> >> +	if (!kaddr) {
> >> +		ret = -ENOMEM;
> >> +		goto out;
> >> +	}
> >> +
> >> +	/*
> >> +	 * vm_insert_page() can sleep, so a reference is taken to mapping
> >> +	 * such that rcu_read_unlock() can be done before inserting the
> >> +	 * pages
> >> +	 */
> >> +	if (unlikely(!percpu_ref_tryget_live_rcu(ref))) {
> >> +		ret = -ENODEV;
> >> +		goto out_free_mem;
> >> +	}
> >> +	rcu_read_unlock();
> >> +
> >> +	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
> >> +		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
> >> +		if (ret) {
> >> +			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
> >> +			return ret;
> >> +		}
> >> +		percpu_ref_get(ref);
> >> +		put_page(virt_to_page(kaddr));
> >> +		kaddr += PAGE_SIZE;
> >> +		len -= PAGE_SIZE;
> >> +	}
> >> +
> >> +	percpu_ref_put(ref);
> >> +
> >> +	return 0;
> >> +out_free_mem:
> >> +	gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
> >> +out:
> >> +	rcu_read_unlock();
> >> +	return ret;
> >> +}
> >> +
> >> +static struct bin_attribute p2pmem_alloc_attr = {
> >> +	.attr = { .name = "allocate", .mode = 0660 },
> >> +	.mmap = p2pmem_alloc_mmap,
> >> +	/*
> >> +	 * Some places where we want to call mmap (ie. python) will check
> >> +	 * that the file size is greater than the mmap size before allowing
> >> +	 * the mmap to continue. To work around this, just set the size
> >> +	 * to be very large.
> >> +	 */
> >> +	.size = SZ_1T,
> >> +};
> >> +
> >>  static struct attribute *p2pmem_attrs[] = {
> >>  	&dev_attr_size.attr,
> >>  	&dev_attr_available.attr,
> >> @@ -96,11 +180,32 @@ static struct attribute *p2pmem_attrs[] = {
> >>  	NULL,
> >>  };
> >>  
> >> +static struct bin_attribute *p2pmem_bin_attrs[] = {
> >> +	&p2pmem_alloc_attr,
> >> +	NULL,
> >> +};
> >> +
> >>  static const struct attribute_group p2pmem_group = {
> >>  	.attrs = p2pmem_attrs,
> >> +	.bin_attrs = p2pmem_bin_attrs,
> >>  	.name = "p2pmem",
> >>  };
> >>  
> >> +static void p2pdma_page_free(struct page *page)
> >> +{
> >> +	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
> >> +	struct percpu_ref *ref;
> >> +
> >> +	gen_pool_free_owner(pgmap->provider->p2pdma->pool,
> >> +			    (uintptr_t)page_to_virt(page), PAGE_SIZE,
> >> +			    (void **)&ref);
> >> +	percpu_ref_put(ref);
> >> +}
> >> +
> >> +static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
> >> +	.page_free = p2pdma_page_free,
> >> +};
> >> +
> >>  static void pci_p2pdma_release(void *data)
> >>  {
> >>  	struct pci_dev *pdev = data;
> >> @@ -152,6 +257,19 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
> >>  	return error;
> >>  }
> >>  
> >> +static void pci_p2pdma_unmap_mappings(void *data)
> >> +{
> >> +	struct pci_dev *pdev = data;
> >> +
> >> +	/*
> >> +	 * Removing the alloc attribute from sysfs will call
> >> +	 * unmap_mapping_range() on the inode, teardown any existing userspace
> >> +	 * mappings and prevent new ones from being created.
> >> +	 */
> >> +	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
> >> +				     p2pmem_group.name);
> > 
> > Wait, why are you manually removing the sysfs file here?  It's part of
> > the group, if you do this then it is gone for forever, right?  Why
> > manually do this the sysfs core should handle this for you if the device
> > is removed.
> 
> We have to make sure the mappings are all removed before the cleanup of
> devm_memremap_pages() which will wait for all the pages to be freed.

Then don't use devm_ functions.  Why not just use the manual functions
instead as you know when you want to tear this down.

> If
> we don't do this any userspace mapping will hang the cleanup until those
> uses are unmapped themselves.

Just do this in the remove call yourself and you should be fine.

thanks,

greg k-h
Logan Gunthorpe Sept. 1, 2022, 6:14 p.m. UTC | #4
On 2022-09-01 10:42, Greg Kroah-Hartman wrote:
> On Thu, Sep 01, 2022 at 10:32:55AM -0600, Logan Gunthorpe wrote:
>> On 2022-09-01 10:20, Greg Kroah-Hartman wrote:
>>> On Thu, Aug 25, 2022 at 09:24:24AM -0600, Logan Gunthorpe wrote:
>>>> +	/*
>>>> +	 * Removing the alloc attribute from sysfs will call
>>>> +	 * unmap_mapping_range() on the inode, teardown any existing userspace
>>>> +	 * mappings and prevent new ones from being created.
>>>> +	 */
>>>> +	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
>>>> +				     p2pmem_group.name);
>>>
>>> Wait, why are you manually removing the sysfs file here?  It's part of
>>> the group, if you do this then it is gone for forever, right?  Why
>>> manually do this the sysfs core should handle this for you if the device
>>> is removed.
>>
>> We have to make sure the mappings are all removed before the cleanup of
>> devm_memremap_pages() which will wait for all the pages to be freed.
> 
> Then don't use devm_ functions.  Why not just use the manual functions
> instead as you know when you want to tear this down.

Well we haven't plugged in a remove call into p2pdma, that would be more
work and more interfaces touching the PCI code. Note: this code isn't a
driver but a set of PCI helpers available to other PCI drivers.
Everything that's setup is using the devm interfaces and gets torn down
with the same. So I don't really see the benefit of making the change
you propose.

Logan
Greg Kroah-Hartman Sept. 1, 2022, 6:36 p.m. UTC | #5
On Thu, Sep 01, 2022 at 12:14:25PM -0600, Logan Gunthorpe wrote:
> 
> 
> 
> On 2022-09-01 10:42, Greg Kroah-Hartman wrote:
> > On Thu, Sep 01, 2022 at 10:32:55AM -0600, Logan Gunthorpe wrote:
> >> On 2022-09-01 10:20, Greg Kroah-Hartman wrote:
> >>> On Thu, Aug 25, 2022 at 09:24:24AM -0600, Logan Gunthorpe wrote:
> >>>> +	/*
> >>>> +	 * Removing the alloc attribute from sysfs will call
> >>>> +	 * unmap_mapping_range() on the inode, teardown any existing userspace
> >>>> +	 * mappings and prevent new ones from being created.
> >>>> +	 */
> >>>> +	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
> >>>> +				     p2pmem_group.name);
> >>>
> >>> Wait, why are you manually removing the sysfs file here?  It's part of
> >>> the group, if you do this then it is gone for forever, right?  Why
> >>> manually do this the sysfs core should handle this for you if the device
> >>> is removed.
> >>
> >> We have to make sure the mappings are all removed before the cleanup of
> >> devm_memremap_pages() which will wait for all the pages to be freed.
> > 
> > Then don't use devm_ functions.  Why not just use the manual functions
> > instead as you know when you want to tear this down.
> 
> Well we haven't plugged in a remove call into p2pdma, that would be more
> work and more interfaces touching the PCI code. Note: this code isn't a
> driver but a set of PCI helpers available to other PCI drivers.
> Everything that's setup is using the devm interfaces and gets torn down
> with the same. So I don't really see the benefit of making the change
> you propose.

The issue is the classic one with the devm helpers.  They do not lend
themselves to resource management issues that require ordering or other
sort of dependencies.  Please do not use them here, just put in a remove
callback as you eventually will need it anyway, as you have a strong
requirement for what gets freed when, and the devm api does not provide
for that well.

thanks,

greg k-h
Logan Gunthorpe Sept. 1, 2022, 7:16 p.m. UTC | #6
On 2022-09-01 12:36, Greg Kroah-Hartman wrote:
> On Thu, Sep 01, 2022 at 12:14:25PM -0600, Logan Gunthorpe wrote:
>> Well we haven't plugged in a remove call into p2pdma, that would be more
>> work and more interfaces touching the PCI code. Note: this code isn't a
>> driver but a set of PCI helpers available to other PCI drivers.
>> Everything that's setup is using the devm interfaces and gets torn down
>> with the same. So I don't really see the benefit of making the change
>> you propose.
> 
> The issue is the classic one with the devm helpers.  They do not lend
> themselves to resource management issues that require ordering or other
> sort of dependencies.  Please do not use them here, just put in a remove
> callback as you eventually will need it anyway, as you have a strong
> requirement for what gets freed when, and the devm api does not provide
> for that well.

This surprises me. Can you elaborate on this classic issue?

I've definitely seen uses of devm that expect the calls will be torn
down in reverse order they are added. The existing p2pdma code will
certainly fail quite significantly if a devm_kzalloc() releases its
memory before the devm_memremap_pages() cleans up. There's also already an
action that is used to clean up before the last devm_kzalloc() call
happens. If ordering is not guaranteed, then devm seems fairly broken
and unusable and I'd have to drop all uses from this code and go back to
the error prone method. Also what's the point of
devm_add_action_or_reset() if it doesn't guarantee the ordering or the
release?

But if it's that important I can make the change to these patches for v10.

Logan
Greg Kroah-Hartman Sept. 2, 2022, 5:53 a.m. UTC | #7
On Thu, Sep 01, 2022 at 01:16:54PM -0600, Logan Gunthorpe wrote:
> 
> 
> On 2022-09-01 12:36, Greg Kroah-Hartman wrote:
> > On Thu, Sep 01, 2022 at 12:14:25PM -0600, Logan Gunthorpe wrote:
> >> Well we haven't plugged in a remove call into p2pdma, that would be more
> >> work and more interfaces touching the PCI code. Note: this code isn't a
> >> driver but a set of PCI helpers available to other PCI drivers.
> >> Everything that's setup is using the devm interfaces and gets torn down
> >> with the same. So I don't really see the benefit of making the change
> >> you propose.
> > 
> > The issue is the classic one with the devm helpers.  They do not lend
> > themselves to resource management issues that require ordering or other
> > sort of dependencies.  Please do not use them here, just put in a remove
> > callback as you eventually will need it anyway, as you have a strong
> > requirement for what gets freed when, and the devm api does not provide
> > for that well.
> 
> This surprises me. Can you elaborate on this classic issue?

There's long threads about it on the ksummit discuss mailing list and
other places.

> I've definitely seen uses of devm that expect the calls will be torn
> down in reverse order they are added.

Sorry, I didn't mean to imply the ordering of the devm code is
incorrect, that's fine.

It's when you have things in the devm "chain" that need to be freed in a
different order that stuff gets messy.  Like irqs and clocks and other
types of resources that have "actions" associated with them.

> The existing p2pdma code will
> certainly fail quite significantly if a devm_kzalloc() releases its
> memory before the devm_memmap_pages() cleans up. There's also already an
> action that is used to cleanup before the last devm_kzalloc() call
> happens. If ordering is not guaranteed, then devm seems fairly broken
> and unusable and I'd have to drop all uses from this code and go back to
> the error prone method. Also what's the point of
> devm_add_action_or_reset() if it doesn't guarantee the ordering or the
> release?

I have never used devm_add_action_or_reset() so I can't say why it is
there.  I am just pointing out that manually messing with a sysfs group
from a driver is a huge flag that something is wrong.  A driver should
almost never be touching a raw kobject or calling any sysfs_* call if
all is normal, which is why I questioned this.

> But if it's that important I can make the change to these patches for v10.

Try it the way I suggest, with a remove() callback, and see if that
looks simpler and easier to follow and maintain over time.

thanks,

greg k-h
Logan Gunthorpe Sept. 2, 2022, 6:46 p.m. UTC | #8
On 2022-09-01 23:53, Greg Kroah-Hartman wrote:
> On Thu, Sep 01, 2022 at 01:16:54PM -0600, Logan Gunthorpe wrote:
>> This surprises me. Can you elaborate on this classic issue?
> 
> There's long threads about it on the ksummit discuss mailing list and
> other places.

I've managed to find one such thread dealing with lifetime issues of
different objects and the bugs that commonly result from misusing devm.
I've dealt with similar issues in the past, but as best as I can see
there are no lifetime issues in this code.

> I have never used devm_add_action_or_reset() so I can't say why it is
> there.  I am just pointing out that manually messing with a sysfs group
> from a driver is a huge flag that something is wrong.  A driver should
> almost never be touching a raw kobject or calling any sysfs_* call if
> all is normal, which is why I questioned this.

In this case we need to remove the specific sysfs file to tear down any
vmas earlier in the remove sequence than it would be done normally. Whether
we do that through devm or remove() doesn't change the fact that we need
to access the dev->kobj to do that early.

>> But if it's that important I can make the change to these patches for v10.
> 
> Try it the way I suggest, with a remove() callback, and see if that
> looks simpler and easier to follow and maintain over time.

See the diff at the bottom of this email. I can apply it on top of this
patch, but IMO it is neither easier to follow nor maintain. Unless you 
have a different suggestion...

Thanks,

Logan

--

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index a6ed6bbca214..4e1211a2a6cd 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -206,6 +206,23 @@ static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
 	.page_free = p2pdma_page_free,
 };
 
+void pci_p2pdma_remove(struct pci_dev *pdev)
+{
+	if (!rcu_access_pointer(pdev->p2pdma))
+		return;
+
+	/*
+	 * Any userspace mappings must be unmapped before the
+	 * devm_memremap_pages() release happens, otherwise a device remove
+	 * will hang on any processes that have pages mapped. To avoid this,
+	 * remove the alloc attribute from sysfs which will call
+	 * unmap_mapping_range() on the inode and teardown any existing
+	 * userspace mappings.
+	 */
+	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
+				     p2pmem_group.name);
+}
+
 static void pci_p2pdma_release(void *data)
 {
 	struct pci_dev *pdev = data;
@@ -257,19 +274,6 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
 	return error;
 }
 
-static void pci_p2pdma_unmap_mappings(void *data)
-{
-	struct pci_dev *pdev = data;
-
-	/*
-	 * Removing the alloc attribute from sysfs will call
-	 * unmap_mapping_range() on the inode, teardown any existing userspace
-	 * mappings and prevent new ones from being created.
-	 */
-	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
-				     p2pmem_group.name);
-}
-
 /**
  * pci_p2pdma_add_resource - add memory for use as p2p memory
  * @pdev: the device to add the memory to
@@ -328,11 +332,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 		goto pgmap_free;
 	}
 
-	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
-					 pdev);
-	if (error)
-		goto pages_free;
-
 	p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
 	error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
 			pci_bus_address(pdev, bar) + offset,
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 49238ddd39ee..a096f2723eac 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -471,6 +471,8 @@ static void pci_device_remove(struct device *dev)
 	struct pci_dev *pci_dev = to_pci_dev(dev);
 	struct pci_driver *drv = pci_dev->driver;
 
+	pci_p2pdma_remove(pci_dev);
+
 	if (drv->remove) {
 		pm_runtime_get_sync(dev);
 		drv->remove(pci_dev);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 785f31086313..1c5c901a2fcc 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -774,4 +774,12 @@ static inline pci_power_t mid_pci_get_power_state(struct pci_dev *pdev)
 }
 #endif
 
+#ifdef CONFIG_PCI_P2PDMA
+void pci_p2pdma_remove(struct pci_dev *dev);
+#else
+static inline void pci_p2pdma_remove(struct pci_dev *dev)
+{
+}
+#endif
+
 #endif /* DRIVERS_PCI_H */
Christoph Hellwig Sept. 20, 2022, 6:46 a.m. UTC | #9
On Fri, Sep 02, 2022 at 12:46:54PM -0600, Logan Gunthorpe wrote:
> See the diff at the bottom of this email. I can apply it on top of this
> patch, but IMO it is neither easier to follow nor maintain. Unless you 
> have a different suggestion...

Greg, can you chime in on this?  Besides this item we just have a few
cosmetic bits left I think, and I'd really like to get the series into
this merge window.
Greg Kroah-Hartman Sept. 22, 2022, 8:38 a.m. UTC | #10
On Tue, Sep 20, 2022 at 08:46:13AM +0200, Christoph Hellwig wrote:
> On Fri, Sep 02, 2022 at 12:46:54PM -0600, Logan Gunthorpe wrote:
> > See the diff at the bottom of this email. I can apply it on top of this
> > patch, but IMO it is neither easier to follow nor maintain. Unless you 
> > have a different suggestion...
> 
> Greg, can you chime in on this?  Besides this item we just have a few
> cosmetic bits left I think, and I'd really like to get the series into
> this merge window.
> 

I don't seem to have this in my inbox at all anymore, sorry.

The original should be fine, Logan, thanks for trying to split it out a
bit more.  So this can be taken as-is for 6.1-rc1.

thanks,

greg k-h
Logan Gunthorpe Sept. 22, 2022, 2:58 p.m. UTC | #11
On 2022-09-22 02:38, Greg Kroah-Hartman wrote:
> On Tue, Sep 20, 2022 at 08:46:13AM +0200, Christoph Hellwig wrote:
>> On Fri, Sep 02, 2022 at 12:46:54PM -0600, Logan Gunthorpe wrote:
>>> See the diff at the bottom of this email. I can apply it on top of this
>>> patch, but IMO it is neither easier to follow nor maintain. Unless you 
>>> have a different suggestion...
>>
>> Greg, can you chime in on this?  Besides this item we just have a few
>> cosmetic bits left I think, and I'd really like to get the series into
>> this merge window.
>>
> 
> I don't seem to have this in my inbox at all anymore, sorry.
> 
> The original should be fine, Logan, thanks for trying to split it out a
> bit more.  So this can be taken as-is for 6.1-rc1.

Thanks Greg,

I'll send a v10 with changes from the other feedback later today.

Logan
diff mbox series

Patch

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 4496a7c5c478..a6ed6bbca214 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -89,6 +89,90 @@  static ssize_t published_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(published);
 
+/*
+ * mmap() handler for the p2pmem "allocate" sysfs file.
+ *
+ * Allocates a chunk the size of @vma from the provider's genalloc pool and
+ * inserts it into @vma one page at a time.  Only shared mappings at offset
+ * zero are accepted.  A percpu_ref reference is taken for every inserted
+ * page; p2pdma_page_free() drops it when the page is freed.
+ *
+ * Returns 0 on success, -EINVAL for a private or non-zero-offset mapping,
+ * -ENODEV if the device has no p2pdma state or the percpu_ref is no longer
+ * live, -ENOMEM if the pool cannot satisfy the request, or the error
+ * returned by vm_insert_page().
+ */
+static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
+		struct bin_attribute *attr, struct vm_area_struct *vma)
+{
+	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
+	size_t len = vma->vm_end - vma->vm_start;
+	struct pci_p2pdma *p2pdma;
+	struct percpu_ref *ref;
+	unsigned long vaddr;
+	void *kaddr;
+	int ret;
+
+	/* prevent private mappings from being established */
+	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+		pci_info_ratelimited(pdev,
+				     "%s: fail, attempted private mapping\n",
+				     current->comm);
+		return -EINVAL;
+	}
+
+	if (vma->vm_pgoff) {
+		pci_info_ratelimited(pdev,
+				     "%s: fail, attempted mapping with non-zero offset\n",
+				     current->comm);
+		return -EINVAL;
+	}
+
+	rcu_read_lock();
+	p2pdma = rcu_dereference(pdev->p2pdma);
+	if (!p2pdma) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	/* @ref receives the owner recorded for the allocated chunk */
+	kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
+	if (!kaddr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * vm_insert_page() can sleep, so a reference is taken on the
+	 * percpu_ref such that rcu_read_unlock() can be done before
+	 * inserting the pages
+	 */
+	if (unlikely(!percpu_ref_tryget_live_rcu(ref))) {
+		ret = -ENODEV;
+		goto out_free_mem;
+	}
+	rcu_read_unlock();
+
+	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
+		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
+		if (ret) {
+			/*
+			 * Pages inserted so far keep their own references.
+			 * Return the remaining @len bytes to the pool, then
+			 * drop the reference taken above; without this put
+			 * that reference would be leaked.
+			 */
+			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
+			percpu_ref_put(ref);
+			return ret;
+		}
+		/* one reference per inserted page */
+		percpu_ref_get(ref);
+		/*
+		 * NOTE(review): presumably drops the allocation's initial
+		 * page reference now that the mapping holds its own -- confirm
+		 */
+		put_page(virt_to_page(kaddr));
+		kaddr += PAGE_SIZE;
+		len -= PAGE_SIZE;
+	}
+
+	/* drop the temporary reference taken before rcu_read_unlock() */
+	percpu_ref_put(ref);
+
+	return 0;
+out_free_mem:
+	gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+/* The "allocate" binary attribute; mmap is the only operation it provides. */
+static struct bin_attribute p2pmem_alloc_attr = {
+	.attr = {
+		.name = "allocate",
+		.mode = 0660,
+	},
+	/*
+	 * Report a very large size.  Some mmap callers (e.g. Python) check
+	 * that the file size is greater than the requested mapping size
+	 * before they let the mmap continue, so a small value here would
+	 * make them refuse the mapping.
+	 */
+	.size = SZ_1T,
+	.mmap = p2pmem_alloc_mmap,
+};
+
 static struct attribute *p2pmem_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_available.attr,
@@ -96,11 +180,32 @@  static struct attribute *p2pmem_attrs[] = {
 	NULL,
 };
 
+static struct bin_attribute *p2pmem_bin_attrs[] = {
+	&p2pmem_alloc_attr,	/* the mmap-only "allocate" file */
+	NULL,
+};
+
 static const struct attribute_group p2pmem_group = {
 	.attrs = p2pmem_attrs,
+	.bin_attrs = p2pmem_bin_attrs,	/* adds the "allocate" mmap file */
 	.name = "p2pmem",
 };
 
+/*
+ * dev_pagemap .page_free callback: hand a no-longer-used page back to the
+ * provider's genalloc pool and drop the percpu_ref reference that the pool
+ * recorded as the owner of that chunk.
+ */
+static void p2pdma_page_free(struct page *page)
+{
+	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
+	/*
+	 * pdev->p2pdma is an RCU-managed pointer, so read it through an RCU
+	 * accessor rather than a plain load.  NOTE(review): no RCU read lock
+	 * is taken on the assumption that the reference still held for this
+	 * page (dropped below) keeps the p2pdma structure alive -- confirm.
+	 */
+	struct pci_p2pdma *p2pdma =
+		rcu_dereference_protected(pgmap->provider->p2pdma, 1);
+	struct percpu_ref *ref;
+
+	gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page),
+			    PAGE_SIZE, (void **)&ref);
+	percpu_ref_put(ref);
+}
+
+static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
+	.page_free = p2pdma_page_free,	/* hands the page back to the pool */
+};
+
 static void pci_p2pdma_release(void *data)
 {
 	struct pci_dev *pdev = data;
@@ -152,6 +257,19 @@  static int pci_p2pdma_setup(struct pci_dev *pdev)
 	return error;
 }
 
+/* Remove the "allocate" file from the p2pmem sysfs group of device @data. */
+static void pci_p2pdma_unmap_mappings(void *data)
+{
+	struct pci_dev *provider = data;
+	struct kobject *kobj = &provider->dev.kobj;
+
+	/*
+	 * Taking the alloc attribute out of sysfs calls
+	 * unmap_mapping_range() on the inode, which tears down any existing
+	 * userspace mappings and prevents new ones from being created.
+	 */
+	sysfs_remove_file_from_group(kobj, &p2pmem_alloc_attr.attr,
+				     p2pmem_group.name);
+}
+
 /**
  * pci_p2pdma_add_resource - add memory for use as p2p memory
  * @pdev: the device to add the memory to
@@ -198,6 +316,7 @@  int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	pgmap->range.end = pgmap->range.start + size - 1;
 	pgmap->nr_range = 1;
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
+	pgmap->ops = &p2pdma_pgmap_ops;
 
 	p2p_pgmap->provider = pdev;
 	p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) -
@@ -209,6 +328,11 @@  int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 		goto pgmap_free;
 	}
 
+	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
+					 pdev);
+	if (error)
+		goto pages_free;
+
 	p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
 	error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
 			pci_bus_address(pdev, bar) + offset,