@@ -17,14 +17,19 @@
#include <linux/genalloc.h>
#include <linux/memremap.h>
#include <linux/percpu-refcount.h>
+#include <linux/pfn_t.h>
+#include <linux/pseudo_fs.h>
#include <linux/random.h>
#include <linux/seq_buf.h>
#include <linux/xarray.h>
+#include <uapi/linux/magic.h>
struct pci_p2pdma {
struct gen_pool *pool;
bool p2pmem_published;
struct xarray map_types;
+ struct inode *inode;
+ bool active;
};
struct pci_p2pdma_pagemap {
@@ -33,6 +38,15 @@ struct pci_p2pdma_pagemap {
u64 bus_offset;
};
+struct pci_p2pdma_map {
+ struct kref ref;
+ struct rcu_head rcu;
+ struct pci_dev *pdev;
+ struct inode *inode;
+ void *kaddr;
+ size_t len;
+};
+
static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
{
return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap);
@@ -101,6 +115,26 @@ static const struct attribute_group p2pmem_group = {
.name = "p2pmem",
};
+/*
+ * P2PDMA internal mount
+ * Fake an internal VFS mount-point in order to allocate struct address_space
+ * mappings to remove VMAs on unbind events.
+ */
+static int pci_p2pdma_fs_cnt;
+static struct vfsmount *pci_p2pdma_fs_mnt;
+
+static int pci_p2pdma_fs_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, P2PDMA_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type pci_p2pdma_fs_type = {
+ .name = "p2dma",
+ .owner = THIS_MODULE,
+ .init_fs_context = pci_p2pdma_fs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
static void p2pdma_page_free(struct page *page)
{
struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
@@ -129,6 +163,9 @@ static void pci_p2pdma_release(void *data)
gen_pool_destroy(p2pdma->pool);
sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
xa_destroy(&p2pdma->map_types);
+
+ iput(p2pdma->inode);
+ simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt);
}
static int pci_p2pdma_setup(struct pci_dev *pdev)
@@ -146,17 +183,32 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
if (!p2p->pool)
goto out;
- error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
+ error = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt,
+ &pci_p2pdma_fs_cnt);
if (error)
goto out_pool_destroy;
+ p2p->inode = alloc_anon_inode(pci_p2pdma_fs_mnt->mnt_sb);
+ if (IS_ERR(p2p->inode)) {
+ error = -ENOMEM;
+ goto out_unpin_fs;
+ }
+
+ error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
+ if (error)
+ goto out_put_inode;
+
error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
if (error)
- goto out_pool_destroy;
+ goto out_put_inode;
rcu_assign_pointer(pdev->p2pdma, p2p);
return 0;
+out_put_inode:
+ iput(p2p->inode);
+out_unpin_fs:
+ simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt);
out_pool_destroy:
gen_pool_destroy(p2p->pool);
out:
@@ -164,6 +216,54 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
return error;
}
+static void pci_p2pdma_map_free_pages(struct pci_p2pdma_map *pmap)
+{
+ int i;
+
+ if (!pmap->kaddr)
+ return;
+
+ for (i = 0; i < pmap->len; i += PAGE_SIZE)
+ put_page(virt_to_page(pmap->kaddr + i));
+
+ pmap->kaddr = NULL;
+}
+
+static void pci_p2pdma_free_mappings(struct address_space *mapping)
+{
+ struct vm_area_struct *vma;
+
+ i_mmap_lock_write(mapping);
+ if (RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
+ goto out;
+
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, -1)
+ pci_p2pdma_map_free_pages(vma->vm_private_data);
+
+out:
+ i_mmap_unlock_write(mapping);
+}
+
+static void pci_p2pdma_unmap_mappings(void *data)
+{
+ struct pci_dev *pdev = data;
+ struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
+
+ /* Ensure no new pages can be allocated in mappings */
+ p2pdma->active = false;
+ synchronize_rcu();
+
+ unmap_mapping_range(p2pdma->inode->i_mapping, 0, 0, 1);
+
+ /*
+ * On some architectures, TLB flushes are done with call_rcu()
+ * so to ensure GUP fast is done with the pages, call synchronize_rcu()
+ * before freeing them.
+ */
+ synchronize_rcu();
+ pci_p2pdma_free_mappings(p2pdma->inode->i_mapping);
+}
+
/**
* pci_p2pdma_add_resource - add memory for use as p2p memory
* @pdev: the device to add the memory to
@@ -222,6 +322,11 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
goto pgmap_free;
}
+ error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
+ pdev);
+ if (error)
+ goto pages_free;
+
p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
pci_bus_address(pdev, bar) + offset,
@@ -230,6 +335,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
if (error)
goto pages_free;
+ p2pdma->active = true;
pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n",
pgmap->range.start, pgmap->range.end);
@@ -1030,3 +1136,194 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
return sprintf(page, "%s\n", pci_name(p2p_dev));
}
EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
+
+static struct pci_p2pdma_map *pci_p2pdma_map_alloc(struct pci_dev *pdev,
+ size_t len)
+{
+ struct pci_p2pdma_map *pmap;
+
+ pmap = kzalloc(sizeof(*pmap), GFP_KERNEL);
+ if (!pmap)
+ return NULL;
+
+ kref_init(&pmap->ref);
+ pmap->pdev = pci_dev_get(pdev);
+ pmap->len = len;
+
+ return pmap;
+}
+
+static void pci_p2pdma_map_free(struct rcu_head *rcu)
+{
+ struct pci_p2pdma_map *pmap =
+ container_of(rcu, struct pci_p2pdma_map, rcu);
+
+ pci_p2pdma_map_free_pages(pmap);
+ kfree(pmap);
+}
+
+static void pci_p2pdma_map_release(struct kref *ref)
+{
+ struct pci_p2pdma_map *pmap =
+ container_of(ref, struct pci_p2pdma_map, ref);
+
+ iput(pmap->inode);
+ simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt);
+ pci_dev_put(pmap->pdev);
+
+ if (pmap->kaddr) {
+ /*
+ * Make sure to wait for the TLB flush (which some
+ * architectures do using call_rcu()) before returning the
+ * pages to the genalloc. This ensures the pages are not reused
+ * before GUP-fast is finished with them. So the mapping is
+ * freed using call_rcu() seeing adding synchronize_rcu() to
+ * the munmap path can cause long delays on large systems
+ * during process cleanup.
+ */
+ call_rcu(&pmap->rcu, pci_p2pdma_map_free);
+ return;
+ }
+
+ /*
+ * If there are no pages, just free the object immediately. There
+ * are no more references to it so there is nothing that can race
+ * with adding the pages.
+ */
+ pci_p2pdma_map_free(&pmap->rcu);
+}
+
+static void pci_p2pdma_vma_open(struct vm_area_struct *vma)
+{
+ struct pci_p2pdma_map *pmap = vma->vm_private_data;
+
+ kref_get(&pmap->ref);
+}
+
+static void pci_p2pdma_vma_close(struct vm_area_struct *vma)
+{
+ struct pci_p2pdma_map *pmap = vma->vm_private_data;
+
+ kref_put(&pmap->ref, pci_p2pdma_map_release);
+}
+
+static vm_fault_t pci_p2pdma_vma_fault(struct vm_fault *vmf)
+{
+ struct pci_p2pdma_map *pmap = vmf->vma->vm_private_data;
+ struct pci_p2pdma *p2pdma;
+ void *vaddr;
+ pfn_t pfn;
+ int i;
+
+ if (!pmap->kaddr) {
+ rcu_read_lock();
+ p2pdma = rcu_dereference(pmap->pdev->p2pdma);
+ if (!p2pdma)
+ goto err_out;
+
+ if (!p2pdma->active)
+ goto err_out;
+
+ pmap->kaddr = (void *)gen_pool_alloc(p2pdma->pool, pmap->len);
+ if (!pmap->kaddr)
+ goto err_out;
+
+ for (i = 0; i < pmap->len; i += PAGE_SIZE)
+ get_page(virt_to_page(pmap->kaddr + i));
+
+ rcu_read_unlock();
+ }
+
+ vaddr = pmap->kaddr + (vmf->pgoff << PAGE_SHIFT);
+ pfn = phys_to_pfn_t(virt_to_phys(vaddr), PFN_DEV | PFN_MAP);
+
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+
+err_out:
+ rcu_read_unlock();
+ return VM_FAULT_SIGBUS;
+}
+static const struct vm_operations_struct pci_p2pdma_vmops = {
+ .open = pci_p2pdma_vma_open,
+ .close = pci_p2pdma_vma_close,
+ .fault = pci_p2pdma_vma_fault,
+};
+
+/**
+ * pci_p2pdma_file_open - setup file mapping to store P2PMEM VMAs
+ * @pdev: the device to allocate memory from
+ * @vma: the userspace vma to map the memory to
+ *
+ * Set f_mapping of the file to the p2pdma inode so that mappings
+ * are can be torn down on device unbind.
+ *
+ * Returns 0 on success, or a negative error code on failure
+ */
+void pci_p2pdma_file_open(struct pci_dev *pdev, struct file *file)
+{
+ struct pci_p2pdma *p2pdma;
+
+ rcu_read_lock();
+ p2pdma = rcu_dereference(pdev->p2pdma);
+ if (p2pdma)
+ file->f_mapping = p2pdma->inode->i_mapping;
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(pci_p2pdma_file_open);
+
+/**
+ * pci_mmap_p2pmem - setup an mmap region to be backed with P2PDMA memory
+ * that was registered with pci_p2pdma_add_resource()
+ * @pdev: the device to allocate memory from
+ * @vma: the userspace vma to map the memory to
+ *
+ * The file must call pci_p2pdma_mmap_file_open() in its open() operation.
+ *
+ * Returns 0 on success, or a negative error code on failure
+ */
+int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma)
+{
+ struct pci_p2pdma_map *pmap;
+ struct pci_p2pdma *p2pdma;
+ int ret;
+
+ /* prevent private mappings from being established */
+ if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+ pci_info_ratelimited(pdev,
+ "%s: fail, attempted private mapping\n",
+ current->comm);
+ return -EINVAL;
+ }
+
+ pmap = pci_p2pdma_map_alloc(pdev, vma->vm_end - vma->vm_start);
+ if (!pmap)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ p2pdma = rcu_dereference(pdev->p2pdma);
+ if (!p2pdma) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ ret = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt,
+ &pci_p2pdma_fs_cnt);
+ if (ret)
+ goto out;
+
+ ihold(p2pdma->inode);
+ pmap->inode = p2pdma->inode;
+ rcu_read_unlock();
+
+ vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_private_data = pmap;
+ vma->vm_ops = &pci_p2pdma_vmops;
+
+ return 0;
+
+out:
+ rcu_read_unlock();
+ kfree(pmap);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pci_mmap_p2pmem);
@@ -34,6 +34,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
bool *use_p2pdma);
ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
bool use_p2pdma);
+void pci_p2pdma_file_open(struct pci_dev *pdev, struct file *file);
+int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma);
#else /* CONFIG_PCI_P2PDMA */
static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
size_t size, u64 offset)
@@ -90,6 +92,15 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
{
return sprintf(page, "none\n");
}
+static inline void pci_p2pdma_file_open(struct pci_dev *pdev,
+ struct file *file)
+{
+}
+static inline int pci_mmap_p2pmem(struct pci_dev *pdev,
+ struct vm_area_struct *vma)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_PCI_P2PDMA */
@@ -94,6 +94,7 @@
#define BPF_FS_MAGIC 0xcafe4a11
#define AAFS_MAGIC 0x5a3c69f0
#define ZONEFS_MAGIC 0x5a4f4653
+#define P2PDMA_MAGIC 0x70327064
/* Since UDF 2.01 is ISO 13346 based... */
#define UDF_SUPER_MAGIC 0x15013346