
[v6,2/3] arm64: Add IOMMU dma_ops

Message ID 80cb035144a2648a5d94eb1fec3336f17ad249f1.1443718557.git.robin.murphy@arm.com (mailing list archive)
State New, archived

Commit Message

Robin Murphy Oct. 1, 2015, 7:13 p.m. UTC
Taking some inspiration from the arch/arm code, implement the
arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.

Since there is still work to do elsewhere to make DMA configuration happen
in a more appropriate order and properly support platform devices in the
IOMMU core, the device setup code unfortunately starts out carrying some
workarounds to ensure it works correctly in the current state of things.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/arm64/mm/dma-mapping.c | 435 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 435 insertions(+)

Comments

Yong Wu (吴勇) Oct. 6, 2015, 11 a.m. UTC | #1
On Thu, 2015-10-01 at 20:13 +0100, Robin Murphy wrote:
> Taking some inspiration from the arch/arm code, implement the
> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
> 
> Since there is still work to do elsewhere to make DMA configuration happen
> in a more appropriate order and properly support platform devices in the
> IOMMU core, the device setup code unfortunately starts out carrying some
> workarounds to ensure it works correctly in the current state of things.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  arch/arm64/mm/dma-mapping.c | 435 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 435 insertions(+)
> 
[...]
> +/*
> + * TODO: Right now __iommu_setup_dma_ops() gets called too early to do
> + * everything it needs to - the device is only partially created and the
> + * IOMMU driver hasn't seen it yet, so it can't have a group. Thus we
> + * need this delayed attachment dance. Once IOMMU probe ordering is sorted
> + * to move the arch_setup_dma_ops() call later, all the notifier bits below
> + * become unnecessary, and will go away.
> + */

Hi Robin,
      Could I ask a question about the future plan:
      how will the arch_setup_dma_ops() call be moved later than IOMMU probe?

      arch_setup_dma_ops is called from of_dma_configure, which is called from
arm64_device_init, and IOMMU probe happens at subsys_init. So
arch_setup_dma_ops will normally run before IOMMU probe, is that right?
      Could Laurent's probe-deferral series help do this? What's
the state of that series?

> +struct iommu_dma_notifier_data {
> +	struct list_head list;
> +	struct device *dev;
> +	const struct iommu_ops *ops;
> +	u64 dma_base;
> +	u64 size;
> +};
> +static LIST_HEAD(iommu_dma_masters);
> +static DEFINE_MUTEX(iommu_dma_notifier_lock);
> +
> +/*
> + * Temporarily "borrow" a domain feature flag to tell if we had to resort
> + * to creating our own domain here, in case we need to clean it up again.
> + */
> +#define __IOMMU_DOMAIN_FAKE_DEFAULT		(1U << 31)
> +
> +static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops,
> +			   u64 dma_base, u64 size)
> +{
> +	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +
> +	/*
> +	 * Best case: The device is either part of a group which was
> +	 * already attached to a domain in a previous call, or it's
> +	 * been put in a default DMA domain by the IOMMU core.
> +	 */
> +	if (!domain) {
> +		/*
> +		 * Urgh. The IOMMU core isn't going to do default domains
> +		 * for non-PCI devices anyway, until it has some means of
> +		 * abstracting the entirely implementation-specific
> +		 * sideband data/SoC topology/unicorn dust that may or
> +		 * may not differentiate upstream masters.
> +		 * So until then, HORRIBLE HACKS!
> +		 */
> +		domain = ops->domain_alloc(IOMMU_DOMAIN_DMA);
> +		if (!domain)
> +			goto out_no_domain;
> +
> +		domain->ops = ops;
> +		domain->type = IOMMU_DOMAIN_DMA | __IOMMU_DOMAIN_FAKE_DEFAULT;
> +
> +		if (iommu_attach_device(domain, dev))
> +			goto out_put_domain;
> +	}
> +
> +	if (iommu_dma_init_domain(domain, dma_base, size))
> +		goto out_detach;
> +
> +	dev->archdata.dma_ops = &iommu_dma_ops;
> +	return true;
> +
> +out_detach:
> +	iommu_detach_device(domain, dev);
> +out_put_domain:
> +	if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT)
> +		iommu_domain_free(domain);
> +out_no_domain:
> +	pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
> +		dev_name(dev));
> +	return false;
> +}
[...]
> +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
> +				  const struct iommu_ops *ops)
> +{
> +	struct iommu_group *group;
> +
> +	if (!ops)
> +		return;
> +	/*
> +	 * TODO: As a concession to the future, we're ready to handle being
> +	 * called both early and late (i.e. after bus_add_device). Once all
> +	 * the platform bus code is reworked to call us late and the notifier
> +	 * junk above goes away, move the body of do_iommu_attach here.
> +	 */
> +	group = iommu_group_get(dev);

   If iommu_setup_dma_ops runs after bus_add_device, then the device already has
its group here. It will enter do_iommu_attach, which will allocate a default
iommu domain and attach this device to that new domain.
   But mtk-iommu doesn't expect this; we would like all our masters to attach to the
same domain. So we should allocate a default iommu domain (if there is no
iommu domain at that time) and attach the device to that same domain in
our xx_add_device, is that right?

> +	if (group) {
> +		do_iommu_attach(dev, ops, dma_base, size);
> +		iommu_group_put(group);
> +	} else {
> +		queue_iommu_attach(dev, ops, dma_base, size);
> +	}
> +}
> +
> +#else
> +
> +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
> +				  struct iommu_ops *iommu)
> +{ }
> +
> +#endif  /* CONFIG_IOMMU_DMA */
> +
Anup Patel Oct. 7, 2015, 9:03 a.m. UTC | #2
Hi Robin,

On Fri, Oct 2, 2015 at 12:43 AM, Robin Murphy <robin.murphy@arm.com> wrote:
> Taking some inspiration from the arch/arm code, implement the
> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
>
> Since there is still work to do elsewhere to make DMA configuration happen
> in a more appropriate order and properly support platform devices in the
> IOMMU core, the device setup code unfortunately starts out carrying some
> workarounds to ensure it works correctly in the current state of things.
>
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  arch/arm64/mm/dma-mapping.c | 435 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 435 insertions(+)
>
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index 0bcc4bc..dd2d6e6 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -533,3 +533,438 @@ static int __init dma_debug_do_init(void)
>         return 0;
>  }
>  fs_initcall(dma_debug_do_init);
> +
> +
> +#ifdef CONFIG_IOMMU_DMA
> +#include <linux/dma-iommu.h>
> +#include <linux/platform_device.h>
> +#include <linux/amba/bus.h>
> +
> +/* Thankfully, all cache ops are by VA so we can ignore phys here */
> +static void flush_page(struct device *dev, const void *virt, phys_addr_t phys)
> +{
> +       __dma_flush_range(virt, virt + PAGE_SIZE);
> +}
> +
> +static void *__iommu_alloc_attrs(struct device *dev, size_t size,
> +                                dma_addr_t *handle, gfp_t gfp,
> +                                struct dma_attrs *attrs)
> +{
> +       bool coherent = is_device_dma_coherent(dev);
> +       int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
> +       void *addr;
> +
> +       if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
> +               return NULL;
> +       /*
> +        * Some drivers rely on this, and we probably don't want the
> +        * possibility of stale kernel data being read by devices anyway.
> +        */
> +       gfp |= __GFP_ZERO;
> +
> +       if (gfp & __GFP_WAIT) {
> +               struct page **pages;
> +               pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
> +
> +               pages = iommu_dma_alloc(dev, size, gfp, ioprot, handle,
> +                                       flush_page);
> +               if (!pages)
> +                       return NULL;
> +
> +               addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
> +                                             __builtin_return_address(0));
> +               if (!addr)
> +                       iommu_dma_free(dev, pages, size, handle);
> +       } else {
> +               struct page *page;
> +               /*
> +                * In atomic context we can't remap anything, so we'll only
> +                * get the virtually contiguous buffer we need by way of a
> +                * physically contiguous allocation.
> +                */
> +               if (coherent) {
> +                       page = alloc_pages(gfp, get_order(size));
> +                       addr = page ? page_address(page) : NULL;
> +               } else {
> +                       addr = __alloc_from_pool(size, &page, gfp);
> +               }
> +               if (!addr)
> +                       return NULL;
> +
> +               *handle = iommu_dma_map_page(dev, page, 0, size, ioprot);
> +               if (iommu_dma_mapping_error(dev, *handle)) {
> +                       if (coherent)
> +                               __free_pages(page, get_order(size));
> +                       else
> +                               __free_from_pool(addr, size);
> +                       addr = NULL;
> +               }
> +       }
> +       return addr;
> +}
> +
> +static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
> +                              dma_addr_t handle, struct dma_attrs *attrs)
> +{
> +       /*
> +        * @cpu_addr will be one of 3 things depending on how it was allocated:
> +        * - A remapped array of pages from iommu_dma_alloc(), for all
> +        *   non-atomic allocations.
> +        * - A non-cacheable alias from the atomic pool, for atomic
> +        *   allocations by non-coherent devices.
> +        * - A normal lowmem address, for atomic allocations by
> +        *   coherent devices.
> +        * Hence how dodgy the below logic looks...
> +        */
> +       if (__in_atomic_pool(cpu_addr, size)) {
> +               iommu_dma_unmap_page(dev, handle, size, 0, NULL);
> +               __free_from_pool(cpu_addr, size);
> +       } else if (is_vmalloc_addr(cpu_addr)){
> +               struct vm_struct *area = find_vm_area(cpu_addr);
> +
> +               if (WARN_ON(!area || !area->pages))
> +                       return;
> +               iommu_dma_free(dev, area->pages, size, &handle);
> +               dma_common_free_remap(cpu_addr, size, VM_USERMAP);
> +       } else {
> +               iommu_dma_unmap_page(dev, handle, size, 0, NULL);
> +               __free_pages(virt_to_page(cpu_addr), get_order(size));
> +       }
> +}
> +
> +static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
> +                             void *cpu_addr, dma_addr_t dma_addr, size_t size,
> +                             struct dma_attrs *attrs)
> +{
> +       struct vm_struct *area;
> +       int ret;
> +
> +       vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot,
> +                                            is_device_dma_coherent(dev));
> +
> +       if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
> +               return ret;
> +
> +       area = find_vm_area(cpu_addr);
> +       if (WARN_ON(!area || !area->pages))
> +               return -ENXIO;
> +
> +       return iommu_dma_mmap(area->pages, size, vma);
> +}
> +
> +static int __iommu_get_sgtable(struct device *dev, struct sg_table *sgt,
> +                              void *cpu_addr, dma_addr_t dma_addr,
> +                              size_t size, struct dma_attrs *attrs)
> +{
> +       unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
> +       struct vm_struct *area = find_vm_area(cpu_addr);
> +
> +       if (WARN_ON(!area || !area->pages))
> +               return -ENXIO;
> +
> +       return sg_alloc_table_from_pages(sgt, area->pages, count, 0, size,
> +                                        GFP_KERNEL);
> +}
> +
> +static void __iommu_sync_single_for_cpu(struct device *dev,
> +                                       dma_addr_t dev_addr, size_t size,
> +                                       enum dma_data_direction dir)
> +{
> +       phys_addr_t phys;
> +
> +       if (is_device_dma_coherent(dev))
> +               return;
> +
> +       phys = iommu_iova_to_phys(iommu_get_domain_for_dev(dev), dev_addr);
> +       __dma_unmap_area(phys_to_virt(phys), size, dir);
> +}
> +
> +static void __iommu_sync_single_for_device(struct device *dev,
> +                                          dma_addr_t dev_addr, size_t size,
> +                                          enum dma_data_direction dir)
> +{
> +       phys_addr_t phys;
> +
> +       if (is_device_dma_coherent(dev))
> +               return;
> +
> +       phys = iommu_iova_to_phys(iommu_get_domain_for_dev(dev), dev_addr);
> +       __dma_map_area(phys_to_virt(phys), size, dir);
> +}
> +
> +static dma_addr_t __iommu_map_page(struct device *dev, struct page *page,
> +                                  unsigned long offset, size_t size,
> +                                  enum dma_data_direction dir,
> +                                  struct dma_attrs *attrs)
> +{
> +       bool coherent = is_device_dma_coherent(dev);
> +       int prot = dma_direction_to_prot(dir, coherent);
> +       dma_addr_t dev_addr = iommu_dma_map_page(dev, page, offset, size, prot);
> +
> +       if (!iommu_dma_mapping_error(dev, dev_addr) &&
> +           !dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
> +               __iommu_sync_single_for_device(dev, dev_addr, size, dir);
> +
> +       return dev_addr;
> +}
> +
> +static void __iommu_unmap_page(struct device *dev, dma_addr_t dev_addr,
> +                              size_t size, enum dma_data_direction dir,
> +                              struct dma_attrs *attrs)
> +{
> +       if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
> +               __iommu_sync_single_for_cpu(dev, dev_addr, size, dir);
> +
> +       iommu_dma_unmap_page(dev, dev_addr, size, dir, attrs);
> +}
> +
> +static void __iommu_sync_sg_for_cpu(struct device *dev,
> +                                   struct scatterlist *sgl, int nelems,
> +                                   enum dma_data_direction dir)
> +{
> +       struct scatterlist *sg;
> +       int i;
> +
> +       if (is_device_dma_coherent(dev))
> +               return;
> +
> +       for_each_sg(sgl, sg, nelems, i)
> +               __dma_unmap_area(sg_virt(sg), sg->length, dir);
> +}
> +
> +static void __iommu_sync_sg_for_device(struct device *dev,
> +                                      struct scatterlist *sgl, int nelems,
> +                                      enum dma_data_direction dir)
> +{
> +       struct scatterlist *sg;
> +       int i;
> +
> +       if (is_device_dma_coherent(dev))
> +               return;
> +
> +       for_each_sg(sgl, sg, nelems, i)
> +               __dma_map_area(sg_virt(sg), sg->length, dir);
> +}
> +
> +static int __iommu_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
> +                               int nelems, enum dma_data_direction dir,
> +                               struct dma_attrs *attrs)
> +{
> +       bool coherent = is_device_dma_coherent(dev);
> +
> +       if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
> +               __iommu_sync_sg_for_device(dev, sgl, nelems, dir);
> +
> +       return iommu_dma_map_sg(dev, sgl, nelems,
> +                       dma_direction_to_prot(dir, coherent));
> +}
> +
> +static void __iommu_unmap_sg_attrs(struct device *dev,
> +                                  struct scatterlist *sgl, int nelems,
> +                                  enum dma_data_direction dir,
> +                                  struct dma_attrs *attrs)
> +{
> +       if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
> +               __iommu_sync_sg_for_cpu(dev, sgl, nelems, dir);
> +
> +       iommu_dma_unmap_sg(dev, sgl, nelems, dir, attrs);
> +}
> +
> +static struct dma_map_ops iommu_dma_ops = {
> +       .alloc = __iommu_alloc_attrs,
> +       .free = __iommu_free_attrs,
> +       .mmap = __iommu_mmap_attrs,
> +       .get_sgtable = __iommu_get_sgtable,
> +       .map_page = __iommu_map_page,
> +       .unmap_page = __iommu_unmap_page,
> +       .map_sg = __iommu_map_sg_attrs,
> +       .unmap_sg = __iommu_unmap_sg_attrs,
> +       .sync_single_for_cpu = __iommu_sync_single_for_cpu,
> +       .sync_single_for_device = __iommu_sync_single_for_device,
> +       .sync_sg_for_cpu = __iommu_sync_sg_for_cpu,
> +       .sync_sg_for_device = __iommu_sync_sg_for_device,
> +       .dma_supported = iommu_dma_supported,
> +       .mapping_error = iommu_dma_mapping_error,
> +};
> +
> +/*
> + * TODO: Right now __iommu_setup_dma_ops() gets called too early to do
> + * everything it needs to - the device is only partially created and the
> + * IOMMU driver hasn't seen it yet, so it can't have a group. Thus we
> + * need this delayed attachment dance. Once IOMMU probe ordering is sorted
> + * to move the arch_setup_dma_ops() call later, all the notifier bits below
> + * become unnecessary, and will go away.
> + */
> +struct iommu_dma_notifier_data {
> +       struct list_head list;
> +       struct device *dev;
> +       const struct iommu_ops *ops;
> +       u64 dma_base;
> +       u64 size;
> +};
> +static LIST_HEAD(iommu_dma_masters);
> +static DEFINE_MUTEX(iommu_dma_notifier_lock);
> +
> +/*
> + * Temporarily "borrow" a domain feature flag to tell if we had to resort
> + * to creating our own domain here, in case we need to clean it up again.
> + */
> +#define __IOMMU_DOMAIN_FAKE_DEFAULT            (1U << 31)
> +
> +static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops,
> +                          u64 dma_base, u64 size)
> +{
> +       struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +
> +       /*
> +        * Best case: The device is either part of a group which was
> +        * already attached to a domain in a previous call, or it's
> +        * been put in a default DMA domain by the IOMMU core.
> +        */
> +       if (!domain) {
> +               /*
> +                * Urgh. The IOMMU core isn't going to do default domains
> +                * for non-PCI devices anyway, until it has some means of
> +                * abstracting the entirely implementation-specific
> +                * sideband data/SoC topology/unicorn dust that may or
> +                * may not differentiate upstream masters.
> +                * So until then, HORRIBLE HACKS!
> +                */
> +               domain = ops->domain_alloc(IOMMU_DOMAIN_DMA);
> +               if (!domain)
> +                       goto out_no_domain;
> +
> +               domain->ops = ops;
> +               domain->type = IOMMU_DOMAIN_DMA | __IOMMU_DOMAIN_FAKE_DEFAULT;

We require iommu_get_dma_cookie(domain) here. If we don't
allocate the IOMMU cookie, then iommu_dma_init_domain() will fail.

> +
> +               if (iommu_attach_device(domain, dev))
> +                       goto out_put_domain;
> +       }
> +
> +       if (iommu_dma_init_domain(domain, dma_base, size))
> +               goto out_detach;
> +
> +       dev->archdata.dma_ops = &iommu_dma_ops;
> +       return true;
> +
> +out_detach:
> +       iommu_detach_device(domain, dev);
> +out_put_domain:
> +       if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT)
> +               iommu_domain_free(domain);
> +out_no_domain:
> +       pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
> +               dev_name(dev));
> +       return false;
> +}
> +
> +static void queue_iommu_attach(struct device *dev, const struct iommu_ops *ops,
> +                             u64 dma_base, u64 size)
> +{
> +       struct iommu_dma_notifier_data *iommudata;
> +
> +       iommudata = kzalloc(sizeof(*iommudata), GFP_KERNEL);
> +       if (!iommudata)
> +               return;
> +
> +       iommudata->dev = dev;
> +       iommudata->ops = ops;
> +       iommudata->dma_base = dma_base;
> +       iommudata->size = size;
> +
> +       mutex_lock(&iommu_dma_notifier_lock);
> +       list_add(&iommudata->list, &iommu_dma_masters);
> +       mutex_unlock(&iommu_dma_notifier_lock);
> +}
> +
> +static int __iommu_attach_notifier(struct notifier_block *nb,
> +                                  unsigned long action, void *data)
> +{
> +       struct iommu_dma_notifier_data *master, *tmp;
> +
> +       if (action != BUS_NOTIFY_ADD_DEVICE)
> +               return 0;
> +
> +       mutex_lock(&iommu_dma_notifier_lock);
> +       list_for_each_entry_safe(master, tmp, &iommu_dma_masters, list) {
> +               if (do_iommu_attach(master->dev, master->ops,
> +                               master->dma_base, master->size)) {
> +                       list_del(&master->list);
> +                       kfree(master);
> +               }
> +       }
> +       mutex_unlock(&iommu_dma_notifier_lock);
> +       return 0;
> +}
> +
> +static int register_iommu_dma_ops_notifier(struct bus_type *bus)
> +{
> +       struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL);
> +       int ret;
> +
> +       if (!nb)
> +               return -ENOMEM;
> +       /*
> +        * The device must be attached to a domain before the driver probe
> +        * routine gets a chance to start allocating DMA buffers. However,
> +        * the IOMMU driver also needs a chance to configure the iommu_group
> +        * via its add_device callback first, so we need to make the attach
> +        * happen between those two points. Since the IOMMU core uses a bus
> +        * notifier with default priority for add_device, do the same but
> +        * with a lower priority to ensure the appropriate ordering.
> +        */
> +       nb->notifier_call = __iommu_attach_notifier;
> +       nb->priority = -100;
> +
> +       ret = bus_register_notifier(bus, nb);
> +       if (ret) {
> +               pr_warn("Failed to register DMA domain notifier; IOMMU DMA ops unavailable on bus '%s'\n",
> +                       bus->name);
> +               kfree(nb);
> +       }
> +       return ret;
> +}
> +
> +static int __init __iommu_dma_init(void)
> +{
> +       int ret;
> +
> +       ret = iommu_dma_init();
> +       if (!ret)
> +               ret = register_iommu_dma_ops_notifier(&platform_bus_type);
> +       if (!ret)
> +               ret = register_iommu_dma_ops_notifier(&amba_bustype);
> +       return ret;
> +}
> +arch_initcall(__iommu_dma_init);
> +
> +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
> +                                 const struct iommu_ops *ops)
> +{
> +       struct iommu_group *group;
> +
> +       if (!ops)
> +               return;
> +       /*
> +        * TODO: As a concession to the future, we're ready to handle being
> +        * called both early and late (i.e. after bus_add_device). Once all
> +        * the platform bus code is reworked to call us late and the notifier
> +        * junk above goes away, move the body of do_iommu_attach here.
> +        */
> +       group = iommu_group_get(dev);
> +       if (group) {
> +               do_iommu_attach(dev, ops, dma_base, size);
> +               iommu_group_put(group);
> +       } else {
> +               queue_iommu_attach(dev, ops, dma_base, size);
> +       }
> +}
> +
> +#else
> +
> +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
> +                                 struct iommu_ops *iommu)
> +{ }
> +
> +#endif  /* CONFIG_IOMMU_DMA */
> +
> --
> 1.9.1
>

Regards,
Anup
Robin Murphy Oct. 7, 2015, 4:07 p.m. UTC | #3
On 06/10/15 12:00, Yong Wu wrote:
> On Thu, 2015-10-01 at 20:13 +0100, Robin Murphy wrote:
>> Taking some inspiration from the arch/arm code, implement the
>> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
>>
>> Since there is still work to do elsewhere to make DMA configuration happen
>> in a more appropriate order and properly support platform devices in the
>> IOMMU core, the device setup code unfortunately starts out carrying some
>> workarounds to ensure it works correctly in the current state of things.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> ---
>>   arch/arm64/mm/dma-mapping.c | 435 ++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 435 insertions(+)
>>
> [...]
>> +/*
>> + * TODO: Right now __iommu_setup_dma_ops() gets called too early to do
>> + * everything it needs to - the device is only partially created and the
>> + * IOMMU driver hasn't seen it yet, so it can't have a group. Thus we
>> + * need this delayed attachment dance. Once IOMMU probe ordering is sorted
>> + * to move the arch_setup_dma_ops() call later, all the notifier bits below
>> + * become unnecessary, and will go away.
>> + */
>
> Hi Robin,
>        Could I ask a question about the plan in the future:
>        How to move arch_setup_dma_ops() call later than IOMMU probe?
>
>        arch_setup_dma_ops is from of_dma_configure which is from
> arm64_device_init, and IOMMU probe is subsys_init. So
> arch_setup_dma_ops will run before IOMMU probe normally, is it right?

Yup, hence the need to call of_platform_device_create() manually in your 
IOMMU_OF_DECLARE init function if you need the actual device instance to 
be ready before the root of_platform_populate() runs.
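
(For reference, a minimal sketch of that pattern - the names my_iommu_ops,
my_iommu_of_setup and the "vendor,my-iommu" compatible are made up for
illustration rather than taken from any real driver, and it assumes
<linux/of_iommu.h>, <linux/of_platform.h> and <linux/platform_device.h>:)

static struct iommu_ops my_iommu_ops;	/* the driver's real ops, defined elsewhere */

static int __init my_iommu_of_setup(struct device_node *np)
{
	struct platform_device *pdev;

	/*
	 * Create the IOMMU's platform device by hand so it exists (and can
	 * probe) before the root of_platform_populate() creates its masters.
	 */
	pdev = of_platform_device_create(np, NULL, platform_bus_type.dev_root);
	if (!pdev)
		return -ENODEV;

	/* Let of_iommu_configure() find our ops for masters referencing np */
	of_iommu_set_ops(np, &my_iommu_ops);
	return 0;
}
IOMMU_OF_DECLARE(my_iommu, "vendor,my-iommu", my_iommu_of_setup);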

>        Does Laurent's probe-deferral series could help do this? what's
> the state of this series.

What Laurent's patches do is to leave the DMA mask configuration where 
it is early in device creation, but split out the dma_ops configuration 
to be called just before the actual driver probe, and defer that if the 
IOMMU device hasn't probed yet. At the moment, those patches (plus a bit 
of my own development on top) are working fairly well in the simple 
case, but I've seen things start falling apart if the client driver then 
requests its own probe deferral, and there are probably other 
troublesome edge cases to find - I need to dig into that further, but 
sorting out my ARM SMMU driver patches is currently looking like a 
higher priority.

>> +struct iommu_dma_notifier_data {
>> +	struct list_head list;
>> +	struct device *dev;
>> +	const struct iommu_ops *ops;
>> +	u64 dma_base;
>> +	u64 size;
>> +};
>> +static LIST_HEAD(iommu_dma_masters);
>> +static DEFINE_MUTEX(iommu_dma_notifier_lock);
>> +
>> +/*
>> + * Temporarily "borrow" a domain feature flag to tell if we had to resort
>> + * to creating our own domain here, in case we need to clean it up again.
>> + */
>> +#define __IOMMU_DOMAIN_FAKE_DEFAULT		(1U << 31)
>> +
>> +static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops,
>> +			   u64 dma_base, u64 size)
>> +{
>> +	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
>> +
>> +	/*
>> +	 * Best case: The device is either part of a group which was
>> +	 * already attached to a domain in a previous call, or it's
>> +	 * been put in a default DMA domain by the IOMMU core.
>> +	 */
>> +	if (!domain) {
>> +		/*
>> +		 * Urgh. The IOMMU core isn't going to do default domains
>> +		 * for non-PCI devices anyway, until it has some means of
>> +		 * abstracting the entirely implementation-specific
>> +		 * sideband data/SoC topology/unicorn dust that may or
>> +		 * may not differentiate upstream masters.
>> +		 * So until then, HORRIBLE HACKS!
>> +		 */
>> +		domain = ops->domain_alloc(IOMMU_DOMAIN_DMA);
>> +		if (!domain)
>> +			goto out_no_domain;
>> +
>> +		domain->ops = ops;
>> +		domain->type = IOMMU_DOMAIN_DMA | __IOMMU_DOMAIN_FAKE_DEFAULT;
>> +
>> +		if (iommu_attach_device(domain, dev))
>> +			goto out_put_domain;
>> +	}
>> +
>> +	if (iommu_dma_init_domain(domain, dma_base, size))
>> +		goto out_detach;
>> +
>> +	dev->archdata.dma_ops = &iommu_dma_ops;
>> +	return true;
>> +
>> +out_detach:
>> +	iommu_detach_device(domain, dev);
>> +out_put_domain:
>> +	if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT)
>> +		iommu_domain_free(domain);
>> +out_no_domain:
>> +	pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
>> +		dev_name(dev));
>> +	return false;
>> +}
> [...]
>> +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
>> +				  const struct iommu_ops *ops)
>> +{
>> +	struct iommu_group *group;
>> +
>> +	if (!ops)
>> +		return;
>> +	/*
>> +	 * TODO: As a concession to the future, we're ready to handle being
>> +	 * called both early and late (i.e. after bus_add_device). Once all
>> +	 * the platform bus code is reworked to call us late and the notifier
>> +	 * junk above goes away, move the body of do_iommu_attach here.
>> +	 */
>> +	group = iommu_group_get(dev);
>
>     If iommu_setup_dma_ops run after bus_add_device, then the device has
> its group here. It will enter do_iommu_attach which will alloc a default
> iommu domain and attach this device to the new iommu domain.
>     But mtk-iommu don't expect like this, we would like to attach to the
> same domain. So we should alloc a default iommu domain(if there is no
> iommu domain at that time) and attach the device to the same domain in
> our xx_add_device, is it right?

Yes, if you attach the device to your own 'real' default domain after 
setting up the group in add_device, then do_iommu_attach() will now pick 
that domain up and use it instead of trying to create a new one, and the 
arch code will stop short of tearing the domain down if the device probe 
fails and it gets detached again. Additionally, since from add_device 
you should hopefully have all the information you need to get back to 
the relevant m4u instance, it should now be OK to keep the default 
domain there and finally get rid of that pesky global variable.
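
(To make that concrete, here is a rough sketch of such an add_device callback.
The names my_iommu_add_device, my_iommu_ops and my_shared_domain are invented
for illustration - in a real driver the shared domain would live in the per-M4U
driver data rather than a global - and error handling is kept minimal:)

static const struct iommu_ops my_iommu_ops;	/* the driver's real ops */
static struct iommu_domain *my_shared_domain;

static int my_iommu_add_device(struct device *dev)
{
	struct iommu_group *group;
	int ret;

	/* Give the master a group, so do_iommu_attach() above sees one */
	group = iommu_group_alloc();
	if (IS_ERR(group))
		return PTR_ERR(group);
	ret = iommu_group_add_device(group, dev);
	iommu_group_put(group);
	if (ret)
		return ret;

	/*
	 * Lazily create the single domain shared by every master. Going
	 * through the driver's own domain_alloc(IOMMU_DOMAIN_DMA) lets it
	 * also set up whatever it needs for DMA domains (e.g. the IOVA
	 * cookie discussed elsewhere in this thread).
	 */
	if (!my_shared_domain) {
		my_shared_domain = my_iommu_ops.domain_alloc(IOMMU_DOMAIN_DMA);
		if (!my_shared_domain)
			return -ENOMEM;
		my_shared_domain->ops = &my_iommu_ops;
		my_shared_domain->type = IOMMU_DOMAIN_DMA;
	}

	/*
	 * Attach now, so that do_iommu_attach() finds this domain via
	 * iommu_get_domain_for_dev() and reuses it instead of creating
	 * a fresh one per device.
	 */
	return iommu_attach_device(my_shared_domain, dev);
}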

Robin.

>> +	if (group) {
>> +		do_iommu_attach(dev, ops, dma_base, size);
>> +		iommu_group_put(group);
>> +	} else {
>> +		queue_iommu_attach(dev, ops, dma_base, size);
>> +	}
>> +}
>> +
>> +#else
>> +
>> +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
>> +				  struct iommu_ops *iommu)
>> +{ }
>> +
>> +#endif  /* CONFIG_IOMMU_DMA */
>> +
>
>
Robin Murphy Oct. 7, 2015, 4:36 p.m. UTC | #4
On 07/10/15 10:03, Anup Patel wrote:
[...]
>> +static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops,
>> +                          u64 dma_base, u64 size)
>> +{
>> +       struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
>> +
>> +       /*
>> +        * Best case: The device is either part of a group which was
>> +        * already attached to a domain in a previous call, or it's
>> +        * been put in a default DMA domain by the IOMMU core.
>> +        */
>> +       if (!domain) {
>> +               /*
>> +                * Urgh. The IOMMU core isn't going to do default domains
>> +                * for non-PCI devices anyway, until it has some means of
>> +                * abstracting the entirely implementation-specific
>> +                * sideband data/SoC topology/unicorn dust that may or
>> +                * may not differentiate upstream masters.
>> +                * So until then, HORRIBLE HACKS!
>> +                */
>> +               domain = ops->domain_alloc(IOMMU_DOMAIN_DMA);
>> +               if (!domain)
>> +                       goto out_no_domain;
>> +
>> +               domain->ops = ops;
>> +               domain->type = IOMMU_DOMAIN_DMA | __IOMMU_DOMAIN_FAKE_DEFAULT;
>
> We require iommu_get_dma_cookie(domain) here. If we dont
> allocate iommu cookie then iommu_dma_init_domain() will fail.

The iova cookie is tightly coupled with the domain, so it really only 
makes sense for the IOMMU driver to deal with it as part of its 
domain_alloc/domain_free callbacks.

Doing that here was one of the nastier 'compatibility' hacks which I've 
now taken out; trying to make things work without modifying existing 
IOMMU drivers was just too impractical. Drivers which want to play are 
now required to support the IOMMU_DOMAIN_DMA type appropriately, but 
it's still only a minimal change, as per the example diff for the ARM 
SMMU driver below (big pile of further patches necessary to make said 
driver compatible in other respects notwithstanding).

Robin.

--->8---
@@ -29,6 +29,7 @@
  #define pr_fmt(fmt) "arm-smmu: " fmt

  #include <linux/delay.h>
+#include <linux/dma-iommu.h>
  #include <linux/dma-mapping.h>
  #include <linux/err.h>
  #include <linux/interrupt.h>
@@ -973,7 +974,7 @@ static struct iommu_domain 
*arm_smmu_domain_alloc(unsigned type)
  {
  	struct arm_smmu_domain *smmu_domain;

-	if (type != IOMMU_DOMAIN_UNMANAGED)
+	if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
  		return NULL;
  	/*
  	 * Allocate the domain and initialise some of its data structures.
@@ -984,6 +985,12 @@ static struct iommu_domain 
*arm_smmu_domain_alloc(unsigned type)
  	if (!smmu_domain)
  		return NULL;

+	if (type == IOMMU_DOMAIN_DMA &&
+	    iommu_get_dma_cookie(&smmu_domain->domain)) {
+		kfree(smmu_domain);
+		return NULL;
+	}
+
  	mutex_init(&smmu_domain->init_mutex);
  	spin_lock_init(&smmu_domain->pgtbl_lock);

@@ -998,6 +1005,7 @@ static void arm_smmu_domain_free(struct 
iommu_domain *domain)
  	 * Free the domain resources. We assume that all devices have
  	 * already been detached.
  	 */
+	iommu_put_dma_cookie(domain);
  	arm_smmu_destroy_domain_context(domain);
  	kfree(smmu_domain);
  }
Anup Patel Oct. 7, 2015, 5:40 p.m. UTC | #5
On Wed, Oct 7, 2015 at 10:06 PM, Robin Murphy <robin.murphy@arm.com> wrote:
> On 07/10/15 10:03, Anup Patel wrote:
> [...]
>
>>> +static bool do_iommu_attach(struct device *dev, const struct iommu_ops
>>> *ops,
>>> +                          u64 dma_base, u64 size)
>>> +{
>>> +       struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
>>> +
>>> +       /*
>>> +        * Best case: The device is either part of a group which was
>>> +        * already attached to a domain in a previous call, or it's
>>> +        * been put in a default DMA domain by the IOMMU core.
>>> +        */
>>> +       if (!domain) {
>>> +               /*
>>> +                * Urgh. The IOMMU core isn't going to do default domains
>>> +                * for non-PCI devices anyway, until it has some means of
>>> +                * abstracting the entirely implementation-specific
>>> +                * sideband data/SoC topology/unicorn dust that may or
>>> +                * may not differentiate upstream masters.
>>> +                * So until then, HORRIBLE HACKS!
>>> +                */
>>> +               domain = ops->domain_alloc(IOMMU_DOMAIN_DMA);
>>> +               if (!domain)
>>> +                       goto out_no_domain;
>>> +
>>> +               domain->ops = ops;
>>> +               domain->type = IOMMU_DOMAIN_DMA |
>>> __IOMMU_DOMAIN_FAKE_DEFAULT;
>>
>>
>> We require iommu_get_dma_cookie(domain) here. If we dont
>> allocate iommu cookie then iommu_dma_init_domain() will fail.
>
>
> The iova cookie is tightly coupled with the domain, so it really only makes
> sense for the IOMMU driver to deal with it as part of its
> domain_alloc/domain_free callbacks.
>
> Doing that here was one of the nastier 'compatibility' hacks which I've now
> taken out; trying to make things work without modifying existing IOMMU
> drivers was just too impractical. Drivers which want to play are now
> required to support the IOMMU_DOMAIN_DMA type appropriately, but it's still
> only a minimal change, as per the example diff for the ARM SMMU driver below
> (big pile of further patches necessary to make said driver compatible in
> other respects notwithstanding).

Many thanks for the info.

Actually, I was trying your dma-mapping patches with the SMMU driver. I have got
the SMMU driver somewhat working with IOMMU_DOMAIN_DMA. Your suggestion
will help me restrict my changes to the SMMU driver only.

This patchset looks good to me.

Thanks,
Anup
Yong Wu (吴勇) Oct. 9, 2015, 5:44 a.m. UTC | #6
On Wed, 2015-10-07 at 17:07 +0100, Robin Murphy wrote:
> On 06/10/15 12:00, Yong Wu wrote:
> > On Thu, 2015-10-01 at 20:13 +0100, Robin Murphy wrote:
> >> Taking some inspiration from the arch/arm code, implement the
> >> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
> >>
> >> Since there is still work to do elsewhere to make DMA configuration happen
> >> in a more appropriate order and properly support platform devices in the
> >> IOMMU core, the device setup code unfortunately starts out carrying some
> >> workarounds to ensure it works correctly in the current state of things.
> >>
> >> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> >> ---
> >>   arch/arm64/mm/dma-mapping.c | 435 ++++++++++++++++++++++++++++++++++++++++++++
> >>   1 file changed, 435 insertions(+)
> >>
> > [...]
> >> +/*
> >> + * TODO: Right now __iommu_setup_dma_ops() gets called too early to do
> >> + * everything it needs to - the device is only partially created and the
> >> + * IOMMU driver hasn't seen it yet, so it can't have a group. Thus we
> >> + * need this delayed attachment dance. Once IOMMU probe ordering is sorted
> >> + * to move the arch_setup_dma_ops() call later, all the notifier bits below
> >> + * become unnecessary, and will go away.
> >> + */
> >
> > Hi Robin,
> >        Could I ask a question about the plan in the future:
> >        How to move arch_setup_dma_ops() call later than IOMMU probe?
> >
> >        arch_setup_dma_ops is from of_dma_configure which is from
> > arm64_device_init, and IOMMU probe is subsys_init. So
> > arch_setup_dma_ops will run before IOMMU probe normally, is it right?
> 
> Yup, hence the need to call of_platform_device_create() manually in your 
> IOMMU_OF_DECLARE init function if you need the actual device instance to 
> be ready before the root of_platform_populate() runs.

Thanks. I have added of_platform_device_create.

If arch_setup_dma_ops is always called before IOMMU probe, what is
the meaning of the TODO comment here? Will arch_setup_dma_ops()
be moved to run later than IOMMU probe? How would that be done?

> 
> >        Does Laurent's probe-deferral series could help do this? what's
> > the state of this series.
> 
> What Laurent's patches do is to leave the DMA mask configuration where 
> it is early in device creation, but split out the dma_ops configuration 
> to be called just before the actual driver probe, and defer that if the 
> IOMMU device hasn't probed yet. At the moment, those patches (plus a bit 
> of my own development on top) are working fairly well in the simple 
> case, but I've seen things start falling apart if the client driver then 
> requests its own probe deferral, and there are probably other 
> troublesome edge cases to find - I need to dig into that further, but 
> sorting out my ARM SMMU driver patches is currently looking like a 
> higher priority.
> 
> >> +struct iommu_dma_notifier_data {
> >> +	struct list_head list;
> >> +	struct device *dev;
> >> +	const struct iommu_ops *ops;
> >> +	u64 dma_base;
> >> +	u64 size;
> >> +};
> [...]
> >> +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
> >> +				  const struct iommu_ops *ops)
> >> +{
> >> +	struct iommu_group *group;
> >> +
> >> +	if (!ops)
> >> +		return;
> >> +	/*
> >> +	 * TODO: As a concession to the future, we're ready to handle being
> >> +	 * called both early and late (i.e. after bus_add_device). Once all
> >> +	 * the platform bus code is reworked to call us late and the notifier
> >> +	 * junk above goes away, move the body of do_iommu_attach here.
> >> +	 */
> >> +	group = iommu_group_get(dev);
> >
> >     If iommu_setup_dma_ops run after bus_add_device, then the device has
> > its group here. It will enter do_iommu_attach which will alloc a default
> > iommu domain and attach this device to the new iommu domain.
> >     But mtk-iommu don't expect like this, we would like to attach to the
> > same domain. So we should alloc a default iommu domain(if there is no
> > iommu domain at that time) and attach the device to the same domain in
> > our xx_add_device, is it right?
> 
> Yes, if you attach the device to your own 'real' default domain after 
> setting up the group in add_device, then do_iommu_attach() will now pick 
> that domain up and use it instead of trying to create a new one, and the 
> arch code will stop short of tearing the domain down if the device probe 
> fails and it gets detached again. Additionally, since from add_device 
> you should hopefully have all the information you need to get back to 
> the relevant m4u instance, it should now be OK to keep the default 
> domain there and finally get rid of that pesky global variable.
> 
> Robin.

Thanks very much for confirming.

I have implemented it following this. As above, arch_setup_dma_ops is always
called before IOMMU probe, so attach_device is called earlier than
probe and the iommu domain already exists in add_device.

Meanwhile, I attach all the IOMMU masters to the same domain in
add_device and free the other, unnecessary domains in attach_device.

My description here may not be clear, so I have sent the detailed code[1]. When
you are free, please also have a look.

Currently this is based on the assumption that arch_setup_dma_ops runs before
IOMMU probe, but from your TODO comment the ordering of
arch_setup_dma_ops may change. Is this series not a final version?
You also asked me "how can you guarantee domain_alloc() happens
before this driver is probed?", so I am a little confused by this TODO.

[1]:http://lists.linuxfoundation.org/pipermail/iommu/2015-October/014591.html

> 
> >> +	if (group) {
> >> +		do_iommu_attach(dev, ops, dma_base, size);
> >> +		iommu_group_put(group);
> >> +	} else {
> >> +		queue_iommu_attach(dev, ops, dma_base, size);
> >> +	}
> >> +}
> >> +
> >> +#else
> >> +
> >> +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
> >> +				  struct iommu_ops *iommu)
> >> +{ }
> >> +
> >> +#endif  /* CONFIG_IOMMU_DMA */
> >> +
> >
>
Joerg Roedel Oct. 14, 2015, 11:47 a.m. UTC | #7
On Thu, Oct 01, 2015 at 08:13:59PM +0100, Robin Murphy wrote:
> Taking some inspiration from the arch/arm code, implement the
> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
> 
> Since there is still work to do elsewhere to make DMA configuration happen
> in a more appropriate order and properly support platform devices in the
> IOMMU core, the device setup code unfortunately starts out carrying some
> workarounds to ensure it works correctly in the current state of things.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  arch/arm64/mm/dma-mapping.c | 435 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 435 insertions(+)

This needs an ack from the arm64 maintainers.
Catalin Marinas Oct. 14, 2015, 1:35 p.m. UTC | #8
On Thu, Oct 01, 2015 at 08:13:59PM +0100, Robin Murphy wrote:
> Taking some inspiration from the arch/arm code, implement the
> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
> 
> Since there is still work to do elsewhere to make DMA configuration happen
> in a more appropriate order and properly support platform devices in the
> IOMMU core, the device setup code unfortunately starts out carrying some
> workarounds to ensure it works correctly in the current state of things.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>

Sorry, I reviewed this patch before but forgot to ack it, so here it is:

Acked-by: Catalin Marinas <catalin.marinas@arm.com>

(and I'm fine for the arm64 patches here to go in via the iommu tree)

I assume part of this patch will disappear at some point when the device
probing order is sorted.
Robin Murphy Oct. 14, 2015, 4:34 p.m. UTC | #9
On 14/10/15 14:35, Catalin Marinas wrote:
> On Thu, Oct 01, 2015 at 08:13:59PM +0100, Robin Murphy wrote:
>> Taking some inspiration from the arch/arm code, implement the
>> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
>>
>> Since there is still work to do elsewhere to make DMA configuration happen
>> in a more appropriate order and properly support platform devices in the
>> IOMMU core, the device setup code unfortunately starts out carrying some
>> workarounds to ensure it works correctly in the current state of things.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>
> Sorry, I reviewed this patch before but forgot to ack it, so here it is:
>
> Acked-by: Catalin Marinas <catalin.marinas@arm.com>

Thanks! Although it turns out I'm at least partly to blame there - you 
did give a reviewed-by on v5, but I didn't add it here since I'd made 
significant changes - I should have checked and called that out, my bad.

> (and I'm fine for the arm64 patches here to go in via the iommu tree)
>
> I assume part of this patch will disappear at some point when the device
> probing order is sorted.

I'll be working on that for 4.5, indeed. Getting the initialisation 
order sorted out also stands in the way of converting 32-bit to the 
common ops, so it's very high up my priority list.

Robin.
Yong Wu (吴勇) Nov. 4, 2015, 8:39 a.m. UTC | #10
On Thu, 2015-10-01 at 20:13 +0100, Robin Murphy wrote:
> Taking some inspiration from the arch/arm code, implement the
> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
[...]
> +static void *__iommu_alloc_attrs(struct device *dev, size_t size,
> +				 dma_addr_t *handle, gfp_t gfp,
> +				 struct dma_attrs *attrs)
> +{
> +	bool coherent = is_device_dma_coherent(dev);
> +	int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
> +	void *addr;
> +
> +	if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
> +		return NULL;
> +	/*
> +	 * Some drivers rely on this, and we probably don't want the
> +	 * possibility of stale kernel data being read by devices anyway.
> +	 */
> +	gfp |= __GFP_ZERO;
> +
> +	if (gfp & __GFP_WAIT) {
> +		struct page **pages;
> +		pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
> +
> +		pages = iommu_dma_alloc(dev, size, gfp, ioprot,	handle,
> +					flush_page);
> +		if (!pages)
> +			return NULL;
> +
> +		addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
> +					      __builtin_return_address(0));
> +		if (!addr)
> +			iommu_dma_free(dev, pages, size, handle);
> +	} else {
> +		struct page *page;
> +		/*
> +		 * In atomic context we can't remap anything, so we'll only
> +		 * get the virtually contiguous buffer we need by way of a
> +		 * physically contiguous allocation.
> +		 */
> +		if (coherent) {
> +			page = alloc_pages(gfp, get_order(size));
> +			addr = page ? page_address(page) : NULL;
> +		} else {
> +			addr = __alloc_from_pool(size, &page, gfp);
> +		}
> +		if (!addr)
> +			return NULL;
> +
> +		*handle = iommu_dma_map_page(dev, page, 0, size, ioprot);
> +		if (iommu_dma_mapping_error(dev, *handle)) {
> +			if (coherent)
> +				__free_pages(page, get_order(size));
> +			else
> +				__free_from_pool(addr, size);
> +			addr = NULL;
> +		}
> +	}
> +	return addr;
> +}
> +
> +static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
> +			       dma_addr_t handle, struct dma_attrs *attrs)
> +{
> +	/*
> +	 * @cpu_addr will be one of 3 things depending on how it was allocated:
> +	 * - A remapped array of pages from iommu_dma_alloc(), for all
> +	 *   non-atomic allocations.
> +	 * - A non-cacheable alias from the atomic pool, for atomic
> +	 *   allocations by non-coherent devices.
> +	 * - A normal lowmem address, for atomic allocations by
> +	 *   coherent devices.
> +	 * Hence how dodgy the below logic looks...
> +	 */
> +	if (__in_atomic_pool(cpu_addr, size)) {
> +		iommu_dma_unmap_page(dev, handle, size, 0, NULL);
> +		__free_from_pool(cpu_addr, size);
> +	} else if (is_vmalloc_addr(cpu_addr)){
> +		struct vm_struct *area = find_vm_area(cpu_addr);
> +
> +		if (WARN_ON(!area || !area->pages))
> +			return;
> +		iommu_dma_free(dev, area->pages, size, &handle);
> +		dma_common_free_remap(cpu_addr, size, VM_USERMAP);

Hi Robin,
     We hit a WARN here when the size is not page-aligned.

    The WARN log is:
[  206.852002] WARNING: CPU: 0 PID: 23329
at /mnt/host/source/src/third_party/kernel/v3.18/mm/vmalloc.c:65
vunmap_page_range+0x190/0x1b4()
[  206.864438] Modules linked in: nls_iso8859_1 nls_cp437 vfat fat
rfcomm i2c_dev uinput dm9601 uvcvideo btmrvl_sdio mwifiex_sdio mwifiex
btmrvl bluetooth zram fuse cfg80211 nf_conntrack_ipv6 nf_defrag_ipv6
ip6table_filter ip6_tables cdc_ether usbnet mii joydev snd_seq_midi
snd_seq_midi_event snd_rawmidi snd_seq snd_seq_device ppp_async
ppp_generic slhc tun
[  206.902983] CPU: 0 PID: 23329 Comm: chrome Not tainted 3.18.0 #17
[  206.910430] Hardware name: Mediatek Oak rev3 board (DT)
[  206.920018] Call trace:
[  206.925537] [<ffffffc000208c00>] dump_backtrace+0x0/0x140
[  206.931905] [<ffffffc000208d5c>] show_stack+0x1c/0x28
[  206.939158] [<ffffffc000870f80>] dump_stack+0x74/0x94
[  206.947459] [<ffffffc0002219a4>] warn_slowpath_common+0x90/0xb8
[  206.954100] [<ffffffc000221b58>] warn_slowpath_null+0x34/0x44
[  206.961537] [<ffffffc000321358>] vunmap_page_range+0x18c/0x1b4
[  206.967630] [<ffffffc0003213e4>] unmap_kernel_range+0x2c/0x78
[  206.976977] [<ffffffc000582224>] dma_common_free_remap+0x68/0x80
[  206.983581] [<ffffffc000217260>] __iommu_free_attrs+0x14c/0x160
[  206.989646] [<ffffffc00066fc1c>] mtk_vcodec_mem_free+0xa0/0x15c
[  206.996481] [<ffffffc00067e278>] vp9_free_work_buf+0x54/0x70
[  207.002260] [<ffffffc00067f168>] vdec_vp9_deinit+0x7c/0xe8
[  207.008134] [<ffffffc0006787d8>] vdec_if_deinit+0x84/0xec
[  207.013820] [<ffffffc000677898>] mtk_vcodec_vdec_release+0x54/0x6c
[  207.020672] [<ffffffc000673e3c>] fops_vcodec_release+0x7c/0xf8
[  207.026607] [<ffffffc000652b78>] v4l2_release+0x3c/0x84
[  207.031824] [<ffffffc00033b218>] __fput+0xf8/0x1c0
[  207.036599] [<ffffffc00033b350>] ____fput+0x1c/0x2c
[  207.041454] [<ffffffc00023ed78>] task_work_run+0xb0/0xd4
[  207.046756] [<ffffffc00020872c>] do_notify_resume+0x54/0x6c


   From the log in this failing case, the size passed to the unmap here is
0x10080, the size mapped by dma_common_pages_remap in
__iommu_alloc_attrs is also 0x10080, and the corresponding DMA mapping size is
0x11000 (after iova_align). I think all the parameters of map and unmap
are good, so it doesn't look like a DMA issue, but I don't know why we get this
warning.
Have you met this problem before, and could you give us some advice? Thanks.

(If we add PAGE_ALIGN for the size in dma_alloc and dma_free, it is OK.)

> +	} else {
> +		iommu_dma_unmap_page(dev, handle, size, 0, NULL);
> +		__free_pages(virt_to_page(cpu_addr), get_order(size));
> +	}
> +}
> +
[...]
Robin Murphy Nov. 4, 2015, 1:11 p.m. UTC | #11
On 04/11/15 08:39, Yong Wu wrote:
> On Thu, 2015-10-01 at 20:13 +0100, Robin Murphy wrote:
>> Taking some inspiration from the arch/arm code, implement the
>> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
> [...]
>> +static void *__iommu_alloc_attrs(struct device *dev, size_t size,
>> +				 dma_addr_t *handle, gfp_t gfp,
>> +				 struct dma_attrs *attrs)
>> +{
>> +	bool coherent = is_device_dma_coherent(dev);
>> +	int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
>> +	void *addr;
>> +
>> +	if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
>> +		return NULL;
>> +	/*
>> +	 * Some drivers rely on this, and we probably don't want the
>> +	 * possibility of stale kernel data being read by devices anyway.
>> +	 */
>> +	gfp |= __GFP_ZERO;
>> +
>> +	if (gfp & __GFP_WAIT) {
>> +		struct page **pages;
>> +		pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
>> +
>> +		pages = iommu_dma_alloc(dev, size, gfp, ioprot,	handle,
>> +					flush_page);
>> +		if (!pages)
>> +			return NULL;
>> +
>> +		addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
>> +					      __builtin_return_address(0));
>> +		if (!addr)
>> +			iommu_dma_free(dev, pages, size, handle);
>> +	} else {
>> +		struct page *page;
>> +		/*
>> +		 * In atomic context we can't remap anything, so we'll only
>> +		 * get the virtually contiguous buffer we need by way of a
>> +		 * physically contiguous allocation.
>> +		 */
>> +		if (coherent) {
>> +			page = alloc_pages(gfp, get_order(size));
>> +			addr = page ? page_address(page) : NULL;
>> +		} else {
>> +			addr = __alloc_from_pool(size, &page, gfp);
>> +		}
>> +		if (!addr)
>> +			return NULL;
>> +
>> +		*handle = iommu_dma_map_page(dev, page, 0, size, ioprot);
>> +		if (iommu_dma_mapping_error(dev, *handle)) {
>> +			if (coherent)
>> +				__free_pages(page, get_order(size));
>> +			else
>> +				__free_from_pool(addr, size);
>> +			addr = NULL;
>> +		}
>> +	}
>> +	return addr;
>> +}
>> +
>> +static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
>> +			       dma_addr_t handle, struct dma_attrs *attrs)
>> +{
>> +	/*
>> +	 * @cpu_addr will be one of 3 things depending on how it was allocated:
>> +	 * - A remapped array of pages from iommu_dma_alloc(), for all
>> +	 *   non-atomic allocations.
>> +	 * - A non-cacheable alias from the atomic pool, for atomic
>> +	 *   allocations by non-coherent devices.
>> +	 * - A normal lowmem address, for atomic allocations by
>> +	 *   coherent devices.
>> +	 * Hence how dodgy the below logic looks...
>> +	 */
>> +	if (__in_atomic_pool(cpu_addr, size)) {
>> +		iommu_dma_unmap_page(dev, handle, size, 0, NULL);
>> +		__free_from_pool(cpu_addr, size);
>> +	} else if (is_vmalloc_addr(cpu_addr)){
>> +		struct vm_struct *area = find_vm_area(cpu_addr);
>> +
>> +		if (WARN_ON(!area || !area->pages))
>> +			return;
>> +		iommu_dma_free(dev, area->pages, size, &handle);
>> +		dma_common_free_remap(cpu_addr, size, VM_USERMAP);
>
> Hi Robin,
>      We get a WARN issue while the size is not aligned here.
>
>      The WARN log is:
> [  206.852002] WARNING: CPU: 0 PID: 23329
> at /mnt/host/source/src/third_party/kernel/v3.18/mm/vmalloc.c:65
> vunmap_page_range+0x190/0x1b4()
> [  206.864438] Modules linked in: nls_iso8859_1 nls_cp437 vfat fat
> rfcomm i2c_dev uinput dm9601 uvcvideo btmrvl_sdio mwifiex_sdio mwifiex
> btmrvl bluetooth zram fuse cfg80211 nf_conntrack_ipv6 nf_defrag_ipv6
> ip6table_filter ip6_tables cdc_ether usbnet mii joydev snd_seq_midi
> snd_seq_midi_event snd_rawmidi snd_seq snd_seq_device ppp_async
> ppp_generic slhc tun
> [  206.902983] CPU: 0 PID: 23329 Comm: chrome Not tainted 3.18.0 #17
> [  206.910430] Hardware name: Mediatek Oak rev3 board (DT)
> [  206.920018] Call trace:
> [  206.925537] [<ffffffc000208c00>] dump_backtrace+0x0/0x140
> [  206.931905] [<ffffffc000208d5c>] show_stack+0x1c/0x28
> [  206.939158] [<ffffffc000870f80>] dump_stack+0x74/0x94
> [  206.947459] [<ffffffc0002219a4>] warn_slowpath_common+0x90/0xb8
> [  206.954100] [<ffffffc000221b58>] warn_slowpath_null+0x34/0x44
> [  206.961537] [<ffffffc000321358>] vunmap_page_range+0x18c/0x1b4
> [  206.967630] [<ffffffc0003213e4>] unmap_kernel_range+0x2c/0x78
> [  206.976977] [<ffffffc000582224>] dma_common_free_remap+0x68/0x80
> [  206.983581] [<ffffffc000217260>] __iommu_free_attrs+0x14c/0x160
> [  206.989646] [<ffffffc00066fc1c>] mtk_vcodec_mem_free+0xa0/0x15c
> [  206.996481] [<ffffffc00067e278>] vp9_free_work_buf+0x54/0x70
> [  207.002260] [<ffffffc00067f168>] vdec_vp9_deinit+0x7c/0xe8
> [  207.008134] [<ffffffc0006787d8>] vdec_if_deinit+0x84/0xec
> [  207.013820] [<ffffffc000677898>] mtk_vcodec_vdec_release+0x54/0x6c
> [  207.020672] [<ffffffc000673e3c>] fops_vcodec_release+0x7c/0xf8
> [  207.026607] [<ffffffc000652b78>] v4l2_release+0x3c/0x84
> [  207.031824] [<ffffffc00033b218>] __fput+0xf8/0x1c0
> [  207.036599] [<ffffffc00033b350>] ____fput+0x1c/0x2c
> [  207.041454] [<ffffffc00023ed78>] task_work_run+0xb0/0xd4
> [  207.046756] [<ffffffc00020872c>] do_notify_resume+0x54/0x6c
>
>
>     From the log I get in this fail case, the size of unmap here is
> 0x10080, and its map size of dma_common_pages_remap in
> __iommu_alloc_attrs is 0x10080, and the corresponding dma-map size is
> 0x11000(after iova_align). I think all the parameters of map and unmap
> are good, it look like not a DMA issue. but I don't know why we get this
> warning.
> Have you met this problem and give us some advices, Thanks.
>
> (If we add PAGE_ALIGN for the size in dma_alloc and dma_free, It is OK.)

OK, having dug into this it looks like the root cause comes from some 
asymmetry in the common code: dma_common_pages_remap() just passes the 
size through to get_vm_area_caller(), and the first thing that does is 
to page-align it. On the other hand, neither dma_common_free_remap() nor 
unmap_kernel_range() does anything with the size, so we wind up giving 
an unaligned end address to vunmap_page_range() and messing up the 
vmalloc page tables.

I wonder if dma_common_free_remap() should be page-aligning the size to 
match expectations (i.e. make it correctly unmap any request the other 
functions happily mapped), or conversely, perhaps both the map and unmap 
functions should have a WARN_ON(size & ~PAGE_MASK) to enforce being
called as actually intended. Laura?
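
For concreteness, the first option might look something like the sketch
below (illustrative only; the existing body of dma_common_free_remap() is
paraphrased from drivers/base/dma-mapping.c rather than quoted exactly):

	void dma_common_free_remap(void *cpu_addr, size_t size,
				   unsigned long vm_flags)
	{
		struct vm_struct *area = find_vm_area(cpu_addr);

		/* Round up to match what get_vm_area_caller() will have mapped */
		size = PAGE_ALIGN(size);

		if (!area || (area->flags & vm_flags) != vm_flags) {
			WARN(1, "trying to free invalid coherent area: %p\n",
			     cpu_addr);
			return;
		}

		unmap_kernel_range((unsigned long)cpu_addr, size);
		vunmap(cpu_addr);
	}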

Either way, I'll send out a patch to make the arm64 side deal with it 
explicitly.

Robin.

>
>> +	} else {
>> +		iommu_dma_unmap_page(dev, handle, size, 0, NULL);
>> +		__free_pages(virt_to_page(cpu_addr), get_order(size));
>> +	}
>> +}
>> +
> [...]
>
>
Laura Abbott Nov. 4, 2015, 5:35 p.m. UTC | #12
On 11/04/2015 05:11 AM, Robin Murphy wrote:
> On 04/11/15 08:39, Yong Wu wrote:
>> On Thu, 2015-10-01 at 20:13 +0100, Robin Murphy wrote:
>>> Taking some inspiration from the arch/arm code, implement the
>>> arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.
>> [...]
>>> +static void *__iommu_alloc_attrs(struct device *dev, size_t size,
>>> +                 dma_addr_t *handle, gfp_t gfp,
>>> +                 struct dma_attrs *attrs)
>>> +{
>>> +    bool coherent = is_device_dma_coherent(dev);
>>> +    int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
>>> +    void *addr;
>>> +
>>> +    if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
>>> +        return NULL;
>>> +    /*
>>> +     * Some drivers rely on this, and we probably don't want the
>>> +     * possibility of stale kernel data being read by devices anyway.
>>> +     */
>>> +    gfp |= __GFP_ZERO;
>>> +
>>> +    if (gfp & __GFP_WAIT) {
>>> +        struct page **pages;
>>> +        pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
>>> +
>>> +        pages = iommu_dma_alloc(dev, size, gfp, ioprot,    handle,
>>> +                    flush_page);
>>> +        if (!pages)
>>> +            return NULL;
>>> +
>>> +        addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
>>> +                          __builtin_return_address(0));
>>> +        if (!addr)
>>> +            iommu_dma_free(dev, pages, size, handle);
>>> +    } else {
>>> +        struct page *page;
>>> +        /*
>>> +         * In atomic context we can't remap anything, so we'll only
>>> +         * get the virtually contiguous buffer we need by way of a
>>> +         * physically contiguous allocation.
>>> +         */
>>> +        if (coherent) {
>>> +            page = alloc_pages(gfp, get_order(size));
>>> +            addr = page ? page_address(page) : NULL;
>>> +        } else {
>>> +            addr = __alloc_from_pool(size, &page, gfp);
>>> +        }
>>> +        if (!addr)
>>> +            return NULL;
>>> +
>>> +        *handle = iommu_dma_map_page(dev, page, 0, size, ioprot);
>>> +        if (iommu_dma_mapping_error(dev, *handle)) {
>>> +            if (coherent)
>>> +                __free_pages(page, get_order(size));
>>> +            else
>>> +                __free_from_pool(addr, size);
>>> +            addr = NULL;
>>> +        }
>>> +    }
>>> +    return addr;
>>> +}
>>> +
>>> +static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
>>> +                   dma_addr_t handle, struct dma_attrs *attrs)
>>> +{
>>> +    /*
>>> +     * @cpu_addr will be one of 3 things depending on how it was allocated:
>>> +     * - A remapped array of pages from iommu_dma_alloc(), for all
>>> +     *   non-atomic allocations.
>>> +     * - A non-cacheable alias from the atomic pool, for atomic
>>> +     *   allocations by non-coherent devices.
>>> +     * - A normal lowmem address, for atomic allocations by
>>> +     *   coherent devices.
>>> +     * Hence how dodgy the below logic looks...
>>> +     */
>>> +    if (__in_atomic_pool(cpu_addr, size)) {
>>> +        iommu_dma_unmap_page(dev, handle, size, 0, NULL);
>>> +        __free_from_pool(cpu_addr, size);
>>> +    } else if (is_vmalloc_addr(cpu_addr)) {
>>> +        struct vm_struct *area = find_vm_area(cpu_addr);
>>> +
>>> +        if (WARN_ON(!area || !area->pages))
>>> +            return;
>>> +        iommu_dma_free(dev, area->pages, size, &handle);
>>> +        dma_common_free_remap(cpu_addr, size, VM_USERMAP);
>>
>> Hi Robin,
>>      We get a WARN here when the size is not page-aligned.
>>
>>      The WARN log is:
>> [  206.852002] WARNING: CPU: 0 PID: 23329
>> at /mnt/host/source/src/third_party/kernel/v3.18/mm/vmalloc.c:65
>> vunmap_page_range+0x190/0x1b4()
>> [  206.864438] Modules linked in: nls_iso8859_1 nls_cp437 vfat fat
>> rfcomm i2c_dev uinput dm9601 uvcvideo btmrvl_sdio mwifiex_sdio mwifiex
>> btmrvl bluetooth zram fuse cfg80211 nf_conntrack_ipv6 nf_defrag_ipv6
>> ip6table_filter ip6_tables cdc_ether usbnet mii joydev snd_seq_midi
>> snd_seq_midi_event snd_rawmidi snd_seq snd_seq_device ppp_async
>> ppp_generic slhc tun
>> [  206.902983] CPU: 0 PID: 23329 Comm: chrome Not tainted 3.18.0 #17
>> [  206.910430] Hardware name: Mediatek Oak rev3 board (DT)
>> [  206.920018] Call trace:
>> [  206.925537] [<ffffffc000208c00>] dump_backtrace+0x0/0x140
>> [  206.931905] [<ffffffc000208d5c>] show_stack+0x1c/0x28
>> [  206.939158] [<ffffffc000870f80>] dump_stack+0x74/0x94
>> [  206.947459] [<ffffffc0002219a4>] warn_slowpath_common+0x90/0xb8
>> [  206.954100] [<ffffffc000221b58>] warn_slowpath_null+0x34/0x44
>> [  206.961537] [<ffffffc000321358>] vunmap_page_range+0x18c/0x1b4
>> [  206.967630] [<ffffffc0003213e4>] unmap_kernel_range+0x2c/0x78
>> [  206.976977] [<ffffffc000582224>] dma_common_free_remap+0x68/0x80
>> [  206.983581] [<ffffffc000217260>] __iommu_free_attrs+0x14c/0x160
>> [  206.989646] [<ffffffc00066fc1c>] mtk_vcodec_mem_free+0xa0/0x15c
>> [  206.996481] [<ffffffc00067e278>] vp9_free_work_buf+0x54/0x70
>> [  207.002260] [<ffffffc00067f168>] vdec_vp9_deinit+0x7c/0xe8
>> [  207.008134] [<ffffffc0006787d8>] vdec_if_deinit+0x84/0xec
>> [  207.013820] [<ffffffc000677898>] mtk_vcodec_vdec_release+0x54/0x6c
>> [  207.020672] [<ffffffc000673e3c>] fops_vcodec_release+0x7c/0xf8
>> [  207.026607] [<ffffffc000652b78>] v4l2_release+0x3c/0x84
>> [  207.031824] [<ffffffc00033b218>] __fput+0xf8/0x1c0
>> [  207.036599] [<ffffffc00033b350>] ____fput+0x1c/0x2c
>> [  207.041454] [<ffffffc00023ed78>] task_work_run+0xb0/0xd4
>> [  207.046756] [<ffffffc00020872c>] do_notify_resume+0x54/0x6c
>>
>>
>>     From the log in this failing case, the unmap size here is 0x10080,
>> the map size passed to dma_common_pages_remap in __iommu_alloc_attrs is
>> also 0x10080, and the corresponding DMA map size is 0x11000 (after
>> iova_align). All the map and unmap parameters look consistent, so it
>> does not seem to be a DMA issue, but I don't know why we get this
>> warning.
>> Have you met this problem before, and could you give us some advice? Thanks.
>>
>> (If we PAGE_ALIGN the size in dma_alloc and dma_free, it is OK.)
>
> OK, having dug into this, it looks like the root cause comes from some asymmetry in the common code: dma_common_pages_remap() just passes the size through to get_vm_area_caller(), and the first thing that does is to page-align it. On the other hand, neither dma_common_free_remap() nor unmap_kernel_range() does anything with the size, so we wind up giving an unaligned end address to vunmap_page_range() and messing up the vmalloc page tables.
>
> I wonder if dma_common_free_remap() should be page-aligning the size to match expectations (i.e. make it correctly unmap any request the other functions happily mapped), or conversely, perhaps both the map and unmap functions should have a WARN_ON(size & ~PAGE_MASK) to enforce being called as actually intended. Laura?
>

Based on what I've found, the DMA mapping API needs to be able to handle unaligned sizes
gracefully, so I don't think a WARN is appropriate. I was aligning at the higher level, but
it would be best for dma_common_free_remap() to align as well.
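
A sketch of what that higher-level alignment could look like on the arm64
free path, keeping the caller's size for the IOMMU side (the iommu-dma
layer already rounds the IOVA size itself) and a page-aligned copy for the
CPU-side remap and free; this is only an illustration, not the actual
follow-up patch:

	static void __iommu_free_attrs(struct device *dev, size_t size,
				       void *cpu_addr, dma_addr_t handle,
				       struct dma_attrs *attrs)
	{
		size_t iosize = size;		/* size used for the device mapping */

		size = PAGE_ALIGN(size);	/* size used for the CPU-side remap */

		if (__in_atomic_pool(cpu_addr, size)) {
			iommu_dma_unmap_page(dev, handle, iosize, 0, NULL);
			__free_from_pool(cpu_addr, size);
		} else if (is_vmalloc_addr(cpu_addr)) {
			struct vm_struct *area = find_vm_area(cpu_addr);

			if (WARN_ON(!area || !area->pages))
				return;
			iommu_dma_free(dev, area->pages, iosize, &handle);
			dma_common_free_remap(cpu_addr, size, VM_USERMAP);
		} else {
			iommu_dma_unmap_page(dev, handle, iosize, 0, NULL);
			__free_pages(virt_to_page(cpu_addr), get_order(size));
		}
	}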
  
> Either way, I'll send out a patch to make the arm64 side deal with it explicitly.
>
> Robin.
>

Thanks,
Laura
diff mbox

Patch

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 0bcc4bc..dd2d6e6 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -533,3 +533,438 @@  static int __init dma_debug_do_init(void)
 	return 0;
 }
 fs_initcall(dma_debug_do_init);
+
+
+#ifdef CONFIG_IOMMU_DMA
+#include <linux/dma-iommu.h>
+#include <linux/platform_device.h>
+#include <linux/amba/bus.h>
+
+/* Thankfully, all cache ops are by VA so we can ignore phys here */
+static void flush_page(struct device *dev, const void *virt, phys_addr_t phys)
+{
+	__dma_flush_range(virt, virt + PAGE_SIZE);
+}
+
+static void *__iommu_alloc_attrs(struct device *dev, size_t size,
+				 dma_addr_t *handle, gfp_t gfp,
+				 struct dma_attrs *attrs)
+{
+	bool coherent = is_device_dma_coherent(dev);
+	int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
+	void *addr;
+
+	if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
+		return NULL;
+	/*
+	 * Some drivers rely on this, and we probably don't want the
+	 * possibility of stale kernel data being read by devices anyway.
+	 */
+	gfp |= __GFP_ZERO;
+
+	if (gfp & __GFP_WAIT) {
+		struct page **pages;
+		pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
+
+		pages = iommu_dma_alloc(dev, size, gfp, ioprot,	handle,
+					flush_page);
+		if (!pages)
+			return NULL;
+
+		addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
+					      __builtin_return_address(0));
+		if (!addr)
+			iommu_dma_free(dev, pages, size, handle);
+	} else {
+		struct page *page;
+		/*
+		 * In atomic context we can't remap anything, so we'll only
+		 * get the virtually contiguous buffer we need by way of a
+		 * physically contiguous allocation.
+		 */
+		if (coherent) {
+			page = alloc_pages(gfp, get_order(size));
+			addr = page ? page_address(page) : NULL;
+		} else {
+			addr = __alloc_from_pool(size, &page, gfp);
+		}
+		if (!addr)
+			return NULL;
+
+		*handle = iommu_dma_map_page(dev, page, 0, size, ioprot);
+		if (iommu_dma_mapping_error(dev, *handle)) {
+			if (coherent)
+				__free_pages(page, get_order(size));
+			else
+				__free_from_pool(addr, size);
+			addr = NULL;
+		}
+	}
+	return addr;
+}
+
+static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+			       dma_addr_t handle, struct dma_attrs *attrs)
+{
+	/*
+	 * @cpu_addr will be one of 3 things depending on how it was allocated:
+	 * - A remapped array of pages from iommu_dma_alloc(), for all
+	 *   non-atomic allocations.
+	 * - A non-cacheable alias from the atomic pool, for atomic
+	 *   allocations by non-coherent devices.
+	 * - A normal lowmem address, for atomic allocations by
+	 *   coherent devices.
+	 * Hence how dodgy the below logic looks...
+	 */
+	if (__in_atomic_pool(cpu_addr, size)) {
+		iommu_dma_unmap_page(dev, handle, size, 0, NULL);
+		__free_from_pool(cpu_addr, size);
+	} else if (is_vmalloc_addr(cpu_addr)) {
+		struct vm_struct *area = find_vm_area(cpu_addr);
+
+		if (WARN_ON(!area || !area->pages))
+			return;
+		iommu_dma_free(dev, area->pages, size, &handle);
+		dma_common_free_remap(cpu_addr, size, VM_USERMAP);
+	} else {
+		iommu_dma_unmap_page(dev, handle, size, 0, NULL);
+		__free_pages(virt_to_page(cpu_addr), get_order(size));
+	}
+}
+
+static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
+			      void *cpu_addr, dma_addr_t dma_addr, size_t size,
+			      struct dma_attrs *attrs)
+{
+	struct vm_struct *area;
+	int ret;
+
+	vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot,
+					     is_device_dma_coherent(dev));
+
+	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
+		return ret;
+
+	area = find_vm_area(cpu_addr);
+	if (WARN_ON(!area || !area->pages))
+		return -ENXIO;
+
+	return iommu_dma_mmap(area->pages, size, vma);
+}
+
+static int __iommu_get_sgtable(struct device *dev, struct sg_table *sgt,
+			       void *cpu_addr, dma_addr_t dma_addr,
+			       size_t size, struct dma_attrs *attrs)
+{
+	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	struct vm_struct *area = find_vm_area(cpu_addr);
+
+	if (WARN_ON(!area || !area->pages))
+		return -ENXIO;
+
+	return sg_alloc_table_from_pages(sgt, area->pages, count, 0, size,
+					 GFP_KERNEL);
+}
+
+static void __iommu_sync_single_for_cpu(struct device *dev,
+					dma_addr_t dev_addr, size_t size,
+					enum dma_data_direction dir)
+{
+	phys_addr_t phys;
+
+	if (is_device_dma_coherent(dev))
+		return;
+
+	phys = iommu_iova_to_phys(iommu_get_domain_for_dev(dev), dev_addr);
+	__dma_unmap_area(phys_to_virt(phys), size, dir);
+}
+
+static void __iommu_sync_single_for_device(struct device *dev,
+					   dma_addr_t dev_addr, size_t size,
+					   enum dma_data_direction dir)
+{
+	phys_addr_t phys;
+
+	if (is_device_dma_coherent(dev))
+		return;
+
+	phys = iommu_iova_to_phys(iommu_get_domain_for_dev(dev), dev_addr);
+	__dma_map_area(phys_to_virt(phys), size, dir);
+}
+
+static dma_addr_t __iommu_map_page(struct device *dev, struct page *page,
+				   unsigned long offset, size_t size,
+				   enum dma_data_direction dir,
+				   struct dma_attrs *attrs)
+{
+	bool coherent = is_device_dma_coherent(dev);
+	int prot = dma_direction_to_prot(dir, coherent);
+	dma_addr_t dev_addr = iommu_dma_map_page(dev, page, offset, size, prot);
+
+	if (!iommu_dma_mapping_error(dev, dev_addr) &&
+	    !dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
+		__iommu_sync_single_for_device(dev, dev_addr, size, dir);
+
+	return dev_addr;
+}
+
+static void __iommu_unmap_page(struct device *dev, dma_addr_t dev_addr,
+			       size_t size, enum dma_data_direction dir,
+			       struct dma_attrs *attrs)
+{
+	if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
+		__iommu_sync_single_for_cpu(dev, dev_addr, size, dir);
+
+	iommu_dma_unmap_page(dev, dev_addr, size, dir, attrs);
+}
+
+static void __iommu_sync_sg_for_cpu(struct device *dev,
+				    struct scatterlist *sgl, int nelems,
+				    enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	if (is_device_dma_coherent(dev))
+		return;
+
+	for_each_sg(sgl, sg, nelems, i)
+		__dma_unmap_area(sg_virt(sg), sg->length, dir);
+}
+
+static void __iommu_sync_sg_for_device(struct device *dev,
+				       struct scatterlist *sgl, int nelems,
+				       enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	if (is_device_dma_coherent(dev))
+		return;
+
+	for_each_sg(sgl, sg, nelems, i)
+		__dma_map_area(sg_virt(sg), sg->length, dir);
+}
+
+static int __iommu_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
+				int nelems, enum dma_data_direction dir,
+				struct dma_attrs *attrs)
+{
+	bool coherent = is_device_dma_coherent(dev);
+
+	if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
+		__iommu_sync_sg_for_device(dev, sgl, nelems, dir);
+
+	return iommu_dma_map_sg(dev, sgl, nelems,
+			dma_direction_to_prot(dir, coherent));
+}
+
+static void __iommu_unmap_sg_attrs(struct device *dev,
+				   struct scatterlist *sgl, int nelems,
+				   enum dma_data_direction dir,
+				   struct dma_attrs *attrs)
+{
+	if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
+		__iommu_sync_sg_for_cpu(dev, sgl, nelems, dir);
+
+	iommu_dma_unmap_sg(dev, sgl, nelems, dir, attrs);
+}
+
+static struct dma_map_ops iommu_dma_ops = {
+	.alloc = __iommu_alloc_attrs,
+	.free = __iommu_free_attrs,
+	.mmap = __iommu_mmap_attrs,
+	.get_sgtable = __iommu_get_sgtable,
+	.map_page = __iommu_map_page,
+	.unmap_page = __iommu_unmap_page,
+	.map_sg = __iommu_map_sg_attrs,
+	.unmap_sg = __iommu_unmap_sg_attrs,
+	.sync_single_for_cpu = __iommu_sync_single_for_cpu,
+	.sync_single_for_device = __iommu_sync_single_for_device,
+	.sync_sg_for_cpu = __iommu_sync_sg_for_cpu,
+	.sync_sg_for_device = __iommu_sync_sg_for_device,
+	.dma_supported = iommu_dma_supported,
+	.mapping_error = iommu_dma_mapping_error,
+};
+
+/*
+ * TODO: Right now __iommu_setup_dma_ops() gets called too early to do
+ * everything it needs to - the device is only partially created and the
+ * IOMMU driver hasn't seen it yet, so it can't have a group. Thus we
+ * need this delayed attachment dance. Once IOMMU probe ordering is sorted
+ * to move the arch_setup_dma_ops() call later, all the notifier bits below
+ * become unnecessary, and will go away.
+ */
+struct iommu_dma_notifier_data {
+	struct list_head list;
+	struct device *dev;
+	const struct iommu_ops *ops;
+	u64 dma_base;
+	u64 size;
+};
+static LIST_HEAD(iommu_dma_masters);
+static DEFINE_MUTEX(iommu_dma_notifier_lock);
+
+/*
+ * Temporarily "borrow" a domain feature flag to tell if we had to resort
+ * to creating our own domain here, in case we need to clean it up again.
+ */
+#define __IOMMU_DOMAIN_FAKE_DEFAULT		(1U << 31)
+
+static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops,
+			   u64 dma_base, u64 size)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	/*
+	 * Best case: The device is either part of a group which was
+	 * already attached to a domain in a previous call, or it's
+	 * been put in a default DMA domain by the IOMMU core.
+	 */
+	if (!domain) {
+		/*
+		 * Urgh. The IOMMU core isn't going to do default domains
+		 * for non-PCI devices anyway, until it has some means of
+		 * abstracting the entirely implementation-specific
+		 * sideband data/SoC topology/unicorn dust that may or
+		 * may not differentiate upstream masters.
+		 * So until then, HORRIBLE HACKS!
+		 */
+		domain = ops->domain_alloc(IOMMU_DOMAIN_DMA);
+		if (!domain)
+			goto out_no_domain;
+
+		domain->ops = ops;
+		domain->type = IOMMU_DOMAIN_DMA | __IOMMU_DOMAIN_FAKE_DEFAULT;
+
+		if (iommu_attach_device(domain, dev))
+			goto out_put_domain;
+	}
+
+	if (iommu_dma_init_domain(domain, dma_base, size))
+		goto out_detach;
+
+	dev->archdata.dma_ops = &iommu_dma_ops;
+	return true;
+
+out_detach:
+	iommu_detach_device(domain, dev);
+out_put_domain:
+	if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT)
+		iommu_domain_free(domain);
+out_no_domain:
+	pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
+		dev_name(dev));
+	return false;
+}
+
+static void queue_iommu_attach(struct device *dev, const struct iommu_ops *ops,
+			      u64 dma_base, u64 size)
+{
+	struct iommu_dma_notifier_data *iommudata;
+
+	iommudata = kzalloc(sizeof(*iommudata), GFP_KERNEL);
+	if (!iommudata)
+		return;
+
+	iommudata->dev = dev;
+	iommudata->ops = ops;
+	iommudata->dma_base = dma_base;
+	iommudata->size = size;
+
+	mutex_lock(&iommu_dma_notifier_lock);
+	list_add(&iommudata->list, &iommu_dma_masters);
+	mutex_unlock(&iommu_dma_notifier_lock);
+}
+
+static int __iommu_attach_notifier(struct notifier_block *nb,
+				   unsigned long action, void *data)
+{
+	struct iommu_dma_notifier_data *master, *tmp;
+
+	if (action != BUS_NOTIFY_ADD_DEVICE)
+		return 0;
+
+	mutex_lock(&iommu_dma_notifier_lock);
+	list_for_each_entry_safe(master, tmp, &iommu_dma_masters, list) {
+		if (do_iommu_attach(master->dev, master->ops,
+				master->dma_base, master->size)) {
+			list_del(&master->list);
+			kfree(master);
+		}
+	}
+	mutex_unlock(&iommu_dma_notifier_lock);
+	return 0;
+}
+
+static int register_iommu_dma_ops_notifier(struct bus_type *bus)
+{
+	struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL);
+	int ret;
+
+	if (!nb)
+		return -ENOMEM;
+	/*
+	 * The device must be attached to a domain before the driver probe
+	 * routine gets a chance to start allocating DMA buffers. However,
+	 * the IOMMU driver also needs a chance to configure the iommu_group
+	 * via its add_device callback first, so we need to make the attach
+	 * happen between those two points. Since the IOMMU core uses a bus
+	 * notifier with default priority for add_device, do the same but
+	 * with a lower priority to ensure the appropriate ordering.
+	 */
+	nb->notifier_call = __iommu_attach_notifier;
+	nb->priority = -100;
+
+	ret = bus_register_notifier(bus, nb);
+	if (ret) {
+		pr_warn("Failed to register DMA domain notifier; IOMMU DMA ops unavailable on bus '%s'\n",
+			bus->name);
+		kfree(nb);
+	}
+	return ret;
+}
+
+static int __init __iommu_dma_init(void)
+{
+	int ret;
+
+	ret = iommu_dma_init();
+	if (!ret)
+		ret = register_iommu_dma_ops_notifier(&platform_bus_type);
+	if (!ret)
+		ret = register_iommu_dma_ops_notifier(&amba_bustype);
+	return ret;
+}
+arch_initcall(__iommu_dma_init);
+
+static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
+				  const struct iommu_ops *ops)
+{
+	struct iommu_group *group;
+
+	if (!ops)
+		return;
+	/*
+	 * TODO: As a concession to the future, we're ready to handle being
+	 * called both early and late (i.e. after bus_add_device). Once all
+	 * the platform bus code is reworked to call us late and the notifier
+	 * junk above goes away, move the body of do_iommu_attach here.
+	 */
+	group = iommu_group_get(dev);
+	if (group) {
+		do_iommu_attach(dev, ops, dma_base, size);
+		iommu_group_put(group);
+	} else {
+		queue_iommu_attach(dev, ops, dma_base, size);
+	}
+}
+
+#else
+
+static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
+				  struct iommu_ops *iommu)
+{ }
+
+#endif  /* CONFIG_IOMMU_DMA */
+
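
Once a device has picked up these iommu_dma_ops, drivers consume them
through the generic DMA API; nothing IOMMU-specific leaks into the driver.
A minimal, purely hypothetical example (the device pointer and buffer size
below are illustrative and not taken from the patch):

	#include <linux/dma-mapping.h>
	#include <linux/sizes.h>

	static int example_dma_roundtrip(struct device *dev)
	{
		size_t size = SZ_64K;	/* page-aligned, avoiding the corner case above */
		dma_addr_t iova;
		void *vaddr;

		/* Lands in __iommu_alloc_attrs(): pages + IOVA mapping + remap */
		vaddr = dma_alloc_coherent(dev, size, &iova, GFP_KERNEL);
		if (!vaddr)
			return -ENOMEM;

		/* ... program the device with 'iova', access the buffer via 'vaddr' ... */

		/* Lands in __iommu_free_attrs(): unmap the IOVA and free the pages */
		dma_free_coherent(dev, size, vaddr, iova);
		return 0;
	}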