diff mbox

iommu/arm-smmu: Defer TLB flush in case of unmap op

Message ID 1501667598-16404-1-git-send-email-vivek.gautam@codeaurora.org (mailing list archive)
State New, archived
Headers show

Commit Message

Vivek Gautam Aug. 2, 2017, 9:53 a.m. UTC
We don't want to touch the TLB when smmu is suspended.
Defer it until resume.

Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
---

Hi all,

Here's the small patch in response to the suggestion to defer TLB operations
when smmu is in suspend state.
The patch stores the TLB requests in 'unmap' when the smmu device is
suspended. On resume, it checks all the pending TLB requests, and
performs the unmap over those.

Right now, I have applied the patch on top of the pm runtime series.
Let me know what you think of the change. It will also be helpful if
somebody can please test a valid use case with this.

regards
Vivek

 drivers/iommu/arm-smmu.c | 59 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 6 deletions(-)

Comments

Robin Murphy Aug. 2, 2017, 12:17 p.m. UTC | #1
On 02/08/17 10:53, Vivek Gautam wrote:
> We don't want to touch the TLB when smmu is suspended.
> Defer it until resume.
> 
> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
> ---
> 
> Hi all,
> 
> Here's the small patch in response of suggestion to defer tlb operations
> when smmu is in suspend state.
> The patch stores the TLB requests in 'unmap' when the smmu device is
> suspended. On resume, it checks all the pending TLB requests, and
> performs the unmap over those.
> 
> Right now, I have applied the patch on top of the pm runtime series.
> Let me know what you think of the change. It will also be helpful if
> somebody can please test a valid use case with this.

The patch itself doesn't make much sense to me, but more crucially it's
definitely broken in concept. We can't return from arm_smmu_unmap()
without having actually unmapped anything, because that leaves the page
tables out of sync with what the caller expects - they may immediately
reuse that IOVA to map something else for a different device and hit an
unexpected failure from io-pgtable when the PTE turns out to be non-empty.

However, if in general suspend *might* power-gate any part of the SMMU,
then I don't think we have any guarantee of what state any TLBs could be
in upon resume. Therefore any individual invalidations we skip while
suspended are probably moot, since resume would almost certainly have to
invalidate everything to get back to a safe state anyway.

Conversely though, the situation that still concerns me is whether this
can work at all for a distributed SMMU if things *don't* lose state. Say
the GPU and its local TBU are in the same clock domain - if the GPU has
just gone idle and we've clock-gated it, but "the SMMU" (i.e. the TCU)
is still active servicing other devices, we will assume we can happily
unmap GPU buffers and issue TLBIs, but what happens with entries held in
the unclocked TBU's micro-TLB?

Robin.

> 
> regards
> Vivek
> 
>  drivers/iommu/arm-smmu.c | 59 +++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 53 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index fe8e7fd61282..1f9c2b16aabb 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -51,6 +51,7 @@
>  #include <linux/pm_runtime.h>
>  #include <linux/slab.h>
>  #include <linux/spinlock.h>
> +#include <linux/list.h>
>  
>  #include <linux/amba/bus.h>
>  
> @@ -151,6 +152,14 @@ struct arm_smmu_master_cfg {
>  #define for_each_cfg_sme(fw, i, idx) \
>  	for (i = 0; idx = fwspec_smendx(fw, i), i < fw->num_ids; ++i)
>  
> +struct arm_smmu_tlb_req_info {
> +	struct iommu_domain *domain;
> +	unsigned long iova;
> +	size_t size;
> +	bool tlb_flush_pending;
> +	struct list_head list;
> +};
> +
>  struct arm_smmu_device {
>  	struct device			*dev;
>  
> @@ -182,6 +191,7 @@ struct arm_smmu_device {
>  	u32				num_s2_context_banks;
>  	DECLARE_BITMAP(context_map, ARM_SMMU_MAX_CBS);
>  	atomic_t			irptndx;
> +	struct list_head		domain_list;
>  
>  	u32				num_mapping_groups;
>  	u16				streamid_mask;
> @@ -1239,17 +1249,32 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>  			     size_t size)
>  {
>  	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> +	struct arm_smmu_device *smmu = smmu_domain->smmu;
>  	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
> -	size_t ret;
> +	struct arm_smmu_tlb_req_info *tlb_info;
>  
>  	if (!ops)
>  		return 0;
>  
> -	pm_runtime_get_sync(smmu_domain->smmu->dev);
> -	ret = ops->unmap(ops, iova, size);
> -	pm_runtime_put_sync(smmu_domain->smmu->dev);
> +	/* if the device is suspended; we can't unmap, defer any tlb operations */
> +	if (pm_runtime_suspended(smmu->dev)) {
> +		tlb_info = devm_kzalloc(smmu->dev, sizeof(*tlb_info), GFP_ATOMIC);
> +		if (!tlb_info)
> +			return -ENOMEM;
>  
> -	return ret;
> +		tlb_info->domain = domain;
> +		tlb_info->iova = iova;
> +		tlb_info->size = size;
> +		tlb_info->tlb_flush_pending = true;
> +		INIT_LIST_HEAD(&tlb_info->list);
> +
> +		/* XXX: We need locks here, but that again introduce the slowpath ? */
> +		list_add_tail(&tlb_info->list, &smmu->domain_list);
> +
> +		return size;
> +	}
> +
> +	return ops->unmap(ops, iova, size);
>  }
>  
>  static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
> @@ -2166,6 +2191,8 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
>  		smmu->irqs[i] = irq;
>  	}
>  
> +	INIT_LIST_HEAD(&smmu->domain_list);
> +
>  	err = arm_smmu_init_clocks(smmu);
>  	if (err)
>  		return err;
> @@ -2268,8 +2295,28 @@ static int arm_smmu_device_remove(struct platform_device *pdev)
>  static int arm_smmu_resume(struct device *dev)
>  {
>  	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
> +	struct arm_smmu_tlb_req_info  *tlb_info, *temp;
> +	int ret;
> +
> +	ret = arm_smmu_enable_clocks(smmu);
> +	if (ret)
> +		return ret;
> +
> +	list_for_each_entry_safe(tlb_info, temp, &smmu->domain_list, list) {
> +		printk("\n\n %s %d :: iterating over pending tlb request\n\n", __func__, __LINE__);
> +		if (tlb_info->tlb_flush_pending) {
> +			ret = arm_smmu_unmap(tlb_info->domain, tlb_info->iova, tlb_info->size);
> +			if (!ret)
> +				return -EINVAL;
>  
> -	return arm_smmu_enable_clocks(smmu);
> +			tlb_info->tlb_flush_pending = false;
> +
> +			/* we are done with this request; delete it */
> +			list_del(&tlb_info->list);
> +		}
> +	}
> +
> +	return 0;
>  }
>  
>  static int arm_smmu_suspend(struct device *dev)
>
Vivek Gautam Aug. 3, 2017, 5:35 a.m. UTC | #2
Hi Robin,



On 08/02/2017 05:47 PM, Robin Murphy wrote:
> On 02/08/17 10:53, Vivek Gautam wrote:
>> We don't want to touch the TLB when smmu is suspended.
>> Defer it until resume.
>>
>> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
>> ---
>>
>> Hi all,
>>
>> Here's the small patch in response of suggestion to defer tlb operations
>> when smmu is in suspend state.
>> The patch stores the TLB requests in 'unmap' when the smmu device is
>> suspended. On resume, it checks all the pending TLB requests, and
>> performs the unmap over those.
>>
>> Right now, I have applied the patch on top of the pm runtime series.
>> Let me know what you think of the change. It will also be helpful if
>> somebody can please test a valid use case with this.
> The patch itself doesn't make much sense to me, but more crucially it's
> definitely broken in concept. We can't return from arm_smmu_unmap()
> without having actually unmapped anything, because that leaves the page
> tables out of sync with what the caller expects - they may immediately
> reuse that IOVA to map something else for a different device and hit an
> unexpected failure from io-pgtable when the PTE turns out to be non-empty.

To understand things a bit more,
once we don't *unmap* in arm_smmu_unmap(), and leave the TLBs as is,
the next mapping can happen only with the *knowledge* of smmu, i.e.,
smmu should be active at that time.
If that's true then, the _runtime()_resume() method will take care of
invalidating the TLBs when we call arm_smmu_unmap() from _runtime_resume().
Is my understanding correct here?

>
> However, if in general suspend *might* power-gate any part of the SMMU,
> then I don't think we have any guarantee of what state any TLBs could be
> in upon resume. Therefore any individual invalidations we skip while
> suspended are probably moot, since resume would almost certainly have to
> invalidate everything to get back to a safe state anyway.

Right, in case when the suspend power-gates the SMMU, then
the TLB context is lost anyways. So resume path can freshly start.
This is something that exynos does at present.

>
> Conversely though, the situation that still concerns me is whether this
> can work at all for a distributed SMMU if things *don't* lose state. Say
> the GPU and its local TBU are in the same clock domain - if the GPU has
> just gone idle and we've clock-gated it, but "the SMMU" (i.e. the TCU)
> is still active servicing other devices, we will assume we can happily
> unmap GPU buffers and issue TLBIs, but what happens with entries held in
> the unclocked TBU's micro-TLB?

We know of platforms we have that have shared TCU and multiple TBUs.
Each TBU is available in its own power domain, not in master's power domain.
In such cases we may want to runtime_get() the TBUs, so that an unmap()
call issued while the master's clock is gated can still get through.

Can we have a situation where the TBU and master are in the same power
domain, and the unmap is called when the master is not runtime active?
How will such a situation be handled?

Best regards
Vivek

>
> Robin.
>
>> regards
>> Vivek
>>
>>   drivers/iommu/arm-smmu.c | 59 +++++++++++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 53 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>> index fe8e7fd61282..1f9c2b16aabb 100644
>> --- a/drivers/iommu/arm-smmu.c
>> +++ b/drivers/iommu/arm-smmu.c
>> @@ -51,6 +51,7 @@
>>   #include <linux/pm_runtime.h>
>>   #include <linux/slab.h>
>>   #include <linux/spinlock.h>
>> +#include <linux/list.h>
>>   
>>   #include <linux/amba/bus.h>
>>   
>> @@ -151,6 +152,14 @@ struct arm_smmu_master_cfg {
>>   #define for_each_cfg_sme(fw, i, idx) \
>>   	for (i = 0; idx = fwspec_smendx(fw, i), i < fw->num_ids; ++i)
>>   
>> +struct arm_smmu_tlb_req_info {
>> +	struct iommu_domain *domain;
>> +	unsigned long iova;
>> +	size_t size;
>> +	bool tlb_flush_pending;
>> +	struct list_head list;
>> +};
>> +
>>   struct arm_smmu_device {
>>   	struct device			*dev;
>>   
>> @@ -182,6 +191,7 @@ struct arm_smmu_device {
>>   	u32				num_s2_context_banks;
>>   	DECLARE_BITMAP(context_map, ARM_SMMU_MAX_CBS);
>>   	atomic_t			irptndx;
>> +	struct list_head		domain_list;
>>   
>>   	u32				num_mapping_groups;
>>   	u16				streamid_mask;
>> @@ -1239,17 +1249,32 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>   			     size_t size)
>>   {
>>   	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>> +	struct arm_smmu_device *smmu = smmu_domain->smmu;
>>   	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>> -	size_t ret;
>> +	struct arm_smmu_tlb_req_info *tlb_info;
>>   
>>   	if (!ops)
>>   		return 0;
>>   
>> -	pm_runtime_get_sync(smmu_domain->smmu->dev);
>> -	ret = ops->unmap(ops, iova, size);
>> -	pm_runtime_put_sync(smmu_domain->smmu->dev);
>> +	/* if the device is suspended; we can't unmap, defer any tlb operations */
>> +	if (pm_runtime_suspended(smmu->dev)) {
>> +		tlb_info = devm_kzalloc(smmu->dev, sizeof(*tlb_info), GFP_ATOMIC);
>> +		if (!tlb_info)
>> +			return -ENOMEM;
>>   
>> -	return ret;
>> +		tlb_info->domain = domain;
>> +		tlb_info->iova = iova;
>> +		tlb_info->size = size;
>> +		tlb_info->tlb_flush_pending = true;
>> +		INIT_LIST_HEAD(&tlb_info->list);
>> +
>> +		/* XXX: We need locks here, but that again introduce the slowpath ? */
>> +		list_add_tail(&tlb_info->list, &smmu->domain_list);
>> +
>> +		return size;
>> +	}
>> +
>> +	return ops->unmap(ops, iova, size);
>>   }
>>   
>>   static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
>> @@ -2166,6 +2191,8 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
>>   		smmu->irqs[i] = irq;
>>   	}
>>   
>> +	INIT_LIST_HEAD(&smmu->domain_list);
>> +
>>   	err = arm_smmu_init_clocks(smmu);
>>   	if (err)
>>   		return err;
>> @@ -2268,8 +2295,28 @@ static int arm_smmu_device_remove(struct platform_device *pdev)
>>   static int arm_smmu_resume(struct device *dev)
>>   {
>>   	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
>> +	struct arm_smmu_tlb_req_info  *tlb_info, *temp;
>> +	int ret;
>> +
>> +	ret = arm_smmu_enable_clocks(smmu);
>> +	if (ret)
>> +		return ret;
>> +
>> +	list_for_each_entry_safe(tlb_info, temp, &smmu->domain_list, list) {
>> +		printk("\n\n %s %d :: iterating over pending tlb request\n\n", __func__, __LINE__);
>> +		if (tlb_info->tlb_flush_pending) {
>> +			ret = arm_smmu_unmap(tlb_info->domain, tlb_info->iova, tlb_info->size);
>> +			if (!ret)
>> +				return -EINVAL;
>>   
>> -	return arm_smmu_enable_clocks(smmu);
>> +			tlb_info->tlb_flush_pending = false;
>> +
>> +			/* we are done with this request; delete it */
>> +			list_del(&tlb_info->list);
>> +		}
>> +	}
>> +
>> +	return 0;
>>   }
>>   
>>   static int arm_smmu_suspend(struct device *dev)
>>
Robin Murphy Aug. 4, 2017, 5:04 p.m. UTC | #3
On 03/08/17 06:35, Vivek Gautam wrote:
> Hi Robin,
> 
> 
> 
> On 08/02/2017 05:47 PM, Robin Murphy wrote:
>> On 02/08/17 10:53, Vivek Gautam wrote:
>>> We don't want to touch the TLB when smmu is suspended.
>>> Defer it until resume.
>>>
>>> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
>>> ---
>>>
>>> Hi all,
>>>
>>> Here's the small patch in response of suggestion to defer tlb operations
>>> when smmu is in suspend state.
>>> The patch stores the TLB requests in 'unmap' when the smmu device is
>>> suspended. On resume, it checks all the pending TLB requests, and
>>> performs the unmap over those.
>>>
>>> Right now, I have applied the patch on top of the pm runtime series.
>>> Let me know what you think of the change. It will also be helpful if
>>> somebody can please test a valid use case with this.
>> The patch itself doesn't make much sense to me, but more crucially it's
>> definitely broken in concept. We can't return from arm_smmu_unmap()
>> without having actually unmapped anything, because that leaves the page
>> tables out of sync with what the caller expects - they may immediately
>> reuse that IOVA to map something else for a different device and hit an
>> unexpected failure from io-pgtable when the PTE turns out to be
>> non-empty.
> 
> To understand things bit more,
> once we don't *unmap* in arm_smmu_unmap(), and leave the TLBs as is,
> the next mapping can happen only with the *knowledge* of smmu, i.e.,
> smmu should be active at that time.
> If that's true then, the _runtime()_resume() method will take care of
> invalidating the TLBs when we call arm_smmu_unmap() from _runtime_resume().
> Is my understanding correct here?

What I mean is that it's OK for arm_smmu_unmap() to defer the physical
TLB maintenance for an unmap request if the SMMU is suspended, but it
*must* still update the pagetable so that the given address is logically
unmapped before returning. In other words, the place to make decisions
based on the SMMU PM state would be in the .tlb_add_flush and .tlb_sync
callbacks, rather than at the top level.

>> However, if in general suspend *might* power-gate any part of the SMMU,
>> then I don't think we have any guarantee of what state any TLBs could be
>> in upon resume. Therefore any individual invalidations we skip while
>> suspended are probably moot, since resume would almost certainly have to
>> invalidate everything to get back to a safe state anyway.
> 
> Right, in case when the suspend power-gates the SMMU, then
> the TLB context is lost anyways. So resume path can freshly start.
> This is something that exynos does at present.

Yes, in general I don't think we can assume any SMMU state is preserved,
so the only safe option would be for .runtime_resume to do the same
thing as .resume, which does at least make things nice and simple.

>> Conversely though, the situation that still concerns me is whether this
>> can work at all for a distributed SMMU if things *don't* lose state. Say
>> the GPU and its local TBU are in the same clock domain - if the GPU has
>> just gone idle and we've clock-gated it, but "the SMMU" (i.e. the TCU)
>> is still active servicing other devices, we will assume we can happily
>> unmap GPU buffers and issue TLBIs, but what happens with entries held in
>> the unclocked TBU's micro-TLB?
> 
> We know of platforms we have that have shared TCU and multiple TBUs.
> Each TBU is available in its own power domain, not in master's power
> domain.
> In such cases we may want to runtime_get() the TBUs, so that unmap()
> call with
> master clock gated gets through.
> 
> Can we have a situation where the TBU and master are in the same power
> domain, and the unmap is called when the master is not runtime active?
> How will such a situation be handled?

Having thought about it a bit more, I think the
unmap-after-master-suspended case is only one facet of the problem - if
we can power down individual TBUs/micro-TLBs without suspending the rest
of the SMMU, do we also have any guarantee that such TLBs don't power
back on full of valid-looking random junk?

I'm starting to think the only way to be generally safe would be to
globally invalidate all TLBs after any *master* is resumed, and I'm not
even sure that's feasible :/

Robin.
Vivek Gautam Aug. 7, 2017, 7:44 a.m. UTC | #4
Hi Robin,


On Fri, Aug 4, 2017 at 10:34 PM, Robin Murphy <robin.murphy@arm.com> wrote:
> On 03/08/17 06:35, Vivek Gautam wrote:
>> Hi Robin,
>>
>>
>>
>> On 08/02/2017 05:47 PM, Robin Murphy wrote:
>>> On 02/08/17 10:53, Vivek Gautam wrote:
>>>> We don't want to touch the TLB when smmu is suspended.
>>>> Defer it until resume.
>>>>
>>>> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
>>>> ---
>>>>
>>>> Hi all,
>>>>
>>>> Here's the small patch in response of suggestion to defer tlb operations
>>>> when smmu is in suspend state.
>>>> The patch stores the TLB requests in 'unmap' when the smmu device is
>>>> suspended. On resume, it checks all the pending TLB requests, and
>>>> performs the unmap over those.
>>>>
>>>> Right now, I have applied the patch on top of the pm runtime series.
>>>> Let me know what you think of the change. It will also be helpful if
>>>> somebody can please test a valid use case with this.
>>> The patch itself doesn't make much sense to me, but more crucially it's
>>> definitely broken in concept. We can't return from arm_smmu_unmap()
>>> without having actually unmapped anything, because that leaves the page
>>> tables out of sync with what the caller expects - they may immediately
>>> reuse that IOVA to map something else for a different device and hit an
>>> unexpected failure from io-pgtable when the PTE turns out to be
>>> non-empty.
>>
>> To understand things bit more,
>> once we don't *unmap* in arm_smmu_unmap(), and leave the TLBs as is,
>> the next mapping can happen only with the *knowledge* of smmu, i.e.,
>> smmu should be active at that time.
>> If that's true then, the _runtime()_resume() method will take care of
>> invalidating the TLBs when we call arm_smmu_unmap() from _runtime_resume().
>> Is my understanding correct here?
>
> What I mean is that it's OK for arm_smmu_unmap() to defer the physical
> TLB maintenance for an unmap request if the SMMU is suspended, but it
> *must* still update the pagetable so that the given address is logically
> unmapped before returning. In other words, the place to make decisions
> based on the SMMU PM state would be in the .tlb_add_flush and .tlb_sync
> callbacks, rather than at the top level.

Okay, i understand it better now.
.tlb_add_flush and .tlb_sync callbacks should be the right place.

>
>>> However, if in general suspend *might* power-gate any part of the SMMU,
>>> then I don't think we have any guarantee of what state any TLBs could be
>>> in upon resume. Therefore any individual invalidations we skip while
>>> suspended are probably moot, since resume would almost certainly have to
>>> invalidate everything to get back to a safe state anyway.
>>
>> Right, in case when the suspend power-gates the SMMU, then
>> the TLB context is lost anyways. So resume path can freshly start.
>> This is something that exynos does at present.
>
> Yes, in general I don't think we can assume any SMMU state is preserved,
> so the only safe option would be for .runtime_resume to do the same
> thing as .resume, which does at least make things nice and simple.

Let me try to find out more about the state of TLBs. As far as the
programmable registers are concerned, qcom platforms have retention
enabled for them. So they don't lose state after SMMU power down.

>
>>> Conversely though, the situation that still concerns me is whether this
>>> can work at all for a distributed SMMU if things *don't* lose state. Say
>>> the GPU and its local TBU are in the same clock domain - if the GPU has
>>> just gone idle and we've clock-gated it, but "the SMMU" (i.e. the TCU)
>>> is still active servicing other devices, we will assume we can happily
>>> unmap GPU buffers and issue TLBIs, but what happens with entries held in
>>> the unclocked TBU's micro-TLB?
>>
>> We know of platforms we have that have shared TCU and multiple TBUs.
>> Each TBU is available in its own power domain, not in master's power
>> domain.
>> In such cases we may want to runtime_get() the TBUs, so that unmap()
>> call with
>> master clock gated gets through.
>>
>> Can we have a situation where the TBU and master are in the same power
>> domain, and the unmap is called when the master is not runtime active?
>> How will such a situation be handled?
>
> Having thought about it a bit more, I think the
> unmap-after-master-suspended case is only one facet of the problem - if
> we can power down individual TBUs/micro-TLBs without suspending the rest
> of the SMMU, do we also have any guarantee that such TLBs don't power
> back on full of valid-looking random junk?
>
> I'm starting to think the only way to be generally safe would be to
> globally invalidate all TLBs after any *master* is resumed, and I'm not
> even sure that's feasible :/
>
> Robin.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


regards
Vivek
diff mbox

Patch

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index fe8e7fd61282..1f9c2b16aabb 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -51,6 +51,7 @@ 
 #include <linux/pm_runtime.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/list.h>
 
 #include <linux/amba/bus.h>
 
@@ -151,6 +152,14 @@  struct arm_smmu_master_cfg {
 #define for_each_cfg_sme(fw, i, idx) \
 	for (i = 0; idx = fwspec_smendx(fw, i), i < fw->num_ids; ++i)
 
+struct arm_smmu_tlb_req_info {
+	struct iommu_domain *domain;
+	unsigned long iova;
+	size_t size;
+	bool tlb_flush_pending;
+	struct list_head list;
+};
+
 struct arm_smmu_device {
 	struct device			*dev;
 
@@ -182,6 +191,7 @@  struct arm_smmu_device {
 	u32				num_s2_context_banks;
 	DECLARE_BITMAP(context_map, ARM_SMMU_MAX_CBS);
 	atomic_t			irptndx;
+	struct list_head		domain_list;
 
 	u32				num_mapping_groups;
 	u16				streamid_mask;
@@ -1239,17 +1249,32 @@  static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
 			     size_t size)
 {
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
 	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
-	size_t ret;
+	struct arm_smmu_tlb_req_info *tlb_info;
 
 	if (!ops)
 		return 0;
 
-	pm_runtime_get_sync(smmu_domain->smmu->dev);
-	ret = ops->unmap(ops, iova, size);
-	pm_runtime_put_sync(smmu_domain->smmu->dev);
+	/* if the device is suspended; we can't unmap, defer any tlb operations */
+	if (pm_runtime_suspended(smmu->dev)) {
+		tlb_info = devm_kzalloc(smmu->dev, sizeof(*tlb_info), GFP_ATOMIC);
+		if (!tlb_info)
+			return -ENOMEM;
 
-	return ret;
+		tlb_info->domain = domain;
+		tlb_info->iova = iova;
+		tlb_info->size = size;
+		tlb_info->tlb_flush_pending = true;
+		INIT_LIST_HEAD(&tlb_info->list);
+
+		/* XXX: We need locks here, but that again introduce the slowpath ? */
+		list_add_tail(&tlb_info->list, &smmu->domain_list);
+
+		return size;
+	}
+
+	return ops->unmap(ops, iova, size);
 }
 
 static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
@@ -2166,6 +2191,8 @@  static int arm_smmu_device_probe(struct platform_device *pdev)
 		smmu->irqs[i] = irq;
 	}
 
+	INIT_LIST_HEAD(&smmu->domain_list);
+
 	err = arm_smmu_init_clocks(smmu);
 	if (err)
 		return err;
@@ -2268,8 +2295,28 @@  static int arm_smmu_device_remove(struct platform_device *pdev)
 static int arm_smmu_resume(struct device *dev)
 {
 	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+	struct arm_smmu_tlb_req_info  *tlb_info, *temp;
+	int ret;
+
+	ret = arm_smmu_enable_clocks(smmu);
+	if (ret)
+		return ret;
+
+	list_for_each_entry_safe(tlb_info, temp, &smmu->domain_list, list) {
+		printk("\n\n %s %d :: iterating over pending tlb request\n\n", __func__, __LINE__);
+		if (tlb_info->tlb_flush_pending) {
+			ret = arm_smmu_unmap(tlb_info->domain, tlb_info->iova, tlb_info->size);
+			if (!ret)
+				return -EINVAL;
 
-	return arm_smmu_enable_clocks(smmu);
+			tlb_info->tlb_flush_pending = false;
+
+			/* we are done with this request; delete it */
+			list_del(&tlb_info->list);
+		}
+	}
+
+	return 0;
 }
 
 static int arm_smmu_suspend(struct device *dev)