
[v3,4/4] iommu/arm-smmu-v3: Enable HTTU for stage1 with io-pgtable mapping

Message ID 20240430134308.1604-5-shameerali.kolothum.thodi@huawei.com (mailing list archive)
State New, archived
Series iommu/smmuv3: Add IOMMUFD dirty tracking support for SMMUv3

Commit Message

Shameer Kolothum April 30, 2024, 1:43 p.m. UTC
From: Kunkun Jiang <jiangkunkun@huawei.com>

If the io-pgtable quirk flag indicates support for hardware update of
the dirty state, enable the HA/HD bits in the SMMU CD and also set the
DBM bit in the page descriptors.

Now report the dirty page tracking capability of SMMUv3 and
select IOMMUFD_DRIVER for ARM_SMMU_V3 if IOMMUFD is enabled.

Co-developed-by: Keqian Zhu <zhukeqian1@huawei.com>
Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/iommu/Kconfig                       |  1 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 15 +++++++++++++++
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  3 +++
 drivers/iommu/io-pgtable-arm.c              |  5 ++++-
 4 files changed, 23 insertions(+), 1 deletion(-)
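
Background for the discussion below: the patch relies on the Arm VMSA dirty-bit-modifier (DBM) mechanism, where a stage-1 descriptor with DBM set is "writable-clean" while AP[2] (read-only) is also set, and the SMMU clears AP[2] on the first write to make it "writable-dirty". The following is a minimal standalone sketch of that encoding, assuming the usual bit positions (DBM at bit 51, AP[2] at bit 7) and macro names modelled on io-pgtable-arm.c; the helper function is illustrative and not part of the patch.

#include <stdint.h>
#include <stdbool.h>

typedef uint64_t arm_lpae_iopte;

/* Names modelled on io-pgtable-arm.c; values follow the Arm VMSA layout. */
#define ARM_LPAE_PTE_AP_RDONLY		((arm_lpae_iopte)2 << 6)	/* AP[2]: read-only */
#define ARM_LPAE_PTE_DBM		((arm_lpae_iopte)1 << 51)	/* Dirty Bit Modifier */
#define ARM_LPAE_PTE_AP_WRITABLE_CLEAN	(ARM_LPAE_PTE_AP_RDONLY | ARM_LPAE_PTE_DBM)

/*
 * With HTTU HD enabled, a descriptor mapped writable-clean (DBM=1, AP[2]=1)
 * is turned into writable-dirty (DBM=1, AP[2]=0) by the SMMU on the first
 * write; that transition is what the dirty-tracking read-back looks for.
 */
static bool pte_is_writable_dirty(arm_lpae_iopte pte)
{
	return (pte & ARM_LPAE_PTE_DBM) && !(pte & ARM_LPAE_PTE_AP_RDONLY);
}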

Comments

Jason Gunthorpe May 12, 2024, 12:08 p.m. UTC | #1
On Tue, Apr 30, 2024 at 02:43:08PM +0100, Shameer Kolothum wrote:

> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index da6cc52859ba..20ac0e833c7b 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -433,6 +433,8 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
>  		pte = ARM_LPAE_PTE_nG;
>  		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
>  			pte |= ARM_LPAE_PTE_AP_RDONLY;
> +		else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
> +			pte |= ARM_LPAE_PTE_AP_WRITABLE_CLEAN;

This seems a bit suboptimal: it means the HTTU will be generating
dirties before the tracking is turned on. As I understand it, if the
SMMU wants to write a dirty bit it has to do an atomic RMW to memory,
so this would be a drag on baseline performance?

Should this start out as dirty and let the enable flow clean it to
turn it on?

Looks fine otherwise:

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
Tian, Kevin May 22, 2024, 7:19 a.m. UTC | #2
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Sunday, May 12, 2024 8:09 PM
> 
> On Tue, Apr 30, 2024 at 02:43:08PM +0100, Shameer Kolothum wrote:
> 
> > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-
> arm.c
> > index da6cc52859ba..20ac0e833c7b 100644
> > --- a/drivers/iommu/io-pgtable-arm.c
> > +++ b/drivers/iommu/io-pgtable-arm.c
> > @@ -433,6 +433,8 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct
> arm_lpae_io_pgtable *data,
> >  		pte = ARM_LPAE_PTE_nG;
> >  		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
> >  			pte |= ARM_LPAE_PTE_AP_RDONLY;
> > +		else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
> > +			pte |= ARM_LPAE_PTE_AP_WRITABLE_CLEAN;
> 
> This seems a bit suboptimal, it means the HTTU will be generating
> dirty's before the tracking is turned on. As I understand it if the
> SMMU wants to write a dirty bit it has to do an atomic RMW to memory,
> so this would be a drag on baseline performance?
> 
> Should this start out as dirty and let the enable flow clean it to
> turn it on?
> 

This appears to be good for other vendors too?
Jason Gunthorpe May 22, 2024, 12:39 p.m. UTC | #3
On Wed, May 22, 2024 at 07:19:05AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Sunday, May 12, 2024 8:09 PM
> > 
> > On Tue, Apr 30, 2024 at 02:43:08PM +0100, Shameer Kolothum wrote:
> > 
> > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-
> > arm.c
> > > index da6cc52859ba..20ac0e833c7b 100644
> > > --- a/drivers/iommu/io-pgtable-arm.c
> > > +++ b/drivers/iommu/io-pgtable-arm.c
> > > @@ -433,6 +433,8 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct
> > arm_lpae_io_pgtable *data,
> > >  		pte = ARM_LPAE_PTE_nG;
> > >  		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
> > >  			pte |= ARM_LPAE_PTE_AP_RDONLY;
> > > +		else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
> > > +			pte |= ARM_LPAE_PTE_AP_WRITABLE_CLEAN;
> > 
> > This seems a bit suboptimal, it means the HTTU will be generating
> > dirty's before the tracking is turned on. As I understand it if the
> > SMMU wants to write a dirty bit it has to do an atomic RMW to memory,
> > so this would be a drag on baseline performance?
> > 
> > Should this start out as dirty and let the enable flow clean it to
> > turn it on?
> > 
> 
> this appears to be good for other vendors too?

I thought Intel and AMD both had a per-table flag to turn on tracking
and without that bit the dirties are not written back?

Jason
Shameer Kolothum May 22, 2024, 1:26 p.m. UTC | #4
> -----Original Message-----
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Sunday, May 12, 2024 1:09 PM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: iommu@lists.linux.dev; linux-arm-kernel@lists.infradead.org;
> robin.murphy@arm.com; will@kernel.org; joro@8bytes.org;
> ryan.roberts@arm.com; kevin.tian@intel.com; nicolinc@nvidia.com;
> mshavit@google.com; eric.auger@redhat.com; joao.m.martins@oracle.com;
> jiangkunkun <jiangkunkun@huawei.com>; zhukeqian
> <zhukeqian1@huawei.com>; Linuxarm <linuxarm@huawei.com>
> Subject: Re: [PATCH v3 4/4] iommu/arm-smmu-v3: Enable HTTU for stage1
> with io-pgtable mapping
> 
> On Tue, Apr 30, 2024 at 02:43:08PM +0100, Shameer Kolothum wrote:
> 
> > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-
> arm.c
> > index da6cc52859ba..20ac0e833c7b 100644
> > --- a/drivers/iommu/io-pgtable-arm.c
> > +++ b/drivers/iommu/io-pgtable-arm.c
> > @@ -433,6 +433,8 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct
> arm_lpae_io_pgtable *data,
> >  		pte = ARM_LPAE_PTE_nG;
> >  		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
> >  			pte |= ARM_LPAE_PTE_AP_RDONLY;
> > +		else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
> > +			pte |= ARM_LPAE_PTE_AP_WRITABLE_CLEAN;
> 
> This seems a bit suboptimal, it means the HTTU will be generating
> dirty's before the tracking is turned on. As I understand it if the
> SMMU wants to write a dirty bit it has to do an atomic RMW to memory,
> so this would be a drag on baseline performance?

Our initial tests have not shown any difference so far, but it is a possibility.
 
> Should this start out as dirty and let the enable flow clean it to
> turn it on?

OK. Just to be clear, the suggestion is to just set the DBM bit here and let
read_and_clear_dirty() set the AP_RDONLY_BIT through the iopt_set_dirty_tracking()
path.

> 
> Looks fine otherwise:
> 
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Thanks,
Shameer
Jason Gunthorpe May 22, 2024, 1:41 p.m. UTC | #5
On Wed, May 22, 2024 at 01:26:20PM +0000, Shameerali Kolothum Thodi wrote:

> > On Tue, Apr 30, 2024 at 02:43:08PM +0100, Shameer Kolothum wrote:
> > 
> > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-
> > arm.c
> > > index da6cc52859ba..20ac0e833c7b 100644
> > > --- a/drivers/iommu/io-pgtable-arm.c
> > > +++ b/drivers/iommu/io-pgtable-arm.c
> > > @@ -433,6 +433,8 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct
> > arm_lpae_io_pgtable *data,
> > >  		pte = ARM_LPAE_PTE_nG;
> > >  		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
> > >  			pte |= ARM_LPAE_PTE_AP_RDONLY;
> > > +		else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
> > > +			pte |= ARM_LPAE_PTE_AP_WRITABLE_CLEAN;
> > 
> > This seems a bit suboptimal, it means the HTTU will be generating
> > dirty's before the tracking is turned on. As I understand it if the
> > SMMU wants to write a dirty bit it has to do an atomic RMW to memory,
> > so this would be a drag on baseline performance?
> 
> Our initial tests has not shown any difference so far. But it is a possibility.

That is good news; it means the setting is quite low overhead.

> > Should this start out as dirty and let the enable flow clean it to
> > turn it on?
> 
> Ok. Just to be clear, so the suggestion is just set the DBM bit here and let the
> read_and_clear_dirty() set the  AP_RDONLY_BIT through the iopt_set_dirty_tracking()
> path.

Yes, but it would be good to check with someone who knows what the IP
does to be sure. My guess is that should avoid memory writes.

Jason
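
To make the suggested alternative concrete, here is a rough sketch of how the hunk under discussion might look if writable mappings started out writable-dirty (DBM set, AP[2] clear), leaving it to the dirty-tracking enable path to move pages to writable-clean. It assumes a plain ARM_LPAE_PTE_DBM define (bit 51) is available alongside the AP_WRITABLE_CLEAN helper from this series; it is only an illustration of the idea discussed above, not code that was posted or merged.

/*
 * Hypothetical variant of the arm_lpae_prot_to_pte() hunk: set only DBM at
 * map time, so pages start out writable-dirty and the SMMU has no dirty bit
 * to write back until tracking is enabled.  The enable flow would then set
 * AP[2] on each DBM descriptor to make it writable-clean.
 */
pte = ARM_LPAE_PTE_nG;
if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
	pte |= ARM_LPAE_PTE_AP_RDONLY;
else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
	pte |= ARM_LPAE_PTE_DBM;	/* writable-dirty, not writable-clean */
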
Tian, Kevin May 22, 2024, 11:52 p.m. UTC | #6
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, May 22, 2024 8:39 PM
> 
> On Wed, May 22, 2024 at 07:19:05AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Sunday, May 12, 2024 8:09 PM
> > >
> > > On Tue, Apr 30, 2024 at 02:43:08PM +0100, Shameer Kolothum wrote:
> > >
> > > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-
> pgtable-
> > > arm.c
> > > > index da6cc52859ba..20ac0e833c7b 100644
> > > > --- a/drivers/iommu/io-pgtable-arm.c
> > > > +++ b/drivers/iommu/io-pgtable-arm.c
> > > > @@ -433,6 +433,8 @@ static arm_lpae_iopte
> arm_lpae_prot_to_pte(struct
> > > arm_lpae_io_pgtable *data,
> > > >  		pte = ARM_LPAE_PTE_nG;
> > > >  		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
> > > >  			pte |= ARM_LPAE_PTE_AP_RDONLY;
> > > > +		else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
> > > > +			pte |= ARM_LPAE_PTE_AP_WRITABLE_CLEAN;
> > >
> > > This seems a bit suboptimal, it means the HTTU will be generating
> > > dirty's before the tracking is turned on. As I understand it if the
> > > SMMU wants to write a dirty bit it has to do an atomic RMW to memory,
> > > so this would be a drag on baseline performance?
> > >
> > > Should this start out as dirty and let the enable flow clean it to
> > > turn it on?
> > >
> >
> > this appears to be good for other vendors too?
> 
> I thought Intel and AMD both had a per-table flag to turn on tracking
> and without that bit the dirties are not written back?
> 

Yes, I misunderstood the original context.

Patch

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f872aeccd820..912cffc9f001 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -390,6 +390,7 @@  config ARM_SMMU_V3
 	select IOMMU_API
 	select IOMMU_IO_PGTABLE_LPAE
 	select GENERIC_MSI_IRQ
+	select IOMMUFD_DRIVER if IOMMUFD
 	help
 	  Support for implementations of the ARM System MMU architecture
 	  version 3 providing translation support to a PCIe root complex.
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index ad18436c5f7f..e3143be3cfc6 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1333,6 +1333,12 @@  void arm_smmu_make_s1_cd(struct arm_smmu_cd *target,
 		CTXDESC_CD_0_ASET |
 		FIELD_PREP(CTXDESC_CD_0_ASID, smmu_domain->asid)
 		);
+
+	/* To enable dirty flag update, set both Access flag and dirty state update */
+	if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_HD)
+		target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_HA |
+					       CTXDESC_CD_0_TCR_HD);
+
 	target->data[1] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.ttbr &
 				      CTXDESC_CD_1_TTB0_MASK);
 	target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.mair);
@@ -2264,6 +2270,13 @@  static const struct iommu_flush_ops arm_smmu_flush_ops = {
 	.tlb_add_page	= arm_smmu_tlb_inv_page_nosync,
 };
 
+static bool arm_smmu_dbm_capable(struct arm_smmu_device *smmu)
+{
+	u32 features = (ARM_SMMU_FEAT_HD | ARM_SMMU_FEAT_COHERENCY);
+
+	return (smmu->features & features) == features;
+}
+
 /* IOMMU API */
 static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
 {
@@ -2276,6 +2289,8 @@  static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
 	case IOMMU_CAP_NOEXEC:
 	case IOMMU_CAP_DEFERRED_FLUSH:
 		return true;
+	case IOMMU_CAP_DIRTY_TRACKING:
+		return arm_smmu_dbm_capable(master->smmu);
 	default:
 		return false;
 	}
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index fd60052a9ec6..f684676d0bb5 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -312,6 +312,9 @@  struct arm_smmu_cd {
 #define CTXDESC_CD_0_TCR_IPS		GENMASK_ULL(34, 32)
 #define CTXDESC_CD_0_TCR_TBI0		(1ULL << 38)
 
+#define CTXDESC_CD_0_TCR_HA            (1UL << 43)
+#define CTXDESC_CD_0_TCR_HD            (1UL << 42)
+
 #define CTXDESC_CD_0_AA64		(1UL << 41)
 #define CTXDESC_CD_0_S			(1UL << 44)
 #define CTXDESC_CD_0_R			(1UL << 45)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index da6cc52859ba..20ac0e833c7b 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -433,6 +433,8 @@  static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
 		pte = ARM_LPAE_PTE_nG;
 		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
 			pte |= ARM_LPAE_PTE_AP_RDONLY;
+		else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
+			pte |= ARM_LPAE_PTE_AP_WRITABLE_CLEAN;
 		if (!(prot & IOMMU_PRIV))
 			pte |= ARM_LPAE_PTE_AP_UNPRIV;
 	} else {
@@ -923,7 +925,8 @@  arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 
 	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
 			    IO_PGTABLE_QUIRK_ARM_TTBR1 |
-			    IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
+			    IO_PGTABLE_QUIRK_ARM_OUTER_WBWA |
+			    IO_PGTABLE_QUIRK_ARM_HD))
 		return NULL;
 
 	data = arm_lpae_alloc_pgtable(cfg);
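
For completeness, here is a standalone sketch of the per-entry transition that the read_and_clear_dirty() path discussed in the thread performs: report a DBM descriptor with AP[2] clear as dirty, then set AP[2] again so the next DMA write re-dirties the page. The function name and the compiler atomic built-ins are illustrative assumptions, not the in-kernel implementation, which belongs to an earlier patch in this series.

#include <stdint.h>
#include <stdbool.h>

typedef uint64_t arm_lpae_iopte;

/* Same descriptor encoding as the sketch near the top of this page. */
#define ARM_LPAE_PTE_AP_RDONLY	((arm_lpae_iopte)2 << 6)	/* AP[2]: read-only */
#define ARM_LPAE_PTE_DBM	((arm_lpae_iopte)1 << 51)	/* Dirty Bit Modifier */

/* Illustrative only: returns true if the entry was writable-dirty and
 * re-arms it as writable-clean. */
static bool test_and_clean_dirty(arm_lpae_iopte *ptep)
{
	arm_lpae_iopte pte = __atomic_load_n(ptep, __ATOMIC_RELAXED);

	if (!(pte & ARM_LPAE_PTE_DBM) || (pte & ARM_LPAE_PTE_AP_RDONLY))
		return false;	/* not DBM-tracked, or still clean */

	/* Set AP[2] back with an atomic OR so a concurrent hardware update of
	 * other descriptor bits (e.g. the access flag) is not lost.  A real
	 * driver also has to consider TLB maintenance at this point. */
	__atomic_fetch_or(ptep, ARM_LPAE_PTE_AP_RDONLY, __ATOMIC_RELAXED);
	return true;
}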