diff mbox series

[RFC,08/15] iommu/riscv: Add IRQ domain for interrupt remapping

Message ID 20241114161845.502027-25-ajones@ventanamicro.com (mailing list archive)
State New
Headers show
Series iommu/riscv: Add irqbypass support | expand

Checks

Context Check Description
conchuod/vmtest-fixes-PR fail merge-conflict

Commit Message

Andrew Jones Nov. 14, 2024, 4:18 p.m. UTC
This is just a skeleton. Until irq_set_vcpu_affinity() is
implemented the IRQ domain doesn't serve any purpose.

Signed-off-by: Andrew Jones <ajones@ventanamicro.com>
---
 drivers/iommu/riscv/Makefile   |   2 +-
 drivers/iommu/riscv/iommu-ir.c | 209 +++++++++++++++++++++++++++++++++
 drivers/iommu/riscv/iommu.c    |  43 ++++++-
 drivers/iommu/riscv/iommu.h    |  21 ++++
 4 files changed, 270 insertions(+), 5 deletions(-)
 create mode 100644 drivers/iommu/riscv/iommu-ir.c

Comments

Jason Gunthorpe Nov. 18, 2024, 6:43 p.m. UTC | #1
On Thu, Nov 14, 2024 at 05:18:53PM +0100, Andrew Jones wrote:
> @@ -1276,10 +1279,30 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
>  	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
>  	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
>  	struct riscv_iommu_dc dc = {0};
> +	int ret;
>  
>  	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
>  		return -ENODEV;
>  
> +	if (riscv_iommu_bond_link(domain, dev))
> +		return -ENOMEM;
> +
> +	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {

Drivers should not be making tests like this.

> +		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
> +						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
> +		if (domain->gscid < 0) {
> +			riscv_iommu_bond_unlink(domain, dev);
> +			return -ENOMEM;
> +		}
> +
> +		ret = riscv_iommu_irq_domain_create(domain, dev);
> +		if (ret) {
> +			riscv_iommu_bond_unlink(domain, dev);
> +			ida_free(&riscv_iommu_gscids, domain->gscid);
> +			return ret;
> +		}
> +	}

What are you trying to do? Make something behave different for VFIO?
That isn't OK, we are trying to remove all the hacky VFIO special
cases in drivers.

What is the HW issue here? It is very very strange (and probably not
going to work right) that the irq domains change when domain
attachment changes.

The IRQ setup should really be fixed before any device drivers probe
onto the device.

Jason
Andrew Jones Nov. 19, 2024, 7:49 a.m. UTC | #2
On Mon, Nov 18, 2024 at 02:43:36PM -0400, Jason Gunthorpe wrote:
> On Thu, Nov 14, 2024 at 05:18:53PM +0100, Andrew Jones wrote:
> > @@ -1276,10 +1279,30 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
> >  	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> >  	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> >  	struct riscv_iommu_dc dc = {0};
> > +	int ret;
> >  
> >  	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
> >  		return -ENODEV;
> >  
> > +	if (riscv_iommu_bond_link(domain, dev))
> > +		return -ENOMEM;
> > +
> > +	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {
> 
> Drivers should not be making tests like this.
> 
> > +		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
> > +						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
> > +		if (domain->gscid < 0) {
> > +			riscv_iommu_bond_unlink(domain, dev);
> > +			return -ENOMEM;
> > +		}
> > +
> > +		ret = riscv_iommu_irq_domain_create(domain, dev);
> > +		if (ret) {
> > +			riscv_iommu_bond_unlink(domain, dev);
> > +			ida_free(&riscv_iommu_gscids, domain->gscid);
> > +			return ret;
> > +		}
> > +	}
> 
> What are you trying to do? Make something behave different for VFIO?
> That isn't OK, we are trying to remove all the hacky VFIO special
> cases in drivers.
> 
> What is the HW issue here? It is very very strange (and probably not
> going to work right) that the irq domains change when domain
> attachment changes.
> 
> The IRQ setup should really be fixed before any device drivers probe
> onto the device.

I can't disagree with the statement that this looks hacky, but considering
a VFIO domain needs to use the g-stage for its single-stage translation
and a paging domain for the host would use s-stage, then it seems we need
to identify the VFIO domains for their special treatment. Is there an
example of converting VFIO special casing in other drivers to something
cleaner that you can point me at?

The IRQ domain will only be useful for device assignment, as that's when
an MSI translation will be needed. I can't think of any problems that
could arise from only creating the IRQ domain when probing assigned
devices, but I could certainly be missing something. Do you have some
potential problems in mind?

Thanks,
drew
Jason Gunthorpe Nov. 19, 2024, 2 p.m. UTC | #3
On Tue, Nov 19, 2024 at 08:49:37AM +0100, Andrew Jones wrote:
> On Mon, Nov 18, 2024 at 02:43:36PM -0400, Jason Gunthorpe wrote:
> > On Thu, Nov 14, 2024 at 05:18:53PM +0100, Andrew Jones wrote:
> > > @@ -1276,10 +1279,30 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
> > >  	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > >  	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> > >  	struct riscv_iommu_dc dc = {0};
> > > +	int ret;
> > >  
> > >  	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
> > >  		return -ENODEV;
> > >  
> > > +	if (riscv_iommu_bond_link(domain, dev))
> > > +		return -ENOMEM;
> > > +
> > > +	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {
> > 
> > Drivers should not be making tests like this.
> > 
> > > +		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
> > > +						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
> > > +		if (domain->gscid < 0) {
> > > +			riscv_iommu_bond_unlink(domain, dev);
> > > +			return -ENOMEM;
> > > +		}
> > > +
> > > +		ret = riscv_iommu_irq_domain_create(domain, dev);
> > > +		if (ret) {
> > > +			riscv_iommu_bond_unlink(domain, dev);
> > > +			ida_free(&riscv_iommu_gscids, domain->gscid);
> > > +			return ret;
> > > +		}
> > > +	}
> > 
> > What are you trying to do? Make something behave different for VFIO?
> > That isn't OK, we are trying to remove all the hacky VFIO special
> > cases in drivers.
> > 
> > What is the HW issue here? It is very very strange (and probably not
> > going to work right) that the irq domains change when domain
> > attachment changes.
> > 
> > The IRQ setup should really be fixed before any device drivers probe
> > onto the device.
> 
> I can't disagree with the statement that this looks hacky, but considering
> a VFIO domain needs to use the g-stage for its single-stage translation
> and a paging domain for the host would use s-stage, then it seems we need
> to identify the VFIO domains for their special treatment.

This is the wrong thinking entirely. There is no such thing as a "VFIO
domain".

Default VFIO created domains should act exactly the same as a DMA API
domain.

If you want your system to have irq remapping, then it should be on by
default and DMA API gets remapping too. There would need to be a very
strong reason not to do that in order to make something special for
riscv. If so you'd need to add some kind of flag to select it.

Until you reach nested translation there is no "need" for VFIO to use
any particular stage. The design is that default VFIO uses the same
stage as the DMA API because it is doing the same basic default
translation function.

Nested translation has a control to select the stage, and you can
then force the g-stage for VFIO users at that point.

Regardless, you must not use UNMANAGED as some indication of VFIO,
that is not what it means, that is not what it is for.

> Is there an example of converting VFIO special casing in other
> drivers to something cleaner that you can point me at?

Nobody has had an issue where they want interrupt remapping on/off
depending on VFIO. I think that is inherently wrong.

> The IRQ domain will only be useful for device assignment, as that's when
> an MSI translation will be needed. I can't think of any problems that
> could arise from only creating the IRQ domain when probing assigned
> devices, but I could certainly be missing something. Do you have some
> potential problems in mind?

I'm not an expert in the interrupt subsystem, but my understanding was
we expect the interrupt domains/etc to be static once a device driver
is probed. Changing things during iommu domain attach is after drivers
are probed. I don't really expect it to work correctly in all corner
cases.

VFIO is allowed to change the translation as it operates and we expect
that interrupts are not disturbed.

Jason
Andrew Jones Nov. 19, 2024, 3:03 p.m. UTC | #4
On November 19, 2024 3:00:47 PM GMT+01:00, Jason Gunthorpe <jgg@ziepe.ca> wrote:
>On Tue, Nov 19, 2024 at 08:49:37AM +0100, Andrew Jones wrote:
>> On Mon, Nov 18, 2024 at 02:43:36PM -0400, Jason Gunthorpe wrote:
>> > On Thu, Nov 14, 2024 at 05:18:53PM +0100, Andrew Jones wrote:
>> > > @@ -1276,10 +1279,30 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
>> > >  	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
>> > >  	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
>> > >  	struct riscv_iommu_dc dc = {0};
>> > > +	int ret;
>> > >  
>> > >  	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
>> > >  		return -ENODEV;
>> > >  
>> > > +	if (riscv_iommu_bond_link(domain, dev))
>> > > +		return -ENOMEM;
>> > > +
>> > > +	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {
>> > 
>> > Drivers should not be making tests like this.
>> > 
>> > > +		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
>> > > +						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
>> > > +		if (domain->gscid < 0) {
>> > > +			riscv_iommu_bond_unlink(domain, dev);
>> > > +			return -ENOMEM;
>> > > +		}
>> > > +
>> > > +		ret = riscv_iommu_irq_domain_create(domain, dev);
>> > > +		if (ret) {
>> > > +			riscv_iommu_bond_unlink(domain, dev);
>> > > +			ida_free(&riscv_iommu_gscids, domain->gscid);
>> > > +			return ret;
>> > > +		}
>> > > +	}
>> > 
>> > What are you trying to do? Make something behave different for VFIO?
>> > That isn't OK, we are trying to remove all the hacky VFIO special
>> > cases in drivers.
>> > 
>> > What is the HW issue here? It is very very strange (and probably not
>> > going to work right) that the irq domains change when domain
>> > attachment changes.
>> > 
>> > The IRQ setup should really be fixed before any device drivers probe
>> > onto the device.
>> 
>> I can't disagree with the statement that this looks hacky, but considering
>> a VFIO domain needs to use the g-stage for its single-stage translation
>> and a paging domain for the host would use s-stage, then it seems we need
>> to identify the VFIO domains for their special treatment.
>
>This is the wrong thinking entirely. There is no such thing as a "VFIO
>domain".
>
>Default VFIO created domains should act exactly the same as a DMA API
>domain.
>
>If you want your system to have irq remapping, then it should be on by
>default and DMA API gets remapping too. There would need to be a very
>strong reason not to do that in order to make something special for
>riscv. If so you'd need to add some kind of flag to select it.
>
>Until you reach nested translation there is no "need" for VFIO to use
>any particular stage. The design is that default VFIO uses the same
>stage as the DMA API because it is doing the same basic default
>translation function.

The RISC-V IOMMU needs to use g-stage for device assignment, if we also want to enable irqbypass, because the IOMMU is specified to only look at the MSI table when g-stage is in use. This is actually another reason the irq domain only makes sense for device assignment.

>
>Nested translation has a control to select the stage, and you can
>then force the g-stage for VFIO users at that point.

We could force riscv device assignment to always be nested, and when not providing an iommu to the guest, it will still be single-stage, but g-stage, but I don't think that's currently possible with VFIO, is it?

>
>Regardless, you must not use UNMANAGED as some indication of VFIO,
>that is not what it means, that is not what it is for.
>
>> Is there an example of converting VFIO special casing in other
>> drivers to something cleaner that you can point me at?
>
>Nobody has had an issue where they want interrupt remapping on/off
>depending on VFIO. I think that is inherently wrong.
>
>> The IRQ domain will only be useful for device assignment, as that's when
>> an MSI translation will be needed. I can't think of any problems that
>> could arise from only creating the IRQ domain when probing assigned
>> devices, but I could certainly be missing something. Do you have some
>> potential problems in mind?
>
>I'm not an expert in the interrupt subsystem, but my understanding was
>we expect the interrupt domains/etc to be static once a device driver
>is probed. Changing things during iommu domain attach is after drivers
>are probed. I don't really expect it to work correctly in all corner
>cases.

With VFIO the iommu domain attach comes after an unbind/bind, so the new driver is probed. I think that's a safe time. However, if there could be cases where the attach does not follow an unbind/bind, then I agree that wouldn't be safe. I'll consider always creating an IRQ domain, even if it won't provide any additional functionality unless the device is assigned.

>
>VFIO is allowed to change the translation as it operates and we expect
>that interrupts are not disturbed.
>

The IRQ domain stays the same during operation, the only changes are the mappings from what the guest believes are its s-mode interrupt files to the hypervisor selected guest interrupt files, and these changes are made possible by the IRQ domain's vcpu-affinity support.

Thanks,
drew
Jason Gunthorpe Nov. 19, 2024, 3:36 p.m. UTC | #5
On Tue, Nov 19, 2024 at 04:03:05PM +0100, Andrew Jones wrote:

> >This is the wrong thinking entirely. There is no such thing as a "VFIO
> >domain".
> >
> >Default VFIO created domains should act exactly the same as a DMA API
> >domain.
> >
> >If you want your system to have irq remapping, then it should be on by
> >default and DMA API gets remapping too. There would need to be a very
> >strong reason not to do that in order to make something special for
> >riscv. If so you'd need to add some kind of flag to select it.
> >
> >Until you reach nested translation there is no "need" for VFIO to use
> >any particular stage. The design is that default VFIO uses the same
> >stage as the DMA API because it is doing the same basic default
> >translation function.
> 
> The RISC-V IOMMU needs to use g-stage for device assignment, if we
> also want to enable irqbypass, because the IOMMU is specified to
> only look at the MSI table when g-stage is in use. This is actually
> another reason the irq domain only makes sense for device
> assignment.

Most HW has enablable interrupt remapping and typically Linux just
turns it always on.

Is there a reason the DMA API shouldn't use this translation mode too?
That seems to be the main issue here, you are trying to avoid
interrupt remapping for DMA API and use it only for VFIO, and haven't
explained why we need such complexity. Just use it always?

> >Nested translation has a control to select the stage, and you can
> >then force the g-stage for VFIO users at that point.
> 
> We could force riscv device assignment to always be nested, and when
> not providing an iommu to the guest, it will still be single-stage,
> but g-stage, but I don't think that's currently possible with VFIO,
> is it?

That isn't what I mean, I mean you should not be forcing the kind of
domain being created until you get to special cases like nested.

Default VFIO should work the same as the DMA API.

> >> The IRQ domain will only be useful for device assignment, as that's when
> >> an MSI translation will be needed. I can't think of any problems that
> >> could arise from only creating the IRQ domain when probing assigned
> >> devices, but I could certainly be missing something. Do you have some
> >> potential problems in mind?
> >
> >I'm not an expert in the interrupt subsystem, but my understanding was
> >we expect the interrupt domains/etc to be static once a device driver
> >is probed. Changing things during iommu domain attach is after drivers
> >are probed. I don't really expect it to work correctly in all corner
> >cases.
> 
> With VFIO the iommu domain attach comes after an unbind/bind, so the
> new driver is probed.

That's the opposite of what I mean. The irq domain should be setup
*before* VFIO binds to the driver.

Changing the active irq_domain while VFIO is already probed to the
device is highly unlikely to work right in all cases.

> I think that's a safe time. However, if there
> could be cases where the attach does not follow an unbind/bind, then
> I agree that wouldn't be safe.

These cases exist.

> I'll consider always creating an IRQ
> domain, even if it won't provide any additional functionality unless
> the device is assigned.

That isn't ideal, the translation under the IRQs shouldn't really be
changing as the translation under the IOMMU changes.

Further, VFIO assumes iommu_group_has_isolated_msi(), ie
IRQ_DOMAIN_FLAG_ISOLATED_MSI, is fixed while it is bound. Will that
be true if the iommu is flapping all about? What will you do when VFIO
has it attached to a blocked domain?

It just doesn't make sense to change something so fundamental as the
interrupt path on an iommu domain attachment. :\

> >VFIO is allowed to change the translation as it operates and we expect
> >that interrupts are not disturbed.
> 
> The IRQ domain stays the same during operation, the only changes are
> the mappings from what the guest believes are its s-mode interrupt
> files to the hypervisor selected guest interrupt files, and these
> changes are made possible by the IRQ domain's vcpu-affinity support.

That is only the case when talking about kvm, this all still has to
work fully for non-kvm VFIO uses cases too.

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
index f54c9ed17d41..8420dd1776cb 100644
--- a/drivers/iommu/riscv/Makefile
+++ b/drivers/iommu/riscv/Makefile
@@ -1,3 +1,3 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o
+obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-ir.o iommu-platform.o
 obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c
new file mode 100644
index 000000000000..c177e064b205
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-ir.c
@@ -0,0 +1,209 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU Interrupt Remapping
+ *
+ * Copyright © 2024 Ventana Micro Systems Inc.
+ */
+#include <linux/irqdomain.h>
+#include <linux/msi.h>
+
+#include "../iommu-pages.h"
+#include "iommu.h"
+
+static size_t riscv_iommu_ir_get_msipte_idx(struct riscv_iommu_domain *domain,
+					    phys_addr_t msi_pa)
+{
+	phys_addr_t addr = msi_pa >> 12;
+	size_t idx;
+
+	if (domain->group_index_bits) {
+		phys_addr_t group_mask = BIT(domain->group_index_bits) - 1;
+		phys_addr_t group_shift = domain->group_index_shift - 12;
+		phys_addr_t group = (addr >> group_shift) & group_mask;
+		phys_addr_t mask = domain->msiptp.msi_addr_mask & ~(group_mask << group_shift);
+
+		idx = addr & mask;
+		idx |= group << fls64(mask);
+	} else {
+		idx = addr & domain->msiptp.msi_addr_mask;
+	}
+
+	return idx;
+}
+
+static struct riscv_iommu_msipte *riscv_iommu_ir_get_msipte(struct riscv_iommu_domain *domain,
+							    phys_addr_t msi_pa)
+{
+	size_t idx;
+
+	if (((msi_pa >> 12) & ~domain->msiptp.msi_addr_mask) != domain->msiptp.msi_addr_pattern)
+		return NULL;
+
+	idx = riscv_iommu_ir_get_msipte_idx(domain, msi_pa);
+	return &domain->msi_root[idx];
+}
+
+static size_t riscv_iommu_ir_nr_msiptes(struct riscv_iommu_domain *domain)
+{
+	phys_addr_t base = domain->msiptp.msi_addr_pattern << 12;
+	phys_addr_t max_addr = base | (domain->msiptp.msi_addr_mask << 12);
+	size_t max_idx = riscv_iommu_ir_get_msipte_idx(domain, max_addr);
+
+	return max_idx + 1;
+}
+
+static struct irq_chip riscv_iommu_irq_chip = {
+	.name			= "IOMMU-IR",
+	.irq_mask		= irq_chip_mask_parent,
+	.irq_unmask		= irq_chip_unmask_parent,
+};
+
+static int riscv_iommu_irq_domain_alloc_irqs(struct irq_domain *irqdomain,
+					     unsigned int irq_base, unsigned int nr_irqs,
+					     void *arg)
+{
+	struct irq_data *data;
+	int i, ret;
+
+	ret = irq_domain_alloc_irqs_parent(irqdomain, irq_base, nr_irqs, arg);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_irqs; i++) {
+		data = irq_domain_get_irq_data(irqdomain, irq_base + i);
+		data->chip = &riscv_iommu_irq_chip;
+	}
+
+	return 0;
+}
+
+static const struct irq_domain_ops riscv_iommu_irq_domain_ops = {
+	.alloc = riscv_iommu_irq_domain_alloc_irqs,
+	.free = irq_domain_free_irqs_parent,
+};
+
+static const struct msi_parent_ops riscv_iommu_msi_parent_ops = {
+	.prefix			= "IR-",
+	.supported_flags	= MSI_GENERIC_FLAGS_MASK |
+				  MSI_FLAG_PCI_MSIX,
+	.required_flags		= MSI_FLAG_USE_DEF_DOM_OPS |
+				  MSI_FLAG_USE_DEF_CHIP_OPS,
+	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
+};
+
+int riscv_iommu_irq_domain_create(struct riscv_iommu_domain *domain,
+				  struct device *dev)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	struct fwnode_handle *fn;
+	char *fwname;
+
+	if (domain->irqdomain) {
+		dev_set_msi_domain(dev, domain->irqdomain);
+		return 0;
+	}
+
+	if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT)) {
+		dev_warn(iommu->dev, "Cannot enable interrupt remapping\n");
+		return 0;
+	}
+
+	spin_lock_init(&domain->msi_lock);
+	/*
+	 * TODO: The hypervisor should be in control of this size. For now
+	 * we just allocate enough space for 512 VCPUs.
+	 */
+	domain->msi_order = 1;
+	domain->msi_root = iommu_alloc_pages_node(domain->numa_node,
+						  GFP_KERNEL_ACCOUNT, domain->msi_order);
+	if (!domain->msi_root)
+		return -ENOMEM;
+
+	fwname = kasprintf(GFP_KERNEL, "IOMMU-IR-%s", dev_name(dev));
+	if (!fwname) {
+		iommu_free_pages(domain->msi_root, domain->msi_order);
+		return -ENOMEM;
+	}
+
+	fn = irq_domain_alloc_named_fwnode(fwname);
+	kfree(fwname);
+	if (!fn) {
+		dev_err(iommu->dev, "Couldn't allocate fwnode\n");
+		iommu_free_pages(domain->msi_root, domain->msi_order);
+		return -ENOMEM;
+	}
+
+	domain->irqdomain = irq_domain_create_hierarchy(dev_get_msi_domain(dev),
+							0, 0, fn,
+							&riscv_iommu_irq_domain_ops,
+							domain);
+	if (!domain->irqdomain) {
+		dev_err(iommu->dev, "Failed to create IOMMU irq domain\n");
+		iommu_free_pages(domain->msi_root, domain->msi_order);
+		irq_domain_free_fwnode(fn);
+		return -ENOMEM;
+	}
+
+	domain->irqdomain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
+				    IRQ_DOMAIN_FLAG_ISOLATED_MSI;
+	domain->irqdomain->msi_parent_ops = &riscv_iommu_msi_parent_ops;
+	irq_domain_update_bus_token(domain->irqdomain, DOMAIN_BUS_MSI_REMAP);
+	dev_set_msi_domain(dev, domain->irqdomain);
+
+	return 0;
+}
+
+void riscv_iommu_ir_get_resv_regions(struct device *dev, struct list_head *head)
+{
+	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+	struct riscv_iommu_domain *domain = info->domain;
+	struct iommu_resv_region *reg;
+	phys_addr_t base, addr;
+	size_t nr_pages, i;
+
+	if (!domain || !domain->msiptp.msiptp)
+		return;
+
+	base = domain->msiptp.msi_addr_pattern << 12;
+
+	if (domain->group_index_bits) {
+		phys_addr_t group_mask = BIT(domain->group_index_bits) - 1;
+		phys_addr_t group_shift = domain->group_index_shift - 12;
+		phys_addr_t mask = domain->msiptp.msi_addr_mask & ~(group_mask << group_shift);
+
+		nr_pages = mask + 1;
+	} else {
+		nr_pages = domain->msiptp.msi_addr_mask + 1;
+	}
+
+	for (i = 0; i < BIT(domain->group_index_bits); i++) {
+		addr = base | (i << domain->group_index_shift);
+		reg = iommu_alloc_resv_region(addr, nr_pages * 4096,
+					      0, IOMMU_RESV_MSI, GFP_KERNEL);
+		if (reg)
+			list_add_tail(&reg->list, head);
+	}
+}
+
+void riscv_iommu_irq_domain_remove(struct riscv_iommu_domain *domain)
+{
+	struct fwnode_handle *fn;
+
+	if (!domain->irqdomain)
+		return;
+
+	iommu_free_pages(domain->msi_root, domain->msi_order);
+
+	fn = domain->irqdomain->fwnode;
+	irq_domain_remove(domain->irqdomain);
+	irq_domain_free_fwnode(fn);
+}
+
+void riscv_iommu_irq_domain_unlink(struct riscv_iommu_domain *domain,
+				   struct device *dev)
+{
+	if (!domain || !domain->irqdomain)
+		return;
+
+	dev_set_msi_domain(dev, domain->irqdomain->parent);
+}
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 6e8ea3d22ff5..c4a47b21c58f 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -943,7 +943,8 @@  static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
 	rcu_read_unlock();
 }
 
-#define RISCV_IOMMU_FSC_BARE 0
+#define RISCV_IOMMU_FSC_BARE		0
+#define RISCV_IOMMU_IOHGATP_BARE	0
 
 /*
  * Update IODIR for the device.
@@ -1245,6 +1246,8 @@  static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
 
 	WARN_ON(!list_empty(&domain->bonds));
 
+	riscv_iommu_irq_domain_remove(domain);
+
 	if (domain->pscid > 0)
 		ida_free(&riscv_iommu_pscids, domain->pscid);
 	if (domain->gscid > 0)
@@ -1276,10 +1279,30 @@  static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
 	struct riscv_iommu_dc dc = {0};
+	int ret;
 
 	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
 		return -ENODEV;
 
+	if (riscv_iommu_bond_link(domain, dev))
+		return -ENOMEM;
+
+	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {
+		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
+						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
+		if (domain->gscid < 0) {
+			riscv_iommu_bond_unlink(domain, dev);
+			return -ENOMEM;
+		}
+
+		ret = riscv_iommu_irq_domain_create(domain, dev);
+		if (ret) {
+			riscv_iommu_bond_unlink(domain, dev);
+			ida_free(&riscv_iommu_gscids, domain->gscid);
+			return ret;
+		}
+	}
+
 	if (domain->gscid) {
 		dc.iohgatp = FIELD_PREP(RISCV_IOMMU_DC_IOHGATP_MODE, domain->pgd_mode) |
 			     FIELD_PREP(RISCV_IOMMU_DC_IOHGATP_GSCID, domain->gscid) |
@@ -1292,10 +1315,9 @@  static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
 	dc.ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
 			   RISCV_IOMMU_PC_TA_V;
 
-	if (riscv_iommu_bond_link(domain, dev))
-		return -ENOMEM;
-
 	riscv_iommu_iodir_update(iommu, dev, &dc);
+
+	riscv_iommu_irq_domain_unlink(info->domain, dev);
 	riscv_iommu_bond_unlink(info->domain, dev);
 	info->domain = domain;
 
@@ -1389,9 +1411,12 @@  static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
 	struct riscv_iommu_dc dc = {0};
 
 	dc.fsc = RISCV_IOMMU_FSC_BARE;
+	dc.iohgatp = RISCV_IOMMU_IOHGATP_BARE;
 
 	/* Make device context invalid, translation requests will fault w/ #258 */
 	riscv_iommu_iodir_update(iommu, dev, &dc);
+
+	riscv_iommu_irq_domain_unlink(info->domain, dev);
 	riscv_iommu_bond_unlink(info->domain, dev);
 	info->domain = NULL;
 
@@ -1413,15 +1438,24 @@  static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
 	struct riscv_iommu_dc dc = {0};
 
 	dc.fsc = RISCV_IOMMU_FSC_BARE;
+	dc.iohgatp = RISCV_IOMMU_IOHGATP_BARE;
 	dc.ta = RISCV_IOMMU_PC_TA_V;
 
 	riscv_iommu_iodir_update(iommu, dev, &dc);
+
+	riscv_iommu_irq_domain_unlink(info->domain, dev);
 	riscv_iommu_bond_unlink(info->domain, dev);
 	info->domain = NULL;
 
 	return 0;
 }
 
+static void riscv_iommu_get_resv_regions(struct device *dev,
+					 struct list_head *head)
+{
+	riscv_iommu_ir_get_resv_regions(dev, head);
+}
+
 static struct iommu_domain riscv_iommu_identity_domain = {
 	.type = IOMMU_DOMAIN_IDENTITY,
 	.ops = &(const struct iommu_domain_ops) {
@@ -1516,6 +1550,7 @@  static const struct iommu_ops riscv_iommu_ops = {
 	.blocked_domain = &riscv_iommu_blocking_domain,
 	.release_domain = &riscv_iommu_blocking_domain,
 	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
+	.get_resv_regions = riscv_iommu_get_resv_regions,
 	.device_group = riscv_iommu_device_group,
 	.probe_device = riscv_iommu_probe_device,
 	.release_device	= riscv_iommu_release_device,
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index dd538b19fbb7..6ce71095781c 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -23,6 +23,12 @@ 
 #define RISCV_IOMMU_DDTP_TIMEOUT	10000000
 #define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000
 
+struct riscv_iommu_msiptp_state {
+	u64 msiptp;
+	u64 msi_addr_mask;
+	u64 msi_addr_pattern;
+};
+
 /* This struct contains protection domain specific IOMMU driver data. */
 struct riscv_iommu_domain {
 	struct iommu_domain domain;
@@ -34,6 +40,13 @@  struct riscv_iommu_domain {
 	int numa_node;
 	unsigned int pgd_mode;
 	unsigned long *pgd_root;
+	u32 group_index_bits;
+	u32 group_index_shift;
+	int msi_order;
+	struct riscv_iommu_msipte *msi_root;
+	spinlock_t msi_lock;
+	struct riscv_iommu_msiptp_state msiptp;
+	struct irq_domain *irqdomain;
 };
 
 /* Private IOMMU data for managed devices, dev_iommu_priv_* */
@@ -119,6 +132,14 @@  void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
 void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
 			  unsigned int timeout_us);
 
+int riscv_iommu_irq_domain_create(struct riscv_iommu_domain *domain,
+				  struct device *dev);
+void riscv_iommu_irq_domain_remove(struct riscv_iommu_domain *domain);
+void riscv_iommu_irq_domain_unlink(struct riscv_iommu_domain *domain,
+				   struct device *dev);
+void riscv_iommu_ir_get_resv_regions(struct device *dev,
+				     struct list_head *head);
+
 #define riscv_iommu_readl(iommu, addr) \
 	readl_relaxed((iommu)->reg + (addr))