diff mbox series

[RFC,1/1] irqchip/gic-v3-its: Add irq domain and chip for Direct LPI without ITS

Message ID MW4PR21MB2002BA20FA4A8DAA1C27F4C6C0199@MW4PR21MB2002.namprd21.prod.outlook.com (mailing list archive)
State New, archived
Headers show
Series [RFC,1/1] irqchip/gic-v3-its: Add irq domain and chip for Direct LPI without ITS | expand

Commit Message

Sunil Muthuswamy July 8, 2021, 7:36 p.m. UTC
The current GIC v3 driver doesn't support Direct LPI when there is
no ITS. The spec architecturally supports Direct LPI without an ITS.
This patch introduces an IRQ domain and chip to manage Direct LPI
when there is no ITS.

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
---
 drivers/irqchip/Makefile          |   2 +-
 drivers/irqchip/irq-gic-common.h  |  22 +
 drivers/irqchip/irq-gic-v3-dlpi.c | 639 ++++++++++++++++++++++++++++++
 drivers/irqchip/irq-gic-v3-its.c  |  51 ++-
 4 files changed, 694 insertions(+), 20 deletions(-)
 create mode 100644 drivers/irqchip/irq-gic-v3-dlpi.c

Comments

Marc Zyngier July 11, 2021, 11:09 a.m. UTC | #1
On Thu, 08 Jul 2021 20:36:51 +0100,
Sunil Muthuswamy <sunilmut@microsoft.com> wrote:
> 
> The current GIC v3 driver doesn't support Direct LPI when there is
> no ITS. The spec architecturally supports Direct LPI without an ITS.
> This patch introduces an IRQ domain and chip to manage Direct LPI
> when there is no ITS.
> 
> Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
> ---
>  drivers/irqchip/Makefile          |   2 +-
>  drivers/irqchip/irq-gic-common.h  |  22 +
>  drivers/irqchip/irq-gic-v3-dlpi.c | 639 ++++++++++++++++++++++++++++++
>  drivers/irqchip/irq-gic-v3-its.c  |  51 ++-
>  4 files changed, 694 insertions(+), 20 deletions(-)
>  create mode 100644 drivers/irqchip/irq-gic-v3-dlpi.c
> 
> diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
> index f88cbf36a9d2..c6ea9620161b 100644
> --- a/drivers/irqchip/Makefile
> +++ b/drivers/irqchip/Makefile
> @@ -32,7 +32,7 @@ obj-$(CONFIG_ARM_GIC_PM)		+= irq-gic-pm.o
>  obj-$(CONFIG_ARCH_REALVIEW)		+= irq-gic-realview.o
>  obj-$(CONFIG_ARM_GIC_V2M)		+= irq-gic-v2m.o
>  obj-$(CONFIG_ARM_GIC_V3)		+= irq-gic-v3.o irq-gic-v3-mbi.o irq-gic-common.o
> -obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v3-its-platform-msi.o irq-gic-v4.o
> +obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v3-its-platform-msi.o irq-gic-v4.o irq-gic-v3-dlpi.o
>  obj-$(CONFIG_ARM_GIC_V3_ITS_PCI)	+= irq-gic-v3-its-pci-msi.o
>  obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC)	+= irq-gic-v3-its-fsl-mc-msi.o
>  obj-$(CONFIG_PARTITION_PERCPU)		+= irq-partition-percpu.o
> diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h
> index ccba8b0fe0f5..21e4e8f6ba70 100644
> --- a/drivers/irqchip/irq-gic-common.h
> +++ b/drivers/irqchip/irq-gic-common.h
> @@ -30,4 +30,26 @@ void gic_enable_of_quirks(const struct device_node *np,
>  
>  void gic_set_kvm_info(const struct gic_kvm_info *info);
>  
> +/* LPI related functionality */
> +/*
> + * TODO: Ideally, I think these should be moved to a different file
> + *	 such as irq-gic-v3-lpi-common.h/.c. But, keeping it here
> + *	 for now to get comments from the RFC.
> + */
> +DECLARE_PER_CPU(struct cpu_lpi_count, cpu_lpi_count);
> +
> +__maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu);
> +void its_inc_lpi_count(struct irq_data *d, int cpu);
> +void its_dec_lpi_count(struct irq_data *d, int cpu);
> +unsigned int cpumask_pick_least_loaded(struct irq_data *d,
> +				       const struct cpumask *cpu_mask);

Ideally, you shouldn't use this at all. At least not until we have
figured out what you are trying to achieve. At best, this is premature
optimisation.

> +int its_irq_gic_domain_alloc(struct irq_domain *domain,
> +			     unsigned int virq,
> +			     irq_hw_number_t hwirq);
> +unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids);
> +void its_lpi_free(unsigned long *bitmap, u32 base, u32 nr_ids);

And certainly not use these *ever*.

> +
> +struct rdists;
> +int direct_lpi_init(struct irq_domain *parent, struct rdists *rdists);
> +
>  #endif /* _IRQ_GIC_COMMON_H */
> diff --git a/drivers/irqchip/irq-gic-v3-dlpi.c b/drivers/irqchip/irq-gic-v3-dlpi.c
> new file mode 100644
> index 000000000000..722a0630fced
> --- /dev/null
> +++ b/drivers/irqchip/irq-gic-v3-dlpi.c
> @@ -0,0 +1,639 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) Microsoft Corporation.
> + * Author: Sunil Muthuswamy <sunilmut@microsoft.com>
> + *
> + * This file implements an IRQ domain and chip to handle Direct LPI
> + * when there is no ITS, for GIC v3.
> + */
> +
> +#include <linux/acpi_iort.h>
> +#include <linux/bitfield.h>
> +#include <linux/bitmap.h>
> +#include <linux/cpu.h>
> +#include <linux/interrupt.h>
> +#include <linux/irqdomain.h>
> +#include <linux/list.h>
> +#include <linux/msi.h>
> +#include <linux/irq.h>
> +#include <linux/percpu.h>
> +#include <linux/dma-iommu.h>
> +
> +#include <linux/irqchip.h>
> +#include <linux/irqchip/arm-gic-v3.h>
> +#include <linux/irqchip/arm-gic-v4.h>
> +
> +#include "irq-gic-common.h"
> +
> +static struct rdists *gic_rdists;
> +
> +#define gic_data_rdist_cpu(cpu)		(per_cpu_ptr(gic_rdists->rdist, cpu))
> +
> +#define RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING	(1 << 0)
> +
> +/*
> + * Structure that holds most of the infrastructure needed to support
> + * DirectLPI without an ITS.
> + *
> + * dev_alloc_lock has to be taken for device allocations, while the
> + * spinlock must be taken to parse data structures such as the device
> + * list.
> + */
> +
> +struct direct_lpi {
> +	raw_spinlock_t		lock;
> +	struct mutex		dev_alloc_lock;
> +	struct list_head	entry;
> +	struct fwnode_handle	*fwnode_handle;
> +	struct list_head	device_list;
> +	u64			flags;
> +	unsigned int		msi_domain_flags;
> +};
> +
> +struct event_lpi_map {
> +	unsigned long		*lpi_map;
> +	u16			*col_map;
> +	irq_hw_number_t		lpi_base;
> +	int			nr_lpis;
> +};

DirectLPI has no notion of event, collection, or anything like
this. It deals with LPIs, and that's it.

> +
> +struct direct_lpi_device {
> +	struct list_head	entry;
> +	struct direct_lpi	*dlpi;
> +	struct event_lpi_map	event_map;
> +	u32			device_id;
> +	bool			shared;
> +};

Same this here. DirectLPI has no notion of DeviceID.

> +
> +static int dlpi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
> +				unsigned int nr_irqs, void *args);
> +
> +static void dlpi_irq_domain_free(struct irq_domain *domain, unsigned int virq,
> +				unsigned int nr_irqs);
> +
> +static int dlpi_irq_domain_activate(struct irq_domain *domain,
> +				   struct irq_data *d, bool reserve);
> +
> +static void dlpi_irq_domain_deactivate(struct irq_domain *domain,
> +				      struct irq_data *d);
> +
> +static const struct irq_domain_ops dlpi_domain_ops = {
> +	.alloc			= dlpi_irq_domain_alloc,
> +	.free			= dlpi_irq_domain_free,
> +	.activate		= dlpi_irq_domain_activate,
> +	.deactivate		= dlpi_irq_domain_deactivate,
> +};
> +
> +static int dlpi_msi_prepare(struct irq_domain *domain, struct device *dev,
> +			   int nvec, msi_alloc_info_t *info);
> +
> +static struct msi_domain_ops dlpi_msi_domain_ops = {
> +	.msi_prepare	= dlpi_msi_prepare,
> +};
> +
> +static int dlpi_init_domain(struct fwnode_handle *handle,
> +			      struct irq_domain *parent_domain,
> +			      struct direct_lpi *dlpi)
> +{
> +	struct irq_domain *inner_domain;
> +	struct msi_domain_info *info;
> +
> +	info = kzalloc(sizeof(*info), GFP_KERNEL);
> +	if (!info)
> +		return -ENOMEM;
> +
> +	inner_domain = irq_domain_create_tree(handle, &dlpi_domain_ops, NULL);
> +	if (!inner_domain) {
> +		kfree(info);
> +		return -ENOMEM;
> +	}
> +
> +	inner_domain->parent = parent_domain;
> +	irq_domain_update_bus_token(inner_domain, DOMAIN_BUS_NEXUS);
> +	inner_domain->flags |= dlpi->msi_domain_flags;
> +	info->ops = &dlpi_msi_domain_ops;
> +	info->data = dlpi;
> +	inner_domain->host_data = info;
> +
> +	return 0;
> +}
> +
> +int __init direct_lpi_init(struct irq_domain *parent, struct rdists *rdists)
> +{
> +	struct fwnode_handle *fwnode;
> +	int err;
> +	struct direct_lpi *dlpi = NULL;
> +
> +	gic_rdists = rdists;
> +	fwnode = irq_domain_alloc_named_fwnode("Direct LPI");
> +	if (!fwnode)
> +		return -ENOMEM;
> +
> +	/*
> +	 * Registering with the iort allows other services to query the
> +	 * fwnode. But, the registration requires an ITS ID and base address,
> +	 * which does not apply here. So, probably best to just export the
> +	 * fwnode handle for other services. Keeping it here for comments
> +	 * from RFC submission.
> +	 */
> +	err = iort_register_domain_token(0, 0, fwnode);
> +	if (err)
> +		goto out_free_fwnode;

Please remove this. IORT has no notion of DirectLPI, and doesn't
describe it at all.

> +
> +	dlpi = kzalloc(sizeof(*dlpi), GFP_KERNEL);
> +	if (!dlpi) {
> +		err = -ENOMEM;
> +		goto out_unregister_fwnode;
> +	}
> +
> +	raw_spin_lock_init(&dlpi->lock);
> +	mutex_init(&dlpi->dev_alloc_lock);
> +	INIT_LIST_HEAD(&dlpi->entry);
> +	INIT_LIST_HEAD(&dlpi->device_list);
> +	dlpi->msi_domain_flags = IRQ_DOMAIN_FLAG_MSI_REMAP;

Remap what? There is nothing to remap at all, since you don't have an
ITS! From a security/isolation perspective, DirectLPI is a gaping hole.

> +	err = dlpi_init_domain(fwnode, parent, dlpi);
> +	if (err)
> +		goto out_unregister_fwnode;
> +
> +	return 0;
> +
> +out_unregister_fwnode:
> +	iort_deregister_domain_token(0);
> +out_free_fwnode:
> +	irq_domain_free_fwnode(fwnode);
> +	kfree(dlpi);
> +
> +	return err;
> +
> +}
> +
> +static inline u32 dlpi_get_event_id(struct irq_data *d)
> +{
> +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> +
> +	return d->hwirq - dlpi_dev->event_map.lpi_base;

This is more and more puzzling. DirectLPI *does not* perform any
device isolation, so there is no per-device interrupt namespace. You
are just copy-pasting bits of the ITS code without understanding the
architecture.

> +}
> +
> +static int dlpi_irq_to_cpuid(struct irq_data *d, unsigned long *flags)
> +{
> +	int cpu;
> +
> +	/* Physical LPIs are already locked via the irq_desc lock */
> +	struct direct_lpi_device *dlpi_dev =
> +		irq_data_get_irq_chip_data(d);
> +	cpu = dlpi_dev->event_map.col_map[dlpi_get_event_id(d)];
> +	/* Keep GCC quiet... */
> +	*flags = 0;

More pointless copy-paste.

> +
> +	return cpu;
> +}
> +
> +/*
> + * irqchip functions - assumes MSI, mostly.
> + */
> +//TODO: Maybe its::lpi_write_config can call into this routine
> +static void lpi_write_config(struct irq_data *d, u8 clr, u8 set)
> +{
> +	irq_hw_number_t hwirq;
> +	void *va;
> +	u8 *cfg;
> +
> +	va = gic_rdists->prop_table_va;
> +	hwirq = d->hwirq;
> +	cfg = va + hwirq - 8192;
> +	*cfg &= ~clr;
> +	*cfg |= set | LPI_PROP_GROUP1;
> +
> +	/*
> +	 * Make the above write visible to the redistributors.
> +	 * And yes, we're flushing exactly: One. Single. Byte.
> +	 * Humpf...
> +	 */
> +	if (gic_rdists->flags & RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING)
> +		gic_flush_dcache_to_poc(cfg, sizeof(*cfg));
> +	else
> +		dsb(ishst);
> +}
> +
> +static void wait_for_syncr(void __iomem *rdbase)
> +{
> +	while (readl_relaxed(rdbase + GICR_SYNCR) & 1)
> +		cpu_relax();
> +}
> +
> +static void dlpi_direct_lpi_inv(struct irq_data *d)
> +{
> +	void __iomem *rdbase;
> +	unsigned long flags;
> +	u64 val;
> +	int cpu;
> +
> +	val = d->hwirq;
> +
> +	/* Target the redistributor this LPI is currently routed to */
> +	cpu = dlpi_irq_to_cpuid(d, &flags);
> +	raw_spin_lock(&gic_data_rdist_cpu(cpu)->rd_lock);
> +	rdbase = per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base;
> +	gic_write_lpir(val, rdbase + GICR_INVLPIR);
> +	wait_for_syncr(rdbase);
> +	raw_spin_unlock(&gic_data_rdist_cpu(cpu)->rd_lock);
> +}
> +
> +static int dlpi_alloc_device_irq(struct direct_lpi_device *dlpi_dev,
> +				 int nvecs, irq_hw_number_t *hwirq)
> +{
> +	int idx;
> +
> +	/* Find a free LPI region in lpi_map and allocate them. */
> +	idx = bitmap_find_free_region(dlpi_dev->event_map.lpi_map,
> +				      dlpi_dev->event_map.nr_lpis,
> +				      get_count_order(nvecs));
> +	if (idx < 0)
> +		return -ENOSPC;
> +
> +	*hwirq = dlpi_dev->event_map.lpi_base + idx;
> +
> +	return 0;

Totally pointless, again. Why do you maintain a per-device allocation
given that there is no per-device HW tracking?

> +}
> +
> +static void no_lpi_update_config(struct irq_data *d, u8 clr, u8 set)
> +{
> +	lpi_write_config(d, clr, set);
> +	dlpi_direct_lpi_inv(d);
> +}
> +
> +static void dlpi_unmask_irq(struct irq_data *d)
> +{
> +	no_lpi_update_config(d, 0, LPI_PROP_ENABLED);
> +}
> +
> +static void dlpi_mask_irq(struct irq_data *d)
> +{
> +	no_lpi_update_config(d, LPI_PROP_ENABLED, 0);
> +}
> +
> +static int dlpi_select_cpu(struct irq_data *d,
> +			   const struct cpumask *aff_mask)
> +{
> +	cpumask_var_t tmpmask;
> +	int cpu;
> +
> +	if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
> +		return -ENOMEM;
> +
> +	/* There is no NUMA node affliation */

Why?

> +	if (!irqd_affinity_is_managed(d)) {
> +		/* Try the intersection of the affinity and online masks */
> +		cpumask_and(tmpmask, aff_mask, cpu_online_mask);
> +
> +		/* If that doesn't fly, the online mask is the last resort */
> +		if (cpumask_empty(tmpmask))
> +			cpumask_copy(tmpmask, cpu_online_mask);
> +
> +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> +	} else {
> +		cpumask_and(tmpmask, irq_data_get_affinity_mask(d), cpu_online_mask);
> +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> +	}
> +
> +	free_cpumask_var(tmpmask);
> +	pr_debug("IRQ%d -> %*pbl CPU%d\n", d->irq, cpumask_pr_args(aff_mask), cpu);
> +
> +	return cpu;
> +}
> +
> +static int dlpi_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
> +			     bool force)
> +{
> +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> +	u32 id = dlpi_get_event_id(d);
> +	int cpu, prev_cpu;
> +
> +	/*
> +	 * A forwarded interrupt should use irq_set_vcpu_affinity. Anyways,
> +	 * vcpu is not supported for Direct LPI, as it requires an ITS.
> +	 */
> +	if (irqd_is_forwarded_to_vcpu(d))
> +		return -EINVAL;
> +
> +	prev_cpu = dlpi_dev->event_map.col_map[id];
> +	its_dec_lpi_count(d, prev_cpu);
> +
> +	if (!force)
> +		cpu = dlpi_select_cpu(d, mask_val);
> +	else
> +		cpu = cpumask_pick_least_loaded(d, mask_val);
> +
> +	if (cpu < 0 || cpu >= nr_cpu_ids)
> +		goto err;
> +
> +	/* don't set the affinity when the target cpu is same as current one */
> +	if (cpu != prev_cpu) {
> +		dlpi_dev->event_map.col_map[id] = cpu;
> +		irq_data_update_effective_affinity(d, cpumask_of(cpu));
> +	}
> +
> +	its_inc_lpi_count(d, cpu);
> +
> +	return IRQ_SET_MASK_OK_DONE;
> +
> +err:
> +	its_inc_lpi_count(d, prev_cpu);
> +	return -EINVAL;
> +}
> +
> +static u64 dlpi_get_msi_base(struct irq_data *d)
> +{
> +	u64 addr;
> +	int cpu;
> +	unsigned long flags;
> +
> +	cpu = dlpi_irq_to_cpuid(d, &flags);
> +	addr = (u64)(per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base +
> +		     GICR_SETLPIR);
> +
> +	return addr;
> +}
> +
> +/*
> + * As per the spec, MSI address is the address of the target processor's
> + * GICR_SETLPIR location.
> + */
> +static void dlpi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
> +{
> +	u64 addr;
> +
> +	addr = dlpi_get_msi_base(d);
> +
> +	msg->address_lo		= lower_32_bits(addr);
> +	msg->address_hi		= upper_32_bits(addr);

Oh, that's is going to work so well with MultiMSI...

> +	msg->data		= dlpi_get_event_id(d);
> +
> +	iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg);
> +}
> +
> +static int dlpi_irq_set_irqchip_state(struct irq_data *d,
> +				     enum irqchip_irq_state which,
> +				     bool state)
> +{
> +	if (which != IRQCHIP_STATE_PENDING)
> +		return -EINVAL;
> +
> +	return 0;

This is a joke, right?

> +}
> +
> +static int dlpi_irq_retrigger(struct irq_data *d)
> +{
> +	return !dlpi_irq_set_irqchip_state(d, IRQCHIP_STATE_PENDING, true);
> +}
> +
> +static int dlpi_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
> +{
> +	/* vCPU support requires an ITS */
> +	return -EINVAL;

Then why are you providing this callback the first place?

> +}
> +
> +static struct irq_chip dlpi_irq_chip = {
> +	.name			= "Direct LPI",
> +	.irq_mask		= dlpi_mask_irq,
> +	.irq_unmask		= dlpi_unmask_irq,
> +	.irq_eoi		= irq_chip_eoi_parent,
> +	.irq_set_affinity	= dlpi_set_affinity,
> +	.irq_compose_msi_msg	= dlpi_irq_compose_msi_msg,
> +	.irq_set_irqchip_state	= dlpi_irq_set_irqchip_state,
> +	.irq_retrigger		= dlpi_irq_retrigger,
> +	.irq_set_vcpu_affinity	= dlpi_irq_set_vcpu_affinity,
> +};
> +
> +static int dlpi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
> +				 unsigned int nr_irqs, void *args)
> +{
> +	msi_alloc_info_t *info = args;
> +	struct direct_lpi_device *dlpi_dev = info->scratchpad[0].ptr;
> +	struct irq_data *irqd;
> +	irq_hw_number_t hwirq;
> +	int err;
> +	int i;
> +
> +	err = dlpi_alloc_device_irq(dlpi_dev, nr_irqs, &hwirq);
> +	if (err)
> +		return err;
> +
> +	/*
> +	 * TODO: Need to call 'iommu_dma_prepare_msi' to prepare for DMA,
> +	 *	 but, that requires an MSI address. And, for Direct LPI
> +	 *	 the MSI address comes from the Redistributor from
> +	 *	 'GICR_SETLPIR', which is per CPU and that is not known
> +	 *	 at the moment. Not sure what is the best way to handle
> +	 *	 this.
> +	 */
> +
> +	/*
> +	err = iommu_dma_prepare_msi(info->desc, its->get_msi_base(its_dev));
> +	if (err)
> +		return err;
> +	*/

What is the point of this? IORT cannot describe the routing of MSIs to
redistributors, only to an ITS.

> +
> +	for (i = 0; i < nr_irqs; i++) {
> +		err = its_irq_gic_domain_alloc(domain, virq + i, hwirq + i);
> +		if (err)
> +			return err;
> +
> +		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
> +					      &dlpi_irq_chip, dlpi_dev);
> +		irqd = irq_get_irq_data(virq + i);
> +		irqd_set_single_target(irqd);
> +		irqd_set_affinity_on_activate(irqd);
> +		pr_debug("ID:%d pID:%d vID:%d\n",
> +			 (int)(hwirq + i - dlpi_dev->event_map.lpi_base),
> +			 (int)(hwirq + i), virq + i);
> +	}
> +
> +	return 0;
> +}
> +
> +static void dlpi_free_device(struct direct_lpi_device *dlpi_dev)
> +{
> +	unsigned long flags;
> +
> +	raw_spin_lock_irqsave(&dlpi_dev->dlpi->lock, flags);
> +	list_del(&dlpi_dev->entry);
> +	raw_spin_unlock_irqrestore(&dlpi_dev->dlpi->lock, flags);
> +	kfree(dlpi_dev->event_map.col_map);
> +	kfree(dlpi_dev->event_map.lpi_map);
> +	kfree(dlpi_dev);
> +}
> +
> +static void dlpi_irq_domain_free(struct irq_domain *domain, unsigned int virq,
> +				unsigned int nr_irqs)
> +{
> +	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
> +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> +	int i;
> +	struct direct_lpi *dlpi = dlpi_dev->dlpi;
> +
> +	bitmap_release_region(dlpi_dev->event_map.lpi_map,
> +			      dlpi_get_event_id(irq_domain_get_irq_data(domain, virq)),
> +			      get_count_order(nr_irqs));
> +
> +	for (i = 0; i < nr_irqs; i++) {
> +		struct irq_data *data = irq_domain_get_irq_data(domain,
> +								virq + i);
> +		/* Nuke the entry in the domain */
> +		irq_domain_reset_irq_data(data);
> +	}
> +
> +	mutex_lock(&dlpi->dev_alloc_lock);
> +
> +	/*
> +	 * If all interrupts have been freed, start mopping the
> +	 * floor. This is conditionned on the device not being shared.
> +	 */
> +	if (!dlpi_dev->shared &&
> +	    bitmap_empty(dlpi_dev->event_map.lpi_map,
> +			 dlpi_dev->event_map.nr_lpis)) {
> +		its_lpi_free(dlpi_dev->event_map.lpi_map,
> +			     dlpi_dev->event_map.lpi_base,
> +			     dlpi_dev->event_map.nr_lpis);
> +
> +		dlpi_free_device(dlpi_dev);
> +	}
> +
> +	mutex_unlock(&dlpi->dev_alloc_lock);
> +	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
> +}
> +
> +static int dlpi_irq_domain_activate(struct irq_domain *domain,
> +				   struct irq_data *d, bool reserve)
> +{
> +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> +	u32 event;
> +	int cpu;
> +
> +	event = dlpi_get_event_id(d);
> +	cpu = dlpi_select_cpu(d, cpu_online_mask);
> +	if (cpu < 0 || cpu >= nr_cpu_ids)
> +		return -EINVAL;
> +
> +	its_inc_lpi_count(d, cpu);
> +	dlpi_dev->event_map.col_map[event] = cpu;
> +	irq_data_update_effective_affinity(d, cpumask_of(cpu));
> +
> +	return 0;
> +}
> +
> +static void dlpi_irq_domain_deactivate(struct irq_domain *domain,
> +				      struct irq_data *d)
> +{
> +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> +	u32 event = dlpi_get_event_id(d);
> +
> +	its_dec_lpi_count(d, dlpi_dev->event_map.col_map[event]);
> +}
> +
> +static struct direct_lpi_device *dlpi_create_device(struct direct_lpi *dlpi,
> +					u32 dev_id, int nvecs, bool alloc_lpis)
> +{
> +	struct direct_lpi_device *dlpi_dev = NULL;
> +	unsigned long *lpi_map = NULL;
> +	u16 *col_map = NULL;
> +	int lpi_base;
> +	int nr_lpis;
> +	unsigned long flags;
> +
> +	if (WARN_ON(!is_power_of_2(nvecs)))
> +		nvecs = roundup_pow_of_two(nvecs);
> +
> +	dlpi_dev = kzalloc(sizeof(*dlpi_dev), GFP_KERNEL);
> +	if (!dlpi_dev)
> +		return NULL;
> +
> +	lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
> +	if (!lpi_map) {
> +		kfree(dlpi_dev);
> +		return NULL;
> +	}
> +
> +	col_map = kcalloc(nr_lpis, sizeof(*col_map), GFP_KERNEL);
> +	if (!col_map) {
> +		kfree(dlpi_dev);
> +		kfree(lpi_map);
> +		return NULL;
> +	}
> +
> +	dlpi_dev->dlpi = dlpi;
> +	dlpi_dev->event_map.lpi_map = lpi_map;
> +	dlpi_dev->event_map.col_map = col_map;
> +	dlpi_dev->event_map.lpi_base = lpi_base;
> +	dlpi_dev->event_map.nr_lpis = nr_lpis;
> +	dlpi_dev->device_id = dev_id;
> +
> +	raw_spin_lock_irqsave(&dlpi->lock, flags);
> +	list_add(&dlpi_dev->entry, &dlpi->device_list);
> +	raw_spin_unlock_irqrestore(&dlpi->lock, flags);
> +
> +	return dlpi_dev;
> +}
> +
> +static struct direct_lpi_device *dlpi_find_device(struct direct_lpi *dlpi, u32 dev_id)
> +{
> +	struct direct_lpi_device *dlpi_dev = NULL, *tmp;
> +	unsigned long flags;
> +
> +	raw_spin_lock_irqsave(&dlpi->lock, flags);
> +	list_for_each_entry(tmp, &dlpi->device_list, entry) {
> +		if (tmp->device_id == dev_id) {
> +			dlpi_dev = tmp;
> +			break;
> +		}
> +	}
> +
> +	raw_spin_unlock_irqrestore(&dlpi->lock, flags);
> +
> +	return dlpi_dev;
> +}
> +
> +static int dlpi_msi_prepare(struct irq_domain *domain, struct device *dev,
> +			   int nvec, msi_alloc_info_t *info)
> +{
> +	struct direct_lpi_device *dlpi_dev;
> +	struct direct_lpi *dlpi;
> +	struct msi_domain_info *msi_info;
> +	u32 dev_id;
> +	int err = 0;
> +
> +	/*
> +	 * We ignore "dev" entirely, and rely on the dev_id that has
> +	 * been passed via the scratchpad. This limits this domain's
> +	 * usefulness to upper layers that definitely know that they
> +	 * are built on top of the ITS.
> +	 */

This is complete nonsense. You are just copying bits of the ITS code
and randomly reorganising it. You are trying to reuse the PCI-specific
front-end of the ITS code, not even realising that it *cannot* work.
For example, how can MultiMSI be supported?

> +	dev_id = info->scratchpad[0].ul;
> +	msi_info = msi_get_domain_info(domain);
> +	dlpi = msi_info->data;
> +
> +	mutex_lock(&dlpi->dev_alloc_lock);
> +	dlpi_dev = dlpi_find_device(dlpi, dev_id);
> +	if (dlpi_dev) {
> +		/*
> +		 * We already have seen this ID, probably through
> +		 * another alias (PCI bridge of some sort). No need to
> +		 * create the device.
> +		 */
> +		dlpi_dev->shared = true;
> +		pr_debug("Reusing ITT for devID %x\n", dev_id);
> +		goto out;
> +	}
> +
> +	dlpi_dev = dlpi_create_device(dlpi, dev_id, nvec, true);
> +	if (!dlpi_dev) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +
> +out:
> +	mutex_unlock(&dlpi->dev_alloc_lock);
> +	info->scratchpad[0].ptr = dlpi_dev;
> +
> +	return err;
> +}
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index ba39668c3e08..aa101dfcbbfc 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -178,7 +178,7 @@ struct cpu_lpi_count {
>  	atomic_t	unmanaged;
>  };
>  
> -static DEFINE_PER_CPU(struct cpu_lpi_count, cpu_lpi_count);
> +DEFINE_PER_CPU(struct cpu_lpi_count, cpu_lpi_count);
>  
>  static LIST_HEAD(its_nodes);
>  static DEFINE_RAW_SPINLOCK(its_lock);
> @@ -1521,7 +1521,7 @@ static void its_unmask_irq(struct irq_data *d)
>  	lpi_update_config(d, 0, LPI_PROP_ENABLED);
>  }
>  
> -static __maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
> +__maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
>  {
>  	if (irqd_affinity_is_managed(d))
>  		return atomic_read(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed);
> @@ -1529,7 +1529,7 @@ static __maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
>  	return atomic_read(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
>  }
>  
> -static void its_inc_lpi_count(struct irq_data *d, int cpu)
> +void its_inc_lpi_count(struct irq_data *d, int cpu)
>  {
>  	if (irqd_affinity_is_managed(d))
>  		atomic_inc(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed);
> @@ -1537,7 +1537,7 @@ static void its_inc_lpi_count(struct irq_data *d, int cpu)
>  		atomic_inc(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
>  }
>  
> -static void its_dec_lpi_count(struct irq_data *d, int cpu)
> +void its_dec_lpi_count(struct irq_data *d, int cpu)
>  {
>  	if (irqd_affinity_is_managed(d))
>  		atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed);
> @@ -1545,7 +1545,7 @@ static void its_dec_lpi_count(struct irq_data *d, int cpu)
>  		atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
>  }
>  
> -static unsigned int cpumask_pick_least_loaded(struct irq_data *d,
> +unsigned int cpumask_pick_least_loaded(struct irq_data *d,
>  					      const struct cpumask *cpu_mask)
>  {
>  	unsigned int cpu = nr_cpu_ids, tmp;
> @@ -2121,7 +2121,7 @@ static int __init its_lpi_init(u32 id_bits)
>  	return err;
>  }
>  
> -static unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
> +unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
>  {
>  	unsigned long *bitmap = NULL;
>  	int err = 0;
> @@ -2153,7 +2153,7 @@ static unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
>  	return bitmap;
>  }
>  
> -static void its_lpi_free(unsigned long *bitmap, u32 base, u32 nr_ids)
> +void its_lpi_free(unsigned long *bitmap, u32 base, u32 nr_ids)
>  {
>  	WARN_ON(free_lpi_range(base, nr_ids));
>  	kfree(bitmap);
> @@ -3506,9 +3506,9 @@ static struct msi_domain_ops its_msi_domain_ops = {
>  	.msi_prepare	= its_msi_prepare,
>  };
>  
> -static int its_irq_gic_domain_alloc(struct irq_domain *domain,
> -				    unsigned int virq,
> -				    irq_hw_number_t hwirq)
> +int its_irq_gic_domain_alloc(struct irq_domain *domain,
> +			     unsigned int virq,
> +			     irq_hw_number_t hwirq)
>  {
>  	struct irq_fwspec fwspec;

There is no way you are messing with the ITS driver internals like
this. The only thing you *may* do is to have a common infrastructure
to allocate the RD tables, and only that. None of the ITS
infrastructure makes any sense for DirectLPI anyway.

Please rearchitect your code to be fully independent of the ITS, and
without any of the pointless abstractions that the ITS requires. This
should be much closer to GICv3-MBI than the ITS. Heck, you could even
use the MBI driver directly instead of trying to use DirectLPI, as it
gives you similar functionalities.

I also want to understand *how* you are going to plumb this into both
ACPI and DT, given that neither understand how to link a PCI endpoint
to a set of RDs.

	M.
Sunil Muthuswamy July 26, 2021, 3:33 p.m. UTC | #2
> On Thu, 08 Jul 2021 20:36:51 +0100,
> Sunil Muthuswamy <sunilmut@microsoft.com> wrote:
> >
> > The current GIC v3 driver doesn't support Direct LPI when there is
> > no ITS. The spec architecturally supports Direct LPI without an ITS.
> > This patch introduces an IRQ domain and chip to manage Direct LPI
> > when there is no ITS.
> >
> > Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
> > ---
> >  drivers/irqchip/Makefile          |   2 +-
> >  drivers/irqchip/irq-gic-common.h  |  22 +
> >  drivers/irqchip/irq-gic-v3-dlpi.c | 639 ++++++++++++++++++++++++++++++
> >  drivers/irqchip/irq-gic-v3-its.c  |  51 ++-
> >  4 files changed, 694 insertions(+), 20 deletions(-)
> >  create mode 100644 drivers/irqchip/irq-gic-v3-dlpi.c
> >
> > diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
> > index f88cbf36a9d2..c6ea9620161b 100644
> > --- a/drivers/irqchip/Makefile
> > +++ b/drivers/irqchip/Makefile
> > @@ -32,7 +32,7 @@ obj-$(CONFIG_ARM_GIC_PM)		+= irq-gic-pm.o
> >  obj-$(CONFIG_ARCH_REALVIEW)		+= irq-gic-realview.o
> >  obj-$(CONFIG_ARM_GIC_V2M)		+= irq-gic-v2m.o
> >  obj-$(CONFIG_ARM_GIC_V3)		+= irq-gic-v3.o irq-gic-v3-mbi.o irq-gic-common.o
> > -obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v3-its-platform-msi.o irq-gic-v4.o
> > +obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v3-its-platform-msi.o irq-gic-v4.o irq-gic-v3-dlpi.o
> >  obj-$(CONFIG_ARM_GIC_V3_ITS_PCI)	+= irq-gic-v3-its-pci-msi.o
> >  obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC)	+= irq-gic-v3-its-fsl-mc-msi.o
> >  obj-$(CONFIG_PARTITION_PERCPU)		+= irq-partition-percpu.o
> > diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h
> > index ccba8b0fe0f5..21e4e8f6ba70 100644
> > --- a/drivers/irqchip/irq-gic-common.h
> > +++ b/drivers/irqchip/irq-gic-common.h
> > @@ -30,4 +30,26 @@ void gic_enable_of_quirks(const struct device_node *np,
> >
> >  void gic_set_kvm_info(const struct gic_kvm_info *info);
> >
> > +/* LPI related functionality */
> > +/*
> > + * TODO: Ideally, I think these should be moved to a different file
> > + *	 such as irq-gic-v3-lpi-common.h/.c. But, keeping it here
> > + *	 for now to get comments from the RFC.
> > + */
> > +DECLARE_PER_CPU(struct cpu_lpi_count, cpu_lpi_count);
> > +
> > +__maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu);
> > +void its_inc_lpi_count(struct irq_data *d, int cpu);
> > +void its_dec_lpi_count(struct irq_data *d, int cpu);
> > +unsigned int cpumask_pick_least_loaded(struct irq_data *d,
> > +				       const struct cpumask *cpu_mask);
> 
> Ideally, you shouldn't use this at all. At least not until we have
> figured out what you are trying to achieve. At best, this is premature
> optimisation.
> 
> > +int its_irq_gic_domain_alloc(struct irq_domain *domain,
> > +			     unsigned int virq,
> > +			     irq_hw_number_t hwirq);
> > +unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids);
> > +void its_lpi_free(unsigned long *bitmap, u32 base, u32 nr_ids);
> 
> And certainly not use these *ever*.
> 
> > +
> > +struct rdists;
> > +int direct_lpi_init(struct irq_domain *parent, struct rdists *rdists);
> > +
> >  #endif /* _IRQ_GIC_COMMON_H */
> > diff --git a/drivers/irqchip/irq-gic-v3-dlpi.c b/drivers/irqchip/irq-gic-v3-dlpi.c
> > new file mode 100644
> > index 000000000000..722a0630fced
> > --- /dev/null
> > +++ b/drivers/irqchip/irq-gic-v3-dlpi.c
> > @@ -0,0 +1,639 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (c) Microsoft Corporation.
> > + * Author: Sunil Muthuswamy <sunilmut@microsoft.com>
> > + *
> > + * This file implements an IRQ domain and chip to handle Direct LPI
> > + * when there is no ITS, for GIC v3.
> > + */
> > +
> > +#include <linux/acpi_iort.h>
> > +#include <linux/bitfield.h>
> > +#include <linux/bitmap.h>
> > +#include <linux/cpu.h>
> > +#include <linux/interrupt.h>
> > +#include <linux/irqdomain.h>
> > +#include <linux/list.h>
> > +#include <linux/msi.h>
> > +#include <linux/irq.h>
> > +#include <linux/percpu.h>
> > +#include <linux/dma-iommu.h>
> > +
> > +#include <linux/irqchip.h>
> > +#include <linux/irqchip/arm-gic-v3.h>
> > +#include <linux/irqchip/arm-gic-v4.h>
> > +
> > +#include "irq-gic-common.h"
> > +
> > +static struct rdists *gic_rdists;
> > +
> > +#define gic_data_rdist_cpu(cpu)		(per_cpu_ptr(gic_rdists->rdist, cpu))
> > +
> > +#define RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING	(1 << 0)
> > +
> > +/*
> > + * Structure that holds most of the infrastructure needed to support
> > + * DirectLPI without an ITS.
> > + *
> > + * dev_alloc_lock has to be taken for device allocations, while the
> > + * spinlock must be taken to parse data structures such as the device
> > + * list.
> > + */
> > +
> > +struct direct_lpi {
> > +	raw_spinlock_t		lock;
> > +	struct mutex		dev_alloc_lock;
> > +	struct list_head	entry;
> > +	struct fwnode_handle	*fwnode_handle;
> > +	struct list_head	device_list;
> > +	u64			flags;
> > +	unsigned int		msi_domain_flags;
> > +};
> > +
> > +struct event_lpi_map {
> > +	unsigned long		*lpi_map;
> > +	u16			*col_map;
> > +	irq_hw_number_t		lpi_base;
> > +	int			nr_lpis;
> > +};
> 
> DirectLPI has no notion of event, collection, or anything like
> this. It deals with LPIs, and that's it.
> 
> > +
> > +struct direct_lpi_device {
> > +	struct list_head	entry;
> > +	struct direct_lpi	*dlpi;
> > +	struct event_lpi_map	event_map;
> > +	u32			device_id;
> > +	bool			shared;
> > +};
> 
> Same this here. DirectLPI has no notion of DeviceID.
> 
> > +
> > +static int dlpi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
> > +				unsigned int nr_irqs, void *args);
> > +
> > +static void dlpi_irq_domain_free(struct irq_domain *domain, unsigned int virq,
> > +				unsigned int nr_irqs);
> > +
> > +static int dlpi_irq_domain_activate(struct irq_domain *domain,
> > +				   struct irq_data *d, bool reserve);
> > +
> > +static void dlpi_irq_domain_deactivate(struct irq_domain *domain,
> > +				      struct irq_data *d);
> > +
> > +static const struct irq_domain_ops dlpi_domain_ops = {
> > +	.alloc			= dlpi_irq_domain_alloc,
> > +	.free			= dlpi_irq_domain_free,
> > +	.activate		= dlpi_irq_domain_activate,
> > +	.deactivate		= dlpi_irq_domain_deactivate,
> > +};
> > +
> > +static int dlpi_msi_prepare(struct irq_domain *domain, struct device *dev,
> > +			   int nvec, msi_alloc_info_t *info);
> > +
> > +static struct msi_domain_ops dlpi_msi_domain_ops = {
> > +	.msi_prepare	= dlpi_msi_prepare,
> > +};
> > +
> > +static int dlpi_init_domain(struct fwnode_handle *handle,
> > +			      struct irq_domain *parent_domain,
> > +			      struct direct_lpi *dlpi)
> > +{
> > +	struct irq_domain *inner_domain;
> > +	struct msi_domain_info *info;
> > +
> > +	info = kzalloc(sizeof(*info), GFP_KERNEL);
> > +	if (!info)
> > +		return -ENOMEM;
> > +
> > +	inner_domain = irq_domain_create_tree(handle, &dlpi_domain_ops, NULL);
> > +	if (!inner_domain) {
> > +		kfree(info);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	inner_domain->parent = parent_domain;
> > +	irq_domain_update_bus_token(inner_domain, DOMAIN_BUS_NEXUS);
> > +	inner_domain->flags |= dlpi->msi_domain_flags;
> > +	info->ops = &dlpi_msi_domain_ops;
> > +	info->data = dlpi;
> > +	inner_domain->host_data = info;
> > +
> > +	return 0;
> > +}
> > +
> > +int __init direct_lpi_init(struct irq_domain *parent, struct rdists *rdists)
> > +{
> > +	struct fwnode_handle *fwnode;
> > +	int err;
> > +	struct direct_lpi *dlpi = NULL;
> > +
> > +	gic_rdists = rdists;
> > +	fwnode = irq_domain_alloc_named_fwnode("Direct LPI");
> > +	if (!fwnode)
> > +		return -ENOMEM;
> > +
> > +	/*
> > +	 * Registering with the iort allows other services to query the
> > +	 * fwnode. But, the registration requires an ITS ID and base address,
> > +	 * which does not apply here. So, probably best to just export the
> > +	 * fwnode handle for other services. Keeping it here for comments
> > +	 * from RFC submission.
> > +	 */
> > +	err = iort_register_domain_token(0, 0, fwnode);
> > +	if (err)
> > +		goto out_free_fwnode;
> 
> Please remove this. IORT has no notion of DirectLPI, and doesn't
> describe it at all.
> 
> > +
> > +	dlpi = kzalloc(sizeof(*dlpi), GFP_KERNEL);
> > +	if (!dlpi) {
> > +		err = -ENOMEM;
> > +		goto out_unregister_fwnode;
> > +	}
> > +
> > +	raw_spin_lock_init(&dlpi->lock);
> > +	mutex_init(&dlpi->dev_alloc_lock);
> > +	INIT_LIST_HEAD(&dlpi->entry);
> > +	INIT_LIST_HEAD(&dlpi->device_list);
> > +	dlpi->msi_domain_flags = IRQ_DOMAIN_FLAG_MSI_REMAP;
> 
> Remap what? There is nothing to remap at all, since you don't have an
> ITS! From a security/isolation perspective, DirectLPI is a gaping hole.
> 
> > +	err = dlpi_init_domain(fwnode, parent, dlpi);
> > +	if (err)
> > +		goto out_unregister_fwnode;
> > +
> > +	return 0;
> > +
> > +out_unregister_fwnode:
> > +	iort_deregister_domain_token(0);
> > +out_free_fwnode:
> > +	irq_domain_free_fwnode(fwnode);
> > +	kfree(dlpi);
> > +
> > +	return err;
> > +
> > +}
> > +
> > +static inline u32 dlpi_get_event_id(struct irq_data *d)
> > +{
> > +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> > +
> > +	return d->hwirq - dlpi_dev->event_map.lpi_base;
> 
> This is more and more puzzling. DirectLPI *does not* perform any
> device isolation, so there is no per-device interrupt namespace. You
> are just copy-pasting bits of the ITS code without understanding the
> architecture.
> 
> > +}
> > +
> > +static int dlpi_irq_to_cpuid(struct irq_data *d, unsigned long *flags)
> > +{
> > +	int cpu;
> > +
> > +	/* Physical LPIs are already locked via the irq_desc lock */
> > +	struct direct_lpi_device *dlpi_dev =
> > +		irq_data_get_irq_chip_data(d);
> > +	cpu = dlpi_dev->event_map.col_map[dlpi_get_event_id(d)];
> > +	/* Keep GCC quiet... */
> > +	*flags = 0;
> 
> More pointless copy-paste.
> 
> > +
> > +	return cpu;
> > +}
> > +
> > +/*
> > + * irqchip functions - assumes MSI, mostly.
> > + */
> > +//TODO: Maybe its::lpi_write_config can call into this routine
> > +static void lpi_write_config(struct irq_data *d, u8 clr, u8 set)
> > +{
> > +	irq_hw_number_t hwirq;
> > +	void *va;
> > +	u8 *cfg;
> > +
> > +	va = gic_rdists->prop_table_va;
> > +	hwirq = d->hwirq;
> > +	cfg = va + hwirq - 8192;
> > +	*cfg &= ~clr;
> > +	*cfg |= set | LPI_PROP_GROUP1;
> > +
> > +	/*
> > +	 * Make the above write visible to the redistributors.
> > +	 * And yes, we're flushing exactly: One. Single. Byte.
> > +	 * Humpf...
> > +	 */
> > +	if (gic_rdists->flags & RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING)
> > +		gic_flush_dcache_to_poc(cfg, sizeof(*cfg));
> > +	else
> > +		dsb(ishst);
> > +}
> > +
> > +static void wait_for_syncr(void __iomem *rdbase)
> > +{
> > +	while (readl_relaxed(rdbase + GICR_SYNCR) & 1)
> > +		cpu_relax();
> > +}
> > +
> > +static void dlpi_direct_lpi_inv(struct irq_data *d)
> > +{
> > +	void __iomem *rdbase;
> > +	unsigned long flags;
> > +	u64 val;
> > +	int cpu;
> > +
> > +	val = d->hwirq;
> > +
> > +	/* Target the redistributor this LPI is currently routed to */
> > +	cpu = dlpi_irq_to_cpuid(d, &flags);
> > +	raw_spin_lock(&gic_data_rdist_cpu(cpu)->rd_lock);
> > +	rdbase = per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base;
> > +	gic_write_lpir(val, rdbase + GICR_INVLPIR);
> > +	wait_for_syncr(rdbase);
> > +	raw_spin_unlock(&gic_data_rdist_cpu(cpu)->rd_lock);
> > +}
> > +
> > +static int dlpi_alloc_device_irq(struct direct_lpi_device *dlpi_dev,
> > +				 int nvecs, irq_hw_number_t *hwirq)
> > +{
> > +	int idx;
> > +
> > +	/* Find a free LPI region in lpi_map and allocate them. */
> > +	idx = bitmap_find_free_region(dlpi_dev->event_map.lpi_map,
> > +				      dlpi_dev->event_map.nr_lpis,
> > +				      get_count_order(nvecs));
> > +	if (idx < 0)
> > +		return -ENOSPC;
> > +
> > +	*hwirq = dlpi_dev->event_map.lpi_base + idx;
> > +
> > +	return 0;
> 
> Totally pointless, again. Why do you maintain a per-device allocation
> given that there is no per-device HW tracking?
> 
> > +}
> > +
> > +static void no_lpi_update_config(struct irq_data *d, u8 clr, u8 set)
> > +{
> > +	lpi_write_config(d, clr, set);
> > +	dlpi_direct_lpi_inv(d);
> > +}
> > +
> > +static void dlpi_unmask_irq(struct irq_data *d)
> > +{
> > +	no_lpi_update_config(d, 0, LPI_PROP_ENABLED);
> > +}
> > +
> > +static void dlpi_mask_irq(struct irq_data *d)
> > +{
> > +	no_lpi_update_config(d, LPI_PROP_ENABLED, 0);
> > +}
> > +
> > +static int dlpi_select_cpu(struct irq_data *d,
> > +			   const struct cpumask *aff_mask)
> > +{
> > +	cpumask_var_t tmpmask;
> > +	int cpu;
> > +
> > +	if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
> > +		return -ENOMEM;
> > +
> > +	/* There is no NUMA node affliation */
> 
> Why?
> 
> > +	if (!irqd_affinity_is_managed(d)) {
> > +		/* Try the intersection of the affinity and online masks */
> > +		cpumask_and(tmpmask, aff_mask, cpu_online_mask);
> > +
> > +		/* If that doesn't fly, the online mask is the last resort */
> > +		if (cpumask_empty(tmpmask))
> > +			cpumask_copy(tmpmask, cpu_online_mask);
> > +
> > +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> > +	} else {
> > +		cpumask_and(tmpmask, irq_data_get_affinity_mask(d), cpu_online_mask);
> > +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> > +	}
> > +
> > +	free_cpumask_var(tmpmask);
> > +	pr_debug("IRQ%d -> %*pbl CPU%d\n", d->irq, cpumask_pr_args(aff_mask), cpu);
> > +
> > +	return cpu;
> > +}
> > +
> > +static int dlpi_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
> > +			     bool force)
> > +{
> > +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> > +	u32 id = dlpi_get_event_id(d);
> > +	int cpu, prev_cpu;
> > +
> > +	/*
> > +	 * A forwarded interrupt should use irq_set_vcpu_affinity. Anyways,
> > +	 * vcpu is not supported for Direct LPI, as it requires an ITS.
> > +	 */
> > +	if (irqd_is_forwarded_to_vcpu(d))
> > +		return -EINVAL;
> > +
> > +	prev_cpu = dlpi_dev->event_map.col_map[id];
> > +	its_dec_lpi_count(d, prev_cpu);
> > +
> > +	if (!force)
> > +		cpu = dlpi_select_cpu(d, mask_val);
> > +	else
> > +		cpu = cpumask_pick_least_loaded(d, mask_val);
> > +
> > +	if (cpu < 0 || cpu >= nr_cpu_ids)
> > +		goto err;
> > +
> > +	/* don't set the affinity when the target cpu is same as current one */
> > +	if (cpu != prev_cpu) {
> > +		dlpi_dev->event_map.col_map[id] = cpu;
> > +		irq_data_update_effective_affinity(d, cpumask_of(cpu));
> > +	}
> > +
> > +	its_inc_lpi_count(d, cpu);
> > +
> > +	return IRQ_SET_MASK_OK_DONE;
> > +
> > +err:
> > +	its_inc_lpi_count(d, prev_cpu);
> > +	return -EINVAL;
> > +}
> > +
> > +static u64 dlpi_get_msi_base(struct irq_data *d)
> > +{
> > +	u64 addr;
> > +	int cpu;
> > +	unsigned long flags;
> > +
> > +	cpu = dlpi_irq_to_cpuid(d, &flags);
> > +	addr = (u64)(per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base +
> > +		     GICR_SETLPIR);
> > +
> > +	return addr;
> > +}
> > +
> > +/*
> > + * As per the spec, MSI address is the address of the target processor's
> > + * GICR_SETLPIR location.
> > + */
> > +static void dlpi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
> > +{
> > +	u64 addr;
> > +
> > +	addr = dlpi_get_msi_base(d);
> > +
> > +	msg->address_lo		= lower_32_bits(addr);
> > +	msg->address_hi		= upper_32_bits(addr);
> 
> Oh, that's is going to work so well with MultiMSI...
> 
> > +	msg->data		= dlpi_get_event_id(d);
> > +
> > +	iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg);
> > +}
> > +
> > +static int dlpi_irq_set_irqchip_state(struct irq_data *d,
> > +				     enum irqchip_irq_state which,
> > +				     bool state)
> > +{
> > +	if (which != IRQCHIP_STATE_PENDING)
> > +		return -EINVAL;
> > +
> > +	return 0;
> 
> This is a joke, right?
> 
> > +}
> > +
> > +static int dlpi_irq_retrigger(struct irq_data *d)
> > +{
> > +	return !dlpi_irq_set_irqchip_state(d, IRQCHIP_STATE_PENDING, true);
> > +}
> > +
> > +static int dlpi_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
> > +{
> > +	/* vCPU support requires an ITS */
> > +	return -EINVAL;
> 
> Then why are you providing this callback the first place?
> 
> > +}
> > +
> > +static struct irq_chip dlpi_irq_chip = {
> > +	.name			= "Direct LPI",
> > +	.irq_mask		= dlpi_mask_irq,
> > +	.irq_unmask		= dlpi_unmask_irq,
> > +	.irq_eoi		= irq_chip_eoi_parent,
> > +	.irq_set_affinity	= dlpi_set_affinity,
> > +	.irq_compose_msi_msg	= dlpi_irq_compose_msi_msg,
> > +	.irq_set_irqchip_state	= dlpi_irq_set_irqchip_state,
> > +	.irq_retrigger		= dlpi_irq_retrigger,
> > +	.irq_set_vcpu_affinity	= dlpi_irq_set_vcpu_affinity,
> > +};
> > +
> > +static int dlpi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
> > +				 unsigned int nr_irqs, void *args)
> > +{
> > +	msi_alloc_info_t *info = args;
> > +	struct direct_lpi_device *dlpi_dev = info->scratchpad[0].ptr;
> > +	struct irq_data *irqd;
> > +	irq_hw_number_t hwirq;
> > +	int err;
> > +	int i;
> > +
> > +	err = dlpi_alloc_device_irq(dlpi_dev, nr_irqs, &hwirq);
> > +	if (err)
> > +		return err;
> > +
> > +	/*
> > +	 * TODO: Need to call 'iommu_dma_prepare_msi' to prepare for DMA,
> > +	 *	 but, that requires an MSI address. And, for Direct LPI
> > +	 *	 the MSI address comes from the Redistributor from
> > +	 *	 'GICR_SETLPIR', which is per CPU and that is not known
> > +	 *	 at the moment. Not sure what is the best way to handle
> > +	 *	 this.
> > +	 */
> > +
> > +	/*
> > +	err = iommu_dma_prepare_msi(info->desc, its->get_msi_base(its_dev));
> > +	if (err)
> > +		return err;
> > +	*/
> 
> What is the point of this? IORT cannot describe the routing of MSIs to
> redistributors, only to an ITS.
> 
> > +
> > +	for (i = 0; i < nr_irqs; i++) {
> > +		err = its_irq_gic_domain_alloc(domain, virq + i, hwirq + i);
> > +		if (err)
> > +			return err;
> > +
> > +		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
> > +					      &dlpi_irq_chip, dlpi_dev);
> > +		irqd = irq_get_irq_data(virq + i);
> > +		irqd_set_single_target(irqd);
> > +		irqd_set_affinity_on_activate(irqd);
> > +		pr_debug("ID:%d pID:%d vID:%d\n",
> > +			 (int)(hwirq + i - dlpi_dev->event_map.lpi_base),
> > +			 (int)(hwirq + i), virq + i);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static void dlpi_free_device(struct direct_lpi_device *dlpi_dev)
> > +{
> > +	unsigned long flags;
> > +
> > +	raw_spin_lock_irqsave(&dlpi_dev->dlpi->lock, flags);
> > +	list_del(&dlpi_dev->entry);
> > +	raw_spin_unlock_irqrestore(&dlpi_dev->dlpi->lock, flags);
> > +	kfree(dlpi_dev->event_map.col_map);
> > +	kfree(dlpi_dev->event_map.lpi_map);
> > +	kfree(dlpi_dev);
> > +}
> > +
> > +static void dlpi_irq_domain_free(struct irq_domain *domain, unsigned int virq,
> > +				unsigned int nr_irqs)
> > +{
> > +	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
> > +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> > +	int i;
> > +	struct direct_lpi *dlpi = dlpi_dev->dlpi;
> > +
> > +	bitmap_release_region(dlpi_dev->event_map.lpi_map,
> > +			      dlpi_get_event_id(irq_domain_get_irq_data(domain, virq)),
> > +			      get_count_order(nr_irqs));
> > +
> > +	for (i = 0; i < nr_irqs; i++) {
> > +		struct irq_data *data = irq_domain_get_irq_data(domain,
> > +								virq + i);
> > +		/* Nuke the entry in the domain */
> > +		irq_domain_reset_irq_data(data);
> > +	}
> > +
> > +	mutex_lock(&dlpi->dev_alloc_lock);
> > +
> > +	/*
> > +	 * If all interrupts have been freed, start mopping the
> > +	 * floor. This is conditionned on the device not being shared.
> > +	 */
> > +	if (!dlpi_dev->shared &&
> > +	    bitmap_empty(dlpi_dev->event_map.lpi_map,
> > +			 dlpi_dev->event_map.nr_lpis)) {
> > +		its_lpi_free(dlpi_dev->event_map.lpi_map,
> > +			     dlpi_dev->event_map.lpi_base,
> > +			     dlpi_dev->event_map.nr_lpis);
> > +
> > +		dlpi_free_device(dlpi_dev);
> > +	}
> > +
> > +	mutex_unlock(&dlpi->dev_alloc_lock);
> > +	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
> > +}
> > +
> > +static int dlpi_irq_domain_activate(struct irq_domain *domain,
> > +				   struct irq_data *d, bool reserve)
> > +{
> > +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> > +	u32 event;
> > +	int cpu;
> > +
> > +	event = dlpi_get_event_id(d);
> > +	cpu = dlpi_select_cpu(d, cpu_online_mask);
> > +	if (cpu < 0 || cpu >= nr_cpu_ids)
> > +		return -EINVAL;
> > +
> > +	its_inc_lpi_count(d, cpu);
> > +	dlpi_dev->event_map.col_map[event] = cpu;
> > +	irq_data_update_effective_affinity(d, cpumask_of(cpu));
> > +
> > +	return 0;
> > +}
> > +
> > +static void dlpi_irq_domain_deactivate(struct irq_domain *domain,
> > +				      struct irq_data *d)
> > +{
> > +	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
> > +	u32 event = dlpi_get_event_id(d);
> > +
> > +	its_dec_lpi_count(d, dlpi_dev->event_map.col_map[event]);
> > +}
> > +
> > +static struct direct_lpi_device *dlpi_create_device(struct direct_lpi *dlpi,
> > +					u32 dev_id, int nvecs, bool alloc_lpis)
> > +{
> > +	struct direct_lpi_device *dlpi_dev = NULL;
> > +	unsigned long *lpi_map = NULL;
> > +	u16 *col_map = NULL;
> > +	int lpi_base;
> > +	int nr_lpis;
> > +	unsigned long flags;
> > +
> > +	if (WARN_ON(!is_power_of_2(nvecs)))
> > +		nvecs = roundup_pow_of_two(nvecs);
> > +
> > +	dlpi_dev = kzalloc(sizeof(*dlpi_dev), GFP_KERNEL);
> > +	if (!dlpi_dev)
> > +		return NULL;
> > +
> > +	lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
> > +	if (!lpi_map) {
> > +		kfree(dlpi_dev);
> > +		return NULL;
> > +	}
> > +
> > +	col_map = kcalloc(nr_lpis, sizeof(*col_map), GFP_KERNEL);
> > +	if (!col_map) {
> > +		kfree(dlpi_dev);
> > +		kfree(lpi_map);
> > +		return NULL;
> > +	}
> > +
> > +	dlpi_dev->dlpi = dlpi;
> > +	dlpi_dev->event_map.lpi_map = lpi_map;
> > +	dlpi_dev->event_map.col_map = col_map;
> > +	dlpi_dev->event_map.lpi_base = lpi_base;
> > +	dlpi_dev->event_map.nr_lpis = nr_lpis;
> > +	dlpi_dev->device_id = dev_id;
> > +
> > +	raw_spin_lock_irqsave(&dlpi->lock, flags);
> > +	list_add(&dlpi_dev->entry, &dlpi->device_list);
> > +	raw_spin_unlock_irqrestore(&dlpi->lock, flags);
> > +
> > +	return dlpi_dev;
> > +}
> > +
> > +static struct direct_lpi_device *dlpi_find_device(struct direct_lpi *dlpi, u32 dev_id)
> > +{
> > +	struct direct_lpi_device *dlpi_dev = NULL, *tmp;
> > +	unsigned long flags;
> > +
> > +	raw_spin_lock_irqsave(&dlpi->lock, flags);
> > +	list_for_each_entry(tmp, &dlpi->device_list, entry) {
> > +		if (tmp->device_id == dev_id) {
> > +			dlpi_dev = tmp;
> > +			break;
> > +		}
> > +	}
> > +
> > +	raw_spin_unlock_irqrestore(&dlpi->lock, flags);
> > +
> > +	return dlpi_dev;
> > +}
> > +
> > +static int dlpi_msi_prepare(struct irq_domain *domain, struct device *dev,
> > +			   int nvec, msi_alloc_info_t *info)
> > +{
> > +	struct direct_lpi_device *dlpi_dev;
> > +	struct direct_lpi *dlpi;
> > +	struct msi_domain_info *msi_info;
> > +	u32 dev_id;
> > +	int err = 0;
> > +
> > +	/*
> > +	 * We ignore "dev" entirely, and rely on the dev_id that has
> > +	 * been passed via the scratchpad. This limits this domain's
> > +	 * usefulness to upper layers that definitely know that they
> > +	 * are built on top of the ITS.
> > +	 */
> 
> This is complete nonsense. You are just copying bits of the ITS code
> and randomly reorganising it. You are trying to reuse the PCI-specific
> front-end of the ITS code, not even realising that it *cannot* work.
> For example, how can MultiMSI be supported?
> 
> > +	dev_id = info->scratchpad[0].ul;
> > +	msi_info = msi_get_domain_info(domain);
> > +	dlpi = msi_info->data;
> > +
> > +	mutex_lock(&dlpi->dev_alloc_lock);
> > +	dlpi_dev = dlpi_find_device(dlpi, dev_id);
> > +	if (dlpi_dev) {
> > +		/*
> > +		 * We already have seen this ID, probably through
> > +		 * another alias (PCI bridge of some sort). No need to
> > +		 * create the device.
> > +		 */
> > +		dlpi_dev->shared = true;
> > +		pr_debug("Reusing ITT for devID %x\n", dev_id);
> > +		goto out;
> > +	}
> > +
> > +	dlpi_dev = dlpi_create_device(dlpi, dev_id, nvec, true);
> > +	if (!dlpi_dev) {
> > +		err = -ENOMEM;
> > +		goto out;
> > +	}
> > +
> > +out:
> > +	mutex_unlock(&dlpi->dev_alloc_lock);
> > +	info->scratchpad[0].ptr = dlpi_dev;
> > +
> > +	return err;
> > +}
> > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> > index ba39668c3e08..aa101dfcbbfc 100644
> > --- a/drivers/irqchip/irq-gic-v3-its.c
> > +++ b/drivers/irqchip/irq-gic-v3-its.c
> > @@ -178,7 +178,7 @@ struct cpu_lpi_count {
> >  	atomic_t	unmanaged;
> >  };
> >
> > -static DEFINE_PER_CPU(struct cpu_lpi_count, cpu_lpi_count);
> > +DEFINE_PER_CPU(struct cpu_lpi_count, cpu_lpi_count);
> >
> >  static LIST_HEAD(its_nodes);
> >  static DEFINE_RAW_SPINLOCK(its_lock);
> > @@ -1521,7 +1521,7 @@ static void its_unmask_irq(struct irq_data *d)
> >  	lpi_update_config(d, 0, LPI_PROP_ENABLED);
> >  }
> >
> > -static __maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
> > +__maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
> >  {
> >  	if (irqd_affinity_is_managed(d))
> >  		return atomic_read(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed);
> > @@ -1529,7 +1529,7 @@ static __maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
> >  	return atomic_read(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
> >  }
> >
> > -static void its_inc_lpi_count(struct irq_data *d, int cpu)
> > +void its_inc_lpi_count(struct irq_data *d, int cpu)
> >  {
> >  	if (irqd_affinity_is_managed(d))
> >  		atomic_inc(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed);
> > @@ -1537,7 +1537,7 @@ static void its_inc_lpi_count(struct irq_data *d, int cpu)
> >  		atomic_inc(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
> >  }
> >
> > -static void its_dec_lpi_count(struct irq_data *d, int cpu)
> > +void its_dec_lpi_count(struct irq_data *d, int cpu)
> >  {
> >  	if (irqd_affinity_is_managed(d))
> >  		atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed);
> > @@ -1545,7 +1545,7 @@ static void its_dec_lpi_count(struct irq_data *d, int cpu)
> >  		atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
> >  }
> >
> > -static unsigned int cpumask_pick_least_loaded(struct irq_data *d,
> > +unsigned int cpumask_pick_least_loaded(struct irq_data *d,
> >  					      const struct cpumask *cpu_mask)
> >  {
> >  	unsigned int cpu = nr_cpu_ids, tmp;
> > @@ -2121,7 +2121,7 @@ static int __init its_lpi_init(u32 id_bits)
> >  	return err;
> >  }
> >
> > -static unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
> > +unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
> >  {
> >  	unsigned long *bitmap = NULL;
> >  	int err = 0;
> > @@ -2153,7 +2153,7 @@ static unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
> >  	return bitmap;
> >  }
> >
> > -static void its_lpi_free(unsigned long *bitmap, u32 base, u32 nr_ids)
> > +void its_lpi_free(unsigned long *bitmap, u32 base, u32 nr_ids)
> >  {
> >  	WARN_ON(free_lpi_range(base, nr_ids));
> >  	kfree(bitmap);
> > @@ -3506,9 +3506,9 @@ static struct msi_domain_ops its_msi_domain_ops = {
> >  	.msi_prepare	= its_msi_prepare,
> >  };
> >
> > -static int its_irq_gic_domain_alloc(struct irq_domain *domain,
> > -				    unsigned int virq,
> > -				    irq_hw_number_t hwirq)
> > +int its_irq_gic_domain_alloc(struct irq_domain *domain,
> > +			     unsigned int virq,
> > +			     irq_hw_number_t hwirq)
> >  {
> >  	struct irq_fwspec fwspec;
> 
> There is no way you are messing with the ITS driver internals like
> this. The only thing you *may* do is to have a common infrastructure
> to allocate the RD tables, and only that. None of the ITS
> infrastructure makes any sense for DirectLPI anyway.
> 
> Please rearchitect your code to be fully independent of the ITS, and
> without any of the pointless abstractions that the ITS requires. This
> should be much closer to GICv3-MBI than the ITS. Heck, you could even
> use the MBI driver directly instead of trying to use DirectLPI, as it
> gives you similar functionalities.

Thanks for your feedback, Marc. I am reorganizing the code and incorporating
the above feedback.

> 
> I also want to understand *how* you are going to plumb this into both
> ACPI and DT, given that neither understand how to link a PCI endpoint
> to a set of RDs.
> 
> 	M.

One way to do this for NUMA-aware systems would be to use the NUMA
related information that is available with PCI endpoints or root complex, to
pick a Redistributor/CPU that is in the NUMA node, as specified by the PCI
endpoint/root complex. In DT PCI devices can specify this using
'numa-node-id' and in ACPI using the '_PXM (Proximity)'. For systems that
are not NUMA-aware, we can go with *any* Redistributor/CPU.

Is there any additional information we would be able to gather from ACPI
or DT that's not there currently, that would be useful here?

The other question is, what DT property can be used to instantiate the
PCI-MSI IRQ domain for Direct LPI? As per the DT spec, there is only
'msi-controller' Sub-node for the ITS. 

- Sunil
Marc Zyngier July 31, 2021, 9:52 a.m. UTC | #3
On Mon, 26 Jul 2021 16:33:39 +0100,
Sunil Muthuswamy <sunilmut@microsoft.com> wrote:

[...]

> > I also want to understand *how* you are going to plumb this into both
> > ACPI and DT, given that neither understand how to link a PCI endpoint
> > to a set of RDs.
> > 
> > 	M.
> 
> One way to do this for NUMA-aware systems would be to use the NUMA
> related information that is available with PCI endpoints or root complex, to
> pick a Redistributor/CPU that is in the NUMA node, as specified by the PCI
> endpoint/root complex. In DT PCI devices can specify this using
> 'numa-node-id' and in ACPI using the '_PXM (Proximity)'. For systems that
> are not NUMA-aware, we can go with *any* Redistributor/CPU.

This makes zero sense. From the point of view of a device, all the RDs
should be reachable, and firmware has no say in it. Dealing with
interrupt affinity is the responsibility of the endpoint driver, and
NUMA affinity is only a performance optimisation.

> Is there any additional information we would be able to gather from ACPI
> or DT that's not there currently, that would be useful here?

You will need some firmware information describing that a given set of
devices must use the RDs for their MSIs. Just like we currently
describe it in IORT for the ITS. You cannot /assume/ things. At the
moment, there is nothing at all, because no-one (including Microsoft)
thought it would be a good idea not to have an ITS, which is also why
ACPI doesn't describe MBIs as a potential MSI provider.

> The other question is, what DT property can be used to instantiate the
> PCI-MSI IRQ domain for Direct LPI? As per the DT spec, there is only
> 'msi-controller' Sub-node for the ITS. 

Read again. We already a msi-controller property in the main GICv3
node for the purpose of MBIs. You can reuse this property, but you
will have to discriminate whether you want MBIs or DirectLPIs with
some extra property.

	M.
Sunil Muthuswamy Aug. 3, 2021, 2:11 a.m. UTC | #4
On Saturday, July 31, 2021 2:52 AM,
 Marc Zyngier <maz@kernel.org> wrote:
> 
> [...]
> 
> > > I also want to understand *how* you are going to plumb this into both
> > > ACPI and DT, given that neither understand how to link a PCI endpoint
> > > to a set of RDs.
> > >
> > > 	M.
> >
> > One way to do this for NUMA-aware systems would be to use the NUMA
> > related information that is available with PCI endpoints or root complex, to
> > pick a Redistributor/CPU that is in the NUMA node, as specified by the PCI
> > endpoint/root complex. In DT PCI devices can specify this using
> > 'numa-node-id' and in ACPI using the '_PXM (Proximity)'. For systems that
> > are not NUMA-aware, we can go with *any* Redistributor/CPU.
> 
> This makes zero sense. From the point of view of a device, all the RDs
> should be reachable, and firmware has no say in it. Dealing with
> interrupt affinity is the responsibility of the endpoint driver, and
> NUMA affinity is only a performance optimisation.
> 
> > Is there any additional information we would be able to gather from ACPI
> > or DT that's not there currently, that would be useful here?
> 
> You will need some firmware information describing that a given set of
> devices must use the RDs for their MSIs. Just like we currently
> describe it in IORT for the ITS. You cannot /assume/ things. At the
> moment, there is nothing at all, because no-one (including Microsoft)
> thought it would be a good idea not to have an ITS, which is also why
> ACPI doesn't describe MBIs as a potential MSI provider.
> 
I am a little bit confused by your above comment. Maybe you can help me
understand the ask. You indicate that from the point of the view of the
device, all the RDs should be reachable. But, then if we define a mapping
between PCI endpoint and RD in the firmware, we would be doing exactly
the opposite. i.e. restricting the RDs that are reachable by the device. Can
you please clarify?

Is your concern that the device should be able to only DMA to a subset of
GIC Redistributor, for the MSIs? If so, in the IORT, there is "memory address
size limit" for both device and root complex nodes. In the implementation,
we can enforce that the GICR is within that range. And, if a device deviates
further than that (ex: by having accessibility gaps within the GICR range),
then that is out of scope for support.

- Sunil
Robin Murphy Aug. 3, 2021, 8:35 a.m. UTC | #5
On 2021-08-03 03:11, Sunil Muthuswamy wrote:
>   On Saturday, July 31, 2021 2:52 AM,
>   Marc Zyngier <maz@kernel.org> wrote:
>>
>> [...]
>>
>>>> I also want to understand *how* you are going to plumb this into both
>>>> ACPI and DT, given that neither understand how to link a PCI endpoint
>>>> to a set of RDs.
>>>>
>>>> 	M.
>>>
>>> One way to do this for NUMA-aware systems would be to use the NUMA
>>> related information that is available with PCI endpoints or root complex, to
>>> pick a Redistributor/CPU that is in the NUMA node, as specified by the PCI
>>> endpoint/root complex. In DT PCI devices can specify this using
>>> 'numa-node-id' and in ACPI using the '_PXM (Proximity)'. For systems that
>>> are not NUMA-aware, we can go with *any* Redistributor/CPU.
>>
>> This makes zero sense. From the point of view of a device, all the RDs
>> should be reachable, and firmware has no say in it. Dealing with
>> interrupt affinity is the responsibility of the endpoint driver, and
>> NUMA affinity is only a performance optimisation.
>>
>>> Is there any additional information we would be able to gather from ACPI
>>> or DT that's not there currently, that would be useful here?
>>
>> You will need some firmware information describing that a given set of
>> devices must use the RDs for their MSIs. Just like we currently
>> describe it in IORT for the ITS. You cannot /assume/ things. At the
>> moment, there is nothing at all, because no-one (including Microsoft)
>> thought it would be a good idea not to have an ITS, which is also why
>> ACPI doesn't describe MBIs as a potential MSI provider.
>>
> I am a little bit confused by your above comment. Maybe you can help me
> understand the ask. You indicate that from the point of the view of the
> device, all the RDs should be reachable. But, then if we define a mapping
> between PCI endpoint and RD in the firmware, we would be doing exactly
> the opposite. i.e. restricting the RDs that are reachable by the device. Can
> you please clarify?
> 
> Is your concern that the device should be able to only DMA to a subset of
> GIC Redistributor, for the MSIs? If so, in the IORT, there is "memory address
> size limit" for both device and root complex nodes. In the implementation,
> we can enforce that the GICR is within that range. And, if a device deviates
> further than that (ex: by having accessibility gaps within the GICR range),
> then that is out of scope for support.

No, please don't try to abuse the Memory Address Size Limit - that has 
far more chance of adversely affecting normal DMA operation than of 
being any use here.

I believe the point Marc was trying to make is that firmware should not 
associate a device with any one *specific* redistributor, however ACPI 
currently has no way to describe that MSIs can target redistributors *at 
all*, only ITS groups - there is no such concept as a "redistributor group".

Robin.
Marc Zyngier Aug. 4, 2021, 9:21 a.m. UTC | #6
On Tue, 03 Aug 2021 09:35:12 +0100,
Robin Murphy <robin.murphy@arm.com> wrote:
> 
> On 2021-08-03 03:11, Sunil Muthuswamy wrote:
> >   On Saturday, July 31, 2021 2:52 AM,
> >   Marc Zyngier <maz@kernel.org> wrote:
> >> 
> >> [...]
> >> 
> >>>> I also want to understand *how* you are going to plumb this into both
> >>>> ACPI and DT, given that neither understand how to link a PCI endpoint
> >>>> to a set of RDs.
> >>>> 
> >>>> 	M.
> >>> 
> >>> One way to do this for NUMA-aware systems would be to use the NUMA
> >>> related information that is available with PCI endpoints or root complex, to
> >>> pick a Redistributor/CPU that is in the NUMA node, as specified by the PCI
> >>> endpoint/root complex. In DT PCI devices can specify this using
> >>> 'numa-node-id' and in ACPI using the '_PXM (Proximity)'. For systems that
> >>> are not NUMA-aware, we can go with *any* Redistributor/CPU.
> >> 
> >> This makes zero sense. From the point of view of a device, all the RDs
> >> should be reachable, and firmware has no say in it. Dealing with
> >> interrupt affinity is the responsibility of the endpoint driver, and
> >> NUMA affinity is only a performance optimisation.
> >> 
> >>> Is there any additional information we would be able to gather from ACPI
> >>> or DT that's not there currently, that would be useful here?
> >> 
> >> You will need some firmware information describing that a given set of
> >> devices must use the RDs for their MSIs. Just like we currently
> >> describe it in IORT for the ITS. You cannot /assume/ things. At the
> >> moment, there is nothing at all, because no-one (including Microsoft)
> >> thought it would be a good idea not to have an ITS, which is also why
> >> ACPI doesn't describe MBIs as a potential MSI provider.
> >> 
> > I am a little bit confused by your above comment. Maybe you can help me
> > understand the ask. You indicate that from the point of the view of the
> > device, all the RDs should be reachable. But, then if we define a mapping
> > between PCI endpoint and RD in the firmware, we would be doing exactly
> > the opposite. i.e. restricting the RDs that are reachable by the device. Can
> > you please clarify?

Not at all. What I am saying is that you need to describe that the
MSIs have to be routed to the RDs, and there is no way to express this
at the moment.

> > 
> > Is your concern that the device should be able to only DMA to a subset of
> > GIC Redistributor, for the MSIs? If so, in the IORT, there is "memory address
> > size limit" for both device and root complex nodes. In the implementation,
> > we can enforce that the GICR is within that range. And, if a device deviates
> > further than that (ex: by having accessibility gaps within the GICR range),
> > then that is out of scope for support.
> 
> No, please don't try to abuse the Memory Address Size Limit - that has
> far more chance of adversely affecting normal DMA operation than of
> being any use here.
> 
> I believe the point Marc was trying to make is that firmware should
> not associate a device with any one *specific* redistributor, however
> ACPI currently has no way to describe that MSIs can target
> redistributors *at all*, only ITS groups - there is no such concept as
> a "redistributor group".

Thanks Robin.

That is exactly my point. There is no linkage whatsoever between a
device generating MSIs and the redistributors. In that respect, this
is the same issue we have with DT, as none of the firmware interfaces
can currently describe MSIs directly targeting the RDs. The only way
to describe MSIs with GICv3 in ACPI is to describe an ITS, and you
cannot use the *absence* of an ITS to decide and use DirectLPIs.

Given the state of things, using DirectLPI means that you need to
extend the firmware interfaces. This means both an extension to the DT
binding, and an update to the ACPI spec. There is no way around this.

Thanks,

	M.
Sunil Muthuswamy Aug. 4, 2021, 8:10 p.m. UTC | #7
On Wednesday, August 4, 2021 2:21 AM,
Marc Zyngier <maz@kernel.org> wrote:
>
> On Tue, 03 Aug 2021 09:35:12 +0100,
> Robin Murphy <robin.murphy@arm.com> wrote:
> >
> > On 2021-08-03 03:11, Sunil Muthuswamy wrote:
> > >   On Saturday, July 31, 2021 2:52 AM,
> > >   Marc Zyngier <maz@kernel.org> wrote:
> > >>
> > >> [...]
> > >>
> > >>>> I also want to understand *how* you are going to plumb this into both
> > >>>> ACPI and DT, given that neither understand how to link a PCI endpoint
> > >>>> to a set of RDs.
> > >>>>
> > >>>> 	M.
> > >>>
> > >>> One way to do this for NUMA-aware systems would be to use the NUMA
> > >>> related information that is available with PCI endpoints or root complex, to
> > >>> pick a Redistributor/CPU that is in the NUMA node, as specified by the PCI
> > >>> endpoint/root complex. In DT PCI devices can specify this using
> > >>> 'numa-node-id' and in ACPI using the '_PXM (Proximity)'. For systems that
> > >>> are not NUMA-aware, we can go with *any* Redistributor/CPU.
> > >>
> > >> This makes zero sense. From the point of view of a device, all the RDs
> > >> should be reachable, and firmware has no say in it. Dealing with
> > >> interrupt affinity is the responsibility of the endpoint driver, and
> > >> NUMA affinity is only a performance optimisation.
> > >>
> > >>> Is there any additional information we would be able to gather from ACPI
> > >>> or DT that's not there currently, that would be useful here?
> > >>
> > >> You will need some firmware information describing that a given set of
> > >> devices must use the RDs for their MSIs. Just like we currently
> > >> describe it in IORT for the ITS. You cannot /assume/ things. At the
> > >> moment, there is nothing at all, because no-one (including Microsoft)
> > >> thought it would be a good idea not to have an ITS, which is also why
> > >> ACPI doesn't describe MBIs as a potential MSI provider.
> > >>
> > > I am a little bit confused by your above comment. Maybe you can help me
> > > understand the ask. You indicate that from the point of the view of the
> > > device, all the RDs should be reachable. But, then if we define a mapping
> > > between PCI endpoint and RD in the firmware, we would be doing exactly
> > > the opposite. i.e. restricting the RDs that are reachable by the device. Can
> > > you please clarify?
> 
> Not at all. What I am saying is that you need to describe that the
> MSIs have to be routed to the RDs, and there is no way to express this
> at the moment.
> 
> > >
> > > Is your concern that the device should be able to only DMA to a subset of
> > > GIC Redistributor, for the MSIs? If so, in the IORT, there is "memory address
> > > size limit" for both device and root complex nodes. In the implementation,
> > > we can enforce that the GICR is within that range. And, if a device deviates
> > > further than that (ex: by having accessibility gaps within the GICR range),
> > > then that is out of scope for support.
> >
> > No, please don't try to abuse the Memory Address Size Limit - that has
> > far more chance of adversely affecting normal DMA operation than of
> > being any use here.
> >
> > I believe the point Marc was trying to make is that firmware should
> > not associate a device with any one *specific* redistributor, however
> > ACPI currently has no way to describe that MSIs can target
> > redistributors *at all*, only ITS groups - there is no such concept as
> > a "redistributor group".
> 
> Thanks Robin.
> 
> That is exactly my point. There is no linkage whatsoever between a
> device generating MSIs and the redistributors. In that respect, this
> is the same issue we have with DT, as none of the firmware interfaces
> can currently describe MSIs directly targeting the RDs. The only way
> to describe MSIs with GICv3 in ACPI is to describe an ITS, and you
> cannot use the *absence* of an ITS to decide and use DirectLPIs.
> 
> Given the state of things, using DirectLPI means that you need to
> extend the firmware interfaces. This means both an extension to the DT
> binding, and an update to the ACPI spec. There is no way around this.
> 
> Thanks,
> 
> 	M.
> 
Thanks Marc and Robin for clarifying. I see and understand the point
about having explicit MSI mappings in the firmware specification for
Direct LPIs for generic hardware support.

Hey Mark, would you be willing to consider a scoped down implementation of
GIC Direct LPI with just an IRQ chip implementation and no
Direct LPI PCI-MSI IRQ chip. This will allow a MSI provider (such as Hyper-V vPCI) to
provide a PCI-MSI IRQ chip on top of the Direct LPI IRQ chip and enable
PCI-MSI scenarios, and avoid building in assumptions in other cases (like PCI) where
firmware specification is not available.
- This scoped down implementation would allow Microsoft to build virtual
PCI on top, to enable our business needs.
- If there's a need for a generic support for a Direct LPI PCI MSI IRQ, that could
drive firmware revision efforts, and we are happy to assist there.

Looking forward to hearing back.

Thanks,
Sunil
Marc Zyngier Aug. 5, 2021, 8:35 a.m. UTC | #8
On Wed, 04 Aug 2021 21:10:43 +0100,
Sunil Muthuswamy <sunilmut@microsoft.com> wrote:
> 
> Thanks Marc and Robin for clarifying. I see and understand the point
> about having explicit MSI mappings in the firmware specification for
> Direct LPIs for generic hardware support.
> 
> Hey Mark

I assume this is for me?

> would you be willing to consider a scoped down implementation of GIC
> Direct LPI with just an IRQ chip implementation and no Direct LPI
> PCI-MSI IRQ chip.

Could you please clarify? If you are not implementing MSIs, how can a
device signal LPIs? At the end of the day, something has to write into
the RD, and it isn't going to happen by sheer magic.

> This will allow a MSI provider (such as Hyper-V vPCI) to provide a
> PCI-MSI IRQ chip on top of the Direct LPI IRQ chip and enable
> PCI-MSI scenarios, and avoid building in assumptions in other cases
> (like PCI) where firmware specification is not available.

I really don't get what you are suggesting. Could you please describe
what you have in mind?

	M.
Sunil Muthuswamy Aug. 6, 2021, 7:14 p.m. UTC | #9
On Thursday, August 5, 2021 1:35 AM,
Marc Zyngier <maz@kernel.org> wrote:
[...]
> > Hey Mark
> 
> I assume this is for me?
> 
Yes, sorry.

> > would you be willing to consider a scoped down implementation of GIC
> > Direct LPI with just an IRQ chip implementation and no Direct LPI
> > PCI-MSI IRQ chip.
> 
> Could you please clarify? If you are not implementing MSIs, how can a
> device signal LPIs? At the end of the day, something has to write into
> the RD, and it isn't going to happen by sheer magic.
> 
> > This will allow a MSI provider (such as Hyper-V vPCI) to provide a
> > PCI-MSI IRQ chip on top of the Direct LPI IRQ chip and enable
> > PCI-MSI scenarios, and avoid building in assumptions in other cases
> > (like PCI) where firmware specification is not available.
> 
> I really don't get what you are suggesting. Could you please describe
> what you have in mind?
> 
The suggestion was to *not* implement the PCI-MSI IRQ chip in the
Direct LPI GIC implementation, but let the endpoint/specific
implementation provide for the MSI IRQ chip.

For example, the Hyper-V vPCI module does implement a PCI-MSI IRQ
chip (refer to 'hv_pcie_init_irq_domain'). Microsoft Hypervisor
provides/virtualizes the MSI address to be used for Hyper-V vPCI. The
Hypervisor might be using the ITS in the background, but it expects
the device to be programmed with the MSI address that it provides.
And, the Hypervisor takes care of routing the interrupt to the guest.
This is done by the Hyper-V vPCI MSI IRQ chip (refer to
hv_msi_irq_chip) compose MSI message.

Today, for X64, Hyper-V vPCI PCI-MSI irq chip parents itself to the
architectural 'x86_vector_domain'. The suggestion here was to see
if we can support a similar setup for ARM, where the Hyper-V vPCI
PCI-MSI irq chip can parent itself to the 'Direct LPI arch IRQ chip'
(to be implemented).

The arch Direct LPI arch IRQ chip is needed to enable LPIs, invalidate
the LPI configuration (GICR_INVLPIR et. al. ) and allocate LPI IRQs from
the LPI interrupt namespace (i.e. 8192-<num LPIS>).

Happy to expand on this further, if the above is not clear.

Thanks,
Sunil
Marc Zyngier Aug. 8, 2021, 10:19 a.m. UTC | #10
On Fri, 06 Aug 2021 20:14:13 +0100,
Sunil Muthuswamy <sunilmut@microsoft.com> wrote:
> 
> On Thursday, August 5, 2021 1:35 AM,
> Marc Zyngier <maz@kernel.org> wrote:
> [...]
> > > Hey Mark
> > 
> > I assume this is for me?
> > 
> Yes, sorry.
> 
> > > would you be willing to consider a scoped down implementation of GIC
> > > Direct LPI with just an IRQ chip implementation and no Direct LPI
> > > PCI-MSI IRQ chip.
> > 
> > Could you please clarify? If you are not implementing MSIs, how can a
> > device signal LPIs? At the end of the day, something has to write into
> > the RD, and it isn't going to happen by sheer magic.
> > 
> > > This will allow a MSI provider (such as Hyper-V vPCI) to provide a
> > > PCI-MSI IRQ chip on top of the Direct LPI IRQ chip and enable
> > > PCI-MSI scenarios, and avoid building in assumptions in other cases
> > > (like PCI) where firmware specification is not available.
> > 
> > I really don't get what you are suggesting. Could you please describe
> > what you have in mind?
> > 
> The suggestion was to *not* implement the PCI-MSI IRQ chip in the
> Direct LPI GIC implementation, but let the endpoint/specific
> implementation provide for the MSI IRQ chip.
> 
> For example, the Hyper-V vPCI module does implement a PCI-MSI IRQ
> chip (refer to 'hv_pcie_init_irq_domain'). Microsoft Hypervisor
> provides/virtualizes the MSI address to be used for Hyper-V vPCI. The
> Hypervisor might be using the ITS in the background, but it expects
> the device to be programmed with the MSI address that it provides.
> And, the Hypervisor takes care of routing the interrupt to the guest.
> This is done by the Hyper-V vPCI MSI IRQ chip (refer to
> hv_msi_irq_chip) compose MSI message.
> 
> Today, for X64, Hyper-V vPCI PCI-MSI irq chip parents itself to the
> architectural 'x86_vector_domain'. The suggestion here was to see
> if we can support a similar setup for ARM, where the Hyper-V vPCI
> PCI-MSI irq chip can parent itself to the 'Direct LPI arch IRQ chip'
> (to be implemented).
> 
> The arch Direct LPI arch IRQ chip is needed to enable LPIs, invalidate
> the LPI configuration (GICR_INVLPIR et. al. ) and allocate LPI IRQs from
> the LPI interrupt namespace (i.e. 8192-<num LPIS>).
> 
> Happy to expand on this further, if the above is not clear.

[+Lorenzo, as he deals with both PCI and ACPI]

I think it is clear enough, but I don't see what this buys us other
than turning DirectLPI into something that is essentially private to
Hyper-V, just for the sake of sidestepping a firmware description.
Furthermore, the Hyper-V "single MSI address" model doesn't match the
way DirectLPI works (after all, this is one of the many reasons why we
have the ITS).

The architecture already gives you everything you need to make things
work in Hyper-V. You can easily implement the DirectLPI support (you
definitely need most of it anyway), and the PCI-MSI layer is
*free*. All you need is a firmware description. Which I don't believe
is the end of the world, given that DT is freely hackable and that
IORT is an ARM-private extension to ACPI. I'm sure you know who to
reach out to in order to start a draft (and if you don't, I can give
you names offline).

I am not interested in expanding the GICv3 architecture support if it
cannot be generally used in a way that is *compliant with the
architecture*. If you think DirectLPIs can make the hypervisor
simpler, I'm fine with it. But let's fully embrace the concept instead
of hiding it behind a layer of PV muck. It will even have a chance of
working on bare metal (such as on the machine that is humming behind
me, or even the Fast Model).

Thanks,

	M.
Sunil Muthuswamy Aug. 9, 2021, 2:35 a.m. UTC | #11
Sunday, August 8, 2021 3:19 AM,
Marc Zyngier <maz@kernel.org> wrote:
> 
> [+Lorenzo, as he deals with both PCI and ACPI]
> 
> I think it is clear enough, but I don't see what this buys us other
> than turning DirectLPI into something that is essentially private to
> Hyper-V, just for the sake of sidestepping a firmware description.
> Furthermore, the Hyper-V "single MSI address" model doesn't match the
> way DirectLPI works (after all, this is one of the many reasons why we
> have the ITS).
> 
> The architecture already gives you everything you need to make things
> work in Hyper-V. You can easily implement the DirectLPI support (you
> definitely need most of it anyway), and the PCI-MSI layer is
> *free*. All you need is a firmware description. Which I don't believe
> is the end of the world, given that DT is freely hackable and that
> IORT is an ARM-private extension to ACPI. I'm sure you know who to
> reach out to in order to start a draft (and if you don't, I can give
> you names offline).
> 
That sounds reasonable. The DT extension here alone wont suffice for
Hyper-V and we would need the ACPI IORT extension here as well. Our
general experience with ACPI extension is that they can take time. But,
I am curious to hear from Lorenzo on his thoughts here and if just an
IORT extension means anything else in terms of timelines.

> I am not interested in expanding the GICv3 architecture support if it
> cannot be generally used in a way that is *compliant with the
> architecture*. If you think DirectLPIs can make the hypervisor
> simpler, I'm fine with it. But let's fully embrace the concept instead
> of hiding it behind a layer of PV muck. It will even have a chance of
> working on bare metal (such as on the machine that is humming behind
> me, or even the Fast Model).
> 
Appreciate your inputs. Since we are discussing options, there is one more
option to enable Hyper-V virtual PCI for ARM64 that I do would like to
discuss. That option is to implement the IRQ chip completely within the
Hyper-V module itself. That IRQ chip will take care of allocating LPI vectors,
enabling the interrupt (unmask, mask etc..). It won't be a Direct LPI based,
but based on custom Hyper-V methods. That chip will parent itself to the
GIC v3 IRQ domain. The only extra thing needed for this is for the Hyper-V
IRQ chip to be able to discover the GIC v3 IRQ domain, for which it
cannot use the 'irq_find_matching_fwnode' method as Hyper-V virtual
PCI bus/devices are not firmware enumerated. I am not sure if there is any
other way to discover the GIC (DOMAIN_BUS_WIRED) domain.

What are your thoughts/feedback on the above? This will allow us to
enable the scenario for the business timelines we are targeting for, while
we wait for firmware spec updates. Long term we also want to use
architectural methods here as that helps the Hypervisor as well, and I
would be glad to pursue the firmware spec updates in parallel.

Thanks,
Sunil
Marc Zyngier Aug. 9, 2021, 9:15 a.m. UTC | #12
On Mon, 09 Aug 2021 03:35:05 +0100,
Sunil Muthuswamy <sunilmut@microsoft.com> wrote:
> 
> Sunday, August 8, 2021 3:19 AM,
> Marc Zyngier <maz@kernel.org> wrote:
> > 
> > [+Lorenzo, as he deals with both PCI and ACPI]
> > 
> > I think it is clear enough, but I don't see what this buys us other
> > than turning DirectLPI into something that is essentially private to
> > Hyper-V, just for the sake of sidestepping a firmware description.
> > Furthermore, the Hyper-V "single MSI address" model doesn't match the
> > way DirectLPI works (after all, this is one of the many reasons why we
> > have the ITS).
> > 
> > The architecture already gives you everything you need to make things
> > work in Hyper-V. You can easily implement the DirectLPI support (you
> > definitely need most of it anyway), and the PCI-MSI layer is
> > *free*. All you need is a firmware description. Which I don't believe
> > is the end of the world, given that DT is freely hackable and that
> > IORT is an ARM-private extension to ACPI. I'm sure you know who to
> > reach out to in order to start a draft (and if you don't, I can give
> > you names offline).
> > 
> That sounds reasonable. The DT extension here alone wont suffice for
> Hyper-V and we would need the ACPI IORT extension here as well. Our
> general experience with ACPI extension is that they can take time. But,
> I am curious to hear from Lorenzo on his thoughts here and if just an
> IORT extension means anything else in terms of timelines.
> 
> > I am not interested in expanding the GICv3 architecture support if it
> > cannot be generally used in a way that is *compliant with the
> > architecture*. If you think DirectLPIs can make the hypervisor
> > simpler, I'm fine with it. But let's fully embrace the concept instead
> > of hiding it behind a layer of PV muck. It will even have a chance of
> > working on bare metal (such as on the machine that is humming behind
> > me, or even the Fast Model).
> > 
> Appreciate your inputs. Since we are discussing options, there is one more
> option to enable Hyper-V virtual PCI for ARM64 that I do would like to
> discuss. That option is to implement the IRQ chip completely within the
> Hyper-V module itself. That IRQ chip will take care of allocating LPI vectors,
> enabling the interrupt (unmask, mask etc..). It won't be a Direct LPI based,
> but based on custom Hyper-V methods. That chip will parent itself to the
> GIC v3 IRQ domain. The only extra thing needed for this is for the Hyper-V
> IRQ chip to be able to discover the GIC v3 IRQ domain, for which it
> cannot use the 'irq_find_matching_fwnode' method as Hyper-V virtual
> PCI bus/devices are not firmware enumerated. I am not sure if there is any
> other way to discover the GIC (DOMAIN_BUS_WIRED) domain.
>
> What are your thoughts/feedback on the above? This will allow us to
> enable the scenario for the business timelines we are targeting for, while
> we wait for firmware spec updates. Long term we also want to use
> architectural methods here as that helps the Hypervisor as well, and I
> would be glad to pursue the firmware spec updates in parallel.

This feels like yet another layer of PV ugliness that has the side
effect of fragmenting the architectural support.

If you plug directly into the GICv3 layer, I'd rather you inject SPIs,
just like any other non-architectural MSI controller. You can directly
interface with the ACPI GSI layer for that, without any need to mess
with the GICv3 internals. The SPI space isn't very large, but still
much larger than the equivalent x86 space (close to 1000).

If time is of the essence, I suggest you go the SPI way. For anything
involving LPIs, I really want to see a firmware spec that works for
everyone as opposed to a localised Hyper-V hack.

Thanks,

	M.
Sunil Muthuswamy Aug. 10, 2021, 1:10 a.m. UTC | #13
On Monday, August 9, 2021 2:15 AM,
Marc Zyngier <maz@kernel.org> wrote:
[...]
> If you plug directly into the GICv3 layer, I'd rather you inject SPIs,
> just like any other non-architectural MSI controller. You can directly
> interface with the ACPI GSI layer for that, without any need to mess
> with the GICv3 internals. The SPI space isn't very large, but still
> much larger than the equivalent x86 space (close to 1000).
> 
> If time is of the essence, I suggest you go the SPI way. For anything
> involving LPIs, I really want to see a firmware spec that works for
> everyone as opposed to a localised Hyper-V hack.
> 
Ok, thanks. Before we commit to anything, I would like to make sure
that I am on the same page in terms of your description. With that in
mind, I have few questions. Hopefully, these should settle the matter.
1. If we go with the SPI route, then the way I envision it is that the 
    Hyper-V vPCI driver will implement an IRQ chip, which will take
    care of allocating & managing the SPI interrupt for Hyper-V vPCI.
    This IRQ chip will parent itself to the architectural GIC IRQ chip for
    general interrupt management. Does that match with your
    understanding/suggestion as well?

2. In the above, how will Hyper-V vPCI module discover the
    architectural GIC IRQ domain generically for virtual devices that
    are not firmware enumerated? Today, the GIC v3 IRQ domain is
    not exported and the general 'irq_find_xyz' APIs only work for
    firmware enumerated devices (i.e. something that has a fwnode
    handle).

3. Longer term, if we implement LPIs (with an ITS or Direct LPI), to
    be able to support all scenarios such as Live Migration, the
    Hyper-V virtual PCI driver would like to be able to control the
    MSI address & data that gets programmed on the device
   (i.e. .irq_compose_msi_msg). We can use the architectural
   methods for everything else. Does that fit into the realm of
   what would be acceptable upstream?

Thanks,
Sunil
Marc Zyngier Aug. 10, 2021, 1:57 p.m. UTC | #14
On Tue, 10 Aug 2021 02:10:40 +0100,
Sunil Muthuswamy <sunilmut@microsoft.com> wrote:
> 
> On Monday, August 9, 2021 2:15 AM,
> Marc Zyngier <maz@kernel.org> wrote:
> [...]
> > If you plug directly into the GICv3 layer, I'd rather you inject SPIs,
> > just like any other non-architectural MSI controller. You can directly
> > interface with the ACPI GSI layer for that, without any need to mess
> > with the GICv3 internals. The SPI space isn't very large, but still
> > much larger than the equivalent x86 space (close to 1000).
> > 
> > If time is of the essence, I suggest you go the SPI way. For anything
> > involving LPIs, I really want to see a firmware spec that works for
> > everyone as opposed to a localised Hyper-V hack.
> > 
> Ok, thanks. Before we commit to anything, I would like to make sure
> that I am on the same page in terms of your description. With that in
> mind, I have few questions. Hopefully, these should settle the matter.
> 1. If we go with the SPI route, then the way I envision it is that the 
>     Hyper-V vPCI driver will implement an IRQ chip, which will take
>     care of allocating & managing the SPI interrupt for Hyper-V vPCI.
>     This IRQ chip will parent itself to the architectural GIC IRQ chip for
>     general interrupt management. Does that match with your
>     understanding/suggestion as well?

Yes.

> 
> 2. In the above, how will Hyper-V vPCI module discover the
>     architectural GIC IRQ domain generically for virtual devices that
>     are not firmware enumerated? Today, the GIC v3 IRQ domain is
>     not exported and the general 'irq_find_xyz' APIs only work for
>     firmware enumerated devices (i.e. something that has a fwnode
>     handle).

You don't need to discover it with ACPI. You simply instantiate your
own irqdomain using acpi_irq_create_hierarchy(), which will do the
right thing. Your PCI driver will have to create its own fwnode out of
thin air (there is an API for that), and call into this function to
plumb everything.

> 
> 3. Longer term, if we implement LPIs (with an ITS or Direct LPI), to
>     be able to support all scenarios such as Live Migration, the
>     Hyper-V virtual PCI driver would like to be able to control the
>     MSI address & data that gets programmed on the device
>    (i.e. .irq_compose_msi_msg). We can use the architectural
>    methods for everything else. Does that fit into the realm of
>    what would be acceptable upstream?

I cannot see how this works. The address has to match that of the
virtual HW you target (whether this is a redistributor or an ITS), and
the data is only meaningful in that context. And it really shouldn't
matter at all, as I expect you don't let the guest directly write to
the PCI MSI-X table.

If you let the guest have access direct to that table (which seems to
contradict your "live migration" argument), then your best bet is to
use provide a skeletal IOMMU implementation, and get
iommu_dma_compose_msi_msg() to do the remapping. But frankly, that's
horrible and I fully expect the IOMMU people to push back (and that
still doesn't give you any control over the data, only the address).

Thanks,

	M.
diff mbox series

Patch

diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index f88cbf36a9d2..c6ea9620161b 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -32,7 +32,7 @@  obj-$(CONFIG_ARM_GIC_PM)		+= irq-gic-pm.o
 obj-$(CONFIG_ARCH_REALVIEW)		+= irq-gic-realview.o
 obj-$(CONFIG_ARM_GIC_V2M)		+= irq-gic-v2m.o
 obj-$(CONFIG_ARM_GIC_V3)		+= irq-gic-v3.o irq-gic-v3-mbi.o irq-gic-common.o
-obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v3-its-platform-msi.o irq-gic-v4.o
+obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v3-its-platform-msi.o irq-gic-v4.o irq-gic-v3-dlpi.o
 obj-$(CONFIG_ARM_GIC_V3_ITS_PCI)	+= irq-gic-v3-its-pci-msi.o
 obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC)	+= irq-gic-v3-its-fsl-mc-msi.o
 obj-$(CONFIG_PARTITION_PERCPU)		+= irq-partition-percpu.o
diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h
index ccba8b0fe0f5..21e4e8f6ba70 100644
--- a/drivers/irqchip/irq-gic-common.h
+++ b/drivers/irqchip/irq-gic-common.h
@@ -30,4 +30,26 @@  void gic_enable_of_quirks(const struct device_node *np,
 
 void gic_set_kvm_info(const struct gic_kvm_info *info);
 
+/* LPI related functionality */
+/*
+ * TODO: Ideally, I think these should be moved to a different file
+ *	 such as irq-gic-v3-lpi-common.h/.c. But, keeping it here
+ *	 for now to get comments from the RFC.
+ */
+DECLARE_PER_CPU(struct cpu_lpi_count, cpu_lpi_count);
+
+__maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu);
+void its_inc_lpi_count(struct irq_data *d, int cpu);
+void its_dec_lpi_count(struct irq_data *d, int cpu);
+unsigned int cpumask_pick_least_loaded(struct irq_data *d,
+				       const struct cpumask *cpu_mask);
+int its_irq_gic_domain_alloc(struct irq_domain *domain,
+			     unsigned int virq,
+			     irq_hw_number_t hwirq);
+unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids);
+void its_lpi_free(unsigned long *bitmap, u32 base, u32 nr_ids);
+
+struct rdists;
+int direct_lpi_init(struct irq_domain *parent, struct rdists *rdists);
+
 #endif /* _IRQ_GIC_COMMON_H */
diff --git a/drivers/irqchip/irq-gic-v3-dlpi.c b/drivers/irqchip/irq-gic-v3-dlpi.c
new file mode 100644
index 000000000000..722a0630fced
--- /dev/null
+++ b/drivers/irqchip/irq-gic-v3-dlpi.c
@@ -0,0 +1,639 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Author: Sunil Muthuswamy <sunilmut@microsoft.com>
+ *
+ * This file implements an IRQ domain and chip to handle Direct LPI
+ * when there is no ITS, for GIC v3.
+ */
+
+#include <linux/acpi_iort.h>
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
+#include <linux/list.h>
+#include <linux/msi.h>
+#include <linux/irq.h>
+#include <linux/percpu.h>
+#include <linux/dma-iommu.h>
+
+#include <linux/irqchip.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/irqchip/arm-gic-v4.h>
+
+#include "irq-gic-common.h"
+
+static struct rdists *gic_rdists;
+
+#define gic_data_rdist_cpu(cpu)		(per_cpu_ptr(gic_rdists->rdist, cpu))
+
+#define RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING	(1 << 0)
+
+/*
+ * Structure that holds most of the infrastructure needed to support
+ * DirectLPI without an ITS.
+ *
+ * dev_alloc_lock has to be taken for device allocations, while the
+ * spinlock must be taken to parse data structures such as the device
+ * list.
+ */
+
+struct direct_lpi {
+	raw_spinlock_t		lock;
+	struct mutex		dev_alloc_lock;
+	struct list_head	entry;
+	struct fwnode_handle	*fwnode_handle;
+	struct list_head	device_list;
+	u64			flags;
+	unsigned int		msi_domain_flags;
+};
+
+struct event_lpi_map {
+	unsigned long		*lpi_map;
+	u16			*col_map;
+	irq_hw_number_t		lpi_base;
+	int			nr_lpis;
+};
+
+struct direct_lpi_device {
+	struct list_head	entry;
+	struct direct_lpi	*dlpi;
+	struct event_lpi_map	event_map;
+	u32			device_id;
+	bool			shared;
+};
+
+static int dlpi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				unsigned int nr_irqs, void *args);
+
+static void dlpi_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+				unsigned int nr_irqs);
+
+static int dlpi_irq_domain_activate(struct irq_domain *domain,
+				   struct irq_data *d, bool reserve);
+
+static void dlpi_irq_domain_deactivate(struct irq_domain *domain,
+				      struct irq_data *d);
+
+static const struct irq_domain_ops dlpi_domain_ops = {
+	.alloc			= dlpi_irq_domain_alloc,
+	.free			= dlpi_irq_domain_free,
+	.activate		= dlpi_irq_domain_activate,
+	.deactivate		= dlpi_irq_domain_deactivate,
+};
+
+static int dlpi_msi_prepare(struct irq_domain *domain, struct device *dev,
+			   int nvec, msi_alloc_info_t *info);
+
+static struct msi_domain_ops dlpi_msi_domain_ops = {
+	.msi_prepare	= dlpi_msi_prepare,
+};
+
+static int dlpi_init_domain(struct fwnode_handle *handle,
+			      struct irq_domain *parent_domain,
+			      struct direct_lpi *dlpi)
+{
+	struct irq_domain *inner_domain;
+	struct msi_domain_info *info;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	inner_domain = irq_domain_create_tree(handle, &dlpi_domain_ops, NULL);
+	if (!inner_domain) {
+		kfree(info);
+		return -ENOMEM;
+	}
+
+	inner_domain->parent = parent_domain;
+	irq_domain_update_bus_token(inner_domain, DOMAIN_BUS_NEXUS);
+	inner_domain->flags |= dlpi->msi_domain_flags;
+	info->ops = &dlpi_msi_domain_ops;
+	info->data = dlpi;
+	inner_domain->host_data = info;
+
+	return 0;
+}
+
+int __init direct_lpi_init(struct irq_domain *parent, struct rdists *rdists)
+{
+	struct fwnode_handle *fwnode;
+	int err;
+	struct direct_lpi *dlpi = NULL;
+
+	gic_rdists = rdists;
+	fwnode = irq_domain_alloc_named_fwnode("Direct LPI");
+	if (!fwnode)
+		return -ENOMEM;
+
+	/*
+	 * Registering with the iort allows other services to query the
+	 * fwnode. But, the registration requires an ITS ID and base address,
+	 * which does not apply here. So, probably best to just export the
+	 * fwnode handle for other services. Keeping it here for comments
+	 * from RFC submission.
+	 */
+	err = iort_register_domain_token(0, 0, fwnode);
+	if (err)
+		goto out_free_fwnode;
+
+	dlpi = kzalloc(sizeof(*dlpi), GFP_KERNEL);
+	if (!dlpi) {
+		err = -ENOMEM;
+		goto out_unregister_fwnode;
+	}
+
+	raw_spin_lock_init(&dlpi->lock);
+	mutex_init(&dlpi->dev_alloc_lock);
+	INIT_LIST_HEAD(&dlpi->entry);
+	INIT_LIST_HEAD(&dlpi->device_list);
+	dlpi->msi_domain_flags = IRQ_DOMAIN_FLAG_MSI_REMAP;
+	err = dlpi_init_domain(fwnode, parent, dlpi);
+	if (err)
+		goto out_unregister_fwnode;
+
+	return 0;
+
+out_unregister_fwnode:
+	iort_deregister_domain_token(0);
+out_free_fwnode:
+	irq_domain_free_fwnode(fwnode);
+	kfree(dlpi);
+
+	return err;
+
+}
+
+static inline u32 dlpi_get_event_id(struct irq_data *d)
+{
+	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
+
+	return d->hwirq - dlpi_dev->event_map.lpi_base;
+}
+
+static int dlpi_irq_to_cpuid(struct irq_data *d, unsigned long *flags)
+{
+	int cpu;
+
+	/* Physical LPIs are already locked via the irq_desc lock */
+	struct direct_lpi_device *dlpi_dev =
+		irq_data_get_irq_chip_data(d);
+	cpu = dlpi_dev->event_map.col_map[dlpi_get_event_id(d)];
+	/* Keep GCC quiet... */
+	*flags = 0;
+
+	return cpu;
+}
+
+/*
+ * irqchip functions - assumes MSI, mostly.
+ */
+//TODO: Maybe its::lpi_write_config can call into this routine
+static void lpi_write_config(struct irq_data *d, u8 clr, u8 set)
+{
+	irq_hw_number_t hwirq;
+	void *va;
+	u8 *cfg;
+
+	va = gic_rdists->prop_table_va;
+	hwirq = d->hwirq;
+	cfg = va + hwirq - 8192;
+	*cfg &= ~clr;
+	*cfg |= set | LPI_PROP_GROUP1;
+
+	/*
+	 * Make the above write visible to the redistributors.
+	 * And yes, we're flushing exactly: One. Single. Byte.
+	 * Humpf...
+	 */
+	if (gic_rdists->flags & RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING)
+		gic_flush_dcache_to_poc(cfg, sizeof(*cfg));
+	else
+		dsb(ishst);
+}
+
+static void wait_for_syncr(void __iomem *rdbase)
+{
+	while (readl_relaxed(rdbase + GICR_SYNCR) & 1)
+		cpu_relax();
+}
+
+static void dlpi_direct_lpi_inv(struct irq_data *d)
+{
+	void __iomem *rdbase;
+	unsigned long flags;
+	u64 val;
+	int cpu;
+
+	val = d->hwirq;
+
+	/* Target the redistributor this LPI is currently routed to */
+	cpu = dlpi_irq_to_cpuid(d, &flags);
+	raw_spin_lock(&gic_data_rdist_cpu(cpu)->rd_lock);
+	rdbase = per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base;
+	gic_write_lpir(val, rdbase + GICR_INVLPIR);
+	wait_for_syncr(rdbase);
+	raw_spin_unlock(&gic_data_rdist_cpu(cpu)->rd_lock);
+}
+
+static int dlpi_alloc_device_irq(struct direct_lpi_device *dlpi_dev,
+				 int nvecs, irq_hw_number_t *hwirq)
+{
+	int idx;
+
+	/* Find a free LPI region in lpi_map and allocate them. */
+	idx = bitmap_find_free_region(dlpi_dev->event_map.lpi_map,
+				      dlpi_dev->event_map.nr_lpis,
+				      get_count_order(nvecs));
+	if (idx < 0)
+		return -ENOSPC;
+
+	*hwirq = dlpi_dev->event_map.lpi_base + idx;
+
+	return 0;
+}
+
+static void no_lpi_update_config(struct irq_data *d, u8 clr, u8 set)
+{
+	lpi_write_config(d, clr, set);
+	dlpi_direct_lpi_inv(d);
+}
+
+static void dlpi_unmask_irq(struct irq_data *d)
+{
+	no_lpi_update_config(d, 0, LPI_PROP_ENABLED);
+}
+
+static void dlpi_mask_irq(struct irq_data *d)
+{
+	no_lpi_update_config(d, LPI_PROP_ENABLED, 0);
+}
+
+static int dlpi_select_cpu(struct irq_data *d,
+			   const struct cpumask *aff_mask)
+{
+	cpumask_var_t tmpmask;
+	int cpu;
+
+	if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+		return -ENOMEM;
+
+	/* There is no NUMA node affliation */
+	if (!irqd_affinity_is_managed(d)) {
+		/* Try the intersection of the affinity and online masks */
+		cpumask_and(tmpmask, aff_mask, cpu_online_mask);
+
+		/* If that doesn't fly, the online mask is the last resort */
+		if (cpumask_empty(tmpmask))
+			cpumask_copy(tmpmask, cpu_online_mask);
+
+		cpu = cpumask_pick_least_loaded(d, tmpmask);
+	} else {
+		cpumask_and(tmpmask, irq_data_get_affinity_mask(d), cpu_online_mask);
+		cpu = cpumask_pick_least_loaded(d, tmpmask);
+	}
+
+	free_cpumask_var(tmpmask);
+	pr_debug("IRQ%d -> %*pbl CPU%d\n", d->irq, cpumask_pr_args(aff_mask), cpu);
+
+	return cpu;
+}
+
+static int dlpi_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
+			     bool force)
+{
+	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
+	u32 id = dlpi_get_event_id(d);
+	int cpu, prev_cpu;
+
+	/*
+	 * A forwarded interrupt should use irq_set_vcpu_affinity. Anyways,
+	 * vcpu is not supported for Direct LPI, as it requires an ITS.
+	 */
+	if (irqd_is_forwarded_to_vcpu(d))
+		return -EINVAL;
+
+	prev_cpu = dlpi_dev->event_map.col_map[id];
+	its_dec_lpi_count(d, prev_cpu);
+
+	if (!force)
+		cpu = dlpi_select_cpu(d, mask_val);
+	else
+		cpu = cpumask_pick_least_loaded(d, mask_val);
+
+	if (cpu < 0 || cpu >= nr_cpu_ids)
+		goto err;
+
+	/* don't set the affinity when the target cpu is same as current one */
+	if (cpu != prev_cpu) {
+		dlpi_dev->event_map.col_map[id] = cpu;
+		irq_data_update_effective_affinity(d, cpumask_of(cpu));
+	}
+
+	its_inc_lpi_count(d, cpu);
+
+	return IRQ_SET_MASK_OK_DONE;
+
+err:
+	its_inc_lpi_count(d, prev_cpu);
+	return -EINVAL;
+}
+
+static u64 dlpi_get_msi_base(struct irq_data *d)
+{
+	u64 addr;
+	int cpu;
+	unsigned long flags;
+
+	cpu = dlpi_irq_to_cpuid(d, &flags);
+	addr = (u64)(per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base +
+		     GICR_SETLPIR);
+
+	return addr;
+}
+
+/*
+ * As per the spec, MSI address is the address of the target processor's
+ * GICR_SETLPIR location.
+ */
+static void dlpi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
+{
+	u64 addr;
+
+	addr = dlpi_get_msi_base(d);
+
+	msg->address_lo		= lower_32_bits(addr);
+	msg->address_hi		= upper_32_bits(addr);
+	msg->data		= dlpi_get_event_id(d);
+
+	iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg);
+}
+
+static int dlpi_irq_set_irqchip_state(struct irq_data *d,
+				     enum irqchip_irq_state which,
+				     bool state)
+{
+	if (which != IRQCHIP_STATE_PENDING)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int dlpi_irq_retrigger(struct irq_data *d)
+{
+	return !dlpi_irq_set_irqchip_state(d, IRQCHIP_STATE_PENDING, true);
+}
+
+static int dlpi_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
+{
+	/* vCPU support requires an ITS */
+	return -EINVAL;
+}
+
+static struct irq_chip dlpi_irq_chip = {
+	.name			= "Direct LPI",
+	.irq_mask		= dlpi_mask_irq,
+	.irq_unmask		= dlpi_unmask_irq,
+	.irq_eoi		= irq_chip_eoi_parent,
+	.irq_set_affinity	= dlpi_set_affinity,
+	.irq_compose_msi_msg	= dlpi_irq_compose_msi_msg,
+	.irq_set_irqchip_state	= dlpi_irq_set_irqchip_state,
+	.irq_retrigger		= dlpi_irq_retrigger,
+	.irq_set_vcpu_affinity	= dlpi_irq_set_vcpu_affinity,
+};
+
+static int dlpi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				 unsigned int nr_irqs, void *args)
+{
+	msi_alloc_info_t *info = args;
+	struct direct_lpi_device *dlpi_dev = info->scratchpad[0].ptr;
+	struct irq_data *irqd;
+	irq_hw_number_t hwirq;
+	int err;
+	int i;
+
+	err = dlpi_alloc_device_irq(dlpi_dev, nr_irqs, &hwirq);
+	if (err)
+		return err;
+
+	/*
+	 * TODO: Need to call 'iommu_dma_prepare_msi' to prepare for DMA,
+	 *	 but, that requires an MSI address. And, for Direct LPI
+	 *	 the MSI address comes from the Redistributor from
+	 *	 'GICR_SETLPIR', which is per CPU and that is not known
+	 *	 at the moment. Not sure what is the best way to handle
+	 *	 this.
+	 */
+
+	/*
+	err = iommu_dma_prepare_msi(info->desc, its->get_msi_base(its_dev));
+	if (err)
+		return err;
+	*/
+
+	for (i = 0; i < nr_irqs; i++) {
+		err = its_irq_gic_domain_alloc(domain, virq + i, hwirq + i);
+		if (err)
+			return err;
+
+		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+					      &dlpi_irq_chip, dlpi_dev);
+		irqd = irq_get_irq_data(virq + i);
+		irqd_set_single_target(irqd);
+		irqd_set_affinity_on_activate(irqd);
+		pr_debug("ID:%d pID:%d vID:%d\n",
+			 (int)(hwirq + i - dlpi_dev->event_map.lpi_base),
+			 (int)(hwirq + i), virq + i);
+	}
+
+	return 0;
+}
+
+static void dlpi_free_device(struct direct_lpi_device *dlpi_dev)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&dlpi_dev->dlpi->lock, flags);
+	list_del(&dlpi_dev->entry);
+	raw_spin_unlock_irqrestore(&dlpi_dev->dlpi->lock, flags);
+	kfree(dlpi_dev->event_map.col_map);
+	kfree(dlpi_dev->event_map.lpi_map);
+	kfree(dlpi_dev);
+}
+
+static void dlpi_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+				unsigned int nr_irqs)
+{
+	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
+	int i;
+	struct direct_lpi *dlpi = dlpi_dev->dlpi;
+
+	bitmap_release_region(dlpi_dev->event_map.lpi_map,
+			      dlpi_get_event_id(irq_domain_get_irq_data(domain, virq)),
+			      get_count_order(nr_irqs));
+
+	for (i = 0; i < nr_irqs; i++) {
+		struct irq_data *data = irq_domain_get_irq_data(domain,
+								virq + i);
+		/* Nuke the entry in the domain */
+		irq_domain_reset_irq_data(data);
+	}
+
+	mutex_lock(&dlpi->dev_alloc_lock);
+
+	/*
+	 * If all interrupts have been freed, start mopping the
+	 * floor. This is conditionned on the device not being shared.
+	 */
+	if (!dlpi_dev->shared &&
+	    bitmap_empty(dlpi_dev->event_map.lpi_map,
+			 dlpi_dev->event_map.nr_lpis)) {
+		its_lpi_free(dlpi_dev->event_map.lpi_map,
+			     dlpi_dev->event_map.lpi_base,
+			     dlpi_dev->event_map.nr_lpis);
+
+		dlpi_free_device(dlpi_dev);
+	}
+
+	mutex_unlock(&dlpi->dev_alloc_lock);
+	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+static int dlpi_irq_domain_activate(struct irq_domain *domain,
+				   struct irq_data *d, bool reserve)
+{
+	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
+	u32 event;
+	int cpu;
+
+	event = dlpi_get_event_id(d);
+	cpu = dlpi_select_cpu(d, cpu_online_mask);
+	if (cpu < 0 || cpu >= nr_cpu_ids)
+		return -EINVAL;
+
+	its_inc_lpi_count(d, cpu);
+	dlpi_dev->event_map.col_map[event] = cpu;
+	irq_data_update_effective_affinity(d, cpumask_of(cpu));
+
+	return 0;
+}
+
+static void dlpi_irq_domain_deactivate(struct irq_domain *domain,
+				      struct irq_data *d)
+{
+	struct direct_lpi_device *dlpi_dev = irq_data_get_irq_chip_data(d);
+	u32 event = dlpi_get_event_id(d);
+
+	its_dec_lpi_count(d, dlpi_dev->event_map.col_map[event]);
+}
+
+static struct direct_lpi_device *dlpi_create_device(struct direct_lpi *dlpi,
+					u32 dev_id, int nvecs, bool alloc_lpis)
+{
+	struct direct_lpi_device *dlpi_dev = NULL;
+	unsigned long *lpi_map = NULL;
+	u16 *col_map = NULL;
+	int lpi_base;
+	int nr_lpis;
+	unsigned long flags;
+
+	if (WARN_ON(!is_power_of_2(nvecs)))
+		nvecs = roundup_pow_of_two(nvecs);
+
+	dlpi_dev = kzalloc(sizeof(*dlpi_dev), GFP_KERNEL);
+	if (!dlpi_dev)
+		return NULL;
+
+	lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
+	if (!lpi_map) {
+		kfree(dlpi_dev);
+		return NULL;
+	}
+
+	col_map = kcalloc(nr_lpis, sizeof(*col_map), GFP_KERNEL);
+	if (!col_map) {
+		kfree(dlpi_dev);
+		kfree(lpi_map);
+		return NULL;
+	}
+
+	dlpi_dev->dlpi = dlpi;
+	dlpi_dev->event_map.lpi_map = lpi_map;
+	dlpi_dev->event_map.col_map = col_map;
+	dlpi_dev->event_map.lpi_base = lpi_base;
+	dlpi_dev->event_map.nr_lpis = nr_lpis;
+	dlpi_dev->device_id = dev_id;
+
+	raw_spin_lock_irqsave(&dlpi->lock, flags);
+	list_add(&dlpi_dev->entry, &dlpi->device_list);
+	raw_spin_unlock_irqrestore(&dlpi->lock, flags);
+
+	return dlpi_dev;
+}
+
+static struct direct_lpi_device *dlpi_find_device(struct direct_lpi *dlpi, u32 dev_id)
+{
+	struct direct_lpi_device *dlpi_dev = NULL, *tmp;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&dlpi->lock, flags);
+	list_for_each_entry(tmp, &dlpi->device_list, entry) {
+		if (tmp->device_id == dev_id) {
+			dlpi_dev = tmp;
+			break;
+		}
+	}
+
+	raw_spin_unlock_irqrestore(&dlpi->lock, flags);
+
+	return dlpi_dev;
+}
+
+static int dlpi_msi_prepare(struct irq_domain *domain, struct device *dev,
+			   int nvec, msi_alloc_info_t *info)
+{
+	struct direct_lpi_device *dlpi_dev;
+	struct direct_lpi *dlpi;
+	struct msi_domain_info *msi_info;
+	u32 dev_id;
+	int err = 0;
+
+	/*
+	 * We ignore "dev" entirely, and rely on the dev_id that has
+	 * been passed via the scratchpad. This limits this domain's
+	 * usefulness to upper layers that definitely know that they
+	 * are built on top of the ITS.
+	 */
+	dev_id = info->scratchpad[0].ul;
+	msi_info = msi_get_domain_info(domain);
+	dlpi = msi_info->data;
+
+	mutex_lock(&dlpi->dev_alloc_lock);
+	dlpi_dev = dlpi_find_device(dlpi, dev_id);
+	if (dlpi_dev) {
+		/*
+		 * We already have seen this ID, probably through
+		 * another alias (PCI bridge of some sort). No need to
+		 * create the device.
+		 */
+		dlpi_dev->shared = true;
+		pr_debug("Reusing ITT for devID %x\n", dev_id);
+		goto out;
+	}
+
+	dlpi_dev = dlpi_create_device(dlpi, dev_id, nvec, true);
+	if (!dlpi_dev) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+out:
+	mutex_unlock(&dlpi->dev_alloc_lock);
+	info->scratchpad[0].ptr = dlpi_dev;
+
+	return err;
+}
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index ba39668c3e08..aa101dfcbbfc 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -178,7 +178,7 @@  struct cpu_lpi_count {
 	atomic_t	unmanaged;
 };
 
-static DEFINE_PER_CPU(struct cpu_lpi_count, cpu_lpi_count);
+DEFINE_PER_CPU(struct cpu_lpi_count, cpu_lpi_count);
 
 static LIST_HEAD(its_nodes);
 static DEFINE_RAW_SPINLOCK(its_lock);
@@ -1521,7 +1521,7 @@  static void its_unmask_irq(struct irq_data *d)
 	lpi_update_config(d, 0, LPI_PROP_ENABLED);
 }
 
-static __maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
+__maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
 {
 	if (irqd_affinity_is_managed(d))
 		return atomic_read(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed);
@@ -1529,7 +1529,7 @@  static __maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
 	return atomic_read(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
 }
 
-static void its_inc_lpi_count(struct irq_data *d, int cpu)
+void its_inc_lpi_count(struct irq_data *d, int cpu)
 {
 	if (irqd_affinity_is_managed(d))
 		atomic_inc(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed);
@@ -1537,7 +1537,7 @@  static void its_inc_lpi_count(struct irq_data *d, int cpu)
 		atomic_inc(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
 }
 
-static void its_dec_lpi_count(struct irq_data *d, int cpu)
+void its_dec_lpi_count(struct irq_data *d, int cpu)
 {
 	if (irqd_affinity_is_managed(d))
 		atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed);
@@ -1545,7 +1545,7 @@  static void its_dec_lpi_count(struct irq_data *d, int cpu)
 		atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
 }
 
-static unsigned int cpumask_pick_least_loaded(struct irq_data *d,
+unsigned int cpumask_pick_least_loaded(struct irq_data *d,
 					      const struct cpumask *cpu_mask)
 {
 	unsigned int cpu = nr_cpu_ids, tmp;
@@ -2121,7 +2121,7 @@  static int __init its_lpi_init(u32 id_bits)
 	return err;
 }
 
-static unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
+unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
 {
 	unsigned long *bitmap = NULL;
 	int err = 0;
@@ -2153,7 +2153,7 @@  static unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
 	return bitmap;
 }
 
-static void its_lpi_free(unsigned long *bitmap, u32 base, u32 nr_ids)
+void its_lpi_free(unsigned long *bitmap, u32 base, u32 nr_ids)
 {
 	WARN_ON(free_lpi_range(base, nr_ids));
 	kfree(bitmap);
@@ -3506,9 +3506,9 @@  static struct msi_domain_ops its_msi_domain_ops = {
 	.msi_prepare	= its_msi_prepare,
 };
 
-static int its_irq_gic_domain_alloc(struct irq_domain *domain,
-				    unsigned int virq,
-				    irq_hw_number_t hwirq)
+int its_irq_gic_domain_alloc(struct irq_domain *domain,
+			     unsigned int virq,
+			     irq_hw_number_t hwirq)
 {
 	struct irq_fwspec fwspec;
 
@@ -5186,16 +5186,21 @@  static int redist_disable_lpis(void)
 
 int its_cpu_init(void)
 {
-	if (!list_empty(&its_nodes)) {
-		int ret;
+	int ret;
+
+	/* LPI's can be supported with an ITS or with DirectLPI */
+	if (list_empty(&its_nodes) && !gic_rdists->has_direct_lpi)
+		return 0;
+
+	ret = redist_disable_lpis();
+	if (ret)
+		return ret;
 
-		ret = redist_disable_lpis();
-		if (ret)
-			return ret;
+	its_cpu_init_lpis();
 
-		its_cpu_init_lpis();
+	/* collections require an ITS */
+	if (!list_empty(&its_nodes))
 		its_cpu_init_collections();
-	}
 
 	return 0;
 }
@@ -5402,8 +5407,16 @@  int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
 		its_acpi_probe();
 
 	if (list_empty(&its_nodes)) {
-		pr_warn("ITS: No ITS available, not enabling LPIs\n");
-		return -ENXIO;
+		/* DirectLPI without ITS is architecturally supported. */
+		if (gic_rdists->has_direct_lpi) {
+			pr_info("Direct LPI with no ITS\n");
+			err = direct_lpi_init(parent_domain, rdists);
+			if (err)
+				return err;
+		} else {
+			pr_warn("ITS: No ITS or Direct LPI available, not enabling LPIs\n");
+			return -ENXIO;
+		}
 	}
 
 	err = allocate_lpi_tables();