diff mbox series

[v9,2/6] PCI: endpoint: Add RC-to-EP doorbell support using platform MSI controller

Message ID 20241203-ep-msi-v9-2-a60dbc3f15dd@nxp.com (mailing list archive)
State Superseded
Headers show
Series PCI: EP: Add RC-to-EP doorbell with platform MSI controller | expand

Commit Message

Frank Li Dec. 3, 2024, 8:36 p.m. UTC
Doorbell feature is implemented by mapping the EP's MSI interrupt
controller message address to a dedicated BAR in the EPC core. It is the
responsibility of the EPF driver to pass the actual message data to be
written by the host to the doorbell BAR region through its own logic.

Tested-by: Niklas Cassel <cassel@kernel.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
---
Chagne from v8 to v9
- sort header file
- use pci_epc_get(dev_name(msi_desc_to_dev(desc)));
- check epf number at pci_epf_alloc_doorbell
- Add comments for miss msi-parent case

change from v5 to v8
-none

Change from v4 to v5
- Remove request_irq() in pci_epc_alloc_doorbell() and leave to EP function
driver, so ep function driver can register differece call back function for
difference doorbell events and set irq affinity to differece CPU core.
- Improve error message when MSI allocate failure.

Change from v3 to v4
- msi change to use msi_get_virq() avoid use msi_for_each_desc().
- add new struct for pci_epf_doorbell_msg to msi msg,virq and irq name.
- move mutex lock to epc function
- initialize variable at declear place.
- passdown epf to epc*() function to simplify code.
---
 drivers/pci/endpoint/Makefile     |   2 +-
 drivers/pci/endpoint/pci-ep-msi.c | 106 ++++++++++++++++++++++++++++++++++++++
 include/linux/pci-ep-msi.h        |  15 ++++++
 include/linux/pci-epf.h           |  16 ++++++
 4 files changed, 138 insertions(+), 1 deletion(-)

Comments

Thomas Gleixner Dec. 3, 2024, 10:15 p.m. UTC | #1
On Tue, Dec 03 2024 at 15:36, Frank Li wrote:
> +static void pci_epc_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
> +{
> +	struct pci_epc *epc;
> +	struct pci_epf *epf;
> +
> +	epc = pci_epc_get(dev_name(msi_desc_to_dev(desc)));
> +	if (!epc)

This is wrong as pci_epc_get() never returns NULL on failure. It returns
an error pointer.

> +		return;
> +
> +	epf = list_first_entry_or_null(&epc->pci_epf, struct pci_epf, list);

How can the list be empty?

> +	if (epf && epf->db_msg && desc->msi_index < epf->num_db)
> +		memcpy(&epf->db_msg[desc->msi_index].msg, msg, sizeof(*msg));

So now the message is copied out into that db_msg array which is
somewhere in the memory which was allocated on the EP side.

How is the host side supposed to know about the change of the message?

This only works reliably if:

  1) the message address/data pair is immutable once it is set up and
     subsequent affinity changes are not affecting it

  2) The ordering on the EP driver is:

     request_irq()
     publish_msg_to_host()
     tell_host_that_message_is_ready()

#2 is a documentation problem, but #1 needs some thought.

It only works for MSI parent domains which use a translation table and
affinity changes only happen at the translation table level, which means
the address/data pair is unaffected.

Sure GIC-ITS, AMD/Intel remap domains work that way, but what happens if
the underlying MSI parent domain actually changes the message
(address/data pair) during an affinity change?

These domains expect that the message is known to the other side at the
time when irq_set_affinity() returns. In case of regular PCI/MSI this is
not a problem because the message is written to the device before the
function returns, but in this EP case nothing guarantees that the
modified message is host visible at that point.

The fact that a MSI parent domain supports DOMAIN_BUS_DEVICE_MSI does
not guarantee that the parent is translation table based.

As this is intended to be a generic library for all sorts of EP
implementations, there needs to be

  - either a mechanism to prevent the initialization if the underlying
    MSI parent domain does not provide immutable messages

  - or support for endpoint specific msi_write_msg() implementations

Thanks,

        tglx
Frank Li Dec. 3, 2024, 11:24 p.m. UTC | #2
On Tue, Dec 03, 2024 at 11:15:27PM +0100, Thomas Gleixner wrote:
> On Tue, Dec 03 2024 at 15:36, Frank Li wrote:
> > +static void pci_epc_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
> > +{
> > +	struct pci_epc *epc;
> > +	struct pci_epf *epf;
> > +
> > +	epc = pci_epc_get(dev_name(msi_desc_to_dev(desc)));
> > +	if (!epc)
>
> This is wrong as pci_epc_get() never returns NULL on failure. It returns
> an error pointer.
>
> > +		return;
> > +
> > +	epf = list_first_entry_or_null(&epc->pci_epf, struct pci_epf, list);
>
> How can the list be empty?

It already checked at pci_epc_alloc_doorbell(), which should be never
empty when this function called.

>
> > +	if (epf && epf->db_msg && desc->msi_index < epf->num_db)
> > +		memcpy(&epf->db_msg[desc->msi_index].msg, msg, sizeof(*msg));
>
> So now the message is copied out into that db_msg array which is
> somewhere in the memory which was allocated on the EP side.
>
> How is the host side supposed to know about the change of the message?
>
> This only works reliably if:
>
>   1) the message address/data pair is immutable once it is set up and
>      subsequent affinity changes are not affecting it
>
>   2) The ordering on the EP driver is:
>
>      request_irq()
>      publish_msg_to_host()
>      tell_host_that_message_is_ready()
>
> #2 is a documentation problem, but #1 needs some thought.
>
> It only works for MSI parent domains which use a translation table and
> affinity changes only happen at the translation table level, which means
> the address/data pair is unaffected.
>
> Sure GIC-ITS, AMD/Intel remap domains work that way, but what happens if
> the underlying MSI parent domain actually changes the message
> (address/data pair) during an affinity change?
>
> These domains expect that the message is known to the other side at the
> time when irq_set_affinity() returns. In case of regular PCI/MSI this is
> not a problem because the message is written to the device before the
> function returns, but in this EP case nothing guarantees that the
> modified message is host visible at that point.

If irq_set_affinity() can change address/data pair, how to avoid below
raise condition:
	1. device send out write data to address, but write command still
in bus fabric or some internal command FIFO, not reach MSI controller yet.
	2. irq_set_affinity() change address/data pair.

1 and 2 is totally async. if 2 affect firstly, 1 maybe missed.

>
> The fact that a MSI parent domain supports DOMAIN_BUS_DEVICE_MSI does
> not guarantee that the parent is translation table based.
>
> As this is intended to be a generic library for all sorts of EP
> implementations, there needs to be
>
>   - either a mechanism to prevent the initialization if the underlying
>     MSI parent domain does not provide immutable messages

How to know such information?

>
>   - or support for endpoint specific msi_write_msg() implementations

Even provide specific msi_write_msg(), write to address/data to shared
memory.

host driver:
1. read address/data from shared memory
2. write data to address.

1 and 2 is not atomic. So it can't avoid above raise conditon.

Frank

>
> Thanks,
>
>         tglx
>
Thomas Gleixner Dec. 4, 2024, 1:08 p.m. UTC | #3
On Tue, Dec 03 2024 at 18:24, Frank Li wrote:
> On Tue, Dec 03, 2024 at 11:15:27PM +0100, Thomas Gleixner wrote:
>> The fact that a MSI parent domain supports DOMAIN_BUS_DEVICE_MSI does
>> not guarantee that the parent is translation table based.
>>
>> As this is intended to be a generic library for all sorts of EP
>> implementations, there needs to be
>>
>>   - either a mechanism to prevent the initialization if the underlying
>>     MSI parent domain does not provide immutable messages
>
> How to know such information?

It obviously needs to be flagged somehow in the domain and the EP magic
needs to check that flag. 

>>
>>   - or support for endpoint specific msi_write_msg() implementations
>
> Even provide specific msi_write_msg(), write to address/data to shared
> memory.
>
> host driver:
> 1. read address/data from shared memory
> 2. write data to address.
>
> 1 and 2 is not atomic. So it can't avoid above raise conditon.

Correct. So you cannot support that case, which in turn requires to have
a mechanism to check for the immutable property.

Thanks,

        tglx
diff mbox series

Patch

diff --git a/drivers/pci/endpoint/Makefile b/drivers/pci/endpoint/Makefile
index 95b2fe47e3b06..a1ccce440c2c5 100644
--- a/drivers/pci/endpoint/Makefile
+++ b/drivers/pci/endpoint/Makefile
@@ -5,4 +5,4 @@ 
 
 obj-$(CONFIG_PCI_ENDPOINT_CONFIGFS)	+= pci-ep-cfs.o
 obj-$(CONFIG_PCI_ENDPOINT)		+= pci-epc-core.o pci-epf-core.o\
-					   pci-epc-mem.o functions/
+					   pci-epc-mem.o pci-ep-msi.o functions/
diff --git a/drivers/pci/endpoint/pci-ep-msi.c b/drivers/pci/endpoint/pci-ep-msi.c
new file mode 100644
index 0000000000000..8f92f447712d8
--- /dev/null
+++ b/drivers/pci/endpoint/pci-ep-msi.c
@@ -0,0 +1,106 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Endpoint *Controller* (EPC) MSI library
+ *
+ * Copyright (C) 2024 NXP
+ * Author: Frank Li <Frank.Li@nxp.com>
+ */
+
+#include <linux/cleanup.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/pci-epc.h>
+#include <linux/pci-epf.h>
+#include <linux/pci-ep-cfs.h>
+#include <linux/pci-ep-msi.h>
+#include <linux/slab.h>
+
+static void pci_epc_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+	struct pci_epc *epc;
+	struct pci_epf *epf;
+
+	epc = pci_epc_get(dev_name(msi_desc_to_dev(desc)));
+	if (!epc)
+		return;
+
+	epf = list_first_entry_or_null(&epc->pci_epf, struct pci_epf, list);
+
+	if (epf && epf->db_msg && desc->msi_index < epf->num_db)
+		memcpy(&epf->db_msg[desc->msi_index].msg, msg, sizeof(*msg));
+
+	pci_epc_put(epc);
+}
+
+static void pci_epc_free_doorbell(struct pci_epc *epc, struct pci_epf *epf)
+{
+	guard(mutex)(&epc->lock);
+
+	platform_device_msi_free_irqs_all(epc->dev.parent);
+}
+
+static int pci_epc_alloc_doorbell(struct pci_epc *epc, struct pci_epf *epf)
+{
+	struct device *dev = epc->dev.parent;
+	u16 num_db = epf->num_db;
+	int i = 0;
+	int ret;
+
+	guard(mutex)(&epc->lock);
+
+	if (list_first_entry_or_null(&epc->pci_epf, struct pci_epf, list) != epf) {
+		dev_err(dev, "MSI doorbell only support one endpoint function\n");
+		return -EINVAL;
+	}
+
+	ret = platform_device_msi_init_and_alloc_irqs(dev, num_db, pci_epc_write_msi_msg);
+	if (ret) {
+		/*
+		 * The pcie_ep DT node has to specify 'msi-parent' for EP
+		 * doorbell support to work. Right now only GIC ITS is
+		 * supported. If you have GIC ITS and reached this print,
+		 * perhaps you are missing 'msi-parent' in DT.
+		 */
+		dev_err(dev, "Failed to allocate MSI\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < num_db; i++)
+		epf->db_msg[i].virq = msi_get_virq(dev, i);
+
+	return 0;
+};
+
+int pci_epf_alloc_doorbell(struct pci_epf *epf, u16 num_db)
+{
+	struct pci_epc *epc = epf->epc;
+	void *msg;
+	int ret;
+
+	msg = kcalloc(num_db, sizeof(struct pci_epf_doorbell_msg), GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	epf->num_db = num_db;
+	epf->db_msg = msg;
+
+	ret = pci_epc_alloc_doorbell(epc, epf);
+	if (ret)
+		kfree(msg);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_epf_alloc_doorbell);
+
+void pci_epf_free_doorbell(struct pci_epf *epf)
+{
+	struct pci_epc *epc = epf->epc;
+
+	pci_epc_free_doorbell(epc, epf);
+
+	kfree(epf->db_msg);
+	epf->db_msg = NULL;
+	epf->num_db = 0;
+}
+EXPORT_SYMBOL_GPL(pci_epf_free_doorbell);
diff --git a/include/linux/pci-ep-msi.h b/include/linux/pci-ep-msi.h
new file mode 100644
index 0000000000000..f0cfecf491199
--- /dev/null
+++ b/include/linux/pci-ep-msi.h
@@ -0,0 +1,15 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PCI Endpoint *Function* side MSI header file
+ *
+ * Copyright (C) 2024 NXP
+ * Author: Frank Li <Frank.Li@nxp.com>
+ */
+
+#ifndef __PCI_EP_MSI__
+#define __PCI_EP_MSI__
+
+int pci_epf_alloc_doorbell(struct pci_epf *epf, u16 nums);
+void pci_epf_free_doorbell(struct pci_epf *epf);
+
+#endif /* __PCI_EP_MSI__ */
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 18a3aeb62ae4e..5374e6515ffa0 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -12,6 +12,7 @@ 
 #include <linux/configfs.h>
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
+#include <linux/msi.h>
 #include <linux/pci.h>
 
 struct pci_epf;
@@ -125,6 +126,17 @@  struct pci_epf_bar {
 	int		flags;
 };
 
+/**
+ * struct pci_epf_doorbell_msg - represents doorbell message
+ * @msi_msg: MSI message
+ * @virq: irq number of this doorbell MSI message
+ * @name: irq name for doorbell interrupt
+ */
+struct pci_epf_doorbell_msg {
+	struct msi_msg msg;
+	int virq;
+};
+
 /**
  * struct pci_epf - represents the PCI EPF device
  * @dev: the PCI EPF device
@@ -152,6 +164,8 @@  struct pci_epf_bar {
  * @vfunction_num_map: bitmap to manage virtual function number
  * @pci_vepf: list of virtual endpoint functions associated with this function
  * @event_ops: Callbacks for capturing the EPC events
+ * @db_msg: data for MSI from RC side
+ * @num_db: number of doorbells
  */
 struct pci_epf {
 	struct device		dev;
@@ -182,6 +196,8 @@  struct pci_epf {
 	unsigned long		vfunction_num_map;
 	struct list_head	pci_vepf;
 	const struct pci_epc_event_ops *event_ops;
+	struct pci_epf_doorbell_msg *db_msg;
+	u16 num_db;
 };
 
 /**