diff mbox series

[04/14] cxl/mem: Implement polled mode mailbox

Message ID 20210130002438.1872527-5-ben.widawsky@intel.com (mailing list archive)
State Superseded
Headers show
Series CXL 2.0 Support | expand

Commit Message

Ben Widawsky Jan. 30, 2021, 12:24 a.m. UTC
Provide enough functionality to utilize the mailbox of a memory device.
The mailbox is used to interact with the firmware running on the memory
device.

The CXL specification defines separate capabilities for the mailbox and
the memory device. The mailbox interface has a doorbell to indicate
ready to accept commands and the memory device has a capability register
that indicates the mailbox interface is ready. The expectation is that
the doorbell-ready is always later than the memory-device-indication
that the mailbox is ready.

Create a function to handle sending a command, optionally with a
payload, to the memory device, polling on a result, and then optionally
copying out the payload. The algorithm for doing this comes straight out
of the CXL 2.0 specification.

Primary mailboxes are capable of generating an interrupt when submitting
a command in the background. That implementation is saved for a later
time.

Secondary mailboxes aren't implemented at this time.

The flow is proven with one implemented command, "identify". Because the
class code has already told the driver this is a memory device and the
identify command is mandatory.

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
---
 drivers/cxl/Kconfig |  14 ++
 drivers/cxl/cxl.h   |  39 +++++
 drivers/cxl/mem.c   | 342 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 394 insertions(+), 1 deletion(-)

Comments

David Rientjes Jan. 30, 2021, 11:51 p.m. UTC | #1
On Fri, 29 Jan 2021, Ben Widawsky wrote:

> Provide enough functionality to utilize the mailbox of a memory device.
> The mailbox is used to interact with the firmware running on the memory
> device.
> 
> The CXL specification defines separate capabilities for the mailbox and
> the memory device. The mailbox interface has a doorbell to indicate
> ready to accept commands and the memory device has a capability register
> that indicates the mailbox interface is ready. The expectation is that
> the doorbell-ready is always later than the memory-device-indication
> that the mailbox is ready.
> 
> Create a function to handle sending a command, optionally with a
> payload, to the memory device, polling on a result, and then optionally
> copying out the payload. The algorithm for doing this comes straight out
> of the CXL 2.0 specification.
> 
> Primary mailboxes are capable of generating an interrupt when submitting
> a command in the background. That implementation is saved for a later
> time.
> 
> Secondary mailboxes aren't implemented at this time.
> 
> The flow is proven with one implemented command, "identify". Because the
> class code has already told the driver this is a memory device and the
> identify command is mandatory.
> 
> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> ---
>  drivers/cxl/Kconfig |  14 ++
>  drivers/cxl/cxl.h   |  39 +++++
>  drivers/cxl/mem.c   | 342 +++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 394 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> index 3b66b46af8a0..fe591f74af96 100644
> --- a/drivers/cxl/Kconfig
> +++ b/drivers/cxl/Kconfig
> @@ -32,4 +32,18 @@ config CXL_MEM
>  	  Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
>  
>  	  If unsure say 'm'.
> +
> +config CXL_MEM_INSECURE_DEBUG
> +	bool "CXL.mem debugging"
> +	depends on CXL_MEM
> +	help
> +	  Enable debug of all CXL command payloads.
> +
> +	  Some CXL devices and controllers support encryption and other
> +	  security features. The payloads for the commands that enable
> +	  those features may contain sensitive clear-text security
> +	  material. Disable debug of those command payloads by default.
> +	  If you are a kernel developer actively working on CXL
> +	  security enabling say Y, otherwise say N.

Not specific to this patch, but the reference to encryption made me 
curious about integrity: are all CXL.mem devices compatible with DIMP? 
Some?  None?

> +
>  endif
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index a3da7f8050c4..df3d97154b63 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -31,9 +31,36 @@
>  #define CXLDEV_MB_CAPS_OFFSET 0x00
>  #define   CXLDEV_MB_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
>  #define CXLDEV_MB_CTRL_OFFSET 0x04
> +#define   CXLDEV_MB_CTRL_DOORBELL BIT(0)
>  #define CXLDEV_MB_CMD_OFFSET 0x08
> +#define   CXLDEV_MB_CMD_COMMAND_OPCODE_MASK GENMASK(15, 0)
> +#define   CXLDEV_MB_CMD_PAYLOAD_LENGTH_MASK GENMASK(36, 16)
>  #define CXLDEV_MB_STATUS_OFFSET 0x10
> +#define   CXLDEV_MB_STATUS_RET_CODE_MASK GENMASK(47, 32)
>  #define CXLDEV_MB_BG_CMD_STATUS_OFFSET 0x18
> +#define CXLDEV_MB_PAYLOAD_OFFSET 0x20
> +
> +/* Memory Device (CXL 2.0 - 8.2.8.5.1.1) */
> +#define CXLMDEV_STATUS_OFFSET 0x0
> +#define   CXLMDEV_DEV_FATAL BIT(0)
> +#define   CXLMDEV_FW_HALT BIT(1)
> +#define   CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2)
> +#define     CXLMDEV_MS_NOT_READY 0
> +#define     CXLMDEV_MS_READY 1
> +#define     CXLMDEV_MS_ERROR 2
> +#define     CXLMDEV_MS_DISABLED 3
> +#define   CXLMDEV_READY(status) \
> +		(CXL_GET_FIELD(status, CXLMDEV_STATUS_MEDIA_STATUS) == CXLMDEV_MS_READY)
> +#define   CXLMDEV_MBOX_IF_READY BIT(4)
> +#define   CXLMDEV_RESET_NEEDED_SHIFT 5
> +#define   CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5)
> +#define     CXLMDEV_RESET_NEEDED_NOT 0
> +#define     CXLMDEV_RESET_NEEDED_COLD 1
> +#define     CXLMDEV_RESET_NEEDED_WARM 2
> +#define     CXLMDEV_RESET_NEEDED_HOT 3
> +#define     CXLMDEV_RESET_NEEDED_CXL 4
> +#define   CXLMDEV_RESET_NEEDED(status) \
> +		(CXL_GET_FIELD(status, CXLMDEV_RESET_NEEDED) != CXLMDEV_RESET_NEEDED_NOT)
>  
>  /**
>   * struct cxl_mem - A CXL memory device
> @@ -44,6 +71,16 @@ struct cxl_mem {
>  	struct pci_dev *pdev;
>  	void __iomem *regs;
>  
> +	struct {
> +		struct range range;
> +	} pmem;
> +
> +	struct {
> +		struct range range;
> +	} ram;
> +
> +	char firmware_version[0x10];
> +
>  	/* Cap 0001h - CXL_CAP_CAP_ID_DEVICE_STATUS */
>  	struct {
>  		void __iomem *regs;
> @@ -51,6 +88,7 @@ struct cxl_mem {
>  
>  	/* Cap 0002h - CXL_CAP_CAP_ID_PRIMARY_MAILBOX */
>  	struct {
> +		struct mutex mutex; /* Protects device mailbox and firmware */
>  		void __iomem *regs;
>  		size_t payload_size;
>  	} mbox;
> @@ -89,5 +127,6 @@ struct cxl_mem {
>  
>  cxl_reg(status);
>  cxl_reg(mbox);
> +cxl_reg(mem);
>  
>  #endif /* __CXL_H__ */
> diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> index fa14d51243ee..69ed15bfa5d4 100644
> --- a/drivers/cxl/mem.c
> +++ b/drivers/cxl/mem.c
> @@ -6,6 +6,270 @@
>  #include "pci.h"
>  #include "cxl.h"
>  
> +#define cxl_doorbell_busy(cxlm)                                                \
> +	(cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
> +	 CXLDEV_MB_CTRL_DOORBELL)
> +
> +#define CXL_MAILBOX_TIMEOUT_US 2000

This should be _MS?

> +
> +enum opcode {
> +	CXL_MBOX_OP_IDENTIFY		= 0x4000,
> +	CXL_MBOX_OP_MAX			= 0x10000
> +};
> +
> +/**
> + * struct mbox_cmd - A command to be submitted to hardware.
> + * @opcode: (input) The command set and command submitted to hardware.
> + * @payload_in: (input) Pointer to the input payload.
> + * @payload_out: (output) Pointer to the output payload. Must be allocated by
> + *		 the caller.
> + * @size_in: (input) Number of bytes to load from @payload.
> + * @size_out: (output) Number of bytes loaded into @payload.
> + * @return_code: (output) Error code returned from hardware.
> + *
> + * This is the primary mechanism used to send commands to the hardware.
> + * All the fields except @payload_* correspond exactly to the fields described in
> + * Command Register section of the CXL 2.0 spec (8.2.8.4.5). @payload_in and
> + * @payload_out are written to, and read from the Command Payload Registers
> + * defined in (8.2.8.4.8).
> + */
> +struct mbox_cmd {
> +	u16 opcode;
> +	void *payload_in;
> +	void *payload_out;
> +	size_t size_in;
> +	size_t size_out;
> +	u16 return_code;
> +#define CXL_MBOX_SUCCESS 0
> +};
> +
> +static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
> +{
> +	const int timeout = msecs_to_jiffies(CXL_MAILBOX_TIMEOUT_US);
> +	const unsigned long start = jiffies;
> +	unsigned long end = start;
> +
> +	while (cxl_doorbell_busy(cxlm)) {
> +		end = jiffies;
> +
> +		if (time_after(end, start + timeout)) {
> +			/* Check again in case preempted before timeout test */
> +			if (!cxl_doorbell_busy(cxlm))
> +				break;
> +			return -ETIMEDOUT;
> +		}
> +		cpu_relax();
> +	}
> +
> +	dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
> +		jiffies_to_msecs(end) - jiffies_to_msecs(start));
> +	return 0;
> +}
> +
> +static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
> +				 struct mbox_cmd *mbox_cmd)
> +{
> +	dev_warn(&cxlm->pdev->dev, "Mailbox command timed out\n");
> +	dev_info(&cxlm->pdev->dev,
> +		 "\topcode: 0x%04x\n"
> +		 "\tpayload size: %zub\n",
> +		 mbox_cmd->opcode, mbox_cmd->size_in);
> +
> +	if (IS_ENABLED(CONFIG_CXL_MEM_INSECURE_DEBUG)) {
> +		print_hex_dump_debug("Payload ", DUMP_PREFIX_OFFSET, 16, 1,
> +				     mbox_cmd->payload_in, mbox_cmd->size_in,
> +				     true);
> +	}
> +
> +	/* Here's a good place to figure out if a device reset is needed */

What are the implications if we don't do a reset, as this implementation 
does not?  IOW, does a timeout require a device to be recovered through a 
reset before it can receive additional commands, or is it safe to simply 
drop the command that timed out on the floor and proceed?

> +}
> +
> +/**
> + * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
> + * @cxlm: The CXL memory device to communicate with.
> + * @mbox_cmd: Command to send to the memory device.
> + *
> + * Context: Any context. Expects mbox_lock to be held.
> + * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
> + *         Caller should check the return code in @mbox_cmd to make sure it
> + *         succeeded.
> + *
> + * This is a generic form of the CXL mailbox send command, thus the only I/O
> + * operations used are cxl_read_mbox_reg(). Memory devices, and perhaps other
> + * types of CXL devices may have further information available upon error
> + * conditions.
> + *
> + * The CXL spec allows for up to two mailboxes. The intention is for the primary
> + * mailbox to be OS controlled and the secondary mailbox to be used by system
> + * firmware. This allows the OS and firmware to communicate with the device and
> + * not need to coordinate with each other. The driver only uses the primary
> + * mailbox.
> + */
> +static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
> +				 struct mbox_cmd *mbox_cmd)
> +{
> +	void __iomem *payload = cxlm->mbox.regs + CXLDEV_MB_PAYLOAD_OFFSET;

Do you need to verify the payload is non-empty per 8.2.8.4?

> +	u64 cmd_reg, status_reg;
> +	size_t out_len;
> +	int rc;
> +
> +	lockdep_assert_held(&cxlm->mbox.mutex);
> +
> +	/*
> +	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
> +	 *   1. Caller reads MB Control Register to verify doorbell is clear
> +	 *   2. Caller writes Command Register
> +	 *   3. Caller writes Command Payload Registers if input payload is non-empty
> +	 *   4. Caller writes MB Control Register to set doorbell
> +	 *   5. Caller either polls for doorbell to be clear or waits for interrupt if configured
> +	 *   6. Caller reads MB Status Register to fetch Return code
> +	 *   7. If command successful, Caller reads Command Register to get Payload Length
> +	 *   8. If output payload is non-empty, host reads Command Payload Registers
> +	 *
> +	 *   Hardware is free to do whatever it wants before the doorbell is
> +	 *   rung, and isn't allowed to change anything after it clears the
> +	 *   doorbell. As such, steps 2 and 3 can happen in any order, and steps
> +	 *   6, 7, 8 can also happen in any order (though some orders might not
> +	 *   make sense).

I'd remove the indent from this paragraph since it's not part of the spec, 
it's an inference based on the constraints imposed by the spec.

> +	 */
> +
> +	/* #1 */
> +	if (cxl_doorbell_busy(cxlm)) {
> +		dev_err_ratelimited(&cxlm->pdev->dev,
> +				    "Mailbox re-busy after acquiring\n");
> +		return -EBUSY;
> +	}
> +
> +	cmd_reg = CXL_SET_FIELD(mbox_cmd->opcode, CXLDEV_MB_CMD_COMMAND_OPCODE);
> +	if (mbox_cmd->size_in) {
> +		if (WARN_ON(!mbox_cmd->payload_in))
> +			return -EINVAL;
> +
> +		cmd_reg |= CXL_SET_FIELD(mbox_cmd->size_in,
> +					 CXLDEV_MB_CMD_PAYLOAD_LENGTH);
> +		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
> +	}
> +
> +	/* #2, #3 */
> +	cxl_write_mbox_reg64(cxlm, CXLDEV_MB_CMD_OFFSET, cmd_reg);
> +
> +	/* #4 */
> +	dev_dbg(&cxlm->pdev->dev, "Sending command\n");
> +	cxl_write_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET,
> +			     CXLDEV_MB_CTRL_DOORBELL);
> +
> +	/* #5 */
> +	rc = cxl_mem_wait_for_doorbell(cxlm);
> +	if (rc == -ETIMEDOUT) {
> +		cxl_mem_mbox_timeout(cxlm, mbox_cmd);
> +		return rc;
> +	}
> +
> +	/* #6 */
> +	status_reg = cxl_read_mbox_reg64(cxlm, CXLDEV_MB_STATUS_OFFSET);
> +	mbox_cmd->return_code =
> +		CXL_GET_FIELD(status_reg, CXLDEV_MB_STATUS_RET_CODE);
> +
> +	if (mbox_cmd->return_code != 0) {
> +		dev_dbg(&cxlm->pdev->dev, "Mailbox operation had an error\n");
> +		return 0;
> +	}
> +
> +	/* #7 */
> +	cmd_reg = cxl_read_mbox_reg64(cxlm, CXLDEV_MB_CMD_OFFSET);
> +	out_len = CXL_GET_FIELD(cmd_reg, CXLDEV_MB_CMD_PAYLOAD_LENGTH);
> +
> +	/* #8 */
> +	if (out_len && mbox_cmd->payload_out)
> +		memcpy_fromio(mbox_cmd->payload_out, payload, out_len);
> +
> +	mbox_cmd->size_out = out_len;
> +
> +	return 0;
> +}
> +
> +/**
> + * cxl_mem_mbox_get() - Acquire exclusive access to the mailbox.
> + * @cxlm: The memory device to gain access to.
> + *
> + * Context: Any context. Takes the mbox_lock.
> + * Return: 0 if exclusive access was acquired.
> + */
> +static int cxl_mem_mbox_get(struct cxl_mem *cxlm)
> +{
> +	struct device *dev = &cxlm->pdev->dev;
> +	int rc = -EBUSY;
> +	u64 md_status;
> +
> +	mutex_lock_io(&cxlm->mbox.mutex);
> +
> +	/*
> +	 * XXX: There is some amount of ambiguity in the 2.0 version of the spec
> +	 * around the mailbox interface ready (8.2.8.5.1.1).  The purpose of the
> +	 * bit is to allow firmware running on the device to notify the driver
> +	 * that it's ready to receive commands. It is unclear if the bit needs
> +	 * to be read for each transaction mailbox, ie. the firmware can switch
> +	 * it on and off as needed. Second, there is no defined timeout for
> +	 * mailbox ready, like there is for the doorbell interface.
> +	 *
> +	 * Assumptions:
> +	 * 1. The firmware might toggle the Mailbox Interface Ready bit, check
> +	 *    it for every command.
> +	 *
> +	 * 2. If the doorbell is clear, the firmware should have first set the
> +	 *    Mailbox Interface Ready bit. Therefore, waiting for the doorbell
> +	 *    to be ready is sufficient.
> +	 */
> +	rc = cxl_mem_wait_for_doorbell(cxlm);
> +	if (rc) {
> +		dev_warn(dev, "Mailbox interface not ready\n");
> +		goto out;
> +	}
> +
> +	md_status = cxl_read_mem_reg64(cxlm, CXLMDEV_STATUS_OFFSET);
> +	if (!(md_status & CXLMDEV_MBOX_IF_READY && CXLMDEV_READY(md_status))) {
> +		dev_err(dev,
> +			"mbox: reported doorbell ready, but not mbox ready\n");
> +		goto out;
> +	}
> +
> +	/*
> +	 * Hardware shouldn't allow a ready status but also have failure bits
> +	 * set. Spit out an error, this should be a bug report
> +	 */
> +	rc = -EFAULT;
> +	if (md_status & CXLMDEV_DEV_FATAL) {
> +		dev_err(dev, "mbox: reported ready, but fatal\n");
> +		goto out;
> +	}
> +	if (md_status & CXLMDEV_FW_HALT) {
> +		dev_err(dev, "mbox: reported ready, but halted\n");
> +		goto out;
> +	}
> +	if (CXLMDEV_RESET_NEEDED(md_status)) {
> +		dev_err(dev, "mbox: reported ready, but reset needed\n");
> +		goto out;
> +	}
> +
> +	/* with lock held */
> +	return 0;
> +
> +out:
> +	mutex_unlock(&cxlm->mbox.mutex);
> +	return rc;
> +}
> +
> +/**
> + * cxl_mem_mbox_put() - Release exclusive access to the mailbox.
> + * @cxlm: The CXL memory device to communicate with.
> + *
> + * Context: Any context. Expects mbox_lock to be held.
> + */
> +static void cxl_mem_mbox_put(struct cxl_mem *cxlm)
> +{
> +	mutex_unlock(&cxlm->mbox.mutex);
> +}
> +
>  /**
>   * cxl_mem_setup_regs() - Setup necessary MMIO.
>   * @cxlm: The CXL memory device to communicate with.
> @@ -142,6 +406,8 @@ static struct cxl_mem *cxl_mem_create(struct pci_dev *pdev, u32 reg_lo,
>  		return NULL;
>  	}
>  
> +	mutex_init(&cxlm->mbox.mutex);
> +
>  	regs = pcim_iomap_table(pdev)[bar];
>  	cxlm->pdev = pdev;
>  	cxlm->regs = regs + offset;
> @@ -174,6 +440,76 @@ static int cxl_mem_dvsec(struct pci_dev *pdev, int dvsec)
>  	return 0;
>  }
>  
> +/**
> + * cxl_mem_identify() - Send the IDENTIFY command to the device.
> + * @cxlm: The device to identify.
> + *
> + * Return: 0 if identify was executed successfully.
> + *
> + * This will dispatch the identify command to the device and on success populate
> + * structures to be exported to sysfs.
> + */
> +static int cxl_mem_identify(struct cxl_mem *cxlm)
> +{
> +	struct cxl_mbox_identify {
> +		char fw_revision[0x10];
> +		__le64 total_capacity;
> +		__le64 volatile_capacity;
> +		__le64 persistent_capacity;
> +		__le64 partition_align;
> +		__le16 info_event_log_size;
> +		__le16 warning_event_log_size;
> +		__le16 failure_event_log_size;
> +		__le16 fatal_event_log_size;
> +		__le32 lsa_size;
> +		u8 poison_list_max_mer[3];
> +		__le16 inject_poison_limit;
> +		u8 poison_caps;
> +		u8 qos_telemetry_caps;
> +	} __packed id;
> +	struct mbox_cmd mbox_cmd;

Just initialize mbox_cmd here?

> +	int rc;
> +
> +	/* Retrieve initial device memory map */
> +	rc = cxl_mem_mbox_get(cxlm);
> +	if (rc)
> +		return rc;
> +
> +	mbox_cmd = (struct mbox_cmd){
> +		.opcode = CXL_MBOX_OP_IDENTIFY,
> +		.payload_out = &id,
> +		.size_in = 0,
> +	};
> +	rc = cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
> +	cxl_mem_mbox_put(cxlm);
> +	if (rc)
> +		return rc;
> +
> +	/* TODO: Handle retry or reset responses from firmware. */
> +	if (mbox_cmd.return_code != CXL_MBOX_SUCCESS) {
> +		dev_err(&cxlm->pdev->dev, "Mailbox command failed (%d)\n",
> +			mbox_cmd.return_code);
> +		return -ENXIO;
> +	}
> +
> +	if (mbox_cmd.size_out != sizeof(id))
> +		return -ENXIO;
> +
> +	/*
> +	 * TODO: enumerate DPA map, as 'ram' and 'pmem' do not alias.
> +	 * For now, only the capacity is exported in sysfs
> +	 */
> +	cxlm->ram.range.start = 0;
> +	cxlm->ram.range.end = le64_to_cpu(id.volatile_capacity) - 1;
> +
> +	cxlm->pmem.range.start = 0;
> +	cxlm->pmem.range.end = le64_to_cpu(id.persistent_capacity) - 1;
> +
> +	memcpy(cxlm->firmware_version, id.fw_revision, sizeof(id.fw_revision));
> +
> +	return rc;
> +}
> +
>  static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
>  	struct device *dev = &pdev->dev;
> @@ -219,7 +555,11 @@ static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> -	return cxl_mem_setup_mailbox(cxlm);
> +	rc = cxl_mem_setup_mailbox(cxlm);
> +	if (rc)
> +		return rc;
> +
> +	return cxl_mem_identify(cxlm);
>  }
>  
>  static const struct pci_device_id cxl_mem_pci_tbl[] = {
> -- 
> 2.30.0
> 
>
Konrad Rzeszutek Wilk Feb. 1, 2021, 5:54 p.m. UTC | #2
> +#define cxl_doorbell_busy(cxlm)                                                \
> +	(cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
> +	 CXLDEV_MB_CTRL_DOORBELL)
> +
> +#define CXL_MAILBOX_TIMEOUT_US 2000

You been using the spec for the values. Is that number also from it ?

> +
> +enum opcode {
> +	CXL_MBOX_OP_IDENTIFY		= 0x4000,
> +	CXL_MBOX_OP_MAX			= 0x10000
> +};
> +
> +/**
> + * struct mbox_cmd - A command to be submitted to hardware.
> + * @opcode: (input) The command set and command submitted to hardware.
> + * @payload_in: (input) Pointer to the input payload.
> + * @payload_out: (output) Pointer to the output payload. Must be allocated by
> + *		 the caller.
> + * @size_in: (input) Number of bytes to load from @payload.
> + * @size_out: (output) Number of bytes loaded into @payload.
> + * @return_code: (output) Error code returned from hardware.
> + *
> + * This is the primary mechanism used to send commands to the hardware.
> + * All the fields except @payload_* correspond exactly to the fields described in
> + * Command Register section of the CXL 2.0 spec (8.2.8.4.5). @payload_in and
> + * @payload_out are written to, and read from the Command Payload Registers
> + * defined in (8.2.8.4.8).
> + */
> +struct mbox_cmd {
> +	u16 opcode;
> +	void *payload_in;
> +	void *payload_out;

On a 32-bit OS (not that we use those that more, but lets assume
someone really wants to), the void is 4-bytes, while on 64-bit it is
8-bytes.

`pahole` is your friend as I think there is a gap between opcode and
payload_in in the structure.

> +	size_t size_in;
> +	size_t size_out;

And those can also change depending on 32-bit/64-bit.

> +	u16 return_code;
> +#define CXL_MBOX_SUCCESS 0
> +};

Do you want to use __packed to match with the spec?

Ah, reading later you don't care about it.

In that case may I recommend you move 'return_code' (or perhaps just
call it rc?) to be right after opcode? Less of gaps in that structure.

> +
> +static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
> +{
> +	const int timeout = msecs_to_jiffies(CXL_MAILBOX_TIMEOUT_US);
> +	const unsigned long start = jiffies;
> +	unsigned long end = start;
> +
> +	while (cxl_doorbell_busy(cxlm)) {
> +		end = jiffies;
> +
> +		if (time_after(end, start + timeout)) {
> +			/* Check again in case preempted before timeout test */
> +			if (!cxl_doorbell_busy(cxlm))
> +				break;
> +			return -ETIMEDOUT;
> +		}
> +		cpu_relax();
> +	}

Hm, that is not very scheduler friendly. I mean we are sitting here for
2000us (2 ms) - that is quite the amount of time spinning.

Should this perhaps be put in a workqueue?
> +
> +	dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
> +		jiffies_to_msecs(end) - jiffies_to_msecs(start));
> +	return 0;
> +}
> +
> +static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
> +				 struct mbox_cmd *mbox_cmd)
> +{
> +	dev_warn(&cxlm->pdev->dev, "Mailbox command timed out\n");
> +	dev_info(&cxlm->pdev->dev,
> +		 "\topcode: 0x%04x\n"
> +		 "\tpayload size: %zub\n",
> +		 mbox_cmd->opcode, mbox_cmd->size_in);
> +
> +	if (IS_ENABLED(CONFIG_CXL_MEM_INSECURE_DEBUG)) {
> +		print_hex_dump_debug("Payload ", DUMP_PREFIX_OFFSET, 16, 1,
> +				     mbox_cmd->payload_in, mbox_cmd->size_in,
> +				     true);
> +	}
> +
> +	/* Here's a good place to figure out if a device reset is needed */
> +}
> +
> +/**
> + * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
> + * @cxlm: The CXL memory device to communicate with.
> + * @mbox_cmd: Command to send to the memory device.
> + *
> + * Context: Any context. Expects mbox_lock to be held.
> + * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
> + *         Caller should check the return code in @mbox_cmd to make sure it
> + *         succeeded.
> + *
> + * This is a generic form of the CXL mailbox send command, thus the only I/O
> + * operations used are cxl_read_mbox_reg(). Memory devices, and perhaps other
> + * types of CXL devices may have further information available upon error
> + * conditions.
> + *
> + * The CXL spec allows for up to two mailboxes. The intention is for the primary
> + * mailbox to be OS controlled and the secondary mailbox to be used by system
> + * firmware. This allows the OS and firmware to communicate with the device and
> + * not need to coordinate with each other. The driver only uses the primary
> + * mailbox.
> + */
> +static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
> +				 struct mbox_cmd *mbox_cmd)
> +{
> +	void __iomem *payload = cxlm->mbox.regs + CXLDEV_MB_PAYLOAD_OFFSET;
> +	u64 cmd_reg, status_reg;
> +	size_t out_len;
> +	int rc;
> +
> +	lockdep_assert_held(&cxlm->mbox.mutex);
> +
> +	/*
> +	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
> +	 *   1. Caller reads MB Control Register to verify doorbell is clear
> +	 *   2. Caller writes Command Register
> +	 *   3. Caller writes Command Payload Registers if input payload is non-empty
> +	 *   4. Caller writes MB Control Register to set doorbell
> +	 *   5. Caller either polls for doorbell to be clear or waits for interrupt if configured
> +	 *   6. Caller reads MB Status Register to fetch Return code
> +	 *   7. If command successful, Caller reads Command Register to get Payload Length
> +	 *   8. If output payload is non-empty, host reads Command Payload Registers
> +	 *
> +	 *   Hardware is free to do whatever it wants before the doorbell is
> +	 *   rung, and isn't allowed to change anything after it clears the
> +	 *   doorbell. As such, steps 2 and 3 can happen in any order, and steps
> +	 *   6, 7, 8 can also happen in any order (though some orders might not
> +	 *   make sense).
> +	 */
> +
> +	/* #1 */
> +	if (cxl_doorbell_busy(cxlm)) {
> +		dev_err_ratelimited(&cxlm->pdev->dev,
> +				    "Mailbox re-busy after acquiring\n");
> +		return -EBUSY;
> +	}
> +
> +	cmd_reg = CXL_SET_FIELD(mbox_cmd->opcode, CXLDEV_MB_CMD_COMMAND_OPCODE);
> +	if (mbox_cmd->size_in) {
> +		if (WARN_ON(!mbox_cmd->payload_in))
> +			return -EINVAL;
> +
> +		cmd_reg |= CXL_SET_FIELD(mbox_cmd->size_in,
> +					 CXLDEV_MB_CMD_PAYLOAD_LENGTH);
> +		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
> +	}
> +
> +	/* #2, #3 */
> +	cxl_write_mbox_reg64(cxlm, CXLDEV_MB_CMD_OFFSET, cmd_reg);
> +
> +	/* #4 */
> +	dev_dbg(&cxlm->pdev->dev, "Sending command\n");
> +	cxl_write_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET,
> +			     CXLDEV_MB_CTRL_DOORBELL);
> +
> +	/* #5 */
> +	rc = cxl_mem_wait_for_doorbell(cxlm);
> +	if (rc == -ETIMEDOUT) {
> +		cxl_mem_mbox_timeout(cxlm, mbox_cmd);
> +		return rc;
> +	}
> +
> +	/* #6 */
> +	status_reg = cxl_read_mbox_reg64(cxlm, CXLDEV_MB_STATUS_OFFSET);
> +	mbox_cmd->return_code =
> +		CXL_GET_FIELD(status_reg, CXLDEV_MB_STATUS_RET_CODE);
> +
> +	if (mbox_cmd->return_code != 0) {
> +		dev_dbg(&cxlm->pdev->dev, "Mailbox operation had an error\n");
> +		return 0;
> +	}
> +
> +	/* #7 */
> +	cmd_reg = cxl_read_mbox_reg64(cxlm, CXLDEV_MB_CMD_OFFSET);
> +	out_len = CXL_GET_FIELD(cmd_reg, CXLDEV_MB_CMD_PAYLOAD_LENGTH);
> +
> +	/* #8 */
> +	if (out_len && mbox_cmd->payload_out)
> +		memcpy_fromio(mbox_cmd->payload_out, payload, out_len);
> +
> +	mbox_cmd->size_out = out_len;
> +
> +	return 0;
> +}
> +
> +/**
> + * cxl_mem_mbox_get() - Acquire exclusive access to the mailbox.
> + * @cxlm: The memory device to gain access to.
> + *
> + * Context: Any context. Takes the mbox_lock.
> + * Return: 0 if exclusive access was acquired.
> + */
> +static int cxl_mem_mbox_get(struct cxl_mem *cxlm)
> +{
> +	struct device *dev = &cxlm->pdev->dev;
> +	int rc = -EBUSY;
> +	u64 md_status;
> +
> +	mutex_lock_io(&cxlm->mbox.mutex);
> +
> +	/*
> +	 * XXX: There is some amount of ambiguity in the 2.0 version of the spec
> +	 * around the mailbox interface ready (8.2.8.5.1.1).  The purpose of the
> +	 * bit is to allow firmware running on the device to notify the driver
> +	 * that it's ready to receive commands. It is unclear if the bit needs
> +	 * to be read for each transaction mailbox, ie. the firmware can switch
> +	 * it on and off as needed. Second, there is no defined timeout for
> +	 * mailbox ready, like there is for the doorbell interface.
> +	 *
> +	 * Assumptions:
> +	 * 1. The firmware might toggle the Mailbox Interface Ready bit, check
> +	 *    it for every command.
> +	 *
> +	 * 2. If the doorbell is clear, the firmware should have first set the
> +	 *    Mailbox Interface Ready bit. Therefore, waiting for the doorbell
> +	 *    to be ready is sufficient.
> +	 */
> +	rc = cxl_mem_wait_for_doorbell(cxlm);
> +	if (rc) {
> +		dev_warn(dev, "Mailbox interface not ready\n");
> +		goto out;
> +	}
> +
> +	md_status = cxl_read_mem_reg64(cxlm, CXLMDEV_STATUS_OFFSET);
> +	if (!(md_status & CXLMDEV_MBOX_IF_READY && CXLMDEV_READY(md_status))) {
> +		dev_err(dev,
> +			"mbox: reported doorbell ready, but not mbox ready\n");

You can make that oneline.
> +		goto out;
> +	}
> +
> +	/*
> +	 * Hardware shouldn't allow a ready status but also have failure bits
> +	 * set. Spit out an error, this should be a bug report
> +	 */
> +	rc = -EFAULT;

Should these include more details? As in a dump of other registers to
help in the field to debug why the device is busted?

> +	if (md_status & CXLMDEV_DEV_FATAL) {
> +		dev_err(dev, "mbox: reported ready, but fatal\n");
> +		goto out;
> +	}
> +	if (md_status & CXLMDEV_FW_HALT) {
> +		dev_err(dev, "mbox: reported ready, but halted\n");
> +		goto out;
> +	}
> +	if (CXLMDEV_RESET_NEEDED(md_status)) {
> +		dev_err(dev, "mbox: reported ready, but reset needed\n");
> +		goto out;
> +	}
> +
> +	/* with lock held */
> +	return 0;
> +
> +out:
> +	mutex_unlock(&cxlm->mbox.mutex);
> +	return rc;
> +}
> +
> +/**
> + * cxl_mem_mbox_put() - Release exclusive access to the mailbox.
> + * @cxlm: The CXL memory device to communicate with.
> + *
> + * Context: Any context. Expects mbox_lock to be held.
> + */
> +static void cxl_mem_mbox_put(struct cxl_mem *cxlm)
> +{
> +	mutex_unlock(&cxlm->mbox.mutex);
> +}
> +
>  /**
>   * cxl_mem_setup_regs() - Setup necessary MMIO.
>   * @cxlm: The CXL memory device to communicate with.
> @@ -142,6 +406,8 @@ static struct cxl_mem *cxl_mem_create(struct pci_dev *pdev, u32 reg_lo,
>  		return NULL;
>  	}
>  
> +	mutex_init(&cxlm->mbox.mutex);
> +
>  	regs = pcim_iomap_table(pdev)[bar];
>  	cxlm->pdev = pdev;
>  	cxlm->regs = regs + offset;
> @@ -174,6 +440,76 @@ static int cxl_mem_dvsec(struct pci_dev *pdev, int dvsec)
>  	return 0;
>  }
>  
> +/**
> + * cxl_mem_identify() - Send the IDENTIFY command to the device.
> + * @cxlm: The device to identify.
> + *
> + * Return: 0 if identify was executed successfully.
> + *
> + * This will dispatch the identify command to the device and on success populate
> + * structures to be exported to sysfs.
> + */
> +static int cxl_mem_identify(struct cxl_mem *cxlm)
> +{
> +	struct cxl_mbox_identify {
> +		char fw_revision[0x10];
> +		__le64 total_capacity;
> +		__le64 volatile_capacity;
> +		__le64 persistent_capacity;
> +		__le64 partition_align;
> +		__le16 info_event_log_size;
> +		__le16 warning_event_log_size;
> +		__le16 failure_event_log_size;
> +		__le16 fatal_event_log_size;
> +		__le32 lsa_size;
> +		u8 poison_list_max_mer[3];
> +		__le16 inject_poison_limit;
> +		u8 poison_caps;
> +		u8 qos_telemetry_caps;
> +	} __packed id;
> +	struct mbox_cmd mbox_cmd;
> +	int rc;
> +
> +	/* Retrieve initial device memory map */
> +	rc = cxl_mem_mbox_get(cxlm);
> +	if (rc)
> +		return rc;
> +
> +	mbox_cmd = (struct mbox_cmd){
> +		.opcode = CXL_MBOX_OP_IDENTIFY,
> +		.payload_out = &id,
> +		.size_in = 0,
> +	};
> +	rc = cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
> +	cxl_mem_mbox_put(cxlm);
> +	if (rc)
> +		return rc;
> +
> +	/* TODO: Handle retry or reset responses from firmware. */
> +	if (mbox_cmd.return_code != CXL_MBOX_SUCCESS) {
> +		dev_err(&cxlm->pdev->dev, "Mailbox command failed (%d)\n",
> +			mbox_cmd.return_code);
> +		return -ENXIO;
> +	}
> +
> +	if (mbox_cmd.size_out != sizeof(id))
> +		return -ENXIO;
> +
> +	/*
> +	 * TODO: enumerate DPA map, as 'ram' and 'pmem' do not alias.

??? Not sure I understand.

> +	 * For now, only the capacity is exported in sysfs
> +	 */
> +	cxlm->ram.range.start = 0;
> +	cxlm->ram.range.end = le64_to_cpu(id.volatile_capacity) - 1;
> +
> +	cxlm->pmem.range.start = 0;
> +	cxlm->pmem.range.end = le64_to_cpu(id.persistent_capacity) - 1;
> +
> +	memcpy(cxlm->firmware_version, id.fw_revision, sizeof(id.fw_revision));
> +
> +	return rc;
> +}
> +
>  static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
>  	struct device *dev = &pdev->dev;
> @@ -219,7 +555,11 @@ static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> -	return cxl_mem_setup_mailbox(cxlm);
> +	rc = cxl_mem_setup_mailbox(cxlm);
> +	if (rc)
> +		return rc;
> +
> +	return cxl_mem_identify(cxlm);
>  }
>  
>  static const struct pci_device_id cxl_mem_pci_tbl[] = {
> -- 
> 2.30.0
>
Ben Widawsky Feb. 1, 2021, 7:13 p.m. UTC | #3
On 21-02-01 12:54:00, Konrad Rzeszutek Wilk wrote:
> > +#define cxl_doorbell_busy(cxlm)                                                \
> > +	(cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
> > +	 CXLDEV_MB_CTRL_DOORBELL)
> > +
> > +#define CXL_MAILBOX_TIMEOUT_US 2000
> 
> You been using the spec for the values. Is that number also from it ?
> 

Yes it is. I'll add a comment with the spec reference.

> > +
> > +enum opcode {
> > +	CXL_MBOX_OP_IDENTIFY		= 0x4000,
> > +	CXL_MBOX_OP_MAX			= 0x10000
> > +};
> > +
> > +/**
> > + * struct mbox_cmd - A command to be submitted to hardware.
> > + * @opcode: (input) The command set and command submitted to hardware.
> > + * @payload_in: (input) Pointer to the input payload.
> > + * @payload_out: (output) Pointer to the output payload. Must be allocated by
> > + *		 the caller.
> > + * @size_in: (input) Number of bytes to load from @payload.
> > + * @size_out: (output) Number of bytes loaded into @payload.
> > + * @return_code: (output) Error code returned from hardware.
> > + *
> > + * This is the primary mechanism used to send commands to the hardware.
> > + * All the fields except @payload_* correspond exactly to the fields described in
> > + * Command Register section of the CXL 2.0 spec (8.2.8.4.5). @payload_in and
> > + * @payload_out are written to, and read from the Command Payload Registers
> > + * defined in (8.2.8.4.8).
> > + */
> > +struct mbox_cmd {
> > +	u16 opcode;
> > +	void *payload_in;
> > +	void *payload_out;
> 
> On a 32-bit OS (not that we use those that more, but lets assume
> someone really wants to), the void is 4-bytes, while on 64-bit it is
> 8-bytes.
> 
> `pahole` is your friend as I think there is a gap between opcode and
> payload_in in the structure.
> 
> > +	size_t size_in;
> > +	size_t size_out;
> 
> And those can also change depending on 32-bit/64-bit.
> 
> > +	u16 return_code;
> > +#define CXL_MBOX_SUCCESS 0
> > +};
> 
> Do you want to use __packed to match with the spec?
> 
> Ah, reading later you don't care about it.
> 
> In that case may I recommend you move 'return_code' (or perhaps just
> call it rc?) to be right after opcode? Less of gaps in that structure.
> 

I guess I hadn't realized we're supposed to try to fully pack structs by
default.

> > +
> > +static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
> > +{
> > +	const int timeout = msecs_to_jiffies(CXL_MAILBOX_TIMEOUT_US);
> > +	const unsigned long start = jiffies;
> > +	unsigned long end = start;
> > +
> > +	while (cxl_doorbell_busy(cxlm)) {
> > +		end = jiffies;
> > +
> > +		if (time_after(end, start + timeout)) {
> > +			/* Check again in case preempted before timeout test */
> > +			if (!cxl_doorbell_busy(cxlm))
> > +				break;
> > +			return -ETIMEDOUT;
> > +		}
> > +		cpu_relax();
> > +	}
> 
> Hm, that is not very scheduler friendly. I mean we are sitting here for
> 2000us (2 ms) - that is quite the amount of time spinning.
> 
> Should this perhaps be put in a workqueue?

So let me first point you to the friendlier version which was shot down:
https://lore.kernel.org/linux-cxl/20201111054356.793390-8-ben.widawsky@intel.com/

I'm not opposed to this being moved to a workqueue at some point, but I think
that's unnecessary complexity currently. The reality is that it's expected that
commands will finish way sooner than this or be implemented as background
commands. I've heard a person who makes a lot of the spec decisions say, "if
it's 2 seconds, nobody will use these things".

I think adding the summary of this back and forth as a comment to the existing
code makes a lot of sense.

> > +
> > +	dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
> > +		jiffies_to_msecs(end) - jiffies_to_msecs(start));
> > +	return 0;
> > +}
> > +
> > +static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
> > +				 struct mbox_cmd *mbox_cmd)
> > +{
> > +	dev_warn(&cxlm->pdev->dev, "Mailbox command timed out\n");
> > +	dev_info(&cxlm->pdev->dev,
> > +		 "\topcode: 0x%04x\n"
> > +		 "\tpayload size: %zub\n",
> > +		 mbox_cmd->opcode, mbox_cmd->size_in);
> > +
> > +	if (IS_ENABLED(CONFIG_CXL_MEM_INSECURE_DEBUG)) {
> > +		print_hex_dump_debug("Payload ", DUMP_PREFIX_OFFSET, 16, 1,
> > +				     mbox_cmd->payload_in, mbox_cmd->size_in,
> > +				     true);
> > +	}
> > +
> > +	/* Here's a good place to figure out if a device reset is needed */
> > +}
> > +
> > +/**
> > + * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
> > + * @cxlm: The CXL memory device to communicate with.
> > + * @mbox_cmd: Command to send to the memory device.
> > + *
> > + * Context: Any context. Expects mbox_lock to be held.
> > + * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
> > + *         Caller should check the return code in @mbox_cmd to make sure it
> > + *         succeeded.
> > + *
> > + * This is a generic form of the CXL mailbox send command, thus the only I/O
> > + * operations used are cxl_read_mbox_reg(). Memory devices, and perhaps other
> > + * types of CXL devices may have further information available upon error
> > + * conditions.
> > + *
> > + * The CXL spec allows for up to two mailboxes. The intention is for the primary
> > + * mailbox to be OS controlled and the secondary mailbox to be used by system
> > + * firmware. This allows the OS and firmware to communicate with the device and
> > + * not need to coordinate with each other. The driver only uses the primary
> > + * mailbox.
> > + */
> > +static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
> > +				 struct mbox_cmd *mbox_cmd)
> > +{
> > +	void __iomem *payload = cxlm->mbox.regs + CXLDEV_MB_PAYLOAD_OFFSET;
> > +	u64 cmd_reg, status_reg;
> > +	size_t out_len;
> > +	int rc;
> > +
> > +	lockdep_assert_held(&cxlm->mbox.mutex);
> > +
> > +	/*
> > +	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
> > +	 *   1. Caller reads MB Control Register to verify doorbell is clear
> > +	 *   2. Caller writes Command Register
> > +	 *   3. Caller writes Command Payload Registers if input payload is non-empty
> > +	 *   4. Caller writes MB Control Register to set doorbell
> > +	 *   5. Caller either polls for doorbell to be clear or waits for interrupt if configured
> > +	 *   6. Caller reads MB Status Register to fetch Return code
> > +	 *   7. If command successful, Caller reads Command Register to get Payload Length
> > +	 *   8. If output payload is non-empty, host reads Command Payload Registers
> > +	 *
> > +	 *   Hardware is free to do whatever it wants before the doorbell is
> > +	 *   rung, and isn't allowed to change anything after it clears the
> > +	 *   doorbell. As such, steps 2 and 3 can happen in any order, and steps
> > +	 *   6, 7, 8 can also happen in any order (though some orders might not
> > +	 *   make sense).
> > +	 */
> > +
> > +	/* #1 */
> > +	if (cxl_doorbell_busy(cxlm)) {
> > +		dev_err_ratelimited(&cxlm->pdev->dev,
> > +				    "Mailbox re-busy after acquiring\n");
> > +		return -EBUSY;
> > +	}
> > +
> > +	cmd_reg = CXL_SET_FIELD(mbox_cmd->opcode, CXLDEV_MB_CMD_COMMAND_OPCODE);
> > +	if (mbox_cmd->size_in) {
> > +		if (WARN_ON(!mbox_cmd->payload_in))
> > +			return -EINVAL;
> > +
> > +		cmd_reg |= CXL_SET_FIELD(mbox_cmd->size_in,
> > +					 CXLDEV_MB_CMD_PAYLOAD_LENGTH);
> > +		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
> > +	}
> > +
> > +	/* #2, #3 */
> > +	cxl_write_mbox_reg64(cxlm, CXLDEV_MB_CMD_OFFSET, cmd_reg);
> > +
> > +	/* #4 */
> > +	dev_dbg(&cxlm->pdev->dev, "Sending command\n");
> > +	cxl_write_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET,
> > +			     CXLDEV_MB_CTRL_DOORBELL);
> > +
> > +	/* #5 */
> > +	rc = cxl_mem_wait_for_doorbell(cxlm);
> > +	if (rc == -ETIMEDOUT) {
> > +		cxl_mem_mbox_timeout(cxlm, mbox_cmd);
> > +		return rc;
> > +	}
> > +
> > +	/* #6 */
> > +	status_reg = cxl_read_mbox_reg64(cxlm, CXLDEV_MB_STATUS_OFFSET);
> > +	mbox_cmd->return_code =
> > +		CXL_GET_FIELD(status_reg, CXLDEV_MB_STATUS_RET_CODE);
> > +
> > +	if (mbox_cmd->return_code != 0) {
> > +		dev_dbg(&cxlm->pdev->dev, "Mailbox operation had an error\n");
> > +		return 0;
> > +	}
> > +
> > +	/* #7 */
> > +	cmd_reg = cxl_read_mbox_reg64(cxlm, CXLDEV_MB_CMD_OFFSET);
> > +	out_len = CXL_GET_FIELD(cmd_reg, CXLDEV_MB_CMD_PAYLOAD_LENGTH);
> > +
> > +	/* #8 */
> > +	if (out_len && mbox_cmd->payload_out)
> > +		memcpy_fromio(mbox_cmd->payload_out, payload, out_len);
> > +
> > +	mbox_cmd->size_out = out_len;
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * cxl_mem_mbox_get() - Acquire exclusive access to the mailbox.
> > + * @cxlm: The memory device to gain access to.
> > + *
> > + * Context: Any context. Takes the mbox_lock.
> > + * Return: 0 if exclusive access was acquired.
> > + */
> > +static int cxl_mem_mbox_get(struct cxl_mem *cxlm)
> > +{
> > +	struct device *dev = &cxlm->pdev->dev;
> > +	int rc = -EBUSY;
> > +	u64 md_status;
> > +
> > +	mutex_lock_io(&cxlm->mbox.mutex);
> > +
> > +	/*
> > +	 * XXX: There is some amount of ambiguity in the 2.0 version of the spec
> > +	 * around the mailbox interface ready (8.2.8.5.1.1).  The purpose of the
> > +	 * bit is to allow firmware running on the device to notify the driver
> > +	 * that it's ready to receive commands. It is unclear if the bit needs
> > +	 * to be read for each transaction mailbox, ie. the firmware can switch
> > +	 * it on and off as needed. Second, there is no defined timeout for
> > +	 * mailbox ready, like there is for the doorbell interface.
> > +	 *
> > +	 * Assumptions:
> > +	 * 1. The firmware might toggle the Mailbox Interface Ready bit, check
> > +	 *    it for every command.
> > +	 *
> > +	 * 2. If the doorbell is clear, the firmware should have first set the
> > +	 *    Mailbox Interface Ready bit. Therefore, waiting for the doorbell
> > +	 *    to be ready is sufficient.
> > +	 */
> > +	rc = cxl_mem_wait_for_doorbell(cxlm);
> > +	if (rc) {
> > +		dev_warn(dev, "Mailbox interface not ready\n");
> > +		goto out;
> > +	}
> > +
> > +	md_status = cxl_read_mem_reg64(cxlm, CXLMDEV_STATUS_OFFSET);
> > +	if (!(md_status & CXLMDEV_MBOX_IF_READY && CXLMDEV_READY(md_status))) {
> > +		dev_err(dev,
> > +			"mbox: reported doorbell ready, but not mbox ready\n");
> 
> You can make that oneline.
> > +		goto out;
> > +	}
> > +
> > +	/*
> > +	 * Hardware shouldn't allow a ready status but also have failure bits
> > +	 * set. Spit out an error, this should be a bug report
> > +	 */
> > +	rc = -EFAULT;
> 
> Should these include more details? As in a dump of other registers to
> help in the field to debug why the device is busted?
> 

We've discussed a bit, having kind of a general error state (my driver
background is in i915 where we did such a thing). I'm not opposed, but as the
error handling at this point is very minimal, I think it can wait until the next
round of patches for further enabling.

> > +	if (md_status & CXLMDEV_DEV_FATAL) {
> > +		dev_err(dev, "mbox: reported ready, but fatal\n");
> > +		goto out;
> > +	}
> > +	if (md_status & CXLMDEV_FW_HALT) {
> > +		dev_err(dev, "mbox: reported ready, but halted\n");
> > +		goto out;
> > +	}
> > +	if (CXLMDEV_RESET_NEEDED(md_status)) {
> > +		dev_err(dev, "mbox: reported ready, but reset needed\n");
> > +		goto out;
> > +	}
> > +
> > +	/* with lock held */
> > +	return 0;
> > +
> > +out:
> > +	mutex_unlock(&cxlm->mbox.mutex);
> > +	return rc;
> > +}
> > +
> > +/**
> > + * cxl_mem_mbox_put() - Release exclusive access to the mailbox.
> > + * @cxlm: The CXL memory device to communicate with.
> > + *
> > + * Context: Any context. Expects mbox_lock to be held.
> > + */
> > +static void cxl_mem_mbox_put(struct cxl_mem *cxlm)
> > +{
> > +	mutex_unlock(&cxlm->mbox.mutex);
> > +}
> > +
> >  /**
> >   * cxl_mem_setup_regs() - Setup necessary MMIO.
> >   * @cxlm: The CXL memory device to communicate with.
> > @@ -142,6 +406,8 @@ static struct cxl_mem *cxl_mem_create(struct pci_dev *pdev, u32 reg_lo,
> >  		return NULL;
> >  	}
> >  
> > +	mutex_init(&cxlm->mbox.mutex);
> > +
> >  	regs = pcim_iomap_table(pdev)[bar];
> >  	cxlm->pdev = pdev;
> >  	cxlm->regs = regs + offset;
> > @@ -174,6 +440,76 @@ static int cxl_mem_dvsec(struct pci_dev *pdev, int dvsec)
> >  	return 0;
> >  }
> >  
> > +/**
> > + * cxl_mem_identify() - Send the IDENTIFY command to the device.
> > + * @cxlm: The device to identify.
> > + *
> > + * Return: 0 if identify was executed successfully.
> > + *
> > + * This will dispatch the identify command to the device and on success populate
> > + * structures to be exported to sysfs.
> > + */
> > +static int cxl_mem_identify(struct cxl_mem *cxlm)
> > +{
> > +	struct cxl_mbox_identify {
> > +		char fw_revision[0x10];
> > +		__le64 total_capacity;
> > +		__le64 volatile_capacity;
> > +		__le64 persistent_capacity;
> > +		__le64 partition_align;
> > +		__le16 info_event_log_size;
> > +		__le16 warning_event_log_size;
> > +		__le16 failure_event_log_size;
> > +		__le16 fatal_event_log_size;
> > +		__le32 lsa_size;
> > +		u8 poison_list_max_mer[3];
> > +		__le16 inject_poison_limit;
> > +		u8 poison_caps;
> > +		u8 qos_telemetry_caps;
> > +	} __packed id;
> > +	struct mbox_cmd mbox_cmd;
> > +	int rc;
> > +
> > +	/* Retrieve initial device memory map */
> > +	rc = cxl_mem_mbox_get(cxlm);
> > +	if (rc)
> > +		return rc;
> > +
> > +	mbox_cmd = (struct mbox_cmd){
> > +		.opcode = CXL_MBOX_OP_IDENTIFY,
> > +		.payload_out = &id,
> > +		.size_in = 0,
> > +	};
> > +	rc = cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
> > +	cxl_mem_mbox_put(cxlm);
> > +	if (rc)
> > +		return rc;
> > +
> > +	/* TODO: Handle retry or reset responses from firmware. */
> > +	if (mbox_cmd.return_code != CXL_MBOX_SUCCESS) {
> > +		dev_err(&cxlm->pdev->dev, "Mailbox command failed (%d)\n",
> > +			mbox_cmd.return_code);
> > +		return -ENXIO;
> > +	}
> > +
> > +	if (mbox_cmd.size_out != sizeof(id))
> > +		return -ENXIO;
> > +
> > +	/*
> > +	 * TODO: enumerate DPA map, as 'ram' and 'pmem' do not alias.
> 
> ??? Not sure I understand.
> 

The current code is showing two aliased/overlapping ranges.
[0, id.volatile_capacity)
[0, id.persistent_capacity)

This is not allowed by spec.

> > +	 * For now, only the capacity is exported in sysfs
> > +	 */
> > +	cxlm->ram.range.start = 0;
> > +	cxlm->ram.range.end = le64_to_cpu(id.volatile_capacity) - 1;
> > +
> > +	cxlm->pmem.range.start = 0;
> > +	cxlm->pmem.range.end = le64_to_cpu(id.persistent_capacity) - 1;
> > +
> > +	memcpy(cxlm->firmware_version, id.fw_revision, sizeof(id.fw_revision));
> > +
> > +	return rc;
> > +}
> > +
> >  static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  {
> >  	struct device *dev = &pdev->dev;
> > @@ -219,7 +555,11 @@ static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  	if (rc)
> >  		return rc;
> >  
> > -	return cxl_mem_setup_mailbox(cxlm);
> > +	rc = cxl_mem_setup_mailbox(cxlm);
> > +	if (rc)
> > +		return rc;
> > +
> > +	return cxl_mem_identify(cxlm);
> >  }
> >  
> >  static const struct pci_device_id cxl_mem_pci_tbl[] = {
> > -- 
> > 2.30.0
> >
Dan Williams Feb. 1, 2021, 7:28 p.m. UTC | #4
On Mon, Feb 1, 2021 at 11:13 AM Ben Widawsky <ben.widawsky@intel.com> wrote:
>
> On 21-02-01 12:54:00, Konrad Rzeszutek Wilk wrote:
> > > +#define cxl_doorbell_busy(cxlm)                                                \
> > > +   (cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
> > > +    CXLDEV_MB_CTRL_DOORBELL)
> > > +
> > > +#define CXL_MAILBOX_TIMEOUT_US 2000
> >
> > You been using the spec for the values. Is that number also from it ?
> >
>
> Yes it is. I'll add a comment with the spec reference.
>
> > > +
> > > +enum opcode {
> > > +   CXL_MBOX_OP_IDENTIFY            = 0x4000,
> > > +   CXL_MBOX_OP_MAX                 = 0x10000
> > > +};
> > > +
> > > +/**
> > > + * struct mbox_cmd - A command to be submitted to hardware.
> > > + * @opcode: (input) The command set and command submitted to hardware.
> > > + * @payload_in: (input) Pointer to the input payload.
> > > + * @payload_out: (output) Pointer to the output payload. Must be allocated by
> > > + *          the caller.
> > > + * @size_in: (input) Number of bytes to load from @payload.
> > > + * @size_out: (output) Number of bytes loaded into @payload.
> > > + * @return_code: (output) Error code returned from hardware.
> > > + *
> > > + * This is the primary mechanism used to send commands to the hardware.
> > > + * All the fields except @payload_* correspond exactly to the fields described in
> > > + * Command Register section of the CXL 2.0 spec (8.2.8.4.5). @payload_in and
> > > + * @payload_out are written to, and read from the Command Payload Registers
> > > + * defined in (8.2.8.4.8).
> > > + */
> > > +struct mbox_cmd {
> > > +   u16 opcode;
> > > +   void *payload_in;
> > > +   void *payload_out;
> >
> > On a 32-bit OS (not that we use those that more, but lets assume
> > someone really wants to), the void is 4-bytes, while on 64-bit it is
> > 8-bytes.
> >
> > `pahole` is your friend as I think there is a gap between opcode and
> > payload_in in the structure.
> >
> > > +   size_t size_in;
> > > +   size_t size_out;
> >
> > And those can also change depending on 32-bit/64-bit.
> >
> > > +   u16 return_code;
> > > +#define CXL_MBOX_SUCCESS 0
> > > +};
> >
> > Do you want to use __packed to match with the spec?
> >
> > Ah, reading later you don't care about it.
> >
> > In that case may I recommend you move 'return_code' (or perhaps just
> > call it rc?) to be right after opcode? Less of gaps in that structure.
> >
>
> I guess I hadn't realized we're supposed to try to fully pack structs by
> default.

This is just the internal parsed context of a command, I can't imagine
packing is relevant here. pahole optimization feels premature as well.

>
> > > +
> > > +static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
> > > +{
> > > +   const int timeout = msecs_to_jiffies(CXL_MAILBOX_TIMEOUT_US);
> > > +   const unsigned long start = jiffies;
> > > +   unsigned long end = start;
> > > +
> > > +   while (cxl_doorbell_busy(cxlm)) {
> > > +           end = jiffies;
> > > +
> > > +           if (time_after(end, start + timeout)) {
> > > +                   /* Check again in case preempted before timeout test */
> > > +                   if (!cxl_doorbell_busy(cxlm))
> > > +                           break;
> > > +                   return -ETIMEDOUT;
> > > +           }
> > > +           cpu_relax();
> > > +   }
> >
> > Hm, that is not very scheduler friendly. I mean we are sitting here for
> > 2000us (2 ms) - that is quite the amount of time spinning.
> >
> > Should this perhaps be put in a workqueue?
>
> So let me first point you to the friendlier version which was shot down:
> https://lore.kernel.org/linux-cxl/20201111054356.793390-8-ben.widawsky@intel.com/
>
> I'm not opposed to this being moved to a workqueue at some point, but I think
> that's unnecessary complexity currently. The reality is that it's expected that
> commands will finish way sooner than this or be implemented as background
> commands. I've heard a person who makes a lot of the spec decisions say, "if
> it's 2 seconds, nobody will use these things".

That said, asynchronous probe needs to be enabled for the next driver update.
Dan Williams Feb. 1, 2021, 8 p.m. UTC | #5
On Sat, Jan 30, 2021 at 3:52 PM David Rientjes <rientjes@google.com> wrote:
>
> On Fri, 29 Jan 2021, Ben Widawsky wrote:
>
> > Provide enough functionality to utilize the mailbox of a memory device.
> > The mailbox is used to interact with the firmware running on the memory
> > device.
> >
> > The CXL specification defines separate capabilities for the mailbox and
> > the memory device. The mailbox interface has a doorbell to indicate
> > ready to accept commands and the memory device has a capability register
> > that indicates the mailbox interface is ready. The expectation is that
> > the doorbell-ready is always later than the memory-device-indication
> > that the mailbox is ready.
> >
> > Create a function to handle sending a command, optionally with a
> > payload, to the memory device, polling on a result, and then optionally
> > copying out the payload. The algorithm for doing this comes straight out
> > of the CXL 2.0 specification.
> >
> > Primary mailboxes are capable of generating an interrupt when submitting
> > a command in the background. That implementation is saved for a later
> > time.
> >
> > Secondary mailboxes aren't implemented at this time.
> >
> > The flow is proven with one implemented command, "identify". Because the
> > class code has already told the driver this is a memory device and the
> > identify command is mandatory.
> >
> > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > ---
> >  drivers/cxl/Kconfig |  14 ++
> >  drivers/cxl/cxl.h   |  39 +++++
> >  drivers/cxl/mem.c   | 342 +++++++++++++++++++++++++++++++++++++++++++-
> >  3 files changed, 394 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > index 3b66b46af8a0..fe591f74af96 100644
> > --- a/drivers/cxl/Kconfig
> > +++ b/drivers/cxl/Kconfig
> > @@ -32,4 +32,18 @@ config CXL_MEM
> >         Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
> >
> >         If unsure say 'm'.
> > +
> > +config CXL_MEM_INSECURE_DEBUG
> > +     bool "CXL.mem debugging"
> > +     depends on CXL_MEM
> > +     help
> > +       Enable debug of all CXL command payloads.
> > +
> > +       Some CXL devices and controllers support encryption and other
> > +       security features. The payloads for the commands that enable
> > +       those features may contain sensitive clear-text security
> > +       material. Disable debug of those command payloads by default.
> > +       If you are a kernel developer actively working on CXL
> > +       security enabling say Y, otherwise say N.
>
> Not specific to this patch, but the reference to encryption made me
> curious about integrity: are all CXL.mem devices compatible with DIMP?
> Some?  None?

The encryption here is "device passphrase" similar to the NVDIMM
Security Management described here:

https://pmem.io/documents/IntelOptanePMem_DSM_Interface-V2.0.pdf

The LIBNVDIMM enabling wrapped this support with the Linux keys
interface which among other things enforces wrapping the clear text
passphrase with a Linux "trusted/encrypted" key.

Additionally, the CXL.io interface optionally supports PCI IDE:

https://www.intel.com/content/dam/www/public/us/en/documents/reference-guides/pcie-device-security-enhancements.pdf

I'm otherwise not familiar with the DIMP acronym?

> > +
> >  endif
> > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > index a3da7f8050c4..df3d97154b63 100644
> > --- a/drivers/cxl/cxl.h
> > +++ b/drivers/cxl/cxl.h
> > @@ -31,9 +31,36 @@
> >  #define CXLDEV_MB_CAPS_OFFSET 0x00
> >  #define   CXLDEV_MB_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
> >  #define CXLDEV_MB_CTRL_OFFSET 0x04
> > +#define   CXLDEV_MB_CTRL_DOORBELL BIT(0)
> >  #define CXLDEV_MB_CMD_OFFSET 0x08
> > +#define   CXLDEV_MB_CMD_COMMAND_OPCODE_MASK GENMASK(15, 0)
> > +#define   CXLDEV_MB_CMD_PAYLOAD_LENGTH_MASK GENMASK(36, 16)
> >  #define CXLDEV_MB_STATUS_OFFSET 0x10
> > +#define   CXLDEV_MB_STATUS_RET_CODE_MASK GENMASK(47, 32)
> >  #define CXLDEV_MB_BG_CMD_STATUS_OFFSET 0x18
> > +#define CXLDEV_MB_PAYLOAD_OFFSET 0x20
> > +
> > +/* Memory Device (CXL 2.0 - 8.2.8.5.1.1) */
> > +#define CXLMDEV_STATUS_OFFSET 0x0
> > +#define   CXLMDEV_DEV_FATAL BIT(0)
> > +#define   CXLMDEV_FW_HALT BIT(1)
> > +#define   CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2)
> > +#define     CXLMDEV_MS_NOT_READY 0
> > +#define     CXLMDEV_MS_READY 1
> > +#define     CXLMDEV_MS_ERROR 2
> > +#define     CXLMDEV_MS_DISABLED 3
> > +#define   CXLMDEV_READY(status) \
> > +             (CXL_GET_FIELD(status, CXLMDEV_STATUS_MEDIA_STATUS) == CXLMDEV_MS_READY)
> > +#define   CXLMDEV_MBOX_IF_READY BIT(4)
> > +#define   CXLMDEV_RESET_NEEDED_SHIFT 5
> > +#define   CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5)
> > +#define     CXLMDEV_RESET_NEEDED_NOT 0
> > +#define     CXLMDEV_RESET_NEEDED_COLD 1
> > +#define     CXLMDEV_RESET_NEEDED_WARM 2
> > +#define     CXLMDEV_RESET_NEEDED_HOT 3
> > +#define     CXLMDEV_RESET_NEEDED_CXL 4
> > +#define   CXLMDEV_RESET_NEEDED(status) \
> > +             (CXL_GET_FIELD(status, CXLMDEV_RESET_NEEDED) != CXLMDEV_RESET_NEEDED_NOT)
> >
> >  /**
> >   * struct cxl_mem - A CXL memory device
> > @@ -44,6 +71,16 @@ struct cxl_mem {
> >       struct pci_dev *pdev;
> >       void __iomem *regs;
> >
> > +     struct {
> > +             struct range range;
> > +     } pmem;
> > +
> > +     struct {
> > +             struct range range;
> > +     } ram;
> > +
> > +     char firmware_version[0x10];
> > +
> >       /* Cap 0001h - CXL_CAP_CAP_ID_DEVICE_STATUS */
> >       struct {
> >               void __iomem *regs;
> > @@ -51,6 +88,7 @@ struct cxl_mem {
> >
> >       /* Cap 0002h - CXL_CAP_CAP_ID_PRIMARY_MAILBOX */
> >       struct {
> > +             struct mutex mutex; /* Protects device mailbox and firmware */
> >               void __iomem *regs;
> >               size_t payload_size;
> >       } mbox;
> > @@ -89,5 +127,6 @@ struct cxl_mem {
> >
> >  cxl_reg(status);
> >  cxl_reg(mbox);
> > +cxl_reg(mem);
> >
> >  #endif /* __CXL_H__ */
> > diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> > index fa14d51243ee..69ed15bfa5d4 100644
> > --- a/drivers/cxl/mem.c
> > +++ b/drivers/cxl/mem.c
> > @@ -6,6 +6,270 @@
> >  #include "pci.h"
> >  #include "cxl.h"
> >
> > +#define cxl_doorbell_busy(cxlm)                                                \
> > +     (cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
> > +      CXLDEV_MB_CTRL_DOORBELL)
> > +
> > +#define CXL_MAILBOX_TIMEOUT_US 2000
>
> This should be _MS?
>
> > +
> > +enum opcode {
> > +     CXL_MBOX_OP_IDENTIFY            = 0x4000,
> > +     CXL_MBOX_OP_MAX                 = 0x10000
> > +};
> > +
> > +/**
> > + * struct mbox_cmd - A command to be submitted to hardware.
> > + * @opcode: (input) The command set and command submitted to hardware.
> > + * @payload_in: (input) Pointer to the input payload.
> > + * @payload_out: (output) Pointer to the output payload. Must be allocated by
> > + *            the caller.
> > + * @size_in: (input) Number of bytes to load from @payload.
> > + * @size_out: (output) Number of bytes loaded into @payload.
> > + * @return_code: (output) Error code returned from hardware.
> > + *
> > + * This is the primary mechanism used to send commands to the hardware.
> > + * All the fields except @payload_* correspond exactly to the fields described in
> > + * Command Register section of the CXL 2.0 spec (8.2.8.4.5). @payload_in and
> > + * @payload_out are written to, and read from the Command Payload Registers
> > + * defined in (8.2.8.4.8).
> > + */
> > +struct mbox_cmd {
> > +     u16 opcode;
> > +     void *payload_in;
> > +     void *payload_out;
> > +     size_t size_in;
> > +     size_t size_out;
> > +     u16 return_code;
> > +#define CXL_MBOX_SUCCESS 0
> > +};
> > +
> > +static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
> > +{
> > +     const int timeout = msecs_to_jiffies(CXL_MAILBOX_TIMEOUT_US);
> > +     const unsigned long start = jiffies;
> > +     unsigned long end = start;
> > +
> > +     while (cxl_doorbell_busy(cxlm)) {
> > +             end = jiffies;
> > +
> > +             if (time_after(end, start + timeout)) {
> > +                     /* Check again in case preempted before timeout test */
> > +                     if (!cxl_doorbell_busy(cxlm))
> > +                             break;
> > +                     return -ETIMEDOUT;
> > +             }
> > +             cpu_relax();
> > +     }
> > +
> > +     dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
> > +             jiffies_to_msecs(end) - jiffies_to_msecs(start));
> > +     return 0;
> > +}
> > +
> > +static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
> > +                              struct mbox_cmd *mbox_cmd)
> > +{
> > +     dev_warn(&cxlm->pdev->dev, "Mailbox command timed out\n");
> > +     dev_info(&cxlm->pdev->dev,
> > +              "\topcode: 0x%04x\n"
> > +              "\tpayload size: %zub\n",
> > +              mbox_cmd->opcode, mbox_cmd->size_in);
> > +
> > +     if (IS_ENABLED(CONFIG_CXL_MEM_INSECURE_DEBUG)) {
> > +             print_hex_dump_debug("Payload ", DUMP_PREFIX_OFFSET, 16, 1,
> > +                                  mbox_cmd->payload_in, mbox_cmd->size_in,
> > +                                  true);
> > +     }
> > +
> > +     /* Here's a good place to figure out if a device reset is needed */
>
> What are the implications if we don't do a reset, as this implementation
> does not?  IOW, does a timeout require a device to be recovered through a
> reset before it can receive additional commands, or is it safe to simply
> drop the command that timed out on the floor and proceed?

Not a satisfying answer, but "it depends". It's also complicated by
the fact that a reset may need to be coordinated with other devices in
the interleave-set as the HDM decoders may bounce.

For comparison, to date there have been no problems with the "drop on
the floor" policy of LIBNVDIMM command timeouts. At the same time
there simply was not a software visible reset mechanism for those
devices so this problem never came out. This mailbox isn't a fast
path, so the device is likely completely dead if this timeout is ever
violated, and the firmware reporting a timeout might as well assume
that the OS gives up on the device.

I'll let Ben chime in on the rest...
Ben Widawsky Feb. 2, 2021, 10:50 p.m. UTC | #6
On 21-01-30 15:51:57, David Rientjes wrote:
> On Fri, 29 Jan 2021, Ben Widawsky wrote:
> 

[snip]

> > +/**
> > + * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
> > + * @cxlm: The CXL memory device to communicate with.
> > + * @mbox_cmd: Command to send to the memory device.
> > + *
> > + * Context: Any context. Expects mbox_lock to be held.
> > + * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
> > + *         Caller should check the return code in @mbox_cmd to make sure it
> > + *         succeeded.
> > + *
> > + * This is a generic form of the CXL mailbox send command, thus the only I/O
> > + * operations used are cxl_read_mbox_reg(). Memory devices, and perhaps other
> > + * types of CXL devices may have further information available upon error
> > + * conditions.
> > + *
> > + * The CXL spec allows for up to two mailboxes. The intention is for the primary
> > + * mailbox to be OS controlled and the secondary mailbox to be used by system
> > + * firmware. This allows the OS and firmware to communicate with the device and
> > + * not need to coordinate with each other. The driver only uses the primary
> > + * mailbox.
> > + */
> > +static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
> > +				 struct mbox_cmd *mbox_cmd)
> > +{
> > +	void __iomem *payload = cxlm->mbox.regs + CXLDEV_MB_PAYLOAD_OFFSET;
> 
> Do you need to verify the payload is non-empty per 8.2.8.4?
> 

What do you mean exactly? Emptiness or lack thereof is what determines the size
parameter of the mailbox interface (if we want to input data, we need to write
size, if we got output data, we have to read size bytes out of the payload
registers).

Perhaps I miss the point though, could you elaborate?

[snip]
Ben Widawsky Feb. 2, 2021, 10:57 p.m. UTC | #7
On 21-02-01 12:00:18, Dan Williams wrote:
> On Sat, Jan 30, 2021 at 3:52 PM David Rientjes <rientjes@google.com> wrote:
> >
> > On Fri, 29 Jan 2021, Ben Widawsky wrote:
> >
> > > Provide enough functionality to utilize the mailbox of a memory device.
> > > The mailbox is used to interact with the firmware running on the memory
> > > device.
> > >
> > > The CXL specification defines separate capabilities for the mailbox and
> > > the memory device. The mailbox interface has a doorbell to indicate
> > > ready to accept commands and the memory device has a capability register
> > > that indicates the mailbox interface is ready. The expectation is that
> > > the doorbell-ready is always later than the memory-device-indication
> > > that the mailbox is ready.
> > >
> > > Create a function to handle sending a command, optionally with a
> > > payload, to the memory device, polling on a result, and then optionally
> > > copying out the payload. The algorithm for doing this comes straight out
> > > of the CXL 2.0 specification.
> > >
> > > Primary mailboxes are capable of generating an interrupt when submitting
> > > a command in the background. That implementation is saved for a later
> > > time.
> > >
> > > Secondary mailboxes aren't implemented at this time.
> > >
> > > The flow is proven with one implemented command, "identify". Because the
> > > class code has already told the driver this is a memory device and the
> > > identify command is mandatory.
> > >
> > > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > > ---
> > >  drivers/cxl/Kconfig |  14 ++
> > >  drivers/cxl/cxl.h   |  39 +++++
> > >  drivers/cxl/mem.c   | 342 +++++++++++++++++++++++++++++++++++++++++++-
> > >  3 files changed, 394 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > > index 3b66b46af8a0..fe591f74af96 100644
> > > --- a/drivers/cxl/Kconfig
> > > +++ b/drivers/cxl/Kconfig
> > > @@ -32,4 +32,18 @@ config CXL_MEM
> > >         Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
> > >
> > >         If unsure say 'm'.
> > > +
> > > +config CXL_MEM_INSECURE_DEBUG
> > > +     bool "CXL.mem debugging"
> > > +     depends on CXL_MEM
> > > +     help
> > > +       Enable debug of all CXL command payloads.
> > > +
> > > +       Some CXL devices and controllers support encryption and other
> > > +       security features. The payloads for the commands that enable
> > > +       those features may contain sensitive clear-text security
> > > +       material. Disable debug of those command payloads by default.
> > > +       If you are a kernel developer actively working on CXL
> > > +       security enabling say Y, otherwise say N.
> >
> > Not specific to this patch, but the reference to encryption made me
> > curious about integrity: are all CXL.mem devices compatible with DIMP?
> > Some?  None?
> 
> The encryption here is "device passphrase" similar to the NVDIMM
> Security Management described here:
> 
> https://pmem.io/documents/IntelOptanePMem_DSM_Interface-V2.0.pdf
> 
> The LIBNVDIMM enabling wrapped this support with the Linux keys
> interface which among other things enforces wrapping the clear text
> passphrase with a Linux "trusted/encrypted" key.
> 
> Additionally, the CXL.io interface optionally supports PCI IDE:
> 
> https://www.intel.com/content/dam/www/public/us/en/documents/reference-guides/pcie-device-security-enhancements.pdf
> 
> I'm otherwise not familiar with the DIMP acronym?
> 
> > > +
> > >  endif
> > > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > > index a3da7f8050c4..df3d97154b63 100644
> > > --- a/drivers/cxl/cxl.h
> > > +++ b/drivers/cxl/cxl.h
> > > @@ -31,9 +31,36 @@
> > >  #define CXLDEV_MB_CAPS_OFFSET 0x00
> > >  #define   CXLDEV_MB_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
> > >  #define CXLDEV_MB_CTRL_OFFSET 0x04
> > > +#define   CXLDEV_MB_CTRL_DOORBELL BIT(0)
> > >  #define CXLDEV_MB_CMD_OFFSET 0x08
> > > +#define   CXLDEV_MB_CMD_COMMAND_OPCODE_MASK GENMASK(15, 0)
> > > +#define   CXLDEV_MB_CMD_PAYLOAD_LENGTH_MASK GENMASK(36, 16)
> > >  #define CXLDEV_MB_STATUS_OFFSET 0x10
> > > +#define   CXLDEV_MB_STATUS_RET_CODE_MASK GENMASK(47, 32)
> > >  #define CXLDEV_MB_BG_CMD_STATUS_OFFSET 0x18
> > > +#define CXLDEV_MB_PAYLOAD_OFFSET 0x20
> > > +
> > > +/* Memory Device (CXL 2.0 - 8.2.8.5.1.1) */
> > > +#define CXLMDEV_STATUS_OFFSET 0x0
> > > +#define   CXLMDEV_DEV_FATAL BIT(0)
> > > +#define   CXLMDEV_FW_HALT BIT(1)
> > > +#define   CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2)
> > > +#define     CXLMDEV_MS_NOT_READY 0
> > > +#define     CXLMDEV_MS_READY 1
> > > +#define     CXLMDEV_MS_ERROR 2
> > > +#define     CXLMDEV_MS_DISABLED 3
> > > +#define   CXLMDEV_READY(status) \
> > > +             (CXL_GET_FIELD(status, CXLMDEV_STATUS_MEDIA_STATUS) == CXLMDEV_MS_READY)
> > > +#define   CXLMDEV_MBOX_IF_READY BIT(4)
> > > +#define   CXLMDEV_RESET_NEEDED_SHIFT 5
> > > +#define   CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5)
> > > +#define     CXLMDEV_RESET_NEEDED_NOT 0
> > > +#define     CXLMDEV_RESET_NEEDED_COLD 1
> > > +#define     CXLMDEV_RESET_NEEDED_WARM 2
> > > +#define     CXLMDEV_RESET_NEEDED_HOT 3
> > > +#define     CXLMDEV_RESET_NEEDED_CXL 4
> > > +#define   CXLMDEV_RESET_NEEDED(status) \
> > > +             (CXL_GET_FIELD(status, CXLMDEV_RESET_NEEDED) != CXLMDEV_RESET_NEEDED_NOT)
> > >
> > >  /**
> > >   * struct cxl_mem - A CXL memory device
> > > @@ -44,6 +71,16 @@ struct cxl_mem {
> > >       struct pci_dev *pdev;
> > >       void __iomem *regs;
> > >
> > > +     struct {
> > > +             struct range range;
> > > +     } pmem;
> > > +
> > > +     struct {
> > > +             struct range range;
> > > +     } ram;
> > > +
> > > +     char firmware_version[0x10];
> > > +
> > >       /* Cap 0001h - CXL_CAP_CAP_ID_DEVICE_STATUS */
> > >       struct {
> > >               void __iomem *regs;
> > > @@ -51,6 +88,7 @@ struct cxl_mem {
> > >
> > >       /* Cap 0002h - CXL_CAP_CAP_ID_PRIMARY_MAILBOX */
> > >       struct {
> > > +             struct mutex mutex; /* Protects device mailbox and firmware */
> > >               void __iomem *regs;
> > >               size_t payload_size;
> > >       } mbox;
> > > @@ -89,5 +127,6 @@ struct cxl_mem {
> > >
> > >  cxl_reg(status);
> > >  cxl_reg(mbox);
> > > +cxl_reg(mem);
> > >
> > >  #endif /* __CXL_H__ */
> > > diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> > > index fa14d51243ee..69ed15bfa5d4 100644
> > > --- a/drivers/cxl/mem.c
> > > +++ b/drivers/cxl/mem.c
> > > @@ -6,6 +6,270 @@
> > >  #include "pci.h"
> > >  #include "cxl.h"
> > >
> > > +#define cxl_doorbell_busy(cxlm)                                                \
> > > +     (cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
> > > +      CXLDEV_MB_CTRL_DOORBELL)
> > > +
> > > +#define CXL_MAILBOX_TIMEOUT_US 2000
> >
> > This should be _MS?
> >
> > > +
> > > +enum opcode {
> > > +     CXL_MBOX_OP_IDENTIFY            = 0x4000,
> > > +     CXL_MBOX_OP_MAX                 = 0x10000
> > > +};
> > > +
> > > +/**
> > > + * struct mbox_cmd - A command to be submitted to hardware.
> > > + * @opcode: (input) The command set and command submitted to hardware.
> > > + * @payload_in: (input) Pointer to the input payload.
> > > + * @payload_out: (output) Pointer to the output payload. Must be allocated by
> > > + *            the caller.
> > > + * @size_in: (input) Number of bytes to load from @payload.
> > > + * @size_out: (output) Number of bytes loaded into @payload.
> > > + * @return_code: (output) Error code returned from hardware.
> > > + *
> > > + * This is the primary mechanism used to send commands to the hardware.
> > > + * All the fields except @payload_* correspond exactly to the fields described in
> > > + * Command Register section of the CXL 2.0 spec (8.2.8.4.5). @payload_in and
> > > + * @payload_out are written to, and read from the Command Payload Registers
> > > + * defined in (8.2.8.4.8).
> > > + */
> > > +struct mbox_cmd {
> > > +     u16 opcode;
> > > +     void *payload_in;
> > > +     void *payload_out;
> > > +     size_t size_in;
> > > +     size_t size_out;
> > > +     u16 return_code;
> > > +#define CXL_MBOX_SUCCESS 0
> > > +};
> > > +
> > > +static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
> > > +{
> > > +     const int timeout = msecs_to_jiffies(CXL_MAILBOX_TIMEOUT_US);
> > > +     const unsigned long start = jiffies;
> > > +     unsigned long end = start;
> > > +
> > > +     while (cxl_doorbell_busy(cxlm)) {
> > > +             end = jiffies;
> > > +
> > > +             if (time_after(end, start + timeout)) {
> > > +                     /* Check again in case preempted before timeout test */
> > > +                     if (!cxl_doorbell_busy(cxlm))
> > > +                             break;
> > > +                     return -ETIMEDOUT;
> > > +             }
> > > +             cpu_relax();
> > > +     }
> > > +
> > > +     dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
> > > +             jiffies_to_msecs(end) - jiffies_to_msecs(start));
> > > +     return 0;
> > > +}
> > > +
> > > +static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
> > > +                              struct mbox_cmd *mbox_cmd)
> > > +{
> > > +     dev_warn(&cxlm->pdev->dev, "Mailbox command timed out\n");
> > > +     dev_info(&cxlm->pdev->dev,
> > > +              "\topcode: 0x%04x\n"
> > > +              "\tpayload size: %zub\n",
> > > +              mbox_cmd->opcode, mbox_cmd->size_in);
> > > +
> > > +     if (IS_ENABLED(CONFIG_CXL_MEM_INSECURE_DEBUG)) {
> > > +             print_hex_dump_debug("Payload ", DUMP_PREFIX_OFFSET, 16, 1,
> > > +                                  mbox_cmd->payload_in, mbox_cmd->size_in,
> > > +                                  true);
> > > +     }
> > > +
> > > +     /* Here's a good place to figure out if a device reset is needed */
> >
> > What are the implications if we don't do a reset, as this implementation
> > does not?  IOW, does a timeout require a device to be recovered through a
> > reset before it can receive additional commands, or is it safe to simply
> > drop the command that timed out on the floor and proceed?
> 
> Not a satisfying answer, but "it depends". It's also complicated by
> the fact that a reset may need to be coordinated with other devices in
> the interleave-set as the HDM decoders may bounce.
> 
> For comparison, to date there have been no problems with the "drop on
> the floor" policy of LIBNVDIMM command timeouts. At the same time
> there simply was not a software visible reset mechanism for those
> devices so this problem never came out. This mailbox isn't a fast
> path, so the device is likely completely dead if this timeout is ever
> violated, and the firmware reporting a timeout might as well assume
> that the OS gives up on the device.
> 
> I'll let Ben chime in on the rest...

Reset handling is next on the TODO list for the driver. I had two main reasons
for not even taking a stab at it.
1. I have no good way to test it. We are working on adding some test conditions
   to QEMU for it.
2. The main difficulty in my mind with reset is you can't pull the memory out
   from under the OS here. While the driver doesn't yet handle persistent memory
   capacities, it may have volatile capacity configured by the BIOS. So the goal
   was, get the bits of the driver in that would at least allow developers,
   hardware vendors, and folks contributing to the spec a way to have basic
   interaction with a CXL type 3 device.
Dan Williams Feb. 2, 2021, 11:54 p.m. UTC | #8
On Tue, Feb 2, 2021 at 2:57 PM Ben Widawsky <ben.widawsky@intel.com> wrote:
>
> On 21-02-01 12:00:18, Dan Williams wrote:
> > On Sat, Jan 30, 2021 at 3:52 PM David Rientjes <rientjes@google.com> wrote:
> > >
> > > On Fri, 29 Jan 2021, Ben Widawsky wrote:
> > >
> > > > Provide enough functionality to utilize the mailbox of a memory device.
> > > > The mailbox is used to interact with the firmware running on the memory
> > > > device.
> > > >
> > > > The CXL specification defines separate capabilities for the mailbox and
> > > > the memory device. The mailbox interface has a doorbell to indicate
> > > > ready to accept commands and the memory device has a capability register
> > > > that indicates the mailbox interface is ready. The expectation is that
> > > > the doorbell-ready is always later than the memory-device-indication
> > > > that the mailbox is ready.
> > > >
> > > > Create a function to handle sending a command, optionally with a
> > > > payload, to the memory device, polling on a result, and then optionally
> > > > copying out the payload. The algorithm for doing this comes straight out
> > > > of the CXL 2.0 specification.
> > > >
> > > > Primary mailboxes are capable of generating an interrupt when submitting
> > > > a command in the background. That implementation is saved for a later
> > > > time.
> > > >
> > > > Secondary mailboxes aren't implemented at this time.
> > > >
> > > > The flow is proven with one implemented command, "identify". Because the
> > > > class code has already told the driver this is a memory device and the
> > > > identify command is mandatory.
> > > >
> > > > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > > > ---
> > > >  drivers/cxl/Kconfig |  14 ++
> > > >  drivers/cxl/cxl.h   |  39 +++++
> > > >  drivers/cxl/mem.c   | 342 +++++++++++++++++++++++++++++++++++++++++++-
> > > >  3 files changed, 394 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > > > index 3b66b46af8a0..fe591f74af96 100644
> > > > --- a/drivers/cxl/Kconfig
> > > > +++ b/drivers/cxl/Kconfig
> > > > @@ -32,4 +32,18 @@ config CXL_MEM
> > > >         Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
> > > >
> > > >         If unsure say 'm'.
> > > > +
> > > > +config CXL_MEM_INSECURE_DEBUG
> > > > +     bool "CXL.mem debugging"
> > > > +     depends on CXL_MEM
> > > > +     help
> > > > +       Enable debug of all CXL command payloads.
> > > > +
> > > > +       Some CXL devices and controllers support encryption and other
> > > > +       security features. The payloads for the commands that enable
> > > > +       those features may contain sensitive clear-text security
> > > > +       material. Disable debug of those command payloads by default.
> > > > +       If you are a kernel developer actively working on CXL
> > > > +       security enabling say Y, otherwise say N.
> > >
> > > Not specific to this patch, but the reference to encryption made me
> > > curious about integrity: are all CXL.mem devices compatible with DIMP?
> > > Some?  None?
> >
> > The encryption here is "device passphrase" similar to the NVDIMM
> > Security Management described here:
> >
> > https://pmem.io/documents/IntelOptanePMem_DSM_Interface-V2.0.pdf
> >
> > The LIBNVDIMM enabling wrapped this support with the Linux keys
> > interface which among other things enforces wrapping the clear text
> > passphrase with a Linux "trusted/encrypted" key.
> >
> > Additionally, the CXL.io interface optionally supports PCI IDE:
> >
> > https://www.intel.com/content/dam/www/public/us/en/documents/reference-guides/pcie-device-security-enhancements.pdf
> >
> > I'm otherwise not familiar with the DIMP acronym?
> >
> > > > +
> > > >  endif
> > > > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > > > index a3da7f8050c4..df3d97154b63 100644
> > > > --- a/drivers/cxl/cxl.h
> > > > +++ b/drivers/cxl/cxl.h
> > > > @@ -31,9 +31,36 @@
> > > >  #define CXLDEV_MB_CAPS_OFFSET 0x00
> > > >  #define   CXLDEV_MB_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
> > > >  #define CXLDEV_MB_CTRL_OFFSET 0x04
> > > > +#define   CXLDEV_MB_CTRL_DOORBELL BIT(0)
> > > >  #define CXLDEV_MB_CMD_OFFSET 0x08
> > > > +#define   CXLDEV_MB_CMD_COMMAND_OPCODE_MASK GENMASK(15, 0)
> > > > +#define   CXLDEV_MB_CMD_PAYLOAD_LENGTH_MASK GENMASK(36, 16)
> > > >  #define CXLDEV_MB_STATUS_OFFSET 0x10
> > > > +#define   CXLDEV_MB_STATUS_RET_CODE_MASK GENMASK(47, 32)
> > > >  #define CXLDEV_MB_BG_CMD_STATUS_OFFSET 0x18
> > > > +#define CXLDEV_MB_PAYLOAD_OFFSET 0x20
> > > > +
> > > > +/* Memory Device (CXL 2.0 - 8.2.8.5.1.1) */
> > > > +#define CXLMDEV_STATUS_OFFSET 0x0
> > > > +#define   CXLMDEV_DEV_FATAL BIT(0)
> > > > +#define   CXLMDEV_FW_HALT BIT(1)
> > > > +#define   CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2)
> > > > +#define     CXLMDEV_MS_NOT_READY 0
> > > > +#define     CXLMDEV_MS_READY 1
> > > > +#define     CXLMDEV_MS_ERROR 2
> > > > +#define     CXLMDEV_MS_DISABLED 3
> > > > +#define   CXLMDEV_READY(status) \
> > > > +             (CXL_GET_FIELD(status, CXLMDEV_STATUS_MEDIA_STATUS) == CXLMDEV_MS_READY)
> > > > +#define   CXLMDEV_MBOX_IF_READY BIT(4)
> > > > +#define   CXLMDEV_RESET_NEEDED_SHIFT 5
> > > > +#define   CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5)
> > > > +#define     CXLMDEV_RESET_NEEDED_NOT 0
> > > > +#define     CXLMDEV_RESET_NEEDED_COLD 1
> > > > +#define     CXLMDEV_RESET_NEEDED_WARM 2
> > > > +#define     CXLMDEV_RESET_NEEDED_HOT 3
> > > > +#define     CXLMDEV_RESET_NEEDED_CXL 4
> > > > +#define   CXLMDEV_RESET_NEEDED(status) \
> > > > +             (CXL_GET_FIELD(status, CXLMDEV_RESET_NEEDED) != CXLMDEV_RESET_NEEDED_NOT)
> > > >
> > > >  /**
> > > >   * struct cxl_mem - A CXL memory device
> > > > @@ -44,6 +71,16 @@ struct cxl_mem {
> > > >       struct pci_dev *pdev;
> > > >       void __iomem *regs;
> > > >
> > > > +     struct {
> > > > +             struct range range;
> > > > +     } pmem;
> > > > +
> > > > +     struct {
> > > > +             struct range range;
> > > > +     } ram;
> > > > +
> > > > +     char firmware_version[0x10];
> > > > +
> > > >       /* Cap 0001h - CXL_CAP_CAP_ID_DEVICE_STATUS */
> > > >       struct {
> > > >               void __iomem *regs;
> > > > @@ -51,6 +88,7 @@ struct cxl_mem {
> > > >
> > > >       /* Cap 0002h - CXL_CAP_CAP_ID_PRIMARY_MAILBOX */
> > > >       struct {
> > > > +             struct mutex mutex; /* Protects device mailbox and firmware */
> > > >               void __iomem *regs;
> > > >               size_t payload_size;
> > > >       } mbox;
> > > > @@ -89,5 +127,6 @@ struct cxl_mem {
> > > >
> > > >  cxl_reg(status);
> > > >  cxl_reg(mbox);
> > > > +cxl_reg(mem);
> > > >
> > > >  #endif /* __CXL_H__ */
> > > > diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> > > > index fa14d51243ee..69ed15bfa5d4 100644
> > > > --- a/drivers/cxl/mem.c
> > > > +++ b/drivers/cxl/mem.c
> > > > @@ -6,6 +6,270 @@
> > > >  #include "pci.h"
> > > >  #include "cxl.h"
> > > >
> > > > +#define cxl_doorbell_busy(cxlm)                                                \
> > > > +     (cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
> > > > +      CXLDEV_MB_CTRL_DOORBELL)
> > > > +
> > > > +#define CXL_MAILBOX_TIMEOUT_US 2000
> > >
> > > This should be _MS?
> > >
> > > > +
> > > > +enum opcode {
> > > > +     CXL_MBOX_OP_IDENTIFY            = 0x4000,
> > > > +     CXL_MBOX_OP_MAX                 = 0x10000
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct mbox_cmd - A command to be submitted to hardware.
> > > > + * @opcode: (input) The command set and command submitted to hardware.
> > > > + * @payload_in: (input) Pointer to the input payload.
> > > > + * @payload_out: (output) Pointer to the output payload. Must be allocated by
> > > > + *            the caller.
> > > > + * @size_in: (input) Number of bytes to load from @payload.
> > > > + * @size_out: (output) Number of bytes loaded into @payload.
> > > > + * @return_code: (output) Error code returned from hardware.
> > > > + *
> > > > + * This is the primary mechanism used to send commands to the hardware.
> > > > + * All the fields except @payload_* correspond exactly to the fields described in
> > > > + * Command Register section of the CXL 2.0 spec (8.2.8.4.5). @payload_in and
> > > > + * @payload_out are written to, and read from the Command Payload Registers
> > > > + * defined in (8.2.8.4.8).
> > > > + */
> > > > +struct mbox_cmd {
> > > > +     u16 opcode;
> > > > +     void *payload_in;
> > > > +     void *payload_out;
> > > > +     size_t size_in;
> > > > +     size_t size_out;
> > > > +     u16 return_code;
> > > > +#define CXL_MBOX_SUCCESS 0
> > > > +};
> > > > +
> > > > +static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
> > > > +{
> > > > +     const int timeout = msecs_to_jiffies(CXL_MAILBOX_TIMEOUT_US);
> > > > +     const unsigned long start = jiffies;
> > > > +     unsigned long end = start;
> > > > +
> > > > +     while (cxl_doorbell_busy(cxlm)) {
> > > > +             end = jiffies;
> > > > +
> > > > +             if (time_after(end, start + timeout)) {
> > > > +                     /* Check again in case preempted before timeout test */
> > > > +                     if (!cxl_doorbell_busy(cxlm))
> > > > +                             break;
> > > > +                     return -ETIMEDOUT;
> > > > +             }
> > > > +             cpu_relax();
> > > > +     }
> > > > +
> > > > +     dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
> > > > +             jiffies_to_msecs(end) - jiffies_to_msecs(start));
> > > > +     return 0;
> > > > +}
> > > > +
> > > > +static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
> > > > +                              struct mbox_cmd *mbox_cmd)
> > > > +{
> > > > +     dev_warn(&cxlm->pdev->dev, "Mailbox command timed out\n");
> > > > +     dev_info(&cxlm->pdev->dev,
> > > > +              "\topcode: 0x%04x\n"
> > > > +              "\tpayload size: %zub\n",
> > > > +              mbox_cmd->opcode, mbox_cmd->size_in);
> > > > +
> > > > +     if (IS_ENABLED(CONFIG_CXL_MEM_INSECURE_DEBUG)) {
> > > > +             print_hex_dump_debug("Payload ", DUMP_PREFIX_OFFSET, 16, 1,
> > > > +                                  mbox_cmd->payload_in, mbox_cmd->size_in,
> > > > +                                  true);
> > > > +     }
> > > > +
> > > > +     /* Here's a good place to figure out if a device reset is needed */
> > >
> > > What are the implications if we don't do a reset, as this implementation
> > > does not?  IOW, does a timeout require a device to be recovered through a
> > > reset before it can receive additional commands, or is it safe to simply
> > > drop the command that timed out on the floor and proceed?
> >
> > Not a satisfying answer, but "it depends". It's also complicated by
> > the fact that a reset may need to be coordinated with other devices in
> > the interleave-set as the HDM decoders may bounce.
> >
> > For comparison, to date there have been no problems with the "drop on
> > the floor" policy of LIBNVDIMM command timeouts. At the same time
> > there simply was not a software visible reset mechanism for those
> > devices so this problem never came out. This mailbox isn't a fast
> > path, so the device is likely completely dead if this timeout is ever
> > violated, and the firmware reporting a timeout might as well assume
> > that the OS gives up on the device.
> >
> > I'll let Ben chime in on the rest...
>
> Reset handling is next on the TODO list for the driver. I had two main reasons
> for not even taking a stab at it.
> 1. I have no good way to test it. We are working on adding some test conditions
>    to QEMU for it.
> 2. The main difficulty in my mind with reset is you can't pull the memory out
>    from under the OS here. While the driver doesn't yet handle persistent memory
>    capacities, it may have volatile capacity configured by the BIOS. So the goal
>    was, get the bits of the driver in that would at least allow developers,
>    hardware vendors, and folks contributing to the spec a way to have basic
>    interaction with a CXL type 3 device.

Honestly I think in most cases if the firmware decides to return a
"reset required" status the Linux response will be "lol, no" because
the firmware has no concept of the violence that would impose on the
rest of the system.
Ben Widawsky Feb. 3, 2021, 12:54 a.m. UTC | #9
On 21-02-02 15:54:03, Dan Williams wrote:
> On Tue, Feb 2, 2021 at 2:57 PM Ben Widawsky <ben.widawsky@intel.com> wrote:
> >
> > On 21-02-01 12:00:18, Dan Williams wrote:
> > > On Sat, Jan 30, 2021 at 3:52 PM David Rientjes <rientjes@google.com> wrote:
> > > >
> > > > On Fri, 29 Jan 2021, Ben Widawsky wrote:
> > > >
> > > > > Provide enough functionality to utilize the mailbox of a memory device.
> > > > > The mailbox is used to interact with the firmware running on the memory
> > > > > device.
> > > > >
> > > > > The CXL specification defines separate capabilities for the mailbox and
> > > > > the memory device. The mailbox interface has a doorbell to indicate
> > > > > ready to accept commands and the memory device has a capability register
> > > > > that indicates the mailbox interface is ready. The expectation is that
> > > > > the doorbell-ready is always later than the memory-device-indication
> > > > > that the mailbox is ready.
> > > > >
> > > > > Create a function to handle sending a command, optionally with a
> > > > > payload, to the memory device, polling on a result, and then optionally
> > > > > copying out the payload. The algorithm for doing this comes straight out
> > > > > of the CXL 2.0 specification.
> > > > >
> > > > > Primary mailboxes are capable of generating an interrupt when submitting
> > > > > a command in the background. That implementation is saved for a later
> > > > > time.
> > > > >
> > > > > Secondary mailboxes aren't implemented at this time.
> > > > >
> > > > > The flow is proven with one implemented command, "identify". Because the
> > > > > class code has already told the driver this is a memory device and the
> > > > > identify command is mandatory.
> > > > >
> > > > > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > > > > ---
> > > > >  drivers/cxl/Kconfig |  14 ++
> > > > >  drivers/cxl/cxl.h   |  39 +++++
> > > > >  drivers/cxl/mem.c   | 342 +++++++++++++++++++++++++++++++++++++++++++-
> > > > >  3 files changed, 394 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > > > > index 3b66b46af8a0..fe591f74af96 100644
> > > > > --- a/drivers/cxl/Kconfig
> > > > > +++ b/drivers/cxl/Kconfig
> > > > > @@ -32,4 +32,18 @@ config CXL_MEM
> > > > >         Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
> > > > >
> > > > >         If unsure say 'm'.
> > > > > +
> > > > > +config CXL_MEM_INSECURE_DEBUG
> > > > > +     bool "CXL.mem debugging"
> > > > > +     depends on CXL_MEM
> > > > > +     help
> > > > > +       Enable debug of all CXL command payloads.
> > > > > +
> > > > > +       Some CXL devices and controllers support encryption and other
> > > > > +       security features. The payloads for the commands that enable
> > > > > +       those features may contain sensitive clear-text security
> > > > > +       material. Disable debug of those command payloads by default.
> > > > > +       If you are a kernel developer actively working on CXL
> > > > > +       security enabling say Y, otherwise say N.
> > > >
> > > > Not specific to this patch, but the reference to encryption made me
> > > > curious about integrity: are all CXL.mem devices compatible with DIMP?
> > > > Some?  None?
> > >
> > > The encryption here is "device passphrase" similar to the NVDIMM
> > > Security Management described here:
> > >
> > > https://pmem.io/documents/IntelOptanePMem_DSM_Interface-V2.0.pdf
> > >
> > > The LIBNVDIMM enabling wrapped this support with the Linux keys
> > > interface which among other things enforces wrapping the clear text
> > > passphrase with a Linux "trusted/encrypted" key.
> > >
> > > Additionally, the CXL.io interface optionally supports PCI IDE:
> > >
> > > https://www.intel.com/content/dam/www/public/us/en/documents/reference-guides/pcie-device-security-enhancements.pdf
> > >
> > > I'm otherwise not familiar with the DIMP acronym?
> > >
> > > > > +
> > > > >  endif
> > > > > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > > > > index a3da7f8050c4..df3d97154b63 100644
> > > > > --- a/drivers/cxl/cxl.h
> > > > > +++ b/drivers/cxl/cxl.h
> > > > > @@ -31,9 +31,36 @@
> > > > >  #define CXLDEV_MB_CAPS_OFFSET 0x00
> > > > >  #define   CXLDEV_MB_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
> > > > >  #define CXLDEV_MB_CTRL_OFFSET 0x04
> > > > > +#define   CXLDEV_MB_CTRL_DOORBELL BIT(0)
> > > > >  #define CXLDEV_MB_CMD_OFFSET 0x08
> > > > > +#define   CXLDEV_MB_CMD_COMMAND_OPCODE_MASK GENMASK(15, 0)
> > > > > +#define   CXLDEV_MB_CMD_PAYLOAD_LENGTH_MASK GENMASK(36, 16)
> > > > >  #define CXLDEV_MB_STATUS_OFFSET 0x10
> > > > > +#define   CXLDEV_MB_STATUS_RET_CODE_MASK GENMASK(47, 32)
> > > > >  #define CXLDEV_MB_BG_CMD_STATUS_OFFSET 0x18
> > > > > +#define CXLDEV_MB_PAYLOAD_OFFSET 0x20
> > > > > +
> > > > > +/* Memory Device (CXL 2.0 - 8.2.8.5.1.1) */
> > > > > +#define CXLMDEV_STATUS_OFFSET 0x0
> > > > > +#define   CXLMDEV_DEV_FATAL BIT(0)
> > > > > +#define   CXLMDEV_FW_HALT BIT(1)
> > > > > +#define   CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2)
> > > > > +#define     CXLMDEV_MS_NOT_READY 0
> > > > > +#define     CXLMDEV_MS_READY 1
> > > > > +#define     CXLMDEV_MS_ERROR 2
> > > > > +#define     CXLMDEV_MS_DISABLED 3
> > > > > +#define   CXLMDEV_READY(status) \
> > > > > +             (CXL_GET_FIELD(status, CXLMDEV_STATUS_MEDIA_STATUS) == CXLMDEV_MS_READY)
> > > > > +#define   CXLMDEV_MBOX_IF_READY BIT(4)
> > > > > +#define   CXLMDEV_RESET_NEEDED_SHIFT 5
> > > > > +#define   CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5)
> > > > > +#define     CXLMDEV_RESET_NEEDED_NOT 0
> > > > > +#define     CXLMDEV_RESET_NEEDED_COLD 1
> > > > > +#define     CXLMDEV_RESET_NEEDED_WARM 2
> > > > > +#define     CXLMDEV_RESET_NEEDED_HOT 3
> > > > > +#define     CXLMDEV_RESET_NEEDED_CXL 4
> > > > > +#define   CXLMDEV_RESET_NEEDED(status) \
> > > > > +             (CXL_GET_FIELD(status, CXLMDEV_RESET_NEEDED) != CXLMDEV_RESET_NEEDED_NOT)
> > > > >
> > > > >  /**
> > > > >   * struct cxl_mem - A CXL memory device
> > > > > @@ -44,6 +71,16 @@ struct cxl_mem {
> > > > >       struct pci_dev *pdev;
> > > > >       void __iomem *regs;
> > > > >
> > > > > +     struct {
> > > > > +             struct range range;
> > > > > +     } pmem;
> > > > > +
> > > > > +     struct {
> > > > > +             struct range range;
> > > > > +     } ram;
> > > > > +
> > > > > +     char firmware_version[0x10];
> > > > > +
> > > > >       /* Cap 0001h - CXL_CAP_CAP_ID_DEVICE_STATUS */
> > > > >       struct {
> > > > >               void __iomem *regs;
> > > > > @@ -51,6 +88,7 @@ struct cxl_mem {
> > > > >
> > > > >       /* Cap 0002h - CXL_CAP_CAP_ID_PRIMARY_MAILBOX */
> > > > >       struct {
> > > > > +             struct mutex mutex; /* Protects device mailbox and firmware */
> > > > >               void __iomem *regs;
> > > > >               size_t payload_size;
> > > > >       } mbox;
> > > > > @@ -89,5 +127,6 @@ struct cxl_mem {
> > > > >
> > > > >  cxl_reg(status);
> > > > >  cxl_reg(mbox);
> > > > > +cxl_reg(mem);
> > > > >
> > > > >  #endif /* __CXL_H__ */
> > > > > diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> > > > > index fa14d51243ee..69ed15bfa5d4 100644
> > > > > --- a/drivers/cxl/mem.c
> > > > > +++ b/drivers/cxl/mem.c
> > > > > @@ -6,6 +6,270 @@
> > > > >  #include "pci.h"
> > > > >  #include "cxl.h"
> > > > >
> > > > > +#define cxl_doorbell_busy(cxlm)                                                \
> > > > > +     (cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
> > > > > +      CXLDEV_MB_CTRL_DOORBELL)
> > > > > +
> > > > > +#define CXL_MAILBOX_TIMEOUT_US 2000
> > > >
> > > > This should be _MS?
> > > >
> > > > > +
> > > > > +enum opcode {
> > > > > +     CXL_MBOX_OP_IDENTIFY            = 0x4000,
> > > > > +     CXL_MBOX_OP_MAX                 = 0x10000
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct mbox_cmd - A command to be submitted to hardware.
> > > > > + * @opcode: (input) The command set and command submitted to hardware.
> > > > > + * @payload_in: (input) Pointer to the input payload.
> > > > > + * @payload_out: (output) Pointer to the output payload. Must be allocated by
> > > > > + *            the caller.
> > > > > + * @size_in: (input) Number of bytes to load from @payload.
> > > > > + * @size_out: (output) Number of bytes loaded into @payload.
> > > > > + * @return_code: (output) Error code returned from hardware.
> > > > > + *
> > > > > + * This is the primary mechanism used to send commands to the hardware.
> > > > > + * All the fields except @payload_* correspond exactly to the fields described in
> > > > > + * Command Register section of the CXL 2.0 spec (8.2.8.4.5). @payload_in and
> > > > > + * @payload_out are written to, and read from the Command Payload Registers
> > > > > + * defined in (8.2.8.4.8).
> > > > > + */
> > > > > +struct mbox_cmd {
> > > > > +     u16 opcode;
> > > > > +     void *payload_in;
> > > > > +     void *payload_out;
> > > > > +     size_t size_in;
> > > > > +     size_t size_out;
> > > > > +     u16 return_code;
> > > > > +#define CXL_MBOX_SUCCESS 0
> > > > > +};
> > > > > +
> > > > > +static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
> > > > > +{
> > > > > +     const int timeout = msecs_to_jiffies(CXL_MAILBOX_TIMEOUT_US);
> > > > > +     const unsigned long start = jiffies;
> > > > > +     unsigned long end = start;
> > > > > +
> > > > > +     while (cxl_doorbell_busy(cxlm)) {
> > > > > +             end = jiffies;
> > > > > +
> > > > > +             if (time_after(end, start + timeout)) {
> > > > > +                     /* Check again in case preempted before timeout test */
> > > > > +                     if (!cxl_doorbell_busy(cxlm))
> > > > > +                             break;
> > > > > +                     return -ETIMEDOUT;
> > > > > +             }
> > > > > +             cpu_relax();
> > > > > +     }
> > > > > +
> > > > > +     dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
> > > > > +             jiffies_to_msecs(end) - jiffies_to_msecs(start));
> > > > > +     return 0;
> > > > > +}
> > > > > +
> > > > > +static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
> > > > > +                              struct mbox_cmd *mbox_cmd)
> > > > > +{
> > > > > +     dev_warn(&cxlm->pdev->dev, "Mailbox command timed out\n");
> > > > > +     dev_info(&cxlm->pdev->dev,
> > > > > +              "\topcode: 0x%04x\n"
> > > > > +              "\tpayload size: %zub\n",
> > > > > +              mbox_cmd->opcode, mbox_cmd->size_in);
> > > > > +
> > > > > +     if (IS_ENABLED(CONFIG_CXL_MEM_INSECURE_DEBUG)) {
> > > > > +             print_hex_dump_debug("Payload ", DUMP_PREFIX_OFFSET, 16, 1,
> > > > > +                                  mbox_cmd->payload_in, mbox_cmd->size_in,
> > > > > +                                  true);
> > > > > +     }
> > > > > +
> > > > > +     /* Here's a good place to figure out if a device reset is needed */
> > > >
> > > > What are the implications if we don't do a reset, as this implementation
> > > > does not?  IOW, does a timeout require a device to be recovered through a
> > > > reset before it can receive additional commands, or is it safe to simply
> > > > drop the command that timed out on the floor and proceed?
> > >
> > > Not a satisfying answer, but "it depends". It's also complicated by
> > > the fact that a reset may need to be coordinated with other devices in
> > > the interleave-set as the HDM decoders may bounce.
> > >
> > > For comparison, to date there have been no problems with the "drop on
> > > the floor" policy of LIBNVDIMM command timeouts. At the same time
> > > there simply was not a software visible reset mechanism for those
> > > devices so this problem never came out. This mailbox isn't a fast
> > > path, so the device is likely completely dead if this timeout is ever
> > > violated, and the firmware reporting a timeout might as well assume
> > > that the OS gives up on the device.
> > >
> > > I'll let Ben chime in on the rest...
> >
> > Reset handling is next on the TODO list for the driver. I had two main reasons
> > for not even taking a stab at it.
> > 1. I have no good way to test it. We are working on adding some test conditions
> >    to QEMU for it.
> > 2. The main difficulty in my mind with reset is you can't pull the memory out
> >    from under the OS here. While the driver doesn't yet handle persistent memory
> >    capacities, it may have volatile capacity configured by the BIOS. So the goal
> >    was, get the bits of the driver in that would at least allow developers,
> >    hardware vendors, and folks contributing to the spec a way to have basic
> >    interaction with a CXL type 3 device.
> 
> Honestly I think in most cases if the firmware decides to return a
> "reset required" status the Linux response will be "lol, no" because
> the firmware has no concept of the violence that would impose on the
> rest of the system.

How about UAPI to initiate a reset? I think a sysfs bool would do the trick.
Maybe sysfs file to display current error status, and one to reset?
John Groves (jgroves) Feb. 4, 2021, 9:53 p.m. UTC | #10
Micron Confidential



From: Dan Williams <dan.j.williams@intel.com>
Date: Monday, February 1, 2021 at 1:28 PM
To: Ben Widawsky <ben.widawsky@intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>, linux-cxl@vger.kernel.org <linux-cxl@vger.kernel.org>, Linux ACPI <linux-acpi@vger.kernel.org>, Linux Kernel Mailing List <linux-kernel@vger.kernel.org>, linux-nvdimm <linux-nvdimm@lists.01.org>, Linux PCI <linux-pci@vger.kernel.org>, Bjorn Helgaas <helgaas@kernel.org>, Chris Browy <cbrowy@avery-design.com>, Christoph Hellwig <hch@infradead.org>, Ira Weiny <ira.weiny@intel.com>, Jon Masters <jcm@jonmasters.org>, Jonathan Cameron <Jonathan.Cameron@huawei.com>, Rafael Wysocki <rafael.j.wysocki@intel.com>, Randy Dunlap <rdunlap@infradead.org>, Vishal Verma <vishal.l.verma@intel.com>, daniel.lll@alibaba-inc.com <daniel.lll@alibaba-inc.com>, John Groves (jgroves) <jgroves@micron.com>, Kelley, Sean V <sean.v.kelley@intel.com>
Subject: [EXT] Re: [PATCH 04/14] cxl/mem: Implement polled mode mailbox
CAUTION: EXTERNAL EMAIL. Do not click links or open attachments unless you recognize the sender and were expecting this message.


On Mon, Feb 1, 2021 at 11:13 AM Ben Widawsky <ben.widawsky@intel.com> wrote:
>
> On 21-02-01 12:54:00, Konrad Rzeszutek Wilk wrote:
> > > +#define cxl_doorbell_busy(cxlm)                                                \
> > > +   (cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
> > > +    CXLDEV_MB_CTRL_DOORBELL)
> > > +
> > > +#define CXL_MAILBOX_TIMEOUT_US 2000
> >
> > You been using the spec for the values. Is that number also from it ?
> >
>
> Yes it is. I'll add a comment with the spec reference.


From section 8.2.8.4 in the CXL 2.0 spec: “The mailbox command timeout is 2 seconds.”  So this should be:

#define CXL_MAILBOX_TIMEOUT_US 2000000

…right? 2000us is 2ms…

[snip]


Micron Confidential
Ben Widawsky Feb. 4, 2021, 10:24 p.m. UTC | #11
On 21-02-04 21:53:29, John Groves (jgroves) wrote:
>    Micron Confidential
> 
> 
> 
>    From: Dan Williams <dan.j.williams@intel.com>
>    Date: Monday, February 1, 2021 at 1:28 PM
>    To: Ben Widawsky <ben.widawsky@intel.com>
>    Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>,
>    linux-cxl@vger.kernel.org <linux-cxl@vger.kernel.org>, Linux ACPI
>    <linux-acpi@vger.kernel.org>, Linux Kernel Mailing List
>    <linux-kernel@vger.kernel.org>, linux-nvdimm
>    <linux-nvdimm@lists.01.org>, Linux PCI <linux-pci@vger.kernel.org>,
>    Bjorn Helgaas <helgaas@kernel.org>, Chris Browy
>    <cbrowy@avery-design.com>, Christoph Hellwig <hch@infradead.org>, Ira
>    Weiny <ira.weiny@intel.com>, Jon Masters <jcm@jonmasters.org>, Jonathan
>    Cameron <Jonathan.Cameron@huawei.com>, Rafael Wysocki
>    <rafael.j.wysocki@intel.com>, Randy Dunlap <rdunlap@infradead.org>,
>    Vishal Verma <vishal.l.verma@intel.com>, daniel.lll@alibaba-inc.com
>    <daniel.lll@alibaba-inc.com>, John Groves (jgroves)
>    <jgroves@micron.com>, Kelley, Sean V <sean.v.kelley@intel.com>
>    Subject: [EXT] Re: [PATCH 04/14] cxl/mem: Implement polled mode mailbox
> 
>    CAUTION: EXTERNAL EMAIL. Do not click links or open attachments unless
>    you recognize the sender and were expecting this message.
>    On Mon, Feb 1, 2021 at 11:13 AM Ben Widawsky <ben.widawsky@intel.com>
>    wrote:
>    >
>    > On 21-02-01 12:54:00, Konrad Rzeszutek Wilk wrote:
>    > > > +#define
>    cxl_doorbell_busy(cxlm)
>    \
>    > > > +   (cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET)
>    &                    \
>    > > > +    CXLDEV_MB_CTRL_DOORBELL)
>    > > > +
>    > > > +#define CXL_MAILBOX_TIMEOUT_US 2000
>    > >
>    > > You been using the spec for the values. Is that number also from it
>    ?
>    > >
>    >
>    > Yes it is. I'll add a comment with the spec reference.
> 
> 
> 
>    From section 8.2.8.4 in the CXL 2.0 spec: “The mailbox command timeout
>    is 2 seconds.”  So this should be:
> 
> 
>    #define CXL_MAILBOX_TIMEOUT_US 2000000
> 
> 
>    …right? 2000us is 2ms…
> 
> 

Thanks. This was caught already in earlier review by David Rientjes
<rientjes@google.com>

It's renamed CXL_MAILBOX_TIMEOUT_MS 2000
diff mbox series

Patch

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 3b66b46af8a0..fe591f74af96 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -32,4 +32,18 @@  config CXL_MEM
 	  Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
 
 	  If unsure say 'm'.
+
+config CXL_MEM_INSECURE_DEBUG
+	bool "CXL.mem debugging"
+	depends on CXL_MEM
+	help
+	  Enable debug of all CXL command payloads.
+
+	  Some CXL devices and controllers support encryption and other
+	  security features. The payloads for the commands that enable
+	  those features may contain sensitive clear-text security
+	  material. Disable debug of those command payloads by default.
+	  If you are a kernel developer actively working on CXL
+	  security enabling say Y, otherwise say N.
+
 endif
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index a3da7f8050c4..df3d97154b63 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -31,9 +31,36 @@ 
 #define CXLDEV_MB_CAPS_OFFSET 0x00
 #define   CXLDEV_MB_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
 #define CXLDEV_MB_CTRL_OFFSET 0x04
+#define   CXLDEV_MB_CTRL_DOORBELL BIT(0)
 #define CXLDEV_MB_CMD_OFFSET 0x08
+#define   CXLDEV_MB_CMD_COMMAND_OPCODE_MASK GENMASK(15, 0)
+#define   CXLDEV_MB_CMD_PAYLOAD_LENGTH_MASK GENMASK(36, 16)
 #define CXLDEV_MB_STATUS_OFFSET 0x10
+#define   CXLDEV_MB_STATUS_RET_CODE_MASK GENMASK(47, 32)
 #define CXLDEV_MB_BG_CMD_STATUS_OFFSET 0x18
+#define CXLDEV_MB_PAYLOAD_OFFSET 0x20
+
+/* Memory Device (CXL 2.0 - 8.2.8.5.1.1) */
+#define CXLMDEV_STATUS_OFFSET 0x0
+#define   CXLMDEV_DEV_FATAL BIT(0)
+#define   CXLMDEV_FW_HALT BIT(1)
+#define   CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2)
+#define     CXLMDEV_MS_NOT_READY 0
+#define     CXLMDEV_MS_READY 1
+#define     CXLMDEV_MS_ERROR 2
+#define     CXLMDEV_MS_DISABLED 3
+#define   CXLMDEV_READY(status) \
+		(CXL_GET_FIELD(status, CXLMDEV_STATUS_MEDIA_STATUS) == CXLMDEV_MS_READY)
+#define   CXLMDEV_MBOX_IF_READY BIT(4)
+#define   CXLMDEV_RESET_NEEDED_SHIFT 5
+#define   CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5)
+#define     CXLMDEV_RESET_NEEDED_NOT 0
+#define     CXLMDEV_RESET_NEEDED_COLD 1
+#define     CXLMDEV_RESET_NEEDED_WARM 2
+#define     CXLMDEV_RESET_NEEDED_HOT 3
+#define     CXLMDEV_RESET_NEEDED_CXL 4
+#define   CXLMDEV_RESET_NEEDED(status) \
+		(CXL_GET_FIELD(status, CXLMDEV_RESET_NEEDED) != CXLMDEV_RESET_NEEDED_NOT)
 
 /**
  * struct cxl_mem - A CXL memory device
@@ -44,6 +71,16 @@  struct cxl_mem {
 	struct pci_dev *pdev;
 	void __iomem *regs;
 
+	struct {
+		struct range range;
+	} pmem;
+
+	struct {
+		struct range range;
+	} ram;
+
+	char firmware_version[0x10];
+
 	/* Cap 0001h - CXL_CAP_CAP_ID_DEVICE_STATUS */
 	struct {
 		void __iomem *regs;
@@ -51,6 +88,7 @@  struct cxl_mem {
 
 	/* Cap 0002h - CXL_CAP_CAP_ID_PRIMARY_MAILBOX */
 	struct {
+		struct mutex mutex; /* Protects device mailbox and firmware */
 		void __iomem *regs;
 		size_t payload_size;
 	} mbox;
@@ -89,5 +127,6 @@  struct cxl_mem {
 
 cxl_reg(status);
 cxl_reg(mbox);
+cxl_reg(mem);
 
 #endif /* __CXL_H__ */
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index fa14d51243ee..69ed15bfa5d4 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -6,6 +6,270 @@ 
 #include "pci.h"
 #include "cxl.h"
 
+#define cxl_doorbell_busy(cxlm)                                                \
+	(cxl_read_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET) &                    \
+	 CXLDEV_MB_CTRL_DOORBELL)
+
+#define CXL_MAILBOX_TIMEOUT_US 2000
+
+enum opcode {
+	CXL_MBOX_OP_IDENTIFY		= 0x4000,
+	CXL_MBOX_OP_MAX			= 0x10000
+};
+
+/**
+ * struct mbox_cmd - A command to be submitted to hardware.
+ * @opcode: (input) The command set and command submitted to hardware.
+ * @payload_in: (input) Pointer to the input payload.
+ * @payload_out: (output) Pointer to the output payload. Must be allocated by
+ *		 the caller.
+ * @size_in: (input) Number of bytes to load from @payload.
+ * @size_out: (output) Number of bytes loaded into @payload.
+ * @return_code: (output) Error code returned from hardware.
+ *
+ * This is the primary mechanism used to send commands to the hardware.
+ * All the fields except @payload_* correspond exactly to the fields described in
+ * Command Register section of the CXL 2.0 spec (8.2.8.4.5). @payload_in and
+ * @payload_out are written to, and read from the Command Payload Registers
+ * defined in (8.2.8.4.8).
+ */
+struct mbox_cmd {
+	u16 opcode;
+	void *payload_in;
+	void *payload_out;
+	size_t size_in;
+	size_t size_out;
+	u16 return_code;
+#define CXL_MBOX_SUCCESS 0
+};
+
+static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
+{
+	const int timeout = msecs_to_jiffies(CXL_MAILBOX_TIMEOUT_US);
+	const unsigned long start = jiffies;
+	unsigned long end = start;
+
+	while (cxl_doorbell_busy(cxlm)) {
+		end = jiffies;
+
+		if (time_after(end, start + timeout)) {
+			/* Check again in case preempted before timeout test */
+			if (!cxl_doorbell_busy(cxlm))
+				break;
+			return -ETIMEDOUT;
+		}
+		cpu_relax();
+	}
+
+	dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
+		jiffies_to_msecs(end) - jiffies_to_msecs(start));
+	return 0;
+}
+
+static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
+				 struct mbox_cmd *mbox_cmd)
+{
+	dev_warn(&cxlm->pdev->dev, "Mailbox command timed out\n");
+	dev_info(&cxlm->pdev->dev,
+		 "\topcode: 0x%04x\n"
+		 "\tpayload size: %zub\n",
+		 mbox_cmd->opcode, mbox_cmd->size_in);
+
+	if (IS_ENABLED(CONFIG_CXL_MEM_INSECURE_DEBUG)) {
+		print_hex_dump_debug("Payload ", DUMP_PREFIX_OFFSET, 16, 1,
+				     mbox_cmd->payload_in, mbox_cmd->size_in,
+				     true);
+	}
+
+	/* Here's a good place to figure out if a device reset is needed */
+}
+
+/**
+ * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
+ * @cxlm: The CXL memory device to communicate with.
+ * @mbox_cmd: Command to send to the memory device.
+ *
+ * Context: Any context. Expects mbox_lock to be held.
+ * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
+ *         Caller should check the return code in @mbox_cmd to make sure it
+ *         succeeded.
+ *
+ * This is a generic form of the CXL mailbox send command, thus the only I/O
+ * operations used are cxl_read_mbox_reg(). Memory devices, and perhaps other
+ * types of CXL devices may have further information available upon error
+ * conditions.
+ *
+ * The CXL spec allows for up to two mailboxes. The intention is for the primary
+ * mailbox to be OS controlled and the secondary mailbox to be used by system
+ * firmware. This allows the OS and firmware to communicate with the device and
+ * not need to coordinate with each other. The driver only uses the primary
+ * mailbox.
+ */
+static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
+				 struct mbox_cmd *mbox_cmd)
+{
+	void __iomem *payload = cxlm->mbox.regs + CXLDEV_MB_PAYLOAD_OFFSET;
+	u64 cmd_reg, status_reg;
+	size_t out_len;
+	int rc;
+
+	lockdep_assert_held(&cxlm->mbox.mutex);
+
+	/*
+	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
+	 *   1. Caller reads MB Control Register to verify doorbell is clear
+	 *   2. Caller writes Command Register
+	 *   3. Caller writes Command Payload Registers if input payload is non-empty
+	 *   4. Caller writes MB Control Register to set doorbell
+	 *   5. Caller either polls for doorbell to be clear or waits for interrupt if configured
+	 *   6. Caller reads MB Status Register to fetch Return code
+	 *   7. If command successful, Caller reads Command Register to get Payload Length
+	 *   8. If output payload is non-empty, host reads Command Payload Registers
+	 *
+	 *   Hardware is free to do whatever it wants before the doorbell is
+	 *   rung, and isn't allowed to change anything after it clears the
+	 *   doorbell. As such, steps 2 and 3 can happen in any order, and steps
+	 *   6, 7, 8 can also happen in any order (though some orders might not
+	 *   make sense).
+	 */
+
+	/* #1 */
+	if (cxl_doorbell_busy(cxlm)) {
+		dev_err_ratelimited(&cxlm->pdev->dev,
+				    "Mailbox re-busy after acquiring\n");
+		return -EBUSY;
+	}
+
+	cmd_reg = CXL_SET_FIELD(mbox_cmd->opcode, CXLDEV_MB_CMD_COMMAND_OPCODE);
+	if (mbox_cmd->size_in) {
+		if (WARN_ON(!mbox_cmd->payload_in))
+			return -EINVAL;
+
+		cmd_reg |= CXL_SET_FIELD(mbox_cmd->size_in,
+					 CXLDEV_MB_CMD_PAYLOAD_LENGTH);
+		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
+	}
+
+	/* #2, #3 */
+	cxl_write_mbox_reg64(cxlm, CXLDEV_MB_CMD_OFFSET, cmd_reg);
+
+	/* #4 */
+	dev_dbg(&cxlm->pdev->dev, "Sending command\n");
+	cxl_write_mbox_reg32(cxlm, CXLDEV_MB_CTRL_OFFSET,
+			     CXLDEV_MB_CTRL_DOORBELL);
+
+	/* #5 */
+	rc = cxl_mem_wait_for_doorbell(cxlm);
+	if (rc == -ETIMEDOUT) {
+		cxl_mem_mbox_timeout(cxlm, mbox_cmd);
+		return rc;
+	}
+
+	/* #6 */
+	status_reg = cxl_read_mbox_reg64(cxlm, CXLDEV_MB_STATUS_OFFSET);
+	mbox_cmd->return_code =
+		CXL_GET_FIELD(status_reg, CXLDEV_MB_STATUS_RET_CODE);
+
+	if (mbox_cmd->return_code != 0) {
+		dev_dbg(&cxlm->pdev->dev, "Mailbox operation had an error\n");
+		return 0;
+	}
+
+	/* #7 */
+	cmd_reg = cxl_read_mbox_reg64(cxlm, CXLDEV_MB_CMD_OFFSET);
+	out_len = CXL_GET_FIELD(cmd_reg, CXLDEV_MB_CMD_PAYLOAD_LENGTH);
+
+	/* #8 */
+	if (out_len && mbox_cmd->payload_out)
+		memcpy_fromio(mbox_cmd->payload_out, payload, out_len);
+
+	mbox_cmd->size_out = out_len;
+
+	return 0;
+}
+
+/**
+ * cxl_mem_mbox_get() - Acquire exclusive access to the mailbox.
+ * @cxlm: The memory device to gain access to.
+ *
+ * Context: Any context. Takes the mbox_lock.
+ * Return: 0 if exclusive access was acquired.
+ */
+static int cxl_mem_mbox_get(struct cxl_mem *cxlm)
+{
+	struct device *dev = &cxlm->pdev->dev;
+	int rc = -EBUSY;
+	u64 md_status;
+
+	mutex_lock_io(&cxlm->mbox.mutex);
+
+	/*
+	 * XXX: There is some amount of ambiguity in the 2.0 version of the spec
+	 * around the mailbox interface ready (8.2.8.5.1.1).  The purpose of the
+	 * bit is to allow firmware running on the device to notify the driver
+	 * that it's ready to receive commands. It is unclear if the bit needs
+	 * to be read for each transaction mailbox, ie. the firmware can switch
+	 * it on and off as needed. Second, there is no defined timeout for
+	 * mailbox ready, like there is for the doorbell interface.
+	 *
+	 * Assumptions:
+	 * 1. The firmware might toggle the Mailbox Interface Ready bit, check
+	 *    it for every command.
+	 *
+	 * 2. If the doorbell is clear, the firmware should have first set the
+	 *    Mailbox Interface Ready bit. Therefore, waiting for the doorbell
+	 *    to be ready is sufficient.
+	 */
+	rc = cxl_mem_wait_for_doorbell(cxlm);
+	if (rc) {
+		dev_warn(dev, "Mailbox interface not ready\n");
+		goto out;
+	}
+
+	md_status = cxl_read_mem_reg64(cxlm, CXLMDEV_STATUS_OFFSET);
+	if (!(md_status & CXLMDEV_MBOX_IF_READY && CXLMDEV_READY(md_status))) {
+		dev_err(dev,
+			"mbox: reported doorbell ready, but not mbox ready\n");
+		goto out;
+	}
+
+	/*
+	 * Hardware shouldn't allow a ready status but also have failure bits
+	 * set. Spit out an error, this should be a bug report
+	 */
+	rc = -EFAULT;
+	if (md_status & CXLMDEV_DEV_FATAL) {
+		dev_err(dev, "mbox: reported ready, but fatal\n");
+		goto out;
+	}
+	if (md_status & CXLMDEV_FW_HALT) {
+		dev_err(dev, "mbox: reported ready, but halted\n");
+		goto out;
+	}
+	if (CXLMDEV_RESET_NEEDED(md_status)) {
+		dev_err(dev, "mbox: reported ready, but reset needed\n");
+		goto out;
+	}
+
+	/* with lock held */
+	return 0;
+
+out:
+	mutex_unlock(&cxlm->mbox.mutex);
+	return rc;
+}
+
+/**
+ * cxl_mem_mbox_put() - Release exclusive access to the mailbox.
+ * @cxlm: The CXL memory device to communicate with.
+ *
+ * Context: Any context. Expects mbox_lock to be held.
+ */
+static void cxl_mem_mbox_put(struct cxl_mem *cxlm)
+{
+	mutex_unlock(&cxlm->mbox.mutex);
+}
+
 /**
  * cxl_mem_setup_regs() - Setup necessary MMIO.
  * @cxlm: The CXL memory device to communicate with.
@@ -142,6 +406,8 @@  static struct cxl_mem *cxl_mem_create(struct pci_dev *pdev, u32 reg_lo,
 		return NULL;
 	}
 
+	mutex_init(&cxlm->mbox.mutex);
+
 	regs = pcim_iomap_table(pdev)[bar];
 	cxlm->pdev = pdev;
 	cxlm->regs = regs + offset;
@@ -174,6 +440,76 @@  static int cxl_mem_dvsec(struct pci_dev *pdev, int dvsec)
 	return 0;
 }
 
+/**
+ * cxl_mem_identify() - Send the IDENTIFY command to the device.
+ * @cxlm: The device to identify.
+ *
+ * Return: 0 if identify was executed successfully.
+ *
+ * This will dispatch the identify command to the device and on success populate
+ * structures to be exported to sysfs.
+ */
+static int cxl_mem_identify(struct cxl_mem *cxlm)
+{
+	struct cxl_mbox_identify {
+		char fw_revision[0x10];
+		__le64 total_capacity;
+		__le64 volatile_capacity;
+		__le64 persistent_capacity;
+		__le64 partition_align;
+		__le16 info_event_log_size;
+		__le16 warning_event_log_size;
+		__le16 failure_event_log_size;
+		__le16 fatal_event_log_size;
+		__le32 lsa_size;
+		u8 poison_list_max_mer[3];
+		__le16 inject_poison_limit;
+		u8 poison_caps;
+		u8 qos_telemetry_caps;
+	} __packed id;
+	struct mbox_cmd mbox_cmd;
+	int rc;
+
+	/* Retrieve initial device memory map */
+	rc = cxl_mem_mbox_get(cxlm);
+	if (rc)
+		return rc;
+
+	mbox_cmd = (struct mbox_cmd){
+		.opcode = CXL_MBOX_OP_IDENTIFY,
+		.payload_out = &id,
+		.size_in = 0,
+	};
+	rc = cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
+	cxl_mem_mbox_put(cxlm);
+	if (rc)
+		return rc;
+
+	/* TODO: Handle retry or reset responses from firmware. */
+	if (mbox_cmd.return_code != CXL_MBOX_SUCCESS) {
+		dev_err(&cxlm->pdev->dev, "Mailbox command failed (%d)\n",
+			mbox_cmd.return_code);
+		return -ENXIO;
+	}
+
+	if (mbox_cmd.size_out != sizeof(id))
+		return -ENXIO;
+
+	/*
+	 * TODO: enumerate DPA map, as 'ram' and 'pmem' do not alias.
+	 * For now, only the capacity is exported in sysfs
+	 */
+	cxlm->ram.range.start = 0;
+	cxlm->ram.range.end = le64_to_cpu(id.volatile_capacity) - 1;
+
+	cxlm->pmem.range.start = 0;
+	cxlm->pmem.range.end = le64_to_cpu(id.persistent_capacity) - 1;
+
+	memcpy(cxlm->firmware_version, id.fw_revision, sizeof(id.fw_revision));
+
+	return rc;
+}
+
 static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct device *dev = &pdev->dev;
@@ -219,7 +555,11 @@  static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
-	return cxl_mem_setup_mailbox(cxlm);
+	rc = cxl_mem_setup_mailbox(cxlm);
+	if (rc)
+		return rc;
+
+	return cxl_mem_identify(cxlm);
 }
 
 static const struct pci_device_id cxl_mem_pci_tbl[] = {