
[v2,2/2] libnvdimm: Add a poison list and export badblocks

Message ID 1451010103-11462-3-git-send-email-vishal.l.verma@intel.com (mailing list archive)
State Accepted
Commit 0caeef63e6d2

Commit Message

Verma, Vishal L Dec. 25, 2015, 2:21 a.m. UTC
During region creation, perform Address Range Scrubs (ARS) for the SPA
(System Physical Address) ranges to retrieve known poison locations from
firmware. Add a new data structure 'nd_poison' which is used as a list
in nvdimm_bus to store these poison locations.

When creating a pmem namespace, if there is any known poison associated
with its physical address space, convert the poison ranges to bad sectors
that are exposed using the badblocks interface.

Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 drivers/acpi/nfit.c       | 203 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/core.c     | 187 ++++++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/nd-core.h  |   3 +
 drivers/nvdimm/nd.h       |   6 ++
 drivers/nvdimm/pmem.c     |   6 ++
 include/linux/libnvdimm.h |   1 +
 6 files changed, 406 insertions(+)

Comments

Linda Knippers Jan. 7, 2016, 9:18 p.m. UTC | #1
On 12/24/2015 9:21 PM, Vishal Verma wrote:
> During region creation, perform Address Range Scrubs (ARS) for the SPA
> (System Physical Address) ranges to retrieve known poison locations from
> firmware. Add a new data structure 'nd_poison' which is used as a list
> in nvdimm_bus to store these poison locations.
> 
> When creating a pmem namespace, if there is any known poison associated
> with its physical address space, convert the poison ranges to bad sectors
> that are exposed using the badblocks interface.
> 
> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
> ---
>  drivers/acpi/nfit.c       | 203 ++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/nvdimm/core.c     | 187 ++++++++++++++++++++++++++++++++++++++++++
>  drivers/nvdimm/nd-core.h  |   3 +
>  drivers/nvdimm/nd.h       |   6 ++
>  drivers/nvdimm/pmem.c     |   6 ++
>  include/linux/libnvdimm.h |   1 +
>  6 files changed, 406 insertions(+)
> 
> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c

Why are the ARS functions in acpi/nfit.c rather than in the core?
I realize that the interfaces are/will be defined in the ACPI spec but
it's not really part of the NFIT.

Why isn't this part of the nvdimm core, where the calls are made?

> index aa45d48..ad6d8c6 100644
> --- a/drivers/acpi/nfit.c
> +++ b/drivers/acpi/nfit.c
> @@ -15,6 +15,7 @@
>  #include <linux/module.h>
>  #include <linux/mutex.h>
>  #include <linux/ndctl.h>
> +#include <linux/delay.h>
>  #include <linux/list.h>
>  #include <linux/acpi.h>
>  #include <linux/sort.h>
> @@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
>  	/* devm will free nfit_blk */
>  }
>  
> +static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc,
> +		struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
> +{
> +	cmd->address = addr;
> +	cmd->length = length;
> +
> +	return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
> +			sizeof(*cmd));
> +}
> +
> +static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
> +		struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
> +{
> +	int rc;
> +
> +	cmd->address = addr;
> +	cmd->length = length;
> +	cmd->type = ND_ARS_PERSISTENT;
> +
> +	while (1) {
> +		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
> +				sizeof(*cmd));
> +		if (rc)
> +			return rc;
> +		switch (cmd->status) {
> +		case 0:
> +			return 0;
> +		case 1:
> +			/* ARS unsupported, but we should never get here */
> +			return 0;
> +		case 2:
> +			return -EINVAL;
> +		case 3:

Are these values in flux in the ACPI spec?  Would #defines help?
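
For example (a sketch; these names are illustrative, not taken from the spec):

	#define ARS_START_DONE		0	/* scrub started or completed */
	#define ARS_START_UNSUPPORTED	1
	#define ARS_START_INVALID	2	/* bad input parameters */
	#define ARS_START_BUSY		3	/* an ARS is already in progress */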

> +			/* ARS is in progress */
> +			msleep(1000);
> +			break;

Should this be considered an error (perhaps EAGAIN?) if an ARS
is in progress since you're supposed to check the status before
attempting to start one?  Do we really want to keep retrying
it, potentially forever, here?
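
If the retry loop stays, bounding it would at least avoid spinning
forever.  A sketch, with an arbitrary cap:

	int tries = 30;	/* arbitrary: give up after ~30s of polling */

	do {
		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
				sizeof(*cmd));
		if (rc)
			return rc;
		if (cmd->status != 3)	/* anything but "in progress" */
			break;
		msleep(1000);
	} while (--tries);
	if (!tries)
		return -EAGAIN;
	/* then dispatch on cmd->status as in the switch above */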

> +		default:
> +			return -ENXIO;
> +		}
> +	}
> +}
> +
> +static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
> +		struct nd_cmd_ars_status *cmd)
> +{
> +	int rc;
> +
> +	while (1) {
> +		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
> +			sizeof(*cmd));
> +		if (rc || cmd->status & 0xffff)
> +			return -ENXIO;
> +
> +		/* Check extended status (Upper two bytes) */
> +		switch (cmd->status >> 16) {
> +		case 0:
> +			return 0;
> +		case 1:
> +			/* ARS is in progress */
> +			msleep(1000);
> +			break;

Why isn't ARS in progress a legitimate status to return,
especially if you're supposed to check if one is in progress
before starting one?

You could have a different call to wait for completion if
that's what you want to do, but it wouldn't be called
ars_get_status().
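
A sketch of that split (this assumes ars_get_status() is reworked into a
non-blocking query that returns the raw status):

	/* poll the non-blocking status query until ARS is no longer running */
	static int ars_wait_for_completion(struct nvdimm_bus_descriptor *nd_desc,
			struct nd_cmd_ars_status *cmd)
	{
		int rc;

		while (1) {
			rc = ars_get_status(nd_desc, cmd);
			if (rc)
				return rc;
			if ((cmd->status >> 16) != 1)	/* not "in progress" */
				return 0;
			msleep(1000);
		}
	}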

> +		case 2:
> +			/* No ARS performed for the current boot */
> +			return 0;
> +		default:
> +			return -ENXIO;
> +		}
> +	}
> +}
> +
> +static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
> +		struct nd_cmd_ars_status *ars_status, u64 start)
> +{
> +	int rc;
> +	u32 i;
> +
> +	/*
> +	 * The address field returned by ars_status should be either
> +	 * less than or equal to the address we last started ARS for.
> +	 * The (start, length) returned by ars_status should also have
> +	 * non-zero overlap with the range we started ARS for.
> +	 * If this is not the case, bail.
> +	 */
> +	if (ars_status->address > start ||
> +			(ars_status->address + ars_status->length < start))
> +		return -ENXIO;
> +
> +	for (i = 0; i < ars_status->num_records; i++) {
> +		rc = nvdimm_bus_add_poison(nvdimm_bus,
> +				ars_status->records[i].err_address,
> +				ars_status->records[i].length);
> +		if (rc)
> +			return rc;
> +	}
> +
> +	return 0;
> +}
> +
> +static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
> +		struct nd_region_desc *ndr_desc)
> +{
> +	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
> +	struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
> +	struct nd_cmd_ars_status *ars_status = NULL;
> +	struct nd_cmd_ars_start *ars_start = NULL;
> +	struct nd_cmd_ars_cap *ars_cap = NULL;
> +	u64 start, len, cur, remaining;
> +	int rc;
> +
> +	ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
> +	if (!ars_cap)
> +		return -ENOMEM;
> +
> +	start = ndr_desc->res->start;
> +	len = ndr_desc->res->end - ndr_desc->res->start + 1;
> +
> +	rc = ars_get_cap(nd_desc, ars_cap, start, len);
> +	if (rc)
> +		goto out;
> +
> +	/*
> +	 * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in
> +	 * extended status is not set, skip this but continue initialization
> +	 */
> +	if ((ars_cap->status & 0xffff) ||
> +		!(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
> +		dev_warn(acpi_desc->dev,
> +			"ARS unsupported (status: 0x%x), won't create an error list\n",
> +			ars_cap->status);
> +		goto out;
> +	}
> +
> +	/*
> +	 * Check if a full-range ARS has been run. If so, use those results
> +	 * without having to start a new ARS.
> +	 */
> +	ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status),
> +			GFP_KERNEL);
> +	if (!ars_status) {
> +		rc = -ENOMEM;
> +		goto out;
> +	}
> +
> +	rc = ars_get_status(nd_desc, ars_status);
> +	if (rc)
> +		goto out;
> +
> +	if (ars_status->address <= start &&
> +		(ars_status->address + ars_status->length >= start + len)) {
> +		rc = ars_status_process_records(nvdimm_bus, ars_status, start);
> +		goto out;
> +	}
> +
> +	/*
> +	 * ARS_STATUS can overflow if the number of poison entries found is
> +	 * greater than the maximum buffer size (ars_cap->max_ars_out)
> +	 * To detect overflow, check if the length field of ars_status
> +	 * is less than the length we supplied. If so, process the
> +	 * error entries we got, adjust the start point, and start again
> +	 */
> +	ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL);
> +	if (!ars_start) {
> +		rc = -ENOMEM;
> +		goto out;
> +	}
> +
> +	cur = start;
> +	remaining = len;
> +	do {
> +		u64 done, end;
> +
> +		rc = ars_do_start(nd_desc, ars_start, cur, remaining);

I think you should get status, start, wait for completion, then process
records; at least that's how I read the example DSM spec and other
specs.  It says "You must first issue a Query ARS Status command...".

> +		if (rc)
> +			goto out;
> +
> +		rc = ars_get_status(nd_desc, ars_status);
> +		if (rc)
> +			goto out;
> +
> +		rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
> +		if (rc)
> +			goto out;
> +
> +		end = min(cur + remaining,
> +			ars_status->address + ars_status->length);
> +		done = end - cur;
> +		cur += done;
> +		remaining -= done;
> +	} while (remaining);
> +
> + out:
> +	kfree(ars_cap);
> +	kfree(ars_start);
> +	kfree(ars_status);
> +	return rc;
> +}
> +

I haven't looked at the rest of this, will try to later.

-- ljk

>  static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
>  		struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
>  		struct acpi_nfit_memory_map *memdev,
> @@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
>  
>  	nvdimm_bus = acpi_desc->nvdimm_bus;
>  	if (nfit_spa_type(spa) == NFIT_SPA_PM) {
> +		rc = acpi_nfit_find_poison(acpi_desc, ndr_desc);
> +		if (rc) {
> +			dev_err(acpi_desc->dev,
> +				"error while performing ARS to find poison: %d\n",
> +				rc);
> +			return rc;
> +		}
>  		if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
>  			return -ENOMEM;
>  	} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
> diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
> index 82c49bb..21003b7 100644
> --- a/drivers/nvdimm/core.c
> +++ b/drivers/nvdimm/core.c
> @@ -325,6 +325,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
>  	if (!nvdimm_bus)
>  		return NULL;
>  	INIT_LIST_HEAD(&nvdimm_bus->list);
> +	INIT_LIST_HEAD(&nvdimm_bus->poison_list);
>  	init_waitqueue_head(&nvdimm_bus->probe_wait);
>  	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
>  	mutex_init(&nvdimm_bus->reconfig_mutex);
> @@ -359,6 +360,191 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
>  }
>  EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
>  
> +/**
> + * __add_badblock_range() - Convert a physical address range to bad sectors
> + * @disk:	the disk associated with the namespace
> + * @ns_offset:	namespace offset where the error range begins (in bytes)
> + * @len:	number of bytes of poison to be added
> + *
> + * This assumes that the range provided with (ns_offset, len) is within
> + * the bounds of physical addresses for this namespace, i.e. lies in the
> + * interval [ns_start, ns_start + ns_size)
> + */
> +static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len)
> +{
> +	unsigned int sector_size = queue_logical_block_size(disk->queue);
> +	sector_t start_sector;
> +	u64 num_sectors;
> +	u32 rem;
> +	int rc;
> +
> +	start_sector = div_u64(ns_offset, sector_size);
> +	num_sectors = div_u64_rem(len, sector_size, &rem);
> +	if (rem)
> +		num_sectors++;
> +
> +	if (!disk->bb) {
> +		rc = disk_alloc_badblocks(disk);
> +		if (rc)
> +			return rc;
> +	}
> +
> +	if (unlikely(num_sectors > (u64)INT_MAX)) {
> +		u64 remaining = num_sectors;
> +		sector_t s = start_sector;
> +
> +		while (remaining) {
> +			int done = min_t(u64, remaining, INT_MAX);
> +
> +			rc = disk_set_badblocks(disk, s, done);
> +			if (rc)
> +				return rc;
> +			remaining -= done;
> +			s += done;
> +		}
> +		return 0;
> +	} else
> +		return disk_set_badblocks(disk, start_sector, num_sectors);
> +}
> +
> +/**
> + * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks
> + * @disk:	the gendisk associated with the namespace where badblocks
> + *		will be stored
> + * @offset:	offset at the start of the namespace before 'sector 0'
> + * @ndns:	the namespace containing poison ranges
> + *
> + * The poison list generated during NFIT initialization may contain multiple,
> + * possibly overlapping ranges in the SPA (System Physical Address) space.
> + * Compare each of these ranges to the namespace currently being initialized,
> + * and add badblocks to the gendisk for all matching sub-ranges
> + *
> + * Return:
> + * 0 - Success
> + */
> +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset,
> +		struct nd_namespace_common *ndns)
> +{
> +	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
> +	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
> +	struct nvdimm_bus *nvdimm_bus;
> +	struct list_head *poison_list;
> +	u64 ns_start, ns_end, ns_size;
> +	struct nd_poison *pl;
> +	int rc;
> +
> +	ns_size = nvdimm_namespace_capacity(ndns) - offset;
> +	ns_start = nsio->res.start + offset;
> +	ns_end = nsio->res.end;
> +
> +	nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent);
> +	poison_list = &nvdimm_bus->poison_list;
> +	if (list_empty(poison_list))
> +		return 0;
> +
> +	list_for_each_entry(pl, poison_list, list) {
> +		u64 pl_end = pl->start + pl->length - 1;
> +
> +		/* Discard intervals with no intersection */
> +		if (pl_end < ns_start)
> +			continue;
> +		if (pl->start > ns_end)
> +			continue;
> +		/* Deal with any overlap after start of the namespace */
> +		if (pl->start >= ns_start) {
> +			u64 start = pl->start;
> +			u64 len;
> +
> +			if (pl_end <= ns_end)
> +				len = pl->length;
> +			else
> +				len = ns_start + ns_size - pl->start;
> +
> +			rc = __add_badblock_range(disk, start - ns_start, len);
> +			if (rc)
> +				return rc;
> +			dev_info(&nvdimm_bus->dev,
> +				"Found a poison range (0x%llx, 0x%llx)\n",
> +				start, len);
> +			continue;
> +		}
> +		/* Deal with overlap for poison starting before the namespace */
> +		if (pl->start < ns_start) {
> +			u64 len;
> +
> +			if (pl_end < ns_end)
> +				len = pl->start + pl->length - ns_start;
> +			else
> +				len = ns_size;
> +
> +			rc = __add_badblock_range(disk, 0, len);
> +			if (rc)
> +				return rc;
> +			dev_info(&nvdimm_bus->dev,
> +				"Found a poison range (0x%llx, 0x%llx)\n",
> +				pl->start, len);
> +		}
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison);
> +
> +static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
> +{
> +	struct nd_poison *pl;
> +
> +	pl = kzalloc(sizeof(*pl), GFP_KERNEL);
> +	if (!pl)
> +		return -ENOMEM;
> +
> +	pl->start = addr;
> +	pl->length = length;
> +	list_add_tail(&pl->list, &nvdimm_bus->poison_list);
> +
> +	return 0;
> +}
> +
> +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
> +{
> +	struct nd_poison *pl;
> +
> +	if (list_empty(&nvdimm_bus->poison_list))
> +		return __add_poison(nvdimm_bus, addr, length);
> +
> +	/*
> +	 * There is a chance this is a duplicate, check for those first.
> +	 * This will be the common case as ARS_STATUS returns all known
> +	 * errors in the SPA space, and we can't query it per region
> +	 */
> +	list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
> +		if (pl->start == addr) {
> +			/* If length has changed, update this list entry */
> +			if (pl->length != length)
> +				pl->length = length;
> +			return 0;
> +		}
> +
> +	/*
> +	 * If not a duplicate or a simple length update, add the entry as is,
> +	 * as any overlapping ranges will get resolved when the list is consumed
> +	 * and converted to badblocks
> +	 */
> +	return __add_poison(nvdimm_bus, addr, length);
> +}
> +EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
> +
> +static void free_poison_list(struct list_head *poison_list)
> +{
> +	struct nd_poison *pl, *next;
> +
> +	list_for_each_entry_safe(pl, next, poison_list, list) {
> +		list_del(&pl->list);
> +		kfree(pl);
> +	}
> +	list_del_init(poison_list);
> +}
> +
>  static int child_unregister(struct device *dev, void *data)
>  {
>  	/*
> @@ -385,6 +571,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
>  
>  	nd_synchronize();
>  	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
> +	free_poison_list(&nvdimm_bus->poison_list);
>  	nvdimm_bus_destroy_ndctl(nvdimm_bus);
>  
>  	device_unregister(&nvdimm_bus->dev);
> diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
> index 159aed5..d3b7ea7 100644
> --- a/drivers/nvdimm/nd-core.h
> +++ b/drivers/nvdimm/nd-core.h
> @@ -30,6 +30,7 @@ struct nvdimm_bus {
>  	struct list_head list;
>  	struct device dev;
>  	int id, probe_active;
> +	struct list_head poison_list;
>  	struct mutex reconfig_mutex;
>  };
>  
> @@ -89,4 +90,6 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
>  ssize_t nd_namespace_store(struct device *dev,
>  		struct nd_namespace_common **_ndns, const char *buf,
>  		size_t len);
> +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset,
> +		struct nd_namespace_common *ndns);
>  #endif /* __ND_CORE_H__ */
> diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
> index 417e521..ba91fcd 100644
> --- a/drivers/nvdimm/nd.h
> +++ b/drivers/nvdimm/nd.h
> @@ -38,6 +38,12 @@ enum {
>  #endif
>  };
>  
> +struct nd_poison {
> +	u64 start;
> +	u64 length;
> +	struct list_head list;
> +};
> +
>  struct nvdimm_drvdata {
>  	struct device *dev;
>  	int nsindex_size;
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 8ee7989..5b95043 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -27,6 +27,7 @@
>  #include <linux/slab.h>
>  #include <linux/pmem.h>
>  #include <linux/nd.h>
> +#include "nd-core.h"
>  #include "pfn.h"
>  #include "nd.h"
>  
> @@ -168,6 +169,7 @@ static int pmem_attach_disk(struct device *dev,
>  {
>  	int nid = dev_to_node(dev);
>  	struct gendisk *disk;
> +	int ret;
>  
>  	pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid);
>  	if (!pmem->pmem_queue)
> @@ -196,6 +198,10 @@ static int pmem_attach_disk(struct device *dev,
>  	set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
>  	pmem->pmem_disk = disk;
>  
> +	ret = nvdimm_namespace_add_poison(disk, pmem->data_offset, ndns);
> +	if (ret)
> +		return ret;
> +
>  	add_disk(disk);
>  	revalidate_disk(disk);
>  
> diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
> index 3f021dc..bed40df 100644
> --- a/include/linux/libnvdimm.h
> +++ b/include/linux/libnvdimm.h
> @@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
>  
>  }
>  
> +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
>  struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
>  		struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
>  #define nvdimm_bus_register(parent, desc) \
>
Dan Williams Jan. 7, 2016, 10:49 p.m. UTC | #2
On Thu, Jan 7, 2016 at 1:18 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>
> On 12/24/2015 9:21 PM, Vishal Verma wrote:
>> During region creation, perform Address Range Scrubs (ARS) for the SPA
>> (System Physical Address) ranges to retrieve known poison locations from
>> firmware. Add a new data structure 'nd_poison' which is used as a list
>> in nvdimm_bus to store these poison locations.
>>
>> When creating a pmem namespace, if there is any known poison associated
>> with its physical address space, convert the poison ranges to bad sectors
>> that are exposed using the badblocks interface.
>>
>> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
>> ---
>>  drivers/acpi/nfit.c       | 203 ++++++++++++++++++++++++++++++++++++++++++++++
>>  drivers/nvdimm/core.c     | 187 ++++++++++++++++++++++++++++++++++++++++++
>>  drivers/nvdimm/nd-core.h  |   3 +
>>  drivers/nvdimm/nd.h       |   6 ++
>>  drivers/nvdimm/pmem.c     |   6 ++
>>  include/linux/libnvdimm.h |   1 +
>>  6 files changed, 406 insertions(+)
>>
>> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
>
> Why are the ARS functions in acpi/nfit.c rather than in the core?
> I realize that the interfaces are/will be defined in the ACPI spec but
> it's not really part of the NFIT.
>
> Why isn't this part of the nvdimm core, where the calls are made?

True.  acpi_nfit_find_poison() could easily become
nvdimm_bus_find_poison() and move to the core.  However we only need
to take that step when / if another nvdimm bus type show up that isn't
nfit defined.  For now only nfit-defined nvdimms have a concept of
ARS.
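
For reference, the hypothetical core-side entry point would be little
more than a rename:

	/* in drivers/nvdimm/core.c, if/when this moves out of nfit.c */
	int nvdimm_bus_find_poison(struct nvdimm_bus *nvdimm_bus,
			struct nd_region_desc *ndr_desc);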

>
>> index aa45d48..ad6d8c6 100644
>> --- a/drivers/acpi/nfit.c
>> +++ b/drivers/acpi/nfit.c
>> @@ -15,6 +15,7 @@
>>  #include <linux/module.h>
>>  #include <linux/mutex.h>
>>  #include <linux/ndctl.h>
>> +#include <linux/delay.h>
>>  #include <linux/list.h>
>>  #include <linux/acpi.h>
>>  #include <linux/sort.h>
>> @@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
>>       /* devm will free nfit_blk */
>>  }
>>
>> +static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc,
>> +             struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
>> +{
>> +     cmd->address = addr;
>> +     cmd->length = length;
>> +
>> +     return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
>> +                     sizeof(*cmd));
>> +}
>> +
>> +static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
>> +             struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
>> +{
>> +     int rc;
>> +
>> +     cmd->address = addr;
>> +     cmd->length = length;
>> +     cmd->type = ND_ARS_PERSISTENT;
>> +
>> +     while (1) {
>> +             rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
>> +                             sizeof(*cmd));
>> +             if (rc)
>> +                     return rc;
>> +             switch (cmd->status) {
>> +             case 0:
>> +                     return 0;
>> +             case 1:
>> +                     /* ARS unsupported, but we should never get here */
>> +                     return 0;
>> +             case 2:
>> +                     return -EINVAL;
>> +             case 3:
>
> Are these values in flux in the ACPI spec?  Would #defines help?

Yes that would be more readable...

>> +                     /* ARS is in progress */
>> +                     msleep(1000);
>> +                     break;
>
> Should this be considered an error (perhaps EAGAIN?) if an ARS
> is in progress since you're supposed to check the status before
> attempting to start one?  Do we really want to keep retrying
> it, potentially forever, here?

We indeed need a timeout here.  I'll take a look at this and the above
as Vishal is out for the rest of the week.

There are also plans coming up for 4.6 to make ARS polling asynchronous
to the rest of the nvdimm system, but that was deemed too risky /
complicated for intercepting 4.5.

>
>> +             default:
>> +                     return -ENXIO;
>> +             }
>> +     }
>> +}
>> +
>> +static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
>> +             struct nd_cmd_ars_status *cmd)
>> +{
>> +     int rc;
>> +
>> +     while (1) {
>> +             rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
>> +                     sizeof(*cmd));
>> +             if (rc || cmd->status & 0xffff)
>> +                     return -ENXIO;
>> +
>> +             /* Check extended status (Upper two bytes) */
>> +             switch (cmd->status >> 16) {
>> +             case 0:
>> +                     return 0;
>> +             case 1:
>> +                     /* ARS is in progress */
>> +                     msleep(1000);
>> +                     break;
>
> Why isn't ARS in progress a legitimate status to return,
> especially if you're supposed to check if one is in progress
> before starting one?
>
> You could have a different call to wait for completion if
> that's what you want to do, but it wouldn't be called
> ars_get_status().

Right, this is something to address when we make ARS fully asynchronous.

>
>> +             case 2:
>> +                     /* No ARS performed for the current boot */
>> +                     return 0;
>> +             default:
>> +                     return -ENXIO;
>> +             }
>> +     }
>> +}
>> +
>> +static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
>> +             struct nd_cmd_ars_status *ars_status, u64 start)
>> +{
>> +     int rc;
>> +     u32 i;
>> +
>> +     /*
>> +      * The address field returned by ars_status should be either
>> +      * less than or equal to the address we last started ARS for.
>> +      * The (start, length) returned by ars_status should also have
>> +      * non-zero overlap with the range we started ARS for.
>> +      * If this is not the case, bail.
>> +      */
>> +     if (ars_status->address > start ||
>> +                     (ars_status->address + ars_status->length < start))
>> +             return -ENXIO;
>> +
>> +     for (i = 0; i < ars_status->num_records; i++) {
>> +             rc = nvdimm_bus_add_poison(nvdimm_bus,
>> +                             ars_status->records[i].err_address,
>> +                             ars_status->records[i].length);
>> +             if (rc)
>> +                     return rc;
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>> +static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
>> +             struct nd_region_desc *ndr_desc)
>> +{
>> +     struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
>> +     struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
>> +     struct nd_cmd_ars_status *ars_status = NULL;
>> +     struct nd_cmd_ars_start *ars_start = NULL;
>> +     struct nd_cmd_ars_cap *ars_cap = NULL;
>> +     u64 start, len, cur, remaining;
>> +     int rc;
>> +
>> +     ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
>> +     if (!ars_cap)
>> +             return -ENOMEM;
>> +
>> +     start = ndr_desc->res->start;
>> +     len = ndr_desc->res->end - ndr_desc->res->start + 1;
>> +
>> +     rc = ars_get_cap(nd_desc, ars_cap, start, len);
>> +     if (rc)
>> +             goto out;
>> +
>> +     /*
>> +      * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in
>> +      * extended status is not set, skip this but continue initialization
>> +      */
>> +     if ((ars_cap->status & 0xffff) ||
>> +             !(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
>> +             dev_warn(acpi_desc->dev,
>> +                     "ARS unsupported (status: 0x%x), won't create an error list\n",
>> +                     ars_cap->status);
>> +             goto out;
>> +     }
>> +
>> +     /*
>> +      * Check if a full-range ARS has been run. If so, use those results
>> +      * without having to start a new ARS.
>> +      */
>> +     ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status),
>> +                     GFP_KERNEL);
>> +     if (!ars_status) {
>> +             rc = -ENOMEM;
>> +             goto out;
>> +     }
>> +
>> +     rc = ars_get_status(nd_desc, ars_status);
>> +     if (rc)
>> +             goto out;
>> +
>> +     if (ars_status->address <= start &&
>> +             (ars_status->address + ars_status->length >= start + len)) {
>> +             rc = ars_status_process_records(nvdimm_bus, ars_status, start);
>> +             goto out;
>> +     }
>> +
>> +     /*
>> +      * ARS_STATUS can overflow if the number of poison entries found is
>> +      * greater than the maximum buffer size (ars_cap->max_ars_out)
>> +      * To detect overflow, check if the length field of ars_status
>> +      * is less than the length we supplied. If so, process the
>> +      * error entries we got, adjust the start point, and start again
>> +      */
>> +     ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL);
>> +     if (!ars_start) {
>> +             rc = -ENOMEM;
>> +             goto out;
>> +     }
>> +
>> +     cur = start;
>> +     remaining = len;
>> +     do {
>> +             u64 done, end;
>> +
>> +             rc = ars_do_start(nd_desc, ars_start, cur, remaining);
>
> I think you should get status, start, wait for completion, then process
> records; at least that's how I read the example DSM spec and other
> specs.  It says "You must first issue a Query ARS Status command...".

We do the lead-in ars_status check above...

>
>> +             if (rc)
>> +                     goto out;
>> +
>> +             rc = ars_get_status(nd_desc, ars_status);
>> +             if (rc)
>> +                     goto out;
>> +
>> +             rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
>> +             if (rc)
>> +                     goto out;
>> +
>> +             end = min(cur + remaining,
>> +                     ars_status->address + ars_status->length);
>> +             done = end - cur;
>> +             cur += done;
>> +             remaining -= done;
>> +     } while (remaining);
>> +
>> + out:
>> +     kfree(ars_cap);
>> +     kfree(ars_start);
>> +     kfree(ars_status);
>> +     return rc;
>> +}
>> +
>
> I haven't looked at the rest of this, will try to later.
>

Thanks Linda!
Linda Knippers Jan. 7, 2016, 11:15 p.m. UTC | #3
On 1/7/2016 5:49 PM, Dan Williams wrote:
> On Thu, Jan 7, 2016 at 1:18 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>>
>> On 12/24/2015 9:21 PM, Vishal Verma wrote:
>>> During region creation, perform Address Range Scrubs (ARS) for the SPA
>>> (System Physical Address) ranges to retrieve known poison locations from
>>> firmware. Add a new data structure 'nd_poison' which is used as a list
>>> in nvdimm_bus to store these poison locations.
>>>
>>> When creating a pmem namespace, if there is any known poison associated
>>> with its physical address space, convert the poison ranges to bad sectors
>>> that are exposed using the badblocks interface.
>>>
>>> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
>>> ---
>>>  drivers/acpi/nfit.c       | 203 ++++++++++++++++++++++++++++++++++++++++++++++
>>>  drivers/nvdimm/core.c     | 187 ++++++++++++++++++++++++++++++++++++++++++
>>>  drivers/nvdimm/nd-core.h  |   3 +
>>>  drivers/nvdimm/nd.h       |   6 ++
>>>  drivers/nvdimm/pmem.c     |   6 ++
>>>  include/linux/libnvdimm.h |   1 +
>>>  6 files changed, 406 insertions(+)
>>>
>>> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
>>
>> Why are the ARS functions in acpi/nfit.c rather than in the core?
>> I realize that the interfaces are/will be defined in the ACPI spec but
>> it's not really part of the NFIT.
>>
>> Why isn't this part of the nvdimm core, where the calls are made?
> 
> True.  acpi_nfit_find_poison() could easily become
> nvdimm_bus_find_poison() and move to the core.  However we only need
> to take that step when / if another nvdimm bus type shows up that isn't
> nfit defined.  For now only nfit-defined nvdimms have a concept of
> ARS.

Why not go ahead and move it now?

I agree that ARS is related to NFIT, but practically everything else
in the core is as well.  It doesn't seem to belong in nfit.c.  I
didn't even notice that's where it was since the subject line says
"libnvdimm".

> 
>>
>>> index aa45d48..ad6d8c6 100644
>>> --- a/drivers/acpi/nfit.c
>>> +++ b/drivers/acpi/nfit.c
>>> @@ -15,6 +15,7 @@
>>>  #include <linux/module.h>
>>>  #include <linux/mutex.h>
>>>  #include <linux/ndctl.h>
>>> +#include <linux/delay.h>
>>>  #include <linux/list.h>
>>>  #include <linux/acpi.h>
>>>  #include <linux/sort.h>
>>> @@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
>>>       /* devm will free nfit_blk */
>>>  }
>>>
>>> +static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc,
>>> +             struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
>>> +{
>>> +     cmd->address = addr;
>>> +     cmd->length = length;
>>> +
>>> +     return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
>>> +                     sizeof(*cmd));
>>> +}
>>> +
>>> +static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
>>> +             struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
>>> +{
>>> +     int rc;
>>> +
>>> +     cmd->address = addr;
>>> +     cmd->length = length;
>>> +     cmd->type = ND_ARS_PERSISTENT;
>>> +
>>> +     while (1) {
>>> +             rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
>>> +                             sizeof(*cmd));
>>> +             if (rc)
>>> +                     return rc;
>>> +             switch (cmd->status) {
>>> +             case 0:
>>> +                     return 0;
>>> +             case 1:
>>> +                     /* ARS unsupported, but we should never get here */
>>> +                     return 0;
>>> +             case 2:
>>> +                     return -EINVAL;
>>> +             case 3:
>>
>> Are these values in flux in the ACPI spec?  Would #defines help?
> 
> Yes that would be more readable...
> 
>>> +                     /* ARS is in progress */
>>> +                     msleep(1000);
>>> +                     break;
>>
>> Should this be considered an error (perhaps EAGAIN?) if an ARS
>> is in progress since you're supposed to check the status before
>> attempting to start one?  Do we really want to keep retrying
>> it, potentially forever, here?
> 
> We indeed need a timeout here.  I'll take a look at this and the above
> as Vishal is out for the rest of the week.
> 
> There are also plans coming up for 4.6 to make ARS polling asynchronous
> to the rest of the nvdimm system, but that was deemed too risky /
> complicated for intercepting 4.5.

Ok, makes sense.

> 
>>
>>> +             default:
>>> +                     return -ENXIO;
>>> +             }
>>> +     }
>>> +}
>>> +
>>> +static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
>>> +             struct nd_cmd_ars_status *cmd)
>>> +{
>>> +     int rc;
>>> +
>>> +     while (1) {
>>> +             rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
>>> +                     sizeof(*cmd));
>>> +             if (rc || cmd->status & 0xffff)
>>> +                     return -ENXIO;
>>> +
>>> +             /* Check extended status (Upper two bytes) */
>>> +             switch (cmd->status >> 16) {
>>> +             case 0:
>>> +                     return 0;
>>> +             case 1:
>>> +                     /* ARS is in progress */
>>> +                     msleep(1000);
>>> +                     break;
>>
>> Why isn't ARS in progress a legitimate status to return,
>> especially if you're supposed to check if one is in progress
>> before starting one?
>>
>> You could have a different call to wait for completion if
>> that's what you want to do, but it wouldn't be called
>> ars_get_status().
> 
> Right, this is something to address when we make ARS fully asynchronous.
> 
>>
>>> +             case 2:
>>> +                     /* No ARS performed for the current boot */
>>> +                     return 0;
>>> +             default:
>>> +                     return -ENXIO;
>>> +             }
>>> +     }
>>> +}
>>> +
>>> +static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
>>> +             struct nd_cmd_ars_status *ars_status, u64 start)
>>> +{
>>> +     int rc;
>>> +     u32 i;
>>> +
>>> +     /*
>>> +      * The address field returned by ars_status should be either
>>> +      * less than or equal to the address we last started ARS for.
>>> +      * The (start, length) returned by ars_status should also have
>>> +      * non-zero overlap with the range we started ARS for.
>>> +      * If this is not the case, bail.
>>> +      */
>>> +     if (ars_status->address > start ||
>>> +                     (ars_status->address + ars_status->length < start))
>>> +             return -ENXIO;
>>> +
>>> +     for (i = 0; i < ars_status->num_records; i++) {
>>> +             rc = nvdimm_bus_add_poison(nvdimm_bus,
>>> +                             ars_status->records[i].err_address,
>>> +                             ars_status->records[i].length);
>>> +             if (rc)
>>> +                     return rc;
>>> +     }
>>> +
>>> +     return 0;
>>> +}
>>> +
>>> +static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
>>> +             struct nd_region_desc *ndr_desc)
>>> +{
>>> +     struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
>>> +     struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
>>> +     struct nd_cmd_ars_status *ars_status = NULL;
>>> +     struct nd_cmd_ars_start *ars_start = NULL;
>>> +     struct nd_cmd_ars_cap *ars_cap = NULL;
>>> +     u64 start, len, cur, remaining;
>>> +     int rc;
>>> +
>>> +     ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
>>> +     if (!ars_cap)
>>> +             return -ENOMEM;
>>> +
>>> +     start = ndr_desc->res->start;
>>> +     len = ndr_desc->res->end - ndr_desc->res->start + 1;
>>> +
>>> +     rc = ars_get_cap(nd_desc, ars_cap, start, len);
>>> +     if (rc)
>>> +             goto out;
>>> +
>>> +     /*
>>> +      * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in
>>> +      * extended status is not set, skip this but continue initialization
>>> +      */
>>> +     if ((ars_cap->status & 0xffff) ||
>>> +             !(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
>>> +             dev_warn(acpi_desc->dev,
>>> +                     "ARS unsupported (status: 0x%x), won't create an error list\n",
>>> +                     ars_cap->status);
>>> +             goto out;
>>> +     }
>>> +
>>> +     /*
>>> +      * Check if a full-range ARS has been run. If so, use those results
>>> +      * without having to start a new ARS.
>>> +      */
>>> +     ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status),
>>> +                     GFP_KERNEL);
>>> +     if (!ars_status) {
>>> +             rc = -ENOMEM;
>>> +             goto out;
>>> +     }
>>> +
>>> +     rc = ars_get_status(nd_desc, ars_status);
>>> +     if (rc)
>>> +             goto out;
>>> +
>>> +     if (ars_status->address <= start &&
>>> +             (ars_status->address + ars_status->length >= start + len)) {
>>> +             rc = ars_status_process_records(nvdimm_bus, ars_status, start);
>>> +             goto out;
>>> +     }
>>> +
>>> +     /*
>>> +      * ARS_STATUS can overflow if the number of poison entries found is
>>> +      * greater than the maximum buffer size (ars_cap->max_ars_out)
>>> +      * To detect overflow, check if the length field of ars_status
>>> +      * is less than the length we supplied. If so, process the
>>> +      * error entries we got, adjust the start point, and start again
>>> +      */
>>> +     ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL);
>>> +     if (!ars_start) {
>>> +             rc = -ENOMEM;
>>> +             goto out;
>>> +     }
>>> +
>>> +     cur = start;
>>> +     remaining = len;
>>> +     do {
>>> +             u64 done, end;
>>> +
>>> +             rc = ars_do_start(nd_desc, ars_start, cur, remaining);
>>
>> I think you should get status, start, wait for completion, then process
>> records; at least that's how I read the example DSM spec and other
>> specs.  It says "You must first issue a Query ARS Status command...".
> 
> We do the lead-in ars_status check above...

Oh right, I missed that.

The way the locking works, the registration of regions is serialized, so we
can never have overlapping calls to this function, right?  A comment
about how this is synchronized might be helpful since the lock is many
functions away.

-- ljk

> 
>>
>>> +             if (rc)
>>> +                     goto out;
>>> +
>>> +             rc = ars_get_status(nd_desc, ars_status);
>>> +             if (rc)
>>> +                     goto out;
>>> +
>>> +             rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
>>> +             if (rc)
>>> +                     goto out;
>>> +
>>> +             end = min(cur + remaining,
>>> +                     ars_status->address + ars_status->length);
>>> +             done = end - cur;
>>> +             cur += done;
>>> +             remaining -= done;
>>> +     } while (remaining);
>>> +
>>> + out:
>>> +     kfree(ars_cap);
>>> +     kfree(ars_start);
>>> +     kfree(ars_status);
>>> +     return rc;
>>> +}
>>> +
>>
>> I haven't looked at the rest of this, will try to later.
>>
> 
> Thanks Linda!
Dan Williams Jan. 7, 2016, 11:22 p.m. UTC | #4
On Thu, Jan 7, 2016 at 3:15 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
> On 1/7/2016 5:49 PM, Dan Williams wrote:
>> On Thu, Jan 7, 2016 at 1:18 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>>>
>>> On 12/24/2015 9:21 PM, Vishal Verma wrote:
>>>> During region creation, perform Address Range Scrubs (ARS) for the SPA
>>>> (System Physical Address) ranges to retrieve known poison locations from
>>>> firmware. Add a new data structure 'nd_poison' which is used as a list
>>>> in nvdimm_bus to store these poison locations.
>>>>
>>>> When creating a pmem namespace, if there is any known poison associated
>>>> with its physical address space, convert the poison ranges to bad sectors
>>>> that are exposed using the badblocks interface.
>>>>
>>>> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
>>>> ---
>>>>  drivers/acpi/nfit.c       | 203 ++++++++++++++++++++++++++++++++++++++++++++++
>>>>  drivers/nvdimm/core.c     | 187 ++++++++++++++++++++++++++++++++++++++++++
>>>>  drivers/nvdimm/nd-core.h  |   3 +
>>>>  drivers/nvdimm/nd.h       |   6 ++
>>>>  drivers/nvdimm/pmem.c     |   6 ++
>>>>  include/linux/libnvdimm.h |   1 +
>>>>  6 files changed, 406 insertions(+)
>>>>
>>>> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
>>>
>>> Why are the ARS functions in acpi/nfit.c rather than in the core?
>>> I realize that the interfaces are/will be defined in the ACPI spec but
>>> it's not really part of the NFIT.
>>>
>>> Why isn't this part of the nvdimm core, where the calls are made?
>>
>> True.  acpi_nfit_find_poison() could easily become
>> nvdimm_bus_find_poison() and move to the core.  However we only need
>> to take that step when / if another nvdimm bus type shows up that isn't
>> nfit defined.  For now only nfit-defined nvdimms have a concept of
>> ARS.
>
> Why not go ahead and move it now?
>
> I agree that ARS is related to NFIT, but practically everything else
> in the core is as well.  It doesn't seem to belong in nfit.c.  I
> didn't even notice that's where it was since the subject line says
> "libnvdimm".

Should be straightforward, will do.

>>>> index aa45d48..ad6d8c6 100644
>>>> --- a/drivers/acpi/nfit.c
>>>> +++ b/drivers/acpi/nfit.c
>>>> @@ -15,6 +15,7 @@
[..]
>>> I think you should get status, start, wait for completion, then process
>>> records; at least that's how I read the example DSM spec and other
>>> specs.  It says "You must first issue a Query ARS Status command...".
>>
>> We do the lead-in ars_status check above...
>
> Oh right, I missed that.
>
> The way the locking works, the registration of regions is serialized, so we
> can never have overlapping calls to this function, right?  A comment
> about how this is synchronized might be helpful since the lock is many
> functions away.

Sounds good.
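
Something along these lines above acpi_nfit_find_poison(), say (the
wording here is just a sketch):

	/*
	 * Callers are serialized by region registration, so at most one
	 * ARS start/status sequence is ever in flight per bus.
	 */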
Dan Williams Feb. 13, 2016, 7:41 p.m. UTC | #5
On Thu, Jan 7, 2016 at 3:22 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Thu, Jan 7, 2016 at 3:15 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>> On 1/7/2016 5:49 PM, Dan Williams wrote:
>>> On Thu, Jan 7, 2016 at 1:18 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>>>>
>>>> On 12/24/2015 9:21 PM, Vishal Verma wrote:
>>>>> During region creation, perform Address Range Scrubs (ARS) for the SPA
>>>>> (System Physical Address) ranges to retrieve known poison locations from
>>>>> firmware. Add a new data structure 'nd_poison' which is used as a list
>>>>> in nvdimm_bus to store these poison locations.
>>>>>
>>>>> When creating a pmem namespace, if there is any known poison associated
>>>>> with its physical address space, convert the poison ranges to bad sectors
>>>>> that are exposed using the badblocks interface.
>>>>>
>>>>> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
>>>>> ---
>>>>>  drivers/acpi/nfit.c       | 203 ++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  drivers/nvdimm/core.c     | 187 ++++++++++++++++++++++++++++++++++++++++++
>>>>>  drivers/nvdimm/nd-core.h  |   3 +
>>>>>  drivers/nvdimm/nd.h       |   6 ++
>>>>>  drivers/nvdimm/pmem.c     |   6 ++
>>>>>  include/linux/libnvdimm.h |   1 +
>>>>>  6 files changed, 406 insertions(+)
>>>>>
>>>>> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
>>>>
>>>> Why are the ARS functions in acpi/nfit.c rather than in the core?
>>>> I realize that the interfaces are/will be defined in the ACPI spec but
>>>> it's not really part of the NFIT.
>>>>
>>>> Why isn't this part of the nvdimm core, where the calls are made?
>>>
>>> True.  acpi_nfit_find_poison() could easily become
>>> nvdimm_bus_find_poison() and move to the core.  However we only need
>>> to take that step when / if another nvdimm bus type shows up that isn't
>>> nfit defined.  For now only nfit-defined nvdimms have a concept of
>>> ARS.
>>
>> Why not go ahead and move it now?
>>
>> I agree that ARS is related to NFIT, but practically everything else
>> in the core is as well.  It doesn't seem to belong in nfit.c.  I
>> didn't even notice that's where it was since the subject line says
>> "libnvdimm".
>
> Should be straightforward, will do.

So this turns out not to be straightforward, because the ARS record
format is ACPI-specific.  We would need to define a generic record
format to move the poison processing to the core, and I don't see that
being worth the effort until a non-ACPI ARS record format appears.
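
For the record, a bus-neutral format would only need an (address, length)
tuple per error range; a hypothetical sketch:

	/* hypothetical generic scrub record, not defined anywhere today */
	struct nvdimm_ars_record {
		u64 err_address;	/* SPA where the error range begins */
		u64 length;		/* length of the range in bytes */
	};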

Patch

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index aa45d48..ad6d8c6 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -15,6 +15,7 @@ 
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/ndctl.h>
+#include <linux/delay.h>
 #include <linux/list.h>
 #include <linux/acpi.h>
 #include <linux/sort.h>
@@ -1473,6 +1474,201 @@  static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
 	/* devm will free nfit_blk */
 }
 
+static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc,
+		struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
+{
+	cmd->address = addr;
+	cmd->length = length;
+
+	return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
+			sizeof(*cmd));
+}
+
+static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
+		struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
+{
+	int rc;
+
+	cmd->address = addr;
+	cmd->length = length;
+	cmd->type = ND_ARS_PERSISTENT;
+
+	while (1) {
+		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
+				sizeof(*cmd));
+		if (rc)
+			return rc;
+		switch (cmd->status) {
+		case 0:
+			return 0;
+		case 1:
+			/* ARS unsupported, but we should never get here */
+			return 0;
+		case 2:
+			return -EINVAL;
+		case 3:
+			/* ARS is in progress */
+			msleep(1000);
+			break;
+		default:
+			return -ENXIO;
+		}
+	}
+}
+
+static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
+		struct nd_cmd_ars_status *cmd)
+{
+	int rc;
+
+	while (1) {
+		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
+			sizeof(*cmd));
+		if (rc || cmd->status & 0xffff)
+			return -ENXIO;
+
+		/* Check extended status (Upper two bytes) */
+		switch (cmd->status >> 16) {
+		case 0:
+			return 0;
+		case 1:
+			/* ARS is in progress */
+			msleep(1000);
+			break;
+		case 2:
+			/* No ARS performed for the current boot */
+			return 0;
+		default:
+			return -ENXIO;
+		}
+	}
+}
+
+static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
+		struct nd_cmd_ars_status *ars_status, u64 start)
+{
+	int rc;
+	u32 i;
+
+	/*
+	 * The address field returned by ars_status should be either
+	 * less than or equal to the address we last started ARS for.
+	 * The (start, length) returned by ars_status should also have
+	 * non-zero overlap with the range we started ARS for.
+	 * If this is not the case, bail.
+	 */
+	if (ars_status->address > start ||
+			(ars_status->address + ars_status->length < start))
+		return -ENXIO;
+
+	for (i = 0; i < ars_status->num_records; i++) {
+		rc = nvdimm_bus_add_poison(nvdimm_bus,
+				ars_status->records[i].err_address,
+				ars_status->records[i].length);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
+		struct nd_region_desc *ndr_desc)
+{
+	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+	struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
+	struct nd_cmd_ars_status *ars_status = NULL;
+	struct nd_cmd_ars_start *ars_start = NULL;
+	struct nd_cmd_ars_cap *ars_cap = NULL;
+	u64 start, len, cur, remaining;
+	int rc;
+
+	ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
+	if (!ars_cap)
+		return -ENOMEM;
+
+	start = ndr_desc->res->start;
+	len = ndr_desc->res->end - ndr_desc->res->start + 1;
+
+	rc = ars_get_cap(nd_desc, ars_cap, start, len);
+	if (rc)
+		goto out;
+
+	/*
+	 * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in
+	 * extended status is not set, skip this but continue initialization
+	 */
+	if ((ars_cap->status & 0xffff) ||
+		!(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
+		dev_warn(acpi_desc->dev,
+			"ARS unsupported (status: 0x%x), won't create an error list\n",
+			ars_cap->status);
+		goto out;
+	}
+
+	/*
+	 * Check if a full-range ARS has been run. If so, use those results
+	 * without having to start a new ARS.
+	 */
+	ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status),
+			GFP_KERNEL);
+	if (!ars_status) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	rc = ars_get_status(nd_desc, ars_status);
+	if (rc)
+		goto out;
+
+	if (ars_status->address <= start &&
+		(ars_status->address + ars_status->length >= start + len)) {
+		rc = ars_status_process_records(nvdimm_bus, ars_status, start);
+		goto out;
+	}
+
+	/*
+	 * ARS_STATUS can overflow if the number of poison entries found is
+	 * greater than the maximum buffer size (ars_cap->max_ars_out)
+	 * To detect overflow, check if the length field of ars_status
+	 * is less than the length we supplied. If so, process the
+	 * error entries we got, adjust the start point, and start again
+	 */
+	ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL);
+	if (!ars_start) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	cur = start;
+	remaining = len;
+	do {
+		u64 done, end;
+
+		rc = ars_do_start(nd_desc, ars_start, cur, remaining);
+		if (rc)
+			goto out;
+
+		rc = ars_get_status(nd_desc, ars_status);
+		if (rc)
+			goto out;
+
+		rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
+		if (rc)
+			goto out;
+
+		end = min(cur + remaining,
+			ars_status->address + ars_status->length);
+		done = end - cur;
+		cur += done;
+		remaining -= done;
+	} while (remaining);
+
+ out:
+	kfree(ars_cap);
+	kfree(ars_start);
+	kfree(ars_status);
+	return rc;
+}
+
 static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 		struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
 		struct acpi_nfit_memory_map *memdev,
@@ -1585,6 +1781,13 @@  static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 
 	nvdimm_bus = acpi_desc->nvdimm_bus;
 	if (nfit_spa_type(spa) == NFIT_SPA_PM) {
+		rc = acpi_nfit_find_poison(acpi_desc, ndr_desc);
+		if (rc) {
+			dev_err(acpi_desc->dev,
+				"error while performing ARS to find poison: %d\n",
+				rc);
+			return rc;
+		}
 		if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
 			return -ENOMEM;
 	} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 82c49bb..21003b7 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -325,6 +325,7 @@  struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
 	if (!nvdimm_bus)
 		return NULL;
 	INIT_LIST_HEAD(&nvdimm_bus->list);
+	INIT_LIST_HEAD(&nvdimm_bus->poison_list);
 	init_waitqueue_head(&nvdimm_bus->probe_wait);
 	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
 	mutex_init(&nvdimm_bus->reconfig_mutex);
@@ -359,6 +360,191 @@  struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
 }
 EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
 
+/**
+ * __add_badblock_range() - Convert a physical address range to bad sectors
+ * @disk:	the disk associated with the namespace
+ * @ns_offset:	namespace offset where the error range begins (in bytes)
+ * @len:	number of bytes of poison to be added
+ *
+ * This assumes that the range provided with (ns_offset, len) is within
+ * the bounds of physical addresses for this namespace, i.e. lies in the
+ * interval [ns_start, ns_start + ns_size)
+ */
+static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len)
+{
+	unsigned int sector_size = queue_logical_block_size(disk->queue);
+	sector_t start_sector;
+	u64 num_sectors;
+	u32 rem;
+	int rc;
+
+	start_sector = div_u64(ns_offset, sector_size);
+	num_sectors = div_u64_rem(len, sector_size, &rem);
+	if (rem)
+		num_sectors++;
+
+	if (!disk->bb) {
+		rc = disk_alloc_badblocks(disk);
+		if (rc)
+			return rc;
+	}
+
+	if (unlikely(num_sectors > (u64)INT_MAX)) {
+		u64 remaining = num_sectors;
+		sector_t s = start_sector;
+
+		while (remaining) {
+			int done = min_t(u64, remaining, INT_MAX);
+
+			rc = disk_set_badblocks(disk, s, done);
+			if (rc)
+				return rc;
+			remaining -= done;
+			s += done;
+		}
+		return 0;
+	} else
+		return disk_set_badblocks(disk, start_sector, num_sectors);
+}
+
+/**
+ * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks
+ * @disk:	the gendisk associated with the namespace where badblocks
+ *		will be stored
+ * @offset:	offset at the start of the namespace before 'sector 0'
+ * @ndns:	the namespace containing poison ranges
+ *
+ * The poison list generated during NFIT initialization may contain multiple,
+ * possibly overlapping ranges in the SPA (System Physical Address) space.
+ * Compare each of these ranges to the namespace currently being initialized,
+ * and add badblocks to the gendisk for all matching sub-ranges
+ *
+ * Return:
+ * 0 - Success
+ */
+int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset,
+		struct nd_namespace_common *ndns)
+{
+	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+	struct nvdimm_bus *nvdimm_bus;
+	struct list_head *poison_list;
+	u64 ns_start, ns_end, ns_size;
+	struct nd_poison *pl;
+	int rc;
+
+	ns_size = nvdimm_namespace_capacity(ndns) - offset;
+	ns_start = nsio->res.start + offset;
+	ns_end = nsio->res.end;
+
+	nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent);
+	poison_list = &nvdimm_bus->poison_list;
+	if (list_empty(poison_list))
+		return 0;
+
+	list_for_each_entry(pl, poison_list, list) {
+		u64 pl_end = pl->start + pl->length - 1;
+
+		/* Discard intervals with no intersection */
+		if (pl_end < ns_start)
+			continue;
+		if (pl->start > ns_end)
+			continue;
+		/* Deal with any overlap after start of the namespace */
+		if (pl->start >= ns_start) {
+			u64 start = pl->start;
+			u64 len;
+
+			if (pl_end <= ns_end)
+				len = pl->length;
+			else
+				len = ns_start + ns_size - pl->start;
+
+			rc = __add_badblock_range(disk, start - ns_start, len);
+			if (rc)
+				return rc;
+			dev_info(&nvdimm_bus->dev,
+				"Found a poison range (0x%llx, 0x%llx)\n",
+				start, len);
+			continue;
+		}
+		/* Deal with overlap for poison starting before the namespace */
+		if (pl->start < ns_start) {
+			u64 len;
+
+			if (pl_end < ns_end)
+				len = pl->start + pl->length - ns_start;
+			else
+				len = ns_size;
+
+			rc = __add_badblock_range(disk, 0, len);
+			if (rc)
+				return rc;
+			dev_info(&nvdimm_bus->dev,
+				"Found a poison range (0x%llx, 0x%llx)\n",
+				pl->start, len);
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison);
+
+static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
+{
+	struct nd_poison *pl;
+
+	pl = kzalloc(sizeof(*pl), GFP_KERNEL);
+	if (!pl)
+		return -ENOMEM;
+
+	pl->start = addr;
+	pl->length = length;
+	list_add_tail(&pl->list, &nvdimm_bus->poison_list);
+
+	return 0;
+}
+
+int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
+{
+	struct nd_poison *pl;
+
+	if (list_empty(&nvdimm_bus->poison_list))
+		return __add_poison(nvdimm_bus, addr, length);
+
+	/*
+	 * There is a chance this is a duplicate, check for those first.
+	 * This will be the common case as ARS_STATUS returns all known
+	 * errors in the SPA space, and we can't query it per region
+	 */
+	list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
+		if (pl->start == addr) {
+			/* If length has changed, update this list entry */
+			if (pl->length != length)
+				pl->length = length;
+			return 0;
+		}
+
+	/*
+	 * If not a duplicate or a simple length update, add the entry as is,
+	 * as any overlapping ranges will get resolved when the list is consumed
+	 * and converted to badblocks
+	 */
+	return __add_poison(nvdimm_bus, addr, length);
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
+
+static void free_poison_list(struct list_head *poison_list)
+{
+	struct nd_poison *pl, *next;
+
+	list_for_each_entry_safe(pl, next, poison_list, list) {
+		list_del(&pl->list);
+		kfree(pl);
+	}
+	list_del_init(poison_list);
+}
+
 static int child_unregister(struct device *dev, void *data)
 {
 	/*
@@ -385,6 +571,7 @@  void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
 
 	nd_synchronize();
 	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
+	free_poison_list(&nvdimm_bus->poison_list);
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
 
 	device_unregister(&nvdimm_bus->dev);
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 159aed5..d3b7ea7 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -30,6 +30,7 @@  struct nvdimm_bus {
 	struct list_head list;
 	struct device dev;
 	int id, probe_active;
+	struct list_head poison_list;
 	struct mutex reconfig_mutex;
 };
 
@@ -89,4 +90,6 @@  bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
 ssize_t nd_namespace_store(struct device *dev,
 		struct nd_namespace_common **_ndns, const char *buf,
 		size_t len);
+int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset,
+		struct nd_namespace_common *ndns);
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 417e521..ba91fcd 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -38,6 +38,12 @@  enum {
 #endif
 };
 
+struct nd_poison {
+	u64 start;
+	u64 length;
+	struct list_head list;
+};
+
 struct nvdimm_drvdata {
 	struct device *dev;
 	int nsindex_size;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8ee7989..5b95043 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -27,6 +27,7 @@ 
 #include <linux/slab.h>
 #include <linux/pmem.h>
 #include <linux/nd.h>
+#include "nd-core.h"
 #include "pfn.h"
 #include "nd.h"
 
@@ -168,6 +169,7 @@  static int pmem_attach_disk(struct device *dev,
 {
 	int nid = dev_to_node(dev);
 	struct gendisk *disk;
+	int ret;
 
 	pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid);
 	if (!pmem->pmem_queue)
@@ -196,6 +198,10 @@  static int pmem_attach_disk(struct device *dev,
 	set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
 	pmem->pmem_disk = disk;
 
+	ret = nvdimm_namespace_add_poison(disk, pmem->data_offset, ndns);
+	if (ret)
+		return ret;
+
 	add_disk(disk);
 	revalidate_disk(disk);
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 3f021dc..bed40df 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -116,6 +116,7 @@  static inline struct nd_blk_region_desc *to_blk_region_desc(
 
 }
 
+int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
 struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
 #define nvdimm_bus_register(parent, desc) \