diff mbox

[3/3] libnvdimm, pmem: clear poison on write

Message ID 20160308224729.16298.49406.stgit@dwillia2-desk3.jf.intel.com (mailing list archive)
State Accepted
Commit 59e6473980f3
Headers show

Commit Message

Dan Williams March 8, 2016, 10:47 p.m. UTC
If a write is directed at a known bad block perform the following:

1/ write the data

2/ send a clear poison command

3/ invalidate the poison out of the cache hierarchy

Cc: <x86@kernel.org>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/include/asm/pmem.h |    5 +++++
 drivers/nvdimm/bus.c        |   46 +++++++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/nd.h         |    2 ++
 drivers/nvdimm/pmem.c       |   29 ++++++++++++++++++++++++++-
 include/linux/pmem.h        |   19 ++++++++++++++++++
 5 files changed, 100 insertions(+), 1 deletion(-)

Comments

Verma, Vishal L March 11, 2016, 12:39 a.m. UTC | #1
On Tue, 2016-03-08 at 14:47 -0800, Dan Williams wrote:
> If a write is directed at a known bad block perform the following:

> 

> 1/ write the data

> 

> 2/ send a clear poison command

> 

> 3/ invalidate the poison out of the cache hierarchy

> 

> Cc: <x86@kernel.org>

> Cc: Vishal Verma <vishal.l.verma@intel.com>

> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>

> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

> ---

>  arch/x86/include/asm/pmem.h |    5 +++++

>  drivers/nvdimm/bus.c        |   46

> +++++++++++++++++++++++++++++++++++++++++++

>  drivers/nvdimm/nd.h         |    2 ++

>  drivers/nvdimm/pmem.c       |   29 ++++++++++++++++++++++++++-

>  include/linux/pmem.h        |   19 ++++++++++++++++++

>  5 files changed, 100 insertions(+), 1 deletion(-)

> 

> diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h

> index c57fd1ea9689..bf8b35d2035a 100644

> --- a/arch/x86/include/asm/pmem.h

> +++ b/arch/x86/include/asm/pmem.h


<>

>  static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,

>  			unsigned int len, unsigned int off, int rw,

>  			sector_t sector)

>  {

>  	int rc = 0;

> +	bool bad_pmem = false;

>  	void *mem = kmap_atomic(page);

>  	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;

>  	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;

>  

> +	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))

> +		bad_pmem = true;

> +

>  	if (rw == READ) {

> -		if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))

> +		if (unlikely(bad_pmem))

>  			rc = -EIO;

>  		else {

>  			memcpy_from_pmem(mem + off, pmem_addr, len);

> @@ -81,6 +104,10 @@ static int pmem_do_bvec(struct pmem_device *pmem,

> struct page *page,

>  	} else {

>  		flush_dcache_page(page);

>  		memcpy_to_pmem(pmem_addr, mem + off, len);

> +		if (unlikely(bad_pmem)) {

> +			pmem_clear_poison(pmem, pmem_off, len);

> +			memcpy_to_pmem(pmem_addr, mem + off, len);

> +		}

>  	}


Just noticed this -- why do we memcpy_to_pmem twice in the error case?
Sh
ouldn't it be:

	if (unlikely(bad_pmem))
		pmem_clear_poison(pmem, pmem_off, len);
	memcpy_to_pmem(pmem_addr, mem + off, len);
Dan Williams March 11, 2016, 12:50 a.m. UTC | #2
On Thu, Mar 10, 2016 at 4:39 PM, Verma, Vishal L
<vishal.l.verma@intel.com> wrote:
> On Tue, 2016-03-08 at 14:47 -0800, Dan Williams wrote:
>> If a write is directed at a known bad block perform the following:
>>
>> 1/ write the data
>>
>> 2/ send a clear poison command
>>
>> 3/ invalidate the poison out of the cache hierarchy
>>
>> Cc: <x86@kernel.org>
>> Cc: Vishal Verma <vishal.l.verma@intel.com>
>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  arch/x86/include/asm/pmem.h |    5 +++++
>>  drivers/nvdimm/bus.c        |   46
>> +++++++++++++++++++++++++++++++++++++++++++
>>  drivers/nvdimm/nd.h         |    2 ++
>>  drivers/nvdimm/pmem.c       |   29 ++++++++++++++++++++++++++-
>>  include/linux/pmem.h        |   19 ++++++++++++++++++
>>  5 files changed, 100 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
>> index c57fd1ea9689..bf8b35d2035a 100644
>> --- a/arch/x86/include/asm/pmem.h
>> +++ b/arch/x86/include/asm/pmem.h
>
> <>
>
>>  static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
>>                       unsigned int len, unsigned int off, int rw,
>>                       sector_t sector)
>>  {
>>       int rc = 0;
>> +     bool bad_pmem = false;
>>       void *mem = kmap_atomic(page);
>>       phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
>>       void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
>>
>> +     if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
>> +             bad_pmem = true;
>> +
>>       if (rw == READ) {
>> -             if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
>> +             if (unlikely(bad_pmem))
>>                       rc = -EIO;
>>               else {
>>                       memcpy_from_pmem(mem + off, pmem_addr, len);
>> @@ -81,6 +104,10 @@ static int pmem_do_bvec(struct pmem_device *pmem,
>> struct page *page,
>>       } else {
>>               flush_dcache_page(page);
>>               memcpy_to_pmem(pmem_addr, mem + off, len);
>> +             if (unlikely(bad_pmem)) {
>> +                     pmem_clear_poison(pmem, pmem_off, len);
>> +                     memcpy_to_pmem(pmem_addr, mem + off, len);
>> +             }
>>       }
>
> Just noticed this -- why do we memcpy_to_pmem twice in the error case?
> Sh
> ouldn't it be:
>
>         if (unlikely(bad_pmem))
>                 pmem_clear_poison(pmem, pmem_off, len);
>         memcpy_to_pmem(pmem_addr, mem + off, len);
>

There is an open question of whether clear_poison implementations
guarantee determinant data after clear, or otherwise guarantee that
the data written before the clear_poison stays in place.  So I write
twice to cover all those bases.  Probably deserves a comment.
diff mbox

Patch

diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index c57fd1ea9689..bf8b35d2035a 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -137,6 +137,11 @@  static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 	arch_wb_cache_pmem(addr, size);
 }
 
+static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
+{
+	clflush_cache_range((void __force *) addr, size);
+}
+
 static inline bool __arch_has_wmb_pmem(void)
 {
 	/*
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index cb6fd64b13e3..33557481d452 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -159,6 +159,52 @@  void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event)
 }
 EXPORT_SYMBOL_GPL(nvdimm_region_notify);
 
+long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
+		unsigned int len)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc;
+	struct nd_cmd_clear_error clear_err;
+	struct nd_cmd_ars_cap ars_cap;
+	u32 clear_err_unit, mask;
+	int cmd_rc, rc;
+
+	if (!nvdimm_bus)
+		return -ENXIO;
+
+	nd_desc = nvdimm_bus->nd_desc;
+	if (!nd_desc->ndctl)
+		return -ENXIO;
+
+	memset(&ars_cap, 0, sizeof(ars_cap));
+	ars_cap.address = phys;
+	ars_cap.length = len;
+	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, &ars_cap,
+			sizeof(ars_cap), &cmd_rc);
+	if (rc < 0)
+		return rc;
+	if (cmd_rc < 0)
+		return cmd_rc;
+	clear_err_unit = ars_cap.clear_err_unit;
+	if (!clear_err_unit || !is_power_of_2(clear_err_unit))
+		return -ENXIO;
+
+	mask = clear_err_unit - 1;
+	if ((phys | len) & mask)
+		return -ENXIO;
+	memset(&clear_err, 0, sizeof(clear_err));
+	clear_err.address = phys;
+	clear_err.length = len;
+	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_CLEAR_ERROR, &clear_err,
+			sizeof(clear_err), &cmd_rc);
+	if (rc < 0)
+		return rc;
+	if (cmd_rc < 0)
+		return cmd_rc;
+	return clear_err.cleared;
+}
+EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
+
 static struct bus_type nvdimm_bus_type = {
 	.name = "nd",
 	.uevent = nvdimm_bus_uevent,
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 78b82f6dd191..1799bd97a9ce 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -186,6 +186,8 @@  int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd);
 int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
 int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
 		void *buf, size_t len);
+long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
+		unsigned int len);
 struct nd_btt *to_nd_btt(struct device *dev);
 
 struct nd_gen_sb {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e7b86a7fca0a..adc387236fe7 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -62,17 +62,40 @@  static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
 	return false;
 }
 
+static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
+		unsigned int len)
+{
+	struct device *dev = disk_to_dev(pmem->pmem_disk);
+	sector_t sector;
+	long cleared;
+
+	sector = (offset - pmem->data_offset) / 512;
+	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
+
+	if (cleared > 0 && cleared / 512) {
+		dev_dbg(dev, "%s: %llx clear %ld sector%s\n",
+				__func__, (unsigned long long) sector,
+				cleared / 512, cleared / 512 > 1 ? "s" : "");
+		badblocks_clear(&pmem->bb, sector, cleared / 512);
+	}
+	invalidate_pmem(pmem->virt_addr + offset, len);
+}
+
 static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 			unsigned int len, unsigned int off, int rw,
 			sector_t sector)
 {
 	int rc = 0;
+	bool bad_pmem = false;
 	void *mem = kmap_atomic(page);
 	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
 	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
 
+	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+		bad_pmem = true;
+
 	if (rw == READ) {
-		if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+		if (unlikely(bad_pmem))
 			rc = -EIO;
 		else {
 			memcpy_from_pmem(mem + off, pmem_addr, len);
@@ -81,6 +104,10 @@  static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 	} else {
 		flush_dcache_page(page);
 		memcpy_to_pmem(pmem_addr, mem + off, len);
+		if (unlikely(bad_pmem)) {
+			pmem_clear_poison(pmem, pmem_off, len);
+			memcpy_to_pmem(pmem_addr, mem + off, len);
+		}
 	}
 
 	kunmap_atomic(mem);
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index 7c3d11a6b4ad..3ec5309e29f3 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -58,6 +58,11 @@  static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
 {
 	BUG();
 }
+
+static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
+{
+	BUG();
+}
 #endif
 
 /*
@@ -186,6 +191,20 @@  static inline void clear_pmem(void __pmem *addr, size_t size)
 }
 
 /**
+ * invalidate_pmem - flush a pmem range from the cache hierarchy
+ * @addr:	virtual start address
+ * @size:	bytes to invalidate (internally aligned to cache line size)
+ *
+ * For platforms that support clearing poison this flushes any poisoned
+ * ranges out of the cache
+ */
+static inline void invalidate_pmem(void __pmem *addr, size_t size)
+{
+	if (arch_has_pmem_api())
+		arch_invalidate_pmem(addr, size);
+}
+
+/**
  * wb_cache_pmem - write back processor cache for PMEM memory range
  * @addr:	virtual start address
  * @size:	number of bytes to write back