
dax: adding fsync/msync support for device DAX

Message ID 152287929452.28903.15383389230749046740.stgit@djiang5-desk3.ch.intel.com (mailing list archive)
State New, archived

Commit Message

Dave Jiang April 4, 2018, 10:01 p.m. UTC
When msync() is called on a device-dax region, ops->fsync() is eventually
called. By providing fsync in the device-dax file operations, we can support
both msync() and fsync(). When either is called, nvdimm_flush() is invoked
for the nd_region in order to provide a deep flush to the user application
through standard POSIX calls.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dax/dax-private.h |    1 +
 drivers/dax/device-dax.h  |    2 ++
 drivers/dax/device.c      |   19 +++++++++++++++++++
 drivers/dax/pmem.c        |   10 ++++++++++
 4 files changed, 32 insertions(+)
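
For illustration, a minimal userspace sketch of the path this patch enables.
The device node (/dev/dax0.0) and the 2MB mapping size are assumptions for
the example, not part of the patch:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2UL << 20;	/* one 2MB device-dax alignment unit */
	int fd = open("/dev/dax0.0", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
			fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memcpy(addr, "persist me", 10);

	/*
	 * With this patch, msync() on the mapping (or fsync(fd)) reaches
	 * the new dax_fsync(), which calls nvdimm_flush() for the region.
	 */
	if (msync(addr, len, MS_SYNC))
		perror("msync");

	munmap(addr, len);
	close(fd);
	return 0;
}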

Comments

Dan Williams April 5, 2018, 12:03 a.m. UTC | #1
Include lkml on the next posting as well.

On Wed, Apr 4, 2018 at 3:01 PM, Dave Jiang <dave.jiang@intel.com> wrote:
> When msync() is called on a device-dax region, ops->fsync() is eventually
> called. By providing fsync in the device-dax file operations, we can support
> both msync() and fsync(). When either is called, nvdimm_flush() is invoked
> for the nd_region in order to provide a deep flush to the user application
> through standard POSIX calls.
>

Let's clarify the justification:

"Currently, fsdax applications can assume that if they call fsync or
msync on a dax mapped file that any pending writes that have been
flushed out of the cpu cache will be also be flushed to the lowest
possible persistence / failure domain available on the platform. In
typical scenarios the platform ADR capability handles marshaling
writes that have reached global visibility to persistence. In
exceptional cases where ADR fails to complete its operation software
can detect that scenario the the "last shutdown" health status check
and otherwise mitigate the effects of an ADR failure by protecting
metadata with the WPQ flush. In other words, enabling device-dax to
optionally trigger WPQ Flush on msync() allows applications to have
common implementation for persistence domain handling across fs-dax
and device-dax."


> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/dax/dax-private.h |    1 +
>  drivers/dax/device-dax.h  |    2 ++
>  drivers/dax/device.c      |   19 +++++++++++++++++++
>  drivers/dax/pmem.c        |   10 ++++++++++
>  4 files changed, 32 insertions(+)
>
> diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
> index b6fc4f04636d..5fc20191c89e 100644
> --- a/drivers/dax/dax-private.h
> +++ b/drivers/dax/dax-private.h
> @@ -35,6 +35,7 @@ struct dax_region {
>         unsigned int align;
>         struct resource res;
>         unsigned long pfn_flags;
> +       void (*sync)(struct device *);
>  };
>
>  /**
> diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h
> index 688b051750bd..651f2e763058 100644
> --- a/drivers/dax/device-dax.h
> +++ b/drivers/dax/device-dax.h
> @@ -22,4 +22,6 @@ struct dax_region *alloc_dax_region(struct device *parent,
>                 void *addr, unsigned long flags);
>  struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
>                 int id, struct resource *res, int count);
> +void dax_set_sync(struct dax_region *dax_region, void (*sync)(struct device *dev));
> +
>  #endif /* __DEVICE_DAX_H__ */
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index 37be5a306c8f..5341760cc987 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -523,6 +523,24 @@ static int dax_release(struct inode *inode, struct file *filp)
>         return 0;
>  }
>
> +void dax_set_sync(struct dax_region *dax_region, void (*sync)(struct device *))
> +{
> +       dax_region->sync = sync;
> +}
> +EXPORT_SYMBOL_GPL(dax_set_sync);

We don't need dax_set_sync(), the dax_pmem driver set up the
dax_region in the first place, it can set the callback at that time.
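
A sketch of that suggestion: pass the callback when dax_pmem sets up the
region instead of exporting a separate setter. The extended
alloc_dax_region() signature below is hypothetical, shown only to
illustrate the shape of the change:

/* hypothetical: fold the sync callback into region creation */
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
		struct resource *res, unsigned int align, void *addr,
		unsigned long flags, void (*sync)(struct device *));

/* dax_pmem_probe() would then pass its flush helper directly: */
dax_region = alloc_dax_region(dev, region_id, &res,
		le32_to_cpu(pfn_sb->align), addr, PFN_DEV | PFN_MAP,
		dax_pmem_sync);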
Christoph Hellwig April 5, 2018, 7:23 a.m. UTC | #2
On Wed, Apr 04, 2018 at 05:03:07PM -0700, Dan Williams wrote:
> "Currently, fsdax applications can assume that if they call fsync or
> msync on a dax mapped file that any pending writes that have been
> flushed out of the cpu cache will be also be flushed to the lowest
> possible persistence / failure domain available on the platform. In
> typical scenarios the platform ADR capability handles marshaling
> writes that have reached global visibility to persistence. In
> exceptional cases where ADR fails to complete its operation software
> can detect that scenario the the "last shutdown" health status check
> and otherwise mitigate the effects of an ADR failure by protecting
> metadata with the WPQ flush. In other words, enabling device-dax to
> optionally trigger WPQ Flush on msync() allows applications to have
> common implementation for persistence domain handling across fs-dax
> and device-dax."

This sounds totally bogus.  Either ADR is reliable and we can rely on
it all the time (like we assume for, say, capacitors on SSDs with
non-volatile write caches), or we can't rely on it and the write-through
store model is a blatant lie.  In other words - msync/fsync is what
we use for normal persistence, not for working around broken hardware.

In many ways this sounds like a plot to make normal programming models
not listening to the pmem.io hype look bad in benchmarks...
Dan Williams April 5, 2018, 7:56 a.m. UTC | #3
On Thu, Apr 5, 2018 at 12:23 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Wed, Apr 04, 2018 at 05:03:07PM -0700, Dan Williams wrote:
>> "Currently, fsdax applications can assume that if they call fsync or
>> msync on a dax mapped file that any pending writes that have been
>> flushed out of the cpu cache will be also be flushed to the lowest
>> possible persistence / failure domain available on the platform. In
>> typical scenarios the platform ADR capability handles marshaling
>> writes that have reached global visibility to persistence. In
>> exceptional cases where ADR fails to complete its operation software
>> can detect that scenario the the "last shutdown" health status check
>> and otherwise mitigate the effects of an ADR failure by protecting
>> metadata with the WPQ flush. In other words, enabling device-dax to
>> optionally trigger WPQ Flush on msync() allows applications to have
>> common implementation for persistence domain handling across fs-dax
>> and device-dax."
>
> This sounds totally bogus.  Either ADR is reliable and we can rely on
> it all the time (like we assume for, say, capacitors on SSDs with
> non-volatile write caches), or we can't rely on it and the write-through
> store model is a blatant lie.  In other words - msync/fsync is what
> we use for normal persistence, not for working around broken hardware.
>

Yes, I think it is unfortunate that the failure mode is exposed to
software at all. The problem is that ADR is a platform feature that
depends on power supply requirements external to the NVDIMM device. An
SSD is different. It is a self-contained system that can arrange for
the whole device to fail if the internal energy source fails and
otherwise hide this detail from software. My personal take: a system
designer that can specify and qualify an entire stack of components
can certainly opt out of advertising the flush capability to the OS
because, like the SSD vendor, they control the integrated solution. A
platform vendor that allows off-the-shelf power supplies would in my
opinion be remiss not to give the OS the option to mitigate the
quality of some random power supply. It then follows that if the OS has
the ability to mitigate ADR failure, it should be through a common
interface between fsdax and devdax.

> In many ways this sounds like a plot to make normal programming models
> not listening to the pmem.io hype look bad in benchmarks...

No, just no.
Christoph Hellwig April 5, 2018, 8:01 a.m. UTC | #4
On Thu, Apr 05, 2018 at 12:56:02AM -0700, Dan Williams wrote:
> Yes, I think it is unfortunate that the failure mode is exposed to
> software at all. The problem is that ADR is a platform feature that
> depends on power supply requirements external to the NVDIMM device. An
> SSD is different. It is a self-contained system that can arrange for
> the whole device to fail if the internal energy source fails and
> otherwise hide this detail from software. My personal take: a system
> designer that can specify and qualify an entire stack of components
> can certainly opt out of advertising the flush capability to the OS
> because, like the SSD vendor, they control the integrated solution. A
> platform vendor that allows off-the-shelf power supplies would in my
> opinion be remiss not to give the OS the option to mitigate the
> quality of some random power supply. It then follows that if the OS has
> the ability to mitigate ADR failure, it should be through a common
> interface between fsdax and devdax.

That means IFF ADR can fail like this we can't treat it as stable
storage and we must not support MAP_SYNC or equivalent device dax
behavior, period.
Jeff Moyer April 5, 2018, 2:59 p.m. UTC | #5
Christoph Hellwig <hch@infradead.org> writes:

> On Thu, Apr 05, 2018 at 12:56:02AM -0700, Dan Williams wrote:
>> Yes, I think it is unfortunate that the failure mode is exposed to
>> software at all. The problem is that ADR is a platform feature that
>> depends on power supply requirements external to the NVDIMM device. An
>> SSD is different. It is a self-contained system that can arrange for
>> the whole device to fail if the internal energy source fails and
>> otherwise hide this detail from software. My personal take: a system
>> designer that can specify and qualify an entire stack of components
>> can certainly opt out of advertising the flush capability to the OS
>> because, like the SSD vendor, they control the integrated solution. A
>> platform vendor that allows off-the-shelf power supplies would in my
>> opinion be remiss not to give the OS the option to mitigate the
>> quality of some random power supply. It then follows that if the OS has
>> the ability to mitigate ADR failure, it should be through a common
>> interface between fsdax and devdax.
>
> That means IFF ADR can fail like this we can't treat it as stable
> storage and we must not support MAP_SYNC or equivalent device dax
> behavior, period.

So, I also hate this (note that this is already in place today for fs
dax).  You have an operation to make things persistent, and another one
to *really* make things persistent.  It makes no sense to me.  I have no
idea how to communicate that to application developers.  When do you
force things out to the smallest failure domain?

The arguments I've heard are that ADR failures may happen due to a
variety of factors, and that an application (or file system) can make
sure that critical (meta)data is available after a crash by flushing to
the smallest failure domain.  Presumably, this would be a
lower-frequency event (only for metadata changes, etc.).

I don't buy it.

What remains to be seen is whether ADR actually is reliable.  And, if it
turns out that it isn't, will there be industry pressure to fix the
hardware, will applications adapt to always call fsync, or will we do as
Christoph suggests, and get rid of the fallacy of flush from userspace?

I don't have the answers.

-Jeff
Christoph Hellwig April 5, 2018, 3:10 p.m. UTC | #6
On Thu, Apr 05, 2018 at 10:59:10AM -0400, Jeff Moyer wrote:
> So, I also hate this (note that this is already in place today for fs
> dax).  You have an operation to make things persistent, and another one
> to *really* make things persistent.  It makes no sense to me.  I have no
> idea how to communicate that to application developers.  When do you
> force things out to the smallest failure domain?
> 
> The arguments I've heard are that ADR failures may happen due to a
> variety of factors, and that an application (or file system) can make
> sure that critical (meta)data is available after a crash by flushing to
> the smallest failure domain.  Presumably, this would be a
> lower-frequency event (only for metadata changes, etc).

Which means it should not abuse fsync but have a special interface
just for that IFF we trust ADR.  IFF we don't trust it anyway, fsync
absolutely is the right interface, but then we shouldn't offer MAP_SYNC.
Dan Williams April 5, 2018, 10:17 p.m. UTC | #7
On Thu, Apr 5, 2018 at 1:01 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Thu, Apr 05, 2018 at 12:56:02AM -0700, Dan Williams wrote:
>> Yes, I think it is unfortunate that the failure mode is exposed to
>> software at all. The problem is that ADR is a platform feature that
>> depends on power supply requirements external to the NVDIMM device. An
>> SSD is different. It is a self-contained system that can arrange for
>> the whole device to fail if the internal energy source fails and
>> otherwise hide this detail from software. My personal take: a system
>> designer that can specify and qualify an entire stack of components
>> can certainly opt out of advertising the flush capability to the OS
>> because, like the SSD vendor, they control the integrated solution. A
>> platform vendor that allows off-the-shelf power supplies would in my
>> opinion be remiss not to give the OS the option to mitigate the
>> quality of some random power supply. It then follows that if the OS has
>> the ability to mitigate ADR failure, it should be through a common
>> interface between fsdax and devdax.
>
> That means IFF ADR can fail like this we can't treat it as stable
> storage and we must not support MAP_SYNC or equivalent device dax
> behavior, period.

Makes sense, we won't pursue *sync() support on device-dax; it doesn't fit.
Christoph Hellwig April 6, 2018, 7:03 a.m. UTC | #8
On Thu, Apr 05, 2018 at 03:17:17PM -0700, Dan Williams wrote:
> > That means IFF ADR can fail like this we can't treat it as stable
> > storage and we must not support MAP_SYNC or equivalent device dax
> > behavior, period.
> 
> Makes sense, we won't pursue *sync() support on device-dax; it doesn't fit.

We still have other bits of this way of thinking in the tree as far as
I can tell, e.g. the nvdimm_flush calls in pmem_make_request, and thus we
should come up with a coherent strategy: whether we trust ADR, and if we
don't fully trust it, how to mitigate that.
Dan Williams April 6, 2018, 10:41 p.m. UTC | #9
On Fri, Apr 6, 2018 at 12:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Thu, Apr 05, 2018 at 03:17:17PM -0700, Dan Williams wrote:
>> > That means IFF ADR can fail like this we can't treat it as stable
>> > storage and we must not support MAP_SYNC or equivalent device dax
>> > behavior, period.
>>
>> Makes sense, we won't pursue *sync() support on device-dax; it doesn't fit.
>
> We still have other bits of this way of thinking in the tree as far as
> I can tell, e.g. the nvdimm_flush calls in pmem_make_request, and thus we
> should come up with a coherent strategy: whether we trust ADR, and if we
> don't fully trust it, how to mitigate that.

Yes, but the trust interface definition is what is missing, especially
when we consider memmap=ss!nn and qemu-kvm. For example do we turn off
DAX and/or MAP_SYNC on all platforms that don't provide a positive "I
have ADR" indication (ACPI 6.2 Section 5.2.25.9 NFIT Platform
Capabilities Structure)? Require opt-in when the user has trust in the
hardware config that the kernel can't verify?
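
For reference, the NFIT Platform Capabilities indication maps onto region
persistence flags roughly as in the sketch below (paraphrased using the
ACPICA ACPI_NFIT_CAPABILITY_* definitions; not a literal quote of the
in-tree code):

/*
 * ACPI 6.2 NFIT Platform Capabilities: advertise the deepest flush
 * the platform claims to handle on power loss.
 */
if (acpi_desc->platform_cap & ACPI_NFIT_CAPABILITY_CACHE_FLUSH)
	set_bit(ND_REGION_PERSIST_CACHE, &ndr_desc->flags);
if (acpi_desc->platform_cap & ACPI_NFIT_CAPABILITY_MEM_FLUSH)
	set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc->flags);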
Christoph Hellwig April 9, 2018, 9:32 a.m. UTC | #10
On Fri, Apr 06, 2018 at 03:41:39PM -0700, Dan Williams wrote:
> Yes, but the trust interface definition is what is missing, especially
> when we consider memmap=ss!nn and qemu-kvm. For example do we turn off
> DAX and/or MAP_SYNC on all platforms that don't provide a positive "I
> have ADR" indication (ACPI 6.2 Section 5.2.25.9 NFIT Platform
> Capabilities Structure)?

Sounds like a default.

> Require opt-in when the user has trust in the
> hardware config that the kernel can't verify?

And that sounds like a good config knob on top of that default.
Jeff Moyer April 11, 2018, 4:06 p.m. UTC | #11
Christoph Hellwig <hch@infradead.org> writes:

> On Fri, Apr 06, 2018 at 03:41:39PM -0700, Dan Williams wrote:
>> Yes, but the trust interface definition is what is missing, especially
>> when we consider memmap=ss!nn and qemu-kvm. For example do we turn off
>> DAX and/or MAP_SYNC on all platforms that don't provide a positive "I
>> have ADR" indication (ACPI 6.2 Section 5.2.25.9 NFIT Platform
>> Capabilities Structure)?
>
> Sounds like a default.

Which do you turn off?  DAX or MAP_SYNC (or both)?

Systems that support ACPI versions earlier than 6.2 could provide
flush hint addresses.  In that case, we could assume no ADR, and turn
off MAP_SYNC, but still allow DAX.  Note that I'm not aware of any
hardware that actually falls into this category.

Platforms prior to 6.2 that don't support flush hints are currently
assumed to implement ADR.  This proposal would change that default.

>> Require opt-in when the user has trust in the hardware config that
>> the kernel can't verify?
>
> And that sounds like a good config knob on top of that default.

Well, we could also make a white-list for known good platforms.  I
assume HPE's systems would be on that list.  Otherwise we'd have to cook
up udev rules, I guess?

Dan, is this something you're working on now?

-Jeff
Dan Williams April 11, 2018, 4:27 p.m. UTC | #12
On Wed, Apr 11, 2018 at 9:06 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
> Christoph Hellwig <hch@infradead.org> writes:
>
>> On Fri, Apr 06, 2018 at 03:41:39PM -0700, Dan Williams wrote:
>>> Yes, but the trust interface definition is what is missing, especially
>>> when we consider memmap=ss!nn and qemu-kvm. For example do we turn off
>>> DAX and/or MAP_SYNC on all platforms that don't provide a positive "I
>>> have ADR" indication (ACPI 6.2 Section 5.2.25.9 NFIT Platform
>>> Capabilities Structure)?
>>
>> Sounds like a default.
>
> Which do you turn off?  DAX or MAP_SYNC (or both)?

Only MAP_SYNC I would say, because then userspace can't opt out of
fsync/msync and it gives us a chance to "Do The Right Thing (TM)" with
respect to WPQ flush.
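
The resulting userspace contract looks roughly like the sketch below:
MAP_SYNC has to be paired with MAP_SHARED_VALIDATE, so a kernel (or
platform) that withholds MAP_SYNC fails the mmap instead of silently
ignoring the flag, and the application falls back to msync(). The helper
and its fallback policy are illustrative assumptions:

#include <stddef.h>
#include <sys/mman.h>

#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE	0x03
#endif
#ifndef MAP_SYNC
#define MAP_SYNC		0x80000
#endif

static void *map_pmem(int fd, size_t len, int *needs_msync)
{
	/* ask for synchronous faults; refused if MAP_SYNC is withheld */
	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);

	if (addr != MAP_FAILED) {
		*needs_msync = 0;	/* cpu cache flushes suffice */
		return addr;
	}

	/* fall back: keep calling msync(), which leaves the kernel free
	 * to "Do The Right Thing" (e.g. a WPQ flush) on our behalf */
	*needs_msync = 1;
	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}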

> Systems that support ACPI versions earlier than 6.2 could provide
> flush hint addresses.  In that case, we could assume no ADR, and turn
> off MAP_SYNC, but still allow DAX.  Note that I'm not aware of any
> hardware that actually falls into this category.
>
> Platforms prior to 6.2 that don't support flush hints are currently
> assumed to implement ADR.  This proposal would change that default.
>
>>> Require opt-in when the user has trust in the hardware config that
>>> the kernel can't verify?
>>
>> And that sounds like a good config knob on top of that default.
>
> Well, we could also make a white-list for known good platforms.  I
> assume HPE's systems would be on that list.  Otherwise we'd have to cook
> up udev rules, I guess?

udev rules sound good. We can distribute them in the ndctl package. I
think I would handle this by making
/sys/bus/nd/devices/regionX/persistence_domain writable to opt in to a
user-specified persistence_domain. If the persistence_domain attribute
file is not writable, then you know you're on a kernel that blindly
trusts all pmem descriptions as MAP_SYNC capable.
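
A minimal sketch of the kernel side of that opt-in, assuming a store
handler on the existing region attribute. The accepted strings mirror what
the attribute already reads back today; the parsing and policy here are
illustrative, not a committed design:

static ssize_t persistence_domain_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_region *nd_region = to_nd_region(dev);

	/* the user asserts a trust the kernel cannot verify on its own */
	if (sysfs_streq(buf, "cpu_cache"))
		set_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags);
	else if (sysfs_streq(buf, "memory_controller"))
		set_bit(ND_REGION_PERSIST_MEMCTRL, &nd_region->flags);
	else
		return -EINVAL;

	return len;
}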

> Dan, is this something you're working on now?

No, it's behind finalizing dax-dma-vs-truncate and memcpy_mcsafe for
user copies in my queue...
Jeff Moyer April 11, 2018, 5:27 p.m. UTC | #13
Dan Williams <dan.j.williams@intel.com> writes:

> On Wed, Apr 11, 2018 at 9:06 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
>> Christoph Hellwig <hch@infradead.org> writes:
>>
>>> On Fri, Apr 06, 2018 at 03:41:39PM -0700, Dan Williams wrote:
>>>> Yes, but the trust interface definition is what is missing, especially
>>>> when we consider memmap=ss!nn and qemu-kvm. For example do we turn off
>>>> DAX and/or MAP_SYNC on all platforms that don't provide a positive "I
>>>> have ADR" indication (ACPI 6.2 Section 5.2.25.9 NFIT Platform
>>>> Capabilities Structure)?
>>>
>>> Sounds like a default.
>>
>> Which do you turn off?  DAX or MAP_SYNC (or both)?
>
> Only MAP_SYNC I would say, because then userspace can't opt out of
> fsync/msync and it gives us a chance to "Do The Right Thing (TM)" with
> respect to WPQ flush.

That works for me.

>> Systems that support ACPI versions earlier than 6.2 could provide
>> flush hint addresses.  In that case, we could assume no ADR, and turn
>> off MAP_SYNC, but still allow DAX.  Note that I'm not aware of any
>> hardware that actually falls into this category.
>>
>> Platforms prior to 6.2 that don't support flush hints are currently
>> assumed to implement ADR.  This proposal would change that default.
>>
>>>> Require opt-in when the user has trust in the hardware config that
>>>> the kernel can't verify?
>>>
>>> And that sounds like a good config knob on top of that default.
>>
>> Well, we could also make a white-list for known good platforms.  I
>> assume HPE's systems would be on that list.  Otherwise we'd have to cook
>> up udev rules, I guess?
>
> udev rules sound good. We can distribute them in the ndctl package. I
> think I would handle this by making
> /sys/bus/nd/devices/regionX/persistence_domain writable to opt in to a
> user-specified persistence_domain. If the persistence_domain attribute
> file is not writable, then you know you're on a kernel that blindly
> trusts all pmem descriptions as MAP_SYNC capable.
>
>> Dan, is this something you're working on now?
>
> No, it's behind finalizing dax-dma-vs-truncate and memcpy_mcsafe for
> user copies in my queue...

OK, I'll take a stab at it.

Thanks,
Jeff

Patch

diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index b6fc4f04636d..5fc20191c89e 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -35,6 +35,7 @@  struct dax_region {
 	unsigned int align;
 	struct resource res;
 	unsigned long pfn_flags;
+	void (*sync)(struct device *);
 };
 
 /**
diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h
index 688b051750bd..651f2e763058 100644
--- a/drivers/dax/device-dax.h
+++ b/drivers/dax/device-dax.h
@@ -22,4 +22,6 @@  struct dax_region *alloc_dax_region(struct device *parent,
 		void *addr, unsigned long flags);
 struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
 		int id, struct resource *res, int count);
+void dax_set_sync(struct dax_region *dax_region, void (*sync)(struct device *dev));
+
 #endif /* __DEVICE_DAX_H__ */
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 37be5a306c8f..5341760cc987 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -523,6 +523,24 @@  static int dax_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+void dax_set_sync(struct dax_region *dax_region, void (*sync)(struct device *))
+{
+	dax_region->sync = sync;
+}
+EXPORT_SYMBOL_GPL(dax_set_sync);
+
+static int dax_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+	struct dev_dax *dev_dax = filp->private_data;
+	struct dax_region *dax_region = dev_dax->region;
+
+	if (!dax_region->sync)
+		return -EOPNOTSUPP;
+
+	dax_region->sync(dax_region->dev);
+	return 0;
+}
+
 static const struct file_operations dax_fops = {
 	.llseek = noop_llseek,
 	.owner = THIS_MODULE,
@@ -530,6 +548,7 @@  static const struct file_operations dax_fops = {
 	.release = dax_release,
 	.get_unmapped_area = dax_get_unmapped_area,
 	.mmap = dax_mmap,
+	.fsync = dax_fsync,
 };
 
 static void dev_dax_release(struct device *dev)
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index fd49b24fd6af..2b47b4dd2671 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -57,6 +57,14 @@  static void dax_pmem_percpu_kill(void *data)
 	percpu_ref_kill(ref);
 }
 
+static void dax_pmem_sync(struct device *dev)
+{
+	struct nd_namespace_common *ndns;
+
+	ndns = to_ndns(dev);
+	nvdimm_flush(to_nd_region(ndns->dev.parent));
+}
+
 static int dax_pmem_probe(struct device *dev)
 {
 	void *addr;
@@ -133,6 +141,8 @@  static int dax_pmem_probe(struct device *dev)
 	if (!dax_region)
 		return -ENOMEM;
 
+	dax_set_sync(dax_region, dax_pmem_sync);
+
 	/* TODO: support for subdividing a dax region... */
 	dev_dax = devm_create_dev_dax(dax_region, id, &res, 1);