Message ID | 20170606072229.9302-3-haozhong.zhang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Jun 6, 2017 at 12:22 AM, Haozhong Zhang <haozhong.zhang@intel.com> wrote: > Applications in Linux guest that use device-dax never trigger flush > that can be trapped by KVM/QEMU. Meanwhile, if the host backend is not > device-dax, QEMU cannot guarantee the persistence of guest writes. > Before solving this flushing problem, QEMU should warn users if the > host backend is not device-dax. > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > Message-id: CAPcyv4hV2-ZW8SMCRtD0P_86KgR3DHOvNe+6T5SY2u7wXg3gEg@mail.gmail.com > --- > hw/mem/nvdimm.c | 6 ++++++ > include/qemu/osdep.h | 9 ++++++++ > util/osdep.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 76 insertions(+) > > diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c > index a9b0863f20..b23542fbdf 100644 > --- a/hw/mem/nvdimm.c > +++ b/hw/mem/nvdimm.c > @@ -26,6 +26,7 @@ > #include "qapi/error.h" > #include "qapi/visitor.h" > #include "hw/mem/nvdimm.h" > +#include "qemu/error-report.h" > > static void nvdimm_get_label_size(Object *obj, Visitor *v, const char *name, > void *opaque, Error **errp) > @@ -84,6 +85,11 @@ static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp) > NVDIMMDevice *nvdimm = NVDIMM(dimm); > uint64_t align, pmem_size, size = memory_region_size(mr); > > + if (!qemu_fd_is_dev_dax(memory_region_get_fd(mr))) { > + error_report("warning: nvdimm backend does not look like a DAX device, " > + "unable to guarantee persistence of guest writes"); > + } > + > align = memory_region_get_alignment(mr); > > pmem_size = size - nvdimm->label_size; > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index 1c9f5e260c..7f26af371e 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -470,4 +470,13 @@ char *qemu_get_pid_name(pid_t pid); > */ > pid_t qemu_fork(Error **errp); > > +/** > + * qemu_fd_is_dev_dax: > + * > + * Check whether @fd describes a DAX device. > + * > + * Returns true if it is; otherwise, return false. > + */ > +bool qemu_fd_is_dev_dax(int fd); > + > #endif > diff --git a/util/osdep.c b/util/osdep.c > index a2863c8e53..02881f96bc 100644 > --- a/util/osdep.c > +++ b/util/osdep.c > @@ -471,3 +471,64 @@ writev(int fd, const struct iovec *iov, int iov_cnt) > return readv_writev(fd, iov, iov_cnt, true); > } > #endif > + > +#ifdef __linux__ > +static ssize_t qemu_dev_dax_sysfs_read(int fd, const char *entry, > + char *buf, size_t len) > +{ > + ssize_t read_bytes; > + struct stat st; > + unsigned int major, minor; > + char *path, *pos; > + int sysfs_fd; > + > + if (fstat(fd, &st)) { > + return 0; > + } > + > + major = major(st.st_rdev); > + minor = minor(st.st_rdev); > + path = g_strdup_printf("/sys/dev/char/%u:%u/%s", major, minor, entry); > + > + sysfs_fd = open(path, O_RDONLY); > + g_free(path); > + if (sysfs_fd == -1) { > + return 0; > + } > + > + read_bytes = read(sysfs_fd, buf, len - 1); > + close(sysfs_fd); > + if (read_bytes > 0) { > + buf[read_bytes] = '\0'; > + pos = g_strstr_len(buf, read_bytes, "\n"); > + if (pos) { > + *pos = '\0'; > + } > + } > + > + return read_bytes; > +} > +#endif /* __linux__ */ > + > +bool qemu_fd_is_dev_dax(int fd) > +{ > + bool is_dax = false; > + > +#ifdef __linux__ > + char devtype[7]; > + ssize_t len; > + > + if (fd == -1) { > + return false; > + } > + > + len = qemu_dev_dax_sysfs_read(fd, "device/devtype", > + devtype, sizeof(devtype)); > + if (len <= 0) { > + return false; > + } > + is_dax = !strncmp(devtype, "nd_dax", len); There's no guarantee that device-dax instances are always parented by an "nd_dax" device-type. A more reliable check is to see if "/sys/dev/char/%u:%u/subsystem" points to "/sys/class/dax".
On Tue, Jun 06, 2017 at 03:22:27PM +0800, Haozhong Zhang wrote: > diff --git a/util/osdep.c b/util/osdep.c > index a2863c8e53..02881f96bc 100644 > --- a/util/osdep.c > +++ b/util/osdep.c > @@ -471,3 +471,64 @@ writev(int fd, const struct iovec *iov, int iov_cnt) > return readv_writev(fd, iov, iov_cnt, true); > } > #endif > + > +#ifdef __linux__ > +static ssize_t qemu_dev_dax_sysfs_read(int fd, const char *entry, > + char *buf, size_t len) > +{ > + ssize_t read_bytes; > + struct stat st; > + unsigned int major, minor; > + char *path, *pos; > + int sysfs_fd; > + > + if (fstat(fd, &st)) { > + return 0; > + } > + > + major = major(st.st_rdev); > + minor = minor(st.st_rdev); > + path = g_strdup_printf("/sys/dev/char/%u:%u/%s", major, minor, entry); > + > + sysfs_fd = open(path, O_RDONLY); > + g_free(path); > + if (sysfs_fd == -1) { > + return 0; > + } > + > + read_bytes = read(sysfs_fd, buf, len - 1); > + close(sysfs_fd); > + if (read_bytes > 0) { > + buf[read_bytes] = '\0'; > + pos = g_strstr_len(buf, read_bytes, "\n"); > + if (pos) { > + *pos = '\0'; > + } Should read_bytes be adjusted since we made the string shorter?
On 06/06/17 10:59 -0700, Dan Williams wrote: > On Tue, Jun 6, 2017 at 12:22 AM, Haozhong Zhang > <haozhong.zhang@intel.com> wrote: > > Applications in Linux guest that use device-dax never trigger flush > > that can be trapped by KVM/QEMU. Meanwhile, if the host backend is not > > device-dax, QEMU cannot guarantee the persistence of guest writes. > > Before solving this flushing problem, QEMU should warn users if the > > host backend is not device-dax. > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > Message-id: CAPcyv4hV2-ZW8SMCRtD0P_86KgR3DHOvNe+6T5SY2u7wXg3gEg@mail.gmail.com > > --- > > hw/mem/nvdimm.c | 6 ++++++ > > include/qemu/osdep.h | 9 ++++++++ > > util/osdep.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > > 3 files changed, 76 insertions(+) > > > > diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c > > index a9b0863f20..b23542fbdf 100644 > > --- a/hw/mem/nvdimm.c > > +++ b/hw/mem/nvdimm.c > > @@ -26,6 +26,7 @@ > > #include "qapi/error.h" > > #include "qapi/visitor.h" > > #include "hw/mem/nvdimm.h" > > +#include "qemu/error-report.h" > > > > static void nvdimm_get_label_size(Object *obj, Visitor *v, const char *name, > > void *opaque, Error **errp) > > @@ -84,6 +85,11 @@ static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp) > > NVDIMMDevice *nvdimm = NVDIMM(dimm); > > uint64_t align, pmem_size, size = memory_region_size(mr); > > > > + if (!qemu_fd_is_dev_dax(memory_region_get_fd(mr))) { > > + error_report("warning: nvdimm backend does not look like a DAX device, " > > + "unable to guarantee persistence of guest writes"); > > + } > > + > > align = memory_region_get_alignment(mr); > > > > pmem_size = size - nvdimm->label_size; > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > > index 1c9f5e260c..7f26af371e 100644 > > --- a/include/qemu/osdep.h > > +++ b/include/qemu/osdep.h > > @@ -470,4 +470,13 @@ char *qemu_get_pid_name(pid_t pid); > > */ > > pid_t qemu_fork(Error **errp); > > > > +/** > > + * qemu_fd_is_dev_dax: > > + * > > + * Check whether @fd describes a DAX device. > > + * > > + * Returns true if it is; otherwise, return false. > > + */ > > +bool qemu_fd_is_dev_dax(int fd); > > + > > #endif > > diff --git a/util/osdep.c b/util/osdep.c > > index a2863c8e53..02881f96bc 100644 > > --- a/util/osdep.c > > +++ b/util/osdep.c > > @@ -471,3 +471,64 @@ writev(int fd, const struct iovec *iov, int iov_cnt) > > return readv_writev(fd, iov, iov_cnt, true); > > } > > #endif > > + > > +#ifdef __linux__ > > +static ssize_t qemu_dev_dax_sysfs_read(int fd, const char *entry, > > + char *buf, size_t len) > > +{ > > + ssize_t read_bytes; > > + struct stat st; > > + unsigned int major, minor; > > + char *path, *pos; > > + int sysfs_fd; > > + > > + if (fstat(fd, &st)) { > > + return 0; > > + } > > + > > + major = major(st.st_rdev); > > + minor = minor(st.st_rdev); > > + path = g_strdup_printf("/sys/dev/char/%u:%u/%s", major, minor, entry); > > + > > + sysfs_fd = open(path, O_RDONLY); > > + g_free(path); > > + if (sysfs_fd == -1) { > > + return 0; > > + } > > + > > + read_bytes = read(sysfs_fd, buf, len - 1); > > + close(sysfs_fd); > > + if (read_bytes > 0) { > > + buf[read_bytes] = '\0'; > > + pos = g_strstr_len(buf, read_bytes, "\n"); > > + if (pos) { > > + *pos = '\0'; > > + } > > + } > > + > > + return read_bytes; > > +} > > +#endif /* __linux__ */ > > + > > +bool qemu_fd_is_dev_dax(int fd) > > +{ > > + bool is_dax = false; > > + > > +#ifdef __linux__ > > + char devtype[7]; > > + ssize_t len; > > + > > + if (fd == -1) { > > + return false; > > + } > > + > > + len = qemu_dev_dax_sysfs_read(fd, "device/devtype", > > + devtype, sizeof(devtype)); > > + if (len <= 0) { > > + return false; > > + } > > + is_dax = !strncmp(devtype, "nd_dax", len); > > There's no guarantee that device-dax instances are always parented by > an "nd_dax" device-type. A more reliable check is to see if > "/sys/dev/char/%u:%u/subsystem" points to "/sys/class/dax". Thanks for pointing out this, will change in the next version. Haozhong
On 06/07/17 16:14 +0100, Stefan Hajnoczi wrote: > On Tue, Jun 06, 2017 at 03:22:27PM +0800, Haozhong Zhang wrote: > > diff --git a/util/osdep.c b/util/osdep.c > > index a2863c8e53..02881f96bc 100644 > > --- a/util/osdep.c > > +++ b/util/osdep.c > > @@ -471,3 +471,64 @@ writev(int fd, const struct iovec *iov, int iov_cnt) > > return readv_writev(fd, iov, iov_cnt, true); > > } > > #endif > > + > > +#ifdef __linux__ > > +static ssize_t qemu_dev_dax_sysfs_read(int fd, const char *entry, > > + char *buf, size_t len) > > +{ > > + ssize_t read_bytes; > > + struct stat st; > > + unsigned int major, minor; > > + char *path, *pos; > > + int sysfs_fd; > > + > > + if (fstat(fd, &st)) { > > + return 0; > > + } > > + > > + major = major(st.st_rdev); > > + minor = minor(st.st_rdev); > > + path = g_strdup_printf("/sys/dev/char/%u:%u/%s", major, minor, entry); > > + > > + sysfs_fd = open(path, O_RDONLY); > > + g_free(path); > > + if (sysfs_fd == -1) { > > + return 0; > > + } > > + > > + read_bytes = read(sysfs_fd, buf, len - 1); > > + close(sysfs_fd); > > + if (read_bytes > 0) { > > + buf[read_bytes] = '\0'; > > + pos = g_strstr_len(buf, read_bytes, "\n"); > > + if (pos) { > > + *pos = '\0'; > > + } > > Should read_bytes be adjusted since we made the string shorter? Yes, I'll change in the next version. Thanks, Haozhong
On Tue, Jun 06, 2017 at 03:22:27PM +0800, Haozhong Zhang wrote: > Applications in Linux guest that use device-dax never trigger flush > that can be trapped by KVM/QEMU. Meanwhile, if the host backend is not > device-dax, QEMU cannot guarantee the persistence of guest writes. > Before solving this flushing problem, QEMU should warn users if the > host backend is not device-dax. No one reads warnings unless things fail but they can be a debugging aid if they do. But they have to be simple and robust to be helpful for that. This one seems to me too complex and fragile. > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > Message-id: CAPcyv4hV2-ZW8SMCRtD0P_86KgR3DHOvNe+6T5SY2u7wXg3gEg@mail.gmail.com Don't include this line in commit log please. Put it after --- if you must. > --- > hw/mem/nvdimm.c | 6 ++++++ > include/qemu/osdep.h | 9 ++++++++ > util/osdep.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 76 insertions(+) > > diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c > index a9b0863f20..b23542fbdf 100644 > --- a/hw/mem/nvdimm.c > +++ b/hw/mem/nvdimm.c > @@ -26,6 +26,7 @@ > #include "qapi/error.h" > #include "qapi/visitor.h" > #include "hw/mem/nvdimm.h" > +#include "qemu/error-report.h" > > static void nvdimm_get_label_size(Object *obj, Visitor *v, const char *name, > void *opaque, Error **errp) > @@ -84,6 +85,11 @@ static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp) > NVDIMMDevice *nvdimm = NVDIMM(dimm); > uint64_t align, pmem_size, size = memory_region_size(mr); > > + if (!qemu_fd_is_dev_dax(memory_region_get_fd(mr))) { > + error_report("warning: nvdimm backend does not look like a DAX device, " > + "unable to guarantee persistence of guest writes"); > + } > + > align = memory_region_get_alignment(mr); > > pmem_size = size - nvdimm->label_size; > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index 1c9f5e260c..7f26af371e 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -470,4 +470,13 @@ char *qemu_get_pid_name(pid_t pid); > */ > pid_t qemu_fork(Error **errp); > > +/** > + * qemu_fd_is_dev_dax: > + * > + * Check whether @fd describes a DAX device. > + * > + * Returns true if it is; otherwise, return false. > + */ > +bool qemu_fd_is_dev_dax(int fd); > + > #endif > diff --git a/util/osdep.c b/util/osdep.c > index a2863c8e53..02881f96bc 100644 > --- a/util/osdep.c > +++ b/util/osdep.c > @@ -471,3 +471,64 @@ writev(int fd, const struct iovec *iov, int iov_cnt) > return readv_writev(fd, iov, iov_cnt, true); > } > #endif > + > +#ifdef __linux__ > +static ssize_t qemu_dev_dax_sysfs_read(int fd, const char *entry, > + char *buf, size_t len) > +{ > + ssize_t read_bytes; > + struct stat st; > + unsigned int major, minor; > + char *path, *pos; > + int sysfs_fd; > + > + if (fstat(fd, &st)) { > + return 0; > + } > + > + major = major(st.st_rdev); > + minor = minor(st.st_rdev); > + path = g_strdup_printf("/sys/dev/char/%u:%u/%s", major, minor, entry); > + > + sysfs_fd = open(path, O_RDONLY); > + g_free(path); > + if (sysfs_fd == -1) { > + return 0; > + } > + > + read_bytes = read(sysfs_fd, buf, len - 1); > + close(sysfs_fd); > + if (read_bytes > 0) { > + buf[read_bytes] = '\0'; > + pos = g_strstr_len(buf, read_bytes, "\n"); > + if (pos) { > + *pos = '\0'; > + } > + } > + > + return read_bytes; > +} > +#endif /* __linux__ */ > + > +bool qemu_fd_is_dev_dax(int fd) > +{ > + bool is_dax = false; > + > +#ifdef __linux__ > + char devtype[7]; > + ssize_t len; > + > + if (fd == -1) { > + return false; > + } > + > + len = qemu_dev_dax_sysfs_read(fd, "device/devtype", > + devtype, sizeof(devtype)); > + if (len <= 0) { > + return false; > + } This can easily trigger false positives e.g. if sysfs access is blocked by selinux. > + is_dax = !strncmp(devtype, "nd_dax", len); E.g. will return true on any string starting with nd_dax. And it's not clear non-dax devices can't guarantee safety. > +#endif /* __linux__ */ > + > + return is_dax; > +} > -- > 2.11.0
On 06/08/17 15:51 +0300, Michael S. Tsirkin wrote: > On Tue, Jun 06, 2017 at 03:22:27PM +0800, Haozhong Zhang wrote: > > Applications in Linux guest that use device-dax never trigger flush > > that can be trapped by KVM/QEMU. Meanwhile, if the host backend is not > > device-dax, QEMU cannot guarantee the persistence of guest writes. > > Before solving this flushing problem, QEMU should warn users if the > > host backend is not device-dax. > > No one reads warnings unless things fail but they can be a debugging aid > if they do. But they have to be simple and robust to be helpful for > that. This one seems to me too complex and fragile. > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > Message-id: CAPcyv4hV2-ZW8SMCRtD0P_86KgR3DHOvNe+6T5SY2u7wXg3gEg@mail.gmail.com > > Don't include this line in commit log please. Put it after --- if you > must. > > > --- > > hw/mem/nvdimm.c | 6 ++++++ > > include/qemu/osdep.h | 9 ++++++++ > > util/osdep.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > > 3 files changed, 76 insertions(+) > > > > diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c > > index a9b0863f20..b23542fbdf 100644 > > --- a/hw/mem/nvdimm.c > > +++ b/hw/mem/nvdimm.c > > @@ -26,6 +26,7 @@ > > #include "qapi/error.h" > > #include "qapi/visitor.h" > > #include "hw/mem/nvdimm.h" > > +#include "qemu/error-report.h" > > > > static void nvdimm_get_label_size(Object *obj, Visitor *v, const char *name, > > void *opaque, Error **errp) > > @@ -84,6 +85,11 @@ static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp) > > NVDIMMDevice *nvdimm = NVDIMM(dimm); > > uint64_t align, pmem_size, size = memory_region_size(mr); > > > > + if (!qemu_fd_is_dev_dax(memory_region_get_fd(mr))) { > > + error_report("warning: nvdimm backend does not look like a DAX device, " > > + "unable to guarantee persistence of guest writes"); > > + } > > + > > align = memory_region_get_alignment(mr); > > > > pmem_size = size - nvdimm->label_size; > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > > index 1c9f5e260c..7f26af371e 100644 > > --- a/include/qemu/osdep.h > > +++ b/include/qemu/osdep.h > > @@ -470,4 +470,13 @@ char *qemu_get_pid_name(pid_t pid); > > */ > > pid_t qemu_fork(Error **errp); > > > > +/** > > + * qemu_fd_is_dev_dax: > > + * > > + * Check whether @fd describes a DAX device. > > + * > > + * Returns true if it is; otherwise, return false. > > + */ > > +bool qemu_fd_is_dev_dax(int fd); > > + > > #endif > > diff --git a/util/osdep.c b/util/osdep.c > > index a2863c8e53..02881f96bc 100644 > > --- a/util/osdep.c > > +++ b/util/osdep.c > > @@ -471,3 +471,64 @@ writev(int fd, const struct iovec *iov, int iov_cnt) > > return readv_writev(fd, iov, iov_cnt, true); > > } > > #endif > > + > > +#ifdef __linux__ > > +static ssize_t qemu_dev_dax_sysfs_read(int fd, const char *entry, > > + char *buf, size_t len) > > +{ > > + ssize_t read_bytes; > > + struct stat st; > > + unsigned int major, minor; > > + char *path, *pos; > > + int sysfs_fd; > > + > > + if (fstat(fd, &st)) { > > + return 0; > > + } > > + > > + major = major(st.st_rdev); > > + minor = minor(st.st_rdev); > > + path = g_strdup_printf("/sys/dev/char/%u:%u/%s", major, minor, entry); > > + > > + sysfs_fd = open(path, O_RDONLY); > > + g_free(path); > > + if (sysfs_fd == -1) { > > + return 0; > > + } > > + > > + read_bytes = read(sysfs_fd, buf, len - 1); > > + close(sysfs_fd); > > + if (read_bytes > 0) { > > + buf[read_bytes] = '\0'; > > + pos = g_strstr_len(buf, read_bytes, "\n"); > > + if (pos) { > > + *pos = '\0'; > > + } > > + } > > + > > + return read_bytes; > > +} > > +#endif /* __linux__ */ > > + > > +bool qemu_fd_is_dev_dax(int fd) > > +{ > > + bool is_dax = false; > > + > > +#ifdef __linux__ > > + char devtype[7]; > > + ssize_t len; > > + > > + if (fd == -1) { > > + return false; > > + } > > + > > + len = qemu_dev_dax_sysfs_read(fd, "device/devtype", > > + devtype, sizeof(devtype)); > > + if (len <= 0) { > > + return false; > > + } > > This can easily trigger false positives e.g. if sysfs access > is blocked by selinux. > A part of userland interface of nvdimm is exposed via sysfs, so QEMU has to access to certain sysfs entries in order to, e.g. perform the DAX check in this patch and get alignment requirement in patch 4. I agree it's not robust if the permission is not properly granted. We may either 1) require users to grant the permissions to QEMU and document the requirements or 2) get the information from sysfs from the outside of QEMU (e.g. libvirt) which has the permission and then pass to QEMU. Which one do you think is better? > > > + is_dax = !strncmp(devtype, "nd_dax", len); > > E.g. will return true on any string starting with nd_dax. > And it's not clear non-dax devices can't guarantee safety. This check can be done by checking whether /sys/dev/char/MAJ:MIN/subsystem points to /sys/class/dax, as Dan suggested, if we decide to access sysfs in QEMU. Thanks, Haozhong > > > +#endif /* __linux__ */ > > + > > + return is_dax; > > +} > > -- > > 2.11.0
On Mon, Jun 12, 2017 at 11:18:21AM +0800, Haozhong Zhang wrote: > On 06/08/17 15:51 +0300, Michael S. Tsirkin wrote: > > On Tue, Jun 06, 2017 at 03:22:27PM +0800, Haozhong Zhang wrote: > > > Applications in Linux guest that use device-dax never trigger flush > > > that can be trapped by KVM/QEMU. Meanwhile, if the host backend is not > > > device-dax, QEMU cannot guarantee the persistence of guest writes. > > > Before solving this flushing problem, QEMU should warn users if the > > > host backend is not device-dax. > > > > No one reads warnings unless things fail but they can be a debugging aid > > if they do. But they have to be simple and robust to be helpful for > > that. This one seems to me too complex and fragile. > > > > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > > Message-id: CAPcyv4hV2-ZW8SMCRtD0P_86KgR3DHOvNe+6T5SY2u7wXg3gEg@mail.gmail.com > > > > Don't include this line in commit log please. Put it after --- if you > > must. > > > > > --- > > > hw/mem/nvdimm.c | 6 ++++++ > > > include/qemu/osdep.h | 9 ++++++++ > > > util/osdep.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > > > 3 files changed, 76 insertions(+) > > > > > > diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c > > > index a9b0863f20..b23542fbdf 100644 > > > --- a/hw/mem/nvdimm.c > > > +++ b/hw/mem/nvdimm.c > > > @@ -26,6 +26,7 @@ > > > #include "qapi/error.h" > > > #include "qapi/visitor.h" > > > #include "hw/mem/nvdimm.h" > > > +#include "qemu/error-report.h" > > > > > > static void nvdimm_get_label_size(Object *obj, Visitor *v, const char *name, > > > void *opaque, Error **errp) > > > @@ -84,6 +85,11 @@ static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp) > > > NVDIMMDevice *nvdimm = NVDIMM(dimm); > > > uint64_t align, pmem_size, size = memory_region_size(mr); > > > > > > + if (!qemu_fd_is_dev_dax(memory_region_get_fd(mr))) { > > > + error_report("warning: nvdimm backend does not look like a DAX device, " > > > + "unable to guarantee persistence of guest writes"); > > > + } > > > + > > > align = memory_region_get_alignment(mr); > > > > > > pmem_size = size - nvdimm->label_size; > > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > > > index 1c9f5e260c..7f26af371e 100644 > > > --- a/include/qemu/osdep.h > > > +++ b/include/qemu/osdep.h > > > @@ -470,4 +470,13 @@ char *qemu_get_pid_name(pid_t pid); > > > */ > > > pid_t qemu_fork(Error **errp); > > > > > > +/** > > > + * qemu_fd_is_dev_dax: > > > + * > > > + * Check whether @fd describes a DAX device. > > > + * > > > + * Returns true if it is; otherwise, return false. > > > + */ > > > +bool qemu_fd_is_dev_dax(int fd); > > > + > > > #endif > > > diff --git a/util/osdep.c b/util/osdep.c > > > index a2863c8e53..02881f96bc 100644 > > > --- a/util/osdep.c > > > +++ b/util/osdep.c > > > @@ -471,3 +471,64 @@ writev(int fd, const struct iovec *iov, int iov_cnt) > > > return readv_writev(fd, iov, iov_cnt, true); > > > } > > > #endif > > > + > > > +#ifdef __linux__ > > > +static ssize_t qemu_dev_dax_sysfs_read(int fd, const char *entry, > > > + char *buf, size_t len) > > > +{ > > > + ssize_t read_bytes; > > > + struct stat st; > > > + unsigned int major, minor; > > > + char *path, *pos; > > > + int sysfs_fd; > > > + > > > + if (fstat(fd, &st)) { > > > + return 0; > > > + } > > > + > > > + major = major(st.st_rdev); > > > + minor = minor(st.st_rdev); > > > + path = g_strdup_printf("/sys/dev/char/%u:%u/%s", major, minor, entry); > > > + > > > + sysfs_fd = open(path, O_RDONLY); > > > + g_free(path); > > > + if (sysfs_fd == -1) { > > > + return 0; > > > + } > > > + > > > + read_bytes = read(sysfs_fd, buf, len - 1); > > > + close(sysfs_fd); > > > + if (read_bytes > 0) { > > > + buf[read_bytes] = '\0'; > > > + pos = g_strstr_len(buf, read_bytes, "\n"); > > > + if (pos) { > > > + *pos = '\0'; > > > + } > > > + } > > > + > > > + return read_bytes; > > > +} > > > +#endif /* __linux__ */ > > > + > > > +bool qemu_fd_is_dev_dax(int fd) > > > +{ > > > + bool is_dax = false; > > > + > > > +#ifdef __linux__ > > > + char devtype[7]; > > > + ssize_t len; > > > + > > > + if (fd == -1) { > > > + return false; > > > + } > > > + > > > + len = qemu_dev_dax_sysfs_read(fd, "device/devtype", > > > + devtype, sizeof(devtype)); > > > + if (len <= 0) { > > > + return false; > > > + } > > > > This can easily trigger false positives e.g. if sysfs access > > is blocked by selinux. > > > > A part of userland interface of nvdimm is exposed via sysfs, so QEMU > has to access to certain sysfs entries in order to, e.g. perform the > DAX check in this patch and get alignment requirement in patch 4. > > I agree it's not robust if the permission is not properly granted. We > may either > 1) require users to grant the permissions to QEMU and document the > requirements > or > 2) get the information from sysfs from the outside of QEMU > (e.g. libvirt) which has the permission and then pass to QEMU. > > Which one do you think is better? 2) since that allows emulating things without hardware dax. > > > > > + is_dax = !strncmp(devtype, "nd_dax", len); > > > > E.g. will return true on any string starting with nd_dax. > > And it's not clear non-dax devices can't guarantee safety. > > This check can be done by checking whether /sys/dev/char/MAJ:MIN/subsystem > points to /sys/class/dax, as Dan suggested, if we decide to access > sysfs in QEMU. > > Thanks, > Haozhong > > > > > > +#endif /* __linux__ */ > > > + > > > + return is_dax; > > > +} > > > -- > > > 2.11.0
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index a9b0863f20..b23542fbdf 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -26,6 +26,7 @@ #include "qapi/error.h" #include "qapi/visitor.h" #include "hw/mem/nvdimm.h" +#include "qemu/error-report.h" static void nvdimm_get_label_size(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) @@ -84,6 +85,11 @@ static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp) NVDIMMDevice *nvdimm = NVDIMM(dimm); uint64_t align, pmem_size, size = memory_region_size(mr); + if (!qemu_fd_is_dev_dax(memory_region_get_fd(mr))) { + error_report("warning: nvdimm backend does not look like a DAX device, " + "unable to guarantee persistence of guest writes"); + } + align = memory_region_get_alignment(mr); pmem_size = size - nvdimm->label_size; diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 1c9f5e260c..7f26af371e 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -470,4 +470,13 @@ char *qemu_get_pid_name(pid_t pid); */ pid_t qemu_fork(Error **errp); +/** + * qemu_fd_is_dev_dax: + * + * Check whether @fd describes a DAX device. + * + * Returns true if it is; otherwise, return false. + */ +bool qemu_fd_is_dev_dax(int fd); + #endif diff --git a/util/osdep.c b/util/osdep.c index a2863c8e53..02881f96bc 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -471,3 +471,64 @@ writev(int fd, const struct iovec *iov, int iov_cnt) return readv_writev(fd, iov, iov_cnt, true); } #endif + +#ifdef __linux__ +static ssize_t qemu_dev_dax_sysfs_read(int fd, const char *entry, + char *buf, size_t len) +{ + ssize_t read_bytes; + struct stat st; + unsigned int major, minor; + char *path, *pos; + int sysfs_fd; + + if (fstat(fd, &st)) { + return 0; + } + + major = major(st.st_rdev); + minor = minor(st.st_rdev); + path = g_strdup_printf("/sys/dev/char/%u:%u/%s", major, minor, entry); + + sysfs_fd = open(path, O_RDONLY); + g_free(path); + if (sysfs_fd == -1) { + return 0; + } + + read_bytes = read(sysfs_fd, buf, len - 1); + close(sysfs_fd); + if (read_bytes > 0) { + buf[read_bytes] = '\0'; + pos = g_strstr_len(buf, read_bytes, "\n"); + if (pos) { + *pos = '\0'; + } + } + + return read_bytes; +} +#endif /* __linux__ */ + +bool qemu_fd_is_dev_dax(int fd) +{ + bool is_dax = false; + +#ifdef __linux__ + char devtype[7]; + ssize_t len; + + if (fd == -1) { + return false; + } + + len = qemu_dev_dax_sysfs_read(fd, "device/devtype", + devtype, sizeof(devtype)); + if (len <= 0) { + return false; + } + is_dax = !strncmp(devtype, "nd_dax", len); +#endif /* __linux__ */ + + return is_dax; +}
Applications in Linux guest that use device-dax never trigger flush that can be trapped by KVM/QEMU. Meanwhile, if the host backend is not device-dax, QEMU cannot guarantee the persistence of guest writes. Before solving this flushing problem, QEMU should warn users if the host backend is not device-dax. Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> Message-id: CAPcyv4hV2-ZW8SMCRtD0P_86KgR3DHOvNe+6T5SY2u7wXg3gEg@mail.gmail.com --- hw/mem/nvdimm.c | 6 ++++++ include/qemu/osdep.h | 9 ++++++++ util/osdep.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+)