diff mbox series

[v9,2/7] virtio-pmem: Add virtio pmem driver

Message ID 20190514145422.16923-3-pagupta@redhat.com (mailing list archive)
State New, archived
Headers show
Series virtio pmem driver | expand

Commit Message

Pankaj Gupta May 14, 2019, 2:54 p.m. UTC
This patch adds virtio-pmem driver for KVM guest.

Guest reads the persistent memory range information from
Qemu over VIRTIO and registers it on nvdimm_bus. It also
creates a nd_region object with the persistent memory
range information so that existing 'nvdimm/pmem' driver
can reserve this into system memory map. This way
'virtio-pmem' driver uses existing functionality of pmem
driver to register persistent memory compatible for DAX
capable filesystems.

This also provides function to perform guest flush over
VIRTIO from 'pmem' driver when userspace performs flush
on DAX memory range.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/nvdimm/Makefile          |   1 +
 drivers/nvdimm/nd_virtio.c       | 126 +++++++++++++++++++++++++++++++
 drivers/nvdimm/virtio_pmem.c     | 122 ++++++++++++++++++++++++++++++
 drivers/nvdimm/virtio_pmem.h     |  60 +++++++++++++++
 drivers/virtio/Kconfig           |  11 +++
 include/uapi/linux/virtio_ids.h  |   1 +
 include/uapi/linux/virtio_pmem.h |  10 +++
 7 files changed, 331 insertions(+)
 create mode 100644 drivers/nvdimm/nd_virtio.c
 create mode 100644 drivers/nvdimm/virtio_pmem.c
 create mode 100644 drivers/nvdimm/virtio_pmem.h
 create mode 100644 include/uapi/linux/virtio_pmem.h

Comments

Randy Dunlap May 14, 2019, 3:09 p.m. UTC | #1
On 5/14/19 7:54 AM, Pankaj Gupta wrote:
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index 35897649c24f..94bad084ebab 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -42,6 +42,17 @@ config VIRTIO_PCI_LEGACY
>  
>  	  If unsure, say Y.
>  
> +config VIRTIO_PMEM
> +	tristate "Support for virtio pmem driver"
> +	depends on VIRTIO
> +	depends on LIBNVDIMM
> +	help
> +	This driver provides access to virtio-pmem devices, storage devices
> +	that are mapped into the physical address space - similar to NVDIMMs
> +	 - with a virtio-based flushing interface.
> +
> +	If unsure, say M.

<beep>
from Documentation/process/coding-style.rst:
"Lines under a ``config`` definition
are indented with one tab, while help text is indented an additional two
spaces."

> +
>  config VIRTIO_BALLOON
>  	tristate "Virtio balloon driver"
>  	depends on VIRTIO

thanks.
Pankaj Gupta May 14, 2019, 3:25 p.m. UTC | #2
> On 5/14/19 7:54 AM, Pankaj Gupta wrote:
> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > index 35897649c24f..94bad084ebab 100644
> > --- a/drivers/virtio/Kconfig
> > +++ b/drivers/virtio/Kconfig
> > @@ -42,6 +42,17 @@ config VIRTIO_PCI_LEGACY
> >  
> >  	  If unsure, say Y.
> >  
> > +config VIRTIO_PMEM
> > +	tristate "Support for virtio pmem driver"
> > +	depends on VIRTIO
> > +	depends on LIBNVDIMM
> > +	help
> > +	This driver provides access to virtio-pmem devices, storage devices
> > +	that are mapped into the physical address space - similar to NVDIMMs
> > +	 - with a virtio-based flushing interface.
> > +
> > +	If unsure, say M.
> 
> <beep>
> from Documentation/process/coding-style.rst:
> "Lines under a ``config`` definition
> are indented with one tab, while help text is indented an additional two
> spaces."

ah... I changed help text and 'checkpatch' did not say anything :( .

Will wait for Dan, If its possible to add two spaces to help text while applying
the series.

Thanks,
Pankaj
  

> 
> > +
> >  config VIRTIO_BALLOON
> >  	tristate "Virtio balloon driver"
> >  	depends on VIRTIO
> 
> thanks.
> --
> ~Randy
> 
>
David Hildenbrand May 15, 2019, 8:46 p.m. UTC | #3
> +	vpmem->vdev = vdev;
> +	vdev->priv = vpmem;
> +	err = init_vq(vpmem);
> +	if (err) {
> +		dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
> +		goto out_err;
> +	}
> +
> +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> +			start, &vpmem->start);
> +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> +			size, &vpmem->size);
> +
> +	res.start = vpmem->start;
> +	res.end   = vpmem->start + vpmem->size-1;

nit: " - 1;"

> +	vpmem->nd_desc.provider_name = "virtio-pmem";
> +	vpmem->nd_desc.module = THIS_MODULE;
> +
> +	vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
> +						&vpmem->nd_desc);
> +	if (!vpmem->nvdimm_bus) {
> +		dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n");
> +		err = -ENXIO;
> +		goto out_vq;
> +	}
> +
> +	dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
> +
> +	ndr_desc.res = &res;
> +	ndr_desc.numa_node = nid;
> +	ndr_desc.flush = async_pmem_flush;
> +	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
> +	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
> +	nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
> +	if (!nd_region) {
> +		dev_err(&vdev->dev, "failed to create nvdimm region\n");
> +		err = -ENXIO;
> +		goto out_nd;
> +	}
> +	nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
> +	return 0;
> +out_nd:
> +	nvdimm_bus_unregister(vpmem->nvdimm_bus);
> +out_vq:
> +	vdev->config->del_vqs(vdev);
> +out_err:
> +	return err;
> +}
> +
> +static void virtio_pmem_remove(struct virtio_device *vdev)
> +{
> +	struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
> +
> +	nvdimm_bus_unregister(nvdimm_bus);
> +	vdev->config->del_vqs(vdev);
> +	vdev->config->reset(vdev);
> +}
> +
> +static struct virtio_driver virtio_pmem_driver = {
> +	.driver.name		= KBUILD_MODNAME,
> +	.driver.owner		= THIS_MODULE,
> +	.id_table		= id_table,
> +	.probe			= virtio_pmem_probe,
> +	.remove			= virtio_pmem_remove,
> +};
> +
> +module_virtio_driver(virtio_pmem_driver);
> +MODULE_DEVICE_TABLE(virtio, id_table);
> +MODULE_DESCRIPTION("Virtio pmem driver");
> +MODULE_LICENSE("GPL");
> diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
> new file mode 100644
> index 000000000000..ab1da877575d
> --- /dev/null
> +++ b/drivers/nvdimm/virtio_pmem.h
> @@ -0,0 +1,60 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * virtio_pmem.h: virtio pmem Driver
> + *
> + * Discovers persistent memory range information
> + * from host and provides a virtio based flushing
> + * interface.
> + **/
> +
> +#ifndef _LINUX_VIRTIO_PMEM_H
> +#define _LINUX_VIRTIO_PMEM_H
> +
> +#include <linux/virtio_ids.h>
> +#include <linux/module.h>
> +#include <linux/virtio_config.h>
> +#include <uapi/linux/virtio_pmem.h>
> +#include <linux/libnvdimm.h>
> +#include <linux/spinlock.h>
> +
> +struct virtio_pmem_request {
> +	/* Host return status corresponding to flush request */
> +	int ret;
> +
> +	/* command name*/
> +	char name[16];

So ... why are we sending string commands and expect native-endianess
integers and don't define a proper request/response structure + request
types in include/uapi/linux/virtio_pmem.h like

struct virtio_pmem_resp {
	__virtio32 ret;
}

#define VIRTIO_PMEM_REQ_TYPE_FLUSH	1
struct virtio_pmem_req {
	__virtio16 type;
}

... and this way we also define a proper endianess format for exchange
and keep it extensible

@MST, what's your take on this?
Dan Williams May 15, 2019, 8:46 p.m. UTC | #4
On Tue, May 14, 2019 at 8:25 AM Pankaj Gupta <pagupta@redhat.com> wrote:
>
>
> > On 5/14/19 7:54 AM, Pankaj Gupta wrote:
> > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > index 35897649c24f..94bad084ebab 100644
> > > --- a/drivers/virtio/Kconfig
> > > +++ b/drivers/virtio/Kconfig
> > > @@ -42,6 +42,17 @@ config VIRTIO_PCI_LEGACY
> > >
> > >       If unsure, say Y.
> > >
> > > +config VIRTIO_PMEM
> > > +   tristate "Support for virtio pmem driver"
> > > +   depends on VIRTIO
> > > +   depends on LIBNVDIMM
> > > +   help
> > > +   This driver provides access to virtio-pmem devices, storage devices
> > > +   that are mapped into the physical address space - similar to NVDIMMs
> > > +    - with a virtio-based flushing interface.
> > > +
> > > +   If unsure, say M.
> >
> > <beep>
> > from Documentation/process/coding-style.rst:
> > "Lines under a ``config`` definition
> > are indented with one tab, while help text is indented an additional two
> > spaces."
>
> ah... I changed help text and 'checkpatch' did not say anything :( .
>
> Will wait for Dan, If its possible to add two spaces to help text while applying
> the series.

I'm inclined to handle this with a fixup appended to the end of the
series just so the patchwork-bot does not get confused by the content
changing from what was sent to the list.
David Hildenbrand May 15, 2019, 8:52 p.m. UTC | #5
On 15.05.19 22:46, David Hildenbrand wrote:
>> +	vpmem->vdev = vdev;
>> +	vdev->priv = vpmem;
>> +	err = init_vq(vpmem);
>> +	if (err) {
>> +		dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
>> +		goto out_err;
>> +	}
>> +
>> +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
>> +			start, &vpmem->start);
>> +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
>> +			size, &vpmem->size);
>> +
>> +	res.start = vpmem->start;
>> +	res.end   = vpmem->start + vpmem->size-1;
> 
> nit: " - 1;"
> 
>> +	vpmem->nd_desc.provider_name = "virtio-pmem";
>> +	vpmem->nd_desc.module = THIS_MODULE;
>> +
>> +	vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
>> +						&vpmem->nd_desc);
>> +	if (!vpmem->nvdimm_bus) {
>> +		dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n");
>> +		err = -ENXIO;
>> +		goto out_vq;
>> +	}
>> +
>> +	dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
>> +
>> +	ndr_desc.res = &res;
>> +	ndr_desc.numa_node = nid;
>> +	ndr_desc.flush = async_pmem_flush;
>> +	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
>> +	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
>> +	nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
>> +	if (!nd_region) {
>> +		dev_err(&vdev->dev, "failed to create nvdimm region\n");
>> +		err = -ENXIO;
>> +		goto out_nd;
>> +	}
>> +	nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
>> +	return 0;
>> +out_nd:
>> +	nvdimm_bus_unregister(vpmem->nvdimm_bus);
>> +out_vq:
>> +	vdev->config->del_vqs(vdev);
>> +out_err:
>> +	return err;
>> +}
>> +
>> +static void virtio_pmem_remove(struct virtio_device *vdev)
>> +{
>> +	struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
>> +
>> +	nvdimm_bus_unregister(nvdimm_bus);
>> +	vdev->config->del_vqs(vdev);
>> +	vdev->config->reset(vdev);
>> +}
>> +
>> +static struct virtio_driver virtio_pmem_driver = {
>> +	.driver.name		= KBUILD_MODNAME,
>> +	.driver.owner		= THIS_MODULE,
>> +	.id_table		= id_table,
>> +	.probe			= virtio_pmem_probe,
>> +	.remove			= virtio_pmem_remove,
>> +};
>> +
>> +module_virtio_driver(virtio_pmem_driver);
>> +MODULE_DEVICE_TABLE(virtio, id_table);
>> +MODULE_DESCRIPTION("Virtio pmem driver");
>> +MODULE_LICENSE("GPL");
>> diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
>> new file mode 100644
>> index 000000000000..ab1da877575d
>> --- /dev/null
>> +++ b/drivers/nvdimm/virtio_pmem.h
>> @@ -0,0 +1,60 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * virtio_pmem.h: virtio pmem Driver
>> + *
>> + * Discovers persistent memory range information
>> + * from host and provides a virtio based flushing
>> + * interface.
>> + **/
>> +
>> +#ifndef _LINUX_VIRTIO_PMEM_H
>> +#define _LINUX_VIRTIO_PMEM_H
>> +
>> +#include <linux/virtio_ids.h>
>> +#include <linux/module.h>
>> +#include <linux/virtio_config.h>
>> +#include <uapi/linux/virtio_pmem.h>
>> +#include <linux/libnvdimm.h>
>> +#include <linux/spinlock.h>
>> +
>> +struct virtio_pmem_request {
>> +	/* Host return status corresponding to flush request */
>> +	int ret;
>> +
>> +	/* command name*/
>> +	char name[16];
> 
> So ... why are we sending string commands and expect native-endianess
> integers and don't define a proper request/response structure + request
> types in include/uapi/linux/virtio_pmem.h like
> 
> struct virtio_pmem_resp {
> 	__virtio32 ret;
> }

FWIW, I wonder if we should even properly translate return values and
define types like

VIRTIO_PMEM_RESP_TYPE_OK	0
VIRTIO_PMEM_RESP_TYPE_EIO	1

..

> 
> #define VIRTIO_PMEM_REQ_TYPE_FLUSH	1
> struct virtio_pmem_req {
> 	__virtio16 type;
> }
> 
> ... and this way we also define a proper endianess format for exchange
> and keep it extensible
> 
> @MST, what's your take on this?
> 
>
Pankaj Gupta May 16, 2019, 7:49 a.m. UTC | #6
> 
> > +	vpmem->vdev = vdev;
> > +	vdev->priv = vpmem;
> > +	err = init_vq(vpmem);
> > +	if (err) {
> > +		dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
> > +		goto out_err;
> > +	}
> > +
> > +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +			start, &vpmem->start);
> > +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +			size, &vpmem->size);
> > +
> > +	res.start = vpmem->start;
> > +	res.end   = vpmem->start + vpmem->size-1;
> 
> nit: " - 1;"

Sure.

> 
> > +	vpmem->nd_desc.provider_name = "virtio-pmem";
> > +	vpmem->nd_desc.module = THIS_MODULE;
> > +
> > +	vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
> > +						&vpmem->nd_desc);
> > +	if (!vpmem->nvdimm_bus) {
> > +		dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n");
> > +		err = -ENXIO;
> > +		goto out_vq;
> > +	}
> > +
> > +	dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
> > +
> > +	ndr_desc.res = &res;
> > +	ndr_desc.numa_node = nid;
> > +	ndr_desc.flush = async_pmem_flush;
> > +	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
> > +	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
> > +	nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
> > +	if (!nd_region) {
> > +		dev_err(&vdev->dev, "failed to create nvdimm region\n");
> > +		err = -ENXIO;
> > +		goto out_nd;
> > +	}
> > +	nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
> > +	return 0;
> > +out_nd:
> > +	nvdimm_bus_unregister(vpmem->nvdimm_bus);
> > +out_vq:
> > +	vdev->config->del_vqs(vdev);
> > +out_err:
> > +	return err;
> > +}
> > +
> > +static void virtio_pmem_remove(struct virtio_device *vdev)
> > +{
> > +	struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
> > +
> > +	nvdimm_bus_unregister(nvdimm_bus);
> > +	vdev->config->del_vqs(vdev);
> > +	vdev->config->reset(vdev);
> > +}
> > +
> > +static struct virtio_driver virtio_pmem_driver = {
> > +	.driver.name		= KBUILD_MODNAME,
> > +	.driver.owner		= THIS_MODULE,
> > +	.id_table		= id_table,
> > +	.probe			= virtio_pmem_probe,
> > +	.remove			= virtio_pmem_remove,
> > +};
> > +
> > +module_virtio_driver(virtio_pmem_driver);
> > +MODULE_DEVICE_TABLE(virtio, id_table);
> > +MODULE_DESCRIPTION("Virtio pmem driver");
> > +MODULE_LICENSE("GPL");
> > diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
> > new file mode 100644
> > index 000000000000..ab1da877575d
> > --- /dev/null
> > +++ b/drivers/nvdimm/virtio_pmem.h
> > @@ -0,0 +1,60 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * virtio_pmem.h: virtio pmem Driver
> > + *
> > + * Discovers persistent memory range information
> > + * from host and provides a virtio based flushing
> > + * interface.
> > + **/
> > +
> > +#ifndef _LINUX_VIRTIO_PMEM_H
> > +#define _LINUX_VIRTIO_PMEM_H
> > +
> > +#include <linux/virtio_ids.h>
> > +#include <linux/module.h>
> > +#include <linux/virtio_config.h>
> > +#include <uapi/linux/virtio_pmem.h>
> > +#include <linux/libnvdimm.h>
> > +#include <linux/spinlock.h>
> > +
> > +struct virtio_pmem_request {
> > +	/* Host return status corresponding to flush request */
> > +	int ret;
> > +
> > +	/* command name*/
> > +	char name[16];
> 
> So ... why are we sending string commands and expect native-endianess
> integers and don't define a proper request/response structure + request
> types in include/uapi/linux/virtio_pmem.h like
> 
> struct virtio_pmem_resp {
> 	__virtio32 ret;
> }
> 
> #define VIRTIO_PMEM_REQ_TYPE_FLUSH	1
> struct virtio_pmem_req {
> 	__virtio16 type;
> }
> 
> ... and this way we also define a proper endianess format for exchange
> and keep it extensible

Done the suggested change.

Thank you,
Pankaj

> 
> @MST, what's your take on this?
> 
> 
> --
> 
> Thanks,
> 
> David / dhildenb
>
Pankaj Gupta May 16, 2019, 7:54 a.m. UTC | #7
> >> +	vpmem->vdev = vdev;
> >> +	vdev->priv = vpmem;
> >> +	err = init_vq(vpmem);
> >> +	if (err) {
> >> +		dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
> >> +		goto out_err;
> >> +	}
> >> +
> >> +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> >> +			start, &vpmem->start);
> >> +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> >> +			size, &vpmem->size);
> >> +
> >> +	res.start = vpmem->start;
> >> +	res.end   = vpmem->start + vpmem->size-1;
> > 
> > nit: " - 1;"
> > 
> >> +	vpmem->nd_desc.provider_name = "virtio-pmem";
> >> +	vpmem->nd_desc.module = THIS_MODULE;
> >> +
> >> +	vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
> >> +						&vpmem->nd_desc);
> >> +	if (!vpmem->nvdimm_bus) {
> >> +		dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n");
> >> +		err = -ENXIO;
> >> +		goto out_vq;
> >> +	}
> >> +
> >> +	dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
> >> +
> >> +	ndr_desc.res = &res;
> >> +	ndr_desc.numa_node = nid;
> >> +	ndr_desc.flush = async_pmem_flush;
> >> +	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
> >> +	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
> >> +	nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
> >> +	if (!nd_region) {
> >> +		dev_err(&vdev->dev, "failed to create nvdimm region\n");
> >> +		err = -ENXIO;
> >> +		goto out_nd;
> >> +	}
> >> +	nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
> >> +	return 0;
> >> +out_nd:
> >> +	nvdimm_bus_unregister(vpmem->nvdimm_bus);
> >> +out_vq:
> >> +	vdev->config->del_vqs(vdev);
> >> +out_err:
> >> +	return err;
> >> +}
> >> +
> >> +static void virtio_pmem_remove(struct virtio_device *vdev)
> >> +{
> >> +	struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
> >> +
> >> +	nvdimm_bus_unregister(nvdimm_bus);
> >> +	vdev->config->del_vqs(vdev);
> >> +	vdev->config->reset(vdev);
> >> +}
> >> +
> >> +static struct virtio_driver virtio_pmem_driver = {
> >> +	.driver.name		= KBUILD_MODNAME,
> >> +	.driver.owner		= THIS_MODULE,
> >> +	.id_table		= id_table,
> >> +	.probe			= virtio_pmem_probe,
> >> +	.remove			= virtio_pmem_remove,
> >> +};
> >> +
> >> +module_virtio_driver(virtio_pmem_driver);
> >> +MODULE_DEVICE_TABLE(virtio, id_table);
> >> +MODULE_DESCRIPTION("Virtio pmem driver");
> >> +MODULE_LICENSE("GPL");
> >> diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
> >> new file mode 100644
> >> index 000000000000..ab1da877575d
> >> --- /dev/null
> >> +++ b/drivers/nvdimm/virtio_pmem.h
> >> @@ -0,0 +1,60 @@
> >> +/* SPDX-License-Identifier: GPL-2.0 */
> >> +/*
> >> + * virtio_pmem.h: virtio pmem Driver
> >> + *
> >> + * Discovers persistent memory range information
> >> + * from host and provides a virtio based flushing
> >> + * interface.
> >> + **/
> >> +
> >> +#ifndef _LINUX_VIRTIO_PMEM_H
> >> +#define _LINUX_VIRTIO_PMEM_H
> >> +
> >> +#include <linux/virtio_ids.h>
> >> +#include <linux/module.h>
> >> +#include <linux/virtio_config.h>
> >> +#include <uapi/linux/virtio_pmem.h>
> >> +#include <linux/libnvdimm.h>
> >> +#include <linux/spinlock.h>
> >> +
> >> +struct virtio_pmem_request {
> >> +	/* Host return status corresponding to flush request */
> >> +	int ret;
> >> +
> >> +	/* command name*/
> >> +	char name[16];
> > 
> > So ... why are we sending string commands and expect native-endianess
> > integers and don't define a proper request/response structure + request
> > types in include/uapi/linux/virtio_pmem.h like
> > 
> > struct virtio_pmem_resp {
> > 	__virtio32 ret;
> > }
> 
> FWIW, I wonder if we should even properly translate return values and
> define types like
> 
> VIRTIO_PMEM_RESP_TYPE_OK	0
> VIRTIO_PMEM_RESP_TYPE_EIO	1

Don't think these are required as only failure and success
return types easy to understand.

Thanks,
Pankaj
> 
> ..
> 
> > 
> > #define VIRTIO_PMEM_REQ_TYPE_FLUSH	1
> > struct virtio_pmem_req {
> > 	__virtio16 type;
> > }
> > 
> > ... and this way we also define a proper endianess format for exchange
> > and keep it extensible
> > 
> > @MST, what's your take on this?
> > 
> > 
> 
> 
> --
> 
> Thanks,
> 
> David / dhildenb
>
Michael S. Tsirkin May 16, 2019, 1:59 p.m. UTC | #8
On Wed, May 15, 2019 at 10:46:00PM +0200, David Hildenbrand wrote:
> > +	vpmem->vdev = vdev;
> > +	vdev->priv = vpmem;
> > +	err = init_vq(vpmem);
> > +	if (err) {
> > +		dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
> > +		goto out_err;
> > +	}
> > +
> > +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +			start, &vpmem->start);
> > +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +			size, &vpmem->size);
> > +
> > +	res.start = vpmem->start;
> > +	res.end   = vpmem->start + vpmem->size-1;
> 
> nit: " - 1;"
> 
> > +	vpmem->nd_desc.provider_name = "virtio-pmem";
> > +	vpmem->nd_desc.module = THIS_MODULE;
> > +
> > +	vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
> > +						&vpmem->nd_desc);
> > +	if (!vpmem->nvdimm_bus) {
> > +		dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n");
> > +		err = -ENXIO;
> > +		goto out_vq;
> > +	}
> > +
> > +	dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
> > +
> > +	ndr_desc.res = &res;
> > +	ndr_desc.numa_node = nid;
> > +	ndr_desc.flush = async_pmem_flush;
> > +	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
> > +	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
> > +	nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
> > +	if (!nd_region) {
> > +		dev_err(&vdev->dev, "failed to create nvdimm region\n");
> > +		err = -ENXIO;
> > +		goto out_nd;
> > +	}
> > +	nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
> > +	return 0;
> > +out_nd:
> > +	nvdimm_bus_unregister(vpmem->nvdimm_bus);
> > +out_vq:
> > +	vdev->config->del_vqs(vdev);
> > +out_err:
> > +	return err;
> > +}
> > +
> > +static void virtio_pmem_remove(struct virtio_device *vdev)
> > +{
> > +	struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
> > +
> > +	nvdimm_bus_unregister(nvdimm_bus);
> > +	vdev->config->del_vqs(vdev);
> > +	vdev->config->reset(vdev);
> > +}
> > +
> > +static struct virtio_driver virtio_pmem_driver = {
> > +	.driver.name		= KBUILD_MODNAME,
> > +	.driver.owner		= THIS_MODULE,
> > +	.id_table		= id_table,
> > +	.probe			= virtio_pmem_probe,
> > +	.remove			= virtio_pmem_remove,
> > +};
> > +
> > +module_virtio_driver(virtio_pmem_driver);
> > +MODULE_DEVICE_TABLE(virtio, id_table);
> > +MODULE_DESCRIPTION("Virtio pmem driver");
> > +MODULE_LICENSE("GPL");
> > diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
> > new file mode 100644
> > index 000000000000..ab1da877575d
> > --- /dev/null
> > +++ b/drivers/nvdimm/virtio_pmem.h
> > @@ -0,0 +1,60 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * virtio_pmem.h: virtio pmem Driver
> > + *
> > + * Discovers persistent memory range information
> > + * from host and provides a virtio based flushing
> > + * interface.
> > + **/
> > +
> > +#ifndef _LINUX_VIRTIO_PMEM_H
> > +#define _LINUX_VIRTIO_PMEM_H
> > +
> > +#include <linux/virtio_ids.h>
> > +#include <linux/module.h>
> > +#include <linux/virtio_config.h>
> > +#include <uapi/linux/virtio_pmem.h>
> > +#include <linux/libnvdimm.h>
> > +#include <linux/spinlock.h>
> > +
> > +struct virtio_pmem_request {
> > +	/* Host return status corresponding to flush request */
> > +	int ret;
> > +
> > +	/* command name*/
> > +	char name[16];
> 
> So ... why are we sending string commands and expect native-endianess
> integers and don't define a proper request/response structure + request
> types in include/uapi/linux/virtio_pmem.h like

passing names could be ok.
I missed the fact we return a native endian int.
Pls fix that.


> 
> struct virtio_pmem_resp {
> 	__virtio32 ret;
> }
> 
> #define VIRTIO_PMEM_REQ_TYPE_FLUSH	1
> struct virtio_pmem_req {
> 	__virtio16 type;
> }
> 
> ... and this way we also define a proper endianess format for exchange
> and keep it extensible
> 
> @MST, what's your take on this?

Extensions can always use feature bits so I don't think
it's a problem.

> 
> -- 
> 
> Thanks,
> 
> David / dhildenb
Jakub Staroń May 17, 2019, 12:12 a.m. UTC | #9
On 5/14/19 7:54 AM, Pankaj Gupta wrote:
> +		if (!list_empty(&vpmem->req_list)) {
> +			req_buf = list_first_entry(&vpmem->req_list,
> +					struct virtio_pmem_request, list);
> +			req_buf->wq_buf_avail = true;
> +			wake_up(&req_buf->wq_buf);
> +			list_del(&req_buf->list);
Yes, this change is the right one, thank you!

> +	 /*
> +	  * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual
> +	  * queue does not have free descriptor. We add the request
> +	  * to req_list and wait for host_ack to wake us up when free
> +	  * slots are available.
> +	  */
> +	while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req,
> +					GFP_ATOMIC)) == -ENOSPC) {
> +
> +		dev_err(&vdev->dev, "failed to send command to virtio pmem" \
> +			"device, no free slots in the virtqueue\n");
> +		req->wq_buf_avail = false;
> +		list_add_tail(&req->list, &vpmem->req_list);
> +		spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> +
> +		/* A host response results in "host_ack" getting called */
> +		wait_event(req->wq_buf, req->wq_buf_avail);
> +		spin_lock_irqsave(&vpmem->pmem_lock, flags);
> +	}
> +	err1 = virtqueue_kick(vpmem->req_vq);
> +	spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> +
> +	/*
> +	 * virtqueue_add_sgs failed with error different than -ENOSPC, we can't
> +	 * do anything about that.
> +	 */
> +	if (err || !err1) {
> +		dev_info(&vdev->dev, "failed to send command to virtio pmem device\n");
> +		err = -EIO;
> +	} else {
> +		/* A host repsonse results in "host_ack" getting called */
> +		wait_event(req->host_acked, req->done);
> +		err = req->ret;
> +I confirm that the failures I was facing with the `-ENOSPC` error path are not present in v9.

Best,
Jakub Staron
Pankaj Gupta May 17, 2019, 5:31 a.m. UTC | #10
> 
> On Wed, May 15, 2019 at 10:46:00PM +0200, David Hildenbrand wrote:
> > > +	vpmem->vdev = vdev;
> > > +	vdev->priv = vpmem;
> > > +	err = init_vq(vpmem);
> > > +	if (err) {
> > > +		dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
> > > +		goto out_err;
> > > +	}
> > > +
> > > +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > > +			start, &vpmem->start);
> > > +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > > +			size, &vpmem->size);
> > > +
> > > +	res.start = vpmem->start;
> > > +	res.end   = vpmem->start + vpmem->size-1;
> > 
> > nit: " - 1;"
> > 
> > > +	vpmem->nd_desc.provider_name = "virtio-pmem";
> > > +	vpmem->nd_desc.module = THIS_MODULE;
> > > +
> > > +	vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
> > > +						&vpmem->nd_desc);
> > > +	if (!vpmem->nvdimm_bus) {
> > > +		dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n");
> > > +		err = -ENXIO;
> > > +		goto out_vq;
> > > +	}
> > > +
> > > +	dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
> > > +
> > > +	ndr_desc.res = &res;
> > > +	ndr_desc.numa_node = nid;
> > > +	ndr_desc.flush = async_pmem_flush;
> > > +	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
> > > +	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
> > > +	nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
> > > +	if (!nd_region) {
> > > +		dev_err(&vdev->dev, "failed to create nvdimm region\n");
> > > +		err = -ENXIO;
> > > +		goto out_nd;
> > > +	}
> > > +	nd_region->provider_data =
> > > dev_to_virtio(nd_region->dev.parent->parent);
> > > +	return 0;
> > > +out_nd:
> > > +	nvdimm_bus_unregister(vpmem->nvdimm_bus);
> > > +out_vq:
> > > +	vdev->config->del_vqs(vdev);
> > > +out_err:
> > > +	return err;
> > > +}
> > > +
> > > +static void virtio_pmem_remove(struct virtio_device *vdev)
> > > +{
> > > +	struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
> > > +
> > > +	nvdimm_bus_unregister(nvdimm_bus);
> > > +	vdev->config->del_vqs(vdev);
> > > +	vdev->config->reset(vdev);
> > > +}
> > > +
> > > +static struct virtio_driver virtio_pmem_driver = {
> > > +	.driver.name		= KBUILD_MODNAME,
> > > +	.driver.owner		= THIS_MODULE,
> > > +	.id_table		= id_table,
> > > +	.probe			= virtio_pmem_probe,
> > > +	.remove			= virtio_pmem_remove,
> > > +};
> > > +
> > > +module_virtio_driver(virtio_pmem_driver);
> > > +MODULE_DEVICE_TABLE(virtio, id_table);
> > > +MODULE_DESCRIPTION("Virtio pmem driver");
> > > +MODULE_LICENSE("GPL");
> > > diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
> > > new file mode 100644
> > > index 000000000000..ab1da877575d
> > > --- /dev/null
> > > +++ b/drivers/nvdimm/virtio_pmem.h
> > > @@ -0,0 +1,60 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +/*
> > > + * virtio_pmem.h: virtio pmem Driver
> > > + *
> > > + * Discovers persistent memory range information
> > > + * from host and provides a virtio based flushing
> > > + * interface.
> > > + **/
> > > +
> > > +#ifndef _LINUX_VIRTIO_PMEM_H
> > > +#define _LINUX_VIRTIO_PMEM_H
> > > +
> > > +#include <linux/virtio_ids.h>
> > > +#include <linux/module.h>
> > > +#include <linux/virtio_config.h>
> > > +#include <uapi/linux/virtio_pmem.h>
> > > +#include <linux/libnvdimm.h>
> > > +#include <linux/spinlock.h>
> > > +
> > > +struct virtio_pmem_request {
> > > +	/* Host return status corresponding to flush request */
> > > +	int ret;
> > > +
> > > +	/* command name*/
> > > +	char name[16];
> > 
> > So ... why are we sending string commands and expect native-endianess
> > integers and don't define a proper request/response structure + request
> > types in include/uapi/linux/virtio_pmem.h like
> 
> passing names could be ok.
> I missed the fact we return a native endian int.
> Pls fix that.

Sure. will fix this.

> 
> 
> > 
> > struct virtio_pmem_resp {
> > 	__virtio32 ret;
> > }
> > 
> > #define VIRTIO_PMEM_REQ_TYPE_FLUSH	1
> > struct virtio_pmem_req {
> > 	__virtio16 type;
> > }
> > 
> > ... and this way we also define a proper endianess format for exchange
> > and keep it extensible
> > 
> > @MST, what's your take on this?
> 
> Extensions can always use feature bits so I don't think
> it's a problem.

That was exactly my thought when I implemented this. Though I am
fine with separate structures for request/response and I made the
change. 

Thank you for all the comments.

Best regards,
Pankaj 
> > 
> > --
> > 
> > Thanks,
> > 
> > David / dhildenb
> 
>
Pankaj Gupta May 17, 2019, 5:35 a.m. UTC | #11
Hi Jakub,

> 
> On 5/14/19 7:54 AM, Pankaj Gupta wrote:
> > +		if (!list_empty(&vpmem->req_list)) {
> > +			req_buf = list_first_entry(&vpmem->req_list,
> > +					struct virtio_pmem_request, list);
> > +			req_buf->wq_buf_avail = true;
> > +			wake_up(&req_buf->wq_buf);
> > +			list_del(&req_buf->list);
> Yes, this change is the right one, thank you!

Thank you for the confirmation.

> 
> > +	 /*
> > +	  * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual
> > +	  * queue does not have free descriptor. We add the request
> > +	  * to req_list and wait for host_ack to wake us up when free
> > +	  * slots are available.
> > +	  */
> > +	while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req,
> > +					GFP_ATOMIC)) == -ENOSPC) {
> > +
> > +		dev_err(&vdev->dev, "failed to send command to virtio pmem" \
> > +			"device, no free slots in the virtqueue\n");
> > +		req->wq_buf_avail = false;
> > +		list_add_tail(&req->list, &vpmem->req_list);
> > +		spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> > +
> > +		/* A host response results in "host_ack" getting called */
> > +		wait_event(req->wq_buf, req->wq_buf_avail);
> > +		spin_lock_irqsave(&vpmem->pmem_lock, flags);
> > +	}
> > +	err1 = virtqueue_kick(vpmem->req_vq);
> > +	spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> > +
> > +	/*
> > +	 * virtqueue_add_sgs failed with error different than -ENOSPC, we can't
> > +	 * do anything about that.
> > +	 */
> > +	if (err || !err1) {
> > +		dev_info(&vdev->dev, "failed to send command to virtio pmem device\n");
> > +		err = -EIO;
> > +	} else {
> > +		/* A host repsonse results in "host_ack" getting called */
> > +		wait_event(req->host_acked, req->done);
> > +		err = req->ret;
> > +I confirm that the failures I was facing with the `-ENOSPC` error path are
> > not present in v9.

Can I take it your reviewed/acked-by? or tested-by tag? for the virtio patch :)

Thank you,
Pankaj

> 
> Best,
> Jakub Staron
> 
>
Jakub Staroń May 18, 2019, 1:10 a.m. UTC | #12
On 5/16/19 10:35 PM, Pankaj Gupta wrote:
> Can I take it your reviewed/acked-by? or tested-by tag? for the virtio patch :)I don't feel that I have enough expertise to give the reviewed-by tag, but you can
take my acked-by + tested-by.

Acked-by: Jakub Staron <jstaron@google.com>
Tested-by: Jakub Staron <jstaron@google.com>

No kernel panics/stalls encountered during testing this patches (v9) with QEMU + xfstests.
Some CPU stalls encountered while testing with crosvm instead of QEMU with xfstests
(test generic/464) but no repro for QEMU, so the fault may be on the side of crosvm.


The dump for the crosvm/xfstests stall:
[ 2504.175276] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[ 2504.176681] rcu:     0-...!: (1 GPs behind) idle=9b2/1/0x4000000000000000 softirq=1089198/1089202 fqs=0 
[ 2504.178270] rcu:     2-...!: (1 ticks this GP) idle=cfe/1/0x4000000000000002 softirq=1055108/1055110 fqs=0 
[ 2504.179802] rcu:     3-...!: (1 GPs behind) idle=1d6/1/0x4000000000000002 softirq=1046798/1046802 fqs=0 
[ 2504.181215] rcu:     4-...!: (2 ticks this GP) idle=522/1/0x4000000000000002 softirq=1249063/1249064 fqs=0 
[ 2504.182625] rcu:     5-...!: (1 GPs behind) idle=6da/1/0x4000000000000000 softirq=1131036/1131047 fqs=0 
[ 2504.183955]  (detected by 3, t=0 jiffies, g=1232529, q=1370)
[ 2504.184762] Sending NMI from CPU 3 to CPUs 0:
[ 2504.186400] NMI backtrace for cpu 0
[ 2504.186401] CPU: 0 PID: 6670 Comm: 464 Not tainted 5.1.0+ #1
[ 2504.186401] Hardware name: ChromiumOS crosvm, BIOS 0 
[ 2504.186402] RIP: 0010:queued_spin_lock_slowpath+0x1c/0x1e0
[ 2504.186402] Code: e7 89 c8 f0 44 0f b1 07 39 c1 75 dc f3 c3 0f 1f 44 00 00 ba 01 00 00 00 8b 07 85 c0 75 0a f0 0f b1 17 85 c0 75 f2 f3 c3 f3 90 <eb> ec 81 fe 00 01 00 00 0f 84 ab 00 00 00 81 e6 00 ff ff ff 75 44
[ 2504.186403] RSP: 0018:ffffc90000003ee8 EFLAGS: 00000002
[ 2504.186404] RAX: 0000000000000001 RBX: 0000000000000246 RCX: 0000000000404044
[ 2504.186404] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffffffff8244a280
[ 2504.186405] RBP: ffffffff8244a280 R08: 00000000000f4200 R09: 0000024709ed6c32
[ 2504.186405] R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff8244a280
[ 2504.186405] R13: 0000000000000009 R14: 0000000000000009 R15: 0000000000000000
[ 2504.186406] FS:  0000000000000000(0000) GS:ffff8880cc600000(0000) knlGS:0000000000000000
[ 2504.186406] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2504.186406] CR2: 00007efd6b0f15d8 CR3: 000000000260a006 CR4: 0000000000360ef0
[ 2504.186407] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2504.186407] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 2504.186407] Call Trace:
[ 2504.186408]  <IRQ>
[ 2504.186408]  _raw_spin_lock_irqsave+0x1d/0x30
[ 2504.186408]  rcu_core+0x3b6/0x740
[ 2504.186408]  ? __hrtimer_run_queues+0x133/0x280
[ 2504.186409]  ? recalibrate_cpu_khz+0x10/0x10
[ 2504.186409]  __do_softirq+0xd8/0x2e4
[ 2504.186409]  irq_exit+0xa3/0xb0
[ 2504.186410]  smp_apic_timer_interrupt+0x67/0x120
[ 2504.186410]  apic_timer_interrupt+0xf/0x20
[ 2504.186410]  </IRQ>
[ 2504.186410] RIP: 0010:unmap_page_range+0x47a/0x9b0
[ 2504.186411] Code: 0f 46 46 10 49 39 6e 18 49 89 46 10 48 89 e8 49 0f 43 46 18 41 80 4e 20 08 4d 85 c9 49 89 46 18 0f 84 68 ff ff ff 49 8b 51 08 <48> 8d 42 ff 83 e2 01 49 0f 44 c1 f6 40 18 01 75 38 48 ba ff 0f 00
[ 2504.186411] RSP: 0018:ffffc900036cbcc8 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
[ 2504.186412] RAX: ffffffffffffffff RBX: 800000003751d045 RCX: 0000000000000001
[ 2504.186413] RDX: ffffea0002e09288 RSI: 000000000269b000 RDI: ffff8880b6525e40
[ 2504.186413] RBP: 000000000269c000 R08: 0000000000000000 R09: ffffea0000dd4740
[ 2504.186413] R10: ffffea0001755700 R11: ffff8880cc62d120 R12: 0000000002794000
[ 2504.186414] R13: 000000000269b000 R14: ffffc900036cbdf0 R15: ffff8880572434d8
[ 2504.186414]  ? unmap_page_range+0x420/0x9b0
[ 2504.186414]  ? release_pages+0x175/0x390
[ 2504.186414]  unmap_vmas+0x7c/0xe0
[ 2504.186415]  exit_mmap+0xa4/0x190
[ 2504.186415]  mmput+0x3b/0x100
[ 2504.186415]  do_exit+0x276/0xc10
[ 2504.186415]  do_group_exit+0x35/0xa0
[ 2504.186415]  __x64_sys_exit_group+0xf/0x10
[ 2504.186416]  do_syscall_64+0x43/0x120
[ 2504.186416]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 2504.186416] RIP: 0033:0x7efd6ae10618
[ 2504.186416] Code: Bad RIP value.
[ 2504.186417] RSP: 002b:00007ffcac9bde38 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
[ 2504.186417] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007efd6ae10618
[ 2504.186418] RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
[ 2504.186418] RBP: 00007efd6b0ed8e0 R08: 00000000000000e7 R09: ffffffffffffff98
[ 2504.186418] R10: 00007ffcac9bddb8 R11: 0000000000000246 R12: 00007efd6b0ed8e0
[ 2504.186419] R13: 00007efd6b0f2c20 R14: 0000000000000060 R15: 000000000070e705
[ 2504.186421] NMI backtrace for cpu 3
[ 2504.226980] CPU: 3 PID: 6596 Comm: xfs_io Not tainted 5.1.0+ #1
[ 2504.227661] Hardware name: ChromiumOS crosvm, BIOS 0 
[ 2504.228261] Call Trace:
[ 2504.228552]  <IRQ>
[ 2504.228795]  dump_stack+0x46/0x5b
[ 2504.229180]  nmi_cpu_backtrace+0x89/0x90
[ 2504.229649]  ? lapic_can_unplug_cpu+0x90/0x90
[ 2504.230157]  nmi_trigger_cpumask_backtrace+0x82/0xc0
[ 2504.230751]  rcu_dump_cpu_stacks+0x8b/0xb7
[ 2504.231222]  rcu_sched_clock_irq+0x6f6/0x720
[ 2504.231726]  ? tick_sched_do_timer+0x50/0x50
[ 2504.232214]  update_process_times+0x23/0x50
[ 2504.232693]  tick_sched_handle+0x2f/0x40
[ 2504.233144]  tick_sched_timer+0x32/0x70
[ 2504.233594]  __hrtimer_run_queues+0x103/0x280
[ 2504.234092]  hrtimer_interrupt+0xe0/0x240
[ 2504.234580]  smp_apic_timer_interrupt+0x5d/0x120
[ 2504.235152]  apic_timer_interrupt+0xf/0x20
[ 2504.235627]  </IRQ>
[ 2504.235879] RIP: 0010:__memcpy_flushcache+0x4b/0x180
[ 2504.236452] Code: 8d 5d e0 4c 8d 62 20 48 89 f7 48 29 d7 48 89 d9 48 83 e1 e0 4c 01 e1 48 8d 04 17 4c 8b 02 4c 8b 4a 08 4c 8b 52 10 4c 8b 5a 18 <4c> 0f c3 00 4c 0f c3 48 08 4c 0f c3 50 10 4c 0f c3 58 18 48 83 c2
[ 2504.238592] RSP: 0018:ffffc90003ae38e8 EFLAGS: 00010286 ORIG_RAX: ffffffffffffff13
[ 2504.239467] RAX: ffff888341800000 RBX: 0000000000000fe0 RCX: ffff88801bd22000
[ 2504.240277] RDX: ffff88801bd21000 RSI: ffff888341800000 RDI: 0000000325adf000
[ 2504.241092] RBP: 0000000000001000 R08: cdcdcdcdcdcdcdcd R09: cdcdcdcdcdcdcdcd
[ 2504.241908] R10: cdcdcdcdcdcdcdcd R11: cdcdcdcdcdcdcdcd R12: ffff88801bd21020
[ 2504.242751] R13: ffff8880b916b600 R14: ffff888341800000 R15: ffffea00006f4840
[ 2504.243602]  write_pmem+0x61/0x90
[ 2504.244002]  pmem_do_bvec+0x178/0x2c0
[ 2504.244469]  ? chksum_update+0xe/0x20
[ 2504.244908]  pmem_make_request+0xf7/0x270
[ 2504.245509]  generic_make_request+0x199/0x3f0
[ 2504.246179]  ? submit_bio+0x67/0x130
[ 2504.246710]  submit_bio+0x67/0x130
[ 2504.247117]  ext4_io_submit+0x44/0x50
[ 2504.247556]  ext4_writepages+0x621/0xe80
[ 2504.248028]  ? 0xffffffff81000000
[ 2504.248418]  ? do_writepages+0x46/0xd0
[ 2504.248880]  ? ext4_mark_inode_dirty+0x1d0/0x1d0
[ 2504.249417]  do_writepages+0x46/0xd0
[ 2504.249833]  ? release_pages+0x175/0x390
[ 2504.250290]  ? __filemap_fdatawrite_range+0x7c/0xb0
[ 2504.250879]  __filemap_fdatawrite_range+0x7c/0xb0
[ 2504.251427]  ext4_release_file+0x67/0xa0
[ 2504.251897]  __fput+0xb1/0x220
[ 2504.252260]  task_work_run+0x79/0xa0
[ 2504.252676]  do_exit+0x2ca/0xc10
[ 2504.253063]  ? __switch_to_asm+0x40/0x70
[ 2504.253530]  ? __switch_to_asm+0x34/0x70
[ 2504.253995]  ? __switch_to_asm+0x40/0x70
[ 2504.254446]  do_group_exit+0x35/0xa0
[ 2504.254865]  get_signal+0x14e/0x7a0
[ 2504.255281]  ? __switch_to_asm+0x34/0x70
[ 2504.255749]  ? __switch_to_asm+0x40/0x70
[ 2504.256224]  do_signal+0x2b/0x5e0
[ 2504.256619]  ? __switch_to_asm+0x40/0x70
[ 2504.257086]  ? __switch_to_asm+0x34/0x70
[ 2504.257552]  ? __switch_to_asm+0x40/0x70
[ 2504.258022]  ? __switch_to_asm+0x34/0x70
[ 2504.258488]  ? __schedule+0x253/0x530
[ 2504.258943]  ? __switch_to_asm+0x34/0x70
[ 2504.259398]  exit_to_usermode_loop+0x87/0xa0
[ 2504.259900]  do_syscall_64+0xf7/0x120
[ 2504.260326]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 2504.260923] RIP: 0033:0x7faf347e28bd
[ 2504.261348] Code: Bad RIP value.
[ 2504.261727] RSP: 002b:00007faf33fc5f40 EFLAGS: 00000293 ORIG_RAX: 0000000000000022
[ 2504.262594] RAX: fffffffffffffdfe RBX: 0000000000000000 RCX: 00007faf347e28bd
[ 2504.263416] RDX: 8b9da1f4246cdb38 RSI: 0000000000000000 RDI: 0000000000000000
[ 2504.264215] RBP: 0000000000000000 R08: 00007faf33fc6700 R09: 00007faf33fc6700
[ 2504.265061] R10: 000000000000012d R11: 0000000000000293 R12: 00007ffdf142327e
[ 2504.266082] R13: 00007ffdf142327f R14: 00007faf337c6000 R15: 0000000000000003

Arch: x86_64
Kernel: stable top with virtio-pmem v9 patches applied
Distro: Debian Stretch

But as I said, it may be just a problem with crosvm.


Thank you,
Jakub Staron
Pankaj Gupta May 20, 2019, 3:46 a.m. UTC | #13
> On 5/16/19 10:35 PM, Pankaj Gupta wrote:
> > Can I take it your reviewed/acked-by? or tested-by tag? for the virtio
> > patch :)I don't feel that I have enough expertise to give the reviewed-by
> > tag, but you can
> take my acked-by + tested-by.
> 
> Acked-by: Jakub Staron <jstaron@google.com>
> Tested-by: Jakub Staron <jstaron@google.com>
> 
> No kernel panics/stalls encountered during testing this patches (v9) with
> QEMU + xfstests.

Thank you for testing and confirming the results. I will add your tested &
acked-by in v10.

> Some CPU stalls encountered while testing with crosvm instead of QEMU with
> xfstests
> (test generic/464) but no repro for QEMU, so the fault may be on the side of
> crosvm.

yes, looks like crosvm related as we did not see any of this in my and your
testing with Qemu. 

> 
> 
> The dump for the crosvm/xfstests stall:
> [ 2504.175276] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
> [ 2504.176681] rcu:     0-...!: (1 GPs behind) idle=9b2/1/0x4000000000000000
> softirq=1089198/1089202 fqs=0
> [ 2504.178270] rcu:     2-...!: (1 ticks this GP)
> idle=cfe/1/0x4000000000000002 softirq=1055108/1055110 fqs=0
> [ 2504.179802] rcu:     3-...!: (1 GPs behind) idle=1d6/1/0x4000000000000002
> softirq=1046798/1046802 fqs=0
> [ 2504.181215] rcu:     4-...!: (2 ticks this GP)
> idle=522/1/0x4000000000000002 softirq=1249063/1249064 fqs=0
> [ 2504.182625] rcu:     5-...!: (1 GPs behind) idle=6da/1/0x4000000000000000
> softirq=1131036/1131047 fqs=0
> [ 2504.183955]  (detected by 3, t=0 jiffies, g=1232529, q=1370)
> [ 2504.184762] Sending NMI from CPU 3 to CPUs 0:
> [ 2504.186400] NMI backtrace for cpu 0
> [ 2504.186401] CPU: 0 PID: 6670 Comm: 464 Not tainted 5.1.0+ #1
> [ 2504.186401] Hardware name: ChromiumOS crosvm, BIOS 0
> [ 2504.186402] RIP: 0010:queued_spin_lock_slowpath+0x1c/0x1e0
> [ 2504.186402] Code: e7 89 c8 f0 44 0f b1 07 39 c1 75 dc f3 c3 0f 1f 44 00 00
> ba 01 00 00 00 8b 07 85 c0 75 0a f0 0f b1 17 85 c0 75 f2 f3 c3 f3 90 <eb> ec
> 81 fe 00 01 00 00 0f 84 ab 00 00 00 81 e6 00 ff ff ff 75 44
> [ 2504.186403] RSP: 0018:ffffc90000003ee8 EFLAGS: 00000002
> [ 2504.186404] RAX: 0000000000000001 RBX: 0000000000000246 RCX:
> 0000000000404044
> [ 2504.186404] RDX: 0000000000000001 RSI: 0000000000000001 RDI:
> ffffffff8244a280
> [ 2504.186405] RBP: ffffffff8244a280 R08: 00000000000f4200 R09:
> 0000024709ed6c32
> [ 2504.186405] R10: 0000000000000000 R11: 0000000000000001 R12:
> ffffffff8244a280
> [ 2504.186405] R13: 0000000000000009 R14: 0000000000000009 R15:
> 0000000000000000
> [ 2504.186406] FS:  0000000000000000(0000) GS:ffff8880cc600000(0000)
> knlGS:0000000000000000
> [ 2504.186406] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 2504.186406] CR2: 00007efd6b0f15d8 CR3: 000000000260a006 CR4:
> 0000000000360ef0
> [ 2504.186407] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [ 2504.186407] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> 0000000000000400
> [ 2504.186407] Call Trace:
> [ 2504.186408]  <IRQ>
> [ 2504.186408]  _raw_spin_lock_irqsave+0x1d/0x30
> [ 2504.186408]  rcu_core+0x3b6/0x740
> [ 2504.186408]  ? __hrtimer_run_queues+0x133/0x280
> [ 2504.186409]  ? recalibrate_cpu_khz+0x10/0x10
> [ 2504.186409]  __do_softirq+0xd8/0x2e4
> [ 2504.186409]  irq_exit+0xa3/0xb0
> [ 2504.186410]  smp_apic_timer_interrupt+0x67/0x120
> [ 2504.186410]  apic_timer_interrupt+0xf/0x20
> [ 2504.186410]  </IRQ>
> [ 2504.186410] RIP: 0010:unmap_page_range+0x47a/0x9b0
> [ 2504.186411] Code: 0f 46 46 10 49 39 6e 18 49 89 46 10 48 89 e8 49 0f 43 46
> 18 41 80 4e 20 08 4d 85 c9 49 89 46 18 0f 84 68 ff ff ff 49 8b 51 08 <48> 8d
> 42 ff 83 e2 01 49 0f 44 c1 f6 40 18 01 75 38 48 ba ff 0f 00
> [ 2504.186411] RSP: 0018:ffffc900036cbcc8 EFLAGS: 00000282 ORIG_RAX:
> ffffffffffffff13
> [ 2504.186412] RAX: ffffffffffffffff RBX: 800000003751d045 RCX:
> 0000000000000001
> [ 2504.186413] RDX: ffffea0002e09288 RSI: 000000000269b000 RDI:
> ffff8880b6525e40
> [ 2504.186413] RBP: 000000000269c000 R08: 0000000000000000 R09:
> ffffea0000dd4740
> [ 2504.186413] R10: ffffea0001755700 R11: ffff8880cc62d120 R12:
> 0000000002794000
> [ 2504.186414] R13: 000000000269b000 R14: ffffc900036cbdf0 R15:
> ffff8880572434d8
> [ 2504.186414]  ? unmap_page_range+0x420/0x9b0
> [ 2504.186414]  ? release_pages+0x175/0x390
> [ 2504.186414]  unmap_vmas+0x7c/0xe0
> [ 2504.186415]  exit_mmap+0xa4/0x190
> [ 2504.186415]  mmput+0x3b/0x100
> [ 2504.186415]  do_exit+0x276/0xc10
> [ 2504.186415]  do_group_exit+0x35/0xa0
> [ 2504.186415]  __x64_sys_exit_group+0xf/0x10
> [ 2504.186416]  do_syscall_64+0x43/0x120
> [ 2504.186416]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [ 2504.186416] RIP: 0033:0x7efd6ae10618
> [ 2504.186416] Code: Bad RIP value.
> [ 2504.186417] RSP: 002b:00007ffcac9bde38 EFLAGS: 00000246 ORIG_RAX:
> 00000000000000e7
> [ 2504.186417] RAX: ffffffffffffffda RBX: 0000000000000000 RCX:
> 00007efd6ae10618
> [ 2504.186418] RDX: 0000000000000000 RSI: 000000000000003c RDI:
> 0000000000000000
> [ 2504.186418] RBP: 00007efd6b0ed8e0 R08: 00000000000000e7 R09:
> ffffffffffffff98
> [ 2504.186418] R10: 00007ffcac9bddb8 R11: 0000000000000246 R12:
> 00007efd6b0ed8e0
> [ 2504.186419] R13: 00007efd6b0f2c20 R14: 0000000000000060 R15:
> 000000000070e705
> [ 2504.186421] NMI backtrace for cpu 3
> [ 2504.226980] CPU: 3 PID: 6596 Comm: xfs_io Not tainted 5.1.0+ #1
> [ 2504.227661] Hardware name: ChromiumOS crosvm, BIOS 0
> [ 2504.228261] Call Trace:
> [ 2504.228552]  <IRQ>
> [ 2504.228795]  dump_stack+0x46/0x5b
> [ 2504.229180]  nmi_cpu_backtrace+0x89/0x90
> [ 2504.229649]  ? lapic_can_unplug_cpu+0x90/0x90
> [ 2504.230157]  nmi_trigger_cpumask_backtrace+0x82/0xc0
> [ 2504.230751]  rcu_dump_cpu_stacks+0x8b/0xb7
> [ 2504.231222]  rcu_sched_clock_irq+0x6f6/0x720
> [ 2504.231726]  ? tick_sched_do_timer+0x50/0x50
> [ 2504.232214]  update_process_times+0x23/0x50
> [ 2504.232693]  tick_sched_handle+0x2f/0x40
> [ 2504.233144]  tick_sched_timer+0x32/0x70
> [ 2504.233594]  __hrtimer_run_queues+0x103/0x280
> [ 2504.234092]  hrtimer_interrupt+0xe0/0x240
> [ 2504.234580]  smp_apic_timer_interrupt+0x5d/0x120
> [ 2504.235152]  apic_timer_interrupt+0xf/0x20
> [ 2504.235627]  </IRQ>
> [ 2504.235879] RIP: 0010:__memcpy_flushcache+0x4b/0x180
> [ 2504.236452] Code: 8d 5d e0 4c 8d 62 20 48 89 f7 48 29 d7 48 89 d9 48 83 e1
> e0 4c 01 e1 48 8d 04 17 4c 8b 02 4c 8b 4a 08 4c 8b 52 10 4c 8b 5a 18 <4c> 0f
> c3 00 4c 0f c3 48 08 4c 0f c3 50 10 4c 0f c3 58 18 48 83 c2
> [ 2504.238592] RSP: 0018:ffffc90003ae38e8 EFLAGS: 00010286 ORIG_RAX:
> ffffffffffffff13
> [ 2504.239467] RAX: ffff888341800000 RBX: 0000000000000fe0 RCX:
> ffff88801bd22000
> [ 2504.240277] RDX: ffff88801bd21000 RSI: ffff888341800000 RDI:
> 0000000325adf000
> [ 2504.241092] RBP: 0000000000001000 R08: cdcdcdcdcdcdcdcd R09:
> cdcdcdcdcdcdcdcd
> [ 2504.241908] R10: cdcdcdcdcdcdcdcd R11: cdcdcdcdcdcdcdcd R12:
> ffff88801bd21020
> [ 2504.242751] R13: ffff8880b916b600 R14: ffff888341800000 R15:
> ffffea00006f4840
> [ 2504.243602]  write_pmem+0x61/0x90
> [ 2504.244002]  pmem_do_bvec+0x178/0x2c0
> [ 2504.244469]  ? chksum_update+0xe/0x20
> [ 2504.244908]  pmem_make_request+0xf7/0x270
> [ 2504.245509]  generic_make_request+0x199/0x3f0
> [ 2504.246179]  ? submit_bio+0x67/0x130
> [ 2504.246710]  submit_bio+0x67/0x130
> [ 2504.247117]  ext4_io_submit+0x44/0x50
> [ 2504.247556]  ext4_writepages+0x621/0xe80
> [ 2504.248028]  ? 0xffffffff81000000
> [ 2504.248418]  ? do_writepages+0x46/0xd0
> [ 2504.248880]  ? ext4_mark_inode_dirty+0x1d0/0x1d0
> [ 2504.249417]  do_writepages+0x46/0xd0
> [ 2504.249833]  ? release_pages+0x175/0x390
> [ 2504.250290]  ? __filemap_fdatawrite_range+0x7c/0xb0
> [ 2504.250879]  __filemap_fdatawrite_range+0x7c/0xb0
> [ 2504.251427]  ext4_release_file+0x67/0xa0
> [ 2504.251897]  __fput+0xb1/0x220
> [ 2504.252260]  task_work_run+0x79/0xa0
> [ 2504.252676]  do_exit+0x2ca/0xc10
> [ 2504.253063]  ? __switch_to_asm+0x40/0x70
> [ 2504.253530]  ? __switch_to_asm+0x34/0x70
> [ 2504.253995]  ? __switch_to_asm+0x40/0x70
> [ 2504.254446]  do_group_exit+0x35/0xa0
> [ 2504.254865]  get_signal+0x14e/0x7a0
> [ 2504.255281]  ? __switch_to_asm+0x34/0x70
> [ 2504.255749]  ? __switch_to_asm+0x40/0x70
> [ 2504.256224]  do_signal+0x2b/0x5e0
> [ 2504.256619]  ? __switch_to_asm+0x40/0x70
> [ 2504.257086]  ? __switch_to_asm+0x34/0x70
> [ 2504.257552]  ? __switch_to_asm+0x40/0x70
> [ 2504.258022]  ? __switch_to_asm+0x34/0x70
> [ 2504.258488]  ? __schedule+0x253/0x530
> [ 2504.258943]  ? __switch_to_asm+0x34/0x70
> [ 2504.259398]  exit_to_usermode_loop+0x87/0xa0
> [ 2504.259900]  do_syscall_64+0xf7/0x120
> [ 2504.260326]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [ 2504.260923] RIP: 0033:0x7faf347e28bd
> [ 2504.261348] Code: Bad RIP value.
> [ 2504.261727] RSP: 002b:00007faf33fc5f40 EFLAGS: 00000293 ORIG_RAX:
> 0000000000000022
> [ 2504.262594] RAX: fffffffffffffdfe RBX: 0000000000000000 RCX:
> 00007faf347e28bd
> [ 2504.263416] RDX: 8b9da1f4246cdb38 RSI: 0000000000000000 RDI:
> 0000000000000000
> [ 2504.264215] RBP: 0000000000000000 R08: 00007faf33fc6700 R09:
> 00007faf33fc6700
> [ 2504.265061] R10: 000000000000012d R11: 0000000000000293 R12:
> 00007ffdf142327e
> [ 2504.266082] R13: 00007ffdf142327f R14: 00007faf337c6000 R15:
> 0000000000000003
> 
> Arch: x86_64
> Kernel: stable top with virtio-pmem v9 patches applied
> Distro: Debian Stretch
> 
> But as I said, it may be just a problem with crosvm.
Right.

Thanks,
Pankaj
> 
> 
> Thank you,
> Jakub Staron
> 
>
Pankaj Gupta May 20, 2019, 3:47 a.m. UTC | #14
> 
> 
> > On 5/16/19 10:35 PM, Pankaj Gupta wrote:
> > > Can I take it your reviewed/acked-by? or tested-by tag? for the virtio
> > > patch :)I don't feel that I have enough expertise to give the reviewed-by
> > > tag, but you can
> > take my acked-by + tested-by.
> > 
> > Acked-by: Jakub Staron <jstaron@google.com>
> > Tested-by: Jakub Staron <jstaron@google.com>
> > 
> > No kernel panics/stalls encountered during testing this patches (v9) with
> > QEMU + xfstests.
> 
> Thank you for testing and confirming the results. I will add your tested &
> acked-by in v10.
> 
> > Some CPU stalls encountered while testing with crosvm instead of QEMU with
> > xfstests
> > (test generic/464) but no repro for QEMU, so the fault may be on the side
> > of
> > crosvm.
> 
> yes, looks like crosvm related as we did not see any of this in my and your
> testing with Qemu.

Also, they don't seem to be related with virtio-pmem.

Thanks,
Pankaj
diff mbox series

Patch

diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 6f2a088afad6..cefe233e0b52 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -5,6 +5,7 @@  obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_OF_PMEM) += of_pmem.o
+obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o
 
 nd_pmem-y := pmem.o
 
diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
new file mode 100644
index 000000000000..dd95cfecfe4c
--- /dev/null
+++ b/drivers/nvdimm/nd_virtio.c
@@ -0,0 +1,126 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio_pmem.c: Virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and provides a virtio based flushing
+ * interface.
+ */
+#include "virtio_pmem.h"
+#include "nd.h"
+
+ /* The interrupt handler */
+void host_ack(struct virtqueue *vq)
+{
+	struct virtio_pmem *vpmem = vq->vdev->priv;
+	struct virtio_pmem_request *req, *req_buf;
+	unsigned long flags;
+	unsigned int len;
+
+	spin_lock_irqsave(&vpmem->pmem_lock, flags);
+	while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+		req->done = true;
+		wake_up(&req->host_acked);
+
+		if (!list_empty(&vpmem->req_list)) {
+			req_buf = list_first_entry(&vpmem->req_list,
+					struct virtio_pmem_request, list);
+			req_buf->wq_buf_avail = true;
+			wake_up(&req_buf->wq_buf);
+			list_del(&req_buf->list);
+		}
+	}
+	spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+}
+EXPORT_SYMBOL_GPL(host_ack);
+
+ /* The request submission function */
+int virtio_pmem_flush(struct nd_region *nd_region)
+{
+	struct virtio_device *vdev = nd_region->provider_data;
+	struct virtio_pmem *vpmem = vdev->priv;
+	struct scatterlist *sgs[2], sg, ret;
+	struct virtio_pmem_request *req;
+	unsigned long flags;
+	int err, err1;
+
+	might_sleep();
+	req = kmalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	req->done = false;
+	strcpy(req->name, "FLUSH");
+	init_waitqueue_head(&req->host_acked);
+	init_waitqueue_head(&req->wq_buf);
+	INIT_LIST_HEAD(&req->list);
+	sg_init_one(&sg, req->name, strlen(req->name));
+	sgs[0] = &sg;
+	sg_init_one(&ret, &req->ret, sizeof(req->ret));
+	sgs[1] = &ret;
+
+	spin_lock_irqsave(&vpmem->pmem_lock, flags);
+	 /*
+	  * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual
+	  * queue does not have free descriptor. We add the request
+	  * to req_list and wait for host_ack to wake us up when free
+	  * slots are available.
+	  */
+	while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req,
+					GFP_ATOMIC)) == -ENOSPC) {
+
+		dev_err(&vdev->dev, "failed to send command to virtio pmem" \
+			"device, no free slots in the virtqueue\n");
+		req->wq_buf_avail = false;
+		list_add_tail(&req->list, &vpmem->req_list);
+		spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+
+		/* A host response results in "host_ack" getting called */
+		wait_event(req->wq_buf, req->wq_buf_avail);
+		spin_lock_irqsave(&vpmem->pmem_lock, flags);
+	}
+	err1 = virtqueue_kick(vpmem->req_vq);
+	spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+
+	/*
+	 * virtqueue_add_sgs failed with error different than -ENOSPC, we can't
+	 * do anything about that.
+	 */
+	if (err || !err1) {
+		dev_info(&vdev->dev, "failed to send command to virtio pmem device\n");
+		err = -EIO;
+	} else {
+		/* A host repsonse results in "host_ack" getting called */
+		wait_event(req->host_acked, req->done);
+		err = req->ret;
+	}
+
+	kfree(req);
+	return err;
+};
+
+/* The asynchronous flush callback function */
+int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
+{
+	/* Create child bio for asynchronous flush and chain with
+	 * parent bio. Otherwise directly call nd_region flush.
+	 */
+	if (bio && bio->bi_iter.bi_sector != -1) {
+		struct bio *child = bio_alloc(GFP_ATOMIC, 0);
+
+		if (!child)
+			return -ENOMEM;
+		bio_copy_dev(child, bio);
+		child->bi_opf = REQ_PREFLUSH;
+		child->bi_iter.bi_sector = -1;
+		bio_chain(child, bio);
+		submit_bio(child);
+		return 0;
+	}
+	if (virtio_pmem_flush(nd_region))
+		return -EIO;
+
+	return 0;
+};
+EXPORT_SYMBOL_GPL(async_pmem_flush);
+MODULE_LICENSE("GPL");
diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
new file mode 100644
index 000000000000..73a1cd085eae
--- /dev/null
+++ b/drivers/nvdimm/virtio_pmem.c
@@ -0,0 +1,122 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio_pmem.c: Virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and registers the virtual pmem device
+ * with libnvdimm core.
+ */
+#include "virtio_pmem.h"
+#include "nd.h"
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+ /* Initialize virt queue */
+static int init_vq(struct virtio_pmem *vpmem)
+{
+	/* single vq */
+	vpmem->req_vq = virtio_find_single_vq(vpmem->vdev,
+						host_ack, "flush_queue");
+	if (IS_ERR(vpmem->req_vq))
+		return PTR_ERR(vpmem->req_vq);
+
+	spin_lock_init(&vpmem->pmem_lock);
+	INIT_LIST_HEAD(&vpmem->req_list);
+
+	return 0;
+};
+
+static int virtio_pmem_probe(struct virtio_device *vdev)
+{
+	struct nd_region_desc ndr_desc = {};
+	int nid = dev_to_node(&vdev->dev);
+	struct nd_region *nd_region;
+	struct virtio_pmem *vpmem;
+	struct resource res;
+	int err = 0;
+
+	if (!vdev->config->get) {
+		dev_err(&vdev->dev, "%s failure: config access disabled\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), GFP_KERNEL);
+	if (!vpmem) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+
+	vpmem->vdev = vdev;
+	vdev->priv = vpmem;
+	err = init_vq(vpmem);
+	if (err) {
+		dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
+		goto out_err;
+	}
+
+	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
+			start, &vpmem->start);
+	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
+			size, &vpmem->size);
+
+	res.start = vpmem->start;
+	res.end   = vpmem->start + vpmem->size-1;
+	vpmem->nd_desc.provider_name = "virtio-pmem";
+	vpmem->nd_desc.module = THIS_MODULE;
+
+	vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
+						&vpmem->nd_desc);
+	if (!vpmem->nvdimm_bus) {
+		dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n");
+		err = -ENXIO;
+		goto out_vq;
+	}
+
+	dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
+
+	ndr_desc.res = &res;
+	ndr_desc.numa_node = nid;
+	ndr_desc.flush = async_pmem_flush;
+	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
+	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
+	nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
+	if (!nd_region) {
+		dev_err(&vdev->dev, "failed to create nvdimm region\n");
+		err = -ENXIO;
+		goto out_nd;
+	}
+	nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
+	return 0;
+out_nd:
+	nvdimm_bus_unregister(vpmem->nvdimm_bus);
+out_vq:
+	vdev->config->del_vqs(vdev);
+out_err:
+	return err;
+}
+
+static void virtio_pmem_remove(struct virtio_device *vdev)
+{
+	struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
+
+	nvdimm_bus_unregister(nvdimm_bus);
+	vdev->config->del_vqs(vdev);
+	vdev->config->reset(vdev);
+}
+
+static struct virtio_driver virtio_pmem_driver = {
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.probe			= virtio_pmem_probe,
+	.remove			= virtio_pmem_remove,
+};
+
+module_virtio_driver(virtio_pmem_driver);
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio pmem driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
new file mode 100644
index 000000000000..ab1da877575d
--- /dev/null
+++ b/drivers/nvdimm/virtio_pmem.h
@@ -0,0 +1,60 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * virtio_pmem.h: virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and provides a virtio based flushing
+ * interface.
+ **/
+
+#ifndef _LINUX_VIRTIO_PMEM_H
+#define _LINUX_VIRTIO_PMEM_H
+
+#include <linux/virtio_ids.h>
+#include <linux/module.h>
+#include <linux/virtio_config.h>
+#include <uapi/linux/virtio_pmem.h>
+#include <linux/libnvdimm.h>
+#include <linux/spinlock.h>
+
+struct virtio_pmem_request {
+	/* Host return status corresponding to flush request */
+	int ret;
+
+	/* command name*/
+	char name[16];
+
+	/* Wait queue to process deferred work after ack from host */
+	wait_queue_head_t host_acked;
+	bool done;
+
+	/* Wait queue to process deferred work after virt queue buffer avail */
+	wait_queue_head_t wq_buf;
+	bool wq_buf_avail;
+	struct list_head list;
+};
+
+struct virtio_pmem {
+	struct virtio_device *vdev;
+
+	/* Virtio pmem request queue */
+	struct virtqueue *req_vq;
+
+	/* nvdimm bus registers virtio pmem device */
+	struct nvdimm_bus *nvdimm_bus;
+	struct nvdimm_bus_descriptor nd_desc;
+
+	/* List to store deferred work if virtqueue is full */
+	struct list_head req_list;
+
+	/* Synchronize virtqueue data */
+	spinlock_t pmem_lock;
+
+	/* Memory region information */
+	uint64_t start;
+	uint64_t size;
+};
+
+void host_ack(struct virtqueue *vq);
+int async_pmem_flush(struct nd_region *nd_region, struct bio *bio);
+#endif
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 35897649c24f..94bad084ebab 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -42,6 +42,17 @@  config VIRTIO_PCI_LEGACY
 
 	  If unsure, say Y.
 
+config VIRTIO_PMEM
+	tristate "Support for virtio pmem driver"
+	depends on VIRTIO
+	depends on LIBNVDIMM
+	help
+	This driver provides access to virtio-pmem devices, storage devices
+	that are mapped into the physical address space - similar to NVDIMMs
+	 - with a virtio-based flushing interface.
+
+	If unsure, say M.
+
 config VIRTIO_BALLOON
 	tristate "Virtio balloon driver"
 	depends on VIRTIO
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 6d5c3b2d4f4d..32b2f94d1f58 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -43,5 +43,6 @@ 
 #define VIRTIO_ID_INPUT        18 /* virtio input */
 #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
 #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
+#define VIRTIO_ID_PMEM         27 /* virtio pmem */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h
new file mode 100644
index 000000000000..fa3f7d52717a
--- /dev/null
+++ b/include/uapi/linux/virtio_pmem.h
@@ -0,0 +1,10 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _UAPI_LINUX_VIRTIO_PMEM_H
+#define _UAPI_LINUX_VIRTIO_PMEM_H
+
+struct virtio_pmem_config {
+	__le64 start;
+	__le64 size;
+};
+#endif