
[15/52] fuse: map virtio_fs DAX window BAR

Message ID 20181210171318.16998-16-vgoyal@redhat.com (mailing list archive)
State New, archived
Series: virtio-fs: shared file system for virtual machines

Commit Message

Vivek Goyal Dec. 10, 2018, 5:12 p.m. UTC
From: Stefan Hajnoczi <stefanha@redhat.com>

Experimental QEMU code introduces an MMIO BAR for mapping portions of
files in the virtio-fs device.  Map this BAR so that FUSE DAX can access
file contents from the host page cache.

The DAX window is accessed by the fs/dax.c infrastructure and must have
struct pages (at least on x86).  Use devm_memremap_pages() to map the
DAX window PCI BAR and allocate struct page.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 fs/fuse/virtio_fs.c | 166 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 143 insertions(+), 23 deletions(-)

Comments

Christian Borntraeger Dec. 12, 2018, 4:37 p.m. UTC | #1
On 10.12.2018 18:12, Vivek Goyal wrote:
> From: Stefan Hajnoczi <stefanha@redhat.com>

> +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> +{
> +	struct virtio_fs_memremap_info *mi;
> +	struct dev_pagemap *pgmap;
> +	struct pci_dev *pci_dev;
> +	phys_addr_t phys_addr;
> +	size_t len;
> +	int ret;
> +
> +	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
> +		return 0;
> +
> +	/* HACK implement VIRTIO shared memory regions instead of
> +	 * directly accessing the PCI BAR from a virtio device driver.
> +	 */
> +	pci_dev = container_of(vdev->dev.parent, struct pci_dev, dev);
> +
> +	/* TODO Is this safe - the virtio_pci_* driver doesn't use managed
> +	 * device APIs? */
> +	ret = pcim_enable_device(pci_dev);
> +	if (ret < 0)
> +		return ret;
> +
> +	/* TODO handle case where device doesn't expose BAR? */
> +	ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
> +				 "virtio-fs-window");
> +	if (ret < 0) {
> +		dev_err(&vdev->dev, "%s: failed to request window BAR\n",
> +			__func__);
> +		return ret;
> +	}

Can we please have a generic virtio interface to map the address (the default can then
fall back to PCI) instead of mapping a PCI bar? This would make it easier to implement
virtio-ccw or virtio-mmio.
Stefan Hajnoczi Dec. 13, 2018, 11:55 a.m. UTC | #2
On Wed, Dec 12, 2018 at 05:37:35PM +0100, Christian Borntraeger wrote:
> 
> 
> On 10.12.2018 18:12, Vivek Goyal wrote:
> > From: Stefan Hajnoczi <stefanha@redhat.com>
> 
> > +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > +{
> > +	struct virtio_fs_memremap_info *mi;
> > +	struct dev_pagemap *pgmap;
> > +	struct pci_dev *pci_dev;
> > +	phys_addr_t phys_addr;
> > +	size_t len;
> > +	int ret;
> > +
> > +	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
> > +		return 0;
> > +
> > +	/* HACK implement VIRTIO shared memory regions instead of
> > +	 * directly accessing the PCI BAR from a virtio device driver.
> > +	 */
> > +	pci_dev = container_of(vdev->dev.parent, struct pci_dev, dev);
> > +
> > +	/* TODO Is this safe - the virtio_pci_* driver doesn't use managed
> > +	 * device APIs? */
> > +	ret = pcim_enable_device(pci_dev);
> > +	if (ret < 0)
> > +		return ret;
> > +
> > +	/* TODO handle case where device doesn't expose BAR? */
> > +	ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
> > +				 "virtio-fs-window");
> > +	if (ret < 0) {
> > +		dev_err(&vdev->dev, "%s: failed to request window BAR\n",
> > +			__func__);
> > +		return ret;
> > +	}
> 
> Can we please have a generic virtio interface to map the address (the default can then
> fall back to PCI) instead of mapping a PCI bar? This would make it easier to implement
> virtio-ccw or virtio-mmio.

Yes, we'll define shared memory as a device resource in the VIRTIO
specification.  It will become part of the device model, alongside
virtqueues, configuration space, etc.  That means devices can use shared
memory without it being tied to PCI BARs explicitly.

But only the PCI transport will have a realization of shared memory
resources in the beginning.  We need to work together to add this
feature to the ccw and mmio transports.
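
Roughly, the direction we have in mind looks like the sketch below.  All
names and the layout are illustrative only, nothing here is final:

  /* Illustrative only: a transport-agnostic shared memory region. */
  struct virtio_shm_region {
          phys_addr_t addr;
          size_t len;
  };

  /*
   * Implemented per transport: the PCI transport would resolve the id to
   * a BAR plus offset via a new capability; ccw and mmio will need their
   * own mechanisms.  Returns false if the device has no such region.
   */
  bool virtio_get_shm_region(struct virtio_device *vdev,
                             struct virtio_shm_region *region, u8 id);

virtio-fs would then ask for its DAX window with something along these
lines (the id name is made up for the example):

  struct virtio_shm_region cache_reg;

  if (!virtio_get_shm_region(vdev, &cache_reg, VIRTIO_FS_SHMCAP_ID_CACHE))
          return 0;       /* no DAX window exposed, skip DAX setup */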

Stefan
kernel test robot Dec. 13, 2018, 4:06 p.m. UTC | #3
Hi Stefan,

I love your patch! Yet something to improve:

[auto build test ERROR on fuse/for-next]
[also build test ERROR on v4.20-rc6]
[cannot apply to next-20181213]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Vivek-Goyal/virtio-fs-shared-file-system-for-virtual-machines/20181211-103034
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git for-next
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=sh 

All errors (new ones prefixed by >>):

   fs/fuse/virtio_fs.c: In function 'virtio_fs_setup_dax':
>> fs/fuse/virtio_fs.c:465:8: error: implicit declaration of function 'pcim_enable_device'; did you mean 'pci_enable_device'? [-Werror=implicit-function-declaration]
     ret = pcim_enable_device(pci_dev);
           ^~~~~~~~~~~~~~~~~~
           pci_enable_device
>> fs/fuse/virtio_fs.c:470:8: error: implicit declaration of function 'pci_request_region'; did you mean 'pci_request_regions'? [-Werror=implicit-function-declaration]
     ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
           ^~~~~~~~~~~~~~~~~~
           pci_request_regions
   In file included from include/linux/printk.h:336:0,
                    from include/linux/kernel.h:14,
                    from include/linux/list.h:9,
                    from include/linux/wait.h:7,
                    from include/linux/wait_bit.h:8,
                    from include/linux/fs.h:6,
                    from fs/fuse/virtio_fs.c:7:
   fs/fuse/virtio_fs.c:528:22: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 6 has type 'phys_addr_t {aka unsigned int}' [-Wformat=]
     dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
                         ^
   include/linux/dynamic_debug.h:135:39: note: in definition of macro 'dynamic_dev_dbg'
      __dynamic_dev_dbg(&descriptor, dev, fmt, \
                                          ^~~
   include/linux/device.h:1463:23: note: in expansion of macro 'dev_fmt'
     dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__)
                          ^~~~~~~
   fs/fuse/virtio_fs.c:528:2: note: in expansion of macro 'dev_dbg'
     dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
     ^~~~~~~
   At top level:
   fs/fuse/virtio_fs.c:604:12: warning: 'virtio_fs_restore' defined but not used [-Wunused-function]
    static int virtio_fs_restore(struct virtio_device *vdev)
               ^~~~~~~~~~~~~~~~~
   fs/fuse/virtio_fs.c:599:12: warning: 'virtio_fs_freeze' defined but not used [-Wunused-function]
    static int virtio_fs_freeze(struct virtio_device *vdev)
               ^~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +465 fs/fuse/virtio_fs.c

   445	
   446	static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
   447	{
   448		struct virtio_fs_memremap_info *mi;
   449		struct dev_pagemap *pgmap;
   450		struct pci_dev *pci_dev;
   451		phys_addr_t phys_addr;
   452		size_t len;
   453		int ret;
   454	
   455		if (!IS_ENABLED(CONFIG_DAX_DRIVER))
   456			return 0;
   457	
   458		/* HACK implement VIRTIO shared memory regions instead of
   459		 * directly accessing the PCI BAR from a virtio device driver.
   460		 */
   461		pci_dev = container_of(vdev->dev.parent, struct pci_dev, dev);
   462	
   463		/* TODO Is this safe - the virtio_pci_* driver doesn't use managed
   464		 * device APIs? */
 > 465		ret = pcim_enable_device(pci_dev);
   466		if (ret < 0)
   467			return ret;
   468	
   469		/* TODO handle case where device doesn't expose BAR? */
 > 470		ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
   471					 "virtio-fs-window");
   472		if (ret < 0) {
   473			dev_err(&vdev->dev, "%s: failed to request window BAR\n",
   474				__func__);
   475			return ret;
   476		}
   477	
   478		phys_addr = pci_resource_start(pci_dev, VIRTIO_FS_WINDOW_BAR);
   479		len = pci_resource_len(pci_dev, VIRTIO_FS_WINDOW_BAR);
   480	
   481		mi = devm_kzalloc(&pci_dev->dev, sizeof(*mi), GFP_KERNEL);
   482		if (!mi)
   483			return -ENOMEM;
   484	
   485		init_completion(&mi->completion);
   486		ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0,
   487				      GFP_KERNEL);
   488		if (ret < 0) {
   489			dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n",
   490				__func__, ret);
   491			return ret;
   492		}
   493	
   494		ret = devm_add_action(&pci_dev->dev, virtio_fs_percpu_exit, mi);
   495		if (ret < 0) {
   496			percpu_ref_exit(&mi->ref);
   497			return ret;
   498		}
   499	
   500		pgmap = &mi->pgmap;
   501		pgmap->altmap_valid = false;
   502		pgmap->ref = &mi->ref;
   503		pgmap->type = MEMORY_DEVICE_FS_DAX;
   504	
   505		/* Ideally we would directly use the PCI BAR resource but
   506		 * devm_memremap_pages() wants its own copy in pgmap.  So
   507		 * initialize a struct resource from scratch (only the start
   508		 * and end fields will be used).
   509		 */
   510		pgmap->res = (struct resource){
   511			.name = "virtio-fs dax window",
   512			.start = phys_addr,
   513			.end = phys_addr + len,
   514		};
   515	
   516		fs->window_kaddr = devm_memremap_pages(&pci_dev->dev, pgmap);
   517		if (IS_ERR(fs->window_kaddr))
   518			return PTR_ERR(fs->window_kaddr);
   519	
   520		ret = devm_add_action_or_reset(&pci_dev->dev, virtio_fs_percpu_kill,
   521					       &mi->ref);
   522		if (ret < 0)
   523			return ret;
   524	
   525		fs->window_phys_addr = phys_addr;
   526		fs->window_len = len;
   527	
   528		dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
   529			__func__, fs->window_kaddr, phys_addr, len);
   530	
   531		fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
   532		if (!fs->dax_dev)
   533			return -ENOMEM;
   534	
   535		return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs);
   536	}
   537	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Dan Williams Dec. 13, 2018, 7:55 p.m. UTC | #4
On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
>
> From: Stefan Hajnoczi <stefanha@redhat.com>
>
> Experimental QEMU code introduces an MMIO BAR for mapping portions of
> files in the virtio-fs device.  Map this BAR so that FUSE DAX can access
> file contents from the host page cache.

FUSE DAX sounds terrifying, can you explain a bit more about what this is?

> The DAX window is accessed by the fs/dax.c infrastructure and must have
> struct pages (at least on x86).  Use devm_memremap_pages() to map the
> DAX window PCI BAR and allocate struct page.

PCI BAR space is not cache coherent, what prevents these pages from
being used in paths that would do:

   object = page_address(pfn_to_page(virtio_fs_pfn));

...?
Dr. David Alan Gilbert Dec. 13, 2018, 8:09 p.m. UTC | #5
* Dan Williams (dan.j.williams@intel.com) wrote:
> On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> >
> > From: Stefan Hajnoczi <stefanha@redhat.com>
> >
> > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > files in the virtio-fs device.  Map this BAR so that FUSE DAX can access
> > file contents from the host page cache.
> 
> FUSE DAX sounds terrifying, can you explain a bit more about what this is?

We've got a guest running in QEMU, it sees an emulated PCI device;
that runs a FUSE protocol over virtio on that PCI device, but also has
a trick where via commands sent over the virtio queue associated with that device,
(fragments of) host files get mmap'd into the qemu virtual memory that corresponds
to the kvm slot exposed to the guest for that bar.

The guest sees those chunks in that BAR, and thus you can read/write
to the host file by directly writing into that BAR.
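
Very roughly, the device side of that trick looks like the sketch below
(illustrative only, not the actual QEMU code): on a mapping request from
the guest, QEMU mmap()s the file fragment over the corresponding offset
of the host memory region that backs the BAR / kvm slot:

  #include <sys/mman.h>

  /*
   * 'window' is the host mapping backing the DAX window BAR.  MAP_FIXED
   * replaces the pages at window + window_off with the file's page cache
   * pages, so the guest sees the file contents through the BAR.
   */
  static void *map_file_fragment(void *window, size_t window_off,
                                 int fd, off_t file_off, size_t len)
  {
          return mmap((char *)window + window_off, len,
                      PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_FIXED, fd, file_off);
  }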

> > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > struct pages (at least on x86).  Use devm_memremap_pages() to map the
> > DAX window PCI BAR and allocate struct page.
> 
> PCI BAR space is not cache coherent,

Note that no real PCI infrastructure is involved - this is all emulated
devices, backed by mmap'd files on the host qemu process.

Dave

> what prevents these pages from
> being used in paths that would do:
> 
>    object = page_address(pfn_to_page(virtio_fs_pfn));
> 
> ...?
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dan Williams Dec. 13, 2018, 8:15 p.m. UTC | #6
On Thu, Dec 13, 2018 at 12:09 PM Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
>
> * Dan Williams (dan.j.williams@intel.com) wrote:
> > On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> > >
> > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > >
> > > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > > files in the virtio-fs device.  Map this BAR so that FUSE DAX can access
> > > file contents from the host page cache.
> >
> > FUSE DAX sounds terrifying, can you explain a bit more about what this is?
>
> We've got a guest running in QEMU, it sees an emulated PCI device;
> that runs a FUSE protocol over virtio on that PCI device, but also has
> a trick where via commands sent over the virtio queue associated with that device,
> (fragments of) host files get mmap'd into the qemu virtual memory that corresponds
> to the kvm slot exposed to the guest for that bar.
>
> The guest sees those chunks in that BAR, and thus you can read/write
> to the host file by directly writing into that BAR.

Ok so it's all software emulated and there won't be hardware DMA
initiated by the guest to that address? I.e. if the host file gets
truncated / hole-punched the guest would just cause a refault and the
filesystem could fill in the block, or the guest is expected to die if
the fault to the truncated file range results in SIGBUS.

> > > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > > struct pages (at least on x86).  Use devm_memremap_pages() to map the
> > > DAX window PCI BAR and allocate struct page.
> >
> > PCI BAR space is not cache coherent,
>
> Note that no real PCI infrastructure is involved - this is all emulated
> devices, backed by mmap'd files on the host qemu process.

Ok, terror level decreased.
Vivek Goyal Dec. 13, 2018, 8:40 p.m. UTC | #7
On Thu, Dec 13, 2018 at 12:15:51PM -0800, Dan Williams wrote:
> On Thu, Dec 13, 2018 at 12:09 PM Dr. David Alan Gilbert
> <dgilbert@redhat.com> wrote:
> >
> > * Dan Williams (dan.j.williams@intel.com) wrote:
> > > On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> > > >
> > > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > > >
> > > > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > > > files in the virtio-fs device.  Map this BAR so that FUSE DAX can access
> > > > file contents from the host page cache.
> > >
> > > FUSE DAX sounds terrifying, can you explain a bit more about what this is?
> >
> > We've got a guest running in QEMU, it sees an emulated PCI device;
> > that runs a FUSE protocol over virtio on that PCI device, but also has
> > a trick where via commands sent over the virtio queue associated with that device,
> > (fragments of) host files get mmap'd into the qemu virtual memory that corresponds
> > to the kvm slot exposed to the guest for that bar.
> >
> > The guest sees those chunks in that BAR, and thus you can read/write
> > to the host file by directly writing into that BAR.
> 
> Ok so it's all software emulated and there won't be hardware DMA
> initiated by the guest to that address?

That's my understanding.

> I.e. if the host file gets
> truncated / hole-punched the guest would just cause a refault and the
> filesystem could fill in the block,

Right

> or the guest is expected to die if
> the fault to the truncated file range results in SIGBUS.

Are you referring to the case where a file page is mapped in qemu and
another guest/process truncates that page, so that when qemu tries to access
it, it will get SIGBUS? Have not tried it, will give it a try. Not sure what
happens when QEMU receives SIGBUS.

Having said that, this is not different from the case of one process
mapping a file, another process truncating the file, and the first process
getting SIGBUS, right?
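
For reference, the plain two-process case is easy to reproduce with a
standalone demo like the one below (nothing virtio-fs specific, the file
name is arbitrary); the write after the truncate gets SIGBUS:

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/tmp/sigbus-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
          char *p;

          ftruncate(fd, 4096);    /* one page of file data */
          p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
          ftruncate(fd, 0);       /* "the other process" truncates */
          p[0] = 'x';             /* page is now beyond EOF -> SIGBUS */
          printf("not reached\n");
          return 0;
  }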

Thanks
Vivek

> 
> > > > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > > > struct pages (at least on x86).  Use devm_memremap_pages() to map the
> > > > DAX window PCI BAR and allocate struct page.
> > >
> > > PCI BAR space is not cache coherent,
> >
> > Note that no real PCI infrastructure is involved - this is all emulated
> > devices, backed by mmap'd files on the host qemu process.
> 
> Ok, terror level decreased.
Vivek Goyal Dec. 13, 2018, 9:18 p.m. UTC | #8
On Thu, Dec 13, 2018 at 03:40:52PM -0500, Vivek Goyal wrote:
> On Thu, Dec 13, 2018 at 12:15:51PM -0800, Dan Williams wrote:
> > On Thu, Dec 13, 2018 at 12:09 PM Dr. David Alan Gilbert
> > <dgilbert@redhat.com> wrote:
> > >
> > > * Dan Williams (dan.j.williams@intel.com) wrote:
> > > > On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> > > > >
> > > > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > > > >
> > > > > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > > > > files in the virtio-fs device.  Map this BAR so that FUSE DAX can access
> > > > > file contents from the host page cache.
> > > >
> > > > FUSE DAX sounds terrifying, can you explain a bit more about what this is?
> > >
> > > We've got a guest running in QEMU, it sees an emulated PCI device;
> > > that runs a FUSE protocol over virtio on that PCI device, but also has
> > > a trick where via commands sent over the virtio queue associated with that device,
> > > (fragments of) host files get mmap'd into the qemu virtual memory that corresponds
> > > to the kvm slot exposed to the guest for that bar.
> > >
> > > The guest sees those chunks in that BAR, and thus you can read/write
> > > to the host file by directly writing into that BAR.
> > 
> > Ok so it's all software emulated and there won't be hardware DMA
> > initiated by the guest to that address?
> 
> That's my understanding.
> 
> > I.e. if the host file gets
> > truncated / hole-punched the guest would just cause a refault and the
> > filesystem could fill in the block,
> 
> Right
> 
> > or the guest is expected to die if
> > the fault to the truncated file range results in SIGBUS.
> 
> Are you referring to the case where a file page is mapped in qemu and
> another guest/process truncates that page, so that when qemu tries to access
> it, it will get SIGBUS? Have not tried it, will give it a try. Not sure what
> happens when QEMU receives SIGBUS.
> 
> Having said that, this is not different from the case of one process
> mapping a file, another process truncating the file, and the first process
> getting SIGBUS, right?

Ok, tried this and guest process hangs.

Stefan, dgilbert, this reminds me that we faced this issue during our
testing and decided that this will need some fixing in KVM. I even put
this in the changelog of the patch with subject "fuse: Take inode lock
for dax inode truncation":

"Another problem is, if we setup a mapping in fuse_iomap_begin(), and
 file gets truncated and dax read/write happens, KVM currently hangs.
 It tries to fault in a page which does not exist on host (file got
 truncated). It probably requires fixing in KVM."

Not sure what should happen though when qemu receives SIGBUS in this
case.

Thanks
Vivek
Dr. David Alan Gilbert Dec. 14, 2018, 10:09 a.m. UTC | #9
* Vivek Goyal (vgoyal@redhat.com) wrote:
> On Thu, Dec 13, 2018 at 03:40:52PM -0500, Vivek Goyal wrote:
> > On Thu, Dec 13, 2018 at 12:15:51PM -0800, Dan Williams wrote:
> > > On Thu, Dec 13, 2018 at 12:09 PM Dr. David Alan Gilbert
> > > <dgilbert@redhat.com> wrote:
> > > >
> > > > * Dan Williams (dan.j.williams@intel.com) wrote:
> > > > > On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> > > > > >
> > > > > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > >
> > > > > > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > > > > > files in the virtio-fs device.  Map this BAR so that FUSE DAX can access
> > > > > > file contents from the host page cache.
> > > > >
> > > > > FUSE DAX sounds terrifying, can you explain a bit more about what this is?
> > > >
> > > > We've got a guest running in QEMU, it sees an emulated PCI device;
> > > > that runs a FUSE protocol over virtio on that PCI device, but also has
> > > > a trick where via commands sent over the virtio queue associated with that device,
> > > > (fragments of) host files get mmap'd into the qemu virtual memory that corresponds
> > > > to the kvm slot exposed to the guest for that bar.
> > > >
> > > > The guest sees those chunks in that BAR, and thus you can read/write
> > > > to the host file by directly writing into that BAR.
> > > 
> > > Ok so it's all software emulated and there won't be hardware DMA
> > > initiated by the guest to that address?
> > 
> > That's my understanding.
> > 
> > > I.e. if the host file gets
> > > truncated / hole-punched the guest would just cause a refault and the
> > > filesystem could fill in the block,
> > 
> > Right
> > 
> > > or the guest is expected to die if
> > > the fault to the truncated file range results in SIGBUS.
> > 
> > Are you referring to the case where a file page is mapped in qemu and
> > another guest/process truncates that page, so that when qemu tries to access
> > it, it will get SIGBUS? Have not tried it, will give it a try. Not sure what
> > happens when QEMU receives SIGBUS.
> > 
> > Having said that, this is not different from the case of one process
> > mapping a file, another process truncating the file, and the first process
> > getting SIGBUS, right?
> 
> Ok, tried this and guest process hangs.
> 
> Stefan, dgilbert, this reminds me that we faced this issue during our
> testing and decided that this will need some fixing in KVM. I even put
> this in the changelog of the patch with subject "fuse: Take inode lock
> for dax inode truncation":
> 
> "Another problem is, if we setup a mapping in fuse_iomap_begin(), and
>  file gets truncated and dax read/write happens, KVM currently hangs.
>  It tries to fault in a page which does not exist on host (file got
>  truncated). It probably requires fixing in KVM."
> 
> Not sure what should happen though when qemu receives SIGBUS in this
> case.

Yes, and I noted it in the TODO in my qemu patch posting.

We need to figure out what we want the guest to see in this case and
figure out how to make QEMU/kvm fix it up so that the guest doesn't
see anything odd.

Dave

> Thanks
> Vivek
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Patch

diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index ba615ec2603e..87b7e42a6763 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -6,12 +6,18 @@ 
 
 #include <linux/fs.h>
 #include <linux/dax.h>
+#include <linux/pci.h>
 #include <linux/pfn_t.h>
 #include <linux/module.h>
 #include <linux/virtio.h>
 #include <linux/virtio_fs.h>
 #include "fuse_i.h"
 
+enum {
+	/* PCI BAR number of the virtio-fs DAX window */
+	VIRTIO_FS_WINDOW_BAR = 2,
+};
+
 /* List of virtio-fs device instances and a lock for the list */
 static DEFINE_MUTEX(virtio_fs_mutex);
 static LIST_HEAD(virtio_fs_instances);
@@ -24,6 +30,18 @@  struct virtio_fs_vq {
 	char name[24];
 } ____cacheline_aligned_in_smp;
 
+/* State needed for devm_memremap_pages().  This API is called on the
+ * underlying pci_dev instead of struct virtio_fs (layering violation).  Since
+ * the memremap release function only gets called when the pci_dev is released,
+ * keep the associated state separate from struct virtio_fs (it has a different
+ * lifecycle from pci_dev).
+ */
+struct virtio_fs_memremap_info {
+	struct dev_pagemap pgmap;
+	struct percpu_ref ref;
+	struct completion completion;
+};
+
 /* A virtio-fs device instance */
 struct virtio_fs {
 	struct list_head list;    /* on virtio_fs_instances */
@@ -36,6 +54,7 @@  struct virtio_fs {
 	/* DAX memory window where file contents are mapped */
 	void *window_kaddr;
 	phys_addr_t window_phys_addr;
+	size_t window_len;
 };
 
 static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
@@ -395,6 +414,127 @@  static const struct dax_operations virtio_fs_dax_ops = {
 	.copy_to_iter = virtio_fs_copy_to_iter,
 };
 
+static void virtio_fs_percpu_release(struct percpu_ref *ref)
+{
+	struct virtio_fs_memremap_info *mi =
+		container_of(ref, struct virtio_fs_memremap_info, ref);
+
+	complete(&mi->completion);
+}
+
+static void virtio_fs_percpu_exit(void *data)
+{
+	struct virtio_fs_memremap_info *mi = data;
+
+	wait_for_completion(&mi->completion);
+	percpu_ref_exit(&mi->ref);
+}
+
+static void virtio_fs_percpu_kill(void *data)
+{
+	percpu_ref_kill(data);
+}
+
+static void virtio_fs_cleanup_dax(void *data)
+{
+	struct virtio_fs *fs = data;
+
+	kill_dax(fs->dax_dev);
+	put_dax(fs->dax_dev);
+}
+
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+	struct virtio_fs_memremap_info *mi;
+	struct dev_pagemap *pgmap;
+	struct pci_dev *pci_dev;
+	phys_addr_t phys_addr;
+	size_t len;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
+		return 0;
+
+	/* HACK implement VIRTIO shared memory regions instead of
+	 * directly accessing the PCI BAR from a virtio device driver.
+	 */
+	pci_dev = container_of(vdev->dev.parent, struct pci_dev, dev);
+
+	/* TODO Is this safe - the virtio_pci_* driver doesn't use managed
+	 * device APIs? */
+	ret = pcim_enable_device(pci_dev);
+	if (ret < 0)
+		return ret;
+
+	/* TODO handle case where device doesn't expose BAR? */
+	ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
+				 "virtio-fs-window");
+	if (ret < 0) {
+		dev_err(&vdev->dev, "%s: failed to request window BAR\n",
+			__func__);
+		return ret;
+	}
+
+	phys_addr = pci_resource_start(pci_dev, VIRTIO_FS_WINDOW_BAR);
+	len = pci_resource_len(pci_dev, VIRTIO_FS_WINDOW_BAR);
+
+	mi = devm_kzalloc(&pci_dev->dev, sizeof(*mi), GFP_KERNEL);
+	if (!mi)
+		return -ENOMEM;
+
+	init_completion(&mi->completion);
+	ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0,
+			      GFP_KERNEL);
+	if (ret < 0) {
+		dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	ret = devm_add_action(&pci_dev->dev, virtio_fs_percpu_exit, mi);
+	if (ret < 0) {
+		percpu_ref_exit(&mi->ref);
+		return ret;
+	}
+
+	pgmap = &mi->pgmap;
+	pgmap->altmap_valid = false;
+	pgmap->ref = &mi->ref;
+	pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+	/* Ideally we would directly use the PCI BAR resource but
+	 * devm_memremap_pages() wants its own copy in pgmap.  So
+	 * initialize a struct resource from scratch (only the start
+	 * and end fields will be used).
+	 */
+	pgmap->res = (struct resource){
+		.name = "virtio-fs dax window",
+		.start = phys_addr,
+		.end = phys_addr + len,
+	};
+
+	fs->window_kaddr = devm_memremap_pages(&pci_dev->dev, pgmap);
+	if (IS_ERR(fs->window_kaddr))
+		return PTR_ERR(fs->window_kaddr);
+
+	ret = devm_add_action_or_reset(&pci_dev->dev, virtio_fs_percpu_kill,
+				       &mi->ref);
+	if (ret < 0)
+		return ret;
+
+	fs->window_phys_addr = phys_addr;
+	fs->window_len = len;
+
+	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
+		__func__, fs->window_kaddr, phys_addr, len);
+
+	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
+	if (!fs->dax_dev)
+		return -ENOMEM;
+
+	return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs);
+}
+
 static int virtio_fs_probe(struct virtio_device *vdev)
 {
 	struct virtio_fs *fs;
@@ -416,16 +556,9 @@  static int virtio_fs_probe(struct virtio_device *vdev)
 	/* TODO vq affinity */
 	/* TODO populate notifications vq */
 
-	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
-		/* TODO map window */
-		fs->window_kaddr = NULL;
-		fs->window_phys_addr = 0;
-
-		fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
-		if (!fs->dax_dev)
-			goto out_vqs; /* TODO handle case where device doesn't expose
-					 BAR */
-	}
+	ret = virtio_fs_setup_dax(vdev, fs);
+	if (ret < 0)
+		goto out_vqs;
 
 	/* Bring the device online in case the filesystem is mounted and
 	 * requests need to be sent before we return.
@@ -441,13 +574,6 @@  static int virtio_fs_probe(struct virtio_device *vdev)
 out_vqs:
 	vdev->config->reset(vdev);
 	virtio_fs_cleanup_vqs(vdev, fs);
-
-	if (fs->dax_dev) {
-		kill_dax(fs->dax_dev);
-		put_dax(fs->dax_dev);
-		fs->dax_dev = NULL;
-	}
-
 out:
 	vdev->priv = NULL;
 	return ret;
@@ -466,12 +592,6 @@  static void virtio_fs_remove(struct virtio_device *vdev)
 	list_del(&fs->list);
 	mutex_unlock(&virtio_fs_mutex);
 
-	if (fs->dax_dev) {
-		kill_dax(fs->dax_dev);
-		put_dax(fs->dax_dev);
-		fs->dax_dev = NULL;
-	}
-
 	vdev->priv = NULL;
 }