| Message ID | 20181210171318.16998-16-vgoyal@redhat.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | virtio-fs: shared file system for virtual machines |
On 10.12.2018 18:12, Vivek Goyal wrote:
> From: Stefan Hajnoczi <stefanha@redhat.com>
>
> +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> +{
> +	struct virtio_fs_memremap_info *mi;
> +	struct dev_pagemap *pgmap;
> +	struct pci_dev *pci_dev;
> +	phys_addr_t phys_addr;
> +	size_t len;
> +	int ret;
> +
> +	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
> +		return 0;
> +
> +	/* HACK implement VIRTIO shared memory regions instead of
> +	 * directly accessing the PCI BAR from a virtio device driver.
> +	 */
> +	pci_dev = container_of(vdev->dev.parent, struct pci_dev, dev);
> +
> +	/* TODO Is this safe - the virtio_pci_* driver doesn't use managed
> +	 * device APIs? */
> +	ret = pcim_enable_device(pci_dev);
> +	if (ret < 0)
> +		return ret;
> +
> +	/* TODO handle case where device doesn't expose BAR? */
> +	ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
> +				 "virtio-fs-window");
> +	if (ret < 0) {
> +		dev_err(&vdev->dev, "%s: failed to request window BAR\n",
> +			__func__);
> +		return ret;
> +	}

Can we please have a generic virtio interface to map the address (the default
can then fall back to PCI) instead of mapping a PCI bar? This would make it
easier to implement virtio-ccw or virtio-mmio.
On Wed, Dec 12, 2018 at 05:37:35PM +0100, Christian Borntraeger wrote:
>
>
> On 10.12.2018 18:12, Vivek Goyal wrote:
> > From: Stefan Hajnoczi <stefanha@redhat.com>
>
> > +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > +{
> > +	struct virtio_fs_memremap_info *mi;
> > +	struct dev_pagemap *pgmap;
> > +	struct pci_dev *pci_dev;
> > +	phys_addr_t phys_addr;
> > +	size_t len;
> > +	int ret;
> > +
> > +	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
> > +		return 0;
> > +
> > +	/* HACK implement VIRTIO shared memory regions instead of
> > +	 * directly accessing the PCI BAR from a virtio device driver.
> > +	 */
> > +	pci_dev = container_of(vdev->dev.parent, struct pci_dev, dev);
> > +
> > +	/* TODO Is this safe - the virtio_pci_* driver doesn't use managed
> > +	 * device APIs? */
> > +	ret = pcim_enable_device(pci_dev);
> > +	if (ret < 0)
> > +		return ret;
> > +
> > +	/* TODO handle case where device doesn't expose BAR? */
> > +	ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
> > +				 "virtio-fs-window");
> > +	if (ret < 0) {
> > +		dev_err(&vdev->dev, "%s: failed to request window BAR\n",
> > +			__func__);
> > +		return ret;
> > +	}
>
> Can we please have a generic virtio interface to map the address (the default
> can then fall back to PCI) instead of mapping a PCI bar? This would make it
> easier to implement virtio-ccw or virtio-mmio.

Yes, we'll define shared memory as a device resource in the VIRTIO
specification.  It will become part of the device model, alongside
virtqueues, configuration space, etc.  That means devices can use shared
memory without it being tied to PCI BARs explicitly.

But only the PCI transport will have a realization of shared memory
resources in the beginning.  We need to work together to add this feature
to the ccw and mmio transports.

Stefan
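For a sense of what such a transport-agnostic interface could look like from a driver's point of view, here is a rough sketch. It is hypothetical at the time of this thread: the structure name, the get_shm_region config op and the id parameter are assumptions, not merged API.

struct virtio_shm_region {
	u64 addr;	/* guest-physical base of the shared memory region */
	u64 len;	/* length of the region in bytes */
};

/* Each transport (PCI, ccw, mmio) would supply its own implementation
 * behind a config op, so a driver never touches a PCI BAR directly. */
static inline bool virtio_get_shm_region(struct virtio_device *vdev,
					 struct virtio_shm_region *region,
					 u8 id)
{
	if (!vdev->config->get_shm_region)
		return false;
	return vdev->config->get_shm_region(vdev, region, id);
}

With something along these lines, virtio_fs_setup_dax() could look the DAX window up by id and feed region->addr/region->len to devm_memremap_pages() instead of reaching into the parent pci_dev.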
Hi Stefan,

I love your patch! Yet something to improve:

[auto build test ERROR on fuse/for-next]
[also build test ERROR on v4.20-rc6]
[cannot apply to next-20181213]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Vivek-Goyal/virtio-fs-shared-file-system-for-virtual-machines/20181211-103034
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git for-next
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=sh

All errors (new ones prefixed by >>):

   fs/fuse/virtio_fs.c: In function 'virtio_fs_setup_dax':
>> fs/fuse/virtio_fs.c:465:8: error: implicit declaration of function 'pcim_enable_device'; did you mean 'pci_enable_device'? [-Werror=implicit-function-declaration]
     ret = pcim_enable_device(pci_dev);
           ^~~~~~~~~~~~~~~~~~
           pci_enable_device
>> fs/fuse/virtio_fs.c:470:8: error: implicit declaration of function 'pci_request_region'; did you mean 'pci_request_regions'? [-Werror=implicit-function-declaration]
     ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
           ^~~~~~~~~~~~~~~~~~
           pci_request_regions
   In file included from include/linux/printk.h:336:0,
                    from include/linux/kernel.h:14,
                    from include/linux/list.h:9,
                    from include/linux/wait.h:7,
                    from include/linux/wait_bit.h:8,
                    from include/linux/fs.h:6,
                    from fs/fuse/virtio_fs.c:7:
   fs/fuse/virtio_fs.c:528:22: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 6 has type 'phys_addr_t {aka unsigned int}' [-Wformat=]
     dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
                         ^
   include/linux/dynamic_debug.h:135:39: note: in definition of macro 'dynamic_dev_dbg'
      __dynamic_dev_dbg(&descriptor, dev, fmt, \
                                          ^~~
   include/linux/device.h:1463:23: note: in expansion of macro 'dev_fmt'
     dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__)
                          ^~~~~~~
   fs/fuse/virtio_fs.c:528:2: note: in expansion of macro 'dev_dbg'
     dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
     ^~~~~~~
   At top level:
   fs/fuse/virtio_fs.c:604:12: warning: 'virtio_fs_restore' defined but not used [-Wunused-function]
    static int virtio_fs_restore(struct virtio_device *vdev)
               ^~~~~~~~~~~~~~~~~
   fs/fuse/virtio_fs.c:599:12: warning: 'virtio_fs_freeze' defined but not used [-Wunused-function]
    static int virtio_fs_freeze(struct virtio_device *vdev)
               ^~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +465 fs/fuse/virtio_fs.c

   445	
   446	static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
   447	{
   448		struct virtio_fs_memremap_info *mi;
   449		struct dev_pagemap *pgmap;
   450		struct pci_dev *pci_dev;
   451		phys_addr_t phys_addr;
   452		size_t len;
   453		int ret;
   454	
   455		if (!IS_ENABLED(CONFIG_DAX_DRIVER))
   456			return 0;
   457	
   458		/* HACK implement VIRTIO shared memory regions instead of
   459		 * directly accessing the PCI BAR from a virtio device driver.
   460		 */
   461		pci_dev = container_of(vdev->dev.parent, struct pci_dev, dev);
   462	
   463		/* TODO Is this safe - the virtio_pci_* driver doesn't use managed
   464		 * device APIs? */
 > 465		ret = pcim_enable_device(pci_dev);
   466		if (ret < 0)
   467			return ret;
   468	
   469		/* TODO handle case where device doesn't expose BAR? */
 > 470		ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
   471					 "virtio-fs-window");
   472		if (ret < 0) {
   473			dev_err(&vdev->dev, "%s: failed to request window BAR\n",
   474				__func__);
   475			return ret;
   476		}
   477	
   478		phys_addr = pci_resource_start(pci_dev, VIRTIO_FS_WINDOW_BAR);
   479		len = pci_resource_len(pci_dev, VIRTIO_FS_WINDOW_BAR);
   480	
   481		mi = devm_kzalloc(&pci_dev->dev, sizeof(*mi), GFP_KERNEL);
   482		if (!mi)
   483			return -ENOMEM;
   484	
   485		init_completion(&mi->completion);
   486		ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0,
   487				      GFP_KERNEL);
   488		if (ret < 0) {
   489			dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n",
   490				__func__, ret);
   491			return ret;
   492		}
   493	
   494		ret = devm_add_action(&pci_dev->dev, virtio_fs_percpu_exit, mi);
   495		if (ret < 0) {
   496			percpu_ref_exit(&mi->ref);
   497			return ret;
   498		}
   499	
   500		pgmap = &mi->pgmap;
   501		pgmap->altmap_valid = false;
   502		pgmap->ref = &mi->ref;
   503		pgmap->type = MEMORY_DEVICE_FS_DAX;
   504	
   505		/* Ideally we would directly use the PCI BAR resource but
   506		 * devm_memremap_pages() wants its own copy in pgmap. So
   507		 * initialize a struct resource from scratch (only the start
   508		 * and end fields will be used).
   509		 */
   510		pgmap->res = (struct resource){
   511			.name = "virtio-fs dax window",
   512			.start = phys_addr,
   513			.end = phys_addr + len,
   514		};
   515	
   516		fs->window_kaddr = devm_memremap_pages(&pci_dev->dev, pgmap);
   517		if (IS_ERR(fs->window_kaddr))
   518			return PTR_ERR(fs->window_kaddr);
   519	
   520		ret = devm_add_action_or_reset(&pci_dev->dev, virtio_fs_percpu_kill,
   521					       &mi->ref);
   522		if (ret < 0)
   523			return ret;
   524	
   525		fs->window_phys_addr = phys_addr;
   526		fs->window_len = len;
   527	
   528		dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
   529			__func__, fs->window_kaddr, phys_addr, len);
   530	
   531		fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
   532		if (!fs->dax_dev)
   533			return -ENOMEM;
   534	
   535		return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs);
   536	}
   537	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
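(For reference, the -Wformat warning above is the usual 32-bit phys_addr_t pitfall: %llx does not match phys_addr_t on architectures where it is 32 bits wide. The kernel's %pa specifier, which takes a pointer to the phys_addr_t, prints it portably. A possible form of that dev_dbg() line:)

	/* %pa prints a phys_addr_t portably; note it takes &phys_addr */
	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr %pa len %zu\n",
		__func__, fs->window_kaddr, &phys_addr, len);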
On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
>
> From: Stefan Hajnoczi <stefanha@redhat.com>
>
> Experimental QEMU code introduces an MMIO BAR for mapping portions of
> files in the virtio-fs device. Map this BAR so that FUSE DAX can access
> file contents from the host page cache.

FUSE DAX sounds terrifying, can you explain a bit more about what this is?

> The DAX window is accessed by the fs/dax.c infrastructure and must have
> struct pages (at least on x86). Use devm_memremap_pages() to map the
> DAX window PCI BAR and allocate struct page.

PCI BAR space is not cache coherent, what prevents these pages from
being used in paths that would do:

    object = page_address(pfn_to_page(virtio_fs_pfn));

...?
* Dan Williams (dan.j.williams@intel.com) wrote:
> On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> >
> > From: Stefan Hajnoczi <stefanha@redhat.com>
> >
> > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > files in the virtio-fs device. Map this BAR so that FUSE DAX can access
> > file contents from the host page cache.
>
> FUSE DAX sounds terrifying, can you explain a bit more about what this is?

We've got a guest running in QEMU, it sees an emulated PCI device;
that runs a FUSE protocol over virtio on that PCI device, but also has
a trick where via commands sent over the virtio queue associated with
that device, (fragments of) host files get mmap'd into the qemu virtual
memory that corresponds to the kvm slot exposed to the guest for that bar.

The guest sees those chunks in that BAR, and thus you can read/write
to the host file by directly writing into that BAR.

> > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > struct pages (at least on x86). Use devm_memremap_pages() to map the
> > DAX window PCI BAR and allocate struct page.
>
> PCI BAR space is not cache coherent,

Note that no real PCI infrastructure is involved - this is all emulated
devices, backed by mmap'd files on the host qemu process.

Dave

> what prevents these pages from
> being used in paths that would do:
>
> object = page_address(pfn_to_page(virtio_fs_pfn));
>
> ...?
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
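A purely illustrative user-space sketch of the host-side trick Dave describes (this is not QEMU code; the function name and parameters are made up for illustration):

#include <sys/mman.h>

/* Map a fragment of the backing file over part of the host virtual
 * address range that backs the KVM memslot for the DAX window BAR.
 * MAP_FIXED replaces whatever was mapped there before, so the guest
 * immediately sees the new file contents through the BAR. */
static void *map_window_fragment(void *window_hva, size_t window_off,
				 int fd, off_t file_off, size_t len)
{
	return mmap((char *)window_hva + window_off, len,
		    PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_FIXED, fd, file_off);
}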
On Thu, Dec 13, 2018 at 12:09 PM Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
>
> * Dan Williams (dan.j.williams@intel.com) wrote:
> > On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> > >
> > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > >
> > > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > > files in the virtio-fs device. Map this BAR so that FUSE DAX can access
> > > file contents from the host page cache.
> >
> > FUSE DAX sounds terrifying, can you explain a bit more about what this is?
>
> We've got a guest running in QEMU, it sees an emulated PCI device;
> that runs a FUSE protocol over virtio on that PCI device, but also has
> a trick where via commands sent over the virtio queue associated with that device,
> (fragments of) host files get mmap'd into the qemu virtual memory that corresponds
> to the kvm slot exposed to the guest for that bar.
>
> The guest sees those chunks in that BAR, and thus you can read/write
> to the host file by directly writing into that BAR.

Ok so it's all software emulated and there won't be hardware DMA
initiated by the guest to that address? I.e. if the host file gets
truncated / hole-punched the guest would just cause a refault and the
filesystem could fill in the block, or the guest is expected to die if
the fault to the truncated file range results in SIGBUS.

> > > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > > struct pages (at least on x86). Use devm_memremap_pages() to map the
> > > DAX window PCI BAR and allocate struct page.
> >
> > PCI BAR space is not cache coherent,
>
> Note that no real PCI infrastructure is involved - this is all emulated
> devices, backed by mmap'd files on the host qemu process.

Ok, terror level decreased.
On Thu, Dec 13, 2018 at 12:15:51PM -0800, Dan Williams wrote:
> On Thu, Dec 13, 2018 at 12:09 PM Dr. David Alan Gilbert
> <dgilbert@redhat.com> wrote:
> >
> > * Dan Williams (dan.j.williams@intel.com) wrote:
> > > On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> > > >
> > > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > > >
> > > > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > > > files in the virtio-fs device. Map this BAR so that FUSE DAX can access
> > > > file contents from the host page cache.
> > >
> > > FUSE DAX sounds terrifying, can you explain a bit more about what this is?
> >
> > We've got a guest running in QEMU, it sees an emulated PCI device;
> > that runs a FUSE protocol over virtio on that PCI device, but also has
> > a trick where via commands sent over the virtio queue associated with that device,
> > (fragments of) host files get mmap'd into the qemu virtual memory that corresponds
> > to the kvm slot exposed to the guest for that bar.
> >
> > The guest sees those chunks in that BAR, and thus you can read/write
> > to the host file by directly writing into that BAR.
>
> Ok so it's all software emulated and there won't be hardware DMA
> initiated by the guest to that address?

That's my understanding.

> I.e. if the host file gets
> truncated / hole-punched the guest would just cause a refault and the
> filesystem could fill in the block,

Right

> or the guest is expected to die if
> the fault to the truncated file range results in SIGBUS.

Are you referring to the case where a file page is mapped in qemu and
another guest/process trucates that page and when qemu tries to access it it
will get SIGBUS. Have not tried it, will give it a try. Not sure what
happens when QEMU receives SIGBUS.

Having said that, this is not different from the case of one process
mapping a file and another process truncating the file and first process
getting SIGBUS, right?

Thanks
Vivek

> > > > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > > > struct pages (at least on x86). Use devm_memremap_pages() to map the
> > > > DAX window PCI BAR and allocate struct page.
> > >
> > > PCI BAR space is not cache coherent,
> >
> > Note that no real PCI infrastructure is involved - this is all emulated
> > devices, backed by mmap'd files on the host qemu process.
>
> Ok, terror level decreased.
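(The ordinary two-process case Vivek mentions is easy to reproduce stand-alone; a minimal illustration, with a made-up path and error handling omitted:)

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/sigbus-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	char *p;

	ftruncate(fd, 4096);	/* file covers exactly one page */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	p[0] = 'x';		/* fine: the page is backed by the file */
	ftruncate(fd, 0);	/* "the other process" truncates the file */
	p[0] = 'y';		/* SIGBUS: the page is beyond the new EOF */
	return 0;
}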
On Thu, Dec 13, 2018 at 03:40:52PM -0500, Vivek Goyal wrote:
> On Thu, Dec 13, 2018 at 12:15:51PM -0800, Dan Williams wrote:
> > On Thu, Dec 13, 2018 at 12:09 PM Dr. David Alan Gilbert
> > <dgilbert@redhat.com> wrote:
> > >
> > > * Dan Williams (dan.j.williams@intel.com) wrote:
> > > > On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> > > > >
> > > > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > > > >
> > > > > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > > > > files in the virtio-fs device. Map this BAR so that FUSE DAX can access
> > > > > file contents from the host page cache.
> > > >
> > > > FUSE DAX sounds terrifying, can you explain a bit more about what this is?
> > >
> > > We've got a guest running in QEMU, it sees an emulated PCI device;
> > > that runs a FUSE protocol over virtio on that PCI device, but also has
> > > a trick where via commands sent over the virtio queue associated with that device,
> > > (fragments of) host files get mmap'd into the qemu virtual memory that corresponds
> > > to the kvm slot exposed to the guest for that bar.
> > >
> > > The guest sees those chunks in that BAR, and thus you can read/write
> > > to the host file by directly writing into that BAR.
> >
> > Ok so it's all software emulated and there won't be hardware DMA
> > initiated by the guest to that address?
>
> That's my understanding.
>
> > I.e. if the host file gets
> > truncated / hole-punched the guest would just cause a refault and the
> > filesystem could fill in the block,
>
> Right
>
> > or the guest is expected to die if
> > the fault to the truncated file range results in SIGBUS.
>
> Are you referring to the case where a file page is mapped in qemu and
> another guest/process trucates that page and when qemu tries to access it it
> will get SIGBUS. Have not tried it, will give it a try. Not sure what
> happens when QEMU receives SIGBUS.
>
> Having said that, this is not different from the case of one process
> mapping a file and another process truncating the file and first process
> getting SIGBUS, right?

Ok, tried this and guest process hangs.

Stefan, dgilbert, this reminds me that we have faced this issue during
our testing and we decided that this will need some fixing in KVM. I
even put this in as part of changelog of patch with subject "fuse: Take
inode lock for dax inode truncation"

"Another problem is, if we setup a mapping in fuse_iomap_begin(), and
file gets truncated and dax read/write happens, KVM currently hangs.
It tries to fault in a page which does not exist on host (file got
truncated). It probably requries fixing in KVM."

Not sure what should happen though when qemu receives SIGBUS in this
case.

Thanks
Vivek
* Vivek Goyal (vgoyal@redhat.com) wrote:
> On Thu, Dec 13, 2018 at 03:40:52PM -0500, Vivek Goyal wrote:
> > On Thu, Dec 13, 2018 at 12:15:51PM -0800, Dan Williams wrote:
> > > On Thu, Dec 13, 2018 at 12:09 PM Dr. David Alan Gilbert
> > > <dgilbert@redhat.com> wrote:
> > > >
> > > > * Dan Williams (dan.j.williams@intel.com) wrote:
> > > > > On Mon, Dec 10, 2018 at 9:22 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> > > > > >
> > > > > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > >
> > > > > > Experimental QEMU code introduces an MMIO BAR for mapping portions of
> > > > > > files in the virtio-fs device. Map this BAR so that FUSE DAX can access
> > > > > > file contents from the host page cache.
> > > > >
> > > > > FUSE DAX sounds terrifying, can you explain a bit more about what this is?
> > > >
> > > > We've got a guest running in QEMU, it sees an emulated PCI device;
> > > > that runs a FUSE protocol over virtio on that PCI device, but also has
> > > > a trick where via commands sent over the virtio queue associated with that device,
> > > > (fragments of) host files get mmap'd into the qemu virtual memory that corresponds
> > > > to the kvm slot exposed to the guest for that bar.
> > > >
> > > > The guest sees those chunks in that BAR, and thus you can read/write
> > > > to the host file by directly writing into that BAR.
> > >
> > > Ok so it's all software emulated and there won't be hardware DMA
> > > initiated by the guest to that address?
> >
> > That's my understanding.
> >
> > > I.e. if the host file gets
> > > truncated / hole-punched the guest would just cause a refault and the
> > > filesystem could fill in the block,
> >
> > Right
> >
> > > or the guest is expected to die if
> > > the fault to the truncated file range results in SIGBUS.
> >
> > Are you referring to the case where a file page is mapped in qemu and
> > another guest/process trucates that page and when qemu tries to access it it
> > will get SIGBUS. Have not tried it, will give it a try. Not sure what
> > happens when QEMU receives SIGBUS.
> >
> > Having said that, this is not different from the case of one process
> > mapping a file and another process truncating the file and first process
> > getting SIGBUS, right?
>
> Ok, tried this and guest process hangs.
>
> Stefan, dgilbert, this reminds me that we have faced this issue during
> our testing and we decided that this will need some fixing in KVM. I
> even put this in as part of changelog of patch with subject "fuse: Take
> inode lock for dax inode truncation"
>
> "Another problem is, if we setup a mapping in fuse_iomap_begin(), and
> file gets truncated and dax read/write happens, KVM currently hangs.
> It tries to fault in a page which does not exist on host (file got
> truncated). It probably requries fixing in KVM."
>
> Not sure what should happen though when qemu receives SIGBUS in this
> case.

Yes, and I noted it in the TODO in my qemu patch posting.
We need to figure out what we want the guest to see in this case
and figure out how to make QEMU/kvm fix it up so that the guest
doesn't see anything odd.

Dave

> Thanks
> Vivek
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index ba615ec2603e..87b7e42a6763 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -6,12 +6,18 @@
 
 #include <linux/fs.h>
 #include <linux/dax.h>
+#include <linux/pci.h>
 #include <linux/pfn_t.h>
 #include <linux/module.h>
 #include <linux/virtio.h>
 #include <linux/virtio_fs.h>
 #include "fuse_i.h"
 
+enum {
+	/* PCI BAR number of the virtio-fs DAX window */
+	VIRTIO_FS_WINDOW_BAR = 2,
+};
+
 /* List of virtio-fs device instances and a lock for the list */
 static DEFINE_MUTEX(virtio_fs_mutex);
 static LIST_HEAD(virtio_fs_instances);
@@ -24,6 +30,18 @@ struct virtio_fs_vq {
 	char name[24];
 } ____cacheline_aligned_in_smp;
 
+/* State needed for devm_memremap_pages(). This API is called on the
+ * underlying pci_dev instead of struct virtio_fs (layering violation). Since
+ * the memremap release function only gets called when the pci_dev is released,
+ * keep the associated state separate from struct virtio_fs (it has a different
+ * lifecycle from pci_dev).
+ */
+struct virtio_fs_memremap_info {
+	struct dev_pagemap pgmap;
+	struct percpu_ref ref;
+	struct completion completion;
+};
+
 /* A virtio-fs device instance */
 struct virtio_fs {
 	struct list_head list; /* on virtio_fs_instances */
@@ -36,6 +54,7 @@ struct virtio_fs {
 	/* DAX memory window where file contents are mapped */
 	void *window_kaddr;
 	phys_addr_t window_phys_addr;
+	size_t window_len;
 };
 
 static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
@@ -395,6 +414,127 @@ static const struct dax_operations virtio_fs_dax_ops = {
 	.copy_to_iter = virtio_fs_copy_to_iter,
 };
 
+static void virtio_fs_percpu_release(struct percpu_ref *ref)
+{
+	struct virtio_fs_memremap_info *mi =
+		container_of(ref, struct virtio_fs_memremap_info, ref);
+
+	complete(&mi->completion);
+}
+
+static void virtio_fs_percpu_exit(void *data)
+{
+	struct virtio_fs_memremap_info *mi = data;
+
+	wait_for_completion(&mi->completion);
+	percpu_ref_exit(&mi->ref);
+}
+
+static void virtio_fs_percpu_kill(void *data)
+{
+	percpu_ref_kill(data);
+}
+
+static void virtio_fs_cleanup_dax(void *data)
+{
+	struct virtio_fs *fs = data;
+
+	kill_dax(fs->dax_dev);
+	put_dax(fs->dax_dev);
+}
+
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+	struct virtio_fs_memremap_info *mi;
+	struct dev_pagemap *pgmap;
+	struct pci_dev *pci_dev;
+	phys_addr_t phys_addr;
+	size_t len;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
+		return 0;
+
+	/* HACK implement VIRTIO shared memory regions instead of
+	 * directly accessing the PCI BAR from a virtio device driver.
+	 */
+	pci_dev = container_of(vdev->dev.parent, struct pci_dev, dev);
+
+	/* TODO Is this safe - the virtio_pci_* driver doesn't use managed
+	 * device APIs? */
+	ret = pcim_enable_device(pci_dev);
+	if (ret < 0)
+		return ret;
+
+	/* TODO handle case where device doesn't expose BAR? */
+	ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
+				 "virtio-fs-window");
+	if (ret < 0) {
+		dev_err(&vdev->dev, "%s: failed to request window BAR\n",
+			__func__);
+		return ret;
+	}
+
+	phys_addr = pci_resource_start(pci_dev, VIRTIO_FS_WINDOW_BAR);
+	len = pci_resource_len(pci_dev, VIRTIO_FS_WINDOW_BAR);
+
+	mi = devm_kzalloc(&pci_dev->dev, sizeof(*mi), GFP_KERNEL);
+	if (!mi)
+		return -ENOMEM;
+
+	init_completion(&mi->completion);
+	ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0,
+			      GFP_KERNEL);
+	if (ret < 0) {
+		dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	ret = devm_add_action(&pci_dev->dev, virtio_fs_percpu_exit, mi);
+	if (ret < 0) {
+		percpu_ref_exit(&mi->ref);
+		return ret;
+	}
+
+	pgmap = &mi->pgmap;
+	pgmap->altmap_valid = false;
+	pgmap->ref = &mi->ref;
+	pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+	/* Ideally we would directly use the PCI BAR resource but
+	 * devm_memremap_pages() wants its own copy in pgmap. So
+	 * initialize a struct resource from scratch (only the start
+	 * and end fields will be used).
+	 */
+	pgmap->res = (struct resource){
+		.name = "virtio-fs dax window",
+		.start = phys_addr,
+		.end = phys_addr + len,
+	};
+
+	fs->window_kaddr = devm_memremap_pages(&pci_dev->dev, pgmap);
+	if (IS_ERR(fs->window_kaddr))
+		return PTR_ERR(fs->window_kaddr);
+
+	ret = devm_add_action_or_reset(&pci_dev->dev, virtio_fs_percpu_kill,
+				       &mi->ref);
+	if (ret < 0)
+		return ret;
+
+	fs->window_phys_addr = phys_addr;
+	fs->window_len = len;
+
+	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
+		__func__, fs->window_kaddr, phys_addr, len);
+
+	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
+	if (!fs->dax_dev)
+		return -ENOMEM;
+
+	return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs);
+}
+
 static int virtio_fs_probe(struct virtio_device *vdev)
 {
 	struct virtio_fs *fs;
@@ -416,16 +556,9 @@ static int virtio_fs_probe(struct virtio_device *vdev)
 	/* TODO vq affinity */
 	/* TODO populate notifications vq */
 
-	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
-		/* TODO map window */
-		fs->window_kaddr = NULL;
-		fs->window_phys_addr = 0;
-
-		fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
-		if (!fs->dax_dev)
-			goto out_vqs; /* TODO handle case where device doesn't expose
-					 BAR */
-	}
+	ret = virtio_fs_setup_dax(vdev, fs);
+	if (ret < 0)
+		goto out_vqs;
 
 	/* Bring the device online in case the filesystem is mounted and
 	 * requests need to be sent before we return.
@@ -441,13 +574,6 @@
 out_vqs:
 	vdev->config->reset(vdev);
 	virtio_fs_cleanup_vqs(vdev, fs);
-
-	if (fs->dax_dev) {
-		kill_dax(fs->dax_dev);
-		put_dax(fs->dax_dev);
-		fs->dax_dev = NULL;
-	}
-
 out:
 	vdev->priv = NULL;
 	return ret;
@@ -466,12 +592,6 @@ static void virtio_fs_remove(struct virtio_device *vdev)
 	list_del(&fs->list);
 	mutex_unlock(&virtio_fs_mutex);
 
-	if (fs->dax_dev) {
-		kill_dax(fs->dax_dev);
-		put_dax(fs->dax_dev);
-		fs->dax_dev = NULL;
-	}
-
 	vdev->priv = NULL;
 }
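For context on how the window set up by virtio_fs_setup_dax() gets consumed: fs/dax.c reaches it through the device's dax_operations (the hunk above only shows the copy_to_iter member). A minimal sketch of what the direct_access callback could look like, given the window_kaddr, window_phys_addr and window_len fields initialized above — illustrative only, the callback actually posted in this series may differ:

static long virtio_fs_dax_direct_access(struct dax_device *dax_dev,
					pgoff_t pgoff, long nr_pages,
					void **kaddr, pfn_t *pfn)
{
	/* Private data is the struct virtio_fs passed to alloc_dax() */
	struct virtio_fs *fs = dax_get_private(dax_dev);
	phys_addr_t offset = PFN_PHYS(pgoff);
	size_t max_nr_pages = (fs->window_len - offset) / PAGE_SIZE;

	if (kaddr)
		*kaddr = fs->window_kaddr + offset;
	if (pfn)
		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
				     PFN_DEV | PFN_MAP);
	return min_t(long, nr_pages, max_nr_pages);
}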