Message ID | 20240731155352.3973857-1-kbusch@meta.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [rfc] vfio-pci: Allow write combining | expand |
On Wed, Jul 31, 2024 at 08:53:52AM -0700, Keith Busch wrote: > From: Keith Busch <kbusch@kernel.org> > > Write combining can be provide performance improvement for places that > can safely use this capability. > > Previous discussions on the topic suggest a vfio user needs to > explicitly request such a mapping, and it sounds like a new vfio > specific ioctl to request this is one way recommended way to do that. > This patch implements a new ioctl to achieve that so a user can request > write combining on prefetchable memory. A new ioctl seems a bit much for > just this purpose, so the implementation here provides a "flags" field > with only the write combine option defined. The rest of the bits are > reserved for future use. This is a neat hack for sure But how about adding this flag to vfio_region_info ? @@ -275,6 +289,7 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ +#define VFIO_REGION_INFO_REQ_WC (1 << 4) /* Request a write combining mapping*/ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ __aligned_u64 size; /* Region size (bytes) */ It specify REQ_WC when calling VFIO_DEVICE_GET_REGION_INFO The kernel will then return an offset value that yields a WC mapping. It doesn't displace the normal non-WC mapping? Arguably we should fixup the kernel to put the mmap cookies into a maple tree so they can be dynamically allocated and more densely packed. Jason
On Thu, 1 Aug 2024 11:19:14 -0300 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Wed, Jul 31, 2024 at 08:53:52AM -0700, Keith Busch wrote: > > From: Keith Busch <kbusch@kernel.org> > > > > Write combining can be provide performance improvement for places that > > can safely use this capability. > > > > Previous discussions on the topic suggest a vfio user needs to > > explicitly request such a mapping, and it sounds like a new vfio > > specific ioctl to request this is one way recommended way to do that. > > This patch implements a new ioctl to achieve that so a user can request > > write combining on prefetchable memory. A new ioctl seems a bit much for > > just this purpose, so the implementation here provides a "flags" field > > with only the write combine option defined. The rest of the bits are > > reserved for future use. > > This is a neat hack for sure > > But how about adding this flag to vfio_region_info ? > > @@ -275,6 +289,7 @@ struct vfio_region_info { > #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ > #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ > #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ > +#define VFIO_REGION_INFO_REQ_WC (1 << 4) /* Request a write combining mapping*/ > __u32 index; /* Region index */ > __u32 cap_offset; /* Offset within info struct of first cap */ > __aligned_u64 size; /* Region size (bytes) */ > > > It specify REQ_WC when calling VFIO_DEVICE_GET_REGION_INFO > > The kernel will then return an offset value that yields a WC > mapping. It doesn't displace the normal non-WC mapping? > > Arguably we should fixup the kernel to put the mmap cookies into a > maple tree so they can be dynamically allocated and more densely > packed. vfio_region_info.flags in not currently tested for input therefore this proposal could lead to unexpected behavior for a caller that doesn't currently zero this field. It's intended as an output-only field. Thanks, Alex
On Thu, Aug 01, 2024 at 09:41:23AM -0600, Alex Williamson wrote: > On Thu, 1 Aug 2024 11:19:14 -0300 > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > On Wed, Jul 31, 2024 at 08:53:52AM -0700, Keith Busch wrote: > > > From: Keith Busch <kbusch@kernel.org> > > > > > > Write combining can be provide performance improvement for places that > > > can safely use this capability. > > > > > > Previous discussions on the topic suggest a vfio user needs to > > > explicitly request such a mapping, and it sounds like a new vfio > > > specific ioctl to request this is one way recommended way to do that. > > > This patch implements a new ioctl to achieve that so a user can request > > > write combining on prefetchable memory. A new ioctl seems a bit much for > > > just this purpose, so the implementation here provides a "flags" field > > > with only the write combine option defined. The rest of the bits are > > > reserved for future use. > > > > This is a neat hack for sure > > > > But how about adding this flag to vfio_region_info ? > > > > @@ -275,6 +289,7 @@ struct vfio_region_info { > > #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ > > #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ > > #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ > > +#define VFIO_REGION_INFO_REQ_WC (1 << 4) /* Request a write combining mapping*/ > > __u32 index; /* Region index */ > > __u32 cap_offset; /* Offset within info struct of first cap */ > > __aligned_u64 size; /* Region size (bytes) */ > > > > > > It specify REQ_WC when calling VFIO_DEVICE_GET_REGION_INFO > > > > The kernel will then return an offset value that yields a WC > > mapping. It doesn't displace the normal non-WC mapping? > > > > Arguably we should fixup the kernel to put the mmap cookies into a > > maple tree so they can be dynamically allocated and more densely > > packed. > > vfio_region_info.flags in not currently tested for input therefore this > proposal could lead to unexpected behavior for a caller that doesn't > currently zero this field. It's intended as an output-only field. Perhaps a REGION_INFO2 then? I still think per-request is better than a global flag Jason
On Thu, 1 Aug 2024 13:11:30 -0300 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Thu, Aug 01, 2024 at 09:41:23AM -0600, Alex Williamson wrote: > > On Thu, 1 Aug 2024 11:19:14 -0300 > > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > On Wed, Jul 31, 2024 at 08:53:52AM -0700, Keith Busch wrote: > > > > From: Keith Busch <kbusch@kernel.org> > > > > > > > > Write combining can be provide performance improvement for places that > > > > can safely use this capability. > > > > > > > > Previous discussions on the topic suggest a vfio user needs to > > > > explicitly request such a mapping, and it sounds like a new vfio > > > > specific ioctl to request this is one way recommended way to do that. > > > > This patch implements a new ioctl to achieve that so a user can request > > > > write combining on prefetchable memory. A new ioctl seems a bit much for > > > > just this purpose, so the implementation here provides a "flags" field > > > > with only the write combine option defined. The rest of the bits are > > > > reserved for future use. > > > > > > This is a neat hack for sure > > > > > > But how about adding this flag to vfio_region_info ? > > > > > > @@ -275,6 +289,7 @@ struct vfio_region_info { > > > #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ > > > #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ > > > #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ > > > +#define VFIO_REGION_INFO_REQ_WC (1 << 4) /* Request a write combining mapping*/ > > > __u32 index; /* Region index */ > > > __u32 cap_offset; /* Offset within info struct of first cap */ > > > __aligned_u64 size; /* Region size (bytes) */ > > > > > > > > > It specify REQ_WC when calling VFIO_DEVICE_GET_REGION_INFO > > > > > > The kernel will then return an offset value that yields a WC > > > mapping. It doesn't displace the normal non-WC mapping? > > > > > > Arguably we should fixup the kernel to put the mmap cookies into a > > > maple tree so they can be dynamically allocated and more densely > > > packed. > > > > vfio_region_info.flags in not currently tested for input therefore this > > proposal could lead to unexpected behavior for a caller that doesn't > > currently zero this field. It's intended as an output-only field. > > Perhaps a REGION_INFO2 then? > > I still think per-request is better than a global flag I don't understand why we'd need a REGION_INFO2, we already have support for defining new regions. We do this by increasing the num_regions value from the VFIO_DEVICE_GET_INFO ioctl. The user can iterate those additional regions and for each index call VFIO_DEVICE_GET_REGION_INFO. The new regions expose a VFIO_REGION_INFO_CAP_TYPE capability where we define new types as: #define VFIO_REGION_TYPE_PCI_BAR0_WC (4) #define VFIO_REGION_TYPE_PCI_BAR1_WC (5) #define VFIO_REGION_TYPE_PCI_BAR2_WC (6) #define VFIO_REGION_TYPE_PCI_BAR3_WC (7) #define VFIO_REGION_TYPE_PCI_BAR4_WC (8) #define VFIO_REGION_TYPE_PCI_BAR5_WC (9) We'd populate these new regions only for BARs that support prefetch and mmap and we'd define that these BARs may expose only the MMAP flag and not the READ or WRITE flags since those can go through the standard region. Really the only difference from the static vfio-pci defined region indexes is the O(N) search for the user to find the vfio region index to BAR mapping. Thanks, Alex
On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote: > > > vfio_region_info.flags in not currently tested for input therefore this > > > proposal could lead to unexpected behavior for a caller that doesn't > > > currently zero this field. It's intended as an output-only field. > > > > Perhaps a REGION_INFO2 then? > > > > I still think per-request is better than a global flag > > I don't understand why we'd need a REGION_INFO2, we already have > support for defining new regions. It is not a new region, it is a modified mmap behavior for an existing region. > We'd populate these new regions only for BARs that support prefetch and > mmap That's not the point, prefetch has nothing to do with write combining. Every BAR can be mapped writecombining, it is up to the VFIO userspace to understand if it can use it or not. The only use case for this feature would be something like in DPDK. VM side write combining is already handled by KVM allowing the VM to upgrade the page attributes to WC from NC. Doubling all the region indexes just for WC does not seem like a good idea to me... Jason
On Thu, 1 Aug 2024 14:13:55 -0300 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote: > > > > vfio_region_info.flags in not currently tested for input therefore this > > > > proposal could lead to unexpected behavior for a caller that doesn't > > > > currently zero this field. It's intended as an output-only field. > > > > > > Perhaps a REGION_INFO2 then? > > > > > > I still think per-request is better than a global flag > > > > I don't understand why we'd need a REGION_INFO2, we already have > > support for defining new regions. > > It is not a new region, it is a modified mmap behavior for an existing > region. If we're returning a different offset into the vfio device file from which to get a WC mapping, what's the difference? A vfio "region" is describing a region or range of the vfio device file descriptor. Region indexes that map into the same device resource are not fundamentally incompatible AFAICT, but it does mean that zapping user access is not a nice contiguous single range. > > We'd populate these new regions only for BARs that support prefetch and > > mmap > > That's not the point, prefetch has nothing to do with write combining. I was following the original proposal in this thread that added a prefetch flag to REGION_INFO and allowed enabling WC only for IORESOURCE_PREFETCH. > Every BAR can be mapped writecombining, it is up to the VFIO userspace > to understand if it can use it or not. The only use case for this > feature would be something like in DPDK. > > VM side write combining is already handled by KVM allowing the VM to > upgrade the page attributes to WC from NC. > > Doubling all the region indexes just for WC does not seem like a good > idea to me... Is the difference you see that in the REQ_WC proposal the user is effectively asking vfio to pop a WC region into existence vs here they're pre-populated? At the limit they're the same. We could use a DEVICE_FEATURE to ask vfio to selectively populate WC regions after which the user could re-enumerate additional regions, or in fact to switch on WC for a given region if we want to go that route. Thanks, Alex
On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote: > On Thu, 1 Aug 2024 14:13:55 -0300 > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote: > > > > > vfio_region_info.flags in not currently tested for input therefore this > > > > > proposal could lead to unexpected behavior for a caller that doesn't > > > > > currently zero this field. It's intended as an output-only field. > > > > > > > > Perhaps a REGION_INFO2 then? > > > > > > > > I still think per-request is better than a global flag > > > > > > I don't understand why we'd need a REGION_INFO2, we already have > > > support for defining new regions. > > > > It is not a new region, it is a modified mmap behavior for an existing > > region. > > If we're returning a different offset into the vfio device file from > which to get a WC mapping, what's the difference? I think it is a pretty big difference.. The offset is just a "mmap cookie", it doesn't have to be 1:1 with the idea of a region. > A vfio "region" is > describing a region or range of the vfio device file descriptor. I'm thinking a region is describing an area of memory that is available in the VFIO device. The offset output is just a "mmap cookie" to tell userspace how to mmap it. Having N mmap cookies for 1 region is OK. > > > We'd populate these new regions only for BARs that support prefetch and > > > mmap > > > > That's not the point, prefetch has nothing to do with write combining. > > I was following the original proposal in this thread that added a > prefetch flag to REGION_INFO and allowed enabling WC only for > IORESOURCE_PREFETCH. Oh, I didn't notice that, it shouldn't do that. Returning the VFIO_REGION_FLAG_WRITE_COMBINE makes sense, but it shouldn't effect what the kernel allows. > > Doubling all the region indexes just for WC does not seem like a good > > idea to me... > > Is the difference you see that in the REQ_WC proposal the user is > effectively asking vfio to pop a WC region into existence vs here > they're pre-populated? ?? This didn't create more regions AFAICT. It created a new global + bool bar_write_combine[PCI_STD_NUM_BARS]; Which controls what NC/WC the mmap creates when called: + if (vdev->bar_write_combine[index]) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); You get the same output from REGION_INFO, same number of regions. It was the other proposal from long ago that created more regions. This is what I like and would prefer to stick with. REGION_INFO doesn't really change, we don't have two regions refering to the same physical memory, and we find some way to request NC/WC of a region at mmap time. A global is a neat trick, but it would be cleaner to request properties of the mmap when the "mmap cookie" is obtained. > At the limit they're the same. We could use a > DEVICE_FEATURE to ask vfio to selectively populate WC regions after > which the user could re-enumerate additional regions, or in fact to > switch on WC for a given region if we want to go that route. Thanks, This is still adding more regions and reporting more stuff from REGION_INFO, that is what I would like to avoid. Jason
On Thu, 1 Aug 2024 14:53:39 -0300 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote: > > On Thu, 1 Aug 2024 14:13:55 -0300 > > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote: > > > > > > vfio_region_info.flags in not currently tested for input therefore this > > > > > > proposal could lead to unexpected behavior for a caller that doesn't > > > > > > currently zero this field. It's intended as an output-only field. > > > > > > > > > > Perhaps a REGION_INFO2 then? > > > > > > > > > > I still think per-request is better than a global flag > > > > > > > > I don't understand why we'd need a REGION_INFO2, we already have > > > > support for defining new regions. > > > > > > It is not a new region, it is a modified mmap behavior for an existing > > > region. > > > > If we're returning a different offset into the vfio device file from > > which to get a WC mapping, what's the difference? > > I think it is a pretty big difference.. The offset is just a "mmap > cookie", it doesn't have to be 1:1 with the idea of a region. > > > A vfio "region" is > > describing a region or range of the vfio device file descriptor. > > I'm thinking a region is describing an area of memory that is > available in the VFIO device. The offset output is just a "mmap > cookie" to tell userspace how to mmap it. Having N mmap cookies for 1 > region is OK. Is an "mmap cookie" an offset into the vfio device file where mmap'ing that offset results in a WC mapping to a specific device resource? Isn't that just a region that doesn't have an index or supporting infrastructure? > > > > We'd populate these new regions only for BARs that support prefetch and > > > > mmap > > > > > > That's not the point, prefetch has nothing to do with write combining. > > > > I was following the original proposal in this thread that added a > > prefetch flag to REGION_INFO and allowed enabling WC only for > > IORESOURCE_PREFETCH. > > Oh, I didn't notice that, it shouldn't do that. Returning the > VFIO_REGION_FLAG_WRITE_COMBINE makes sense, but it shouldn't effect > what the kernel allows. > > > > Doubling all the region indexes just for WC does not seem like a good > > > idea to me... > > > > Is the difference you see that in the REQ_WC proposal the user is > > effectively asking vfio to pop a WC region into existence vs here > > they're pre-populated? > > ?? This didn't create more regions AFAICT. It created a new global > > + bool bar_write_combine[PCI_STD_NUM_BARS]; > > Which controls what NC/WC the mmap creates when called: > > + if (vdev->bar_write_combine[index]) > + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); > + else > + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); > > You get the same output from REGION_INFO, same number of regions. It was your proposal that introduced REQ_WC, this is Keith's original proposal. I'm equating a REQ_WC request inventing an "mmap cookie" as effectively the same as bringing a lightweight region into existence because it defines a section of the vfio device file to have specific mmap semantics. > It was the other proposal from long ago that created more regions. > > This is what I like and would prefer to stick with. REGION_INFO > doesn't really change, we don't have two regions refering to the same > physical memory, and we find some way to request NC/WC of a region at > mmap time. "At mmap time" means that something in the vma needs to describe to us to use the WC semantics, where I think you're proposing that the "mmap cookie" provides a specific vm_pgoff which we already use to determine the region index. So whether or not we want to call this a region, it's effectively in the same address space as regions. Therefore "mmap cookie" ~= "region offset". > A global is a neat trick, but it would be cleaner to request > properties of the mmap when the "mmap cookie" is obtained. > > > At the limit they're the same. We could use a > > DEVICE_FEATURE to ask vfio to selectively populate WC regions after > > which the user could re-enumerate additional regions, or in fact to > > switch on WC for a given region if we want to go that route. Thanks, > > This is still adding more regions and reporting more stuff from > REGION_INFO, that is what I would like to avoid. Why? This reminds me of hidden registers outside of capability chains in PCI config space. Thanks, Alex
On Thu, Aug 01, 2024 at 12:16:57PM -0600, Alex Williamson wrote: > On Thu, 1 Aug 2024 14:53:39 -0300 > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote: > > > On Thu, 1 Aug 2024 14:13:55 -0300 > > > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote: > > > > > > > vfio_region_info.flags in not currently tested for input therefore this > > > > > > > proposal could lead to unexpected behavior for a caller that doesn't > > > > > > > currently zero this field. It's intended as an output-only field. > > > > > > > > > > > > Perhaps a REGION_INFO2 then? > > > > > > > > > > > > I still think per-request is better than a global flag > > > > > > > > > > I don't understand why we'd need a REGION_INFO2, we already have > > > > > support for defining new regions. > > > > > > > > It is not a new region, it is a modified mmap behavior for an existing > > > > region. > > > > > > If we're returning a different offset into the vfio device file from > > > which to get a WC mapping, what's the difference? > > > > I think it is a pretty big difference.. The offset is just a "mmap > > cookie", it doesn't have to be 1:1 with the idea of a region. > > > > > A vfio "region" is > > > describing a region or range of the vfio device file descriptor. > > > > I'm thinking a region is describing an area of memory that is > > available in the VFIO device. The offset output is just a "mmap > > cookie" to tell userspace how to mmap it. Having N mmap cookies for 1 > > region is OK. > > Is an "mmap cookie" an offset into the vfio device file where mmap'ing > that offset results in a WC mapping to a specific device resource? Yes > Isn't that just a region that doesn't have an index or supporting > infrastructure? No? It is a "mmap cookie" that has the requested mmap-time properties. Today the kernel side binds the mmap offset to the index in a computed way, but from a API perspective userspace does REGION_INFO and gets back an opaque "mmap cookie" that it uses to pass to mmap to get back the thing describe by REGION_INFO. Userspace has no idea about any structure to the cookie numbers. > > Which controls what NC/WC the mmap creates when called: > > > > + if (vdev->bar_write_combine[index]) > > + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); > > + else > > + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); > > > > You get the same output from REGION_INFO, same number of regions. > > It was your proposal that introduced REQ_WC, this is Keith's original > proposal. I'm equating a REQ_WC request inventing an "mmap cookie" as > effectively the same as bringing a lightweight region into existence > because it defines a section of the vfio device file to have specific > mmap semantics. Well, again, it is not a region, it is just a record that this mmap cookie uses X region with Y mapping flags. The number of regions don't change. Critically from a driver perspective the number of regions and region indexes wouldn't change. I am not thing of making a new region, I am thinking of adjusting how mmap works. Like today we do this: index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); So maybe instead it is something like: vfio_decode_mmap_cookie(vma, &index, &flags); if (flags & VFIO_MMAP_WC) prot = pgprot_writecombine(..) From a driver perspective the region indexes don't change at all. This is what I think is important here. > > It was the other proposal from long ago that created more regions. > > > > This is what I like and would prefer to stick with. REGION_INFO > > doesn't really change, we don't have two regions refering to the same > > physical memory, and we find some way to request NC/WC of a region at > > mmap time. > > "At mmap time" means that something in the vma needs to describe to us > to use the WC semantics, where I think you're proposing that the "mmap > cookie" provides a specific vm_pgoff which we already use to determine > the region index. Yes > So whether or not we want to call this a region, > it's effectively in the same address space as regions. Therefore "mmap > cookie" ~= "region offset". Well, that is just the current implementation. What we did in RDMA when we switched from hard coded mmap cookies to dynamic ones is use an xarray (today this should be a maple tree) to dynamically allocate mmap cookies whenever the driver returns something to userspace. During the mmap fop the pgoff is fed back through the maple tree to get the description of what the cookie represents. So the encoding of cookies is completely disjoint from whatever the underlying thing is. If you want the same region to be mapped with two or three different prot flags you just ask for two or three cookies and at mmap time you can recover the region pointer and the mmap flags. So VFIO could do several different things here to convay the mmap flags through the cookie, including somehow encoding it in a pgoff bit, or using a dynamic maple tree scheme. My point is to not confuse the pgoff encoding with the driver concept of a region. The region is a single peice of memory, the "mmap cookie"s are just handles to it. Adding more data to the handle is not the same as adding more regions. > > > At the limit they're the same. We could use a > > > DEVICE_FEATURE to ask vfio to selectively populate WC regions after > > > which the user could re-enumerate additional regions, or in fact to > > > switch on WC for a given region if we want to go that route. Thanks, > > > > This is still adding more regions and reporting more stuff from > > REGION_INFO, that is what I would like to avoid. > > Why? This reminds me of hidden registers outside of capability chains > in PCI config space. Thanks, The mmap offsets are not (supposed to be) ABI in the VFIO ioctls. The encoding is entirely opaque inside the kernel already. Apps are supposed to use REGION_INFO to learn the value to pass to mmap. ie things like VFIO_PCI_OFFSET_SHIFT are not in the uAPI header. Jason
On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote: > On Thu, 1 Aug 2024 14:13:55 -0300 > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote: > > > > We'd populate these new regions only for BARs that support prefetch and > > > mmap > > > > That's not the point, prefetch has nothing to do with write combining. > > I was following the original proposal in this thread that added a > prefetch flag to REGION_INFO and allowed enabling WC only for > IORESOURCE_PREFETCH. Which itself follows the existing pattern from pci_create_resource_files(), which creates a write combine resource<X>_wc file only when IORESOURCE_PREFETCH is set. But yeah, prefetch isn't necessary for wc, but it seems to indicate it's safe.
On Fri, Aug 02, 2024 at 08:24:49AM -0600, Keith Busch wrote: > On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote: > > On Thu, 1 Aug 2024 14:13:55 -0300 > > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote: > > > > > > We'd populate these new regions only for BARs that support prefetch and > > > > mmap > > > > > > That's not the point, prefetch has nothing to do with write combining. > > > > I was following the original proposal in this thread that added a > > prefetch flag to REGION_INFO and allowed enabling WC only for > > IORESOURCE_PREFETCH. > > Which itself follows the existing pattern from > pci_create_resource_files(), which creates a write combine > resource<X>_wc file only when IORESOURCE_PREFETCH is set. But yeah, > prefetch isn't necessary for wc, but it seems to indicate it's safe. Yes, I know, that code isn't right either... It seems to be the root of this odd "prefetch and WC are related" idea. Jason
On Fri, 2 Aug 2024 08:53:08 -0300 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Thu, Aug 01, 2024 at 12:16:57PM -0600, Alex Williamson wrote: > > On Thu, 1 Aug 2024 14:53:39 -0300 > > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote: > > > > On Thu, 1 Aug 2024 14:13:55 -0300 > > > > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > > > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote: > > > > > > > > vfio_region_info.flags in not currently tested for input therefore this > > > > > > > > proposal could lead to unexpected behavior for a caller that doesn't > > > > > > > > currently zero this field. It's intended as an output-only field. > > > > > > > > > > > > > > Perhaps a REGION_INFO2 then? > > > > > > > > > > > > > > I still think per-request is better than a global flag > > > > > > > > > > > > I don't understand why we'd need a REGION_INFO2, we already have > > > > > > support for defining new regions. > > > > > > > > > > It is not a new region, it is a modified mmap behavior for an existing > > > > > region. > > > > > > > > If we're returning a different offset into the vfio device file from > > > > which to get a WC mapping, what's the difference? > > > > > > I think it is a pretty big difference.. The offset is just a "mmap > > > cookie", it doesn't have to be 1:1 with the idea of a region. > > > > > > > A vfio "region" is > > > > describing a region or range of the vfio device file descriptor. > > > > > > I'm thinking a region is describing an area of memory that is > > > available in the VFIO device. The offset output is just a "mmap > > > cookie" to tell userspace how to mmap it. Having N mmap cookies for 1 > > > region is OK. > > > > Is an "mmap cookie" an offset into the vfio device file where mmap'ing > > that offset results in a WC mapping to a specific device resource? > > Yes > > > Isn't that just a region that doesn't have an index or supporting > > infrastructure? > > No? It is a "mmap cookie" that has the requested mmap-time properties. > > Today the kernel side binds the mmap offset to the index in a computed > way, but from a API perspective userspace does REGION_INFO and gets > back an opaque "mmap cookie" that it uses to pass to mmap to get back > the thing describe by REGION_INFO. Userspace has no idea about any > structure to the cookie numbers. REGION_INFO is part of what I'm referring to as infrastructure. Userspace knows a number of regions, ie. indexes, via DEVICE_INFO and iterates those regions using REGION_INFO to get the mmap cookie, ie. file offset. The ABI is flexible here, the first few region indexes are static mappings to vfio-pci specific device resources and additional mappings are described as device specific resources using the REGION_INFO capability chain. > > > Which controls what NC/WC the mmap creates when called: > > > > > > + if (vdev->bar_write_combine[index]) > > > + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); > > > + else > > > + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); > > > > > > You get the same output from REGION_INFO, same number of regions. > > > > It was your proposal that introduced REQ_WC, this is Keith's original > > proposal. I'm equating a REQ_WC request inventing an "mmap cookie" as > > effectively the same as bringing a lightweight region into existence > > because it defines a section of the vfio device file to have specific > > mmap semantics. > > Well, again, it is not a region, it is just a record that this mmap > cookie uses X region with Y mapping flags. The number of regions don't > change. Critically from a driver perspective the number of regions and > region indexes wouldn't change. Why is this critical? As above, for vfio-pci devices the first few region indexes are static mappings to specific PCI resources and additional resources beyond VFIO_PCI_NUM_REGIONS are device specific and described by a region type in the capability chain. None of the static regions move and it's part of the defined ABI for userspace to iterate additional regions. > I am not thing of making a new region, I am thinking of adjusting how > mmap works. Like today we do this: > > index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); > > So maybe instead it is something like: > > vfio_decode_mmap_cookie(vma, &index, &flags); > if (flags & VFIO_MMAP_WC) > prot = pgprot_writecombine(..) > > From a driver perspective the region indexes don't change at all. This > is what I think is important here. The only thing that changes is that num_regions increases and the user can go check REGION_INFO on those newly reported regions. The way we currently calculate the index from the vm_pgoff is implementation, not uAPI. We can change that if necessary. Userspace should always get the offset, ie. mmap cookie, from REGION_INFO and the initial set of region indexes to device resources are fixed, they will not change. > > > It was the other proposal from long ago that created more regions. > > > > > > This is what I like and would prefer to stick with. REGION_INFO > > > doesn't really change, we don't have two regions refering to the same > > > physical memory, and we find some way to request NC/WC of a region at > > > mmap time. > > > > "At mmap time" means that something in the vma needs to describe to us > > to use the WC semantics, where I think you're proposing that the "mmap > > cookie" provides a specific vm_pgoff which we already use to determine > > the region index. > > Yes > > > So whether or not we want to call this a region, > > it's effectively in the same address space as regions. Therefore "mmap > > cookie" ~= "region offset". > > Well, that is just the current implementation. What we did in RDMA > when we switched from hard coded mmap cookies to dynamic ones is > use an xarray (today this should be a maple tree) to dynamically > allocate mmap cookies whenever the driver returns something to > userspace. During the mmap fop the pgoff is fed back through the maple > tree to get the description of what the cookie represents. Sure, we could do that too, the current implementation (not uAPI) just uses some upper bits to create fixed region address spaces. The only thing we should need to keep consistent is the mapping of indexes to device resources up through VFIO_PCI_NUM_REGIONS. > So the encoding of cookies is completely disjoint from whatever the > underlying thing is. If you want the same region to be mapped with two > or three different prot flags you just ask for two or three cookies > and at mmap time you can recover the region pointer and the mmap > flags. > > So VFIO could do several different things here to convay the mmap > flags through the cookie, including somehow encoding it in a pgoff > bit, or using a dynamic maple tree scheme. > > My point is to not confuse the pgoff encoding with the driver concept > of a region. The region is a single peice of memory, the "mmap cookie"s > are just handles to it. Adding more data to the handle is not the same > as adding more regions. I don't get it. Take for instance PCI config space. Given the right GPU, I can get to config space through an I/O port region, an MMIO region (possibly multiple ways), and the config space region itself. Therefore based on this hardware implementation there is no unique mapping that says that config space is uniquely accessible via a single region. Each of these regions has different semantics. If the layout of the device can cause this, why do we restrict ourselves that a given BAR can only be accessed via a signle region and we need to play games with terminology to call it an mmap cookie rather than officially creating a region with WC mmap semantics? > > > > At the limit they're the same. We could use a > > > > DEVICE_FEATURE to ask vfio to selectively populate WC regions after > > > > which the user could re-enumerate additional regions, or in fact to > > > > switch on WC for a given region if we want to go that route. Thanks, > > > > > > This is still adding more regions and reporting more stuff from > > > REGION_INFO, that is what I would like to avoid. > > > > Why? This reminds me of hidden registers outside of capability chains > > in PCI config space. Thanks, > > The mmap offsets are not (supposed to be) ABI in the VFIO ioctls. The > encoding is entirely opaque inside the kernel already. Apps are > supposed to use REGION_INFO to learn the value to pass to mmap. ie > things like VFIO_PCI_OFFSET_SHIFT are not in the uAPI header. Exactly, which gives us the flexibility to add new regions as necessary. The mechanism by which userspace iterates regions is already provided in the uABI. The initial set of static vfio-pci regions require fixed indexes, but not fixed offsets. VFIO_PCI_OFFSET_SHIFT is only an internal implementation detail. Thanks Alex
> From: Jason Gunthorpe <jgg@ziepe.ca> > Sent: Friday, August 2, 2024 10:33 PM > > On Fri, Aug 02, 2024 at 08:24:49AM -0600, Keith Busch wrote: > > On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote: > > > On Thu, 1 Aug 2024 14:13:55 -0300 > > > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote: > > > > > > > > We'd populate these new regions only for BARs that support prefetch > and > > > > > mmap > > > > > > > > That's not the point, prefetch has nothing to do with write combining. > > > > > > I was following the original proposal in this thread that added a > > > prefetch flag to REGION_INFO and allowed enabling WC only for > > > IORESOURCE_PREFETCH. > > > > Which itself follows the existing pattern from > > pci_create_resource_files(), which creates a write combine > > resource<X>_wc file only when IORESOURCE_PREFETCH is set. But yeah, > > prefetch isn't necessary for wc, but it seems to indicate it's safe. > > Yes, I know, that code isn't right either... It seems to be the root > of this odd "prefetch and WC are related" idea. > According to PCIe spec: " Bit 3 should be set to 1b if the data is prefetchable and set to 0b otherwise. A Function is permitted to mark a range as prefetchable if there are no side effects on reads, the Function returns all bytes on reads regardless of the byte enables, and host bridges can merge processor writes into this range without causing errors. " Above kind of suggests that using WC on a non-prefetchable BAR may cause errors then "prefetch and WC are related" does make some sense?
On Tue, Aug 06, 2024 at 07:19:18AM +0000, Tian, Kevin wrote: > " > Bit 3 should be set to 1b if the data is prefetchable and set to 0b > otherwise. A Function is permitted to mark a range as prefetchable > if there are no side effects on reads, the Function returns all bytes > on reads regardless of the byte enables, and host bridges can > merge processor writes into this range without causing errors. > " > > Above kind of suggests that using WC on a non-prefetchable BAR > may cause errors then "prefetch and WC are related" does make > some sense? prefetch exists in the spec to support historical old pre-PCI-x environments where a bridge does all kinds of strange things. prefetch turns that brdige behavior on because otherwise it is non backwards compatible. In a modern PCIe environment the fabric is perfectly TLP preserving and the PCI spec concept if prefetch is entirely vestigial. There is no clean mapping of what PCI spec prefetch contemplates with how moderns CPUs actually work. They should never be comingled. Today we expect the driver to understand what TLPs the CPU should emit and do the correct thing. From a Linux programming model with modern HW we never really permit "merge process writes" or expect "speculative reads" on anything except explicit WC mappings. Jason
On Fri, Aug 02, 2024 at 11:05:06AM -0600, Alex Williamson wrote: > > Well, again, it is not a region, it is just a record that this mmap > > cookie uses X region with Y mapping flags. The number of regions don't > > change. Critically from a driver perspective the number of regions and > > region indexes wouldn't change. > > Why is this critical? So we don't leak this too much into the drivers? Why should all the VFIO drivers have to be changed to alter how their region indexes work just to add a single flag?? > > Well, that is just the current implementation. What we did in RDMA > > when we switched from hard coded mmap cookies to dynamic ones is > > use an xarray (today this should be a maple tree) to dynamically > > allocate mmap cookies whenever the driver returns something to > > userspace. During the mmap fop the pgoff is fed back through the maple > > tree to get the description of what the cookie represents. > > Sure, we could do that too, the current implementation (not uAPI) just > uses some upper bits to create fixed region address spaces. The only > thing we should need to keep consistent is the mapping of indexes to > device resources up through VFIO_PCI_NUM_REGIONS. I fear we might need to do this as there may not be room in the pgoff space (at least for 32 bit) to duplicate everything.... > > My point is to not confuse the pgoff encoding with the driver concept > > of a region. The region is a single peice of memory, the "mmap cookie"s > > are just handles to it. Adding more data to the handle is not the same > > as adding more regions. > > I don't get it. Take for instance PCI config space. Given the right > GPU, I can get to config space through an I/O port region, an MMIO > region (possibly multiple ways), and the config space region itself. > Therefore based on this hardware implementation there is no unique > mapping that says that config space is uniquely accessible via a single > region. That doesn't seem like this sitation. Those are multiple different HW paths with different HW addresses, sure they can have different regions. Here we are talking about the same HW path with the same HW addresses. It shouldn't be duplicated. > BAR can only be accessed via a signle region and we need to play games > with terminology to call it an mmap cookie rather than officially > creating a region with WC mmap semantics? Because if you keep adding more regions for what are attributes of a mapping we may end up with a combinatoral explosion of regions. I already know there is interest in doing non-cache/cache mapping attributes too. Approaching this as a fixed number of regions reflecting the HW addresses and a variable number of flags requested by the user is alot more reasonable than trying to have a list of every permutation of every address for every combination of flags. Jason
On Tue, 6 Aug 2024 13:53:12 -0300 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Fri, Aug 02, 2024 at 11:05:06AM -0600, Alex Williamson wrote: > > > > Well, again, it is not a region, it is just a record that this mmap > > > cookie uses X region with Y mapping flags. The number of regions don't > > > change. Critically from a driver perspective the number of regions and > > > region indexes wouldn't change. > > > > Why is this critical? > > So we don't leak this too much into the drivers? Why should all the > VFIO drivers have to be changed to alter how their region indexes work > just to add a single flag?? I don't know how you're coming to this conclusion. A driver that wants this new mapping flag needs to do something different but the existing use case is absolutely unchanged. Look for example at how the IGD code in vfio adds several device specific regions. This doesn't affect anything other than new code in userspace that wants to iterate these regions. It doesn't change the indexes of any of the statically defined regions. > > > Well, that is just the current implementation. What we did in RDMA > > > when we switched from hard coded mmap cookies to dynamic ones is > > > use an xarray (today this should be a maple tree) to dynamically > > > allocate mmap cookies whenever the driver returns something to > > > userspace. During the mmap fop the pgoff is fed back through the maple > > > tree to get the description of what the cookie represents. > > > > Sure, we could do that too, the current implementation (not uAPI) just > > uses some upper bits to create fixed region address spaces. The only > > thing we should need to keep consistent is the mapping of indexes to > > device resources up through VFIO_PCI_NUM_REGIONS. > > I fear we might need to do this as there may not be room in the pgoff > space (at least for 32 bit) to duplicate everything.... We'll root out userspace drivers that hard code region offsets in doing this, but otherwise it shouldn't be an issue. If the collateral is too large the standard regions can use the fixed offsets and device specific regions can use dynamic offsets. > > > My point is to not confuse the pgoff encoding with the driver concept > > > of a region. The region is a single peice of memory, the "mmap cookie"s > > > are just handles to it. Adding more data to the handle is not the same > > > as adding more regions. > > > > I don't get it. Take for instance PCI config space. Given the right > > GPU, I can get to config space through an I/O port region, an MMIO > > region (possibly multiple ways), and the config space region itself. > > Therefore based on this hardware implementation there is no unique > > mapping that says that config space is uniquely accessible via a single > > region. > > That doesn't seem like this sitation. Those are multiple different HW > paths with different HW addresses, sure they can have different > regions. > > Here we are talking about the same HW path with the same HW > addresses. It shouldn't be duplicated. How does an "mmap cookie" not duplicate that a device range is accessible through multiple offsets of the vfio device file? > > BAR can only be accessed via a signle region and we need to play games > > with terminology to call it an mmap cookie rather than officially > > creating a region with WC mmap semantics? > > Because if you keep adding more regions for what are attributes of a > mapping we may end up with a combinatoral explosion of regions. > > I already know there is interest in doing non-cache/cache mapping > attributes too. This sounds like variant driver space, we can't generically create cachable mappings to MMIO. vfio-nvgrace-gpu already does this, but they usurp the standard BAR region, there's no longer uncached access. > Approaching this as a fixed number of regions reflecting the HW > addresses and a variable number of flags requested by the user is alot > more reasonable than trying to have a list of every permutation of > every address for every combination of flags. Well first, we're not talking about a fixed number of additional regions, we're talking about defining region identifiers for each BAR with a WC mapping attribute, but at worst we'd only populate implemented MMIO BARs. But then we've also mentioned that a device feature could be used to allow a userspace driver to selectively bring these regions into existence. In an case, an mmap cookie also consumes address space from the vfio device file, so I'm still failing to see how calling them a region vs just an mmap cookie makes a substantive difference. Thanks, Alex
On Tue, Aug 06, 2024 at 12:43:02PM -0600, Alex Williamson wrote: > > So we don't leak this too much into the drivers? Why should all the > > VFIO drivers have to be changed to alter how their region indexes work > > just to add a single flag?? > I don't know how you're coming to this conclusion. Ideally we'd want to support the WC option basically everywhere. > > I fear we might need to do this as there may not be room in the pgoff > > space (at least for 32 bit) to duplicate everything.... > We'll root out userspace drivers that hard code region offsets in doing > this, but otherwise it shouldn't be an issue. The issue is running out of pgoff bits on 32 bit. Maybe this isn't an issue for VFIO, but it was for RDMA. We needed tight optimal on-demand packing of actual requested mmaps. Allocating gigabytes of address space for possible mmaps ran out of pgoff bits. :\ > How does an "mmap cookie" not duplicate that a device range is > accessible through multiple offsets of the vfio device file? pgoff duplcation is not really an issue, from an API perspective the driver would call a helper to convert the pgoff into a region index and mmap flags. It wouldn't matter to any driver how many duplicates there are. > Well first, we're not talking about a fixed number of additional > regions, we're talking about defining region identifiers for each BAR > with a WC mapping attribute, but at worst we'd only populate > implemented MMIO BARs. But then we've also mentioned that a device > feature could be used to allow a userspace driver to selectively bring > these regions into existence. In an case, an mmap cookie also consumes > address space from the vfio device file, so I'm still failing to see > how calling them a region vs just an mmap cookie makes a substantive > difference. You'd only allocate the mmap cookie when userspace requests it. My original suggestion was to send a flag to REGION_INFO to specifically ask for the different behavior, that (and only that) would return new mmap cookies. The alternative version of this might be to have a single 'GET_REGION_MMAP' that gives a new mmap cookie for a singular specified region index. Userspace would call REGION_INFO to learn the memory regions and then it could call GET_REGION_MMAP(REQ_WC) and will get back a single dynamic mmap cookie that links the WC flags. No system call, no cookie allocation. Existing apps don't start seeing more regions from REGION_INFO. Drivers keep region indexes 1:1 with HW objects. The uAPI has room to add more mmap flags. Jason
On Wed, 7 Aug 2024 11:19:10 -0300 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Tue, Aug 06, 2024 at 12:43:02PM -0600, Alex Williamson wrote: > > > > So we don't leak this too much into the drivers? Why should all the > > > VFIO drivers have to be changed to alter how their region indexes work > > > just to add a single flag?? > > > I don't know how you're coming to this conclusion. > > Ideally we'd want to support the WC option basically everywhere. > > > > I fear we might need to do this as there may not be room in the pgoff > > > space (at least for 32 bit) to duplicate everything.... > > > We'll root out userspace drivers that hard code region offsets in doing > > this, but otherwise it shouldn't be an issue. > > The issue is running out of pgoff bits on 32 bit. Maybe this isn't an > issue for VFIO, but it was for RDMA. We needed tight optimal on-demand > packing of actual requested mmaps. Allocating gigabytes of address > space for possible mmaps ran out of pgoff bits. :\ If we only implemented WC for 64-bit, would anyone notice? > > How does an "mmap cookie" not duplicate that a device range is > > accessible through multiple offsets of the vfio device file? > > pgoff duplcation is not really an issue, from an API perspective the > driver would call a helper to convert the pgoff into a region index > and mmap flags. It wouldn't matter to any driver how many duplicates > there are. Which is exactly my point whether we call it a region or an mmap cookie. In one case we're trying to give a bare pgoff that effectively aliases to a region with different mapping flags, in the other the pgoff is exposed through a new region offset that does exactly the same thing. > > Well first, we're not talking about a fixed number of additional > > regions, we're talking about defining region identifiers for each BAR > > with a WC mapping attribute, but at worst we'd only populate > > implemented MMIO BARs. But then we've also mentioned that a device > > feature could be used to allow a userspace driver to selectively bring > > these regions into existence. In an case, an mmap cookie also consumes > > address space from the vfio device file, so I'm still failing to see > > how calling them a region vs just an mmap cookie makes a substantive > > difference. > > You'd only allocate the mmap cookie when userspace requests it. I've suggested a mechanism using DEVICE_FEATURE that could do this for regions. > My original suggestion was to send a flag to REGION_INFO to > specifically ask for the different behavior, that (and only that) > would return new mmap cookies. Which can't work because flags is only an output field. > The alternative version of this might be to have a single > 'GET_REGION_MMAP' that gives a new mmap cookie for a singular > specified region index. Userspace would call REGION_INFO to learn the > memory regions and then it could call GET_REGION_MMAP(REQ_WC) and will > get back a single dynamic mmap cookie that links the WC flags. > > No system call, no cookie allocation. Existing apps don't start seeing > more regions from REGION_INFO. Drivers keep region indexes 1:1 with HW > objects. The uAPI has room to add more mmap flags. Please tell me how this is ultimately different from invoking a DEVICE_FEATURE call to request that a new device specific region be created with the desired mappings. In the short term, if we run out of pgoff, the user gets an -ENOSPC. DEVICE_INFO is updated with the new number of regions, existing region indexes are unchanged, the user iterates new indexes with REGION_INFO to get the offset and identifies them using the previously proposed region types. Thanks, Alex
On Wed, Aug 07, 2024 at 11:46:43AM -0600, Alex Williamson wrote: > Please tell me how this is ultimately different from invoking a > DEVICE_FEATURE call to request that a new device specific region be > created with the desired mappings. I think this is more complex for userspace and the drivers to implement than just asking for a new pgoff directly.. A new pgoff we can manage pretty much generically with some new core code, some driver helpers, and adjusting the drivers to use the new helpers. I did exactly this a few years back to rdma and it was not hard. Dynamic region indexes, and indexes that alias other regions, seems more tricky to me. I'm not sure how this would look. It gets to the same place, I agree. Jason
On Fri, Aug 02, 2024 at 08:24:49AM -0600, Keith Busch wrote: > Which itself follows the existing pattern from > pci_create_resource_files(), which creates a write combine > resource<X>_wc file only when IORESOURCE_PREFETCH is set. But yeah, > prefetch isn't necessary for wc, but it seems to indicate it's safe. Apparently IORESOURCE_PREFETCH never implied anything about write combinability, and Linux got this wrong long time ago. The fact that we do this now clashes with PCI SIGs desire to kill the concept of prefetcable BARs that they are forcing down everyones throat.
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ba0ce0075b2fb..c275c95eafe32 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1042,12 +1042,18 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; if (vdev->bar_mmap_supported[info.index]) { + struct resource *res; + info.flags |= VFIO_REGION_INFO_FLAG_MMAP; if (info.index == vdev->msix_bar) { ret = msix_mmappable_cap(vdev, &caps); if (ret) return ret; } + + res = &vdev->pdev->resource[index]; + if (res->flags & IORESOURCE_PREFETCH) + info.flags |= VFIO_REGION_INFO_FLAG_PREFETCH; } break; @@ -1223,6 +1229,32 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, return ret; } +static int vfio_pci_ioctl_set_region_flags(struct vfio_pci_core_device *vdev, + struct vfio_region_flags __user *arg) +{ + struct vfio_region_flags region_flags; + struct resource *res; + u32 index; + + if (copy_from_user(®ion_flags, arg, sizeof(region_flags))) + return -EFAULT; + + index = region_flags.index; + if (index >= PCI_STD_NUM_BARS) + return -EINVAL; + + if (region_flags.flags & VFIO_REGION_FLAG_WRITE_COMBINE) { + res = &vdev->pdev->resource[index]; + if (!(res->flags & IORESOURCE_MEM) || + !(res->flags & IORESOURCE_PREFETCH)) + return -EINVAL; + vdev->bar_write_combine[index] = true; + } else + vdev->bar_write_combine[index] = false; + + return 0; +} + static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, void __user *arg) { @@ -1484,6 +1516,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioctl_reset(vdev, uarg); case VFIO_DEVICE_SET_IRQS: return vfio_pci_ioctl_set_irqs(vdev, uarg); + case VFIO_DEVICE_SET_REGION_FLAGS: + return vfio_pci_ioctl_set_region_flags(vdev, uarg); default: return -ENOTTY; } @@ -1756,7 +1790,10 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma } vma->vm_private_data = vdev; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (vdev->bar_write_combine[index]) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); /* diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index fbb472dd99b36..0e0122ce4196a 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -54,6 +54,7 @@ struct vfio_pci_core_device { struct pci_dev *pdev; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; + bool bar_write_combine[PCI_STD_NUM_BARS]; u8 *pci_config_map; u8 *vconfig; struct perm_bits *msi_perm; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 2b68e6cdf1902..5537b20b23541 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -275,6 +275,7 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ +#define VFIO_REGION_INFO_FLAG_PREFETCH (1 << 4) /* Region is prefetchable */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ __aligned_u64 size; /* Region size (bytes) */ @@ -1821,6 +1822,22 @@ struct vfio_iommu_spapr_tce_remove { }; #define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) +/** + * VFIO_DEVICE_SET_REGION_FLAGS - _IOW(VFIO_TYPE, VFIO_BASE + 21, struct vfio_region_flags) + * + * Set mapping options for the region + * + * Flags supported: + * - VFIO_REGION_FLAG_WRITE_COMBINE: use write-combine when requested to map + * this region. Supported only if the region is prefetchable. + */ +struct vfio_region_flags { + __u32 index; /* Region index */ + __u32 flags; /* Region flags */ +#define VFIO_REGION_FLAG_WRITE_COMBINE (1 << 0) +}; +#define VFIO_DEVICE_SET_REGION_FLAGS _IO(VFIO_TYPE, VFIO_BASE + 21) + /* ***************************************************************** */ #endif /* _UAPIVFIO_H */