diff mbox series

[rfc] vfio-pci: Allow write combining

Message ID 20240731155352.3973857-1-kbusch@meta.com (mailing list archive)
State New, archived
Headers show
Series [rfc] vfio-pci: Allow write combining | expand

Commit Message

Keith Busch July 31, 2024, 3:53 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

Write combining can be provide performance improvement for places that
can safely use this capability.

Previous discussions on the topic suggest a vfio user needs to
explicitly request such a mapping, and it sounds like a new vfio
specific ioctl to request this is one way recommended way to do that.
This patch implements a new ioctl to achieve that so a user can request
write combining on prefetchable memory. A new ioctl seems a bit much for
just this purpose, so the implementation here provides a "flags" field
with only the write combine option defined. The rest of the bits are
reserved for future use.

Link: https://lore.kernel.org/all/20171009025000.39435-1-aik@ozlabs.ru/
Link: https://lore.kernel.org/lkml/ZLFBnACjoTbDmKuU@nvidia.com/
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/vfio/pci/vfio_pci_core.c | 39 +++++++++++++++++++++++++++++++-
 include/linux/vfio_pci_core.h    |  1 +
 include/uapi/linux/vfio.h        | 17 ++++++++++++++
 3 files changed, 56 insertions(+), 1 deletion(-)

Comments

Jason Gunthorpe Aug. 1, 2024, 2:19 p.m. UTC | #1
On Wed, Jul 31, 2024 at 08:53:52AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Write combining can be provide performance improvement for places that
> can safely use this capability.
> 
> Previous discussions on the topic suggest a vfio user needs to
> explicitly request such a mapping, and it sounds like a new vfio
> specific ioctl to request this is one way recommended way to do that.
> This patch implements a new ioctl to achieve that so a user can request
> write combining on prefetchable memory. A new ioctl seems a bit much for
> just this purpose, so the implementation here provides a "flags" field
> with only the write combine option defined. The rest of the bits are
> reserved for future use.

This is a neat hack for sure

But how about adding this flag to vfio_region_info ?

@@ -275,6 +289,7 @@ struct vfio_region_info {
 #define VFIO_REGION_INFO_FLAG_WRITE    (1 << 1) /* Region supports write */
 #define VFIO_REGION_INFO_FLAG_MMAP     (1 << 2) /* Region supports mmap */
 #define VFIO_REGION_INFO_FLAG_CAPS     (1 << 3) /* Info supports caps */
+#define VFIO_REGION_INFO_REQ_WC         (1 << 4) /* Request a write combining mapping*/
        __u32   index;          /* Region index */
        __u32   cap_offset;     /* Offset within info struct of first cap */
        __aligned_u64   size;   /* Region size (bytes) */


It specify REQ_WC when calling VFIO_DEVICE_GET_REGION_INFO

The kernel will then return an offset value that yields a WC
mapping. It doesn't displace the normal non-WC mapping?

Arguably we should fixup the kernel to put the mmap cookies into a
maple tree so they can be dynamically allocated and more densely
packed.

Jason
Alex Williamson Aug. 1, 2024, 3:41 p.m. UTC | #2
On Thu, 1 Aug 2024 11:19:14 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Wed, Jul 31, 2024 at 08:53:52AM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > Write combining can be provide performance improvement for places that
> > can safely use this capability.
> > 
> > Previous discussions on the topic suggest a vfio user needs to
> > explicitly request such a mapping, and it sounds like a new vfio
> > specific ioctl to request this is one way recommended way to do that.
> > This patch implements a new ioctl to achieve that so a user can request
> > write combining on prefetchable memory. A new ioctl seems a bit much for
> > just this purpose, so the implementation here provides a "flags" field
> > with only the write combine option defined. The rest of the bits are
> > reserved for future use.  
> 
> This is a neat hack for sure
> 
> But how about adding this flag to vfio_region_info ?
> 
> @@ -275,6 +289,7 @@ struct vfio_region_info {
>  #define VFIO_REGION_INFO_FLAG_WRITE    (1 << 1) /* Region supports write */
>  #define VFIO_REGION_INFO_FLAG_MMAP     (1 << 2) /* Region supports mmap */
>  #define VFIO_REGION_INFO_FLAG_CAPS     (1 << 3) /* Info supports caps */
> +#define VFIO_REGION_INFO_REQ_WC         (1 << 4) /* Request a write combining mapping*/
>         __u32   index;          /* Region index */
>         __u32   cap_offset;     /* Offset within info struct of first cap */
>         __aligned_u64   size;   /* Region size (bytes) */
> 
> 
> It specify REQ_WC when calling VFIO_DEVICE_GET_REGION_INFO
> 
> The kernel will then return an offset value that yields a WC
> mapping. It doesn't displace the normal non-WC mapping?
> 
> Arguably we should fixup the kernel to put the mmap cookies into a
> maple tree so they can be dynamically allocated and more densely
> packed.

vfio_region_info.flags in not currently tested for input therefore this
proposal could lead to unexpected behavior for a caller that doesn't
currently zero this field.  It's intended as an output-only field.
Thanks,

Alex
Jason Gunthorpe Aug. 1, 2024, 4:11 p.m. UTC | #3
On Thu, Aug 01, 2024 at 09:41:23AM -0600, Alex Williamson wrote:
> On Thu, 1 Aug 2024 11:19:14 -0300
> Jason Gunthorpe <jgg@ziepe.ca> wrote:
> 
> > On Wed, Jul 31, 2024 at 08:53:52AM -0700, Keith Busch wrote:
> > > From: Keith Busch <kbusch@kernel.org>
> > > 
> > > Write combining can be provide performance improvement for places that
> > > can safely use this capability.
> > > 
> > > Previous discussions on the topic suggest a vfio user needs to
> > > explicitly request such a mapping, and it sounds like a new vfio
> > > specific ioctl to request this is one way recommended way to do that.
> > > This patch implements a new ioctl to achieve that so a user can request
> > > write combining on prefetchable memory. A new ioctl seems a bit much for
> > > just this purpose, so the implementation here provides a "flags" field
> > > with only the write combine option defined. The rest of the bits are
> > > reserved for future use.  
> > 
> > This is a neat hack for sure
> > 
> > But how about adding this flag to vfio_region_info ?
> > 
> > @@ -275,6 +289,7 @@ struct vfio_region_info {
> >  #define VFIO_REGION_INFO_FLAG_WRITE    (1 << 1) /* Region supports write */
> >  #define VFIO_REGION_INFO_FLAG_MMAP     (1 << 2) /* Region supports mmap */
> >  #define VFIO_REGION_INFO_FLAG_CAPS     (1 << 3) /* Info supports caps */
> > +#define VFIO_REGION_INFO_REQ_WC         (1 << 4) /* Request a write combining mapping*/
> >         __u32   index;          /* Region index */
> >         __u32   cap_offset;     /* Offset within info struct of first cap */
> >         __aligned_u64   size;   /* Region size (bytes) */
> > 
> > 
> > It specify REQ_WC when calling VFIO_DEVICE_GET_REGION_INFO
> > 
> > The kernel will then return an offset value that yields a WC
> > mapping. It doesn't displace the normal non-WC mapping?
> > 
> > Arguably we should fixup the kernel to put the mmap cookies into a
> > maple tree so they can be dynamically allocated and more densely
> > packed.
> 
> vfio_region_info.flags in not currently tested for input therefore this
> proposal could lead to unexpected behavior for a caller that doesn't
> currently zero this field.  It's intended as an output-only field.

Perhaps a REGION_INFO2 then?

I still think per-request is better than a global flag

Jason
Alex Williamson Aug. 1, 2024, 4:52 p.m. UTC | #4
On Thu, 1 Aug 2024 13:11:30 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Thu, Aug 01, 2024 at 09:41:23AM -0600, Alex Williamson wrote:
> > On Thu, 1 Aug 2024 11:19:14 -0300
> > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >   
> > > On Wed, Jul 31, 2024 at 08:53:52AM -0700, Keith Busch wrote:  
> > > > From: Keith Busch <kbusch@kernel.org>
> > > > 
> > > > Write combining can be provide performance improvement for places that
> > > > can safely use this capability.
> > > > 
> > > > Previous discussions on the topic suggest a vfio user needs to
> > > > explicitly request such a mapping, and it sounds like a new vfio
> > > > specific ioctl to request this is one way recommended way to do that.
> > > > This patch implements a new ioctl to achieve that so a user can request
> > > > write combining on prefetchable memory. A new ioctl seems a bit much for
> > > > just this purpose, so the implementation here provides a "flags" field
> > > > with only the write combine option defined. The rest of the bits are
> > > > reserved for future use.    
> > > 
> > > This is a neat hack for sure
> > > 
> > > But how about adding this flag to vfio_region_info ?
> > > 
> > > @@ -275,6 +289,7 @@ struct vfio_region_info {
> > >  #define VFIO_REGION_INFO_FLAG_WRITE    (1 << 1) /* Region supports write */
> > >  #define VFIO_REGION_INFO_FLAG_MMAP     (1 << 2) /* Region supports mmap */
> > >  #define VFIO_REGION_INFO_FLAG_CAPS     (1 << 3) /* Info supports caps */
> > > +#define VFIO_REGION_INFO_REQ_WC         (1 << 4) /* Request a write combining mapping*/
> > >         __u32   index;          /* Region index */
> > >         __u32   cap_offset;     /* Offset within info struct of first cap */
> > >         __aligned_u64   size;   /* Region size (bytes) */
> > > 
> > > 
> > > It specify REQ_WC when calling VFIO_DEVICE_GET_REGION_INFO
> > > 
> > > The kernel will then return an offset value that yields a WC
> > > mapping. It doesn't displace the normal non-WC mapping?
> > > 
> > > Arguably we should fixup the kernel to put the mmap cookies into a
> > > maple tree so they can be dynamically allocated and more densely
> > > packed.  
> > 
> > vfio_region_info.flags in not currently tested for input therefore this
> > proposal could lead to unexpected behavior for a caller that doesn't
> > currently zero this field.  It's intended as an output-only field.  
> 
> Perhaps a REGION_INFO2 then?
> 
> I still think per-request is better than a global flag

I don't understand why we'd need a REGION_INFO2, we already have
support for defining new regions.  We do this by increasing the
num_regions value from the VFIO_DEVICE_GET_INFO ioctl.  The user can
iterate those additional regions and for each index call
VFIO_DEVICE_GET_REGION_INFO.  The new regions expose a
VFIO_REGION_INFO_CAP_TYPE capability where we define new types as:

#define VFIO_REGION_TYPE_PCI_BAR0_WC	(4)
#define VFIO_REGION_TYPE_PCI_BAR1_WC	(5)
#define VFIO_REGION_TYPE_PCI_BAR2_WC	(6)
#define VFIO_REGION_TYPE_PCI_BAR3_WC	(7)
#define VFIO_REGION_TYPE_PCI_BAR4_WC	(8)
#define VFIO_REGION_TYPE_PCI_BAR5_WC	(9)

We'd populate these new regions only for BARs that support prefetch and
mmap and we'd define that these BARs may expose only the MMAP flag and
not the READ or WRITE flags since those can go through the standard
region.  Really the only difference from the static vfio-pci defined
region indexes is the O(N) search for the user to find the vfio region
index to BAR mapping.  Thanks,

Alex
Jason Gunthorpe Aug. 1, 2024, 5:13 p.m. UTC | #5
On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote:
> > > vfio_region_info.flags in not currently tested for input therefore this
> > > proposal could lead to unexpected behavior for a caller that doesn't
> > > currently zero this field.  It's intended as an output-only field.  
> > 
> > Perhaps a REGION_INFO2 then?
> > 
> > I still think per-request is better than a global flag
> 
> I don't understand why we'd need a REGION_INFO2, we already have
> support for defining new regions.

It is not a new region, it is a modified mmap behavior for an existing
region.

> We'd populate these new regions only for BARs that support prefetch and
> mmap 

That's not the point, prefetch has nothing to do with write combining.

Every BAR can be mapped writecombining, it is up to the VFIO userspace
to understand if it can use it or not. The only use case for this
feature would be something like in DPDK.

VM side write combining is already handled by KVM allowing the VM to
upgrade the page attributes to WC from NC.

Doubling all the region indexes just for WC does not seem like a good
idea to me...

Jason
Alex Williamson Aug. 1, 2024, 5:33 p.m. UTC | #6
On Thu, 1 Aug 2024 14:13:55 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote:
> > > > vfio_region_info.flags in not currently tested for input therefore this
> > > > proposal could lead to unexpected behavior for a caller that doesn't
> > > > currently zero this field.  It's intended as an output-only field.    
> > > 
> > > Perhaps a REGION_INFO2 then?
> > > 
> > > I still think per-request is better than a global flag  
> > 
> > I don't understand why we'd need a REGION_INFO2, we already have
> > support for defining new regions.  
> 
> It is not a new region, it is a modified mmap behavior for an existing
> region.

If we're returning a different offset into the vfio device file from
which to get a WC mapping, what's the difference?  A vfio "region" is
describing a region or range of the vfio device file descriptor.
Region indexes that map into the same device resource are not
fundamentally incompatible AFAICT, but it does mean that zapping user
access is not a nice contiguous single range.
 
> > We'd populate these new regions only for BARs that support prefetch and
> > mmap   
> 
> That's not the point, prefetch has nothing to do with write combining.

I was following the original proposal in this thread that added a
prefetch flag to REGION_INFO and allowed enabling WC only for
IORESOURCE_PREFETCH.

> Every BAR can be mapped writecombining, it is up to the VFIO userspace
> to understand if it can use it or not. The only use case for this
> feature would be something like in DPDK.
> 
> VM side write combining is already handled by KVM allowing the VM to
> upgrade the page attributes to WC from NC.
> 
> Doubling all the region indexes just for WC does not seem like a good
> idea to me...

Is the difference you see that in the REQ_WC proposal the user is
effectively asking vfio to pop a WC region into existence vs here
they're pre-populated?  At the limit they're the same.  We could use a
DEVICE_FEATURE to ask vfio to selectively populate WC regions after
which the user could re-enumerate additional regions, or in fact to
switch on WC for a given region if we want to go that route.  Thanks,

Alex
Jason Gunthorpe Aug. 1, 2024, 5:53 p.m. UTC | #7
On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote:
> On Thu, 1 Aug 2024 14:13:55 -0300
> Jason Gunthorpe <jgg@ziepe.ca> wrote:
> 
> > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote:
> > > > > vfio_region_info.flags in not currently tested for input therefore this
> > > > > proposal could lead to unexpected behavior for a caller that doesn't
> > > > > currently zero this field.  It's intended as an output-only field.    
> > > > 
> > > > Perhaps a REGION_INFO2 then?
> > > > 
> > > > I still think per-request is better than a global flag  
> > > 
> > > I don't understand why we'd need a REGION_INFO2, we already have
> > > support for defining new regions.  
> > 
> > It is not a new region, it is a modified mmap behavior for an existing
> > region.
> 
> If we're returning a different offset into the vfio device file from
> which to get a WC mapping, what's the difference? 

I think it is a pretty big difference.. The offset is just a "mmap
cookie", it doesn't have to be 1:1 with the idea of a region.

> A vfio "region" is
> describing a region or range of the vfio device file descriptor.

I'm thinking a region is describing an area of memory that is
available in the VFIO device. The offset output is just a "mmap
cookie" to tell userspace how to mmap it. Having N mmap cookies for 1
region is OK.

> > > We'd populate these new regions only for BARs that support prefetch and
> > > mmap   
> >
> > That's not the point, prefetch has nothing to do with write combining.
> 
> I was following the original proposal in this thread that added a
> prefetch flag to REGION_INFO and allowed enabling WC only for
> IORESOURCE_PREFETCH.

Oh, I didn't notice that, it shouldn't do that. Returning the
VFIO_REGION_FLAG_WRITE_COMBINE makes sense, but it shouldn't effect
what the kernel allows.

> > Doubling all the region indexes just for WC does not seem like a good
> > idea to me...
> 
> Is the difference you see that in the REQ_WC proposal the user is
> effectively asking vfio to pop a WC region into existence vs here
> they're pre-populated? 

?? This didn't create more regions AFAICT. It created a new global

+	bool			bar_write_combine[PCI_STD_NUM_BARS];

Which controls what NC/WC the mmap creates when called:

+	if (vdev->bar_write_combine[index])
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+	else
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

You get the same output from REGION_INFO, same number of regions.

It was the other proposal from long ago that created more regions.

This is what I like and would prefer to stick with. REGION_INFO
doesn't really change, we don't have two regions refering to the same
physical memory, and we find some way to request NC/WC of a region at
mmap time.

A global is a neat trick, but it would be cleaner to request
properties of the mmap when the "mmap cookie" is obtained.

> At the limit they're the same.  We could use a
> DEVICE_FEATURE to ask vfio to selectively populate WC regions after
> which the user could re-enumerate additional regions, or in fact to
> switch on WC for a given region if we want to go that route.  Thanks,

This is still adding more regions and reporting more stuff from
REGION_INFO, that is what I would like to avoid.

Jason
Alex Williamson Aug. 1, 2024, 6:16 p.m. UTC | #8
On Thu, 1 Aug 2024 14:53:39 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote:
> > On Thu, 1 Aug 2024 14:13:55 -0300
> > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >   
> > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote:  
> > > > > > vfio_region_info.flags in not currently tested for input therefore this
> > > > > > proposal could lead to unexpected behavior for a caller that doesn't
> > > > > > currently zero this field.  It's intended as an output-only field.      
> > > > > 
> > > > > Perhaps a REGION_INFO2 then?
> > > > > 
> > > > > I still think per-request is better than a global flag    
> > > > 
> > > > I don't understand why we'd need a REGION_INFO2, we already have
> > > > support for defining new regions.    
> > > 
> > > It is not a new region, it is a modified mmap behavior for an existing
> > > region.  
> > 
> > If we're returning a different offset into the vfio device file from
> > which to get a WC mapping, what's the difference?   
> 
> I think it is a pretty big difference.. The offset is just a "mmap
> cookie", it doesn't have to be 1:1 with the idea of a region.
> 
> > A vfio "region" is
> > describing a region or range of the vfio device file descriptor.  
> 
> I'm thinking a region is describing an area of memory that is
> available in the VFIO device. The offset output is just a "mmap
> cookie" to tell userspace how to mmap it. Having N mmap cookies for 1
> region is OK.

Is an "mmap cookie" an offset into the vfio device file where mmap'ing
that offset results in a WC mapping to a specific device resource?
Isn't that just a region that doesn't have an index or supporting
infrastructure?
 
> > > > We'd populate these new regions only for BARs that support prefetch and
> > > > mmap     
> > >
> > > That's not the point, prefetch has nothing to do with write combining.  
> > 
> > I was following the original proposal in this thread that added a
> > prefetch flag to REGION_INFO and allowed enabling WC only for
> > IORESOURCE_PREFETCH.  
> 
> Oh, I didn't notice that, it shouldn't do that. Returning the
> VFIO_REGION_FLAG_WRITE_COMBINE makes sense, but it shouldn't effect
> what the kernel allows.
> 
> > > Doubling all the region indexes just for WC does not seem like a good
> > > idea to me...  
> > 
> > Is the difference you see that in the REQ_WC proposal the user is
> > effectively asking vfio to pop a WC region into existence vs here
> > they're pre-populated?   
> 
> ?? This didn't create more regions AFAICT. It created a new global
> 
> +	bool			bar_write_combine[PCI_STD_NUM_BARS];
> 
> Which controls what NC/WC the mmap creates when called:
> 
> +	if (vdev->bar_write_combine[index])
> +		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
> +	else
> +		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> 
> You get the same output from REGION_INFO, same number of regions.

It was your proposal that introduced REQ_WC, this is Keith's original
proposal.  I'm equating a REQ_WC request inventing an "mmap cookie" as
effectively the same as bringing a lightweight region into existence
because it defines a section of the vfio device file to have specific
mmap semantics.

> It was the other proposal from long ago that created more regions.
> 
> This is what I like and would prefer to stick with. REGION_INFO
> doesn't really change, we don't have two regions refering to the same
> physical memory, and we find some way to request NC/WC of a region at
> mmap time.

"At mmap time" means that something in the vma needs to describe to us
to use the WC semantics, where I think you're proposing that the "mmap
cookie" provides a specific vm_pgoff which we already use to determine
the region index.  So whether or not we want to call this a region,
it's effectively in the same address space as regions.  Therefore "mmap
cookie" ~= "region offset".

> A global is a neat trick, but it would be cleaner to request
> properties of the mmap when the "mmap cookie" is obtained.
> 
> > At the limit they're the same.  We could use a
> > DEVICE_FEATURE to ask vfio to selectively populate WC regions after
> > which the user could re-enumerate additional regions, or in fact to
> > switch on WC for a given region if we want to go that route.  Thanks,  
> 
> This is still adding more regions and reporting more stuff from
> REGION_INFO, that is what I would like to avoid.

Why?  This reminds me of hidden registers outside of capability chains
in PCI config space.  Thanks,

Alex
Jason Gunthorpe Aug. 2, 2024, 11:53 a.m. UTC | #9
On Thu, Aug 01, 2024 at 12:16:57PM -0600, Alex Williamson wrote:
> On Thu, 1 Aug 2024 14:53:39 -0300
> Jason Gunthorpe <jgg@ziepe.ca> wrote:
> 
> > On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote:
> > > On Thu, 1 Aug 2024 14:13:55 -0300
> > > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >   
> > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote:  
> > > > > > > vfio_region_info.flags in not currently tested for input therefore this
> > > > > > > proposal could lead to unexpected behavior for a caller that doesn't
> > > > > > > currently zero this field.  It's intended as an output-only field.      
> > > > > > 
> > > > > > Perhaps a REGION_INFO2 then?
> > > > > > 
> > > > > > I still think per-request is better than a global flag    
> > > > > 
> > > > > I don't understand why we'd need a REGION_INFO2, we already have
> > > > > support for defining new regions.    
> > > > 
> > > > It is not a new region, it is a modified mmap behavior for an existing
> > > > region.  
> > > 
> > > If we're returning a different offset into the vfio device file from
> > > which to get a WC mapping, what's the difference?   
> > 
> > I think it is a pretty big difference.. The offset is just a "mmap
> > cookie", it doesn't have to be 1:1 with the idea of a region.
> > 
> > > A vfio "region" is
> > > describing a region or range of the vfio device file descriptor.  
> > 
> > I'm thinking a region is describing an area of memory that is
> > available in the VFIO device. The offset output is just a "mmap
> > cookie" to tell userspace how to mmap it. Having N mmap cookies for 1
> > region is OK.
> 
> Is an "mmap cookie" an offset into the vfio device file where mmap'ing
> that offset results in a WC mapping to a specific device resource?

Yes

> Isn't that just a region that doesn't have an index or supporting
> infrastructure?

No? It is a "mmap cookie" that has the requested mmap-time properties.

Today the kernel side binds the mmap offset to the index in a computed
way, but from a API perspective userspace does REGION_INFO and gets
back an opaque "mmap cookie" that it uses to pass to mmap to get back
the thing describe by REGION_INFO. Userspace has no idea about any
structure to the cookie numbers.
  
> > Which controls what NC/WC the mmap creates when called:
> > 
> > +	if (vdev->bar_write_combine[index])
> > +		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
> > +	else
> > +		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> > 
> > You get the same output from REGION_INFO, same number of regions.
> 
> It was your proposal that introduced REQ_WC, this is Keith's original
> proposal.  I'm equating a REQ_WC request inventing an "mmap cookie" as
> effectively the same as bringing a lightweight region into existence
> because it defines a section of the vfio device file to have specific
> mmap semantics.

Well, again, it is not a region, it is just a record that this mmap
cookie uses X region with Y mapping flags. The number of regions don't
change. Critically from a driver perspective the number of regions and
region indexes wouldn't change.

I am not thing of making a new region, I am thinking of adjusting how
mmap works. Like today we do this:

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

So maybe instead it is something like:

      vfio_decode_mmap_cookie(vma, &index, &flags);
      if (flags & VFIO_MMAP_WC)
          prot = pgprot_writecombine(..)

From a driver perspective the region indexes don't change at all. This
is what I think is important here.

> > It was the other proposal from long ago that created more regions.
> > 
> > This is what I like and would prefer to stick with. REGION_INFO
> > doesn't really change, we don't have two regions refering to the same
> > physical memory, and we find some way to request NC/WC of a region at
> > mmap time.
> 
> "At mmap time" means that something in the vma needs to describe to us
> to use the WC semantics, where I think you're proposing that the "mmap
> cookie" provides a specific vm_pgoff which we already use to determine
> the region index.  

Yes

> So whether or not we want to call this a region,
> it's effectively in the same address space as regions.  Therefore "mmap
> cookie" ~= "region offset".

Well, that is just the current implementation. What we did in RDMA
when we switched from hard coded mmap cookies to dynamic ones is
use an xarray (today this should be a maple tree) to dynamically
allocate mmap cookies whenever the driver returns something to
userspace. During the mmap fop the pgoff is fed back through the maple
tree to get the description of what the cookie represents.

So the encoding of cookies is completely disjoint from whatever the
underlying thing is. If you want the same region to be mapped with two
or three different prot flags you just ask for two or three cookies
and at mmap time you can recover the region pointer and the mmap
flags.

So VFIO could do several different things here to convay the mmap
flags through the cookie, including somehow encoding it in a pgoff
bit, or using a dynamic maple tree scheme.

My point is to not confuse the pgoff encoding with the driver concept
of a region. The region is a single peice of memory, the "mmap cookie"s
are just handles to it. Adding more data to the handle is not the same
as adding more regions.

> > > At the limit they're the same.  We could use a
> > > DEVICE_FEATURE to ask vfio to selectively populate WC regions after
> > > which the user could re-enumerate additional regions, or in fact to
> > > switch on WC for a given region if we want to go that route.  Thanks,  
> > 
> > This is still adding more regions and reporting more stuff from
> > REGION_INFO, that is what I would like to avoid.
> 
> Why?  This reminds me of hidden registers outside of capability chains
> in PCI config space.  Thanks,

The mmap offsets are not (supposed to be) ABI in the VFIO ioctls. The
encoding is entirely opaque inside the kernel already. Apps are
supposed to use REGION_INFO to learn the value to pass to mmap. ie
things like VFIO_PCI_OFFSET_SHIFT are not in the uAPI header.

Jason
Keith Busch Aug. 2, 2024, 2:24 p.m. UTC | #10
On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote:
> On Thu, 1 Aug 2024 14:13:55 -0300
> Jason Gunthorpe <jgg@ziepe.ca> wrote:
> 
> > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote:
>  
> > > We'd populate these new regions only for BARs that support prefetch and
> > > mmap   
> > 
> > That's not the point, prefetch has nothing to do with write combining.
> 
> I was following the original proposal in this thread that added a
> prefetch flag to REGION_INFO and allowed enabling WC only for
> IORESOURCE_PREFETCH.

Which itself follows the existing pattern from
pci_create_resource_files(), which creates a write combine
resource<X>_wc file only when IORESOURCE_PREFETCH is set. But yeah,
prefetch isn't necessary for wc, but it seems to indicate it's safe.
Jason Gunthorpe Aug. 2, 2024, 2:33 p.m. UTC | #11
On Fri, Aug 02, 2024 at 08:24:49AM -0600, Keith Busch wrote:
> On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote:
> > On Thu, 1 Aug 2024 14:13:55 -0300
> > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > 
> > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote:
> >  
> > > > We'd populate these new regions only for BARs that support prefetch and
> > > > mmap   
> > > 
> > > That's not the point, prefetch has nothing to do with write combining.
> > 
> > I was following the original proposal in this thread that added a
> > prefetch flag to REGION_INFO and allowed enabling WC only for
> > IORESOURCE_PREFETCH.
> 
> Which itself follows the existing pattern from
> pci_create_resource_files(), which creates a write combine
> resource<X>_wc file only when IORESOURCE_PREFETCH is set. But yeah,
> prefetch isn't necessary for wc, but it seems to indicate it's safe.

Yes, I know, that code isn't right either... It seems to be the root
of this odd "prefetch and WC are related" idea.

Jason
Alex Williamson Aug. 2, 2024, 5:05 p.m. UTC | #12
On Fri, 2 Aug 2024 08:53:08 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Thu, Aug 01, 2024 at 12:16:57PM -0600, Alex Williamson wrote:
> > On Thu, 1 Aug 2024 14:53:39 -0300
> > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >   
> > > On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote:  
> > > > On Thu, 1 Aug 2024 14:13:55 -0300
> > > > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > > >     
> > > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote:    
> > > > > > > > vfio_region_info.flags in not currently tested for input therefore this
> > > > > > > > proposal could lead to unexpected behavior for a caller that doesn't
> > > > > > > > currently zero this field.  It's intended as an output-only field.        
> > > > > > > 
> > > > > > > Perhaps a REGION_INFO2 then?
> > > > > > > 
> > > > > > > I still think per-request is better than a global flag      
> > > > > > 
> > > > > > I don't understand why we'd need a REGION_INFO2, we already have
> > > > > > support for defining new regions.      
> > > > > 
> > > > > It is not a new region, it is a modified mmap behavior for an existing
> > > > > region.    
> > > > 
> > > > If we're returning a different offset into the vfio device file from
> > > > which to get a WC mapping, what's the difference?     
> > > 
> > > I think it is a pretty big difference.. The offset is just a "mmap
> > > cookie", it doesn't have to be 1:1 with the idea of a region.
> > >   
> > > > A vfio "region" is
> > > > describing a region or range of the vfio device file descriptor.    
> > > 
> > > I'm thinking a region is describing an area of memory that is
> > > available in the VFIO device. The offset output is just a "mmap
> > > cookie" to tell userspace how to mmap it. Having N mmap cookies for 1
> > > region is OK.  
> > 
> > Is an "mmap cookie" an offset into the vfio device file where mmap'ing
> > that offset results in a WC mapping to a specific device resource?  
> 
> Yes
> 
> > Isn't that just a region that doesn't have an index or supporting
> > infrastructure?  
> 
> No? It is a "mmap cookie" that has the requested mmap-time properties.
> 
> Today the kernel side binds the mmap offset to the index in a computed
> way, but from a API perspective userspace does REGION_INFO and gets
> back an opaque "mmap cookie" that it uses to pass to mmap to get back
> the thing describe by REGION_INFO. Userspace has no idea about any
> structure to the cookie numbers.

REGION_INFO is part of what I'm referring to as infrastructure.
Userspace knows a number of regions, ie. indexes, via DEVICE_INFO and
iterates those regions using REGION_INFO to get the mmap cookie, ie.
file offset.  The ABI is flexible here, the first few region indexes
are static mappings to vfio-pci specific device resources and
additional mappings are described as device specific resources using
the REGION_INFO capability chain.

> > > Which controls what NC/WC the mmap creates when called:
> > > 
> > > +	if (vdev->bar_write_combine[index])
> > > +		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
> > > +	else
> > > +		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> > > 
> > > You get the same output from REGION_INFO, same number of regions.  
> > 
> > It was your proposal that introduced REQ_WC, this is Keith's original
> > proposal.  I'm equating a REQ_WC request inventing an "mmap cookie" as
> > effectively the same as bringing a lightweight region into existence
> > because it defines a section of the vfio device file to have specific
> > mmap semantics.  
> 
> Well, again, it is not a region, it is just a record that this mmap
> cookie uses X region with Y mapping flags. The number of regions don't
> change. Critically from a driver perspective the number of regions and
> region indexes wouldn't change.

Why is this critical?  As above, for vfio-pci devices the first few
region indexes are static mappings to specific PCI resources and
additional resources beyond VFIO_PCI_NUM_REGIONS are device specific
and described by a region type in the capability chain.

None of the static regions move and it's part of the defined ABI for
userspace to iterate additional regions.

> I am not thing of making a new region, I am thinking of adjusting how
> mmap works. Like today we do this:
> 
> 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
> 
> So maybe instead it is something like:
> 
>       vfio_decode_mmap_cookie(vma, &index, &flags);
>       if (flags & VFIO_MMAP_WC)
>           prot = pgprot_writecombine(..)
> 
> From a driver perspective the region indexes don't change at all. This
> is what I think is important here.

The only thing that changes is that num_regions increases and the user
can go check REGION_INFO on those newly reported regions.  The way we
currently calculate the index from the vm_pgoff is implementation, not
uAPI.  We can change that if necessary.  Userspace should always get
the offset, ie. mmap cookie, from REGION_INFO and the initial set of
region indexes to device resources are fixed, they will not change.

> > > It was the other proposal from long ago that created more regions.
> > > 
> > > This is what I like and would prefer to stick with. REGION_INFO
> > > doesn't really change, we don't have two regions refering to the same
> > > physical memory, and we find some way to request NC/WC of a region at
> > > mmap time.  
> > 
> > "At mmap time" means that something in the vma needs to describe to us
> > to use the WC semantics, where I think you're proposing that the "mmap
> > cookie" provides a specific vm_pgoff which we already use to determine
> > the region index.    
> 
> Yes
> 
> > So whether or not we want to call this a region,
> > it's effectively in the same address space as regions.  Therefore "mmap
> > cookie" ~= "region offset".  
> 
> Well, that is just the current implementation. What we did in RDMA
> when we switched from hard coded mmap cookies to dynamic ones is
> use an xarray (today this should be a maple tree) to dynamically
> allocate mmap cookies whenever the driver returns something to
> userspace. During the mmap fop the pgoff is fed back through the maple
> tree to get the description of what the cookie represents.

Sure, we could do that too, the current implementation (not uAPI) just
uses some upper bits to create fixed region address spaces.  The only
thing we should need to keep consistent is the mapping of indexes to
device resources up through VFIO_PCI_NUM_REGIONS.

> So the encoding of cookies is completely disjoint from whatever the
> underlying thing is. If you want the same region to be mapped with two
> or three different prot flags you just ask for two or three cookies
> and at mmap time you can recover the region pointer and the mmap
> flags.
> 
> So VFIO could do several different things here to convay the mmap
> flags through the cookie, including somehow encoding it in a pgoff
> bit, or using a dynamic maple tree scheme.
> 
> My point is to not confuse the pgoff encoding with the driver concept
> of a region. The region is a single peice of memory, the "mmap cookie"s
> are just handles to it. Adding more data to the handle is not the same
> as adding more regions.

I don't get it.  Take for instance PCI config space.  Given the right
GPU, I can get to config space through an I/O port region, an MMIO
region (possibly multiple ways), and the config space region itself.
Therefore based on this hardware implementation there is no unique
mapping that says that config space is uniquely accessible via a single
region.  Each of these regions has different semantics.  If the layout
of the device can cause this, why do we restrict ourselves that a given
BAR can only be accessed via a signle region and we need to play games
with terminology to call it an mmap cookie rather than officially
creating a region with WC mmap semantics?

> > > > At the limit they're the same.  We could use a
> > > > DEVICE_FEATURE to ask vfio to selectively populate WC regions after
> > > > which the user could re-enumerate additional regions, or in fact to
> > > > switch on WC for a given region if we want to go that route.  Thanks,    
> > > 
> > > This is still adding more regions and reporting more stuff from
> > > REGION_INFO, that is what I would like to avoid.  
> > 
> > Why?  This reminds me of hidden registers outside of capability chains
> > in PCI config space.  Thanks,  
> 
> The mmap offsets are not (supposed to be) ABI in the VFIO ioctls. The
> encoding is entirely opaque inside the kernel already. Apps are
> supposed to use REGION_INFO to learn the value to pass to mmap. ie
> things like VFIO_PCI_OFFSET_SHIFT are not in the uAPI header.

Exactly, which gives us the flexibility to add new regions as
necessary.  The mechanism by which userspace iterates regions is
already provided in the uABI.  The initial set of static vfio-pci
regions require fixed indexes, but not fixed offsets.
VFIO_PCI_OFFSET_SHIFT is only an internal implementation detail.  Thanks

Alex
Tian, Kevin Aug. 6, 2024, 7:19 a.m. UTC | #13
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Friday, August 2, 2024 10:33 PM
> 
> On Fri, Aug 02, 2024 at 08:24:49AM -0600, Keith Busch wrote:
> > On Thu, Aug 01, 2024 at 11:33:44AM -0600, Alex Williamson wrote:
> > > On Thu, 1 Aug 2024 14:13:55 -0300
> > > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >
> > > > On Thu, Aug 01, 2024 at 10:52:18AM -0600, Alex Williamson wrote:
> > >
> > > > > We'd populate these new regions only for BARs that support prefetch
> and
> > > > > mmap
> > > >
> > > > That's not the point, prefetch has nothing to do with write combining.
> > >
> > > I was following the original proposal in this thread that added a
> > > prefetch flag to REGION_INFO and allowed enabling WC only for
> > > IORESOURCE_PREFETCH.
> >
> > Which itself follows the existing pattern from
> > pci_create_resource_files(), which creates a write combine
> > resource<X>_wc file only when IORESOURCE_PREFETCH is set. But yeah,
> > prefetch isn't necessary for wc, but it seems to indicate it's safe.
> 
> Yes, I know, that code isn't right either... It seems to be the root
> of this odd "prefetch and WC are related" idea.
> 

According to PCIe spec:

"
Bit 3 should be set to 1b if the data is prefetchable and set to 0b
otherwise. A Function is permitted to mark a range as prefetchable
if there are no side effects on reads, the Function returns all bytes
on reads regardless of the byte enables, and host bridges can
merge processor writes into this range without causing errors.
"

Above kind of suggests that using WC on a non-prefetchable BAR
may cause errors then "prefetch and WC are related" does make
some sense?
Jason Gunthorpe Aug. 6, 2024, 4:47 p.m. UTC | #14
On Tue, Aug 06, 2024 at 07:19:18AM +0000, Tian, Kevin wrote:

> "
> Bit 3 should be set to 1b if the data is prefetchable and set to 0b
> otherwise. A Function is permitted to mark a range as prefetchable
> if there are no side effects on reads, the Function returns all bytes
> on reads regardless of the byte enables, and host bridges can
> merge processor writes into this range without causing errors.
> "
> 
> Above kind of suggests that using WC on a non-prefetchable BAR
> may cause errors then "prefetch and WC are related" does make
> some sense?

prefetch exists in the spec to support historical old pre-PCI-x
environments where a bridge does all kinds of strange things. prefetch
turns that brdige behavior on because otherwise it is non backwards
compatible.

In a modern PCIe environment the fabric is perfectly TLP preserving
and the PCI spec concept if prefetch is entirely vestigial.

There is no clean mapping of what PCI spec prefetch contemplates with
how moderns CPUs actually work. They should never be comingled.

Today we expect the driver to understand what TLPs the CPU should emit
and do the correct thing. From a Linux programming model with modern
HW we never really permit "merge process writes" or expect
"speculative reads" on anything except explicit WC mappings.

Jason
Jason Gunthorpe Aug. 6, 2024, 4:53 p.m. UTC | #15
On Fri, Aug 02, 2024 at 11:05:06AM -0600, Alex Williamson wrote:

> > Well, again, it is not a region, it is just a record that this mmap
> > cookie uses X region with Y mapping flags. The number of regions don't
> > change. Critically from a driver perspective the number of regions and
> > region indexes wouldn't change.
> 
> Why is this critical?

So we don't leak this too much into the drivers? Why should all the
VFIO drivers have to be changed to alter how their region indexes work
just to add a single flag?? 

> > Well, that is just the current implementation. What we did in RDMA
> > when we switched from hard coded mmap cookies to dynamic ones is
> > use an xarray (today this should be a maple tree) to dynamically
> > allocate mmap cookies whenever the driver returns something to
> > userspace. During the mmap fop the pgoff is fed back through the maple
> > tree to get the description of what the cookie represents.
> 
> Sure, we could do that too, the current implementation (not uAPI) just
> uses some upper bits to create fixed region address spaces.  The only
> thing we should need to keep consistent is the mapping of indexes to
> device resources up through VFIO_PCI_NUM_REGIONS.

I fear we might need to do this as there may not be room in the pgoff
space (at least for 32 bit) to duplicate everything....

> > My point is to not confuse the pgoff encoding with the driver concept
> > of a region. The region is a single peice of memory, the "mmap cookie"s
> > are just handles to it. Adding more data to the handle is not the same
> > as adding more regions.
> 
> I don't get it.  Take for instance PCI config space.  Given the right
> GPU, I can get to config space through an I/O port region, an MMIO
> region (possibly multiple ways), and the config space region itself.
> Therefore based on this hardware implementation there is no unique
> mapping that says that config space is uniquely accessible via a single
> region.  

That doesn't seem like this sitation. Those are multiple different HW
paths with different HW addresses, sure they can have different
regions.

Here we are talking about the same HW path with the same HW
addresses. It shouldn't be duplicated.

> BAR can only be accessed via a signle region and we need to play games
> with terminology to call it an mmap cookie rather than officially
> creating a region with WC mmap semantics?

Because if you keep adding more regions for what are attributes of a
mapping we may end up with a combinatoral explosion of regions.

I already know there is interest in doing non-cache/cache mapping
attributes too.

Approaching this as a fixed number of regions reflecting the HW
addresses and a variable number of flags requested by the user is alot
more reasonable than trying to have a list of every permutation of
every address for every combination of flags.

Jason
Alex Williamson Aug. 6, 2024, 6:43 p.m. UTC | #16
On Tue, 6 Aug 2024 13:53:12 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Fri, Aug 02, 2024 at 11:05:06AM -0600, Alex Williamson wrote:
> 
> > > Well, again, it is not a region, it is just a record that this mmap
> > > cookie uses X region with Y mapping flags. The number of regions don't
> > > change. Critically from a driver perspective the number of regions and
> > > region indexes wouldn't change.  
> > 
> > Why is this critical?  
> 
> So we don't leak this too much into the drivers? Why should all the
> VFIO drivers have to be changed to alter how their region indexes work
> just to add a single flag?? 

I don't know how you're coming to this conclusion.  A driver that wants
this new mapping flag needs to do something different but the existing
use case is absolutely unchanged.  Look for example at how the IGD code
in vfio adds several device specific regions.  This doesn't affect
anything other than new code in userspace that wants to iterate these
regions.  It doesn't change the indexes of any of the statically
defined regions.
 
> > > Well, that is just the current implementation. What we did in RDMA
> > > when we switched from hard coded mmap cookies to dynamic ones is
> > > use an xarray (today this should be a maple tree) to dynamically
> > > allocate mmap cookies whenever the driver returns something to
> > > userspace. During the mmap fop the pgoff is fed back through the maple
> > > tree to get the description of what the cookie represents.  
> > 
> > Sure, we could do that too, the current implementation (not uAPI) just
> > uses some upper bits to create fixed region address spaces.  The only
> > thing we should need to keep consistent is the mapping of indexes to
> > device resources up through VFIO_PCI_NUM_REGIONS.  
> 
> I fear we might need to do this as there may not be room in the pgoff
> space (at least for 32 bit) to duplicate everything....

We'll root out userspace drivers that hard code region offsets in doing
this, but otherwise it shouldn't be an issue.  If the collateral is too
large the standard regions can use the fixed offsets and device
specific regions can use dynamic offsets.

> > > My point is to not confuse the pgoff encoding with the driver concept
> > > of a region. The region is a single peice of memory, the "mmap cookie"s
> > > are just handles to it. Adding more data to the handle is not the same
> > > as adding more regions.  
> > 
> > I don't get it.  Take for instance PCI config space.  Given the right
> > GPU, I can get to config space through an I/O port region, an MMIO
> > region (possibly multiple ways), and the config space region itself.
> > Therefore based on this hardware implementation there is no unique
> > mapping that says that config space is uniquely accessible via a single
> > region.    
> 
> That doesn't seem like this sitation. Those are multiple different HW
> paths with different HW addresses, sure they can have different
> regions.
> 
> Here we are talking about the same HW path with the same HW
> addresses. It shouldn't be duplicated.

How does an "mmap cookie" not duplicate that a device range is
accessible through multiple offsets of the vfio device file?

> > BAR can only be accessed via a signle region and we need to play games
> > with terminology to call it an mmap cookie rather than officially
> > creating a region with WC mmap semantics?  
> 
> Because if you keep adding more regions for what are attributes of a
> mapping we may end up with a combinatoral explosion of regions.
> 
> I already know there is interest in doing non-cache/cache mapping
> attributes too.

This sounds like variant driver space, we can't generically create
cachable mappings to MMIO.  vfio-nvgrace-gpu already does this, but
they usurp the standard BAR region, there's no longer uncached access.

> Approaching this as a fixed number of regions reflecting the HW
> addresses and a variable number of flags requested by the user is alot
> more reasonable than trying to have a list of every permutation of
> every address for every combination of flags.

Well first, we're not talking about a fixed number of additional
regions, we're talking about defining region identifiers for each BAR
with a WC mapping attribute, but at worst we'd only populate
implemented MMIO BARs.  But then we've also mentioned that a device
feature could be used to allow a userspace driver to selectively bring
these regions into existence.  In an case, an mmap cookie also consumes
address space from the vfio device file, so I'm still failing to see
how calling them a region vs just an mmap cookie makes a substantive
difference.  Thanks,

Alex
Jason Gunthorpe Aug. 7, 2024, 2:19 p.m. UTC | #17
On Tue, Aug 06, 2024 at 12:43:02PM -0600, Alex Williamson wrote:

> > So we don't leak this too much into the drivers? Why should all the
> > VFIO drivers have to be changed to alter how their region indexes work
> > just to add a single flag??

> I don't know how you're coming to this conclusion.

Ideally we'd want to support the WC option basically everywhere.

> > I fear we might need to do this as there may not be room in the pgoff
> > space (at least for 32 bit) to duplicate everything....

> We'll root out userspace drivers that hard code region offsets in doing
> this, but otherwise it shouldn't be an issue.

The issue is running out of pgoff bits on 32 bit. Maybe this isn't an
issue for VFIO, but it was for RDMA. We needed tight optimal on-demand
packing of actual requested mmaps. Allocating gigabytes of address
space for possible mmaps ran out of pgoff bits. :\

> How does an "mmap cookie" not duplicate that a device range is
> accessible through multiple offsets of the vfio device file?

pgoff duplcation is not really an issue, from an API perspective the
driver would call a helper to convert the pgoff into a region index
and mmap flags. It wouldn't matter to any driver how many duplicates
there are.

> Well first, we're not talking about a fixed number of additional
> regions, we're talking about defining region identifiers for each BAR
> with a WC mapping attribute, but at worst we'd only populate
> implemented MMIO BARs.  But then we've also mentioned that a device
> feature could be used to allow a userspace driver to selectively bring
> these regions into existence.  In an case, an mmap cookie also consumes
> address space from the vfio device file, so I'm still failing to see
> how calling them a region vs just an mmap cookie makes a substantive
> difference.

You'd only allocate the mmap cookie when userspace requests it.

My original suggestion was to send a flag to REGION_INFO to
specifically ask for the different behavior, that (and only that)
would return new mmap cookies.

The alternative version of this might be to have a single
'GET_REGION_MMAP' that gives a new mmap cookie for a singular
specified region index. Userspace would call REGION_INFO to learn the
memory regions and then it could call GET_REGION_MMAP(REQ_WC) and will
get back a single dynamic mmap cookie that links the WC flags.

No system call, no cookie allocation. Existing apps don't start seeing
more regions from REGION_INFO. Drivers keep region indexes 1:1 with HW
objects. The uAPI has room to add more mmap flags.

Jason
Alex Williamson Aug. 7, 2024, 5:46 p.m. UTC | #18
On Wed, 7 Aug 2024 11:19:10 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Tue, Aug 06, 2024 at 12:43:02PM -0600, Alex Williamson wrote:
> 
> > > So we don't leak this too much into the drivers? Why should all the
> > > VFIO drivers have to be changed to alter how their region indexes work
> > > just to add a single flag??  
> 
> > I don't know how you're coming to this conclusion.  
> 
> Ideally we'd want to support the WC option basically everywhere.
> 
> > > I fear we might need to do this as there may not be room in the pgoff
> > > space (at least for 32 bit) to duplicate everything....  
> 
> > We'll root out userspace drivers that hard code region offsets in doing
> > this, but otherwise it shouldn't be an issue.  
> 
> The issue is running out of pgoff bits on 32 bit. Maybe this isn't an
> issue for VFIO, but it was for RDMA. We needed tight optimal on-demand
> packing of actual requested mmaps. Allocating gigabytes of address
> space for possible mmaps ran out of pgoff bits. :\

If we only implemented WC for 64-bit, would anyone notice?
 
> > How does an "mmap cookie" not duplicate that a device range is
> > accessible through multiple offsets of the vfio device file?  
> 
> pgoff duplcation is not really an issue, from an API perspective the
> driver would call a helper to convert the pgoff into a region index
> and mmap flags. It wouldn't matter to any driver how many duplicates
> there are.

Which is exactly my point whether we call it a region or an mmap
cookie.  In one case we're trying to give a bare pgoff that effectively
aliases to a region with different mapping flags, in the other the
pgoff is exposed through a new region offset that does exactly the same
thing.

> > Well first, we're not talking about a fixed number of additional
> > regions, we're talking about defining region identifiers for each BAR
> > with a WC mapping attribute, but at worst we'd only populate
> > implemented MMIO BARs.  But then we've also mentioned that a device
> > feature could be used to allow a userspace driver to selectively bring
> > these regions into existence.  In an case, an mmap cookie also consumes
> > address space from the vfio device file, so I'm still failing to see
> > how calling them a region vs just an mmap cookie makes a substantive
> > difference.  
> 
> You'd only allocate the mmap cookie when userspace requests it.

I've suggested a mechanism using DEVICE_FEATURE that could do this for
regions.

> My original suggestion was to send a flag to REGION_INFO to
> specifically ask for the different behavior, that (and only that)
> would return new mmap cookies.

Which can't work because flags is only an output field.

> The alternative version of this might be to have a single
> 'GET_REGION_MMAP' that gives a new mmap cookie for a singular
> specified region index. Userspace would call REGION_INFO to learn the
> memory regions and then it could call GET_REGION_MMAP(REQ_WC) and will
> get back a single dynamic mmap cookie that links the WC flags.
> 
> No system call, no cookie allocation. Existing apps don't start seeing
> more regions from REGION_INFO. Drivers keep region indexes 1:1 with HW
> objects. The uAPI has room to add more mmap flags.

Please tell me how this is ultimately different from invoking a
DEVICE_FEATURE call to request that a new device specific region be
created with the desired mappings.  In the short term, if we run out of
pgoff, the user gets an -ENOSPC.  DEVICE_INFO is updated with the new
number of regions, existing region indexes are unchanged, the user
iterates new indexes with REGION_INFO to get the offset and identifies
them using the previously proposed region types.  Thanks,

Alex
Jason Gunthorpe Aug. 13, 2024, 6:02 p.m. UTC | #19
On Wed, Aug 07, 2024 at 11:46:43AM -0600, Alex Williamson wrote:

> Please tell me how this is ultimately different from invoking a
> DEVICE_FEATURE call to request that a new device specific region be
> created with the desired mappings. 

I think this is more complex for userspace and the drivers to
implement than just asking for a new pgoff directly..

A new pgoff we can manage pretty much generically with some new core
code, some driver helpers, and adjusting the drivers to use the new
helpers. I did exactly this a few years back to rdma and it was not
hard.

Dynamic region indexes, and indexes that alias other regions, seems
more tricky to me. I'm not sure how this would look.

It gets to the same place, I agree.

Jason
Christoph Hellwig Aug. 15, 2024, 5:05 a.m. UTC | #20
On Fri, Aug 02, 2024 at 08:24:49AM -0600, Keith Busch wrote:
> Which itself follows the existing pattern from
> pci_create_resource_files(), which creates a write combine
> resource<X>_wc file only when IORESOURCE_PREFETCH is set. But yeah,
> prefetch isn't necessary for wc, but it seems to indicate it's safe.

Apparently IORESOURCE_PREFETCH never implied anything about write
combinability, and Linux got this wrong long time ago.  The fact that
we do this now clashes with PCI SIGs desire to kill the concept of
prefetcable BARs that they are forcing down everyones throat.
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index ba0ce0075b2fb..c275c95eafe32 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1042,12 +1042,18 @@  static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
 		info.flags = VFIO_REGION_INFO_FLAG_READ |
 			     VFIO_REGION_INFO_FLAG_WRITE;
 		if (vdev->bar_mmap_supported[info.index]) {
+			struct resource *res;
+
 			info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
 			if (info.index == vdev->msix_bar) {
 				ret = msix_mmappable_cap(vdev, &caps);
 				if (ret)
 					return ret;
 			}
+
+			res = &vdev->pdev->resource[index];
+			if (res->flags & IORESOURCE_PREFETCH)
+				info.flags |= VFIO_REGION_INFO_FLAG_PREFETCH;
 		}
 
 		break;
@@ -1223,6 +1229,32 @@  static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
 	return ret;
 }
 
+static int vfio_pci_ioctl_set_region_flags(struct vfio_pci_core_device *vdev,
+				   struct vfio_region_flags __user *arg)
+{
+	struct vfio_region_flags region_flags;
+	struct resource *res;
+	u32 index;
+
+	if (copy_from_user(&region_flags, arg, sizeof(region_flags)))
+		return -EFAULT;
+
+	index = region_flags.index;
+	if (index >= PCI_STD_NUM_BARS)
+		return -EINVAL;
+
+	if (region_flags.flags & VFIO_REGION_FLAG_WRITE_COMBINE) {
+		res = &vdev->pdev->resource[index];
+		if (!(res->flags & IORESOURCE_MEM) ||
+		    !(res->flags & IORESOURCE_PREFETCH))
+			return -EINVAL;
+		vdev->bar_write_combine[index] = true;
+	} else
+		vdev->bar_write_combine[index] = false;
+
+	return 0;
+}
+
 static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 				void __user *arg)
 {
@@ -1484,6 +1516,8 @@  long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 		return vfio_pci_ioctl_reset(vdev, uarg);
 	case VFIO_DEVICE_SET_IRQS:
 		return vfio_pci_ioctl_set_irqs(vdev, uarg);
+	case VFIO_DEVICE_SET_REGION_FLAGS:
+		return vfio_pci_ioctl_set_region_flags(vdev, uarg);
 	default:
 		return -ENOTTY;
 	}
@@ -1756,7 +1790,10 @@  int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
 	}
 
 	vma->vm_private_data = vdev;
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	if (vdev->bar_write_combine[index])
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+	else
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
 
 	/*
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index fbb472dd99b36..0e0122ce4196a 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -54,6 +54,7 @@  struct vfio_pci_core_device {
 	struct pci_dev		*pdev;
 	void __iomem		*barmap[PCI_STD_NUM_BARS];
 	bool			bar_mmap_supported[PCI_STD_NUM_BARS];
+	bool			bar_write_combine[PCI_STD_NUM_BARS];
 	u8			*pci_config_map;
 	u8			*vconfig;
 	struct perm_bits	*msi_perm;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf1902..5537b20b23541 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -275,6 +275,7 @@  struct vfio_region_info {
 #define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
 #define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
 #define VFIO_REGION_INFO_FLAG_CAPS	(1 << 3) /* Info supports caps */
+#define VFIO_REGION_INFO_FLAG_PREFETCH	(1 << 4) /* Region is prefetchable */
 	__u32	index;		/* Region index */
 	__u32	cap_offset;	/* Offset within info struct of first cap */
 	__aligned_u64	size;	/* Region size (bytes) */
@@ -1821,6 +1822,22 @@  struct vfio_iommu_spapr_tce_remove {
 };
 #define VFIO_IOMMU_SPAPR_TCE_REMOVE	_IO(VFIO_TYPE, VFIO_BASE + 20)
 
+/**
+ * VFIO_DEVICE_SET_REGION_FLAGS	 - _IOW(VFIO_TYPE, VFIO_BASE + 21, struct vfio_region_flags)
+ *
+ * Set mapping options for the region
+ *
+ * Flags supported:
+ * - VFIO_REGION_FLAG_WRITE_COMBINE: use write-combine when requested to map
+ *   this region. Supported only if the region is prefetchable.
+ */
+struct vfio_region_flags {
+	__u32	index;		/* Region index */
+	__u32	flags;		/* Region flags */
+#define VFIO_REGION_FLAG_WRITE_COMBINE	(1 << 0)
+};
+#define VFIO_DEVICE_SET_REGION_FLAGS	_IO(VFIO_TYPE, VFIO_BASE + 21)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */