diff mbox

[1/2] drm: Add DRM_CAP_PRIME_SCANOUT.

Message ID 20170404081321.19263-2-raof@ubuntu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Christopher James Halse Rogers April 4, 2017, 8:13 a.m. UTC
From: Christopher James Halse Rogers <raof@ubuntu.com>

Until recently, on (at least) nouveau, radeon, and amdgpu attempting to scanout of an
imported dma-buf would silently result in the dma-buf sharing being broken.

While the hardware is capable of scanning out of imported dma-bufs (at least in some circumstances),
these drivers do not currently implement it, so attempts to scan out of such buffers will never succeed.

Add a userspace-visible drm capability and associated driver_feature so that userspace can discover
when scanning out of an imported dma-buf can work.

Signed-off-by: Christopher James Halse Rogers <christopher.halse.rogers@canonical.com>
---
 drivers/gpu/drm/drm_ioctl.c |  3 +++
 include/drm/drm_drv.h       |  1 +
 include/uapi/drm/drm.h      | 21 +++++++++++++++++++++
 3 files changed, 25 insertions(+)

Comments

Daniel Vetter April 4, 2017, 8:31 a.m. UTC | #1
On Tue, Apr 04, 2017 at 06:13:20PM +1000, raof@ubuntu.com wrote:
> From: Christopher James Halse Rogers <raof@ubuntu.com>
> 
> Until recently, on (at least) nouveau, radeon, and amdgpu attempting to scanout of an
> imported dma-buf would silently result in the dma-buf sharing being broken.
> 
> While the hardware is capable of scanning out of imported dma-bufs (at least in some circumstances),
> these drivers do not currently implement it, so attempts to scan out of such buffers will never succeed.
> 
> Add a userspace-visible drm capability and associated driver_feature so that userspace can discover
> when scanning out of an imported dma-buf can work.
> 
> Signed-off-by: Christopher James Halse Rogers <christopher.halse.rogers@canonical.com>

This seems like an awful specific special case. Why can't you do the
entire dance upfront, i.e. import buffer, addfb2? None of that has any
visible effect, and it will also allow you to check runtime constraints
which can't be covered with this here.
-Daniel

> ---
>  drivers/gpu/drm/drm_ioctl.c |  3 +++
>  include/drm/drm_drv.h       |  1 +
>  include/uapi/drm/drm.h      | 21 +++++++++++++++++++++
>  3 files changed, 25 insertions(+)
> 
> diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
> index a7c61c23685a..79ccf638668e 100644
> --- a/drivers/gpu/drm/drm_ioctl.c
> +++ b/drivers/gpu/drm/drm_ioctl.c
> @@ -285,6 +285,9 @@ static int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_
>  	case DRM_CAP_ADDFB2_MODIFIERS:
>  		req->value = dev->mode_config.allow_fb_modifiers;
>  		break;
> +	case DRM_CAP_PRIME_SCANOUT:
> +		req->value = drm_core_check_feature(dev, DRIVER_PRIME_SCANOUT);
> +		break;
>  	default:
>  		return -EINVAL;
>  	}
> diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
> index 5699f42195fe..32cc0d956d7e 100644
> --- a/include/drm/drm_drv.h
> +++ b/include/drm/drm_drv.h
> @@ -53,6 +53,7 @@ struct drm_mode_create_dumb;
>  #define DRIVER_RENDER			0x8000
>  #define DRIVER_ATOMIC			0x10000
>  #define DRIVER_KMS_LEGACY_CONTEXT	0x20000
> +#define DRIVER_PRIME_SCANOUT		0x40000
>  
>  /**
>   * struct drm_driver - DRM driver structure
> diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
> index b2c52843bc70..c57213cdb702 100644
> --- a/include/uapi/drm/drm.h
> +++ b/include/uapi/drm/drm.h
> @@ -647,6 +647,27 @@ struct drm_gem_open {
>  #define DRM_CAP_CURSOR_HEIGHT		0x9
>  #define DRM_CAP_ADDFB2_MODIFIERS	0x10
>  #define DRM_CAP_PAGE_FLIP_TARGET	0x11
> +/**
> + * DRM_CAP_PRIME_SCANOUT
> + *
> + * The PRIME_SCANOUT capability returns whether the driver might be capable
> + * of scanning out of an imported PRIME buffer.
> + *
> + * This does not guarantee that any imported buffer can be scanned out, as
> + * there may be specific requirements that a given buffer might not satisfy.
> + *
> + * If this capability is false then imported PRIME buffers will definitely
> + * never be suitable for scanout.
> + *
> + * Note: Prior to the introduction of this capability, bugs in drivers would
> + * allow userspace to attempt to scan out of imported PRIME buffers. This would
> + * work once, but move the buffer into an inconsistent state where rendering from
> + * the exporting GPU would no longer be seen by the importing GPU.
> + *
> + * It is unsafe for driver-agnostic code to attempt to scan out of an imported
> + * PRIME buffer in the absense of this capability.
> + */
> +#define DRM_CAP_PRIME_SCANOUT		0x12
>  
>  /** DRM_IOCTL_GET_CAP ioctl argument type */
>  struct drm_get_cap {
> -- 
> 2.11.0
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Daniel Stone April 4, 2017, 10:11 a.m. UTC | #2
Hi,

On 4 April 2017 at 09:31, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Tue, Apr 04, 2017 at 06:13:20PM +1000, raof@ubuntu.com wrote:
>> Until recently, on (at least) nouveau, radeon, and amdgpu attempting to scanout of an
>> imported dma-buf would silently result in the dma-buf sharing being broken.
>>
>> While the hardware is capable of scanning out of imported dma-bufs (at least in some circumstances),
>> these drivers do not currently implement it, so attempts to scan out of such buffers will never succeed.
>>
>> Add a userspace-visible drm capability and associated driver_feature so that userspace can discover
>> when scanning out of an imported dma-buf can work.
>>
>> Signed-off-by: Christopher James Halse Rogers <christopher.halse.rogers@canonical.com>
>
> This seems like an awful specific special case. Why can't you do the
> entire dance upfront, i.e. import buffer, addfb2? None of that has any
> visible effect, and it will also allow you to check runtime constraints
> which can't be covered with this here.

Oh, that already happens. Chris sent out a series which fixes bugs for
PRIME import across a few drivers, mostly ones which need to pin
scanout buffers to dedicated memory. So this cap is basically
DRM_CAP_PRIME_SCANOUT_ISNT_BUGGY. FWIW, I'm fairly skeptical about
adding caps which essentially declare the absence of bugs like this.

Cheers,
Daniel
Christopher James Halse Rogers April 4, 2017, 10:27 a.m. UTC | #3
On 4 April 2017 6:31:12 pm AEST, Daniel Vetter <daniel@ffwll.ch> wrote:
>On Tue, Apr 04, 2017 at 06:13:20PM +1000, raof@ubuntu.com wrote:
>> From: Christopher James Halse Rogers <raof@ubuntu.com>
>> 
>> Until recently, on (at least) nouveau, radeon, and amdgpu attempting
>to scanout of an
>> imported dma-buf would silently result in the dma-buf sharing being
>broken.
>> 
>> While the hardware is capable of scanning out of imported dma-bufs
>(at least in some circumstances),
>> these drivers do not currently implement it, so attempts to scan out
>of such buffers will never succeed.
>> 
>> Add a userspace-visible drm capability and associated driver_feature
>so that userspace can discover
>> when scanning out of an imported dma-buf can work.
>> 
>> Signed-off-by: Christopher James Halse Rogers
><christopher.halse.rogers@canonical.com>
>
>This seems like an awful specific special case. Why can't you do the
>entire dance upfront, i.e. import buffer, addfb2? None of that has any
>visible effect, and it will also allow you to check runtime constraints
>which can't be covered with this here.
>-Daniel

No, because addfb2 doesn't (or, rather, didn't) actually check any runtime constraints.

The problem only appeared when the buffer is actually *used* in a modeset - otherwise I could do a (reasonably) cheap import/addfb/render on exporter/read out on importer dance to detect it.

To be clear - this is trying to solve the problem “how can I tell if it's safe to addfb/pageflip rather than do a GL copy to a GPU-local buffer and flip from that?”.

If I could guarantee that I'd only ever run on 4.13-or-later kernels (I think that's when the previous patches will land?), then this would indeed be mostly unnecessary. It would save me a bunch of addfb calls that would predictably fail, but they're cheap.

>
>> ---
>>  drivers/gpu/drm/drm_ioctl.c |  3 +++
>>  include/drm/drm_drv.h       |  1 +
>>  include/uapi/drm/drm.h      | 21 +++++++++++++++++++++
>>  3 files changed, 25 insertions(+)
>> 
>> diff --git a/drivers/gpu/drm/drm_ioctl.c
>b/drivers/gpu/drm/drm_ioctl.c
>> index a7c61c23685a..79ccf638668e 100644
>> --- a/drivers/gpu/drm/drm_ioctl.c
>> +++ b/drivers/gpu/drm/drm_ioctl.c
>> @@ -285,6 +285,9 @@ static int drm_getcap(struct drm_device *dev,
>void *data, struct drm_file *file_
>>  	case DRM_CAP_ADDFB2_MODIFIERS:
>>  		req->value = dev->mode_config.allow_fb_modifiers;
>>  		break;
>> +	case DRM_CAP_PRIME_SCANOUT:
>> +		req->value = drm_core_check_feature(dev, DRIVER_PRIME_SCANOUT);
>> +		break;
>>  	default:
>>  		return -EINVAL;
>>  	}
>> diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
>> index 5699f42195fe..32cc0d956d7e 100644
>> --- a/include/drm/drm_drv.h
>> +++ b/include/drm/drm_drv.h
>> @@ -53,6 +53,7 @@ struct drm_mode_create_dumb;
>>  #define DRIVER_RENDER			0x8000
>>  #define DRIVER_ATOMIC			0x10000
>>  #define DRIVER_KMS_LEGACY_CONTEXT	0x20000
>> +#define DRIVER_PRIME_SCANOUT		0x40000
>>  
>>  /**
>>   * struct drm_driver - DRM driver structure
>> diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
>> index b2c52843bc70..c57213cdb702 100644
>> --- a/include/uapi/drm/drm.h
>> +++ b/include/uapi/drm/drm.h
>> @@ -647,6 +647,27 @@ struct drm_gem_open {
>>  #define DRM_CAP_CURSOR_HEIGHT		0x9
>>  #define DRM_CAP_ADDFB2_MODIFIERS	0x10
>>  #define DRM_CAP_PAGE_FLIP_TARGET	0x11
>> +/**
>> + * DRM_CAP_PRIME_SCANOUT
>> + *
>> + * The PRIME_SCANOUT capability returns whether the driver might be
>capable
>> + * of scanning out of an imported PRIME buffer.
>> + *
>> + * This does not guarantee that any imported buffer can be scanned
>out, as
>> + * there may be specific requirements that a given buffer might not
>satisfy.
>> + *
>> + * If this capability is false then imported PRIME buffers will
>definitely
>> + * never be suitable for scanout.
>> + *
>> + * Note: Prior to the introduction of this capability, bugs in
>drivers would
>> + * allow userspace to attempt to scan out of imported PRIME buffers.
>This would
>> + * work once, but move the buffer into an inconsistent state where
>rendering from
>> + * the exporting GPU would no longer be seen by the importing GPU.
>> + *
>> + * It is unsafe for driver-agnostic code to attempt to scan out of
>an imported
>> + * PRIME buffer in the absense of this capability.
>> + */
>> +#define DRM_CAP_PRIME_SCANOUT		0x12
>>  
>>  /** DRM_IOCTL_GET_CAP ioctl argument type */
>>  struct drm_get_cap {
>> -- 
>> 2.11.0
>> 
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Lucas Stach April 4, 2017, 10:43 a.m. UTC | #4
Am Dienstag, den 04.04.2017, 20:27 +1000 schrieb Christopher James Halse
Rogers:
> On 4 April 2017 6:31:12 pm AEST, Daniel Vetter <daniel@ffwll.ch> wrote:
> >On Tue, Apr 04, 2017 at 06:13:20PM +1000, raof@ubuntu.com wrote:
> >> From: Christopher James Halse Rogers <raof@ubuntu.com>
> >> 
> >> Until recently, on (at least) nouveau, radeon, and amdgpu attempting
> >to scanout of an
> >> imported dma-buf would silently result in the dma-buf sharing being
> >broken.
> >> 
> >> While the hardware is capable of scanning out of imported dma-bufs
> >(at least in some circumstances),
> >> these drivers do not currently implement it, so attempts to scan out
> >of such buffers will never succeed.
> >> 
> >> Add a userspace-visible drm capability and associated driver_feature
> >so that userspace can discover
> >> when scanning out of an imported dma-buf can work.
> >> 
> >> Signed-off-by: Christopher James Halse Rogers
> ><christopher.halse.rogers@canonical.com>
> >
> >This seems like an awful specific special case. Why can't you do the
> >entire dance upfront, i.e. import buffer, addfb2? None of that has any
> >visible effect, and it will also allow you to check runtime constraints
> >which can't be covered with this here.
> >-Daniel
> 
> No, because addfb2 doesn't (or, rather, didn't) actually check any
> runtime constraints.
> 
> The problem only appeared when the buffer is actually *used* in a
> modeset - otherwise I could do a (reasonably) cheap
> import/addfb/render on exporter/read out on importer dance to detect
> it.
> 
> To be clear - this is trying to solve the problem “how can I tell if
> it's safe to addfb/pageflip rather than do a GL copy to a GPU-local
> buffer and flip from that?”.
> 
> If I could guarantee that I'd only ever run on 4.13-or-later kernels
> (I think that's when the previous patches will land?), then this would
> indeed be mostly unnecessary. It would save me a bunch of addfb calls
> that would predictably fail, but they're cheap.

I don't think we ever had caps for "things are working now, as they are
supposed to". i915 wasn't properly synchronizing on foreign fences for a
long time, yet we didn't gain a cap for "cross device sync works now".

If your distro use-case relies on those things working it's probably
best to just backport the relevant fixes.

Regards,
Lucas
Daniel Stone April 4, 2017, 10:43 a.m. UTC | #5
Hi,

On 4 April 2017 at 11:27, Christopher James Halse Rogers
<chris@cooperteam.net> wrote:
> On 4 April 2017 6:31:12 pm AEST, Daniel Vetter <daniel@ffwll.ch> wrote:
>>This seems like an awful specific special case. Why can't you do the
>>entire dance upfront, i.e. import buffer, addfb2? None of that has any
>>visible effect, and it will also allow you to check runtime constraints
>>which can't be covered with this here.
>
> No, because addfb2 doesn't (or, rather, didn't) actually check any runtime constraints.
>
> The problem only appeared when the buffer is actually *used* in a modeset - otherwise I could do a (reasonably) cheap import/addfb/render on exporter/read out on importer dance to detect it.
>
> To be clear - this is trying to solve the problem “how can I tell if it's safe to addfb/pageflip rather than do a GL copy to a GPU-local buffer and flip from that?”.
>
> If I could guarantee that I'd only ever run on 4.13-or-later kernels (I think that's when the previous patches will land?), then this would indeed be mostly unnecessary. It would save me a bunch of addfb calls that would predictably fail, but they're cheap.

Yeah, the ABI is that AddFB2 should fail hard on something which can
never be used as a framebuffer. The fact it didn't is a bug rather
than a behavioural change per se ...

Cheers,
Daniel
Christian König April 4, 2017, 11:15 a.m. UTC | #6
Am 04.04.2017 um 12:43 schrieb Daniel Stone:
> Hi,
>
> On 4 April 2017 at 11:27, Christopher James Halse Rogers
> <chris@cooperteam.net> wrote:
>> On 4 April 2017 6:31:12 pm AEST, Daniel Vetter <daniel@ffwll.ch> wrote:
>>> This seems like an awful specific special case. Why can't you do the
>>> entire dance upfront, i.e. import buffer, addfb2? None of that has any
>>> visible effect, and it will also allow you to check runtime constraints
>>> which can't be covered with this here.
>> No, because addfb2 doesn't (or, rather, didn't) actually check any runtime constraints.
>>
>> The problem only appeared when the buffer is actually *used* in a modeset - otherwise I could do a (reasonably) cheap import/addfb/render on exporter/read out on importer dance to detect it.
>>
>> To be clear - this is trying to solve the problem “how can I tell if it's safe to addfb/pageflip rather than do a GL copy to a GPU-local buffer and flip from that?”.
>>
>> If I could guarantee that I'd only ever run on 4.13-or-later kernels (I think that's when the previous patches will land?), then this would indeed be mostly unnecessary. It would save me a bunch of addfb calls that would predictably fail, but they're cheap.
> Yeah, the ABI is that AddFB2 should fail hard on something which can
> never be used as a framebuffer. The fact it didn't is a bug rather
> than a behavioural change per se ...

I agree on that, but the problem Christopher tries to solve looks a bit 
different from my perspective.

He wants to know if a driver can scan out from a foreign BO or if he 
needs to copy the content.

The problem I see with this patch is that the kernel doesn't look like 
the right place for this decision to make.

Regards,
Christian.

>
> Cheers,
> Daniel
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Daniel Stone April 4, 2017, 11:32 a.m. UTC | #7
Hi,

On 4 April 2017 at 12:15, Christian König <deathsimple@vodafone.de> wrote:
> Am 04.04.2017 um 12:43 schrieb Daniel Stone:
>> Yeah, the ABI is that AddFB2 should fail hard on something which can
>> never be used as a framebuffer. The fact it didn't is a bug rather
>> than a behavioural change per se ...
>
> I agree on that, but the problem Christopher tries to solve looks a bit
> different from my perspective.
>
> He wants to know if a driver can scan out from a foreign BO or if he needs
> to copy the content.
>
> The problem I see with this patch is that the kernel doesn't look like the
> right place for this decision to make.

Any number of constraints exist there: 'can I use any completely
arbitrary dmabuf from anywhere?' doesn't express the full range. AddFB
can fail for other totally legitimate reasons, so to me it makes the
most sense to centralise the checks there.

Cheers,
Daniel
Christian König April 4, 2017, 11:43 a.m. UTC | #8
Am 04.04.2017 um 13:32 schrieb Daniel Stone:
> Hi,
>
> On 4 April 2017 at 12:15, Christian König <deathsimple@vodafone.de> wrote:
>> Am 04.04.2017 um 12:43 schrieb Daniel Stone:
>>> Yeah, the ABI is that AddFB2 should fail hard on something which can
>>> never be used as a framebuffer. The fact it didn't is a bug rather
>>> than a behavioural change per se ...
>> I agree on that, but the problem Christopher tries to solve looks a bit
>> different from my perspective.
>>
>> He wants to know if a driver can scan out from a foreign BO or if he needs
>> to copy the content.
>>
>> The problem I see with this patch is that the kernel doesn't look like the
>> right place for this decision to make.
> Any number of constraints exist there: 'can I use any completely
> arbitrary dmabuf from anywhere?' doesn't express the full range.
Yes, completely agree.

> AddFB can fail for other totally legitimate reasons, so to me it makes the
> most sense to centralise the checks there.
Sorry, sounds like I wasn't clear on this. Removing the check in was 
*not* what I've wanted to suggest.

The kernel of course needs to check if what userspace requested makes 
sense and fail with a proper error message if it doesn't.

But the negotiation if two kernel drivers can work with each others data 
formats for each use case is not something I would want to push into the 
kernel. For this we simply have to many and to complex constraints on this.

In other words if you have an AMD APU with integrated graphics and an 
AMD dGPU it is likely that both can understand what the other puts into 
its buffer, even when it is completely AMD specific. The AMD user space 
drivers already have code to handle such cases.

But when we have a combination of Intel+AMD or Intel+NVidia we need way 
more negotiation between the two driver stacks then just the placement 
of buffers.

Regards,
Christian.

>
> Cheers,
> Daniel
Daniel Vetter April 4, 2017, 11:53 a.m. UTC | #9
On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach <l.stach@pengutronix.de> wrote:
>> If I could guarantee that I'd only ever run on 4.13-or-later kernels
>> (I think that's when the previous patches will land?), then this would
>> indeed be mostly unnecessary. It would save me a bunch of addfb calls
>> that would predictably fail, but they're cheap.
>
> I don't think we ever had caps for "things are working now, as they are
> supposed to". i915 wasn't properly synchronizing on foreign fences for a
> long time, yet we didn't gain a cap for "cross device sync works now".
>
> If your distro use-case relies on those things working it's probably
> best to just backport the relevant fixes.

The even better solution for this is to push the backports through
upstream -stable kernels. This stuff here is simple enough that we can
do it. Same could have been done for the fairly minimal fencing fixes
for i915 (at least some of them, the ones in the page-flip).

Otherwise we'll end up with tons IM_NOT_BUGGY and
IM_SLIGHTLY_LESS_BUGGY and IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
flags where no one at all knows what they mean, usage between
different drivers and different userspace is entirely inconsistent and
they just all add to the confusion. They're just bugs, lets treat them
like that.
-Daniel
Christopher James Halse Rogers April 5, 2017, 12:20 a.m. UTC | #10
On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter <daniel@ffwll.ch> wrote:

> On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach <l.stach@pengutronix.de>
> wrote:
> >> If I could guarantee that I'd only ever run on 4.13-or-later kernels
> >> (I think that's when the previous patches will land?), then this would
> >> indeed be mostly unnecessary. It would save me a bunch of addfb calls
> >> that would predictably fail, but they're cheap.
> >
> > I don't think we ever had caps for "things are working now, as they are
> > supposed to". i915 wasn't properly synchronizing on foreign fences for a
> > long time, yet we didn't gain a cap for "cross device sync works now".
> >
> > If your distro use-case relies on those things working it's probably
> > best to just backport the relevant fixes.
>
> The even better solution for this is to push the backports through
> upstream -stable kernels. This stuff here is simple enough that we can
> do it. Same could have been done for the fairly minimal fencing fixes
> for i915 (at least some of them, the ones in the page-flip).
>
> Otherwise we'll end up with tons IM_NOT_BUGGY and
> IM_SLIGHTLY_LESS_BUGGY and IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
> flags where no one at all knows what they mean, usage between
> different drivers and different userspace is entirely inconsistent and
> they just all add to the confusion. They're just bugs, lets treat them
> like that.
>

It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the relevant
hardware allegedly supports it, nouveau/radeon/amdgpu don't do scanout of
GTT, so the lack of this cap indicates that there's no point in trying to
call addfb2.

But calling addfb2 and it failing is not expensive, so this is rather niche.

In practice I can just restrict attempting to scanout of imported buffers
to i915, as that's the only driver that it'll work on. By the time
nouveau/radeon/amdgpu get patches to scanout of GTT the fixes should be old
enough that I don't need to care about unfixed kernels.
Daniel Vetter April 5, 2017, 6:27 a.m. UTC | #11
On Wed, Apr 05, 2017 at 12:20:46AM +0000, Christopher James Halse Rogers wrote:
> On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> 
> > On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach <l.stach@pengutronix.de>
> > wrote:
> > >> If I could guarantee that I'd only ever run on 4.13-or-later kernels
> > >> (I think that's when the previous patches will land?), then this would
> > >> indeed be mostly unnecessary. It would save me a bunch of addfb calls
> > >> that would predictably fail, but they're cheap.
> > >
> > > I don't think we ever had caps for "things are working now, as they are
> > > supposed to". i915 wasn't properly synchronizing on foreign fences for a
> > > long time, yet we didn't gain a cap for "cross device sync works now".
> > >
> > > If your distro use-case relies on those things working it's probably
> > > best to just backport the relevant fixes.
> >
> > The even better solution for this is to push the backports through
> > upstream -stable kernels. This stuff here is simple enough that we can
> > do it. Same could have been done for the fairly minimal fencing fixes
> > for i915 (at least some of them, the ones in the page-flip).
> >
> > Otherwise we'll end up with tons IM_NOT_BUGGY and
> > IM_SLIGHTLY_LESS_BUGGY and IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
> > flags where no one at all knows what they mean, usage between
> > different drivers and different userspace is entirely inconsistent and
> > they just all add to the confusion. They're just bugs, lets treat them
> > like that.
> >
> 
> It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the relevant
> hardware allegedly supports it, nouveau/radeon/amdgpu don't do scanout of
> GTT, so the lack of this cap indicates that there's no point in trying to
> call addfb2.
> 
> But calling addfb2 and it failing is not expensive, so this is rather niche.
> 
> In practice I can just restrict attempting to scanout of imported buffers
> to i915, as that's the only driver that it'll work on. By the time
> nouveau/radeon/amdgpu get patches to scanout of GTT the fixes should be old
> enough that I don't need to care about unfixed kernels.

"I'll only run on i915" sounds like a rather risky assumption. You're sure
no one will install ubuntu on e.g. rasperry with Eric's shiny new pl111
driver?

I really think that if you want to rely on this, backporting the fixes is
perfectly fine.
-Daniel
Christopher James Halse Rogers April 5, 2017, 6:52 a.m. UTC | #12
On Wed, Apr 5, 2017 at 4:27 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Wed, Apr 05, 2017 at 12:20:46AM +0000, Christopher James Halse 
> Rogers wrote:
>>  On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter <daniel@ffwll.ch> 
>> wrote:
>> 
>>  > On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach 
>> <l.stach@pengutronix.de>
>>  > wrote:
>>  > >> If I could guarantee that I'd only ever run on 4.13-or-later 
>> kernels
>>  > >> (I think that's when the previous patches will land?), then 
>> this would
>>  > >> indeed be mostly unnecessary. It would save me a bunch of 
>> addfb calls
>>  > >> that would predictably fail, but they're cheap.
>>  > >
>>  > > I don't think we ever had caps for "things are working now, as 
>> they are
>>  > > supposed to". i915 wasn't properly synchronizing on foreign 
>> fences for a
>>  > > long time, yet we didn't gain a cap for "cross device sync 
>> works now".
>>  > >
>>  > > If your distro use-case relies on those things working it's 
>> probably
>>  > > best to just backport the relevant fixes.
>>  >
>>  > The even better solution for this is to push the backports through
>>  > upstream -stable kernels. This stuff here is simple enough that 
>> we can
>>  > do it. Same could have been done for the fairly minimal fencing 
>> fixes
>>  > for i915 (at least some of them, the ones in the page-flip).
>>  >
>>  > Otherwise we'll end up with tons IM_NOT_BUGGY and
>>  > IM_SLIGHTLY_LESS_BUGGY and 
>> IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
>>  > flags where no one at all knows what they mean, usage between
>>  > different drivers and different userspace is entirely 
>> inconsistent and
>>  > they just all add to the confusion. They're just bugs, lets treat 
>> them
>>  > like that.
>>  >
>> 
>>  It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the 
>> relevant
>>  hardware allegedly supports it, nouveau/radeon/amdgpu don't do 
>> scanout of
>>  GTT, so the lack of this cap indicates that there's no point in 
>> trying to
>>  call addfb2.
>> 
>>  But calling addfb2 and it failing is not expensive, so this is 
>> rather niche.
>> 
>>  In practice I can just restrict attempting to scanout of imported 
>> buffers
>>  to i915, as that's the only driver that it'll work on. By the time
>>  nouveau/radeon/amdgpu get patches to scanout of GTT the fixes 
>> should be old
>>  enough that I don't need to care about unfixed kernels.
> 
> "I'll only run on i915" sounds like a rather risky assumption. You're 
> sure
> no one will install ubuntu on e.g. rasperry with Eric's shiny new 
> pl111
> driver?
> 
> I really think that if you want to rely on this, backporting the 
> fixes is
> perfectly fine.

Oh, no! I mean *check* that I'm running on i915, and only use the 
no-copy path if I am.

Backporting the patches would indeed be sensible. I guess I should have 
CC: stable'd at the time.
Lucas Stach April 5, 2017, 8:15 a.m. UTC | #13
Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher James Halse
Rogers:
> 
> 
> On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> 
>         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
>         <l.stach@pengutronix.de> wrote:
>         >> If I could guarantee that I'd only ever run on
>         4.13-or-later kernels
>         >> (I think that's when the previous patches will land?), then
>         this would
>         >> indeed be mostly unnecessary. It would save me a bunch of
>         addfb calls
>         >> that would predictably fail, but they're cheap.
>         >
>         > I don't think we ever had caps for "things are working now,
>         as they are
>         > supposed to". i915 wasn't properly synchronizing on foreign
>         fences for a
>         > long time, yet we didn't gain a cap for "cross device sync
>         works now".
>         >
>         > If your distro use-case relies on those things working it's
>         probably
>         > best to just backport the relevant fixes.
>         
>         The even better solution for this is to push the backports
>         through
>         upstream -stable kernels. This stuff here is simple enough
>         that we can
>         do it. Same could have been done for the fairly minimal
>         fencing fixes
>         for i915 (at least some of them, the ones in the page-flip).
>         
>         Otherwise we'll end up with tons IM_NOT_BUGGY and
>         IM_SLIGHTLY_LESS_BUGGY and
>         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
>         flags where no one at all knows what they mean, usage between
>         different drivers and different userspace is entirely
>         inconsistent and
>         they just all add to the confusion. They're just bugs, lets
>         treat them
>         like that.
> 
> 
> It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the relevant
> hardware allegedly supports it, nouveau/radeon/amdgpu don't do scanout
> of GTT, so the lack of this cap indicates that there's no point in
> trying to call addfb2.
> 
> 
> But calling addfb2 and it failing is not expensive, so this is rather
> niche.
> 
> 
> In practice I can just restrict attempting to scanout of imported
> buffers to i915, as that's the only driver that it'll work on. By the
> time nouveau/radeon/amdgpu get patches to scanout of GTT the fixes
> should be old enough that I don't need to care about unfixed kernels.
> 
So given that most discreet hardware won't ever be able to scanout from
GTT (latency and iso requirements will be hard to meet), can't we just
fix the case of the broken prime sharing when migrating to VRAM?

I'm thinking about attaching an exclusive fence to the dma-buf when the
migration to VRAM happens, then when the GPU is done with the buffer we
can either write back any changes to GTT, or just drop the fence in case
the GPU didn't modify the buffer.

Regards,
Lucas
Daniel Vetter April 5, 2017, 9:59 a.m. UTC | #14
On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
> Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher James Halse
> Rogers:
> > 
> > 
> > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > 
> >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
> >         <l.stach@pengutronix.de> wrote:
> >         >> If I could guarantee that I'd only ever run on
> >         4.13-or-later kernels
> >         >> (I think that's when the previous patches will land?), then
> >         this would
> >         >> indeed be mostly unnecessary. It would save me a bunch of
> >         addfb calls
> >         >> that would predictably fail, but they're cheap.
> >         >
> >         > I don't think we ever had caps for "things are working now,
> >         as they are
> >         > supposed to". i915 wasn't properly synchronizing on foreign
> >         fences for a
> >         > long time, yet we didn't gain a cap for "cross device sync
> >         works now".
> >         >
> >         > If your distro use-case relies on those things working it's
> >         probably
> >         > best to just backport the relevant fixes.
> >         
> >         The even better solution for this is to push the backports
> >         through
> >         upstream -stable kernels. This stuff here is simple enough
> >         that we can
> >         do it. Same could have been done for the fairly minimal
> >         fencing fixes
> >         for i915 (at least some of them, the ones in the page-flip).
> >         
> >         Otherwise we'll end up with tons IM_NOT_BUGGY and
> >         IM_SLIGHTLY_LESS_BUGGY and
> >         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
> >         flags where no one at all knows what they mean, usage between
> >         different drivers and different userspace is entirely
> >         inconsistent and
> >         they just all add to the confusion. They're just bugs, lets
> >         treat them
> >         like that.
> > 
> > 
> > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the relevant
> > hardware allegedly supports it, nouveau/radeon/amdgpu don't do scanout
> > of GTT, so the lack of this cap indicates that there's no point in
> > trying to call addfb2.
> > 
> > 
> > But calling addfb2 and it failing is not expensive, so this is rather
> > niche.
> > 
> > 
> > In practice I can just restrict attempting to scanout of imported
> > buffers to i915, as that's the only driver that it'll work on. By the
> > time nouveau/radeon/amdgpu get patches to scanout of GTT the fixes
> > should be old enough that I don't need to care about unfixed kernels.
> > 
> So given that most discreet hardware won't ever be able to scanout from
> GTT (latency and iso requirements will be hard to meet), can't we just
> fix the case of the broken prime sharing when migrating to VRAM?
> 
> I'm thinking about attaching an exclusive fence to the dma-buf when the
> migration to VRAM happens, then when the GPU is done with the buffer we
> can either write back any changes to GTT, or just drop the fence in case
> the GPU didn't modify the buffer.

We could, but someone needs to type the code for it. There's also the
problem that you need to migrate back, and then doing all that behind
userspaces back might not be the best idea.
-Daniel
Lucas Stach April 5, 2017, 10:14 a.m. UTC | #15
Am Mittwoch, den 05.04.2017, 11:59 +0200 schrieb Daniel Vetter:
> On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
> > Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher James Halse
> > Rogers:
> > > 
> > > 
> > > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > 
> > >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
> > >         <l.stach@pengutronix.de> wrote:
> > >         >> If I could guarantee that I'd only ever run on
> > >         4.13-or-later kernels
> > >         >> (I think that's when the previous patches will land?), then
> > >         this would
> > >         >> indeed be mostly unnecessary. It would save me a bunch of
> > >         addfb calls
> > >         >> that would predictably fail, but they're cheap.
> > >         >
> > >         > I don't think we ever had caps for "things are working now,
> > >         as they are
> > >         > supposed to". i915 wasn't properly synchronizing on foreign
> > >         fences for a
> > >         > long time, yet we didn't gain a cap for "cross device sync
> > >         works now".
> > >         >
> > >         > If your distro use-case relies on those things working it's
> > >         probably
> > >         > best to just backport the relevant fixes.
> > >         
> > >         The even better solution for this is to push the backports
> > >         through
> > >         upstream -stable kernels. This stuff here is simple enough
> > >         that we can
> > >         do it. Same could have been done for the fairly minimal
> > >         fencing fixes
> > >         for i915 (at least some of them, the ones in the page-flip).
> > >         
> > >         Otherwise we'll end up with tons IM_NOT_BUGGY and
> > >         IM_SLIGHTLY_LESS_BUGGY and
> > >         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
> > >         flags where no one at all knows what they mean, usage between
> > >         different drivers and different userspace is entirely
> > >         inconsistent and
> > >         they just all add to the confusion. They're just bugs, lets
> > >         treat them
> > >         like that.
> > > 
> > > 
> > > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the relevant
> > > hardware allegedly supports it, nouveau/radeon/amdgpu don't do scanout
> > > of GTT, so the lack of this cap indicates that there's no point in
> > > trying to call addfb2.
> > > 
> > > 
> > > But calling addfb2 and it failing is not expensive, so this is rather
> > > niche.
> > > 
> > > 
> > > In practice I can just restrict attempting to scanout of imported
> > > buffers to i915, as that's the only driver that it'll work on. By the
> > > time nouveau/radeon/amdgpu get patches to scanout of GTT the fixes
> > > should be old enough that I don't need to care about unfixed kernels.
> > > 
> > So given that most discreet hardware won't ever be able to scanout from
> > GTT (latency and iso requirements will be hard to meet), can't we just
> > fix the case of the broken prime sharing when migrating to VRAM?
> > 
> > I'm thinking about attaching an exclusive fence to the dma-buf when the
> > migration to VRAM happens, then when the GPU is done with the buffer we
> > can either write back any changes to GTT, or just drop the fence in case
> > the GPU didn't modify the buffer.
> 
> We could, but someone needs to type the code for it. There's also the
> problem that you need to migrate back, and then doing all that behind
> userspaces back might not be the best idea.

Drivers with separate VRAM and GTT are already doing a lot of migration
behind the userspaces back. The only issue with dma-buf migration to
VRAM is that you probably don't want to migrate the pages, but duplicate
them in VRAM, doubling memory consumption with possible OOM. But then
you could alloc the memory on addfb where you are able to return proper
errors.

I guess what I'm saying is that userspace really should check if addfb
for imported buffers works and base its decisions on that, not some
arbitrary "is this driver XY" or "does it have magic DRM_CAP".  This way
we can enable transparent VRAM migration (making addfb on them work) if
someone is interested in this, without further changes to the UAPI.

Regards,
Lucas
Christopher James Halse Rogers April 5, 2017, 11:13 a.m. UTC | #16
On Wed, Apr 5, 2017 at 8:14 PM Lucas Stach <l.stach@pengutronix.de> wrote:

> Am Mittwoch, den 05.04.2017, 11:59 +0200 schrieb Daniel Vetter:
> > On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
> > > Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher James
> Halse
> > > Rogers:
> > > >
> > > >
> > > > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter <daniel@ffwll.ch>
> wrote:
> > > >
> > > >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
> > > >         <l.stach@pengutronix.de> wrote:
> > > >         >> If I could guarantee that I'd only ever run on
> > > >         4.13-or-later kernels
> > > >         >> (I think that's when the previous patches will land?),
> then
> > > >         this would
> > > >         >> indeed be mostly unnecessary. It would save me a bunch of
> > > >         addfb calls
> > > >         >> that would predictably fail, but they're cheap.
> > > >         >
> > > >         > I don't think we ever had caps for "things are working now,
> > > >         as they are
> > > >         > supposed to". i915 wasn't properly synchronizing on foreign
> > > >         fences for a
> > > >         > long time, yet we didn't gain a cap for "cross device sync
> > > >         works now".
> > > >         >
> > > >         > If your distro use-case relies on those things working it's
> > > >         probably
> > > >         > best to just backport the relevant fixes.
> > > >
> > > >         The even better solution for this is to push the backports
> > > >         through
> > > >         upstream -stable kernels. This stuff here is simple enough
> > > >         that we can
> > > >         do it. Same could have been done for the fairly minimal
> > > >         fencing fixes
> > > >         for i915 (at least some of them, the ones in the page-flip).
> > > >
> > > >         Otherwise we'll end up with tons IM_NOT_BUGGY and
> > > >         IM_SLIGHTLY_LESS_BUGGY and
> > > >         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
> > > >         flags where no one at all knows what they mean, usage between
> > > >         different drivers and different userspace is entirely
> > > >         inconsistent and
> > > >         they just all add to the confusion. They're just bugs, lets
> > > >         treat them
> > > >         like that.
> > > >
> > > >
> > > > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the relevant
> > > > hardware allegedly supports it, nouveau/radeon/amdgpu don't do
> scanout
> > > > of GTT, so the lack of this cap indicates that there's no point in
> > > > trying to call addfb2.
> > > >
> > > >
> > > > But calling addfb2 and it failing is not expensive, so this is rather
> > > > niche.
> > > >
> > > >
> > > > In practice I can just restrict attempting to scanout of imported
> > > > buffers to i915, as that's the only driver that it'll work on. By the
> > > > time nouveau/radeon/amdgpu get patches to scanout of GTT the fixes
> > > > should be old enough that I don't need to care about unfixed kernels.
> > > >
> > > So given that most discreet hardware won't ever be able to scanout from
> > > GTT (latency and iso requirements will be hard to meet), can't we just
> > > fix the case of the broken prime sharing when migrating to VRAM?
>

At least some nouveau and AMD devs seem to think that their hardware is
capable of doing it. Shrug.

> >
> > > I'm thinking about attaching an exclusive fence to the dma-buf when the
> > > migration to VRAM happens, then when the GPU is done with the buffer we
> > > can either write back any changes to GTT, or just drop the fence in
> case
> > > the GPU didn't modify the buffer.
> >
> > We could, but someone needs to type the code for it. There's also the
> > problem that you need to migrate back, and then doing all that behind
> > userspaces back might not be the best idea.
>
> Drivers with separate VRAM and GTT are already doing a lot of migration
> behind the userspaces back. The only issue with dma-buf migration to
> VRAM is that you probably don't want to migrate the pages, but duplicate
> them in VRAM, doubling memory consumption with possible OOM. But then
> you could alloc the memory on addfb where you are able to return proper
> errors.
>

I would *love* for the driver to copy the pages for me into VRAM for
scanout, rather than me having to spin up an EGL context and run the
trivial blitting shader across an EGLImage.

Are you offering to do it? :)

I'll still need to, for the short term, assume that only i915 can do this
without breaking, though.
Christian König April 5, 2017, 11:21 a.m. UTC | #17
Am 05.04.2017 um 13:13 schrieb Christopher James Halse Rogers:
> On Wed, Apr 5, 2017 at 8:14 PM Lucas Stach <l.stach@pengutronix.de 
> <mailto:l.stach@pengutronix.de>> wrote:
>
>     Am Mittwoch, den 05.04.2017, 11:59 +0200 schrieb Daniel Vetter:
>     > On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
>     > > Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher
>     James Halse
>     > > Rogers:
>     > > >
>     > > >
>     > > > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter
>     <daniel@ffwll.ch <mailto:daniel@ffwll.ch>> wrote:
>     > > >
>     > > >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
>     > > >         <l.stach@pengutronix.de
>     <mailto:l.stach@pengutronix.de>> wrote:
>     > > >         >> If I could guarantee that I'd only ever run on
>     > > >         4.13-or-later kernels
>     > > >         >> (I think that's when the previous patches will
>     land?), then
>     > > >         this would
>     > > >         >> indeed be mostly unnecessary. It would save me a
>     bunch of
>     > > >         addfb calls
>     > > >         >> that would predictably fail, but they're cheap.
>     > > >         >
>     > > >         > I don't think we ever had caps for "things are
>     working now,
>     > > >         as they are
>     > > >         > supposed to". i915 wasn't properly synchronizing
>     on foreign
>     > > >         fences for a
>     > > >         > long time, yet we didn't gain a cap for "cross
>     device sync
>     > > >         works now".
>     > > >         >
>     > > >         > If your distro use-case relies on those things
>     working it's
>     > > >         probably
>     > > >         > best to just backport the relevant fixes.
>     > > >
>     > > >         The even better solution for this is to push the
>     backports
>     > > >         through
>     > > >         upstream -stable kernels. This stuff here is simple
>     enough
>     > > >         that we can
>     > > >         do it. Same could have been done for the fairly minimal
>     > > >         fencing fixes
>     > > >         for i915 (at least some of them, the ones in the
>     page-flip).
>     > > >
>     > > >         Otherwise we'll end up with tons IM_NOT_BUGGY and
>     > > >         IM_SLIGHTLY_LESS_BUGGY and
>     > > >  IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
>     > > >         flags where no one at all knows what they mean,
>     usage between
>     > > >         different drivers and different userspace is entirely
>     > > >         inconsistent and
>     > > >         they just all add to the confusion. They're just
>     bugs, lets
>     > > >         treat them
>     > > >         like that.
>     > > >
>     > > >
>     > > > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the
>     relevant
>     > > > hardware allegedly supports it, nouveau/radeon/amdgpu don't
>     do scanout
>     > > > of GTT, so the lack of this cap indicates that there's no
>     point in
>     > > > trying to call addfb2.
>     > > >
>     > > >
>     > > > But calling addfb2 and it failing is not expensive, so this
>     is rather
>     > > > niche.
>     > > >
>     > > >
>     > > > In practice I can just restrict attempting to scanout of
>     imported
>     > > > buffers to i915, as that's the only driver that it'll work
>     on. By the
>     > > > time nouveau/radeon/amdgpu get patches to scanout of GTT the
>     fixes
>     > > > should be old enough that I don't need to care about unfixed
>     kernels.
>     > > >
>     > > So given that most discreet hardware won't ever be able to
>     scanout from
>     > > GTT (latency and iso requirements will be hard to meet), can't
>     we just
>     > > fix the case of the broken prime sharing when migrating to VRAM?
>
>
> At least some nouveau and AMD devs seem to think that their hardware 
> is capable of doing it. Shrug.

Wow, wait a second. Recent AMD GPU can scanout from system memory, 
that's true.

But you need to met quite a bunch of special allocation requirements to 
do this.

When we are talking about sharing between AMD GPUs, (e.g. both exporter 
and importer are AMD hardware) than that might work.

But I think it's unrealistic that an imported BO (created by an external 
driver stack) will ever meet those requirements.

Christian.

>
>     > >
>     > > I'm thinking about attaching an exclusive fence to the dma-buf
>     when the
>     > > migration to VRAM happens, then when the GPU is done with the
>     buffer we
>     > > can either write back any changes to GTT, or just drop the
>     fence in case
>     > > the GPU didn't modify the buffer.
>     >
>     > We could, but someone needs to type the code for it. There's
>     also the
>     > problem that you need to migrate back, and then doing all that
>     behind
>     > userspaces back might not be the best idea.
>
>     Drivers with separate VRAM and GTT are already doing a lot of
>     migration
>     behind the userspaces back. The only issue with dma-buf migration to
>     VRAM is that you probably don't want to migrate the pages, but
>     duplicate
>     them in VRAM, doubling memory consumption with possible OOM. But then
>     you could alloc the memory on addfb where you are able to return
>     proper
>     errors.
>
>
> I would *love* for the driver to copy the pages for me into VRAM for 
> scanout, rather than me having to spin up an EGL context and run the 
> trivial blitting shader across an EGLImage.
>
> Are you offering to do it? :)
>
> I'll still need to, for the short term, assume that only i915 can do 
> this without breaking, though.
>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Christopher James Halse Rogers April 6, 2017, 7:47 a.m. UTC | #18
On Wed, 5 Apr 2017 at 20:14 Lucas Stach <l.stach@pengutronix.de> wrote:

> Am Mittwoch, den 05.04.2017, 11:59 +0200 schrieb Daniel Vetter:
> > On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
> > > Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher James
> Halse
> > > Rogers:
> > > >
> > > >
> > > > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter <daniel@ffwll.ch>
> wrote:
> > > >
> > > >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
> > > >         <l.stach@pengutronix.de> wrote:
> > > >         >> If I could guarantee that I'd only ever run on
> > > >         4.13-or-later kernels
> > > >         >> (I think that's when the previous patches will land?),
> then
> > > >         this would
> > > >         >> indeed be mostly unnecessary. It would save me a bunch of
> > > >         addfb calls
> > > >         >> that would predictably fail, but they're cheap.
> > > >         >
> > > >         > I don't think we ever had caps for "things are working now,
> > > >         as they are
> > > >         > supposed to". i915 wasn't properly synchronizing on foreign
> > > >         fences for a
> > > >         > long time, yet we didn't gain a cap for "cross device sync
> > > >         works now".
> > > >         >
> > > >         > If your distro use-case relies on those things working it's
> > > >         probably
> > > >         > best to just backport the relevant fixes.
> > > >
> > > >         The even better solution for this is to push the backports
> > > >         through
> > > >         upstream -stable kernels. This stuff here is simple enough
> > > >         that we can
> > > >         do it. Same could have been done for the fairly minimal
> > > >         fencing fixes
> > > >         for i915 (at least some of them, the ones in the page-flip).
> > > >
> > > >         Otherwise we'll end up with tons IM_NOT_BUGGY and
> > > >         IM_SLIGHTLY_LESS_BUGGY and
> > > >         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
> > > >         flags where no one at all knows what they mean, usage between
> > > >         different drivers and different userspace is entirely
> > > >         inconsistent and
> > > >         they just all add to the confusion. They're just bugs, lets
> > > >         treat them
> > > >         like that.
> > > >
> > > >
> > > > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the relevant
> > > > hardware allegedly supports it, nouveau/radeon/amdgpu don't do
> scanout
> > > > of GTT, so the lack of this cap indicates that there's no point in
> > > > trying to call addfb2.
> > > >
> > > >
> > > > But calling addfb2 and it failing is not expensive, so this is rather
> > > > niche.
> > > >
> > > >
> > > > In practice I can just restrict attempting to scanout of imported
> > > > buffers to i915, as that's the only driver that it'll work on. By the
> > > > time nouveau/radeon/amdgpu get patches to scanout of GTT the fixes
> > > > should be old enough that I don't need to care about unfixed kernels.
> > > >
> > > So given that most discreet hardware won't ever be able to scanout from
> > > GTT (latency and iso requirements will be hard to meet), can't we just
> > > fix the case of the broken prime sharing when migrating to VRAM?
> > >
> > > I'm thinking about attaching an exclusive fence to the dma-buf when the
> > > migration to VRAM happens, then when the GPU is done with the buffer we
> > > can either write back any changes to GTT, or just drop the fence in
> case
> > > the GPU didn't modify the buffer.
> >
> > We could, but someone needs to type the code for it. There's also the
> > problem that you need to migrate back, and then doing all that behind
> > userspaces back might not be the best idea.
>
> Drivers with separate VRAM and GTT are already doing a lot of migration
> behind the userspaces back. The only issue with dma-buf migration to
> VRAM is that you probably don't want to migrate the pages, but duplicate
> them in VRAM, doubling memory consumption with possible OOM. But then
> you could alloc the memory on addfb where you are able to return proper
> errors.


Hm. So, on a first inspection, this looks like something I could actually
cook up.

Looking at amdgpu it seems like the thing to do would be to associate a
shadow-bo in VRAM for the imported dma-buf in the addfb call, then
pin-and-copy-to the shadow bo in the places where the bo is currently
pinned.

Is this approach likely to be acceptable?
Michel Dänzer April 10, 2017, 8:51 a.m. UTC | #19
On 06/04/17 04:47 PM, Christopher James Halse Rogers wrote:
> On Wed, 5 Apr 2017 at 20:14 Lucas Stach <l.stach@pengutronix.de
> <mailto:l.stach@pengutronix.de>> wrote:
> 
>     Am Mittwoch, den 05.04.2017, 11:59 +0200 schrieb Daniel Vetter:
>     > On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
>     > > Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher
>     James Halse
>     > > Rogers:
>     > > >
>     > > >
>     > > > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter <daniel@ffwll.ch
>     <mailto:daniel@ffwll.ch>> wrote:
>     > > >
>     > > >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
>     > > >         <l.stach@pengutronix.de
>     <mailto:l.stach@pengutronix.de>> wrote:
>     > > >         >> If I could guarantee that I'd only ever run on
>     > > >         4.13-or-later kernels
>     > > >         >> (I think that's when the previous patches will
>     land?), then
>     > > >         this would
>     > > >         >> indeed be mostly unnecessary. It would save me a
>     bunch of
>     > > >         addfb calls
>     > > >         >> that would predictably fail, but they're cheap.
>     > > >         >
>     > > >         > I don't think we ever had caps for "things are
>     working now,
>     > > >         as they are
>     > > >         > supposed to". i915 wasn't properly synchronizing on
>     foreign
>     > > >         fences for a
>     > > >         > long time, yet we didn't gain a cap for "cross
>     device sync
>     > > >         works now".
>     > > >         >
>     > > >         > If your distro use-case relies on those things
>     working it's
>     > > >         probably
>     > > >         > best to just backport the relevant fixes.
>     > > >
>     > > >         The even better solution for this is to push the backports
>     > > >         through
>     > > >         upstream -stable kernels. This stuff here is simple enough
>     > > >         that we can
>     > > >         do it. Same could have been done for the fairly minimal
>     > > >         fencing fixes
>     > > >         for i915 (at least some of them, the ones in the
>     page-flip).
>     > > >
>     > > >         Otherwise we'll end up with tons IM_NOT_BUGGY and
>     > > >         IM_SLIGHTLY_LESS_BUGGY and
>     > > >         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
>     > > >         flags where no one at all knows what they mean, usage
>     between
>     > > >         different drivers and different userspace is entirely
>     > > >         inconsistent and
>     > > >         they just all add to the confusion. They're just bugs,
>     lets
>     > > >         treat them
>     > > >         like that.
>     > > >
>     > > >
>     > > > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the
>     relevant
>     > > > hardware allegedly supports it, nouveau/radeon/amdgpu don't do
>     scanout
>     > > > of GTT, so the lack of this cap indicates that there's no point in
>     > > > trying to call addfb2.
>     > > >
>     > > >
>     > > > But calling addfb2 and it failing is not expensive, so this is
>     rather
>     > > > niche.
>     > > >
>     > > >
>     > > > In practice I can just restrict attempting to scanout of imported
>     > > > buffers to i915, as that's the only driver that it'll work on.
>     By the
>     > > > time nouveau/radeon/amdgpu get patches to scanout of GTT the fixes
>     > > > should be old enough that I don't need to care about unfixed
>     kernels.
>     > > >
>     > > So given that most discreet hardware won't ever be able to
>     scanout from
>     > > GTT (latency and iso requirements will be hard to meet), can't
>     we just
>     > > fix the case of the broken prime sharing when migrating to VRAM?
>     > >
>     > > I'm thinking about attaching an exclusive fence to the dma-buf
>     when the
>     > > migration to VRAM happens, then when the GPU is done with the
>     buffer we
>     > > can either write back any changes to GTT, or just drop the fence
>     in case
>     > > the GPU didn't modify the buffer.
>     >
>     > We could, but someone needs to type the code for it. There's also the
>     > problem that you need to migrate back, and then doing all that behind
>     > userspaces back might not be the best idea.
> 
>     Drivers with separate VRAM and GTT are already doing a lot of migration
>     behind the userspaces back. The only issue with dma-buf migration to
>     VRAM is that you probably don't want to migrate the pages, but duplicate
>     them in VRAM, doubling memory consumption with possible OOM. But then
>     you could alloc the memory on addfb where you are able to return proper
>     errors.
> 
> 
> Hm. So, on a first inspection, this looks like something I could
> actually cook up.
> 
> Looking at amdgpu it seems like the thing to do would be to associate a
> shadow-bo in VRAM for the imported dma-buf in the addfb call, then
> pin-and-copy-to the shadow bo in the places where the bo is currently
> pinned.
> 
> Is this approach likely to be acceptable?

It would break e.g. with DRI2 flips, because they replace the screen
pixmap buffer with the buffer we're flipping to. If the app stops
flipping while such a shadow BO is being scanned out, later draws to the
screen pixmap won't become visible.
Michel Dänzer April 10, 2017, 8:52 a.m. UTC | #20
On 05/04/17 08:21 PM, Christian König wrote:
> Am 05.04.2017 um 13:13 schrieb Christopher James Halse Rogers:
>> On Wed, Apr 5, 2017 at 8:14 PM Lucas Stach <l.stach@pengutronix.de
>> <mailto:l.stach@pengutronix.de>> wrote:
>>
>>     Am Mittwoch, den 05.04.2017, 11:59 +0200 schrieb Daniel Vetter:
>>     > On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
>>     > > Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher
>>     James Halse
>>     > > Rogers:
>>     > > >
>>     > > >
>>     > > > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter
>>     <daniel@ffwll.ch <mailto:daniel@ffwll.ch>> wrote:
>>     > > >
>>     > > >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
>>     > > >         <l.stach@pengutronix.de
>>     <mailto:l.stach@pengutronix.de>> wrote:
>>     > > >         >> If I could guarantee that I'd only ever run on
>>     > > >         4.13-or-later kernels
>>     > > >         >> (I think that's when the previous patches will
>>     land?), then
>>     > > >         this would
>>     > > >         >> indeed be mostly unnecessary. It would save me a
>>     bunch of
>>     > > >         addfb calls
>>     > > >         >> that would predictably fail, but they're cheap.
>>     > > >         >
>>     > > >         > I don't think we ever had caps for "things are
>>     working now,
>>     > > >         as they are
>>     > > >         > supposed to". i915 wasn't properly synchronizing
>>     on foreign
>>     > > >         fences for a
>>     > > >         > long time, yet we didn't gain a cap for "cross
>>     device sync
>>     > > >         works now".
>>     > > >         >
>>     > > >         > If your distro use-case relies on those things
>>     working it's
>>     > > >         probably
>>     > > >         > best to just backport the relevant fixes.
>>     > > >
>>     > > >         The even better solution for this is to push the
>>     backports
>>     > > >         through
>>     > > >         upstream -stable kernels. This stuff here is simple
>>     enough
>>     > > >         that we can
>>     > > >         do it. Same could have been done for the fairly minimal
>>     > > >         fencing fixes
>>     > > >         for i915 (at least some of them, the ones in the
>>     page-flip).
>>     > > >
>>     > > >         Otherwise we'll end up with tons IM_NOT_BUGGY and
>>     > > >         IM_SLIGHTLY_LESS_BUGGY and
>>     > > >         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
>>     > > >         flags where no one at all knows what they mean,
>>     usage between
>>     > > >         different drivers and different userspace is entirely
>>     > > >         inconsistent and
>>     > > >         they just all add to the confusion. They're just
>>     bugs, lets
>>     > > >         treat them
>>     > > >         like that.
>>     > > >
>>     > > >
>>     > > > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the
>>     relevant
>>     > > > hardware allegedly supports it, nouveau/radeon/amdgpu don't
>>     do scanout
>>     > > > of GTT, so the lack of this cap indicates that there's no
>>     point in
>>     > > > trying to call addfb2.
>>     > > >
>>     > > >
>>     > > > But calling addfb2 and it failing is not expensive, so this
>>     is rather
>>     > > > niche.
>>     > > >
>>     > > >
>>     > > > In practice I can just restrict attempting to scanout of
>>     imported
>>     > > > buffers to i915, as that's the only driver that it'll work
>>     on. By the
>>     > > > time nouveau/radeon/amdgpu get patches to scanout of GTT the
>>     fixes
>>     > > > should be old enough that I don't need to care about unfixed
>>     kernels.
>>     > > >
>>     > > So given that most discreet hardware won't ever be able to
>>     scanout from
>>     > > GTT (latency and iso requirements will be hard to meet), can't
>>     we just
>>     > > fix the case of the broken prime sharing when migrating to VRAM?
>>
>>
>> At least some nouveau and AMD devs seem to think that their hardware
>> is capable of doing it. Shrug.
> 
> Wow, wait a second. Recent AMD GPU can scanout from system memory,
> that's true.

Even discrete GPUs, or only APUs?


> But you need to met quite a bunch of special allocation requirements to
> do this.
> 
> When we are talking about sharing between AMD GPUs, (e.g. both exporter
> and importer are AMD hardware) than that might work.
> 
> But I think it's unrealistic that an imported BO (created by an external
> driver stack) will ever meet those requirements.

Indeed. Also, none of the GPUs supported by the radeon driver support this.
Christian König April 10, 2017, 9:03 a.m. UTC | #21
Am 10.04.2017 um 10:52 schrieb Michel Dänzer:
> On 05/04/17 08:21 PM, Christian König wrote:
>> Am 05.04.2017 um 13:13 schrieb Christopher James Halse Rogers:
>>> On Wed, Apr 5, 2017 at 8:14 PM Lucas Stach <l.stach@pengutronix.de
>>> <mailto:l.stach@pengutronix.de>> wrote:
>>>
>>>      Am Mittwoch, den 05.04.2017, 11:59 +0200 schrieb Daniel Vetter:
>>>      > On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
>>>      > > Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher
>>>      James Halse
>>>      > > Rogers:
>>>      > > >
>>>      > > >
>>>      > > > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter
>>>      <daniel@ffwll.ch <mailto:daniel@ffwll.ch>> wrote:
>>>      > > >
>>>      > > >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
>>>      > > >         <l.stach@pengutronix.de
>>>      <mailto:l.stach@pengutronix.de>> wrote:
>>>      > > >         >> If I could guarantee that I'd only ever run on
>>>      > > >         4.13-or-later kernels
>>>      > > >         >> (I think that's when the previous patches will
>>>      land?), then
>>>      > > >         this would
>>>      > > >         >> indeed be mostly unnecessary. It would save me a
>>>      bunch of
>>>      > > >         addfb calls
>>>      > > >         >> that would predictably fail, but they're cheap.
>>>      > > >         >
>>>      > > >         > I don't think we ever had caps for "things are
>>>      working now,
>>>      > > >         as they are
>>>      > > >         > supposed to". i915 wasn't properly synchronizing
>>>      on foreign
>>>      > > >         fences for a
>>>      > > >         > long time, yet we didn't gain a cap for "cross
>>>      device sync
>>>      > > >         works now".
>>>      > > >         >
>>>      > > >         > If your distro use-case relies on those things
>>>      working it's
>>>      > > >         probably
>>>      > > >         > best to just backport the relevant fixes.
>>>      > > >
>>>      > > >         The even better solution for this is to push the
>>>      backports
>>>      > > >         through
>>>      > > >         upstream -stable kernels. This stuff here is simple
>>>      enough
>>>      > > >         that we can
>>>      > > >         do it. Same could have been done for the fairly minimal
>>>      > > >         fencing fixes
>>>      > > >         for i915 (at least some of them, the ones in the
>>>      page-flip).
>>>      > > >
>>>      > > >         Otherwise we'll end up with tons IM_NOT_BUGGY and
>>>      > > >         IM_SLIGHTLY_LESS_BUGGY and
>>>      > > >         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
>>>      > > >         flags where no one at all knows what they mean,
>>>      usage between
>>>      > > >         different drivers and different userspace is entirely
>>>      > > >         inconsistent and
>>>      > > >         they just all add to the confusion. They're just
>>>      bugs, lets
>>>      > > >         treat them
>>>      > > >         like that.
>>>      > > >
>>>      > > >
>>>      > > > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the
>>>      relevant
>>>      > > > hardware allegedly supports it, nouveau/radeon/amdgpu don't
>>>      do scanout
>>>      > > > of GTT, so the lack of this cap indicates that there's no
>>>      point in
>>>      > > > trying to call addfb2.
>>>      > > >
>>>      > > >
>>>      > > > But calling addfb2 and it failing is not expensive, so this
>>>      is rather
>>>      > > > niche.
>>>      > > >
>>>      > > >
>>>      > > > In practice I can just restrict attempting to scanout of
>>>      imported
>>>      > > > buffers to i915, as that's the only driver that it'll work
>>>      on. By the
>>>      > > > time nouveau/radeon/amdgpu get patches to scanout of GTT the
>>>      fixes
>>>      > > > should be old enough that I don't need to care about unfixed
>>>      kernels.
>>>      > > >
>>>      > > So given that most discreet hardware won't ever be able to
>>>      scanout from
>>>      > > GTT (latency and iso requirements will be hard to meet), can't
>>>      we just
>>>      > > fix the case of the broken prime sharing when migrating to VRAM?
>>>
>>>
>>> At least some nouveau and AMD devs seem to think that their hardware
>>> is capable of doing it. Shrug.
>> Wow, wait a second. Recent AMD GPU can scanout from system memory,
>> that's true.
> Even discrete GPUs, or only APUs?

Good question. The crux is that you need to match certain bandwith and 
latency limitations or otherwise you get a flickering picture.

If I understood the documentation correctly that should even work with 
dGPUs in theory, but nobody is testing/validating this.

Long story short I wouldn't try it without feedback from the hardware/DC 
guys. They are the designated experts for this.

Regards,
Christian.

>
>
>> But you need to met quite a bunch of special allocation requirements to
>> do this.
>>
>> When we are talking about sharing between AMD GPUs, (e.g. both exporter
>> and importer are AMD hardware) than that might work.
>>
>> But I think it's unrealistic that an imported BO (created by an external
>> driver stack) will ever meet those requirements.
> Indeed. Also, none of the GPUs supported by the radeon driver support this.
>
>
Alex Deucher April 10, 2017, 2:10 p.m. UTC | #22
On Mon, Apr 10, 2017 at 5:03 AM, Christian König
<christian.koenig@amd.com> wrote:
> Am 10.04.2017 um 10:52 schrieb Michel Dänzer:
>>
>> On 05/04/17 08:21 PM, Christian König wrote:
>>>
>>> Am 05.04.2017 um 13:13 schrieb Christopher James Halse Rogers:
>>>>
>>>> On Wed, Apr 5, 2017 at 8:14 PM Lucas Stach <l.stach@pengutronix.de
>>>> <mailto:l.stach@pengutronix.de>> wrote:
>>>>
>>>>      Am Mittwoch, den 05.04.2017, 11:59 +0200 schrieb Daniel Vetter:
>>>>      > On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
>>>>      > > Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher
>>>>      James Halse
>>>>      > > Rogers:
>>>>      > > >
>>>>      > > >
>>>>      > > > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter
>>>>      <daniel@ffwll.ch <mailto:daniel@ffwll.ch>> wrote:
>>>>      > > >
>>>>      > > >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
>>>>      > > >         <l.stach@pengutronix.de
>>>>      <mailto:l.stach@pengutronix.de>> wrote:
>>>>      > > >         >> If I could guarantee that I'd only ever run on
>>>>      > > >         4.13-or-later kernels
>>>>      > > >         >> (I think that's when the previous patches will
>>>>      land?), then
>>>>      > > >         this would
>>>>      > > >         >> indeed be mostly unnecessary. It would save me a
>>>>      bunch of
>>>>      > > >         addfb calls
>>>>      > > >         >> that would predictably fail, but they're cheap.
>>>>      > > >         >
>>>>      > > >         > I don't think we ever had caps for "things are
>>>>      working now,
>>>>      > > >         as they are
>>>>      > > >         > supposed to". i915 wasn't properly synchronizing
>>>>      on foreign
>>>>      > > >         fences for a
>>>>      > > >         > long time, yet we didn't gain a cap for "cross
>>>>      device sync
>>>>      > > >         works now".
>>>>      > > >         >
>>>>      > > >         > If your distro use-case relies on those things
>>>>      working it's
>>>>      > > >         probably
>>>>      > > >         > best to just backport the relevant fixes.
>>>>      > > >
>>>>      > > >         The even better solution for this is to push the
>>>>      backports
>>>>      > > >         through
>>>>      > > >         upstream -stable kernels. This stuff here is simple
>>>>      enough
>>>>      > > >         that we can
>>>>      > > >         do it. Same could have been done for the fairly
>>>> minimal
>>>>      > > >         fencing fixes
>>>>      > > >         for i915 (at least some of them, the ones in the
>>>>      page-flip).
>>>>      > > >
>>>>      > > >         Otherwise we'll end up with tons IM_NOT_BUGGY and
>>>>      > > >         IM_SLIGHTLY_LESS_BUGGY and
>>>>      > > >         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
>>>>      > > >         flags where no one at all knows what they mean,
>>>>      usage between
>>>>      > > >         different drivers and different userspace is entirely
>>>>      > > >         inconsistent and
>>>>      > > >         they just all add to the confusion. They're just
>>>>      bugs, lets
>>>>      > > >         treat them
>>>>      > > >         like that.
>>>>      > > >
>>>>      > > >
>>>>      > > > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while the
>>>>      relevant
>>>>      > > > hardware allegedly supports it, nouveau/radeon/amdgpu don't
>>>>      do scanout
>>>>      > > > of GTT, so the lack of this cap indicates that there's no
>>>>      point in
>>>>      > > > trying to call addfb2.
>>>>      > > >
>>>>      > > >
>>>>      > > > But calling addfb2 and it failing is not expensive, so this
>>>>      is rather
>>>>      > > > niche.
>>>>      > > >
>>>>      > > >
>>>>      > > > In practice I can just restrict attempting to scanout of
>>>>      imported
>>>>      > > > buffers to i915, as that's the only driver that it'll work
>>>>      on. By the
>>>>      > > > time nouveau/radeon/amdgpu get patches to scanout of GTT the
>>>>      fixes
>>>>      > > > should be old enough that I don't need to care about unfixed
>>>>      kernels.
>>>>      > > >
>>>>      > > So given that most discreet hardware won't ever be able to
>>>>      scanout from
>>>>      > > GTT (latency and iso requirements will be hard to meet), can't
>>>>      we just
>>>>      > > fix the case of the broken prime sharing when migrating to
>>>> VRAM?
>>>>
>>>>
>>>> At least some nouveau and AMD devs seem to think that their hardware
>>>> is capable of doing it. Shrug.
>>>
>>> Wow, wait a second. Recent AMD GPU can scanout from system memory,
>>> that's true.
>>
>> Even discrete GPUs, or only APUs?
>
>
> Good question. The crux is that you need to match certain bandwith and
> latency limitations or otherwise you get a flickering picture.
>
> If I understood the documentation correctly that should even work with dGPUs
> in theory, but nobody is testing/validating this.
>
> Long story short I wouldn't try it without feedback from the hardware/DC
> guys. They are the designated experts for this.

It's only been validated on APUs.

Alex


>
> Regards,
> Christian.
>
>>
>>
>>> But you need to met quite a bunch of special allocation requirements to
>>> do this.
>>>
>>> When we are talking about sharing between AMD GPUs, (e.g. both exporter
>>> and importer are AMD hardware) than that might work.
>>>
>>> But I think it's unrealistic that an imported BO (created by an external
>>> driver stack) will ever meet those requirements.
>>
>> Indeed. Also, none of the GPUs supported by the radeon driver support
>> this.
>>
>>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Christopher James Halse Rogers April 17, 2017, 5:05 a.m. UTC | #23
On 10 April 2017 6:51:21 pm AEST, "Michel Dänzer" <michel@daenzer.net> wrote:
>On 06/04/17 04:47 PM, Christopher James Halse Rogers wrote:
>> On Wed, 5 Apr 2017 at 20:14 Lucas Stach <l.stach@pengutronix.de
>> <mailto:l.stach@pengutronix.de>> wrote:
>> 
>>     Am Mittwoch, den 05.04.2017, 11:59 +0200 schrieb Daniel Vetter:
>>     > On Wed, Apr 05, 2017 at 10:15:44AM +0200, Lucas Stach wrote:
>>     > > Am Mittwoch, den 05.04.2017, 00:20 +0000 schrieb Christopher
>>     James Halse
>>     > > Rogers:
>>     > > >
>>     > > >
>>     > > > On Tue, Apr 4, 2017 at 9:53 PM Daniel Vetter
><daniel@ffwll.ch
>>     <mailto:daniel@ffwll.ch>> wrote:
>>     > > >
>>     > > >         On Tue, Apr 4, 2017 at 12:43 PM, Lucas Stach
>>     > > >         <l.stach@pengutronix.de
>>     <mailto:l.stach@pengutronix.de>> wrote:
>>     > > >         >> If I could guarantee that I'd only ever run on
>>     > > >         4.13-or-later kernels
>>     > > >         >> (I think that's when the previous patches will
>>     land?), then
>>     > > >         this would
>>     > > >         >> indeed be mostly unnecessary. It would save me a
>>     bunch of
>>     > > >         addfb calls
>>     > > >         >> that would predictably fail, but they're cheap.
>>     > > >         >
>>     > > >         > I don't think we ever had caps for "things are
>>     working now,
>>     > > >         as they are
>>     > > >         > supposed to". i915 wasn't properly synchronizing
>on
>>     foreign
>>     > > >         fences for a
>>     > > >         > long time, yet we didn't gain a cap for "cross
>>     device sync
>>     > > >         works now".
>>     > > >         >
>>     > > >         > If your distro use-case relies on those things
>>     working it's
>>     > > >         probably
>>     > > >         > best to just backport the relevant fixes.
>>     > > >
>>     > > >         The even better solution for this is to push the
>backports
>>     > > >         through
>>     > > >         upstream -stable kernels. This stuff here is simple
>enough
>>     > > >         that we can
>>     > > >         do it. Same could have been done for the fairly
>minimal
>>     > > >         fencing fixes
>>     > > >         for i915 (at least some of them, the ones in the
>>     page-flip).
>>     > > >
>>     > > >         Otherwise we'll end up with tons IM_NOT_BUGGY and
>>     > > >         IM_SLIGHTLY_LESS_BUGGY and
>>     > > >         IM_NOT_BUGGY_EXCEPT_THIS_BOTCHED_BACKPORT
>>     > > >         flags where no one at all knows what they mean,
>usage
>>     between
>>     > > >         different drivers and different userspace is
>entirely
>>     > > >         inconsistent and
>>     > > >         they just all add to the confusion. They're just
>bugs,
>>     lets
>>     > > >         treat them
>>     > > >         like that.
>>     > > >
>>     > > >
>>     > > > It's not *quite* DRM_CAP_PRIME_SCANOUT_NOT_BUGGY - while
>the
>>     relevant
>>     > > > hardware allegedly supports it, nouveau/radeon/amdgpu don't
>do
>>     scanout
>>     > > > of GTT, so the lack of this cap indicates that there's no
>point in
>>     > > > trying to call addfb2.
>>     > > >
>>     > > >
>>     > > > But calling addfb2 and it failing is not expensive, so this
>is
>>     rather
>>     > > > niche.
>>     > > >
>>     > > >
>>     > > > In practice I can just restrict attempting to scanout of
>imported
>>     > > > buffers to i915, as that's the only driver that it'll work
>on.
>>     By the
>>     > > > time nouveau/radeon/amdgpu get patches to scanout of GTT
>the fixes
>>     > > > should be old enough that I don't need to care about
>unfixed
>>     kernels.
>>     > > >
>>     > > So given that most discreet hardware won't ever be able to
>>     scanout from
>>     > > GTT (latency and iso requirements will be hard to meet),
>can't
>>     we just
>>     > > fix the case of the broken prime sharing when migrating to
>VRAM?
>>     > >
>>     > > I'm thinking about attaching an exclusive fence to the
>dma-buf
>>     when the
>>     > > migration to VRAM happens, then when the GPU is done with the
>>     buffer we
>>     > > can either write back any changes to GTT, or just drop the
>fence
>>     in case
>>     > > the GPU didn't modify the buffer.
>>     >
>>     > We could, but someone needs to type the code for it. There's
>also the
>>     > problem that you need to migrate back, and then doing all that
>behind
>>     > userspaces back might not be the best idea.
>> 
>>     Drivers with separate VRAM and GTT are already doing a lot of
>migration
>>     behind the userspaces back. The only issue with dma-buf migration
>to
>>     VRAM is that you probably don't want to migrate the pages, but
>duplicate
>>     them in VRAM, doubling memory consumption with possible OOM. But
>then
>>     you could alloc the memory on addfb where you are able to return
>proper
>>     errors.
>> 
>> 
>> Hm. So, on a first inspection, this looks like something I could
>> actually cook up.
>> 
>> Looking at amdgpu it seems like the thing to do would be to associate
>a
>> shadow-bo in VRAM for the imported dma-buf in the addfb call, then
>> pin-and-copy-to the shadow bo in the places where the bo is currently
>> pinned.
>> 
>> Is this approach likely to be acceptable?
>
>It would break e.g. with DRI2 flips, because they replace the screen
>pixmap buffer with the buffer we're flipping to. If the app stops
>flipping while such a shadow BO is being scanned out, later draws to
>the
>screen pixmap won't become visible.

This shadow BO would only ever be used for imported dma-bufs. This would change the behaviour from “addfb fails” to “you get a shadow BO”. (And, pre-patch, from “addfb succeeds but you never see any new rendering”).

I don't think any DRI2 implementation hits this, because of it did it would already be broken.

(On paternity leave, so I'll be intermittent in following this up)
Michel Dänzer April 17, 2017, 6:41 a.m. UTC | #24
On 17/04/17 02:05 PM, Christopher James Halse Rogers wrote:
> On 10 April 2017 6:51:21 pm AEST, "Michel Dänzer" <michel@daenzer.net> wrote:
>> On 06/04/17 04:47 PM, Christopher James Halse Rogers wrote:
>>> On Wed, 5 Apr 2017 at 20:14 Lucas Stach <l.stach@pengutronix.de
>>> <mailto:l.stach@pengutronix.de>> wrote:
>>>
>>>> Drivers with separate VRAM and GTT are already doing a lot of migration
>>>> behind the userspaces back. The only issue with dma-buf migration to
>>>> VRAM is that you probably don't want to migrate the pages, but duplicate
>>>> them in VRAM, doubling memory consumption with possible OOM. But then
>>>> you could alloc the memory on addfb where you are able to return proper
>>>> errors.
>>>
>>> Hm. So, on a first inspection, this looks like something I could
>>> actually cook up.
>>>
>>> Looking at amdgpu it seems like the thing to do would be to associate a
>>> shadow-bo in VRAM for the imported dma-buf in the addfb call, then
>>> pin-and-copy-to the shadow bo in the places where the bo is currently
>>> pinned.
>>>
>>> Is this approach likely to be acceptable?
>>
>> It would break e.g. with DRI2 flips, because they replace the screen
>> pixmap buffer with the buffer we're flipping to. If the app stops
>> flipping while such a shadow BO is being scanned out, later draws to
>> the screen pixmap won't become visible.
> 
> This shadow BO would only ever be used for imported dma-bufs. This
> would change the behaviour from “addfb fails” to “you get a shadow
> BO”. (And, pre-patch, from “addfb succeeds but you never see any new
> rendering”).
> 
> I don't think any DRI2 implementation hits this, because of it did it
> would already be broken.

You're right, I got it backwards. I guess this could work.
diff mbox

Patch

diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index a7c61c23685a..79ccf638668e 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -285,6 +285,9 @@  static int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_
 	case DRM_CAP_ADDFB2_MODIFIERS:
 		req->value = dev->mode_config.allow_fb_modifiers;
 		break;
+	case DRM_CAP_PRIME_SCANOUT:
+		req->value = drm_core_check_feature(dev, DRIVER_PRIME_SCANOUT);
+		break;
 	default:
 		return -EINVAL;
 	}
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index 5699f42195fe..32cc0d956d7e 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -53,6 +53,7 @@  struct drm_mode_create_dumb;
 #define DRIVER_RENDER			0x8000
 #define DRIVER_ATOMIC			0x10000
 #define DRIVER_KMS_LEGACY_CONTEXT	0x20000
+#define DRIVER_PRIME_SCANOUT		0x40000
 
 /**
  * struct drm_driver - DRM driver structure
diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
index b2c52843bc70..c57213cdb702 100644
--- a/include/uapi/drm/drm.h
+++ b/include/uapi/drm/drm.h
@@ -647,6 +647,27 @@  struct drm_gem_open {
 #define DRM_CAP_CURSOR_HEIGHT		0x9
 #define DRM_CAP_ADDFB2_MODIFIERS	0x10
 #define DRM_CAP_PAGE_FLIP_TARGET	0x11
+/**
+ * DRM_CAP_PRIME_SCANOUT
+ *
+ * The PRIME_SCANOUT capability returns whether the driver might be capable
+ * of scanning out of an imported PRIME buffer.
+ *
+ * This does not guarantee that any imported buffer can be scanned out, as
+ * there may be specific requirements that a given buffer might not satisfy.
+ *
+ * If this capability is false then imported PRIME buffers will definitely
+ * never be suitable for scanout.
+ *
+ * Note: Prior to the introduction of this capability, bugs in drivers would
+ * allow userspace to attempt to scan out of imported PRIME buffers. This would
+ * work once, but move the buffer into an inconsistent state where rendering from
+ * the exporting GPU would no longer be seen by the importing GPU.
+ *
+ * It is unsafe for driver-agnostic code to attempt to scan out of an imported
+ * PRIME buffer in the absense of this capability.
+ */
+#define DRM_CAP_PRIME_SCANOUT		0x12
 
 /** DRM_IOCTL_GET_CAP ioctl argument type */
 struct drm_get_cap {