diff mbox series

[v1] drm/ttm: Refcount allocated tail pages

Message ID 20220815095423.11131-1-dmitry.osipenko@collabora.com (mailing list archive)
State New, archived
Headers show
Series [v1] drm/ttm: Refcount allocated tail pages | expand

Commit Message

Dmitry Osipenko Aug. 15, 2022, 9:54 a.m. UTC
Higher order pages allocated using alloc_pages() aren't refcounted and they
need to be refcounted, otherwise it's impossible to map them by KVM. This
patch sets the refcount of the tail pages and fixes the KVM memory mapping
faults.

Without this change guest virgl driver can't map host buffers into guest
and can't provide OpenGL 4.5 profile support to the guest. The host
mappings are also needed for enabling the Venus driver using host GPU
drivers that are utilizing TTM.

Based on a patch proposed by Trigger Huang.

Cc: stable@vger.kernel.org
Cc: Trigger Huang <Trigger.Huang@gmail.com>
Link: https://www.collabora.com/news-and-blog/blog/2021/11/26/venus-on-qemu-enabling-new-virtual-vulkan-driver/#qcom1343
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com> # AMDGPU (Qemu and crosvm)
Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
---
 drivers/gpu/drm/ttm/ttm_pool.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

Comments

Daniel Vetter Sept. 6, 2022, 8:01 p.m. UTC | #1
On Mon, Aug 15, 2022 at 12:05:19PM +0200, Christian König wrote:
> Am 15.08.22 um 11:54 schrieb Dmitry Osipenko:
> > Higher order pages allocated using alloc_pages() aren't refcounted and they
> > need to be refcounted, otherwise it's impossible to map them by KVM. This
> > patch sets the refcount of the tail pages and fixes the KVM memory mapping
> > faults.
> > 
> > Without this change guest virgl driver can't map host buffers into guest
> > and can't provide OpenGL 4.5 profile support to the guest. The host
> > mappings are also needed for enabling the Venus driver using host GPU
> > drivers that are utilizing TTM.
> > 
> > Based on a patch proposed by Trigger Huang.
> 
> Well I can't count how often I have repeated this: This is an absolutely
> clear NAK!
> 
> TTM pages are not reference counted in the first place and because of this
> giving them to virgl is illegal.
> 
> Please immediately stop this completely broken approach. We have discussed
> this multiple times now.

Yeah we need to get this stuff closed for real by tagging them all with
VM_IO or VM_PFNMAP asap.

It seems ot be a recurring amount of fun that people try to mmap dma-buf
and then call get_user_pages on them.

Which just doesn't work. I guess this is also why Rob Clark send out that
dma-buf patch to expos mapping information (i.e. wc vs wb vs uncached).

There seems to be some serious bonghits going on :-/
-Daniel

> 
> Regards,
> Christian.
> 
> > 
> > Cc: stable@vger.kernel.org
> > Cc: Trigger Huang <Trigger.Huang@gmail.com>
> > Link: https://www.collabora.com/news-and-blog/blog/2021/11/26/venus-on-qemu-enabling-new-virtual-vulkan-driver/#qcom1343
> > Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com> # AMDGPU (Qemu and crosvm)
> > Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
> > ---
> >   drivers/gpu/drm/ttm/ttm_pool.c | 25 ++++++++++++++++++++++++-
> >   1 file changed, 24 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> > index 21b61631f73a..11e92bb149c9 100644
> > --- a/drivers/gpu/drm/ttm/ttm_pool.c
> > +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> > @@ -81,6 +81,7 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> >   	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
> >   	struct ttm_pool_dma *dma;
> >   	struct page *p;
> > +	unsigned int i;
> >   	void *vaddr;
> >   	/* Don't set the __GFP_COMP flag for higher order allocations.
> > @@ -93,8 +94,10 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> >   	if (!pool->use_dma_alloc) {
> >   		p = alloc_pages(gfp_flags, order);
> > -		if (p)
> > +		if (p) {
> >   			p->private = order;
> > +			goto ref_tail_pages;
> > +		}
> >   		return p;
> >   	}
> > @@ -120,6 +123,23 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> >   	dma->vaddr = (unsigned long)vaddr | order;
> >   	p->private = (unsigned long)dma;
> > +
> > +ref_tail_pages:
> > +	/*
> > +	 * KVM requires mapped tail pages to be refcounted because put_page()
> > +	 * is invoked on them in the end of the page fault handling, and thus,
> > +	 * tail pages need to be protected from the premature releasing.
> > +	 * In fact, KVM page fault handler refuses to map tail pages to guest
> > +	 * if they aren't refcounted because hva_to_pfn_remapped() checks the
> > +	 * refcount specifically for this case.
> > +	 *
> > +	 * In particular, unreferenced tail pages result in a KVM "Bad address"
> > +	 * failure for VMMs that use VirtIO-GPU when guest's Mesa VirGL driver
> > +	 * accesses mapped host TTM buffer that contains tail pages.
> > +	 */
> > +	for (i = 1; i < 1 << order; i++)
> > +		page_ref_inc(p + i);
> > +
> >   	return p;
> >   error_free:
> > @@ -133,6 +153,7 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
> >   {
> >   	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
> >   	struct ttm_pool_dma *dma;
> > +	unsigned int i;
> >   	void *vaddr;
> >   #ifdef CONFIG_X86
> > @@ -142,6 +163,8 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
> >   	if (caching != ttm_cached && !PageHighMem(p))
> >   		set_pages_wb(p, 1 << order);
> >   #endif
> > +	for (i = 1; i < 1 << order; i++)
> > +		page_ref_dec(p + i);
> >   	if (!pool || !pool->use_dma_alloc) {
> >   		__free_pages(p, order);
>
Daniel Vetter Sept. 6, 2022, 8:05 p.m. UTC | #2
On Tue, Sep 06, 2022 at 10:01:47PM +0200, Daniel Vetter wrote:
> On Mon, Aug 15, 2022 at 12:05:19PM +0200, Christian König wrote:
> > Am 15.08.22 um 11:54 schrieb Dmitry Osipenko:
> > > Higher order pages allocated using alloc_pages() aren't refcounted and they
> > > need to be refcounted, otherwise it's impossible to map them by KVM. This
> > > patch sets the refcount of the tail pages and fixes the KVM memory mapping
> > > faults.
> > > 
> > > Without this change guest virgl driver can't map host buffers into guest
> > > and can't provide OpenGL 4.5 profile support to the guest. The host
> > > mappings are also needed for enabling the Venus driver using host GPU
> > > drivers that are utilizing TTM.
> > > 
> > > Based on a patch proposed by Trigger Huang.
> > 
> > Well I can't count how often I have repeated this: This is an absolutely
> > clear NAK!
> > 
> > TTM pages are not reference counted in the first place and because of this
> > giving them to virgl is illegal.
> > 
> > Please immediately stop this completely broken approach. We have discussed
> > this multiple times now.
> 
> Yeah we need to get this stuff closed for real by tagging them all with
> VM_IO or VM_PFNMAP asap.

For a bit more context: Anything mapping a bo should be VM_SPECIAL. And I
think we should add the checks to the gem and dma-buf mmap functions to
validate for that, and fix all the fallout.

Otherwise this dragon keeps resurrecting ...

VM_SPECIAL _will_ block get_user_pages, which will block everyone from
even trying to refcount this stuff.

Minimally we need to fix this for all ttm drivers, and it sounds like
that's still not yet the case :-( Iirc last time around some funky amdkfd
userspace was the hold-up because regressions?
-Daniel

> 
> It seems ot be a recurring amount of fun that people try to mmap dma-buf
> and then call get_user_pages on them.
> 
> Which just doesn't work. I guess this is also why Rob Clark send out that
> dma-buf patch to expos mapping information (i.e. wc vs wb vs uncached).
> 
> There seems to be some serious bonghits going on :-/
> -Daniel
> 
> > 
> > Regards,
> > Christian.
> > 
> > > 
> > > Cc: stable@vger.kernel.org
> > > Cc: Trigger Huang <Trigger.Huang@gmail.com>
> > > Link: https://www.collabora.com/news-and-blog/blog/2021/11/26/venus-on-qemu-enabling-new-virtual-vulkan-driver/#qcom1343
> > > Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com> # AMDGPU (Qemu and crosvm)
> > > Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
> > > ---
> > >   drivers/gpu/drm/ttm/ttm_pool.c | 25 ++++++++++++++++++++++++-
> > >   1 file changed, 24 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> > > index 21b61631f73a..11e92bb149c9 100644
> > > --- a/drivers/gpu/drm/ttm/ttm_pool.c
> > > +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> > > @@ -81,6 +81,7 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> > >   	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
> > >   	struct ttm_pool_dma *dma;
> > >   	struct page *p;
> > > +	unsigned int i;
> > >   	void *vaddr;
> > >   	/* Don't set the __GFP_COMP flag for higher order allocations.
> > > @@ -93,8 +94,10 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> > >   	if (!pool->use_dma_alloc) {
> > >   		p = alloc_pages(gfp_flags, order);
> > > -		if (p)
> > > +		if (p) {
> > >   			p->private = order;
> > > +			goto ref_tail_pages;
> > > +		}
> > >   		return p;
> > >   	}
> > > @@ -120,6 +123,23 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> > >   	dma->vaddr = (unsigned long)vaddr | order;
> > >   	p->private = (unsigned long)dma;
> > > +
> > > +ref_tail_pages:
> > > +	/*
> > > +	 * KVM requires mapped tail pages to be refcounted because put_page()
> > > +	 * is invoked on them in the end of the page fault handling, and thus,
> > > +	 * tail pages need to be protected from the premature releasing.
> > > +	 * In fact, KVM page fault handler refuses to map tail pages to guest
> > > +	 * if they aren't refcounted because hva_to_pfn_remapped() checks the
> > > +	 * refcount specifically for this case.
> > > +	 *
> > > +	 * In particular, unreferenced tail pages result in a KVM "Bad address"
> > > +	 * failure for VMMs that use VirtIO-GPU when guest's Mesa VirGL driver
> > > +	 * accesses mapped host TTM buffer that contains tail pages.
> > > +	 */
> > > +	for (i = 1; i < 1 << order; i++)
> > > +		page_ref_inc(p + i);
> > > +
> > >   	return p;
> > >   error_free:
> > > @@ -133,6 +153,7 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
> > >   {
> > >   	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
> > >   	struct ttm_pool_dma *dma;
> > > +	unsigned int i;
> > >   	void *vaddr;
> > >   #ifdef CONFIG_X86
> > > @@ -142,6 +163,8 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
> > >   	if (caching != ttm_cached && !PageHighMem(p))
> > >   		set_pages_wb(p, 1 << order);
> > >   #endif
> > > +	for (i = 1; i < 1 << order; i++)
> > > +		page_ref_dec(p + i);
> > >   	if (!pool || !pool->use_dma_alloc) {
> > >   		__free_pages(p, order);
> > 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Christian König Sept. 7, 2022, 6:48 a.m. UTC | #3
Am 06.09.22 um 22:05 schrieb Daniel Vetter:
> On Tue, Sep 06, 2022 at 10:01:47PM +0200, Daniel Vetter wrote:
>> On Mon, Aug 15, 2022 at 12:05:19PM +0200, Christian König wrote:
>>> Am 15.08.22 um 11:54 schrieb Dmitry Osipenko:
>>>> Higher order pages allocated using alloc_pages() aren't refcounted and they
>>>> need to be refcounted, otherwise it's impossible to map them by KVM. This
>>>> patch sets the refcount of the tail pages and fixes the KVM memory mapping
>>>> faults.
>>>>
>>>> Without this change guest virgl driver can't map host buffers into guest
>>>> and can't provide OpenGL 4.5 profile support to the guest. The host
>>>> mappings are also needed for enabling the Venus driver using host GPU
>>>> drivers that are utilizing TTM.
>>>>
>>>> Based on a patch proposed by Trigger Huang.
>>> Well I can't count how often I have repeated this: This is an absolutely
>>> clear NAK!
>>>
>>> TTM pages are not reference counted in the first place and because of this
>>> giving them to virgl is illegal.
>>>
>>> Please immediately stop this completely broken approach. We have discussed
>>> this multiple times now.
>> Yeah we need to get this stuff closed for real by tagging them all with
>> VM_IO or VM_PFNMAP asap.
> For a bit more context: Anything mapping a bo should be VM_SPECIAL. And I
> think we should add the checks to the gem and dma-buf mmap functions to
> validate for that, and fix all the fallout.
>
> Otherwise this dragon keeps resurrecting ...
>
> VM_SPECIAL _will_ block get_user_pages, which will block everyone from
> even trying to refcount this stuff.
>
> Minimally we need to fix this for all ttm drivers, and it sounds like
> that's still not yet the case :-( Iirc last time around some funky amdkfd
> userspace was the hold-up because regressions?

My recollection is that Felix and I fixed this with a KFD specific 
workaround. But I can double check with Felix on Monday.

Christian.

> -Daniel
>
>> It seems ot be a recurring amount of fun that people try to mmap dma-buf
>> and then call get_user_pages on them.
>>
>> Which just doesn't work. I guess this is also why Rob Clark send out that
>> dma-buf patch to expos mapping information (i.e. wc vs wb vs uncached).
>>
>> There seems to be some serious bonghits going on :-/
>> -Daniel
>>
>>> Regards,
>>> Christian.
>>>
>>>> Cc: stable@vger.kernel.org
>>>> Cc: Trigger Huang <Trigger.Huang@gmail.com>
>>>> Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.collabora.com%2Fnews-and-blog%2Fblog%2F2021%2F11%2F26%2Fvenus-on-qemu-enabling-new-virtual-vulkan-driver%2F%23qcom1343&amp;data=05%7C01%7Cchristian.koenig%40amd.com%7C37a7d9b0f91249da415b08da90432d3a%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637980915471280078%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=XN6wFiWc6Jljekmst0aOCPSTsFLlmkUjD9F%2Fl9nluAs%3D&amp;reserved=0
>>>> Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com> # AMDGPU (Qemu and crosvm)
>>>> Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
>>>> ---
>>>>    drivers/gpu/drm/ttm/ttm_pool.c | 25 ++++++++++++++++++++++++-
>>>>    1 file changed, 24 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
>>>> index 21b61631f73a..11e92bb149c9 100644
>>>> --- a/drivers/gpu/drm/ttm/ttm_pool.c
>>>> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
>>>> @@ -81,6 +81,7 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
>>>>    	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
>>>>    	struct ttm_pool_dma *dma;
>>>>    	struct page *p;
>>>> +	unsigned int i;
>>>>    	void *vaddr;
>>>>    	/* Don't set the __GFP_COMP flag for higher order allocations.
>>>> @@ -93,8 +94,10 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
>>>>    	if (!pool->use_dma_alloc) {
>>>>    		p = alloc_pages(gfp_flags, order);
>>>> -		if (p)
>>>> +		if (p) {
>>>>    			p->private = order;
>>>> +			goto ref_tail_pages;
>>>> +		}
>>>>    		return p;
>>>>    	}
>>>> @@ -120,6 +123,23 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
>>>>    	dma->vaddr = (unsigned long)vaddr | order;
>>>>    	p->private = (unsigned long)dma;
>>>> +
>>>> +ref_tail_pages:
>>>> +	/*
>>>> +	 * KVM requires mapped tail pages to be refcounted because put_page()
>>>> +	 * is invoked on them in the end of the page fault handling, and thus,
>>>> +	 * tail pages need to be protected from the premature releasing.
>>>> +	 * In fact, KVM page fault handler refuses to map tail pages to guest
>>>> +	 * if they aren't refcounted because hva_to_pfn_remapped() checks the
>>>> +	 * refcount specifically for this case.
>>>> +	 *
>>>> +	 * In particular, unreferenced tail pages result in a KVM "Bad address"
>>>> +	 * failure for VMMs that use VirtIO-GPU when guest's Mesa VirGL driver
>>>> +	 * accesses mapped host TTM buffer that contains tail pages.
>>>> +	 */
>>>> +	for (i = 1; i < 1 << order; i++)
>>>> +		page_ref_inc(p + i);
>>>> +
>>>>    	return p;
>>>>    error_free:
>>>> @@ -133,6 +153,7 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
>>>>    {
>>>>    	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
>>>>    	struct ttm_pool_dma *dma;
>>>> +	unsigned int i;
>>>>    	void *vaddr;
>>>>    #ifdef CONFIG_X86
>>>> @@ -142,6 +163,8 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
>>>>    	if (caching != ttm_cached && !PageHighMem(p))
>>>>    		set_pages_wb(p, 1 << order);
>>>>    #endif
>>>> +	for (i = 1; i < 1 << order; i++)
>>>> +		page_ref_dec(p + i);
>>>>    	if (!pool || !pool->use_dma_alloc) {
>>>>    		__free_pages(p, order);
>> -- 
>> Daniel Vetter
>> Software Engineer, Intel Corporation
>> https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fblog.ffwll.ch%2F&amp;data=05%7C01%7Cchristian.koenig%40amd.com%7C37a7d9b0f91249da415b08da90432d3a%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637980915471280078%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=bGZ1OAxL%2Fd99Nqu49soWZVqvvUKjuD6n6BKkAhMv4fs%3D&amp;reserved=0
Rob Clark Sept. 8, 2022, 11:04 a.m. UTC | #4
On Tue, Sep 6, 2022 at 1:01 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Mon, Aug 15, 2022 at 12:05:19PM +0200, Christian König wrote:
> > Am 15.08.22 um 11:54 schrieb Dmitry Osipenko:
> > > Higher order pages allocated using alloc_pages() aren't refcounted and they
> > > need to be refcounted, otherwise it's impossible to map them by KVM. This
> > > patch sets the refcount of the tail pages and fixes the KVM memory mapping
> > > faults.
> > >
> > > Without this change guest virgl driver can't map host buffers into guest
> > > and can't provide OpenGL 4.5 profile support to the guest. The host
> > > mappings are also needed for enabling the Venus driver using host GPU
> > > drivers that are utilizing TTM.
> > >
> > > Based on a patch proposed by Trigger Huang.
> >
> > Well I can't count how often I have repeated this: This is an absolutely
> > clear NAK!
> >
> > TTM pages are not reference counted in the first place and because of this
> > giving them to virgl is illegal.
> >
> > Please immediately stop this completely broken approach. We have discussed
> > this multiple times now.
>
> Yeah we need to get this stuff closed for real by tagging them all with
> VM_IO or VM_PFNMAP asap.
>
> It seems ot be a recurring amount of fun that people try to mmap dma-buf
> and then call get_user_pages on them.
>
> Which just doesn't work. I guess this is also why Rob Clark send out that
> dma-buf patch to expos mapping information (i.e. wc vs wb vs uncached).

No, not really.. my patch was simply so that the VMM side of virtgpu
could send the correct cache mode to the guest when handed a dma-buf
;-)

BR,
-R

>
> There seems to be some serious bonghits going on :-/
> -Daniel
>
> >
> > Regards,
> > Christian.
> >
> > >
> > > Cc: stable@vger.kernel.org
> > > Cc: Trigger Huang <Trigger.Huang@gmail.com>
> > > Link: https://www.collabora.com/news-and-blog/blog/2021/11/26/venus-on-qemu-enabling-new-virtual-vulkan-driver/#qcom1343
> > > Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com> # AMDGPU (Qemu and crosvm)
> > > Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
> > > ---
> > >   drivers/gpu/drm/ttm/ttm_pool.c | 25 ++++++++++++++++++++++++-
> > >   1 file changed, 24 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> > > index 21b61631f73a..11e92bb149c9 100644
> > > --- a/drivers/gpu/drm/ttm/ttm_pool.c
> > > +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> > > @@ -81,6 +81,7 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> > >     unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
> > >     struct ttm_pool_dma *dma;
> > >     struct page *p;
> > > +   unsigned int i;
> > >     void *vaddr;
> > >     /* Don't set the __GFP_COMP flag for higher order allocations.
> > > @@ -93,8 +94,10 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> > >     if (!pool->use_dma_alloc) {
> > >             p = alloc_pages(gfp_flags, order);
> > > -           if (p)
> > > +           if (p) {
> > >                     p->private = order;
> > > +                   goto ref_tail_pages;
> > > +           }
> > >             return p;
> > >     }
> > > @@ -120,6 +123,23 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> > >     dma->vaddr = (unsigned long)vaddr | order;
> > >     p->private = (unsigned long)dma;
> > > +
> > > +ref_tail_pages:
> > > +   /*
> > > +    * KVM requires mapped tail pages to be refcounted because put_page()
> > > +    * is invoked on them in the end of the page fault handling, and thus,
> > > +    * tail pages need to be protected from the premature releasing.
> > > +    * In fact, KVM page fault handler refuses to map tail pages to guest
> > > +    * if they aren't refcounted because hva_to_pfn_remapped() checks the
> > > +    * refcount specifically for this case.
> > > +    *
> > > +    * In particular, unreferenced tail pages result in a KVM "Bad address"
> > > +    * failure for VMMs that use VirtIO-GPU when guest's Mesa VirGL driver
> > > +    * accesses mapped host TTM buffer that contains tail pages.
> > > +    */
> > > +   for (i = 1; i < 1 << order; i++)
> > > +           page_ref_inc(p + i);
> > > +
> > >     return p;
> > >   error_free:
> > > @@ -133,6 +153,7 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
> > >   {
> > >     unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
> > >     struct ttm_pool_dma *dma;
> > > +   unsigned int i;
> > >     void *vaddr;
> > >   #ifdef CONFIG_X86
> > > @@ -142,6 +163,8 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
> > >     if (caching != ttm_cached && !PageHighMem(p))
> > >             set_pages_wb(p, 1 << order);
> > >   #endif
> > > +   for (i = 1; i < 1 << order; i++)
> > > +           page_ref_dec(p + i);
> > >     if (!pool || !pool->use_dma_alloc) {
> > >             __free_pages(p, order);
> >
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Sean Christopherson Jan. 11, 2023, 5:13 p.m. UTC | #5
On Tue, Sep 06, 2022, Daniel Vetter wrote:
> On Tue, Sep 06, 2022 at 10:01:47PM +0200, Daniel Vetter wrote:
> > On Mon, Aug 15, 2022 at 12:05:19PM +0200, Christian König wrote:
> > > Am 15.08.22 um 11:54 schrieb Dmitry Osipenko:
> > > > Higher order pages allocated using alloc_pages() aren't refcounted and they
> > > > need to be refcounted, otherwise it's impossible to map them by KVM. This
> > > > patch sets the refcount of the tail pages and fixes the KVM memory mapping
> > > > faults.
> > > > 
> > > > Without this change guest virgl driver can't map host buffers into guest
> > > > and can't provide OpenGL 4.5 profile support to the guest. The host
> > > > mappings are also needed for enabling the Venus driver using host GPU
> > > > drivers that are utilizing TTM.
> > > > 
> > > > Based on a patch proposed by Trigger Huang.
> > > 
> > > Well I can't count how often I have repeated this: This is an absolutely
> > > clear NAK!
> > > 
> > > TTM pages are not reference counted in the first place and because of this
> > > giving them to virgl is illegal.
> > > 
> > > Please immediately stop this completely broken approach. We have discussed
> > > this multiple times now.
> > 
> > Yeah we need to get this stuff closed for real by tagging them all with
> > VM_IO or VM_PFNMAP asap.
> 
> For a bit more context: Anything mapping a bo should be VM_SPECIAL. And I
> think we should add the checks to the gem and dma-buf mmap functions to
> validate for that, and fix all the fallout.
> 
> Otherwise this dragon keeps resurrecting ...
> 
> VM_SPECIAL _will_ block get_user_pages, which will block everyone from
> even trying to refcount this stuff.

FWIW, IIUC that won't change the KVM story.  KVM acquires the PFN for these pages
via follow_pte(), not by gup().  Details are in a different strand of this thread[*].

If TTM pages aren't tied into mmu_notifiers, then I believe the only solution is
to not allow them to be mapped into user page tables.  If they are tied into
mmu_notifiers, then this is fully a KVM limitation that we are (slowly) resolving.

[*] https://lore.kernel.org/all/Y77sQZI0IfFVx7Jo@google.com
diff mbox series

Patch

diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index 21b61631f73a..11e92bb149c9 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -81,6 +81,7 @@  static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
 	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
 	struct ttm_pool_dma *dma;
 	struct page *p;
+	unsigned int i;
 	void *vaddr;
 
 	/* Don't set the __GFP_COMP flag for higher order allocations.
@@ -93,8 +94,10 @@  static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
 
 	if (!pool->use_dma_alloc) {
 		p = alloc_pages(gfp_flags, order);
-		if (p)
+		if (p) {
 			p->private = order;
+			goto ref_tail_pages;
+		}
 		return p;
 	}
 
@@ -120,6 +123,23 @@  static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
 
 	dma->vaddr = (unsigned long)vaddr | order;
 	p->private = (unsigned long)dma;
+
+ref_tail_pages:
+	/*
+	 * KVM requires mapped tail pages to be refcounted because put_page()
+	 * is invoked on them in the end of the page fault handling, and thus,
+	 * tail pages need to be protected from the premature releasing.
+	 * In fact, KVM page fault handler refuses to map tail pages to guest
+	 * if they aren't refcounted because hva_to_pfn_remapped() checks the
+	 * refcount specifically for this case.
+	 *
+	 * In particular, unreferenced tail pages result in a KVM "Bad address"
+	 * failure for VMMs that use VirtIO-GPU when guest's Mesa VirGL driver
+	 * accesses mapped host TTM buffer that contains tail pages.
+	 */
+	for (i = 1; i < 1 << order; i++)
+		page_ref_inc(p + i);
+
 	return p;
 
 error_free:
@@ -133,6 +153,7 @@  static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
 {
 	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
 	struct ttm_pool_dma *dma;
+	unsigned int i;
 	void *vaddr;
 
 #ifdef CONFIG_X86
@@ -142,6 +163,8 @@  static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
 	if (caching != ttm_cached && !PageHighMem(p))
 		set_pages_wb(p, 1 << order);
 #endif
+	for (i = 1; i < 1 << order; i++)
+		page_ref_dec(p + i);
 
 	if (!pool || !pool->use_dma_alloc) {
 		__free_pages(p, order);