[RFC,14/17] KVM: arm64: Punt last page reference to rcu callback for parallel walk

Message ID: 20220415215901.1737897-15-oupton@google.com
State: New, archived
Series: KVM: arm64: Parallelize stage 2 fault handling

Commit Message

Oliver Upton April 15, 2022, 9:58 p.m. UTC
It is possible that a table page remains visible to another thread until
the next RCU synchronization event. As such, we cannot drop the last
page reference synchronously with the post-order traversal for a parallel
table walk.

Schedule an RCU callback to clean up the child table page for parallel
walks.

Signed-off-by: Oliver Upton <oupton@google.com>
---
 arch/arm64/include/asm/kvm_pgtable.h |  3 ++
 arch/arm64/kvm/hyp/pgtable.c         | 24 +++++++++++++--
 arch/arm64/kvm/mmu.c                 | 44 +++++++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 4 deletions(-)
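
As background for the discussion below, here is a minimal, self-contained sketch of the problem the commit message describes and of the deferred-free pattern the patch adopts. None of this code is from the series; the structure and function names are invented purely for illustration. A lockless walker may still hold a pointer to a table that has just been unlinked, so the thread that unlinks it must defer the final free to an RCU callback whenever the walk is shared:

/* Standalone illustration; names and structure are invented. */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct table {
	unsigned long ptes[512];
	struct rcu_head rcu_head;
};

static struct table __rcu *slot;

/* Lockless walker: only safe inside an RCU read-side critical section. */
static unsigned long walk(unsigned long idx)
{
	struct table *t;
	unsigned long pte = 0;

	rcu_read_lock();
	t = rcu_dereference(slot);
	if (t)
		pte = READ_ONCE(t->ptes[idx]);
	rcu_read_unlock();

	return pte;
}

static void free_table_cb(struct rcu_head *head)
{
	kfree(container_of(head, struct table, rcu_head));
}

/* Updater: unlink the table, then defer the free past the grace period. */
static void unlink_table(bool shared)
{
	struct table *old = rcu_replace_pointer(slot, NULL, true);

	if (!old)
		return;

	if (shared) {
		/* A parallel walker may still hold a pointer to 'old'. */
		call_rcu(&old->rcu_head, free_table_cb);
	} else {
		/* Exclusive access: nobody else can see 'old', free it now. */
		kfree(old);
	}
}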

Comments

Ricardo Koller April 19, 2022, 2:59 a.m. UTC | #1
On Fri, Apr 15, 2022 at 09:58:58PM +0000, Oliver Upton wrote:
> It is possible that a table page remains visible to another thread until
> the next rcu synchronization event. To that end, we cannot drop the last
> page reference synchronous with post-order traversal for a parallel
> table walk.
> 
> Schedule an rcu callback to clean up the child table page for parallel
> walks.
> 
> Signed-off-by: Oliver Upton <oupton@google.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h |  3 ++
>  arch/arm64/kvm/hyp/pgtable.c         | 24 +++++++++++++--
>  arch/arm64/kvm/mmu.c                 | 44 +++++++++++++++++++++++++++-
>  3 files changed, 67 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 74955aba5918..52e55e00f0ca 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -81,6 +81,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
>   * @put_page:			Decrement the refcount on a page. When the
>   *				refcount reaches 0 the page is automatically
>   *				freed.
> + * @free_table:			Drop the last page reference, possibly in the
> + *				next RCU sync if doing a shared walk.
>   * @page_count:			Return the refcount of a page.
>   * @phys_to_virt:		Convert a physical address into a virtual
>   *				address	mapped in the current context.
> @@ -98,6 +100,7 @@ struct kvm_pgtable_mm_ops {
>  	void		(*get_page)(void *addr);
>  	void		(*put_page)(void *addr);
>  	int		(*page_count)(void *addr);
> +	void		(*free_table)(void *addr, bool shared);
>  	void*		(*phys_to_virt)(phys_addr_t phys);
>  	phys_addr_t	(*virt_to_phys)(void *addr);
>  	void		(*dcache_clean_inval_poc)(void *addr, size_t size);
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 121818d4c33e..a9a48edba63b 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -147,12 +147,19 @@ static inline void kvm_pgtable_walk_end(void)
>  {}
>  
>  #define kvm_dereference_ptep	rcu_dereference_raw
> +
> +static inline void kvm_pgtable_destroy_barrier(void)
> +{}
> +
>  #else
>  #define kvm_pgtable_walk_begin	rcu_read_lock
>  
>  #define kvm_pgtable_walk_end	rcu_read_unlock
>  
>  #define kvm_dereference_ptep	rcu_dereference
> +
> +#define kvm_pgtable_destroy_barrier	rcu_barrier
> +
>  #endif
>  
>  static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
> @@ -1063,7 +1070,12 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
>  		childp = kvm_pte_follow(*old, mm_ops);
>  	}
>  
> -	mm_ops->put_page(childp);
> +	/*
> +	 * If we do not have exclusive access to the page tables it is possible
> +	 * the unlinked table remains visible to another thread until the next
> +	 * rcu synchronization.
> +	 */
> +	mm_ops->free_table(childp, shared);
>  	mm_ops->put_page(ptep);
>  
>  	return ret;
> @@ -1203,7 +1215,7 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>  					       kvm_granule_size(level));
>  
>  	if (childp)
> -		mm_ops->put_page(childp);
> +		mm_ops->free_table(childp, shared);
>  
>  	return 0;
>  }
> @@ -1433,7 +1445,7 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>  	mm_ops->put_page(ptep);
>  
>  	if (kvm_pte_table(*old, level))
> -		mm_ops->put_page(kvm_pte_follow(*old, mm_ops));
> +		mm_ops->free_table(kvm_pte_follow(*old, mm_ops), shared);
>  
>  	return 0;
>  }
> @@ -1452,4 +1464,10 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
>  	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
>  	pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
>  	pgt->pgd = NULL;
> +
> +	/*
> +	 * Guarantee that all unlinked subtrees associated with the stage2 page
> +	 * table have also been freed before returning.
> +	 */
> +	kvm_pgtable_destroy_barrier();
>  }
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index cc6ed6b06ec2..6ecf37009c21 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -98,9 +98,50 @@ static bool kvm_is_device_pfn(unsigned long pfn)
>  static void *stage2_memcache_zalloc_page(void *arg)
>  {
>  	struct kvm_mmu_caches *mmu_caches = arg;
> +	struct stage2_page_header *hdr;
> +	void *addr;
>  
>  	/* Allocated with __GFP_ZERO, so no need to zero */
> -	return kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> +	addr = kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> +	if (!addr)
> +		return NULL;
> +
> +	hdr = kvm_mmu_memory_cache_alloc(&mmu_caches->header_cache);
> +	if (!hdr) {
> +		free_page((unsigned long)addr);
> +		return NULL;
> +	}
> +
> +	hdr->page = virt_to_page(addr);
> +	set_page_private(hdr->page, (unsigned long)hdr);
> +	return addr;
> +}
> +
> +static void stage2_free_page_now(struct stage2_page_header *hdr)
> +{
> +	WARN_ON(page_ref_count(hdr->page) != 1);
> +
> +	__free_page(hdr->page);
> +	kmem_cache_free(stage2_page_header_cache, hdr);
> +}
> +
> +static void stage2_free_page_rcu_cb(struct rcu_head *head)
> +{
> +	struct stage2_page_header *hdr = container_of(head, struct stage2_page_header,
> +						      rcu_head);
> +
> +	stage2_free_page_now(hdr);
> +}
> +
> +static void stage2_free_table(void *addr, bool shared)
> +{
> +	struct page *page = virt_to_page(addr);
> +	struct stage2_page_header *hdr = (struct stage2_page_header *)page_private(page);
> +
> +	if (shared)
> +		call_rcu(&hdr->rcu_head, stage2_free_page_rcu_cb);

Can the number of callbacks grow to "dangerous" numbers? Can it be
bounded with something like the following?

if number of readers is really high:
	synchronize_rcu()
else
	call_rcu()

Maybe the RCU API has an option for that.
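
A rough sketch of that bound, reusing the stage2_page_header helpers introduced by this patch; the counter, threshold, and wrapper names are invented and the threshold value is arbitrary, so treat it as an illustration rather than a proposal:

/* Illustrative only: the counter, threshold, and names are made up. */
static atomic_t stage2_pending_rcu_frees = ATOMIC_INIT(0);
#define STAGE2_MAX_PENDING_RCU_FREES	512	/* arbitrary */

static void stage2_free_page_rcu_cb_bounded(struct rcu_head *head)
{
	stage2_free_page_rcu_cb(head);
	atomic_dec(&stage2_pending_rcu_frees);
}

static void stage2_free_table_bounded(void *addr, bool shared)
{
	struct page *page = virt_to_page(addr);
	struct stage2_page_header *hdr = (struct stage2_page_header *)page_private(page);

	if (!shared) {
		stage2_free_page_now(hdr);
		return;
	}

	if (atomic_inc_return(&stage2_pending_rcu_frees) > STAGE2_MAX_PENDING_RCU_FREES) {
		atomic_dec(&stage2_pending_rcu_frees);
		/* Blocks for a grace period; only legal if the mmu lock is not held. */
		synchronize_rcu();
		stage2_free_page_now(hdr);
	} else {
		call_rcu(&hdr->rcu_head, stage2_free_page_rcu_cb_bounded);
	}
}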

> +	else
> +		stage2_free_page_now(hdr);
>  }
>  
>  static void *kvm_host_zalloc_pages_exact(size_t size)
> @@ -613,6 +654,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
>  	.free_pages_exact	= free_pages_exact,
>  	.get_page		= kvm_host_get_page,
>  	.put_page		= kvm_host_put_page,
> +	.free_table		= stage2_free_table,
>  	.page_count		= kvm_host_page_count,
>  	.phys_to_virt		= kvm_host_va,
>  	.virt_to_phys		= kvm_host_pa,
> -- 
> 2.36.0.rc0.470.gd361397f0d-goog
>
Ricardo Koller April 19, 2022, 3:09 a.m. UTC | #2
On Mon, Apr 18, 2022 at 07:59:04PM -0700, Ricardo Koller wrote:
> On Fri, Apr 15, 2022 at 09:58:58PM +0000, Oliver Upton wrote:
> > It is possible that a table page remains visible to another thread until
> > the next rcu synchronization event. To that end, we cannot drop the last
> > page reference synchronous with post-order traversal for a parallel
> > table walk.
> > 
> > Schedule an rcu callback to clean up the child table page for parallel
> > walks.
> > 
> > Signed-off-by: Oliver Upton <oupton@google.com>
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h |  3 ++
> >  arch/arm64/kvm/hyp/pgtable.c         | 24 +++++++++++++--
> >  arch/arm64/kvm/mmu.c                 | 44 +++++++++++++++++++++++++++-
> >  3 files changed, 67 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > index 74955aba5918..52e55e00f0ca 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -81,6 +81,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
> >   * @put_page:			Decrement the refcount on a page. When the
> >   *				refcount reaches 0 the page is automatically
> >   *				freed.
> > + * @free_table:			Drop the last page reference, possibly in the
> > + *				next RCU sync if doing a shared walk.
> >   * @page_count:			Return the refcount of a page.
> >   * @phys_to_virt:		Convert a physical address into a virtual
> >   *				address	mapped in the current context.
> > @@ -98,6 +100,7 @@ struct kvm_pgtable_mm_ops {
> >  	void		(*get_page)(void *addr);
> >  	void		(*put_page)(void *addr);
> >  	int		(*page_count)(void *addr);
> > +	void		(*free_table)(void *addr, bool shared);
> >  	void*		(*phys_to_virt)(phys_addr_t phys);
> >  	phys_addr_t	(*virt_to_phys)(void *addr);
> >  	void		(*dcache_clean_inval_poc)(void *addr, size_t size);
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 121818d4c33e..a9a48edba63b 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -147,12 +147,19 @@ static inline void kvm_pgtable_walk_end(void)
> >  {}
> >  
> >  #define kvm_dereference_ptep	rcu_dereference_raw
> > +
> > +static inline void kvm_pgtable_destroy_barrier(void)
> > +{}
> > +
> >  #else
> >  #define kvm_pgtable_walk_begin	rcu_read_lock
> >  
> >  #define kvm_pgtable_walk_end	rcu_read_unlock
> >  
> >  #define kvm_dereference_ptep	rcu_dereference
> > +
> > +#define kvm_pgtable_destroy_barrier	rcu_barrier
> > +
> >  #endif
> >  
> >  static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
> > @@ -1063,7 +1070,12 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
> >  		childp = kvm_pte_follow(*old, mm_ops);
> >  	}
> >  
> > -	mm_ops->put_page(childp);
> > +	/*
> > +	 * If we do not have exclusive access to the page tables it is possible
> > +	 * the unlinked table remains visible to another thread until the next
> > +	 * rcu synchronization.
> > +	 */
> > +	mm_ops->free_table(childp, shared);
> >  	mm_ops->put_page(ptep);
> >  
> >  	return ret;
> > @@ -1203,7 +1215,7 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> >  					       kvm_granule_size(level));
> >  
> >  	if (childp)
> > -		mm_ops->put_page(childp);
> > +		mm_ops->free_table(childp, shared);
> >  
> >  	return 0;
> >  }
> > @@ -1433,7 +1445,7 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> >  	mm_ops->put_page(ptep);
> >  
> >  	if (kvm_pte_table(*old, level))
> > -		mm_ops->put_page(kvm_pte_follow(*old, mm_ops));
> > +		mm_ops->free_table(kvm_pte_follow(*old, mm_ops), shared);
> >  
> >  	return 0;
> >  }
> > @@ -1452,4 +1464,10 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
> >  	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
> >  	pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
> >  	pgt->pgd = NULL;
> > +
> > +	/*
> > +	 * Guarantee that all unlinked subtrees associated with the stage2 page
> > +	 * table have also been freed before returning.
> > +	 */
> > +	kvm_pgtable_destroy_barrier();
> >  }
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index cc6ed6b06ec2..6ecf37009c21 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -98,9 +98,50 @@ static bool kvm_is_device_pfn(unsigned long pfn)
> >  static void *stage2_memcache_zalloc_page(void *arg)
> >  {
> >  	struct kvm_mmu_caches *mmu_caches = arg;
> > +	struct stage2_page_header *hdr;
> > +	void *addr;
> >  
> >  	/* Allocated with __GFP_ZERO, so no need to zero */
> > -	return kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> > +	addr = kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> > +	if (!addr)
> > +		return NULL;
> > +
> > +	hdr = kvm_mmu_memory_cache_alloc(&mmu_caches->header_cache);
> > +	if (!hdr) {
> > +		free_page((unsigned long)addr);
> > +		return NULL;
> > +	}
> > +
> > +	hdr->page = virt_to_page(addr);
> > +	set_page_private(hdr->page, (unsigned long)hdr);
> > +	return addr;
> > +}
> > +
> > +static void stage2_free_page_now(struct stage2_page_header *hdr)
> > +{
> > +	WARN_ON(page_ref_count(hdr->page) != 1);
> > +
> > +	__free_page(hdr->page);
> > +	kmem_cache_free(stage2_page_header_cache, hdr);
> > +}
> > +
> > +static void stage2_free_page_rcu_cb(struct rcu_head *head)
> > +{
> > +	struct stage2_page_header *hdr = container_of(head, struct stage2_page_header,
> > +						      rcu_head);
> > +
> > +	stage2_free_page_now(hdr);
> > +}
> > +
> > +static void stage2_free_table(void *addr, bool shared)
> > +{
> > +	struct page *page = virt_to_page(addr);
> > +	struct stage2_page_header *hdr = (struct stage2_page_header *)page_private(page);
> > +
> > +	if (shared)
> > +		call_rcu(&hdr->rcu_head, stage2_free_page_rcu_cb);
> 
> Can the number of callbacks grow to "dangerous" numbers? can it be
> bounded with something like the following?
> 
> if number of readers is really high:
> 	synchronize_rcu() 
> else
> 	call_rcu()

Sorry, I meant to say "number of callbacks".
> 
> maybe the rcu API has an option for that.
> 
> > +	else
> > +		stage2_free_page_now(hdr);
> >  }
> >  
> >  static void *kvm_host_zalloc_pages_exact(size_t size)
> > @@ -613,6 +654,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
> >  	.free_pages_exact	= free_pages_exact,
> >  	.get_page		= kvm_host_get_page,
> >  	.put_page		= kvm_host_put_page,
> > +	.free_table		= stage2_free_table,
> >  	.page_count		= kvm_host_page_count,
> >  	.phys_to_virt		= kvm_host_va,
> >  	.virt_to_phys		= kvm_host_pa,
> > -- 
> > 2.36.0.rc0.470.gd361397f0d-goog
> >
Oliver Upton April 20, 2022, 12:53 a.m. UTC | #3
Hi Ricardo,

On Mon, Apr 18, 2022 at 8:09 PM Ricardo Koller <ricarkol@google.com> wrote:
>
> On Mon, Apr 18, 2022 at 07:59:04PM -0700, Ricardo Koller wrote:
> > On Fri, Apr 15, 2022 at 09:58:58PM +0000, Oliver Upton wrote:
> > > It is possible that a table page remains visible to another thread until
> > > the next rcu synchronization event. To that end, we cannot drop the last
> > > page reference synchronous with post-order traversal for a parallel
> > > table walk.
> > >
> > > Schedule an rcu callback to clean up the child table page for parallel
> > > walks.
> > >
> > > Signed-off-by: Oliver Upton <oupton@google.com>
> > > ---
> > >  arch/arm64/include/asm/kvm_pgtable.h |  3 ++
> > >  arch/arm64/kvm/hyp/pgtable.c         | 24 +++++++++++++--
> > >  arch/arm64/kvm/mmu.c                 | 44 +++++++++++++++++++++++++++-
> > >  3 files changed, 67 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > > index 74955aba5918..52e55e00f0ca 100644
> > > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > > @@ -81,6 +81,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
> > >   * @put_page:                      Decrement the refcount on a page. When the
> > >   *                         refcount reaches 0 the page is automatically
> > >   *                         freed.
> > > + * @free_table:                    Drop the last page reference, possibly in the
> > > + *                         next RCU sync if doing a shared walk.
> > >   * @page_count:                    Return the refcount of a page.
> > >   * @phys_to_virt:          Convert a physical address into a virtual
> > >   *                         address mapped in the current context.
> > > @@ -98,6 +100,7 @@ struct kvm_pgtable_mm_ops {
> > >     void            (*get_page)(void *addr);
> > >     void            (*put_page)(void *addr);
> > >     int             (*page_count)(void *addr);
> > > +   void            (*free_table)(void *addr, bool shared);
> > >     void*           (*phys_to_virt)(phys_addr_t phys);
> > >     phys_addr_t     (*virt_to_phys)(void *addr);
> > >     void            (*dcache_clean_inval_poc)(void *addr, size_t size);
> > > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > > index 121818d4c33e..a9a48edba63b 100644
> > > --- a/arch/arm64/kvm/hyp/pgtable.c
> > > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > > @@ -147,12 +147,19 @@ static inline void kvm_pgtable_walk_end(void)
> > >  {}
> > >
> > >  #define kvm_dereference_ptep       rcu_dereference_raw
> > > +
> > > +static inline void kvm_pgtable_destroy_barrier(void)
> > > +{}
> > > +
> > >  #else
> > >  #define kvm_pgtable_walk_begin     rcu_read_lock
> > >
> > >  #define kvm_pgtable_walk_end       rcu_read_unlock
> > >
> > >  #define kvm_dereference_ptep       rcu_dereference
> > > +
> > > +#define kvm_pgtable_destroy_barrier        rcu_barrier
> > > +
> > >  #endif
> > >
> > >  static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
> > > @@ -1063,7 +1070,12 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
> > >             childp = kvm_pte_follow(*old, mm_ops);
> > >     }
> > >
> > > -   mm_ops->put_page(childp);
> > > +   /*
> > > +    * If we do not have exclusive access to the page tables it is possible
> > > +    * the unlinked table remains visible to another thread until the next
> > > +    * rcu synchronization.
> > > +    */
> > > +   mm_ops->free_table(childp, shared);
> > >     mm_ops->put_page(ptep);
> > >
> > >     return ret;
> > > @@ -1203,7 +1215,7 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> > >                                            kvm_granule_size(level));
> > >
> > >     if (childp)
> > > -           mm_ops->put_page(childp);
> > > +           mm_ops->free_table(childp, shared);
> > >
> > >     return 0;
> > >  }
> > > @@ -1433,7 +1445,7 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> > >     mm_ops->put_page(ptep);
> > >
> > >     if (kvm_pte_table(*old, level))
> > > -           mm_ops->put_page(kvm_pte_follow(*old, mm_ops));
> > > +           mm_ops->free_table(kvm_pte_follow(*old, mm_ops), shared);
> > >
> > >     return 0;
> > >  }
> > > @@ -1452,4 +1464,10 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
> > >     pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
> > >     pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
> > >     pgt->pgd = NULL;
> > > +
> > > +   /*
> > > +    * Guarantee that all unlinked subtrees associated with the stage2 page
> > > +    * table have also been freed before returning.
> > > +    */
> > > +   kvm_pgtable_destroy_barrier();
> > >  }
> > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > > index cc6ed6b06ec2..6ecf37009c21 100644
> > > --- a/arch/arm64/kvm/mmu.c
> > > +++ b/arch/arm64/kvm/mmu.c
> > > @@ -98,9 +98,50 @@ static bool kvm_is_device_pfn(unsigned long pfn)
> > >  static void *stage2_memcache_zalloc_page(void *arg)
> > >  {
> > >     struct kvm_mmu_caches *mmu_caches = arg;
> > > +   struct stage2_page_header *hdr;
> > > +   void *addr;
> > >
> > >     /* Allocated with __GFP_ZERO, so no need to zero */
> > > -   return kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> > > +   addr = kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> > > +   if (!addr)
> > > +           return NULL;
> > > +
> > > +   hdr = kvm_mmu_memory_cache_alloc(&mmu_caches->header_cache);
> > > +   if (!hdr) {
> > > +           free_page((unsigned long)addr);
> > > +           return NULL;
> > > +   }
> > > +
> > > +   hdr->page = virt_to_page(addr);
> > > +   set_page_private(hdr->page, (unsigned long)hdr);
> > > +   return addr;
> > > +}
> > > +
> > > +static void stage2_free_page_now(struct stage2_page_header *hdr)
> > > +{
> > > +   WARN_ON(page_ref_count(hdr->page) != 1);
> > > +
> > > +   __free_page(hdr->page);
> > > +   kmem_cache_free(stage2_page_header_cache, hdr);
> > > +}
> > > +
> > > +static void stage2_free_page_rcu_cb(struct rcu_head *head)
> > > +{
> > > +   struct stage2_page_header *hdr = container_of(head, struct stage2_page_header,
> > > +                                                 rcu_head);
> > > +
> > > +   stage2_free_page_now(hdr);
> > > +}
> > > +
> > > +static void stage2_free_table(void *addr, bool shared)
> > > +{
> > > +   struct page *page = virt_to_page(addr);
> > > +   struct stage2_page_header *hdr = (struct stage2_page_header *)page_private(page);
> > > +
> > > +   if (shared)
> > > +           call_rcu(&hdr->rcu_head, stage2_free_page_rcu_cb);
> >
> > Can the number of callbacks grow to "dangerous" numbers? can it be
> > bounded with something like the following?
> >
> > if number of readers is really high:
> >       synchronize_rcu()
> > else
> >       call_rcu()
>
> sorry, meant to say "number of callbacks"

Good point. I don't have data for this, but generally speaking I do
not believe we need to enqueue a callback for every page. In fact,
since we already make the invalid PTE visible in the pre-order traversal,
we could theoretically free all tables from a single RCU callback (per
fault).

I think if we used synchronize_rcu() then we would need to drop the
mmu lock, since it would block the thread.
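
To sketch the "one callback per detached subtree" idea from the first paragraph above (an illustrative sketch only, assuming it lives in pgtable.c next to the existing helpers; the function name is invented and the refcount handling is simplified), an RCU callback queued for the subtree root would call something like this once the grace period has elapsed:

/* Illustrative sketch, not from this series. */
static void stage2_free_removed_table(kvm_pte_t *table, u32 level,
				      struct kvm_pgtable_mm_ops *mm_ops)
{
	u32 i, nr_ptes = PAGE_SIZE / sizeof(kvm_pte_t);

	for (i = 0; i < nr_ptes; i++) {
		kvm_pte_t pte = READ_ONCE(table[i]);

		/* Free child tables before dropping this level's page. */
		if (kvm_pte_valid(pte) && kvm_pte_table(pte, level))
			stage2_free_removed_table(kvm_pte_follow(pte, mm_ops),
						  level + 1, mm_ops);
	}

	mm_ops->put_page(table);
}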

--
Thanks,
Oliver
Ben Gardon April 21, 2022, 4:28 p.m. UTC | #4
On Fri, Apr 15, 2022 at 2:59 PM Oliver Upton <oupton@google.com> wrote:
>
> It is possible that a table page remains visible to another thread until
> the next rcu synchronization event. To that end, we cannot drop the last
> page reference synchronous with post-order traversal for a parallel
> table walk.
>
> Schedule an rcu callback to clean up the child table page for parallel
> walks.
>
> Signed-off-by: Oliver Upton <oupton@google.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h |  3 ++
>  arch/arm64/kvm/hyp/pgtable.c         | 24 +++++++++++++--
>  arch/arm64/kvm/mmu.c                 | 44 +++++++++++++++++++++++++++-
>  3 files changed, 67 insertions(+), 4 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 74955aba5918..52e55e00f0ca 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -81,6 +81,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
>   * @put_page:                  Decrement the refcount on a page. When the
>   *                             refcount reaches 0 the page is automatically
>   *                             freed.
> + * @free_table:                        Drop the last page reference, possibly in the
> + *                             next RCU sync if doing a shared walk.
>   * @page_count:                        Return the refcount of a page.
>   * @phys_to_virt:              Convert a physical address into a virtual
>   *                             address mapped in the current context.
> @@ -98,6 +100,7 @@ struct kvm_pgtable_mm_ops {
>         void            (*get_page)(void *addr);
>         void            (*put_page)(void *addr);
>         int             (*page_count)(void *addr);
> +       void            (*free_table)(void *addr, bool shared);
>         void*           (*phys_to_virt)(phys_addr_t phys);
>         phys_addr_t     (*virt_to_phys)(void *addr);
>         void            (*dcache_clean_inval_poc)(void *addr, size_t size);
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 121818d4c33e..a9a48edba63b 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -147,12 +147,19 @@ static inline void kvm_pgtable_walk_end(void)
>  {}
>
>  #define kvm_dereference_ptep   rcu_dereference_raw
> +
> +static inline void kvm_pgtable_destroy_barrier(void)
> +{}
> +
>  #else
>  #define kvm_pgtable_walk_begin rcu_read_lock
>
>  #define kvm_pgtable_walk_end   rcu_read_unlock
>
>  #define kvm_dereference_ptep   rcu_dereference
> +
> +#define kvm_pgtable_destroy_barrier    rcu_barrier
> +
>  #endif
>
>  static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
> @@ -1063,7 +1070,12 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
>                 childp = kvm_pte_follow(*old, mm_ops);
>         }
>
> -       mm_ops->put_page(childp);
> +       /*
> +        * If we do not have exclusive access to the page tables it is possible
> +        * the unlinked table remains visible to another thread until the next
> +        * rcu synchronization.
> +        */
> +       mm_ops->free_table(childp, shared);
>         mm_ops->put_page(ptep);
>
>         return ret;
> @@ -1203,7 +1215,7 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>                                                kvm_granule_size(level));
>
>         if (childp)
> -               mm_ops->put_page(childp);
> +               mm_ops->free_table(childp, shared);
>
>         return 0;
>  }
> @@ -1433,7 +1445,7 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>         mm_ops->put_page(ptep);
>
>         if (kvm_pte_table(*old, level))
> -               mm_ops->put_page(kvm_pte_follow(*old, mm_ops));
> +               mm_ops->free_table(kvm_pte_follow(*old, mm_ops), shared);
>
>         return 0;
>  }
> @@ -1452,4 +1464,10 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
>         pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
>         pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
>         pgt->pgd = NULL;
> +
> +       /*
> +        * Guarantee that all unlinked subtrees associated with the stage2 page
> +        * table have also been freed before returning.
> +        */
> +       kvm_pgtable_destroy_barrier();

Why do we need to wait for in-flight RCU callbacks to finish here?
Is this function only used on VM teardown, where we just want to make
sure all the memory is freed, or is something actually depending on
this behavior?
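
(One common reason for a barrier like this, sketched below purely as an illustration and not necessarily the author's motivation: the callbacks free into stage2_page_header_cache, so that cache must not be torn down while callbacks are still queued. rcu_barrier() waits for already-queued callbacks to finish running, whereas synchronize_rcu() only waits for a grace period.)

/* Hypothetical teardown helper; not part of this series. */
static void stage2_page_header_cache_exit(void)
{
	/* Flush every stage2_free_page_rcu_cb() already queued via call_rcu(). */
	rcu_barrier();
	kmem_cache_destroy(stage2_page_header_cache);
}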

>  }
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index cc6ed6b06ec2..6ecf37009c21 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -98,9 +98,50 @@ static bool kvm_is_device_pfn(unsigned long pfn)
>  static void *stage2_memcache_zalloc_page(void *arg)
>  {
>         struct kvm_mmu_caches *mmu_caches = arg;
> +       struct stage2_page_header *hdr;
> +       void *addr;
>
>         /* Allocated with __GFP_ZERO, so no need to zero */
> -       return kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> +       addr = kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> +       if (!addr)
> +               return NULL;
> +
> +       hdr = kvm_mmu_memory_cache_alloc(&mmu_caches->header_cache);
> +       if (!hdr) {
> +               free_page((unsigned long)addr);
> +               return NULL;
> +       }
> +
> +       hdr->page = virt_to_page(addr);
> +       set_page_private(hdr->page, (unsigned long)hdr);
> +       return addr;
> +}
> +
> +static void stage2_free_page_now(struct stage2_page_header *hdr)
> +{
> +       WARN_ON(page_ref_count(hdr->page) != 1);
> +
> +       __free_page(hdr->page);
> +       kmem_cache_free(stage2_page_header_cache, hdr);
> +}
> +
> +static void stage2_free_page_rcu_cb(struct rcu_head *head)
> +{
> +       struct stage2_page_header *hdr = container_of(head, struct stage2_page_header,
> +                                                     rcu_head);
> +
> +       stage2_free_page_now(hdr);
> +}
> +
> +static void stage2_free_table(void *addr, bool shared)
> +{
> +       struct page *page = virt_to_page(addr);
> +       struct stage2_page_header *hdr = (struct stage2_page_header *)page_private(page);
> +
> +       if (shared)
> +               call_rcu(&hdr->rcu_head, stage2_free_page_rcu_cb);
> +       else
> +               stage2_free_page_now(hdr);
>  }
>
>  static void *kvm_host_zalloc_pages_exact(size_t size)
> @@ -613,6 +654,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
>         .free_pages_exact       = free_pages_exact,
>         .get_page               = kvm_host_get_page,
>         .put_page               = kvm_host_put_page,
> +       .free_table             = stage2_free_table,
>         .page_count             = kvm_host_page_count,
>         .phys_to_virt           = kvm_host_va,
>         .virt_to_phys           = kvm_host_pa,
> --
> 2.36.0.rc0.470.gd361397f0d-goog
>
David Matlack Sept. 8, 2022, 12:52 a.m. UTC | #5
On Tue, Apr 19, 2022 at 5:53 PM Oliver Upton <oupton@google.com> wrote:
>
> Hi Ricardo,
>
> On Mon, Apr 18, 2022 at 8:09 PM Ricardo Koller <ricarkol@google.com> wrote:
> >
> > On Mon, Apr 18, 2022 at 07:59:04PM -0700, Ricardo Koller wrote:
> > > On Fri, Apr 15, 2022 at 09:58:58PM +0000, Oliver Upton wrote:
> > > > It is possible that a table page remains visible to another thread until
> > > > the next rcu synchronization event. To that end, we cannot drop the last
> > > > page reference synchronous with post-order traversal for a parallel
> > > > table walk.
> > > >
> > > > Schedule an rcu callback to clean up the child table page for parallel
> > > > walks.
> > > >
> > > > Signed-off-by: Oliver Upton <oupton@google.com>
> > > > ---
> > > >  arch/arm64/include/asm/kvm_pgtable.h |  3 ++
> > > >  arch/arm64/kvm/hyp/pgtable.c         | 24 +++++++++++++--
> > > >  arch/arm64/kvm/mmu.c                 | 44 +++++++++++++++++++++++++++-
> > > >  3 files changed, 67 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > > > index 74955aba5918..52e55e00f0ca 100644
> > > > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > > > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > > > @@ -81,6 +81,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
> > > >   * @put_page:                      Decrement the refcount on a page. When the
> > > >   *                         refcount reaches 0 the page is automatically
> > > >   *                         freed.
> > > > + * @free_table:                    Drop the last page reference, possibly in the
> > > > + *                         next RCU sync if doing a shared walk.
> > > >   * @page_count:                    Return the refcount of a page.
> > > >   * @phys_to_virt:          Convert a physical address into a virtual
> > > >   *                         address mapped in the current context.
> > > > @@ -98,6 +100,7 @@ struct kvm_pgtable_mm_ops {
> > > >     void            (*get_page)(void *addr);
> > > >     void            (*put_page)(void *addr);
> > > >     int             (*page_count)(void *addr);
> > > > +   void            (*free_table)(void *addr, bool shared);
> > > >     void*           (*phys_to_virt)(phys_addr_t phys);
> > > >     phys_addr_t     (*virt_to_phys)(void *addr);
> > > >     void            (*dcache_clean_inval_poc)(void *addr, size_t size);
> > > > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > > > index 121818d4c33e..a9a48edba63b 100644
> > > > --- a/arch/arm64/kvm/hyp/pgtable.c
> > > > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > > > @@ -147,12 +147,19 @@ static inline void kvm_pgtable_walk_end(void)
> > > >  {}
> > > >
> > > >  #define kvm_dereference_ptep       rcu_dereference_raw
> > > > +
> > > > +static inline void kvm_pgtable_destroy_barrier(void)
> > > > +{}
> > > > +
> > > >  #else
> > > >  #define kvm_pgtable_walk_begin     rcu_read_lock
> > > >
> > > >  #define kvm_pgtable_walk_end       rcu_read_unlock
> > > >
> > > >  #define kvm_dereference_ptep       rcu_dereference
> > > > +
> > > > +#define kvm_pgtable_destroy_barrier        rcu_barrier
> > > > +
> > > >  #endif
> > > >
> > > >  static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
> > > > @@ -1063,7 +1070,12 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
> > > >             childp = kvm_pte_follow(*old, mm_ops);
> > > >     }
> > > >
> > > > -   mm_ops->put_page(childp);
> > > > +   /*
> > > > +    * If we do not have exclusive access to the page tables it is possible
> > > > +    * the unlinked table remains visible to another thread until the next
> > > > +    * rcu synchronization.
> > > > +    */
> > > > +   mm_ops->free_table(childp, shared);
> > > >     mm_ops->put_page(ptep);
> > > >
> > > >     return ret;
> > > > @@ -1203,7 +1215,7 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> > > >                                            kvm_granule_size(level));
> > > >
> > > >     if (childp)
> > > > -           mm_ops->put_page(childp);
> > > > +           mm_ops->free_table(childp, shared);
> > > >
> > > >     return 0;
> > > >  }
> > > > @@ -1433,7 +1445,7 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> > > >     mm_ops->put_page(ptep);
> > > >
> > > >     if (kvm_pte_table(*old, level))
> > > > -           mm_ops->put_page(kvm_pte_follow(*old, mm_ops));
> > > > +           mm_ops->free_table(kvm_pte_follow(*old, mm_ops), shared);
> > > >
> > > >     return 0;
> > > >  }
> > > > @@ -1452,4 +1464,10 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
> > > >     pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
> > > >     pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
> > > >     pgt->pgd = NULL;
> > > > +
> > > > +   /*
> > > > +    * Guarantee that all unlinked subtrees associated with the stage2 page
> > > > +    * table have also been freed before returning.
> > > > +    */
> > > > +   kvm_pgtable_destroy_barrier();
> > > >  }
> > > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > > > index cc6ed6b06ec2..6ecf37009c21 100644
> > > > --- a/arch/arm64/kvm/mmu.c
> > > > +++ b/arch/arm64/kvm/mmu.c
> > > > @@ -98,9 +98,50 @@ static bool kvm_is_device_pfn(unsigned long pfn)
> > > >  static void *stage2_memcache_zalloc_page(void *arg)
> > > >  {
> > > >     struct kvm_mmu_caches *mmu_caches = arg;
> > > > +   struct stage2_page_header *hdr;
> > > > +   void *addr;
> > > >
> > > >     /* Allocated with __GFP_ZERO, so no need to zero */
> > > > -   return kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> > > > +   addr = kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
> > > > +   if (!addr)
> > > > +           return NULL;
> > > > +
> > > > +   hdr = kvm_mmu_memory_cache_alloc(&mmu_caches->header_cache);
> > > > +   if (!hdr) {
> > > > +           free_page((unsigned long)addr);
> > > > +           return NULL;
> > > > +   }
> > > > +
> > > > +   hdr->page = virt_to_page(addr);
> > > > +   set_page_private(hdr->page, (unsigned long)hdr);
> > > > +   return addr;
> > > > +}
> > > > +
> > > > +static void stage2_free_page_now(struct stage2_page_header *hdr)
> > > > +{
> > > > +   WARN_ON(page_ref_count(hdr->page) != 1);
> > > > +
> > > > +   __free_page(hdr->page);
> > > > +   kmem_cache_free(stage2_page_header_cache, hdr);
> > > > +}
> > > > +
> > > > +static void stage2_free_page_rcu_cb(struct rcu_head *head)
> > > > +{
> > > > +   struct stage2_page_header *hdr = container_of(head, struct stage2_page_header,
> > > > +                                                 rcu_head);
> > > > +
> > > > +   stage2_free_page_now(hdr);
> > > > +}
> > > > +
> > > > +static void stage2_free_table(void *addr, bool shared)
> > > > +{
> > > > +   struct page *page = virt_to_page(addr);
> > > > +   struct stage2_page_header *hdr = (struct stage2_page_header *)page_private(page);
> > > > +
> > > > +   if (shared)
> > > > +           call_rcu(&hdr->rcu_head, stage2_free_page_rcu_cb);
> > >
> > > Can the number of callbacks grow to "dangerous" numbers? can it be
> > > bounded with something like the following?
> > >
> > > if number of readers is really high:
> > >       synchronize_rcu()
> > > else
> > >       call_rcu()
> >
> > sorry, meant to say "number of callbacks"
>
> Good point. I don't have data for this, but generally speaking I do
> not believe we need to enqueue a callback for every page. In fact,
> since we already make the invalid PTE visible in pre-order traversal
> we could theoretically free all tables from a single RCU callback (per
> fault).

I noticed this change was made in v1, but I don't understand the
reasoning. Whether page tables are freed in many callbacks (one per
table) or a single callback (one per subtree), we still do the
same amount of work in RCU callbacks. In fact, the latter (i.e. the v1)
approach seems like it ends up doing more work in the RCU callback
because it has to do the page table traversal rather than just call
free() a bunch of times. I'm also not sure whether RCU callbacks have
any limitations on how long they can/should take (it may be better to
have lots of tiny callbacks rather than one large one). OTOH maybe I'm
just misunderstanding something, so I thought I'd ask :)
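
If callback duration in softirq context is the worry, another option (sketched below; the rcu_work member and the function names are assumptions, not something in this series) would be to bounce the heavyweight free into process context with queue_rcu_work(), which still waits for a full grace period before the work item runs:

#include <linux/workqueue.h>

/* Assumes stage2_page_header gains a 'struct rcu_work rcu_work' member. */
static void stage2_subtree_free_work(struct work_struct *work)
{
	struct stage2_page_header *hdr =
		container_of(to_rcu_work(work), struct stage2_page_header, rcu_work);

	/* Runs in process context, after a grace period has elapsed. */
	stage2_free_page_now(hdr);
}

static void stage2_free_table_deferred(struct stage2_page_header *hdr)
{
	INIT_RCU_WORK(&hdr->rcu_work, stage2_subtree_free_work);
	queue_rcu_work(system_unbound_wq, &hdr->rcu_work);
}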

>
> I think if we used synchronize_rcu() then we would need to drop the
> mmu lock since it will block the thread.
>
> --
> Thanks,
> Oliver

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 74955aba5918..52e55e00f0ca 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -81,6 +81,8 @@  static inline bool kvm_level_supports_block_mapping(u32 level)
  * @put_page:			Decrement the refcount on a page. When the
  *				refcount reaches 0 the page is automatically
  *				freed.
+ * @free_table:			Drop the last page reference, possibly in the
+ *				next RCU sync if doing a shared walk.
  * @page_count:			Return the refcount of a page.
  * @phys_to_virt:		Convert a physical address into a virtual
  *				address	mapped in the current context.
@@ -98,6 +100,7 @@  struct kvm_pgtable_mm_ops {
 	void		(*get_page)(void *addr);
 	void		(*put_page)(void *addr);
 	int		(*page_count)(void *addr);
+	void		(*free_table)(void *addr, bool shared);
 	void*		(*phys_to_virt)(phys_addr_t phys);
 	phys_addr_t	(*virt_to_phys)(void *addr);
 	void		(*dcache_clean_inval_poc)(void *addr, size_t size);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 121818d4c33e..a9a48edba63b 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -147,12 +147,19 @@  static inline void kvm_pgtable_walk_end(void)
 {}
 
 #define kvm_dereference_ptep	rcu_dereference_raw
+
+static inline void kvm_pgtable_destroy_barrier(void)
+{}
+
 #else
 #define kvm_pgtable_walk_begin	rcu_read_lock
 
 #define kvm_pgtable_walk_end	rcu_read_unlock
 
 #define kvm_dereference_ptep	rcu_dereference
+
+#define kvm_pgtable_destroy_barrier	rcu_barrier
+
 #endif
 
 static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
@@ -1063,7 +1070,12 @@  static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
 		childp = kvm_pte_follow(*old, mm_ops);
 	}
 
-	mm_ops->put_page(childp);
+	/*
+	 * If we do not have exclusive access to the page tables it is possible
+	 * the unlinked table remains visible to another thread until the next
+	 * rcu synchronization.
+	 */
+	mm_ops->free_table(childp, shared);
 	mm_ops->put_page(ptep);
 
 	return ret;
@@ -1203,7 +1215,7 @@  static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 					       kvm_granule_size(level));
 
 	if (childp)
-		mm_ops->put_page(childp);
+		mm_ops->free_table(childp, shared);
 
 	return 0;
 }
@@ -1433,7 +1445,7 @@  static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	mm_ops->put_page(ptep);
 
 	if (kvm_pte_table(*old, level))
-		mm_ops->put_page(kvm_pte_follow(*old, mm_ops));
+		mm_ops->free_table(kvm_pte_follow(*old, mm_ops), shared);
 
 	return 0;
 }
@@ -1452,4 +1464,10 @@  void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
 	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
 	pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
 	pgt->pgd = NULL;
+
+	/*
+	 * Guarantee that all unlinked subtrees associated with the stage2 page
+	 * table have also been freed before returning.
+	 */
+	kvm_pgtable_destroy_barrier();
 }
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index cc6ed6b06ec2..6ecf37009c21 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -98,9 +98,50 @@  static bool kvm_is_device_pfn(unsigned long pfn)
 static void *stage2_memcache_zalloc_page(void *arg)
 {
 	struct kvm_mmu_caches *mmu_caches = arg;
+	struct stage2_page_header *hdr;
+	void *addr;
 
 	/* Allocated with __GFP_ZERO, so no need to zero */
-	return kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
+	addr = kvm_mmu_memory_cache_alloc(&mmu_caches->page_cache);
+	if (!addr)
+		return NULL;
+
+	hdr = kvm_mmu_memory_cache_alloc(&mmu_caches->header_cache);
+	if (!hdr) {
+		free_page((unsigned long)addr);
+		return NULL;
+	}
+
+	hdr->page = virt_to_page(addr);
+	set_page_private(hdr->page, (unsigned long)hdr);
+	return addr;
+}
+
+static void stage2_free_page_now(struct stage2_page_header *hdr)
+{
+	WARN_ON(page_ref_count(hdr->page) != 1);
+
+	__free_page(hdr->page);
+	kmem_cache_free(stage2_page_header_cache, hdr);
+}
+
+static void stage2_free_page_rcu_cb(struct rcu_head *head)
+{
+	struct stage2_page_header *hdr = container_of(head, struct stage2_page_header,
+						      rcu_head);
+
+	stage2_free_page_now(hdr);
+}
+
+static void stage2_free_table(void *addr, bool shared)
+{
+	struct page *page = virt_to_page(addr);
+	struct stage2_page_header *hdr = (struct stage2_page_header *)page_private(page);
+
+	if (shared)
+		call_rcu(&hdr->rcu_head, stage2_free_page_rcu_cb);
+	else
+		stage2_free_page_now(hdr);
 }
 
 static void *kvm_host_zalloc_pages_exact(size_t size)
@@ -613,6 +654,7 @@  static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
 	.free_pages_exact	= free_pages_exact,
 	.get_page		= kvm_host_get_page,
 	.put_page		= kvm_host_put_page,
+	.free_table		= stage2_free_table,
 	.page_count		= kvm_host_page_count,
 	.phys_to_virt		= kvm_host_va,
 	.virt_to_phys		= kvm_host_pa,