[4/4] KVM: PPC: Book3S HV: Flush guest mappings when turning dirty tracking on/off

Message ID 20181212041717.GE22265@blackberry (mailing list archive)
State New, archived
Series KVM: PPC: Book3S HV: Improve live migration of radix guests

Commit Message

Paul Mackerras Dec. 12, 2018, 4:17 a.m. UTC
This adds code to flush the partition-scoped page tables for a radix
guest when dirty tracking is turned on or off for a memslot.  Only the
guest real addresses covered by the memslot are flushed.  The reason
for this is to get rid of any 2M PTEs in the partition-scoped page
tables that correspond to host transparent huge pages, so that page
dirtiness is tracked at a system page (4k or 64k) granularity rather
than a 2M granularity.  The page tables are also flushed when turning
dirty tracking off so that the memslot's address space can be
repopulated with THPs if possible.

To do this, we add a new function kvmppc_radix_flush_memslot().  Since
this does what's needed for kvmppc_core_flush_memslot_hv() on a radix
guest, we now make kvmppc_core_flush_memslot_hv() call the new
kvmppc_radix_flush_memslot() rather than calling kvm_unmap_radix()
for each page in the memslot.  This has the effect of fixing a bug in
that kvmppc_core_flush_memslot_hv() was previously calling
kvm_unmap_radix() without holding the kvm->mmu_lock spinlock, which
is required to be held.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_book3s.h  |  2 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |  9 +++++----
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 20 ++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c           | 17 +++++++++++++++++
 4 files changed, 44 insertions(+), 4 deletions(-)

Comments

Suraj Jitindar Singh Dec. 12, 2018, 5:22 a.m. UTC | #1
On Wed, 2018-12-12 at 15:17 +1100, Paul Mackerras wrote:
> This adds code to flush the partition-scoped page tables for a radix
> guest when dirty tracking is turned on or off for a memslot.  Only the
> guest real addresses covered by the memslot are flushed.  The reason
> for this is to get rid of any 2M PTEs in the partition-scoped page
> tables that correspond to host transparent huge pages, so that page
> dirtiness is tracked at a system page (4k or 64k) granularity rather
> than a 2M granularity.  The page tables are also flushed when turning
> dirty tracking off so that the memslot's address space can be
> repopulated with THPs if possible.
> 
> To do this, we add a new function kvmppc_radix_flush_memslot().  Since
> this does what's needed for kvmppc_core_flush_memslot_hv() on a radix
> guest, we now make kvmppc_core_flush_memslot_hv() call the new
> kvmppc_radix_flush_memslot() rather than calling kvm_unmap_radix()
> for each page in the memslot.  This has the effect of fixing a bug in
> that kvmppc_core_flush_memslot_hv() was previously calling
> kvm_unmap_radix() without holding the kvm->mmu_lock spinlock, which
> is required to be held.
> 

One comment below.
Either way:

Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>

> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> ---
>  arch/powerpc/include/asm/kvm_book3s.h  |  2 ++
>  arch/powerpc/kvm/book3s_64_mmu_hv.c    |  9 +++++----
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 20 ++++++++++++++++++++
>  arch/powerpc/kvm/book3s_hv.c           | 17 +++++++++++++++++
>  4 files changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 728d2b7..f8a5ac8 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -222,6 +222,8 @@ extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
>  			unsigned long gfn);
>  extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
>  			struct kvm_memory_slot *memslot, unsigned long *map);
> +extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
> +			const struct kvm_memory_slot *memslot);
>  extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
>  
>  /* XXX remove this export when load_last_inst() is generic */
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index a18afda..6f2d2fb 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -899,11 +899,12 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
>  
>  	gfn = memslot->base_gfn;
>  	rmapp = memslot->arch.rmap;
> +	if (kvm_is_radix(kvm)) {
> +		kvmppc_radix_flush_memslot(kvm, memslot);
> +		return;
> +	}
> +
>  	for (n = memslot->npages; n; --n, ++gfn) {
> -		if (kvm_is_radix(kvm)) {
> -			kvm_unmap_radix(kvm, memslot, gfn);
> -			continue;
> -		}
>  		/*
>  		 * Testing the present bit without locking is OK because
>  		 * the memslot has been marked invalid already, and hence
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 52711eb..d675ad9 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -958,6 +958,26 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
>  	return 0;
>  }
>  
> +void kvmppc_radix_flush_memslot(struct kvm *kvm,
> +				const struct kvm_memory_slot *memslot)
> +{
> +	unsigned long n;
> +	pte_t *ptep;
> +	unsigned long gpa;
> +	unsigned int shift;
> +
> +	gpa = memslot->base_gfn << PAGE_SHIFT;
> +	spin_lock(&kvm->mmu_lock);
> +	for (n = memslot->npages; n; --n) {
> +		ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
> +		if (ptep && pte_present(*ptep))
> +			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
> +					 kvm->arch.lpid);
> +		gpa += PAGE_SIZE;
> +	}
> +	spin_unlock(&kvm->mmu_lock);

I don't know how expensive it is to call __find_linux_pte() many times
for a 2M or 1G page when we know we've already invalidated the mapping.
It could be improved with something like:

end_gpa = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
for (; gpa < end_gpa; gpa += (1UL << shift)) {
	...
	if (!shift)
		shift = PAGE_SHIFT;
}
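
(Fleshed out — an untested sketch only: the lookup/unmap body is the
one from the patch above, gpa is assumed to have been initialised to
the memslot base as in the patch, and __find_linux_pte() is assumed to
leave shift as 0 when it finds no leaf PTE at that address:)

end_gpa = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
for (; gpa < end_gpa; gpa += (1UL << shift)) {
	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
	/* no leaf PTE here, or a small-page PTE: step one small page */
	if (!shift)
		shift = PAGE_SHIFT;
}

That way a 2M or 1G leaf PTE is looked up and unmapped once, and gpa
then steps past the whole large page instead of being probed again for
every small page within it.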

> +}
> +
>  static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
>  				 int psize, int *indexp)
>  {
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index f4fbb7b5..074ff5b 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4384,6 +4384,23 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
>  	 */
>  	if (npages)
>  		atomic64_inc(&kvm->arch.mmio_update);
> +
> +	/*
> +	 * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
> +	 * have already called kvm_arch_flush_shadow_memslot() to
> +	 * flush shadow mappings.  For KVM_MR_CREATE we have no
> +	 * previous mappings.  So the only case to handle is
> +	 * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
> +	 * has been changed.
> +	 * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
> +	 * to get rid of any THP PTEs in the partition-scoped page tables
> +	 * so we can track dirtiness at the page level; we flush when
> +	 * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
> +	 * using THP PTEs.
> +	 */
> +	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
> +	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
> +		kvmppc_radix_flush_memslot(kvm, old);
>  }
>  
>  /*
David Gibson Dec. 12, 2018, 11:54 p.m. UTC | #2
On Wed, Dec 12, 2018 at 03:17:17PM +1100, Paul Mackerras wrote:
> This adds code to flush the partition-scoped page tables for a radix
> guest when dirty tracking is turned on or off for a memslot.  Only the
> guest real addresses covered by the memslot are flushed.  The reason
> for this is to get rid of any 2M PTEs in the partition-scoped page
> tables that correspond to host transparent huge pages, so that page
> dirtiness is tracked at a system page (4k or 64k) granularity rather
> than a 2M granularity.  The page tables are also flushed when turning
> dirty tracking off so that the memslot's address space can be
> repopulated with THPs if possible.
> 
> To do this, we add a new function kvmppc_radix_flush_memslot().  Since
> this does what's needed for kvmppc_core_flush_memslot_hv() on a radix
> guest, we now make kvmppc_core_flush_memslot_hv() call the new
> kvmppc_radix_flush_memslot() rather than calling kvm_unmap_radix()
> for each page in the memslot.  This has the effect of fixing a bug in
> that kvmppc_core_flush_memslot_hv() was previously calling
> kvm_unmap_radix() without holding the kvm->mmu_lock spinlock, which
> is required to be held.
> 
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/include/asm/kvm_book3s.h  |  2 ++
>  arch/powerpc/kvm/book3s_64_mmu_hv.c    |  9 +++++----
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 20 ++++++++++++++++++++
>  arch/powerpc/kvm/book3s_hv.c           | 17 +++++++++++++++++
>  4 files changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 728d2b7..f8a5ac8 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -222,6 +222,8 @@ extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
>  			unsigned long gfn);
>  extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
>  			struct kvm_memory_slot *memslot, unsigned long *map);
> +extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
> +			const struct kvm_memory_slot *memslot);
>  extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
>  
>  /* XXX remove this export when load_last_inst() is generic */
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index a18afda..6f2d2fb 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -899,11 +899,12 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
>  
>  	gfn = memslot->base_gfn;
>  	rmapp = memslot->arch.rmap;
> +	if (kvm_is_radix(kvm)) {
> +		kvmppc_radix_flush_memslot(kvm, memslot);
> +		return;
> +	}
> +
>  	for (n = memslot->npages; n; --n, ++gfn) {
> -		if (kvm_is_radix(kvm)) {
> -			kvm_unmap_radix(kvm, memslot, gfn);
> -			continue;
> -		}
>  		/*
>  		 * Testing the present bit without locking is OK because
>  		 * the memslot has been marked invalid already, and hence
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 52711eb..d675ad9 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -958,6 +958,26 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
>  	return 0;
>  }
>  
> +void kvmppc_radix_flush_memslot(struct kvm *kvm,
> +				const struct kvm_memory_slot *memslot)
> +{
> +	unsigned long n;
> +	pte_t *ptep;
> +	unsigned long gpa;
> +	unsigned int shift;
> +
> +	gpa = memslot->base_gfn << PAGE_SHIFT;
> +	spin_lock(&kvm->mmu_lock);
> +	for (n = memslot->npages; n; --n) {
> +		ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
> +		if (ptep && pte_present(*ptep))
> +			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
> +					 kvm->arch.lpid);
> +		gpa += PAGE_SIZE;
> +	}
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
>  static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
>  				 int psize, int *indexp)
>  {
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index f4fbb7b5..074ff5b 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4384,6 +4384,23 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
>  	 */
>  	if (npages)
>  		atomic64_inc(&kvm->arch.mmio_update);
> +
> +	/*
> +	 * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
> +	 * have already called kvm_arch_flush_shadow_memslot() to
> +	 * flush shadow mappings.  For KVM_MR_CREATE we have no
> +	 * previous mappings.  So the only case to handle is
> +	 * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
> +	 * has been changed.
> +	 * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
> +	 * to get rid of any THP PTEs in the partition-scoped page tables
> +	 * so we can track dirtiness at the page level; we flush when
> +	 * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
> +	 * using THP PTEs.
> +	 */
> +	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
> +	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
> +		kvmppc_radix_flush_memslot(kvm, old);
>  }
>  
>  /*
Laurent Vivier April 28, 2020, 3:57 p.m. UTC | #3
On 12/12/2018 05:17, Paul Mackerras wrote:
> This adds code to flush the partition-scoped page tables for a radix
> guest when dirty tracking is turned on or off for a memslot.  Only the
> guest real addresses covered by the memslot are flushed.  The reason
> for this is to get rid of any 2M PTEs in the partition-scoped page
> tables that correspond to host transparent huge pages, so that page
> dirtiness is tracked at a system page (4k or 64k) granularity rather
> than a 2M granularity.  The page tables are also flushed when turning
> dirty tracking off so that the memslot's address space can be
> repopulated with THPs if possible.
> 
> To do this, we add a new function kvmppc_radix_flush_memslot().  Since
> this does what's needed for kvmppc_core_flush_memslot_hv() on a radix
> guest, we now make kvmppc_core_flush_memslot_hv() call the new
> kvmppc_radix_flush_memslot() rather than calling kvm_unmap_radix()
> for each page in the memslot.  This has the effect of fixing a bug in
> that kvmppc_core_flush_memslot_hv() was previously calling
> kvm_unmap_radix() without holding the kvm->mmu_lock spinlock, which
> is required to be held.
> 
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  arch/powerpc/include/asm/kvm_book3s.h  |  2 ++
>  arch/powerpc/kvm/book3s_64_mmu_hv.c    |  9 +++++----
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 20 ++++++++++++++++++++
>  arch/powerpc/kvm/book3s_hv.c           | 17 +++++++++++++++++
>  4 files changed, 44 insertions(+), 4 deletions(-)
> 
...
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index f4fbb7b5..074ff5b 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4384,6 +4384,23 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
>  	 */
>  	if (npages)
>  		atomic64_inc(&kvm->arch.mmio_update);
> +
> +	/*
> +	 * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
> +	 * have already called kvm_arch_flush_shadow_memslot() to
> +	 * flush shadow mappings.  For KVM_MR_CREATE we have no
> +	 * previous mappings.  So the only case to handle is
> +	 * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
> +	 * has been changed.
> +	 * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
> +	 * to get rid of any THP PTEs in the partition-scoped page tables
> +	 * so we can track dirtiness at the page level; we flush when
> +	 * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
> +	 * using THP PTEs.
> +	 */
> +	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
> +	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
> +		kvmppc_radix_flush_memslot(kvm, old);
>  }

Hi,

this part introduces a regression in the QEMU test suite.

We have the following warning in the host kernel log:

[   96.631336] ------------[ cut here ]------------
[   96.631372] WARNING: CPU: 32 PID: 4095 at arch/powerpc/kvm/book3s_64_mmu_radix.c:436 kvmppc_unmap_free_pte+0x84/0x150 [kvm_hv]
[   96.631394] Modules linked in: xt_CHECKSUM nft_chain_nat xt_MASQUERADE nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 nft_counter nft_compat nf_tables nfnetlink tun bridge stp llc rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache rfkill kvm_hv kvm i2c_dev sunrpc ext4 mbcache jbd2 xts ofpart ses enclosure vmx_crypto scsi_transport_sas at24 ipmi_powernv powernv_flash ipmi_devintf opal_prd mtd ipmi_msghandler uio_pdrv_genirq uio ibmpowernv ip_tables xfs libcrc32c ast i2c_algo_bit drm_vram_helper drm_ttm_helper ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops cec drm sd_mod i40e t10_pi sg aacraid drm_panel_orientation_quirks dm_mirror dm_region_hash dm_log dm_mod
[   96.631553] CPU: 32 PID: 4095 Comm: CPU 0/KVM Not tainted 5.7.0-rc3 #24
[   96.631571] NIP:  c008000007745d1c LR: c008000007746c20 CTR: c000000000098aa0
[   96.631598] REGS: c0000003b842f440 TRAP: 0700   Not tainted  (5.7.0-rc3)
[   96.631615] MSR:  900000010282b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE,TM[E]>  CR: 24222448  XER: 20040000
[   96.631648] CFAR: c008000007746c1c IRQMASK: 0
[   96.631648] GPR00: c008000007746c20 c0000003b842f6d0 c00800000775b300 c000000008a60000
[   96.631648] GPR04: c00000039b835200 c0000003aae00387 0000000000000001 000000009b835205
[   96.631648] GPR08: 0552839b03000080 0000000000000020 0000000000000005 c00800000774dd88
[   96.631648] GPR12: c000000000098aa0 c0000007fffdb000 0000000000000000 0000000000000000
[   96.631648] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[   96.631648] GPR20: 0000000000000000 0000000000000001 8703e0aa030000c0 c00000000183d9d8
[   96.631648] GPR24: c00000000183d9e0 c00000039b835200 0000000000000001 c000000008a60000
[   96.631648] GPR28: 0000000000000001 c00000000183d9e8 0000000000000000 c00000039b835200
[   96.631784] NIP [c008000007745d1c] kvmppc_unmap_free_pte+0x84/0x150 [kvm_hv]
[   96.631813] LR [c008000007746c20] kvmppc_create_pte+0x948/0xa90 [kvm_hv]
[   96.631830] Call Trace:
[   96.631845] [c0000003b842f6d0] [00007fff60740000] 0x7fff60740000 (unreliable)
[   96.631866] [c0000003b842f730] [c008000007746c20] kvmppc_create_pte+0x948/0xa90 [kvm_hv]
[   96.631886] [c0000003b842f7d0] [c0080000077470f8] kvmppc_book3s_instantiate_page+0x270/0x5c0 [kvm_hv]
[   96.631920] [c0000003b842f8c0] [c008000007747618] kvmppc_book3s_radix_page_fault+0x1d0/0x300 [kvm_hv]
[   96.631951] [c0000003b842f970] [c008000007741f7c] kvmppc_book3s_hv_page_fault+0x1f4/0xd50 [kvm_hv]
[   96.631981] [c0000003b842fa60] [c00800000773da34] kvmppc_vcpu_run_hv+0x9bc/0xba0 [kvm_hv]
[   96.632006] [c0000003b842fb30] [c008000007c6aabc] kvmppc_vcpu_run+0x34/0x48 [kvm]
[   96.632030] [c0000003b842fb50] [c008000007c6686c] kvm_arch_vcpu_ioctl_run+0x2f4/0x400 [kvm]
[   96.632061] [c0000003b842fbe0] [c008000007c57fb8] kvm_vcpu_ioctl+0x340/0x7d0 [kvm]
[   96.632082] [c0000003b842fd50] [c0000000004c9598] ksys_ioctl+0xf8/0x150
[   96.632100] [c0000003b842fda0] [c0000000004c9618] sys_ioctl+0x28/0x80
[   96.632118] [c0000003b842fdc0] [c000000000034b34] system_call_exception+0x104/0x1d0
[   96.632146] [c0000003b842fe20] [c00000000000c970] system_call_common+0xf0/0x278
[   96.632165] Instruction dump:
[   96.632181] 7cda3378 7c9f2378 3bc00000 3b800001 e95d0000 7d295030 2f890000 419e0054
[   96.632211] 60000000 7ca0fc28 2fa50000 419e0028 <0fe00000> 78a55924 7f48d378 7fe4fb78
[   96.632241] ---[ end trace 4f8aa0280f1215fb ]---

The problem is detected with the "migration-test" test program in qemu,
on a POWER9 host in radix mode with THP. It happens only the first time
the test program is run after loading kvm_hv. The warning is an explicit
detection [1] of a problem:

arch/powerpc/kvm/book3s_64_mmu_radix.c:
 414 /*
 415  * kvmppc_free_p?d are used to free existing page tables, and recursively
 416  * descend and clear and free children.
 417  * Callers are responsible for flushing the PWC.
 418  *
 419  * When page tables are being unmapped/freed as part of page fault path
 420  * (full == false), ptes are not expected. There is code to unmap them
 421  * and emit a warning if encountered, but there may already be data
 422  * corruption due to the unexpected mappings.
 423  */
 424 static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
 425                                   unsigned int lpid)
 426 {
 427         if (full) {
 428                 memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
 429         } else {
 430                 pte_t *p = pte;
 431                 unsigned long it;
 432
 433                 for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
 434                         if (pte_val(*p) == 0)
 435                                 continue;
 436                         WARN_ON_ONCE(1);
 437                         kvmppc_unmap_pte(kvm, p,
 438                                          pte_pfn(*p) << PAGE_SHIFT,
 439                                          PAGE_SHIFT, NULL, lpid);
 440                 }
 441         }
 442
 443         kvmppc_pte_free(pte);
 444 }


I reproduce the problem in the QEMU 4.2 build directory with:

 sudo rmmod kvm_hv
 sudo modprobe kvm_hv
 make check-qtest-ppc64

or once the test binary is built with

 sudo rmmod kvm_hv
 sudo modprobe kvm_hv
 export QTEST_QEMU_BINARY=ppc64-softmmu/qemu-system-ppc64
 tests/qtest/migration-test

The sub-test is "/migration/validate_uuid_error", which generates some
memory stress with a SLOF program in the source guest and fails because
of a mismatch with the destination guest. So the source guest continues
to execute the stress program while the migration is aborted.

Another way to reproduce the problem is:

Source guest:

sudo rmmod kvm-hv
sudo modprobe kvm-hv
qemu-system-ppc64 -display none -accel kvm -name source -m 256M \
                  -nodefaults -prom-env use-nvramrc?=true \
                  -prom-env 'nvramrc=hex ." _" begin 6400000 100000 \
                  do i c@ 1 + i c! 1000 +loop ." B" 0 until' -monitor \
                  stdio

Destination guest (on the same host):

qemu-system-ppc64 -display none -accel kvm -name source -m 512M \
                  -nodefaults -monitor stdio -incoming tcp:0:4444

Then in source guest monitor:

(qemu) migrate tcp:localhost:4444

The migration intentionally fails because of a memory size mismatch and
the warning is generated in the host kernel log.

It was not detected earlier because the problem is only triggered by a
test added in qemu-4.2.0 [2]: with THP enabled, the test starts a
migration that fails, so the memory stress program continues to run in
the source guest while the dirty log bitmap is released. The problem
cannot be shown with qemu-5.0 because of a change in migration rate
limiting [3].

The problem seems to happen because QEMU modifies the memory map while a
vCPU is running:

   (qemu) migrate tcp:localhost:4444

   108264@1578573121.242431:kvm_run_exit cpu_index 0, reason 19
   108264@1578573121.242454:kvm_vcpu_ioctl cpu_index 0, type 0x2000ae80, arg (nil)
   108264@1578573121.248110:kvm_run_exit cpu_index 0, reason 19

   108281@1578573121.248710:kvm_set_user_memory Slot#0 flags=0x1 gpa=0x0 size=0x10000000 ua=0x7fff8fe00000 ret=0

-> 108264@1578573121.249335:kvm_vcpu_ioctl cpu_index 0, type 0x2000ae80, arg (nil)
-> 108259@1578573121.253111:kvm_set_user_memory Slot#0 flags=0x0 gpa=0x0 size=0x10000000 ua=0x7fff8fe00000 ret=0
-> 108264@1578573121.256593:kvm_run_exit cpu_index 0, reason 19

This is part of the cleanup function after the migration has failed:

migration_iteration_finish
  migrate_fd_cleanup_schedule()
    migrate_fd_cleanup_bh
...
migrate_fd_cleanup
  qemu_savevm_state_cleanup
    ram_save_cleanup
      memory_global_dirty_log_stop
        memory_global_dirty_log_do_stop
          ..

If I pause the VM in qemu_savevm_state_cleanup() while the save_cleanup()
functions (such as ram_save_cleanup()) are being called, the warning
disappears.
Any idea?

Thanks,
Laurent

[1] warning introduced by
    a5704e83aa3d KVM: PPC: Book3S HV: Recursively unmap all page table
                 entries when unmapping
[2] 3af31a3469 "tests/migration: Add a test for validate-uuid capability"
[3] 97e1e06780 "migration: Rate limit inside host pages"
Paul Mackerras May 6, 2020, 5:12 a.m. UTC | #4
On Tue, Apr 28, 2020 at 05:57:59PM +0200, Laurent Vivier wrote:
> On 12/12/2018 05:17, Paul Mackerras wrote:
> > +	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
> > +	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
> > +		kvmppc_radix_flush_memslot(kvm, old);
> >  }
> 
> Hi,
> 
> this part introduces a regression in the QEMU test suite.
> 
> We have the following warning in the host kernel log:
> 
[snip]
> 
> The problem is detected with the "migration-test" test program in qemu,
> on a POWER9 host in radix mode with THP. It happens only the first time
> the test program is run after loading kvm_hv. The warning is an explicit
> detection [1] of a problem:

Yes, we found a valid PTE where we didn't expect there to be one.

[snip]
>  I reproduce the problem in the QEMU 4.2 build directory with:
> 
>  sudo rmmod kvm_hv
>  sudo modprobe kvm_hv
>  make check-qtest-ppc64
> 
> or once the test binary is built with
> 
>  sudo rmmod kvm_hv
>  sudo modprobe kvm_hv
>  export QTEST_QEMU_BINARY=ppc64-softmmu/qemu-system-ppc64
>  tests/qtest/migration-test
> 
> The sub-test is "/migration/validate_uuid_error", which generates some
> memory stress with a SLOF program in the source guest and fails because
> of a mismatch with the destination guest. So the source guest continues
> to execute the stress program while the migration is aborted.
> 
> Another way to reproduce the problem is:
> 
> Source guest:
> 
> sudo rmmod kvm-hv
> sudo modprobe kvm-hv
> qemu-system-ppc64 -display none -accel kvm -name source -m 256M \
>                   -nodefaults -prom-env use-nvramrc?=true \
>                   -prom-env 'nvramrc=hex ." _" begin 6400000 100000 \
>                   do i c@ 1 + i c! 1000 +loop ." B" 0 until' -monitor \
>                   stdio
> 
> Destination guest (on the same host):
> 
> qemu-system-ppc64 -display none -accel kvm -name source -m 512M \
>                   -nodefaults -monitor stdio -incoming tcp:0:4444
> 
> Then in source guest monitor:
> 
> (qemu) migrate tcp:localhost:4444
> 
> The migration intentionally fails because of a memory size mismatch and
> the warning is generated in the host kernel log.

I built QEMU 4.2 and tried all three methods you list without seeing
the problem manifest itself.  Is any other configuration regarding THP,
or anything else, necessary?

I was using the patch below, which you could try also, since you are
better able to trigger the problem.  I never saw the "flush_memslot
was necessary" message nor the WARN_ON.  If you see the flush_memslot
message then that will give us a clue as to where the problem is
coming from.

Regards,
Paul.

--
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 506e4df2d730..14ddf1411279 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -220,7 +220,7 @@ extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			unsigned long gfn);
 extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 			struct kvm_memory_slot *memslot, unsigned long *map);
-extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
+extern int kvmppc_radix_flush_memslot(struct kvm *kvm,
 			const struct kvm_memory_slot *memslot);
 extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index aa12cd4078b3..930042632d8f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -433,7 +433,7 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
 		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
 			if (pte_val(*p) == 0)
 				continue;
-			WARN_ON_ONCE(1);
+			WARN_ON(1);
 			kvmppc_unmap_pte(kvm, p,
 					 pte_pfn(*p) << PAGE_SHIFT,
 					 PAGE_SHIFT, NULL, lpid);
@@ -1092,30 +1092,34 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 	return 0;
 }
 
-void kvmppc_radix_flush_memslot(struct kvm *kvm,
+int kvmppc_radix_flush_memslot(struct kvm *kvm,
 				const struct kvm_memory_slot *memslot)
 {
 	unsigned long n;
 	pte_t *ptep;
 	unsigned long gpa;
 	unsigned int shift;
+	int ret = 0;
 
 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
 		kvmppc_uvmem_drop_pages(memslot, kvm, true);
 
 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
-		return;
+		return 0;
 
 	gpa = memslot->base_gfn << PAGE_SHIFT;
 	spin_lock(&kvm->mmu_lock);
 	for (n = memslot->npages; n; --n) {
 		ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
-		if (ptep && pte_present(*ptep))
+		if (ptep && pte_present(*ptep)) {
 			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
 					 kvm->arch.lpid);
+			ret = 1;
+		}
 		gpa += PAGE_SIZE;
 	}
 	spin_unlock(&kvm->mmu_lock);
+	return ret;
 }
 
 static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 93493f0cbfe8..40b50f867835 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4508,6 +4508,10 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
 	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
 		kvmppc_radix_flush_memslot(kvm, old);
+	else if (kvm_is_radix(kvm) && kvmppc_radix_flush_memslot(kvm, old))
+		pr_err("flush_memslot was necessary - change = %d flags %x -> %x\n",
+		       change, old->flags, new->flags);
+
 	/*
 	 * If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
 	 */
Paul Mackerras May 27, 2020, 10:23 a.m. UTC | #5
Hi Laurent,

On Wed, May 06, 2020 at 10:24:48AM +0200, Laurent Vivier wrote:
> On 06/05/2020 07:12, Paul Mackerras wrote:
> > On Tue, Apr 28, 2020 at 05:57:59PM +0200, Laurent Vivier wrote:
> >> On 12/12/2018 05:17, Paul Mackerras wrote:
> >>> +	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
> >>> +	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
> >>> +		kvmppc_radix_flush_memslot(kvm, old);
> >>>  }
> >>
> >> Hi,
> >>
> >> this part introduces a regression in the QEMU test suite.
> >>
> >> We have the following warning in the host kernel log:
> >>
> > [snip]
> >>
> >> The problem is detected with the "migration-test" test program in qemu,
> >> on a POWER9 host in radix mode with THP. It happens only the first time
> >> the test program is run after loading kvm_hv. The warning is an explicit
> >> detection [1] of a problem:
> > 
> > Yes, we found a valid PTE where we didn't expect there to be one.

I have now managed to reproduce the problem locally, and I have an
explanation for what's going on.  QEMU turns on dirty tracking for the
memslot for the VM's RAM, and KVM flushes the mappings from the
partition-scoped page table, and then proceeds to fill it up with 64k
page mappings due to page faults as the VM runs.

Then QEMU turns dirty tracking off, while the VM is still running.
The new memslot, with the dirty tracking bit clear, becomes visible to
page faults before we get to the kvmppc_core_commit_memory_region_hv()
function.  Thus, page faults occurring during the interval between the
calls to install_new_memslots() and kvm_arch_commit_memory_region()
will decide to install a 2MB page if there is a THP on the host side.
This will hit the case in kvmppc_create_pte() where it wants to
install a 2MB leaf PTE but there is a page table pointer there
already.  It calls kvmppc_unmap_free_pmd_entry_table(), which calls
kvmppc_unmap_free_pte(), which finds the existing 64k PTEs and
generates the warning.

Now, the code in kvmppc_unmap_free_pte() does the right thing, in that
it calls kvmppc_unmap_pte to deal with the PTE it found.  The warning
was just an attempt to catch code bugs rather than an indication of
any immediate and obvious problem.  Since we now know of one situation
where this can legitimately happen, I think we should just take out
the WARN_ON_ONCE, along with the scary comment.  If people want to be
more paranoid than that, we could add a check that the existing PTEs
all map sub-pages of the 2MB page that we're about to map.
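
Concretely, kvmppc_unmap_free_pte() would then look something like this
(an untested sketch based on the code quoted earlier in the thread, with
the WARN_ON_ONCE() and the scary part of the comment dropped):

/*
 * kvmppc_free_p?d are used to free existing page tables, and recursively
 * descend and clear and free children.
 * Callers are responsible for flushing the PWC.
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  unsigned int lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			/*
			 * A valid PTE here is now a legitimate case:
			 * unmap it before the containing page table
			 * is freed.
			 */
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}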

There is another race which is possible, which is that when turning on
dirty page tracking, a parallel page fault reads the memslot flags
before the new memslots are installed, and inserts its PTE after
kvmppc_core_commit_memory_region_hv has flushed the memslot.  In that
case, we would have a 2MB PTE left over, which would result in coarser
dirty tracking granularity for that 2MB page.  I think this would be
very hard to hit in practice, and that having coarser dirty tracking
granularity for one 2MB page of the VM would not cause any practical
problem.

That race could be closed by bumping the kvm->mmu_notifier_seq while
the kvm->mmu_lock is held in kvmppc_radix_flush_memslot().
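
For instance (a sketch only; whether any additional memory barrier is
needed to order this against the page-fault side would have to be
checked):

	gpa = memslot->base_gfn << PAGE_SHIFT;
	spin_lock(&kvm->mmu_lock);
	/*
	 * Bump the sequence number under mmu_lock so that a page fault
	 * that raced with the memslot flags change notices it and
	 * retries, rather than installing a now-stale mapping.
	 */
	kvm->mmu_notifier_seq++;
	for (n = memslot->npages; n; --n) {
		ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
		if (ptep && pte_present(*ptep))
			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
					 kvm->arch.lpid);
		gpa += PAGE_SIZE;
	}
	spin_unlock(&kvm->mmu_lock);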

Thoughts?

Paul.

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 728d2b7..f8a5ac8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -222,6 +222,8 @@  extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			unsigned long gfn);
 extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 			struct kvm_memory_slot *memslot, unsigned long *map);
+extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
+			const struct kvm_memory_slot *memslot);
 extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
 
 /* XXX remove this export when load_last_inst() is generic */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index a18afda..6f2d2fb 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -899,11 +899,12 @@  void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 
 	gfn = memslot->base_gfn;
 	rmapp = memslot->arch.rmap;
+	if (kvm_is_radix(kvm)) {
+		kvmppc_radix_flush_memslot(kvm, memslot);
+		return;
+	}
+
 	for (n = memslot->npages; n; --n, ++gfn) {
-		if (kvm_is_radix(kvm)) {
-			kvm_unmap_radix(kvm, memslot, gfn);
-			continue;
-		}
 		/*
 		 * Testing the present bit without locking is OK because
 		 * the memslot has been marked invalid already, and hence
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 52711eb..d675ad9 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -958,6 +958,26 @@  long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 	return 0;
 }
 
+void kvmppc_radix_flush_memslot(struct kvm *kvm,
+				const struct kvm_memory_slot *memslot)
+{
+	unsigned long n;
+	pte_t *ptep;
+	unsigned long gpa;
+	unsigned int shift;
+
+	gpa = memslot->base_gfn << PAGE_SHIFT;
+	spin_lock(&kvm->mmu_lock);
+	for (n = memslot->npages; n; --n) {
+		ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+		if (ptep && pte_present(*ptep))
+			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
+					 kvm->arch.lpid);
+		gpa += PAGE_SIZE;
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
 				 int psize, int *indexp)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f4fbb7b5..074ff5b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4384,6 +4384,23 @@  static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 	 */
 	if (npages)
 		atomic64_inc(&kvm->arch.mmio_update);
+
+	/*
+	 * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
+	 * have already called kvm_arch_flush_shadow_memslot() to
+	 * flush shadow mappings.  For KVM_MR_CREATE we have no
+	 * previous mappings.  So the only case to handle is
+	 * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
+	 * has been changed.
+	 * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
+	 * to get rid of any THP PTEs in the partition-scoped page tables
+	 * so we can track dirtiness at the page level; we flush when
+	 * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
+	 * using THP PTEs.
+	 */
+	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
+	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
+		kvmppc_radix_flush_memslot(kvm, old);
 }
 
 /*