diff mbox

[v9,2/4] arm: ARMv7 dirty page logging inital mem region write protect (w/no huge PUD support)

Message ID 1406249768-25315-3-git-send-email-m.smarduch@samsung.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mario Smarduch July 25, 2014, 12:56 a.m. UTC
Patch adds support for initial write protection of VM memslots. This patch series
assumes that huge PUDs will not be used in 2nd stage tables.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 arch/arm/include/asm/kvm_host.h       |    1 +
 arch/arm/include/asm/kvm_mmu.h        |   20 ++++++
 arch/arm/include/asm/pgtable-3level.h |    1 +
 arch/arm/kvm/arm.c                    |    9 +++
 arch/arm/kvm/mmu.c                    |  128 +++++++++++++++++++++++++++++++++
 5 files changed, 159 insertions(+)

Comments

Alexander Graf July 25, 2014, 6:16 a.m. UTC | #1
On 25.07.14 02:56, Mario Smarduch wrote:
> Patch adds  support for initial write protection VM memlsot. This patch series
> assumes that huge PUDs will not be used in 2nd stage tables.

Is this a valid assumption?

>
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
>   arch/arm/include/asm/kvm_host.h       |    1 +
>   arch/arm/include/asm/kvm_mmu.h        |   20 ++++++
>   arch/arm/include/asm/pgtable-3level.h |    1 +
>   arch/arm/kvm/arm.c                    |    9 +++
>   arch/arm/kvm/mmu.c                    |  128 +++++++++++++++++++++++++++++++++
>   5 files changed, 159 insertions(+)
>
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 042206f..6521a2d 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -231,5 +231,6 @@ int kvm_perf_teardown(void);
>   u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
>   int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
>   void kvm_arch_flush_remote_tlbs(struct kvm *);
> +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>   
>   #endif /* __ARM_KVM_HOST_H__ */
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 5cc0b0f..08ab5e8 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
>   	pmd_val(*pmd) |= L_PMD_S2_RDWR;
>   }
>   
> +static inline void kvm_set_s2pte_readonly(pte_t *pte)
> +{
> +	pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY;
> +}
> +
> +static inline bool kvm_s2pte_readonly(pte_t *pte)
> +{
> +	return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY;
> +}
> +
> +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
> +{
> +	pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY;
> +}
> +
> +static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
> +{
> +	return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
> +}
> +
>   /* Open coded p*d_addr_end that can deal with 64bit addresses */
>   #define kvm_pgd_addr_end(addr, end)					\
>   ({	u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;		\
> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
> index 85c60ad..d8bb40b 100644
> --- a/arch/arm/include/asm/pgtable-3level.h
> +++ b/arch/arm/include/asm/pgtable-3level.h
> @@ -129,6 +129,7 @@
>   #define L_PTE_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
>   #define L_PTE_S2_RDWR			(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
>   
> +#define L_PMD_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
>   #define L_PMD_S2_RDWR			(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
>   
>   /*
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 3c82b37..e11c2dd 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
>   				   const struct kvm_memory_slot *old,
>   				   enum kvm_mr_change change)
>   {
> +#ifdef CONFIG_ARM

Same question on CONFIG_ARM here. Is this the define used to distinguish 
between 32bit and 64bit?


Alex
Mario Smarduch July 25, 2014, 5:45 p.m. UTC | #2
On 07/24/2014 11:16 PM, Alexander Graf wrote:
> 
> On 25.07.14 02:56, Mario Smarduch wrote:
>> Patch adds  support for initial write protection VM memlsot. This
>> patch series
>> assumes that huge PUDs will not be used in 2nd stage tables.
> 
> Is this a valid assumption?

Right now it's unclear if PUDs will be used to back guest
memory; assuming so would require quite a bit of additional code.
After discussing on mailing list it was recommended to
treat this as BUG_ON case for now.

> 
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>   arch/arm/include/asm/kvm_host.h       |    1 +
>>   arch/arm/include/asm/kvm_mmu.h        |   20 ++++++
>>   arch/arm/include/asm/pgtable-3level.h |    1 +
>>   arch/arm/kvm/arm.c                    |    9 +++
>>   arch/arm/kvm/mmu.c                    |  128
>> +++++++++++++++++++++++++++++++++
>>   5 files changed, 159 insertions(+)
>>
>> diff --git a/arch/arm/include/asm/kvm_host.h
>> b/arch/arm/include/asm/kvm_host.h
>> index 042206f..6521a2d 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -231,5 +231,6 @@ int kvm_perf_teardown(void);
>>   u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
>>   int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
>>   void kvm_arch_flush_remote_tlbs(struct kvm *);
>> +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>>     #endif /* __ARM_KVM_HOST_H__ */
>> diff --git a/arch/arm/include/asm/kvm_mmu.h
>> b/arch/arm/include/asm/kvm_mmu.h
>> index 5cc0b0f..08ab5e8 100644
>> --- a/arch/arm/include/asm/kvm_mmu.h
>> +++ b/arch/arm/include/asm/kvm_mmu.h
>> @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t
>> *pmd)
>>       pmd_val(*pmd) |= L_PMD_S2_RDWR;
>>   }
>>   +static inline void kvm_set_s2pte_readonly(pte_t *pte)
>> +{
>> +    pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY;
>> +}
>> +
>> +static inline bool kvm_s2pte_readonly(pte_t *pte)
>> +{
>> +    return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY;
>> +}
>> +
>> +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
>> +{
>> +    pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY;
>> +}
>> +
>> +static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
>> +{
>> +    return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
>> +}
>> +
>>   /* Open coded p*d_addr_end that can deal with 64bit addresses */
>>   #define kvm_pgd_addr_end(addr, end)                    \
>>   ({    u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;        \
>> diff --git a/arch/arm/include/asm/pgtable-3level.h
>> b/arch/arm/include/asm/pgtable-3level.h
>> index 85c60ad..d8bb40b 100644
>> --- a/arch/arm/include/asm/pgtable-3level.h
>> +++ b/arch/arm/include/asm/pgtable-3level.h
>> @@ -129,6 +129,7 @@
>>   #define L_PTE_S2_RDONLY            (_AT(pteval_t, 1) << 6)   /*
>> HAP[1]   */
>>   #define L_PTE_S2_RDWR            (_AT(pteval_t, 3) << 6)   /*
>> HAP[2:1] */
>>   +#define L_PMD_S2_RDONLY            (_AT(pteval_t, 1) << 6)   /*
>> HAP[1]   */
>>   #define L_PMD_S2_RDWR            (_AT(pmdval_t, 3) << 6)   /*
>> HAP[2:1] */
>>     /*
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 3c82b37..e11c2dd 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
>>                      const struct kvm_memory_slot *old,
>>                      enum kvm_mr_change change)
>>   {
>> +#ifdef CONFIG_ARM
> 
> Same question on CONFIG_ARM here. Is this the define used to distinguish
> between 32bit and 64bit?

Yes, it lets ARM64 compile. We'll come back to ARM64 soon, and
these will go.
> 
> 
> Alex
>
Christoffer Dall Aug. 11, 2014, 7:12 p.m. UTC | #3
Remove the parenthesis from the subject line.

On Thu, Jul 24, 2014 at 05:56:06PM -0700, Mario Smarduch wrote:
> Patch adds  support for initial write protection VM memlsot. This patch series
            ^^                                    ^
stray whitespace                                 of


> assumes that huge PUDs will not be used in 2nd stage tables.

may be worth mentioning that this is always valid on ARMv7.

> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
>  arch/arm/include/asm/kvm_host.h       |    1 +
>  arch/arm/include/asm/kvm_mmu.h        |   20 ++++++
>  arch/arm/include/asm/pgtable-3level.h |    1 +
>  arch/arm/kvm/arm.c                    |    9 +++
>  arch/arm/kvm/mmu.c                    |  128 +++++++++++++++++++++++++++++++++
>  5 files changed, 159 insertions(+)
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 042206f..6521a2d 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -231,5 +231,6 @@ int kvm_perf_teardown(void);
>  u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
>  int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
>  void kvm_arch_flush_remote_tlbs(struct kvm *);
> +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>  
>  #endif /* __ARM_KVM_HOST_H__ */
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 5cc0b0f..08ab5e8 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
>  	pmd_val(*pmd) |= L_PMD_S2_RDWR;
>  }
>  
> +static inline void kvm_set_s2pte_readonly(pte_t *pte)
> +{
> +	pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY;
> +}
> +
> +static inline bool kvm_s2pte_readonly(pte_t *pte)
> +{
> +	return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY;
> +}
> +
> +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
> +{
> +	pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY;
> +}
> +
> +static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
> +{
> +	return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
> +}
> +
>  /* Open coded p*d_addr_end that can deal with 64bit addresses */
>  #define kvm_pgd_addr_end(addr, end)					\
>  ({	u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;		\
> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
> index 85c60ad..d8bb40b 100644
> --- a/arch/arm/include/asm/pgtable-3level.h
> +++ b/arch/arm/include/asm/pgtable-3level.h
> @@ -129,6 +129,7 @@
>  #define L_PTE_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
>  #define L_PTE_S2_RDWR			(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
>  
> +#define L_PMD_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
>  #define L_PMD_S2_RDWR			(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
>  
>  /*
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 3c82b37..e11c2dd 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
>  				   const struct kvm_memory_slot *old,
>  				   enum kvm_mr_change change)
>  {
> +#ifdef CONFIG_ARM
> +	/*
> +	 * At this point memslot has been committed and there is an
> +	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
> +	 * memory slot is write protected.
> +	 */
> +	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
> +		kvm_mmu_wp_memory_region(kvm, mem->slot);
> +#endif
>  }
>  
>  void kvm_arch_flush_shadow_all(struct kvm *kvm)
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 35254c6..7bfc792 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -763,6 +763,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
>  	return false;
>  }
>  
> +#ifdef CONFIG_ARM
> +/**
> + * stage2_wp_pte_range - write protect PTE range
> + * @pmd:	pointer to pmd entry
> + * @addr:	range start address
> + * @end:	range end address
> + */
> +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
> +{
> +	pte_t *pte;
> +
> +	pte = pte_offset_kernel(pmd, addr);
> +	do {
> +		if (!pte_none(*pte)) {
> +			if (!kvm_s2pte_readonly(pte))
> +				kvm_set_s2pte_readonly(pte);
> +		}
> +	} while (pte++, addr += PAGE_SIZE, addr != end);
> +}
> +
> +/**
> + * stage2_wp_pmd_range - write protect PMD range
> + * @pud:	pointer to pud entry
> + * @addr:	range start address
> + * @end:	range end address
> + */
> +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end)
> +{
> +	pmd_t *pmd;
> +	phys_addr_t next;
> +
> +	pmd = pmd_offset(pud, addr);
> +
> +	do {
> +		next = kvm_pmd_addr_end(addr, end);
> +		if (!pmd_none(*pmd)) {
> +			if (kvm_pmd_huge(*pmd)) {
> +				if (!kvm_s2pmd_readonly(pmd))
> +					kvm_set_s2pmd_readonly(pmd);
> +			} else
> +				stage2_wp_pte_range(pmd, addr, next);
please use a closing brace when the first part of the if-statement is a
multi-line block with braces, as per the CodingStyle.
> +

stray blank line

> +		}
> +	} while (pmd++, addr = next, addr != end);
> +}
> +
> +/**
> +  * stage2_wp_pud_range - write protect PUD range
> +  * @kvm:	pointer to kvm structure
> +  * @pud:	pointer to pgd entry
        pgd
> +  * @addr:	range start address
> +  * @end:	range end address
> +  *
> +  * While walking the PUD range huge PUD pages are ignored, in the future this
                             range, huge PUDs are ignored.  In the future...
> +  * may need to be revisited. Determine how to handle huge PUDs when logging
> +  * of dirty pages is enabled.

I don't understand the last sentence?

> +  */
> +static void  stage2_wp_pud_range(struct kvm *kvm, pgd_t *pgd,
> +				phys_addr_t addr, phys_addr_t end)
> +{
> +	pud_t *pud;
> +	phys_addr_t next;
> +
> +	pud = pud_offset(pgd, addr);
> +	do {
> +		next = kvm_pud_addr_end(addr, end);
> +		/* TODO: huge PUD not supported, revisit later */
> +		BUG_ON(pud_huge(*pud));

we should probably define kvm_pud_huge()

> +		if (!pud_none(*pud))
> +			stage2_wp_pmd_range(pud, addr, next);
> +	} while (pud++, addr = next, addr != end);
> +}
> +
> +/**
> + * stage2_wp_range() - write protect stage2 memory region range
> + * @kvm:	The KVM pointer
> + * @start:	Start address of range
> + * &end:	End address of range
> + */
> +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
> +{
> +	pgd_t *pgd;
> +	phys_addr_t next;
> +
> +	pgd = kvm->arch.pgd + pgd_index(addr);
> +	do {
> +		/*
> +		 * Release kvm_mmu_lock periodically if the memory region is
> +		 * large features like detect hung task, lock detector or lock
                   large.  Otherwise, we may see panics due to..
> +		 * dep  may panic. In addition holding the lock this long will
    extra white space ^^           Additionally, holding the lock for a
    long timer will
> +		 * also starve other vCPUs. Applies to huge VM memory regions.
                                            ^^^ I don't understand this
					    last remark.
> +		 */
> +		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
> +			cond_resched_lock(&kvm->mmu_lock);
> +
> +		next = kvm_pgd_addr_end(addr, end);
> +		if (pgd_present(*pgd))
> +			stage2_wp_pud_range(kvm, pgd, addr, next);
> +	} while (pgd++, addr = next, addr != end);
> +}
> +
> +/**
> + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
> + * @kvm:	The KVM pointer
> + * @slot:	The memory slot to write protect
> + *
> + * Called to start logging dirty pages after memory region
> + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
> + * all present PMD and PTEs are write protected in the memory region.
> + * Afterwards read of dirty page log can be called.
> + *
> + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
> + * serializing operations for VM memory regions.
> + */
> +
> +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
> +{
> +	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
> +	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
> +	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	stage2_wp_range(kvm, start, end);
> +	kvm_flush_remote_tlbs(kvm);
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +#endif
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot,
>  			  unsigned long fault_status)
> -- 
> 1.7.9.5
> 

Besides the commenting and whitespace stuff, this is beginning to look
good.

Thanks,
-Christoffer
Mario Smarduch Aug. 12, 2014, 12:16 a.m. UTC | #4
On 08/11/2014 12:12 PM, Christoffer Dall wrote:
> Remove the parenthesis from the subject line.

Hmmm, have to check this; I don't see it in my patch file.
> 
> On Thu, Jul 24, 2014 at 05:56:06PM -0700, Mario Smarduch wrote:
>> Patch adds  support for initial write protection VM memlsot. This patch series
>             ^^                                    ^
> stray whitespace                                 of
> 
Need to watch out for these; they add delays to the review cycle.
> 
>> assumes that huge PUDs will not be used in 2nd stage tables.
> 
> may be worth mentioning that this is always valid on ARMv7.
> 

Yep definitely.

>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>  arch/arm/include/asm/kvm_host.h       |    1 +
>>  arch/arm/include/asm/kvm_mmu.h        |   20 ++++++
>>  arch/arm/include/asm/pgtable-3level.h |    1 +
>>  arch/arm/kvm/arm.c                    |    9 +++
>>  arch/arm/kvm/mmu.c                    |  128 +++++++++++++++++++++++++++++++++
>>  5 files changed, 159 insertions(+)
>>
>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>> index 042206f..6521a2d 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -231,5 +231,6 @@ int kvm_perf_teardown(void);
>>  u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
>>  int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
>>  void kvm_arch_flush_remote_tlbs(struct kvm *);
>> +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>>  
>>  #endif /* __ARM_KVM_HOST_H__ */
>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>> index 5cc0b0f..08ab5e8 100644
>> --- a/arch/arm/include/asm/kvm_mmu.h
>> +++ b/arch/arm/include/asm/kvm_mmu.h
>> @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
>>  	pmd_val(*pmd) |= L_PMD_S2_RDWR;
>>  }
>>  
>> +static inline void kvm_set_s2pte_readonly(pte_t *pte)
>> +{
>> +	pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY;
>> +}
>> +
>> +static inline bool kvm_s2pte_readonly(pte_t *pte)
>> +{
>> +	return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY;
>> +}
>> +
>> +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
>> +{
>> +	pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY;
>> +}
>> +
>> +static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
>> +{
>> +	return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
>> +}
>> +
>>  /* Open coded p*d_addr_end that can deal with 64bit addresses */
>>  #define kvm_pgd_addr_end(addr, end)					\
>>  ({	u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;		\
>> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
>> index 85c60ad..d8bb40b 100644
>> --- a/arch/arm/include/asm/pgtable-3level.h
>> +++ b/arch/arm/include/asm/pgtable-3level.h
>> @@ -129,6 +129,7 @@
>>  #define L_PTE_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
>>  #define L_PTE_S2_RDWR			(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
>>  
>> +#define L_PMD_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
>>  #define L_PMD_S2_RDWR			(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
>>  
>>  /*
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 3c82b37..e11c2dd 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
>>  				   const struct kvm_memory_slot *old,
>>  				   enum kvm_mr_change change)
>>  {
>> +#ifdef CONFIG_ARM
>> +	/*
>> +	 * At this point memslot has been committed and there is an
>> +	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
>> +	 * memory slot is write protected.
>> +	 */
>> +	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
>> +		kvm_mmu_wp_memory_region(kvm, mem->slot);
>> +#endif
>>  }
>>  
>>  void kvm_arch_flush_shadow_all(struct kvm *kvm)
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 35254c6..7bfc792 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -763,6 +763,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
>>  	return false;
>>  }
>>  
>> +#ifdef CONFIG_ARM
>> +/**
>> + * stage2_wp_pte_range - write protect PTE range
>> + * @pmd:	pointer to pmd entry
>> + * @addr:	range start address
>> + * @end:	range end address
>> + */
>> +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
>> +{
>> +	pte_t *pte;
>> +
>> +	pte = pte_offset_kernel(pmd, addr);
>> +	do {
>> +		if (!pte_none(*pte)) {
>> +			if (!kvm_s2pte_readonly(pte))
>> +				kvm_set_s2pte_readonly(pte);
>> +		}
>> +	} while (pte++, addr += PAGE_SIZE, addr != end);
>> +}
>> +
>> +/**
>> + * stage2_wp_pmd_range - write protect PMD range
>> + * @pud:	pointer to pud entry
>> + * @addr:	range start address
>> + * @end:	range end address
>> + */
>> +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end)
>> +{
>> +	pmd_t *pmd;
>> +	phys_addr_t next;
>> +
>> +	pmd = pmd_offset(pud, addr);
>> +
>> +	do {
>> +		next = kvm_pmd_addr_end(addr, end);
>> +		if (!pmd_none(*pmd)) {
>> +			if (kvm_pmd_huge(*pmd)) {
>> +				if (!kvm_s2pmd_readonly(pmd))
>> +					kvm_set_s2pmd_readonly(pmd);
>> +			} else
>> +				stage2_wp_pte_range(pmd, addr, next);
> please use a closing brace when the first part of the if-statement is a
> multi-line block with braces, as per the CodingStyle.
Will fix.
>> +
> 
> stray blank line

Not sure how it got by checkpatch, will fix.
> 
>> +		}
>> +	} while (pmd++, addr = next, addr != end);
>> +}
>> +
>> +/**
>> +  * stage2_wp_pud_range - write protect PUD range
>> +  * @kvm:	pointer to kvm structure
>> +  * @pud:	pointer to pgd entry
>         pgd
>> +  * @addr:	range start address
>> +  * @end:	range end address
>> +  *
>> +  * While walking the PUD range huge PUD pages are ignored, in the future this
>                              range, huge PUDs are ignored.  In the future...
>> +  * may need to be revisited. Determine how to handle huge PUDs when logging
>> +  * of dirty pages is enabled.
> 
> I don't understand the last sentence?

Probably the last two sentences should be combined:
".... to determine how to handle huge PUDs...". Would that be
clear enough?

The overall theme is what to do about PUDs - mark all pages dirty
in the region, attempt to breakup such huge regions?

> 
>> +  */
>> +static void  stage2_wp_pud_range(struct kvm *kvm, pgd_t *pgd,
>> +				phys_addr_t addr, phys_addr_t end)
>> +{
>> +	pud_t *pud;
>> +	phys_addr_t next;
>> +
>> +	pud = pud_offset(pgd, addr);
>> +	do {
>> +		next = kvm_pud_addr_end(addr, end);
>> +		/* TODO: huge PUD not supported, revisit later */
>> +		BUG_ON(pud_huge(*pud));
> 
> we should probably define kvm_pud_huge()

Yep will add it.

> 
>> +		if (!pud_none(*pud))
>> +			stage2_wp_pmd_range(pud, addr, next);
>> +	} while (pud++, addr = next, addr != end);
>> +}
>> +
>> +/**
>> + * stage2_wp_range() - write protect stage2 memory region range
>> + * @kvm:	The KVM pointer
>> + * @start:	Start address of range
>> + * &end:	End address of range
>> + */
>> +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
>> +{
>> +	pgd_t *pgd;
>> +	phys_addr_t next;
>> +
>> +	pgd = kvm->arch.pgd + pgd_index(addr);
>> +	do {
>> +		/*
>> +		 * Release kvm_mmu_lock periodically if the memory region is
>> +		 * large features like detect hung task, lock detector or lock
>                    large.  Otherwise, we may see panics due to..
>> +		 * dep  may panic. In addition holding the lock this long will
>     extra white space ^^           Additionally, holding the lock for a
>     long timer will
>> +		 * also starve other vCPUs. Applies to huge VM memory regions.
>                                             ^^^ I don't understand this
> 					    last remark.
>> +		 */
>> +		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
>> +			cond_resched_lock(&kvm->mmu_lock);
>> +
>> +		next = kvm_pgd_addr_end(addr, end);
>> +		if (pgd_present(*pgd))
>> +			stage2_wp_pud_range(kvm, pgd, addr, next);
>> +	} while (pgd++, addr = next, addr != end);
>> +}
>> +
>> +/**
>> + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
>> + * @kvm:	The KVM pointer
>> + * @slot:	The memory slot to write protect
>> + *
>> + * Called to start logging dirty pages after memory region
>> + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
>> + * all present PMD and PTEs are write protected in the memory region.
>> + * Afterwards read of dirty page log can be called.
>> + *
>> + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
>> + * serializing operations for VM memory regions.
>> + */
>> +
>> +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
>> +{
>> +	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
>> +	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
>> +	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
>> +
>> +	spin_lock(&kvm->mmu_lock);
>> +	stage2_wp_range(kvm, start, end);
>> +	kvm_flush_remote_tlbs(kvm);
>> +	spin_unlock(&kvm->mmu_lock);
>> +}
>> +#endif
>> +
>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  			  struct kvm_memory_slot *memslot,
>>  			  unsigned long fault_status)
>> -- 
>> 1.7.9.5
>>
> 
> Besides the commenting and whitespace stuff, this is beginning to look
> good.

I'll clean it up for next iteration.
> 
> Thanks,
> -Christoffer
>
Mario Smarduch Aug. 12, 2014, 1:36 a.m. UTC | #5
On 08/11/2014 12:12 PM, Christoffer Dall wrote:
> Remove the parenthesis from the subject line.
> 
> On Thu, Jul 24, 2014 at 05:56:06PM -0700, Mario Smarduch wrote:
>> Patch adds  support for initial write protection VM memlsot. This patch series
>             ^^                                    ^
> stray whitespace                                 of
> 
> 
>> assumes that huge PUDs will not be used in 2nd stage tables.
> 
> may be worth mentioning that this is always valid on ARMv7.
> 
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>  arch/arm/include/asm/kvm_host.h       |    1 +
>>  arch/arm/include/asm/kvm_mmu.h        |   20 ++++++
>>  arch/arm/include/asm/pgtable-3level.h |    1 +
>>  arch/arm/kvm/arm.c                    |    9 +++
>>  arch/arm/kvm/mmu.c                    |  128 +++++++++++++++++++++++++++++++++
>>  5 files changed, 159 insertions(+)
>>
>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>> index 042206f..6521a2d 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -231,5 +231,6 @@ int kvm_perf_teardown(void);
>>  u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
>>  int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
>>  void kvm_arch_flush_remote_tlbs(struct kvm *);
>> +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>>  
>>  #endif /* __ARM_KVM_HOST_H__ */
>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>> index 5cc0b0f..08ab5e8 100644
>> --- a/arch/arm/include/asm/kvm_mmu.h
>> +++ b/arch/arm/include/asm/kvm_mmu.h
>> @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
>>  	pmd_val(*pmd) |= L_PMD_S2_RDWR;
>>  }
>>  
>> +static inline void kvm_set_s2pte_readonly(pte_t *pte)
>> +{
>> +	pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY;
>> +}
>> +
>> +static inline bool kvm_s2pte_readonly(pte_t *pte)
>> +{
>> +	return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY;
>> +}
>> +
>> +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
>> +{
>> +	pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY;
>> +}
>> +
>> +static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
>> +{
>> +	return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
>> +}
>> +
>>  /* Open coded p*d_addr_end that can deal with 64bit addresses */
>>  #define kvm_pgd_addr_end(addr, end)					\
>>  ({	u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;		\
>> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
>> index 85c60ad..d8bb40b 100644
>> --- a/arch/arm/include/asm/pgtable-3level.h
>> +++ b/arch/arm/include/asm/pgtable-3level.h
>> @@ -129,6 +129,7 @@
>>  #define L_PTE_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
>>  #define L_PTE_S2_RDWR			(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
>>  
>> +#define L_PMD_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
>>  #define L_PMD_S2_RDWR			(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
>>  
>>  /*
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 3c82b37..e11c2dd 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
>>  				   const struct kvm_memory_slot *old,
>>  				   enum kvm_mr_change change)
>>  {
>> +#ifdef CONFIG_ARM
>> +	/*
>> +	 * At this point memslot has been committed and there is an
>> +	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
>> +	 * memory slot is write protected.
>> +	 */
>> +	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
>> +		kvm_mmu_wp_memory_region(kvm, mem->slot);
>> +#endif
>>  }
>>  
>>  void kvm_arch_flush_shadow_all(struct kvm *kvm)
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 35254c6..7bfc792 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -763,6 +763,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
>>  	return false;
>>  }
>>  
>> +#ifdef CONFIG_ARM
>> +/**
>> + * stage2_wp_pte_range - write protect PTE range
>> + * @pmd:	pointer to pmd entry
>> + * @addr:	range start address
>> + * @end:	range end address
>> + */
>> +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
>> +{
>> +	pte_t *pte;
>> +
>> +	pte = pte_offset_kernel(pmd, addr);
>> +	do {
>> +		if (!pte_none(*pte)) {
>> +			if (!kvm_s2pte_readonly(pte))
>> +				kvm_set_s2pte_readonly(pte);
>> +		}
>> +	} while (pte++, addr += PAGE_SIZE, addr != end);
>> +}
>> +
>> +/**
>> + * stage2_wp_pmd_range - write protect PMD range
>> + * @pud:	pointer to pud entry
>> + * @addr:	range start address
>> + * @end:	range end address
>> + */
>> +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end)
>> +{
>> +	pmd_t *pmd;
>> +	phys_addr_t next;
>> +
>> +	pmd = pmd_offset(pud, addr);
>> +
>> +	do {
>> +		next = kvm_pmd_addr_end(addr, end);
>> +		if (!pmd_none(*pmd)) {
>> +			if (kvm_pmd_huge(*pmd)) {
>> +				if (!kvm_s2pmd_readonly(pmd))
>> +					kvm_set_s2pmd_readonly(pmd);
>> +			} else
>> +				stage2_wp_pte_range(pmd, addr, next);
> please use a closing brace when the first part of the if-statement is a
> multi-line block with braces, as per the CodingStyle.
>> +
> 
> stray blank line
> 
>> +		}
>> +	} while (pmd++, addr = next, addr != end);
>> +}
>> +
>> +/**
>> +  * stage2_wp_pud_range - write protect PUD range
>> +  * @kvm:	pointer to kvm structure
>> +  * @pud:	pointer to pgd entry
>         pgd
>> +  * @addr:	range start address
>> +  * @end:	range end address
>> +  *
>> +  * While walking the PUD range huge PUD pages are ignored, in the future this
>                              range, huge PUDs are ignored.  In the future...
>> +  * may need to be revisited. Determine how to handle huge PUDs when logging
>> +  * of dirty pages is enabled.
> 
> I don't understand the last sentence?
> 
>> +  */
>> +static void  stage2_wp_pud_range(struct kvm *kvm, pgd_t *pgd,
>> +				phys_addr_t addr, phys_addr_t end)
>> +{
>> +	pud_t *pud;
>> +	phys_addr_t next;
>> +
>> +	pud = pud_offset(pgd, addr);
>> +	do {
>> +		next = kvm_pud_addr_end(addr, end);
>> +		/* TODO: huge PUD not supported, revisit later */
>> +		BUG_ON(pud_huge(*pud));
> 
> we should probably define kvm_pud_huge()
> 
>> +		if (!pud_none(*pud))
>> +			stage2_wp_pmd_range(pud, addr, next);
>> +	} while (pud++, addr = next, addr != end);
>> +}
>> +
>> +/**
>> + * stage2_wp_range() - write protect stage2 memory region range
>> + * @kvm:	The KVM pointer
>> + * @start:	Start address of range
>> + * &end:	End address of range
>> + */
>> +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
>> +{
>> +	pgd_t *pgd;
>> +	phys_addr_t next;
>> +
>> +	pgd = kvm->arch.pgd + pgd_index(addr);
>> +	do {
>> +		/*
>> +		 * Release kvm_mmu_lock periodically if the memory region is
>> +		 * large features like detect hung task, lock detector or lock
>                    large.  Otherwise, we may see panics due to..
>> +		 * dep  may panic. In addition holding the lock this long will
>     extra white space ^^           Additionally, holding the lock for a
>     long timer will
>> +		 * also starve other vCPUs. Applies to huge VM memory regions.
>                                             ^^^ I don't understand this
> 					    last remark.
Sorry overlooked this.

While testing - for VM regions that were small (~1GB), holding the mmu_lock
caused no problems, but when I was running memory regions around 2GB large,
some kernel lockup detection/lock contention options (some selected by default)
caused deadlock warnings/panics in the host kernel.

This was in one of my previous review comments some time ago; I can go back
and find the options.

>> +		 */
>> +		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
>> +			cond_resched_lock(&kvm->mmu_lock);
>> +
>> +		next = kvm_pgd_addr_end(addr, end);
>> +		if (pgd_present(*pgd))
>> +			stage2_wp_pud_range(kvm, pgd, addr, next);
>> +	} while (pgd++, addr = next, addr != end);
>> +}
>> +
>> +/**
>> + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
>> + * @kvm:	The KVM pointer
>> + * @slot:	The memory slot to write protect
>> + *
>> + * Called to start logging dirty pages after memory region
>> + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
>> + * all present PMD and PTEs are write protected in the memory region.
>> + * Afterwards read of dirty page log can be called.
>> + *
>> + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
>> + * serializing operations for VM memory regions.
>> + */
>> +
>> +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
>> +{
>> +	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
>> +	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
>> +	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
>> +
>> +	spin_lock(&kvm->mmu_lock);
>> +	stage2_wp_range(kvm, start, end);
>> +	kvm_flush_remote_tlbs(kvm);
>> +	spin_unlock(&kvm->mmu_lock);
>> +}
>> +#endif
>> +
>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  			  struct kvm_memory_slot *memslot,
>>  			  unsigned long fault_status)
>> -- 
>> 1.7.9.5
>>
> 
> Besides the commenting and whitespace stuff, this is beginning to look
> good.
> 
> Thanks,
> -Christoffer
>
Christoffer Dall Aug. 12, 2014, 9:32 a.m. UTC | #6
On Mon, Aug 11, 2014 at 05:16:21PM -0700, Mario Smarduch wrote:
> On 08/11/2014 12:12 PM, Christoffer Dall wrote:
> > Remove the parenthesis from the subject line.
> 

I just prefer not having the "(w/no huge PUD support)" part in the patch
title.

> Hmmm have to check this don't see it my patch file.
> > 
> > On Thu, Jul 24, 2014 at 05:56:06PM -0700, Mario Smarduch wrote:
> >> Patch adds  support for initial write protection VM memlsot. This patch series
> >             ^^                                    ^
> > stray whitespace                                 of
> > 
> Need to watch out for these; they add delays to the review cycle.

yes, I care quite a lot about proper English, syntax, grammar and
spelling.  Reading critically through your own patch files before
mailing them out is a good exercise.  You can even consider putting them
through a spell-checker and/or configure your editor to mark double
white space, trailing white space etc.

[...]

> >> +	do {
> >> +		next = kvm_pmd_addr_end(addr, end);
> >> +		if (!pmd_none(*pmd)) {
> >> +			if (kvm_pmd_huge(*pmd)) {
> >> +				if (!kvm_s2pmd_readonly(pmd))
> >> +					kvm_set_s2pmd_readonly(pmd);
> >> +			} else
> >> +				stage2_wp_pte_range(pmd, addr, next);
> > please use a closing brace when the first part of the if-statement is a
> > multi-line block with braces, as per the CodingStyle.
> Will fix.
> >> +
> > 
> > stray blank line
> 
> Not sure how it got by checkpatch, will fix.

Not sure checkpatch will complain, but I do ;)  No big deal anyway.

> > 
> >> +		}
> >> +	} while (pmd++, addr = next, addr != end);
> >> +}
> >> +
> >> +/**
> >> +  * stage2_wp_pud_range - write protect PUD range
> >> +  * @kvm:	pointer to kvm structure
> >> +  * @pud:	pointer to pgd entry
> >         pgd
> >> +  * @addr:	range start address
> >> +  * @end:	range end address
> >> +  *
> >> +  * While walking the PUD range huge PUD pages are ignored, in the future this
> >                              range, huge PUDs are ignored.  In the future...
> >> +  * may need to be revisited. Determine how to handle huge PUDs when logging
> >> +  * of dirty pages is enabled.
> > 
> > I don't understand the last sentence?
> 
> Probably last two sentences should be combined.
> ".... to determine how to handle huge PUD...". Would that be
> clear enough?
> 
> The overall theme is what to do about PUDs - mark all pages dirty
> in the region, attempt to breakup such huge regions?
> 

I think you should just state that this is not supported and worry
about how to deal with it when it's properly supported.  The TODO below
is sufficient, so just drop all mentionings about the future in the
function description above - it's likely to be forgotten when PUDs are
in fact support anyhow.

Thanks,
-Christoffer
Christoffer Dall Aug. 12, 2014, 9:36 a.m. UTC | #7
On Mon, Aug 11, 2014 at 06:36:14PM -0700, Mario Smarduch wrote:
> On 08/11/2014 12:12 PM, Christoffer Dall wrote:

[...]

> >> +/**
> >> + * stage2_wp_range() - write protect stage2 memory region range
> >> + * @kvm:	The KVM pointer
> >> + * @start:	Start address of range
> >> + * &end:	End address of range
> >> + */
> >> +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
> >> +{
> >> +	pgd_t *pgd;
> >> +	phys_addr_t next;
> >> +
> >> +	pgd = kvm->arch.pgd + pgd_index(addr);
> >> +	do {
> >> +		/*
> >> +		 * Release kvm_mmu_lock periodically if the memory region is
> >> +		 * large features like detect hung task, lock detector or lock
> >                    large.  Otherwise, we may see panics due to..
> >> +		 * dep  may panic. In addition holding the lock this long will
> >     extra white space ^^           Additionally, holding the lock for a
> >     long timer will
> >> +		 * also starve other vCPUs. Applies to huge VM memory regions.
> >                                             ^^^ I don't understand this
> > 					    last remark.
> Sorry overlooked this.
> 
> While testing - VM regions that were small (~1GB) holding the mmu_lock
> caused not problems, but when I was running memory regions around 2GB large
> some kernel lockup detection/lock contention options (some selected by default) 
> caused deadlock warnings/panics in host kernel.
> 
> This was in one my previous review comments sometime ago, I can go back
> and find the options.
> 

Just drop the last part of the comment, so the whole thing reads:

/*
 * Release kvm_mmu_lock periodically if the memory region is
 * large. Otherwise, we may see kernel panics from debugging features
 * such as "detect hung task", "lock detector" or "lock dep checks".
 * Additionally, holding the lock too long will also starve other vCPUs.
 */

And check the actual names of those debugging features or use the
CONFIG_<WHATEVER> names and say "we may see kernel panics with CONFIG_X,
CONFIG_Y, and CONFIG_Z.

Makes sense?

-Christoffer
Mario Smarduch Aug. 12, 2014, 9:08 p.m. UTC | #8
On 08/12/2014 02:36 AM, Christoffer Dall wrote:
> On Mon, Aug 11, 2014 at 06:36:14PM -0700, Mario Smarduch wrote:
>> On 08/11/2014 12:12 PM, Christoffer Dall wrote:
> 
> [...]
> 
>>>> +/**
>>>> + * stage2_wp_range() - write protect stage2 memory region range
>>>> + * @kvm:	The KVM pointer
>>>> + * @start:	Start address of range
>>>> + * &end:	End address of range
>>>> + */
>>>> +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
>>>> +{
>>>> +	pgd_t *pgd;
>>>> +	phys_addr_t next;
>>>> +
>>>> +	pgd = kvm->arch.pgd + pgd_index(addr);
>>>> +	do {
>>>> +		/*
>>>> +		 * Release kvm_mmu_lock periodically if the memory region is
>>>> +		 * large features like detect hung task, lock detector or lock
>>>                    large.  Otherwise, we may see panics due to..
>>>> +		 * dep  may panic. In addition holding the lock this long will
>>>     extra white space ^^           Additionally, holding the lock for a
>>>     long timer will
>>>> +		 * also starve other vCPUs. Applies to huge VM memory regions.
>>>                                             ^^^ I don't understand this
>>> 					    last remark.
>> Sorry overlooked this.
>>
>> While testing - VM regions that were small (~1GB) holding the mmu_lock
>> caused not problems, but when I was running memory regions around 2GB large
>> some kernel lockup detection/lock contention options (some selected by default) 
>> caused deadlock warnings/panics in host kernel.
>>
>> This was in one my previous review comments sometime ago, I can go back
>> and find the options.
>>
> 
> Just drop the last part of the comment, so the whole thing reads:
> 
> /*
>  * Release kvm_mmu_lock periodically if the memory region is
>  * large. Otherwise, we may see kernel panics from debugging features
>  * such as "detect hung task", "lock detector" or "lock dep checks".
>  * Additionally, holding the lock too long will also starve other vCPUs.
>  */
> 
> And check the actual names of those debugging features or use the
> CONFIG_<WHATEVER> names and say "we may see kernel panics with CONFIG_X,
> CONFIG_Y, and CONFIG_Z.
> 
> Makes sense?

Yep sure does.

> 
> -Christoffer
>
Mario Smarduch Aug. 12, 2014, 11:17 p.m. UTC | #9
On 08/12/2014 02:32 AM, Christoffer Dall wrote:
> On Mon, Aug 11, 2014 at 05:16:21PM -0700, Mario Smarduch wrote:
>> On 08/11/2014 12:12 PM, Christoffer Dall wrote:
>>> Remove the parenthesis from the subject line.
>>
> 
> I just prefer not having the "(w/no huge PUD support)" part in the patch
> title.

Ah ok.

> 
>> Hmmm have to check this don't see it my patch file.
>>>
>>> On Thu, Jul 24, 2014 at 05:56:06PM -0700, Mario Smarduch wrote:
>>>> Patch adds  support for initial write protection VM memlsot. This patch series
>>>             ^^                                    ^
>>> stray whitespace                                 of
>>>
>> Need to watch out for these; they add delays to the review cycle.
> 
> yes, I care quite a lot about proper English, syntax, grammar and
> spelling.  Reading critically through your own patch files before
> mailing them out is a good exercise.  You can even consider putting them
> through a spell-checker and/or configure your editor to mark double
> white space, trailing white space etc.
> 
> [...]
> 
>>>> +	do {
>>>> +		next = kvm_pmd_addr_end(addr, end);
>>>> +		if (!pmd_none(*pmd)) {
>>>> +			if (kvm_pmd_huge(*pmd)) {
>>>> +				if (!kvm_s2pmd_readonly(pmd))
>>>> +					kvm_set_s2pmd_readonly(pmd);
>>>> +			} else
>>>> +				stage2_wp_pte_range(pmd, addr, next);
>>> please use a closing brace when the first part of the if-statement is a
>>> multi-line block with braces, as per the CodingStyle.
>> Will fix.
>>>> +
>>>
>>> stray blank line
>>
>> Not sure how it got by checkpatch, will fix.
> 
> Not sure checkpatch will complain, but I do ;)  No big deal anyway.
> 
>>>
>>>> +		}
>>>> +	} while (pmd++, addr = next, addr != end);
>>>> +}
>>>> +
>>>> +/**
>>>> +  * stage2_wp_pud_range - write protect PUD range
>>>> +  * @kvm:	pointer to kvm structure
>>>> +  * @pud:	pointer to pgd entry
>>>         pgd
>>>> +  * @addr:	range start address
>>>> +  * @end:	range end address
>>>> +  *
>>>> +  * While walking the PUD range huge PUD pages are ignored, in the future this
>>>                              range, huge PUDs are ignored.  In the future...
>>>> +  * may need to be revisited. Determine how to handle huge PUDs when logging
>>>> +  * of dirty pages is enabled.
>>>
>>> I don't understand the last sentence?
>>
>> Probably last two sentences should be combined.
>> ".... to determine how to handle huge PUD...". Would that be
>> clear enough?
>>
>> The overall theme is what to do about PUDs - mark all pages dirty
>> in the region, attempt to breakup such huge regions?
>>
> 
> I think you should just state that this is not supported and worry
> about how to deal with it when it's properly supported.  The TODO below
> is sufficient, so just drop all mentionings about the future in the
> function description above - it's likely to be forgotten when PUDs are
> in fact support anyhow.

Ok the simpler the better.
> 
> Thanks,
> -Christoffer
>
diff mbox

Patch

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 042206f..6521a2d 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -231,5 +231,6 @@  int kvm_perf_teardown(void);
 u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
 int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
 void kvm_arch_flush_remote_tlbs(struct kvm *);
+void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 #endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 5cc0b0f..08ab5e8 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -114,6 +114,26 @@  static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
 	pmd_val(*pmd) |= L_PMD_S2_RDWR;
 }
 
+static inline void kvm_set_s2pte_readonly(pte_t *pte)
+{
+	pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY;
+}
+
+static inline bool kvm_s2pte_readonly(pte_t *pte)
+{
+	return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY;
+}
+
+static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
+{
+	pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY;
+}
+
+static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
+{
+	return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
+}
+
 /* Open coded p*d_addr_end that can deal with 64bit addresses */
 #define kvm_pgd_addr_end(addr, end)					\
 ({	u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;		\
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 85c60ad..d8bb40b 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -129,6 +129,7 @@ 
 #define L_PTE_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
 #define L_PTE_S2_RDWR			(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 
+#define L_PMD_S2_RDONLY			(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
 #define L_PMD_S2_RDWR			(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
 
 /*
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 3c82b37..e11c2dd 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -242,6 +242,15 @@  void kvm_arch_commit_memory_region(struct kvm *kvm,
 				   const struct kvm_memory_slot *old,
 				   enum kvm_mr_change change)
 {
+#ifdef CONFIG_ARM
+	/*
+	 * At this point memslot has been committed and there is an
+	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
+	 * memory slot is write protected.
+	 */
+	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+		kvm_mmu_wp_memory_region(kvm, mem->slot);
+#endif
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 35254c6..7bfc792 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -763,6 +763,134 @@  static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
 	return false;
 }
 
+#ifdef CONFIG_ARM
+/**
+ * stage2_wp_pte_range - write protect the PTEs mapped by a PMD entry
+ * @pmd:	pointer to the pmd entry whose PTE table is walked
+ * @addr:	range start address (IPA), PAGE_SIZE aligned
+ * @end:	range end address (IPA), exclusive
+ */
+static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+	pte_t *pte;
+
+	pte = pte_offset_kernel(pmd, addr);
+	do {
+		if (!pte_none(*pte)) {
+			if (!kvm_s2pte_readonly(pte))
+				kvm_set_s2pte_readonly(pte);
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+/**
+ * stage2_wp_pmd_range - write protect PMD range
+ * @pud:	pointer to the pud entry whose PMD table is walked
+ * @addr:	range start address (IPA)
+ * @end:	range end address (IPA), exclusive
+ */
+static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+{
+	pmd_t *pmd;
+	phys_addr_t next;
+
+	pmd = pmd_offset(pud, addr);
+
+	do {
+		next = kvm_pmd_addr_end(addr, end);
+		if (!pmd_none(*pmd)) {
+			if (kvm_pmd_huge(*pmd)) {
+				if (!kvm_s2pmd_readonly(pmd))
+					kvm_set_s2pmd_readonly(pmd);
+			} else {
+				stage2_wp_pte_range(pmd, addr, next);
+			}
+		}
+	} while (pmd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_pud_range - write protect PUD range
+ * @kvm:	pointer to kvm structure
+ * @pgd:	pointer to the pgd entry whose PUD table is walked
+ * @addr:	range start address (IPA)
+ * @end:	range end address (IPA), exclusive
+ *
+ * Write protection is applied at the PMD and PTE levels only.  Huge PUDs
+ * are not supported by this series' stage 2 tables and are caught by the
+ * BUG_ON() below.
+ */
+static void stage2_wp_pud_range(struct kvm *kvm, pgd_t *pgd,
+				phys_addr_t addr, phys_addr_t end)
+{
+	pud_t *pud;
+	phys_addr_t next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = kvm_pud_addr_end(addr, end);
+		/* TODO: huge PUD not supported, revisit later */
+		BUG_ON(pud_huge(*pud));
+		if (!pud_none(*pud))
+			stage2_wp_pmd_range(pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_range() - write protect stage2 memory region range
+ * @kvm:	The KVM pointer
+ * @addr:	Start address of range (IPA)
+ * @end:	End address of range (IPA), exclusive
+ */
+static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+	pgd_t *pgd;
+	phys_addr_t next;
+
+	pgd = kvm->arch.pgd + pgd_index(addr);
+	do {
+		/*
+		 * Release kvm_mmu_lock periodically if the memory region is
+		 * large.  Otherwise, we may see kernel panics from debugging
+		 * features (e.g. hung task detection, lockdep), and holding
+		 * the lock for too long will also starve other vCPUs.
+		 */
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+			cond_resched_lock(&kvm->mmu_lock);
+
+		next = kvm_pgd_addr_end(addr, end);
+		if (pgd_present(*pgd))
+			stage2_wp_pud_range(kvm, pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
+ * @kvm:	The KVM pointer
+ * @slot:	The memory slot to write protect
+ *
+ * Called to start logging dirty pages after memory region
+ * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
+ * all present PMDs and PTEs are write protected in the memory region.
+ * Afterwards, the dirty page log can be read.
+ *
+ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.  Stale TLB entries are
+ * flushed before the lock is released.
+ */
+void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+{
+	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
+	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
+	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+	spin_lock(&kvm->mmu_lock);
+	stage2_wp_range(kvm, start, end);
+	kvm_flush_remote_tlbs(kvm);
+	spin_unlock(&kvm->mmu_lock);
+}
+#endif
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot,
 			  unsigned long fault_status)