
[v1,16/16] arm64/mm: Defer barriers when updating kernel mappings

Message ID 20250205151003.88959-17-ryan.roberts@arm.com (mailing list archive)
State New
Series hugetlb and vmalloc fixes and perf improvements

Commit Message

Ryan Roberts Feb. 5, 2025, 3:09 p.m. UTC
Because the kernel can't tolerate page faults for kernel mappings, when
setting a valid kernel-space pte (or pmd/pud/p4d/pgd), the kernel emits a
dsb(ishst) to ensure that the store to the pgtable is observed by the
table walker immediately. Additionally it emits an isb() to ensure that
any fault already speculatively determined against the old invalid
mapping gets canceled.

We can improve the performance of vmalloc operations by batching these
barriers until the end of a set of entry updates. The newly added
arch_update_kernel_mappings_begin() / arch_update_kernel_mappings_end()
provide the required hooks.

vmalloc improves by up to 30% as a result.

Two new TIF_ flags are created; TIF_KMAP_UPDATE_ACTIVE tells us if we
are in batch mode and can therefore defer any barriers until the end
of the batch. TIF_KMAP_UPDATE_PENDING tells us if barriers are queued to
be emitted at the end of the batch.

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 arch/arm64/include/asm/pgtable.h     | 65 +++++++++++++++++++---------
 arch/arm64/include/asm/thread_info.h |  2 +
 arch/arm64/kernel/process.c          | 20 +++++++--
 3 files changed, 63 insertions(+), 24 deletions(-)
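
As a rough illustration of the intended usage pattern (hypothetical caller
sketch, not part of the patch; only the begin/end hooks and __set_pte() come
from this series, the surrounding names are made up):

	arch_update_kernel_mappings_begin(start, end);	/* sets TIF_KMAP_UPDATE_ACTIVE */

	for (addr = start; addr < end; addr += PAGE_SIZE, ptep++)
		__set_pte(ptep, pte);			/* barriers queued, not emitted */

	/* one dsb(ishst); isb() pair for the whole batch, if any entry was set */
	arch_update_kernel_mappings_end(start, end, mask);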

Comments

Anshuman Khandual Feb. 10, 2025, 8:03 a.m. UTC | #1
On 2/5/25 20:39, Ryan Roberts wrote:
> Because the kernel can't tolerate page faults for kernel mappings, when
> setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
> dsb(ishst) to ensure that the store to the pgtable is observed by the
> table walker immediately. Additionally it emits an isb() to ensure that
> any already speculatively determined invalid mapping fault gets
> canceled.> 
> We can improve the performance of vmalloc operations by batching these
> barriers until the end of a set up entry updates. The newly added
> arch_update_kernel_mappings_begin() / arch_update_kernel_mappings_end()
> provide the required hooks.
> 
> vmalloc improves by up to 30% as a result.
> 
> Two new TIF_ flags are created; TIF_KMAP_UPDATE_ACTIVE tells us if we
> are in the batch mode and can therefore defer any barriers until the end
> of the batch. TIF_KMAP_UPDATE_PENDING tells us if barriers are queued to
> be emited at the end of the batch.

Why can't this be achieved with a single TIF_KMAP_UPDATE_ACTIVE which is
set in __begin(), cleared in __end() and saved across a __switch_to()?

> 
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>  arch/arm64/include/asm/pgtable.h     | 65 +++++++++++++++++++---------
>  arch/arm64/include/asm/thread_info.h |  2 +
>  arch/arm64/kernel/process.c          | 20 +++++++--
>  3 files changed, 63 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index ff358d983583..1ee9b9588502 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -39,6 +39,41 @@
>  #include <linux/mm_types.h>
>  #include <linux/sched.h>
>  #include <linux/page_table_check.h>
> +#include <linux/pgtable_modmask.h>
> +
> +static inline void emit_pte_barriers(void)
> +{
> +	dsb(ishst);
> +	isb();
> +}

There are many sequences of these two barriers in this particular header,
hence it is probably a good idea to factor this out into a common helper.

> +
> +static inline void queue_pte_barriers(void)
> +{
> +	if (test_thread_flag(TIF_KMAP_UPDATE_ACTIVE)) {
> +		if (!test_thread_flag(TIF_KMAP_UPDATE_PENDING))
> +			set_thread_flag(TIF_KMAP_UPDATE_PENDING);
> +	} else
> +		emit_pte_barriers();
> +}
> +
> +#define arch_update_kernel_mappings_begin arch_update_kernel_mappings_begin
> +static inline void arch_update_kernel_mappings_begin(unsigned long start,
> +						     unsigned long end)
> +{
> +	set_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
> +}
> +
> +#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end
> +static inline void arch_update_kernel_mappings_end(unsigned long start,
> +						   unsigned long end,
> +						   pgtbl_mod_mask mask)
> +{
> +	if (test_thread_flag(TIF_KMAP_UPDATE_PENDING))
> +		emit_pte_barriers();
> +
> +	clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
> +	clear_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
> +}
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
> @@ -323,10 +358,8 @@ static inline void __set_pte_complete(pte_t pte)
>  	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
>  	 * or update_mmu_cache() have the necessary barriers.
>  	 */
> -	if (pte_valid_not_user(pte)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (pte_valid_not_user(pte))
> +		queue_pte_barriers();
>  }
>  
>  static inline void __set_pte(pte_t *ptep, pte_t pte)
> @@ -791,10 +824,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>  
>  	WRITE_ONCE(*pmdp, pmd);
>  
> -	if (pmd_valid_not_user(pmd)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (pmd_valid_not_user(pmd))
> +		queue_pte_barriers();
>  }
>  
>  static inline void pmd_clear(pmd_t *pmdp)
> @@ -869,10 +900,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
>  
>  	WRITE_ONCE(*pudp, pud);
>  
> -	if (pud_valid_not_user(pud)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (pud_valid_not_user(pud))
> +		queue_pte_barriers();
>  }
>  
>  static inline void pud_clear(pud_t *pudp)
> @@ -960,10 +989,8 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
>  
>  	WRITE_ONCE(*p4dp, p4d);
>  
> -	if (p4d_valid_not_user(p4d)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (p4d_valid_not_user(p4d))
> +		queue_pte_barriers();
>  }
>  
>  static inline void p4d_clear(p4d_t *p4dp)
> @@ -1098,10 +1125,8 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
>  
>  	WRITE_ONCE(*pgdp, pgd);
>  
> -	if (pgd_valid_not_user(pgd)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (pgd_valid_not_user(pgd))
> +		queue_pte_barriers();
>  }
>  
>  static inline void pgd_clear(pgd_t *pgdp)
> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
> index 1114c1c3300a..382d2121261e 100644
> --- a/arch/arm64/include/asm/thread_info.h
> +++ b/arch/arm64/include/asm/thread_info.h
> @@ -82,6 +82,8 @@ void arch_setup_new_exec(void);
>  #define TIF_SME_VL_INHERIT	28	/* Inherit SME vl_onexec across exec */
>  #define TIF_KERNEL_FPSTATE	29	/* Task is in a kernel mode FPSIMD section */
>  #define TIF_TSC_SIGSEGV		30	/* SIGSEGV on counter-timer access */
> +#define TIF_KMAP_UPDATE_ACTIVE	31	/* kernel map update in progress */
> +#define TIF_KMAP_UPDATE_PENDING	32	/* kernel map updated with deferred barriers */
>  
>  #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> index 42faebb7b712..1367ec6407d1 100644
> --- a/arch/arm64/kernel/process.c
> +++ b/arch/arm64/kernel/process.c
> @@ -680,10 +680,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
>  	gcs_thread_switch(next);
>  
>  	/*
> -	 * Complete any pending TLB or cache maintenance on this CPU in case
> -	 * the thread migrates to a different CPU.
> -	 * This full barrier is also required by the membarrier system
> -	 * call.
> +	 * Complete any pending TLB or cache maintenance on this CPU in case the
> +	 * thread migrates to a different CPU. This full barrier is also
> +	 * required by the membarrier system call. Additionally it is required
> +	 * for TIF_KMAP_UPDATE_PENDING, see below.
>  	 */
>  	dsb(ish);
>  
> @@ -696,6 +696,18 @@ struct task_struct *__switch_to(struct task_struct *prev,
>  	/* avoid expensive SCTLR_EL1 accesses if no change */
>  	if (prev->thread.sctlr_user != next->thread.sctlr_user)
>  		update_sctlr_el1(next->thread.sctlr_user);
> +	else if (unlikely(test_thread_flag(TIF_KMAP_UPDATE_PENDING))) {
> +		/*
> +		 * In unlikely event that a kernel map update is on-going when
> +		 * preemption occurs, we must emit_pte_barriers() if pending.
> +		 * emit_pte_barriers() consists of "dsb(ishst); isb();". The dsb
> +		 * is already handled above. The isb() is handled if
> +		 * update_sctlr_el1() was called. So only need to emit isb()
> +		 * here if it wasn't called.
> +		 */
> +		isb();
> +		clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
> +	}
>  
>  	/* the actual thread switch */
>  	last = cpu_switch_to(prev, next);
Ryan Roberts Feb. 10, 2025, 11:12 a.m. UTC | #2
On 10/02/2025 08:03, Anshuman Khandual wrote:
> 
> 
> On 2/5/25 20:39, Ryan Roberts wrote:
>> Because the kernel can't tolerate page faults for kernel mappings, when
>> setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
>> dsb(ishst) to ensure that the store to the pgtable is observed by the
>> table walker immediately. Additionally it emits an isb() to ensure that
>> any already speculatively determined invalid mapping fault gets
>> canceled.> 
>> We can improve the performance of vmalloc operations by batching these
>> barriers until the end of a set up entry updates. The newly added
>> arch_update_kernel_mappings_begin() / arch_update_kernel_mappings_end()
>> provide the required hooks.
>>
>> vmalloc improves by up to 30% as a result.
>>
>> Two new TIF_ flags are created; TIF_KMAP_UPDATE_ACTIVE tells us if we
>> are in the batch mode and can therefore defer any barriers until the end
>> of the batch. TIF_KMAP_UPDATE_PENDING tells us if barriers are queued to
>> be emited at the end of the batch.
> 
> Why cannot this be achieved with a single TIF_KMAP_UPDATE_ACTIVE which is
> set in __begin(), cleared in __end() and saved across a __switch_to().

So unconditionally emit the barriers in _end(), and emit them in __switch_to()
if TIF_KMAP_UPDATE_ACTIVE is set?

I guess if calling _begin() then you are definitely going to be setting at least
1 PTE. So you can definitely emit the barriers unconditionally. I was trying to
protect against the case where you get pre-empted (potentially multiple times)
while in the loop. The TIF_KMAP_UPDATE_PENDING flag ensures you only emit the
barriers when you definitely need to. Without it, you would have to emit on
every pre-emption even if no more PTEs got set.

But I suspect this is a premature optimization. Probably it will never occur. So
I'll simplify as you suggest.
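
For the record, the simplified single-flag version would look something like
this (untested sketch only):

static inline void queue_pte_barriers(void)
{
	/* Defer while in a batch; _end() or __switch_to() will emit instead. */
	if (!test_thread_flag(TIF_KMAP_UPDATE_ACTIVE))
		emit_pte_barriers();
}

static inline void arch_update_kernel_mappings_end(unsigned long start,
						   unsigned long end,
						   pgtbl_mod_mask mask)
{
	/* _begin() implies at least one entry was set, so emit unconditionally. */
	emit_pte_barriers();
	clear_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
}

with __switch_to() emitting the barriers (the dsb is already covered by the
existing dsb(ish)) when TIF_KMAP_UPDATE_ACTIVE is set, and leaving the flag
set since the batch is still in progress.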

Thanks,
Ryan

> 
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>  arch/arm64/include/asm/pgtable.h     | 65 +++++++++++++++++++---------
>>  arch/arm64/include/asm/thread_info.h |  2 +
>>  arch/arm64/kernel/process.c          | 20 +++++++--
>>  3 files changed, 63 insertions(+), 24 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index ff358d983583..1ee9b9588502 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -39,6 +39,41 @@
>>  #include <linux/mm_types.h>
>>  #include <linux/sched.h>
>>  #include <linux/page_table_check.h>
>> +#include <linux/pgtable_modmask.h>
>> +
>> +static inline void emit_pte_barriers(void)
>> +{
>> +	dsb(ishst);
>> +	isb();
>> +}
> 
> There are many sequence of these two barriers in this particular header,
> hence probably a good idea to factor this out into a common helper.
> >> +
>> +static inline void queue_pte_barriers(void)
>> +{
>> +	if (test_thread_flag(TIF_KMAP_UPDATE_ACTIVE)) {
>> +		if (!test_thread_flag(TIF_KMAP_UPDATE_PENDING))
>> +			set_thread_flag(TIF_KMAP_UPDATE_PENDING);
>> +	} else
>> +		emit_pte_barriers();
>> +}
>> +
>> +#define arch_update_kernel_mappings_begin arch_update_kernel_mappings_begin
>> +static inline void arch_update_kernel_mappings_begin(unsigned long start,
>> +						     unsigned long end)
>> +{
>> +	set_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
>> +}
>> +
>> +#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end
>> +static inline void arch_update_kernel_mappings_end(unsigned long start,
>> +						   unsigned long end,
>> +						   pgtbl_mod_mask mask)
>> +{
>> +	if (test_thread_flag(TIF_KMAP_UPDATE_PENDING))
>> +		emit_pte_barriers();
>> +
>> +	clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
>> +	clear_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
>> +}
>>  
>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>> @@ -323,10 +358,8 @@ static inline void __set_pte_complete(pte_t pte)
>>  	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
>>  	 * or update_mmu_cache() have the necessary barriers.
>>  	 */
>> -	if (pte_valid_not_user(pte)) {
>> -		dsb(ishst);
>> -		isb();
>> -	}
>> +	if (pte_valid_not_user(pte))
>> +		queue_pte_barriers();
>>  }
>>  
>>  static inline void __set_pte(pte_t *ptep, pte_t pte)
>> @@ -791,10 +824,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>>  
>>  	WRITE_ONCE(*pmdp, pmd);
>>  
>> -	if (pmd_valid_not_user(pmd)) {
>> -		dsb(ishst);
>> -		isb();
>> -	}
>> +	if (pmd_valid_not_user(pmd))
>> +		queue_pte_barriers();
>>  }
>>  
>>  static inline void pmd_clear(pmd_t *pmdp)
>> @@ -869,10 +900,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
>>  
>>  	WRITE_ONCE(*pudp, pud);
>>  
>> -	if (pud_valid_not_user(pud)) {
>> -		dsb(ishst);
>> -		isb();
>> -	}
>> +	if (pud_valid_not_user(pud))
>> +		queue_pte_barriers();
>>  }
>>  
>>  static inline void pud_clear(pud_t *pudp)
>> @@ -960,10 +989,8 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
>>  
>>  	WRITE_ONCE(*p4dp, p4d);
>>  
>> -	if (p4d_valid_not_user(p4d)) {
>> -		dsb(ishst);
>> -		isb();
>> -	}
>> +	if (p4d_valid_not_user(p4d))
>> +		queue_pte_barriers();
>>  }
>>  
>>  static inline void p4d_clear(p4d_t *p4dp)
>> @@ -1098,10 +1125,8 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
>>  
>>  	WRITE_ONCE(*pgdp, pgd);
>>  
>> -	if (pgd_valid_not_user(pgd)) {
>> -		dsb(ishst);
>> -		isb();
>> -	}
>> +	if (pgd_valid_not_user(pgd))
>> +		queue_pte_barriers();
>>  }
>>  
>>  static inline void pgd_clear(pgd_t *pgdp)
>> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
>> index 1114c1c3300a..382d2121261e 100644
>> --- a/arch/arm64/include/asm/thread_info.h
>> +++ b/arch/arm64/include/asm/thread_info.h
>> @@ -82,6 +82,8 @@ void arch_setup_new_exec(void);
>>  #define TIF_SME_VL_INHERIT	28	/* Inherit SME vl_onexec across exec */
>>  #define TIF_KERNEL_FPSTATE	29	/* Task is in a kernel mode FPSIMD section */
>>  #define TIF_TSC_SIGSEGV		30	/* SIGSEGV on counter-timer access */
>> +#define TIF_KMAP_UPDATE_ACTIVE	31	/* kernel map update in progress */
>> +#define TIF_KMAP_UPDATE_PENDING	32	/* kernel map updated with deferred barriers */
>>  
>>  #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
>>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>> index 42faebb7b712..1367ec6407d1 100644
>> --- a/arch/arm64/kernel/process.c
>> +++ b/arch/arm64/kernel/process.c
>> @@ -680,10 +680,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
>>  	gcs_thread_switch(next);
>>  
>>  	/*
>> -	 * Complete any pending TLB or cache maintenance on this CPU in case
>> -	 * the thread migrates to a different CPU.
>> -	 * This full barrier is also required by the membarrier system
>> -	 * call.
>> +	 * Complete any pending TLB or cache maintenance on this CPU in case the
>> +	 * thread migrates to a different CPU. This full barrier is also
>> +	 * required by the membarrier system call. Additionally it is required
>> +	 * for TIF_KMAP_UPDATE_PENDING, see below.
>>  	 */
>>  	dsb(ish);
>>  
>> @@ -696,6 +696,18 @@ struct task_struct *__switch_to(struct task_struct *prev,
>>  	/* avoid expensive SCTLR_EL1 accesses if no change */
>>  	if (prev->thread.sctlr_user != next->thread.sctlr_user)
>>  		update_sctlr_el1(next->thread.sctlr_user);
>> +	else if (unlikely(test_thread_flag(TIF_KMAP_UPDATE_PENDING))) {
>> +		/*
>> +		 * In unlikely event that a kernel map update is on-going when
>> +		 * preemption occurs, we must emit_pte_barriers() if pending.
>> +		 * emit_pte_barriers() consists of "dsb(ishst); isb();". The dsb
>> +		 * is already handled above. The isb() is handled if
>> +		 * update_sctlr_el1() was called. So only need to emit isb()
>> +		 * here if it wasn't called.
>> +		 */
>> +		isb();
>> +		clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
>> +	}
>>  
>>  	/* the actual thread switch */
>>  	last = cpu_switch_to(prev, next);
Anshuman Khandual Feb. 13, 2025, 5:30 a.m. UTC | #3
On 2/10/25 16:42, Ryan Roberts wrote:
> On 10/02/2025 08:03, Anshuman Khandual wrote:
>>
>>
>> On 2/5/25 20:39, Ryan Roberts wrote:
>>> Because the kernel can't tolerate page faults for kernel mappings, when
>>> setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
>>> dsb(ishst) to ensure that the store to the pgtable is observed by the
>>> table walker immediately. Additionally it emits an isb() to ensure that
>>> any already speculatively determined invalid mapping fault gets
>>> canceled.> 
>>> We can improve the performance of vmalloc operations by batching these
>>> barriers until the end of a set up entry updates. The newly added
>>> arch_update_kernel_mappings_begin() / arch_update_kernel_mappings_end()
>>> provide the required hooks.
>>>
>>> vmalloc improves by up to 30% as a result.
>>>
>>> Two new TIF_ flags are created; TIF_KMAP_UPDATE_ACTIVE tells us if we
>>> are in the batch mode and can therefore defer any barriers until the end
>>> of the batch. TIF_KMAP_UPDATE_PENDING tells us if barriers are queued to
>>> be emited at the end of the batch.
>>
>> Why cannot this be achieved with a single TIF_KMAP_UPDATE_ACTIVE which is
>> set in __begin(), cleared in __end() and saved across a __switch_to().
> 
> So unconditionally emit the barriers in _end(), and emit them in __switch_to()
> if TIF_KMAP_UPDATE_ACTIVE is set?

Right.

> 
> I guess if calling _begin() then you are definitely going to be setting at least
> 1 PTE. So you can definitely emit the barriers unconditionally. I was trying to
> protect against the case where you get pre-empted (potentially multiple times)
> while in the loop. The TIF_KMAP_UPDATE_PENDING flag ensures you only emit the
> barriers when you definitely need to. Without it, you would have to emit on
> every pre-emption even if no more PTEs got set.
> 
> But I suspect this is a premature optimization. Probably it will never occur. So

Agreed.

> I'll simplify as you suggest.
> 
> Thanks,
> Ryan
> 
>>
>>>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>>  arch/arm64/include/asm/pgtable.h     | 65 +++++++++++++++++++---------
>>>  arch/arm64/include/asm/thread_info.h |  2 +
>>>  arch/arm64/kernel/process.c          | 20 +++++++--
>>>  3 files changed, 63 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>>> index ff358d983583..1ee9b9588502 100644
>>> --- a/arch/arm64/include/asm/pgtable.h
>>> +++ b/arch/arm64/include/asm/pgtable.h
>>> @@ -39,6 +39,41 @@
>>>  #include <linux/mm_types.h>
>>>  #include <linux/sched.h>
>>>  #include <linux/page_table_check.h>
>>> +#include <linux/pgtable_modmask.h>
>>> +
>>> +static inline void emit_pte_barriers(void)
>>> +{
>>> +	dsb(ishst);
>>> +	isb();
>>> +}
>>
>> There are many sequence of these two barriers in this particular header,
>> hence probably a good idea to factor this out into a common helper.
>>>> +
>>> +static inline void queue_pte_barriers(void)
>>> +{
>>> +	if (test_thread_flag(TIF_KMAP_UPDATE_ACTIVE)) {
>>> +		if (!test_thread_flag(TIF_KMAP_UPDATE_PENDING))
>>> +			set_thread_flag(TIF_KMAP_UPDATE_PENDING);
>>> +	} else
>>> +		emit_pte_barriers();
>>> +}
>>> +
>>> +#define arch_update_kernel_mappings_begin arch_update_kernel_mappings_begin
>>> +static inline void arch_update_kernel_mappings_begin(unsigned long start,
>>> +						     unsigned long end)
>>> +{
>>> +	set_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
>>> +}
>>> +
>>> +#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end
>>> +static inline void arch_update_kernel_mappings_end(unsigned long start,
>>> +						   unsigned long end,
>>> +						   pgtbl_mod_mask mask)
>>> +{
>>> +	if (test_thread_flag(TIF_KMAP_UPDATE_PENDING))
>>> +		emit_pte_barriers();
>>> +
>>> +	clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
>>> +	clear_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
>>> +}
>>>  
>>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>>> @@ -323,10 +358,8 @@ static inline void __set_pte_complete(pte_t pte)
>>>  	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
>>>  	 * or update_mmu_cache() have the necessary barriers.
>>>  	 */
>>> -	if (pte_valid_not_user(pte)) {
>>> -		dsb(ishst);
>>> -		isb();
>>> -	}
>>> +	if (pte_valid_not_user(pte))
>>> +		queue_pte_barriers();
>>>  }
>>>  
>>>  static inline void __set_pte(pte_t *ptep, pte_t pte)
>>> @@ -791,10 +824,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>>>  
>>>  	WRITE_ONCE(*pmdp, pmd);
>>>  
>>> -	if (pmd_valid_not_user(pmd)) {
>>> -		dsb(ishst);
>>> -		isb();
>>> -	}
>>> +	if (pmd_valid_not_user(pmd))
>>> +		queue_pte_barriers();
>>>  }
>>>  
>>>  static inline void pmd_clear(pmd_t *pmdp)
>>> @@ -869,10 +900,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
>>>  
>>>  	WRITE_ONCE(*pudp, pud);
>>>  
>>> -	if (pud_valid_not_user(pud)) {
>>> -		dsb(ishst);
>>> -		isb();
>>> -	}
>>> +	if (pud_valid_not_user(pud))
>>> +		queue_pte_barriers();
>>>  }
>>>  
>>>  static inline void pud_clear(pud_t *pudp)
>>> @@ -960,10 +989,8 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
>>>  
>>>  	WRITE_ONCE(*p4dp, p4d);
>>>  
>>> -	if (p4d_valid_not_user(p4d)) {
>>> -		dsb(ishst);
>>> -		isb();
>>> -	}
>>> +	if (p4d_valid_not_user(p4d))
>>> +		queue_pte_barriers();
>>>  }
>>>  
>>>  static inline void p4d_clear(p4d_t *p4dp)
>>> @@ -1098,10 +1125,8 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
>>>  
>>>  	WRITE_ONCE(*pgdp, pgd);
>>>  
>>> -	if (pgd_valid_not_user(pgd)) {
>>> -		dsb(ishst);
>>> -		isb();
>>> -	}
>>> +	if (pgd_valid_not_user(pgd))
>>> +		queue_pte_barriers();
>>>  }
>>>  
>>>  static inline void pgd_clear(pgd_t *pgdp)
>>> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
>>> index 1114c1c3300a..382d2121261e 100644
>>> --- a/arch/arm64/include/asm/thread_info.h
>>> +++ b/arch/arm64/include/asm/thread_info.h
>>> @@ -82,6 +82,8 @@ void arch_setup_new_exec(void);
>>>  #define TIF_SME_VL_INHERIT	28	/* Inherit SME vl_onexec across exec */
>>>  #define TIF_KERNEL_FPSTATE	29	/* Task is in a kernel mode FPSIMD section */
>>>  #define TIF_TSC_SIGSEGV		30	/* SIGSEGV on counter-timer access */
>>> +#define TIF_KMAP_UPDATE_ACTIVE	31	/* kernel map update in progress */
>>> +#define TIF_KMAP_UPDATE_PENDING	32	/* kernel map updated with deferred barriers */
>>>  
>>>  #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
>>>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
>>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>>> index 42faebb7b712..1367ec6407d1 100644
>>> --- a/arch/arm64/kernel/process.c
>>> +++ b/arch/arm64/kernel/process.c
>>> @@ -680,10 +680,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
>>>  	gcs_thread_switch(next);
>>>  
>>>  	/*
>>> -	 * Complete any pending TLB or cache maintenance on this CPU in case
>>> -	 * the thread migrates to a different CPU.
>>> -	 * This full barrier is also required by the membarrier system
>>> -	 * call.
>>> +	 * Complete any pending TLB or cache maintenance on this CPU in case the
>>> +	 * thread migrates to a different CPU. This full barrier is also
>>> +	 * required by the membarrier system call. Additionally it is required
>>> +	 * for TIF_KMAP_UPDATE_PENDING, see below.
>>>  	 */
>>>  	dsb(ish);
>>>  
>>> @@ -696,6 +696,18 @@ struct task_struct *__switch_to(struct task_struct *prev,
>>>  	/* avoid expensive SCTLR_EL1 accesses if no change */
>>>  	if (prev->thread.sctlr_user != next->thread.sctlr_user)
>>>  		update_sctlr_el1(next->thread.sctlr_user);
>>> +	else if (unlikely(test_thread_flag(TIF_KMAP_UPDATE_PENDING))) {
>>> +		/*
>>> +		 * In unlikely event that a kernel map update is on-going when
>>> +		 * preemption occurs, we must emit_pte_barriers() if pending.
>>> +		 * emit_pte_barriers() consists of "dsb(ishst); isb();". The dsb
>>> +		 * is already handled above. The isb() is handled if
>>> +		 * update_sctlr_el1() was called. So only need to emit isb()
>>> +		 * here if it wasn't called.
>>> +		 */
>>> +		isb();
>>> +		clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
>>> +	}
>>>  
>>>  	/* the actual thread switch */
>>>  	last = cpu_switch_to(prev, next);
>
Ryan Roberts Feb. 13, 2025, 9:38 a.m. UTC | #4
On 13/02/2025 05:30, Anshuman Khandual wrote:
> 
> 
> On 2/10/25 16:42, Ryan Roberts wrote:
>> On 10/02/2025 08:03, Anshuman Khandual wrote:
>>>
>>>
>>> On 2/5/25 20:39, Ryan Roberts wrote:
>>>> Because the kernel can't tolerate page faults for kernel mappings, when
>>>> setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
>>>> dsb(ishst) to ensure that the store to the pgtable is observed by the
>>>> table walker immediately. Additionally it emits an isb() to ensure that
>>>> any already speculatively determined invalid mapping fault gets
>>>> canceled.> 
>>>> We can improve the performance of vmalloc operations by batching these
>>>> barriers until the end of a set up entry updates. The newly added
>>>> arch_update_kernel_mappings_begin() / arch_update_kernel_mappings_end()
>>>> provide the required hooks.
>>>>
>>>> vmalloc improves by up to 30% as a result.
>>>>
>>>> Two new TIF_ flags are created; TIF_KMAP_UPDATE_ACTIVE tells us if we
>>>> are in the batch mode and can therefore defer any barriers until the end
>>>> of the batch. TIF_KMAP_UPDATE_PENDING tells us if barriers are queued to
>>>> be emited at the end of the batch.
>>>
>>> Why cannot this be achieved with a single TIF_KMAP_UPDATE_ACTIVE which is
>>> set in __begin(), cleared in __end() and saved across a __switch_to().
>>
>> So unconditionally emit the barriers in _end(), and emit them in __switch_to()
>> if TIF_KMAP_UPDATE_ACTIVE is set?
> 
> Right.
> 
>>
>> I guess if calling _begin() then you are definitely going to be setting at least
>> 1 PTE. So you can definitely emit the barriers unconditionally. I was trying to
>> protect against the case where you get pre-empted (potentially multiple times)
>> while in the loop. The TIF_KMAP_UPDATE_PENDING flag ensures you only emit the
>> barriers when you definitely need to. Without it, you would have to emit on
>> every pre-emption even if no more PTEs got set.
>>
>> But I suspect this is a premature optimization. Probably it will never occur. So
> 
> Agreed.

Having done this simplification, I've just noticed that one of the
arch_update_kernel_mappings_begin/end callsites is __apply_to_page_range() which
gets called for user space mappings as well as kernel mappings. So actually with
the simplification I'll be emitting barriers even when only user space mappings
were touched.

I think there are a couple of options to fix this:

- Revert to the 2-flag approach. For the user space case, I'll get to _end() and
notice that no barriers are queued, so I will emit nothing.

- Only set TIF_KMAP_UPDATE_ACTIVE if the address range passed to _begin() is a
kernel address range. I guess that's just a case of checking if the MSB is set
in "end"?

- Pass mm to _begin() and only set TIF_KMAP_UPDATE_ACTIVE if mm == &init_mm. I
guess this should be the same as option 2.

I'm leaning towards option 2. But I have a niggling feeling that my proposed
check isn't quite correct. What do you think?
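
For concreteness, something like this in _begin() (sketch; the MSB check is
exactly the part I'm unsure about):

static inline void arch_update_kernel_mappings_begin(unsigned long start,
						     unsigned long end)
{
	/* Kernel VAs live in the upper half of the address space (MSB set). */
	if ((long)end < 0)
		set_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
}

with _end() and __switch_to() only emitting the barriers if the flag is set.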

Thanks,
Ryan


> 
>> I'll simplify as you suggest.
>>
>> Thanks,
>> Ryan
>>
>>>
>>>>
>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>> ---
>>>>  arch/arm64/include/asm/pgtable.h     | 65 +++++++++++++++++++---------
>>>>  arch/arm64/include/asm/thread_info.h |  2 +
>>>>  arch/arm64/kernel/process.c          | 20 +++++++--
>>>>  3 files changed, 63 insertions(+), 24 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>>>> index ff358d983583..1ee9b9588502 100644
>>>> --- a/arch/arm64/include/asm/pgtable.h
>>>> +++ b/arch/arm64/include/asm/pgtable.h
>>>> @@ -39,6 +39,41 @@
>>>>  #include <linux/mm_types.h>
>>>>  #include <linux/sched.h>
>>>>  #include <linux/page_table_check.h>
>>>> +#include <linux/pgtable_modmask.h>
>>>> +
>>>> +static inline void emit_pte_barriers(void)
>>>> +{
>>>> +	dsb(ishst);
>>>> +	isb();
>>>> +}
>>>
>>> There are many sequence of these two barriers in this particular header,
>>> hence probably a good idea to factor this out into a common helper.
>>>>> +
>>>> +static inline void queue_pte_barriers(void)
>>>> +{
>>>> +	if (test_thread_flag(TIF_KMAP_UPDATE_ACTIVE)) {
>>>> +		if (!test_thread_flag(TIF_KMAP_UPDATE_PENDING))
>>>> +			set_thread_flag(TIF_KMAP_UPDATE_PENDING);
>>>> +	} else
>>>> +		emit_pte_barriers();
>>>> +}
>>>> +
>>>> +#define arch_update_kernel_mappings_begin arch_update_kernel_mappings_begin
>>>> +static inline void arch_update_kernel_mappings_begin(unsigned long start,
>>>> +						     unsigned long end)
>>>> +{
>>>> +	set_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
>>>> +}
>>>> +
>>>> +#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end
>>>> +static inline void arch_update_kernel_mappings_end(unsigned long start,
>>>> +						   unsigned long end,
>>>> +						   pgtbl_mod_mask mask)
>>>> +{
>>>> +	if (test_thread_flag(TIF_KMAP_UPDATE_PENDING))
>>>> +		emit_pte_barriers();
>>>> +
>>>> +	clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
>>>> +	clear_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
>>>> +}
>>>>  
>>>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>>>> @@ -323,10 +358,8 @@ static inline void __set_pte_complete(pte_t pte)
>>>>  	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
>>>>  	 * or update_mmu_cache() have the necessary barriers.
>>>>  	 */
>>>> -	if (pte_valid_not_user(pte)) {
>>>> -		dsb(ishst);
>>>> -		isb();
>>>> -	}
>>>> +	if (pte_valid_not_user(pte))
>>>> +		queue_pte_barriers();
>>>>  }
>>>>  
>>>>  static inline void __set_pte(pte_t *ptep, pte_t pte)
>>>> @@ -791,10 +824,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>>>>  
>>>>  	WRITE_ONCE(*pmdp, pmd);
>>>>  
>>>> -	if (pmd_valid_not_user(pmd)) {
>>>> -		dsb(ishst);
>>>> -		isb();
>>>> -	}
>>>> +	if (pmd_valid_not_user(pmd))
>>>> +		queue_pte_barriers();
>>>>  }
>>>>  
>>>>  static inline void pmd_clear(pmd_t *pmdp)
>>>> @@ -869,10 +900,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
>>>>  
>>>>  	WRITE_ONCE(*pudp, pud);
>>>>  
>>>> -	if (pud_valid_not_user(pud)) {
>>>> -		dsb(ishst);
>>>> -		isb();
>>>> -	}
>>>> +	if (pud_valid_not_user(pud))
>>>> +		queue_pte_barriers();
>>>>  }
>>>>  
>>>>  static inline void pud_clear(pud_t *pudp)
>>>> @@ -960,10 +989,8 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
>>>>  
>>>>  	WRITE_ONCE(*p4dp, p4d);
>>>>  
>>>> -	if (p4d_valid_not_user(p4d)) {
>>>> -		dsb(ishst);
>>>> -		isb();
>>>> -	}
>>>> +	if (p4d_valid_not_user(p4d))
>>>> +		queue_pte_barriers();
>>>>  }
>>>>  
>>>>  static inline void p4d_clear(p4d_t *p4dp)
>>>> @@ -1098,10 +1125,8 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
>>>>  
>>>>  	WRITE_ONCE(*pgdp, pgd);
>>>>  
>>>> -	if (pgd_valid_not_user(pgd)) {
>>>> -		dsb(ishst);
>>>> -		isb();
>>>> -	}
>>>> +	if (pgd_valid_not_user(pgd))
>>>> +		queue_pte_barriers();
>>>>  }
>>>>  
>>>>  static inline void pgd_clear(pgd_t *pgdp)
>>>> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
>>>> index 1114c1c3300a..382d2121261e 100644
>>>> --- a/arch/arm64/include/asm/thread_info.h
>>>> +++ b/arch/arm64/include/asm/thread_info.h
>>>> @@ -82,6 +82,8 @@ void arch_setup_new_exec(void);
>>>>  #define TIF_SME_VL_INHERIT	28	/* Inherit SME vl_onexec across exec */
>>>>  #define TIF_KERNEL_FPSTATE	29	/* Task is in a kernel mode FPSIMD section */
>>>>  #define TIF_TSC_SIGSEGV		30	/* SIGSEGV on counter-timer access */
>>>> +#define TIF_KMAP_UPDATE_ACTIVE	31	/* kernel map update in progress */
>>>> +#define TIF_KMAP_UPDATE_PENDING	32	/* kernel map updated with deferred barriers */
>>>>  
>>>>  #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
>>>>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
>>>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>>>> index 42faebb7b712..1367ec6407d1 100644
>>>> --- a/arch/arm64/kernel/process.c
>>>> +++ b/arch/arm64/kernel/process.c
>>>> @@ -680,10 +680,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
>>>>  	gcs_thread_switch(next);
>>>>  
>>>>  	/*
>>>> -	 * Complete any pending TLB or cache maintenance on this CPU in case
>>>> -	 * the thread migrates to a different CPU.
>>>> -	 * This full barrier is also required by the membarrier system
>>>> -	 * call.
>>>> +	 * Complete any pending TLB or cache maintenance on this CPU in case the
>>>> +	 * thread migrates to a different CPU. This full barrier is also
>>>> +	 * required by the membarrier system call. Additionally it is required
>>>> +	 * for TIF_KMAP_UPDATE_PENDING, see below.
>>>>  	 */
>>>>  	dsb(ish);
>>>>  
>>>> @@ -696,6 +696,18 @@ struct task_struct *__switch_to(struct task_struct *prev,
>>>>  	/* avoid expensive SCTLR_EL1 accesses if no change */
>>>>  	if (prev->thread.sctlr_user != next->thread.sctlr_user)
>>>>  		update_sctlr_el1(next->thread.sctlr_user);
>>>> +	else if (unlikely(test_thread_flag(TIF_KMAP_UPDATE_PENDING))) {
>>>> +		/*
>>>> +		 * In unlikely event that a kernel map update is on-going when
>>>> +		 * preemption occurs, we must emit_pte_barriers() if pending.
>>>> +		 * emit_pte_barriers() consists of "dsb(ishst); isb();". The dsb
>>>> +		 * is already handled above. The isb() is handled if
>>>> +		 * update_sctlr_el1() was called. So only need to emit isb()
>>>> +		 * here if it wasn't called.
>>>> +		 */
>>>> +		isb();
>>>> +		clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
>>>> +	}
>>>>  
>>>>  	/* the actual thread switch */
>>>>  	last = cpu_switch_to(prev, next);
>>
Anshuman Khandual Feb. 17, 2025, 4:48 a.m. UTC | #5
On 2/13/25 15:08, Ryan Roberts wrote:
> On 13/02/2025 05:30, Anshuman Khandual wrote:
>>
>>
>> On 2/10/25 16:42, Ryan Roberts wrote:
>>> On 10/02/2025 08:03, Anshuman Khandual wrote:
>>>>
>>>>
>>>> On 2/5/25 20:39, Ryan Roberts wrote:
>>>>> Because the kernel can't tolerate page faults for kernel mappings, when
>>>>> setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
>>>>> dsb(ishst) to ensure that the store to the pgtable is observed by the
>>>>> table walker immediately. Additionally it emits an isb() to ensure that
>>>>> any already speculatively determined invalid mapping fault gets
>>>>> canceled.> 
>>>>> We can improve the performance of vmalloc operations by batching these
>>>>> barriers until the end of a set up entry updates. The newly added
>>>>> arch_update_kernel_mappings_begin() / arch_update_kernel_mappings_end()
>>>>> provide the required hooks.
>>>>>
>>>>> vmalloc improves by up to 30% as a result.
>>>>>
>>>>> Two new TIF_ flags are created; TIF_KMAP_UPDATE_ACTIVE tells us if we
>>>>> are in the batch mode and can therefore defer any barriers until the end
>>>>> of the batch. TIF_KMAP_UPDATE_PENDING tells us if barriers are queued to
>>>>> be emited at the end of the batch.
>>>>
>>>> Why cannot this be achieved with a single TIF_KMAP_UPDATE_ACTIVE which is
>>>> set in __begin(), cleared in __end() and saved across a __switch_to().
>>>
>>> So unconditionally emit the barriers in _end(), and emit them in __switch_to()
>>> if TIF_KMAP_UPDATE_ACTIVE is set?
>>
>> Right.
>>
>>>
>>> I guess if calling _begin() then you are definitely going to be setting at least
>>> 1 PTE. So you can definitely emit the barriers unconditionally. I was trying to
>>> protect against the case where you get pre-empted (potentially multiple times)
>>> while in the loop. The TIF_KMAP_UPDATE_PENDING flag ensures you only emit the
>>> barriers when you definitely need to. Without it, you would have to emit on
>>> every pre-emption even if no more PTEs got set.
>>>
>>> But I suspect this is a premature optimization. Probably it will never occur. So
>>
>> Agreed.
> 
> Having done this simplification, I've just noticed that one of the
> arch_update_kernel_mappings_begin/end callsites is __apply_to_page_range() which
> gets called for user space mappings as well as kernel mappings. So actually with
> the simplification I'll be emitting barriers even when only user space mappings
> were touched.

Right, that will not be desirable.

> 
> I think there are a couple of options to fix this:
> 
> - Revert to the 2 flag approach. For the user space case, I'll get to _end() and
> notice that no barriers are queued so will emit nothing.
> 
> - Only set TIF_KMAP_UPDATE_ACTIVE if the address range passed to _begin() is a
> kernel address range. I guess that's just a case of checking if the MSB is set
> in "end"?
> 
> - pass mm to _begin() and only set TIF_KMAP_UPDATE_ACTIVE if mm == &init_mm. I
> guess this should be the same as option 2.
> 
> I'm leaning towards option 2. But I have a niggling feeling that my proposed
> check isn't quite correct. What do you think?

Options 2 and 3 look better than the two-flag approach proposed earlier. But
isn't option 3 a bit simpler than option 2? Does getting the struct mm argument
into these functions create more code churn?

> 
> Thanks,
> Ryan
> 
> 
>>
>>> I'll simplify as you suggest.
>>>
>>> Thanks,
>>> Ryan
>>>
>>>>
>>>>>
>>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>>> ---
>>>>>  arch/arm64/include/asm/pgtable.h     | 65 +++++++++++++++++++---------
>>>>>  arch/arm64/include/asm/thread_info.h |  2 +
>>>>>  arch/arm64/kernel/process.c          | 20 +++++++--
>>>>>  3 files changed, 63 insertions(+), 24 deletions(-)
>>>>>
>>>>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>>>>> index ff358d983583..1ee9b9588502 100644
>>>>> --- a/arch/arm64/include/asm/pgtable.h
>>>>> +++ b/arch/arm64/include/asm/pgtable.h
>>>>> @@ -39,6 +39,41 @@
>>>>>  #include <linux/mm_types.h>
>>>>>  #include <linux/sched.h>
>>>>>  #include <linux/page_table_check.h>
>>>>> +#include <linux/pgtable_modmask.h>
>>>>> +
>>>>> +static inline void emit_pte_barriers(void)
>>>>> +{
>>>>> +	dsb(ishst);
>>>>> +	isb();
>>>>> +}
>>>>
>>>> There are many sequence of these two barriers in this particular header,
>>>> hence probably a good idea to factor this out into a common helper.
>>>>>> +
>>>>> +static inline void queue_pte_barriers(void)
>>>>> +{
>>>>> +	if (test_thread_flag(TIF_KMAP_UPDATE_ACTIVE)) {
>>>>> +		if (!test_thread_flag(TIF_KMAP_UPDATE_PENDING))
>>>>> +			set_thread_flag(TIF_KMAP_UPDATE_PENDING);
>>>>> +	} else
>>>>> +		emit_pte_barriers();
>>>>> +}
>>>>> +
>>>>> +#define arch_update_kernel_mappings_begin arch_update_kernel_mappings_begin
>>>>> +static inline void arch_update_kernel_mappings_begin(unsigned long start,
>>>>> +						     unsigned long end)
>>>>> +{
>>>>> +	set_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
>>>>> +}
>>>>> +
>>>>> +#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end
>>>>> +static inline void arch_update_kernel_mappings_end(unsigned long start,
>>>>> +						   unsigned long end,
>>>>> +						   pgtbl_mod_mask mask)
>>>>> +{
>>>>> +	if (test_thread_flag(TIF_KMAP_UPDATE_PENDING))
>>>>> +		emit_pte_barriers();
>>>>> +
>>>>> +	clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
>>>>> +	clear_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
>>>>> +}
>>>>>  
>>>>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>>>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>>>>> @@ -323,10 +358,8 @@ static inline void __set_pte_complete(pte_t pte)
>>>>>  	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
>>>>>  	 * or update_mmu_cache() have the necessary barriers.
>>>>>  	 */
>>>>> -	if (pte_valid_not_user(pte)) {
>>>>> -		dsb(ishst);
>>>>> -		isb();
>>>>> -	}
>>>>> +	if (pte_valid_not_user(pte))
>>>>> +		queue_pte_barriers();
>>>>>  }
>>>>>  
>>>>>  static inline void __set_pte(pte_t *ptep, pte_t pte)
>>>>> @@ -791,10 +824,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>>>>>  
>>>>>  	WRITE_ONCE(*pmdp, pmd);
>>>>>  
>>>>> -	if (pmd_valid_not_user(pmd)) {
>>>>> -		dsb(ishst);
>>>>> -		isb();
>>>>> -	}
>>>>> +	if (pmd_valid_not_user(pmd))
>>>>> +		queue_pte_barriers();
>>>>>  }
>>>>>  
>>>>>  static inline void pmd_clear(pmd_t *pmdp)
>>>>> @@ -869,10 +900,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
>>>>>  
>>>>>  	WRITE_ONCE(*pudp, pud);
>>>>>  
>>>>> -	if (pud_valid_not_user(pud)) {
>>>>> -		dsb(ishst);
>>>>> -		isb();
>>>>> -	}
>>>>> +	if (pud_valid_not_user(pud))
>>>>> +		queue_pte_barriers();
>>>>>  }
>>>>>  
>>>>>  static inline void pud_clear(pud_t *pudp)
>>>>> @@ -960,10 +989,8 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
>>>>>  
>>>>>  	WRITE_ONCE(*p4dp, p4d);
>>>>>  
>>>>> -	if (p4d_valid_not_user(p4d)) {
>>>>> -		dsb(ishst);
>>>>> -		isb();
>>>>> -	}
>>>>> +	if (p4d_valid_not_user(p4d))
>>>>> +		queue_pte_barriers();
>>>>>  }
>>>>>  
>>>>>  static inline void p4d_clear(p4d_t *p4dp)
>>>>> @@ -1098,10 +1125,8 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
>>>>>  
>>>>>  	WRITE_ONCE(*pgdp, pgd);
>>>>>  
>>>>> -	if (pgd_valid_not_user(pgd)) {
>>>>> -		dsb(ishst);
>>>>> -		isb();
>>>>> -	}
>>>>> +	if (pgd_valid_not_user(pgd))
>>>>> +		queue_pte_barriers();
>>>>>  }
>>>>>  
>>>>>  static inline void pgd_clear(pgd_t *pgdp)
>>>>> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
>>>>> index 1114c1c3300a..382d2121261e 100644
>>>>> --- a/arch/arm64/include/asm/thread_info.h
>>>>> +++ b/arch/arm64/include/asm/thread_info.h
>>>>> @@ -82,6 +82,8 @@ void arch_setup_new_exec(void);
>>>>>  #define TIF_SME_VL_INHERIT	28	/* Inherit SME vl_onexec across exec */
>>>>>  #define TIF_KERNEL_FPSTATE	29	/* Task is in a kernel mode FPSIMD section */
>>>>>  #define TIF_TSC_SIGSEGV		30	/* SIGSEGV on counter-timer access */
>>>>> +#define TIF_KMAP_UPDATE_ACTIVE	31	/* kernel map update in progress */
>>>>> +#define TIF_KMAP_UPDATE_PENDING	32	/* kernel map updated with deferred barriers */
>>>>>  
>>>>>  #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
>>>>>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
>>>>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>>>>> index 42faebb7b712..1367ec6407d1 100644
>>>>> --- a/arch/arm64/kernel/process.c
>>>>> +++ b/arch/arm64/kernel/process.c
>>>>> @@ -680,10 +680,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
>>>>>  	gcs_thread_switch(next);
>>>>>  
>>>>>  	/*
>>>>> -	 * Complete any pending TLB or cache maintenance on this CPU in case
>>>>> -	 * the thread migrates to a different CPU.
>>>>> -	 * This full barrier is also required by the membarrier system
>>>>> -	 * call.
>>>>> +	 * Complete any pending TLB or cache maintenance on this CPU in case the
>>>>> +	 * thread migrates to a different CPU. This full barrier is also
>>>>> +	 * required by the membarrier system call. Additionally it is required
>>>>> +	 * for TIF_KMAP_UPDATE_PENDING, see below.
>>>>>  	 */
>>>>>  	dsb(ish);
>>>>>  
>>>>> @@ -696,6 +696,18 @@ struct task_struct *__switch_to(struct task_struct *prev,
>>>>>  	/* avoid expensive SCTLR_EL1 accesses if no change */
>>>>>  	if (prev->thread.sctlr_user != next->thread.sctlr_user)
>>>>>  		update_sctlr_el1(next->thread.sctlr_user);
>>>>> +	else if (unlikely(test_thread_flag(TIF_KMAP_UPDATE_PENDING))) {
>>>>> +		/*
>>>>> +		 * In unlikely event that a kernel map update is on-going when
>>>>> +		 * preemption occurs, we must emit_pte_barriers() if pending.
>>>>> +		 * emit_pte_barriers() consists of "dsb(ishst); isb();". The dsb
>>>>> +		 * is already handled above. The isb() is handled if
>>>>> +		 * update_sctlr_el1() was called. So only need to emit isb()
>>>>> +		 * here if it wasn't called.
>>>>> +		 */
>>>>> +		isb();
>>>>> +		clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
>>>>> +	}
>>>>>  
>>>>>  	/* the actual thread switch */
>>>>>  	last = cpu_switch_to(prev, next);
>>>
>
Ryan Roberts Feb. 17, 2025, 9:40 a.m. UTC | #6
On 17/02/2025 04:48, Anshuman Khandual wrote:
> 
> 
> On 2/13/25 15:08, Ryan Roberts wrote:
>> On 13/02/2025 05:30, Anshuman Khandual wrote:
>>>
>>>
>>> On 2/10/25 16:42, Ryan Roberts wrote:
>>>> On 10/02/2025 08:03, Anshuman Khandual wrote:
>>>>>
>>>>>
>>>>> On 2/5/25 20:39, Ryan Roberts wrote:
>>>>>> Because the kernel can't tolerate page faults for kernel mappings, when
>>>>>> setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
>>>>>> dsb(ishst) to ensure that the store to the pgtable is observed by the
>>>>>> table walker immediately. Additionally it emits an isb() to ensure that
>>>>>> any already speculatively determined invalid mapping fault gets
>>>>>> canceled.> 
>>>>>> We can improve the performance of vmalloc operations by batching these
>>>>>> barriers until the end of a set up entry updates. The newly added
>>>>>> arch_update_kernel_mappings_begin() / arch_update_kernel_mappings_end()
>>>>>> provide the required hooks.
>>>>>>
>>>>>> vmalloc improves by up to 30% as a result.
>>>>>>
>>>>>> Two new TIF_ flags are created; TIF_KMAP_UPDATE_ACTIVE tells us if we
>>>>>> are in the batch mode and can therefore defer any barriers until the end
>>>>>> of the batch. TIF_KMAP_UPDATE_PENDING tells us if barriers are queued to
>>>>>> be emited at the end of the batch.
>>>>>
>>>>> Why cannot this be achieved with a single TIF_KMAP_UPDATE_ACTIVE which is
>>>>> set in __begin(), cleared in __end() and saved across a __switch_to().
>>>>
>>>> So unconditionally emit the barriers in _end(), and emit them in __switch_to()
>>>> if TIF_KMAP_UPDATE_ACTIVE is set?
>>>
>>> Right.
>>>
>>>>
>>>> I guess if calling _begin() then you are definitely going to be setting at least
>>>> 1 PTE. So you can definitely emit the barriers unconditionally. I was trying to
>>>> protect against the case where you get pre-empted (potentially multiple times)
>>>> while in the loop. The TIF_KMAP_UPDATE_PENDING flag ensures you only emit the
>>>> barriers when you definitely need to. Without it, you would have to emit on
>>>> every pre-emption even if no more PTEs got set.
>>>>
>>>> But I suspect this is a premature optimization. Probably it will never occur. So
>>>
>>> Agreed.
>>
>> Having done this simplification, I've just noticed that one of the
>> arch_update_kernel_mappings_begin/end callsites is __apply_to_page_range() which
>> gets called for user space mappings as well as kernel mappings. So actually with
>> the simplification I'll be emitting barriers even when only user space mappings
>> were touched.
> 
> Right, that will not be desirable.
> 
>>
>> I think there are a couple of options to fix this:
>>
>> - Revert to the 2 flag approach. For the user space case, I'll get to _end() and
>> notice that no barriers are queued so will emit nothing.
>>
>> - Only set TIF_KMAP_UPDATE_ACTIVE if the address range passed to _begin() is a
>> kernel address range. I guess that's just a case of checking if the MSB is set
>> in "end"?
>>
>> - pass mm to _begin() and only set TIF_KMAP_UPDATE_ACTIVE if mm == &init_mm. I
>> guess this should be the same as option 2.
>>
>> I'm leaning towards option 2. But I have a niggling feeling that my proposed
>> check isn't quite correct. What do you think?
> 
> Option 2 and 3 looks better than the two flags approach proposed earlier. But is
> not option 3 bit more simplistic than option 2 ? Does getting struct mm argument
> into these function create more code churn ?

Actually, looking at this again, I think the best option is, when called in
the context of __apply_to_page_range(), to only call
arch_update_kernel_mappings_[begin|end]() if mm == &init_mm. The function is
explicitly for "kernel mappings", so it doesn't make sense to call it for user
mappings.
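
In __apply_to_page_range() that would look roughly like this (sketch, untested;
start/end cover the range being applied and the existing page table walk sits
in between):

	if (mm == &init_mm)
		arch_update_kernel_mappings_begin(start, end);

	/* ... existing pgd/p4d/pud/pmd walk, accumulating "mask" ... */

	if (mm == &init_mm)
		arch_update_kernel_mappings_end(start, end, mask);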

Looking at the current implementations of arch_sync_kernel_mappings(), they are
filtering on kernel addresses anyway, so this should be safe.

Thanks,
Ryan

Patch

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index ff358d983583..1ee9b9588502 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -39,6 +39,41 @@ 
 #include <linux/mm_types.h>
 #include <linux/sched.h>
 #include <linux/page_table_check.h>
+#include <linux/pgtable_modmask.h>
+
+static inline void emit_pte_barriers(void)
+{
+	dsb(ishst);
+	isb();
+}
+
+static inline void queue_pte_barriers(void)
+{
+	if (test_thread_flag(TIF_KMAP_UPDATE_ACTIVE)) {
+		if (!test_thread_flag(TIF_KMAP_UPDATE_PENDING))
+			set_thread_flag(TIF_KMAP_UPDATE_PENDING);
+	} else
+		emit_pte_barriers();
+}
+
+#define arch_update_kernel_mappings_begin arch_update_kernel_mappings_begin
+static inline void arch_update_kernel_mappings_begin(unsigned long start,
+						     unsigned long end)
+{
+	set_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
+}
+
+#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end
+static inline void arch_update_kernel_mappings_end(unsigned long start,
+						   unsigned long end,
+						   pgtbl_mod_mask mask)
+{
+	if (test_thread_flag(TIF_KMAP_UPDATE_PENDING))
+		emit_pte_barriers();
+
+	clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
+	clear_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
+}
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
@@ -323,10 +358,8 @@  static inline void __set_pte_complete(pte_t pte)
 	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
 	 * or update_mmu_cache() have the necessary barriers.
 	 */
-	if (pte_valid_not_user(pte)) {
-		dsb(ishst);
-		isb();
-	}
+	if (pte_valid_not_user(pte))
+		queue_pte_barriers();
 }
 
 static inline void __set_pte(pte_t *ptep, pte_t pte)
@@ -791,10 +824,8 @@  static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 
 	WRITE_ONCE(*pmdp, pmd);
 
-	if (pmd_valid_not_user(pmd)) {
-		dsb(ishst);
-		isb();
-	}
+	if (pmd_valid_not_user(pmd))
+		queue_pte_barriers();
 }
 
 static inline void pmd_clear(pmd_t *pmdp)
@@ -869,10 +900,8 @@  static inline void set_pud(pud_t *pudp, pud_t pud)
 
 	WRITE_ONCE(*pudp, pud);
 
-	if (pud_valid_not_user(pud)) {
-		dsb(ishst);
-		isb();
-	}
+	if (pud_valid_not_user(pud))
+		queue_pte_barriers();
 }
 
 static inline void pud_clear(pud_t *pudp)
@@ -960,10 +989,8 @@  static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
 
 	WRITE_ONCE(*p4dp, p4d);
 
-	if (p4d_valid_not_user(p4d)) {
-		dsb(ishst);
-		isb();
-	}
+	if (p4d_valid_not_user(p4d))
+		queue_pte_barriers();
 }
 
 static inline void p4d_clear(p4d_t *p4dp)
@@ -1098,10 +1125,8 @@  static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
 
 	WRITE_ONCE(*pgdp, pgd);
 
-	if (pgd_valid_not_user(pgd)) {
-		dsb(ishst);
-		isb();
-	}
+	if (pgd_valid_not_user(pgd))
+		queue_pte_barriers();
 }
 
 static inline void pgd_clear(pgd_t *pgdp)
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 1114c1c3300a..382d2121261e 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -82,6 +82,8 @@  void arch_setup_new_exec(void);
 #define TIF_SME_VL_INHERIT	28	/* Inherit SME vl_onexec across exec */
 #define TIF_KERNEL_FPSTATE	29	/* Task is in a kernel mode FPSIMD section */
 #define TIF_TSC_SIGSEGV		30	/* SIGSEGV on counter-timer access */
+#define TIF_KMAP_UPDATE_ACTIVE	31	/* kernel map update in progress */
+#define TIF_KMAP_UPDATE_PENDING	32	/* kernel map updated with deferred barriers */
 
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 42faebb7b712..1367ec6407d1 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -680,10 +680,10 @@  struct task_struct *__switch_to(struct task_struct *prev,
 	gcs_thread_switch(next);
 
 	/*
-	 * Complete any pending TLB or cache maintenance on this CPU in case
-	 * the thread migrates to a different CPU.
-	 * This full barrier is also required by the membarrier system
-	 * call.
+	 * Complete any pending TLB or cache maintenance on this CPU in case the
+	 * thread migrates to a different CPU. This full barrier is also
+	 * required by the membarrier system call. Additionally it is required
+	 * for TIF_KMAP_UPDATE_PENDING, see below.
 	 */
 	dsb(ish);
 
@@ -696,6 +696,18 @@  struct task_struct *__switch_to(struct task_struct *prev,
 	/* avoid expensive SCTLR_EL1 accesses if no change */
 	if (prev->thread.sctlr_user != next->thread.sctlr_user)
 		update_sctlr_el1(next->thread.sctlr_user);
+	else if (unlikely(test_thread_flag(TIF_KMAP_UPDATE_PENDING))) {
+		/*
+		 * In the unlikely event that a kernel map update is ongoing when
+		 * preemption occurs, we must emit_pte_barriers() if pending.
+		 * emit_pte_barriers() consists of "dsb(ishst); isb();". The dsb
+		 * is already handled above. The isb() is handled if
+		 * update_sctlr_el1() was called. So we only need to emit isb()
+		 * here if it wasn't called.
+		 */
+		isb();
+		clear_thread_flag(TIF_KMAP_UPDATE_PENDING);
+	}
 
 	/* the actual thread switch */
 	last = cpu_switch_to(prev, next);