Message ID | 20240920035506.346316-1-wangkefeng.wang@huawei.com (mailing list archive)
---|---
State | New, archived
Series | [v2] arm64: optimize flush tlb kernel range
On 9/20/24 09:25, Kefeng Wang wrote:
> Currently the kernel TLBs is flushed page by page if the target
> VA range is less than MAX_DVM_OPS * PAGE_SIZE, otherwise we'll
> brutally issue a TLBI ALL.
> 
> But we could optimize it when CPU supports TLB range operations,
> convert to use __flush_tlb_range_op() like other tlb range flush
> to improve performance.
> 
> Co-developed-by: Yicong Yang <yangyicong@hisilicon.com>
> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> ---
> v2:
> - address Catalin's comments and use __flush_tlb_range_op() directly
> 
>  arch/arm64/include/asm/tlbflush.h | 24 +++++++++++++++++-------
>  1 file changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index 95fbc8c05607..42f0ec14fb2c 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -492,19 +492,29 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
> 
>  static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  {
> -        unsigned long addr;
> +        const unsigned long stride = PAGE_SIZE;
> +        unsigned long pages;
> +
> +        start = round_down(start, stride);
> +        end = round_up(end, stride);
> +        pages = (end - start) >> PAGE_SHIFT;
> 
> -        if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
> +        /*
> +         * When not uses TLB range ops, we can handle up to
> +         * (MAX_DVM_OPS - 1) pages;
> +         * When uses TLB range ops, we can handle up to
> +         * MAX_TLBI_RANGE_PAGES pages.
> +         */
> +        if ((!system_supports_tlb_range() &&
> +             (end - start) >= (MAX_DVM_OPS * stride)) ||
> +            pages > MAX_TLBI_RANGE_PAGES) {
>                  flush_tlb_all();
>                  return;
>          }

Could the above conditional check for flush_tlb_all() be factored out
in a helper, which can also be used in __flush_tlb_range_nosync() ?

> 
> -        start = __TLBI_VADDR(start, 0);
> -        end = __TLBI_VADDR(end, 0);
> -
>          dsb(ishst);
> -        for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
> -                __tlbi(vaale1is, addr);
> +        __flush_tlb_range_op(vaale1is, start, pages, stride, 0,
> +                             TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
>          dsb(ish);
>          isb();
>  }
On 2024/9/20 14:10, Anshuman Khandual wrote:
> 
> 
> On 9/20/24 09:25, Kefeng Wang wrote:
>> Currently the kernel TLBs is flushed page by page if the target
>> VA range is less than MAX_DVM_OPS * PAGE_SIZE, otherwise we'll
>> brutally issue a TLBI ALL.
>>
>> But we could optimize it when CPU supports TLB range operations,
>> convert to use __flush_tlb_range_op() like other tlb range flush
>> to improve performance.
>>
>> Co-developed-by: Yicong Yang <yangyicong@hisilicon.com>
>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>> ---
>> v2:
>> - address Catalin's comments and use __flush_tlb_range_op() directly
>>
>>  arch/arm64/include/asm/tlbflush.h | 24 +++++++++++++++++-------
>>  1 file changed, 17 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>> index 95fbc8c05607..42f0ec14fb2c 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -492,19 +492,29 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
>>
>>  static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>>  {
>> -        unsigned long addr;
>> +        const unsigned long stride = PAGE_SIZE;
>> +        unsigned long pages;
>> +
>> +        start = round_down(start, stride);
>> +        end = round_up(end, stride);
>> +        pages = (end - start) >> PAGE_SHIFT;
>>
>> -        if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
>> +        /*
>> +         * When not uses TLB range ops, we can handle up to
>> +         * (MAX_DVM_OPS - 1) pages;
>> +         * When uses TLB range ops, we can handle up to
>> +         * MAX_TLBI_RANGE_PAGES pages.
>> +         */
>> +        if ((!system_supports_tlb_range() &&
>> +             (end - start) >= (MAX_DVM_OPS * stride)) ||
>> +            pages > MAX_TLBI_RANGE_PAGES) {
>>                  flush_tlb_all();
>>                  return;
>>          }
>
> Could the above conditional check for flush_tlb_all() be factored out
> in a helper, which can also be used in __flush_tlb_range_nosync() ?

How about adding this helper, not good at naming,

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 42f0ec14fb2c..b7043ff0945f 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -431,6 +431,23 @@ do { \
 #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
         __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
 
+static inline int __flush_tlb_range_limit_excess(unsigned long start,
+                unsigned long end, unsigned long pages, unsigned long stride)
+{
+        /*
+         * When not uses TLB range ops, we can handle up to
+         * (MAX_DVM_OPS - 1) pages;
+         * When uses TLB range ops, we can handle up to
+         * MAX_TLBI_RANGE_PAGES pages.
+         */
+        if ((!system_supports_tlb_range() &&
+             (end - start) >= (MAX_DVM_OPS * stride)) ||
+            pages > MAX_TLBI_RANGE_PAGES)
+                return -ERANGE;
+
+        return 0;
+}
+
 static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
                                      unsigned long start, unsigned long end,
                                      unsigned long stride, bool last_level,
@@ -442,15 +459,7 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
         end = round_up(end, stride);
         pages = (end - start) >> PAGE_SHIFT;
 
-        /*
-         * When not uses TLB range ops, we can handle up to
-         * (MAX_DVM_OPS - 1) pages;
-         * When uses TLB range ops, we can handle up to
-         * MAX_TLBI_RANGE_PAGES pages.
-         */
-        if ((!system_supports_tlb_range() &&
-             (end - start) >= (MAX_DVM_OPS * stride)) ||
-            pages > MAX_TLBI_RANGE_PAGES) {
+        if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
                 flush_tlb_mm(vma->vm_mm);
                 return;
         }
On 9/20/24 16:47, Kefeng Wang wrote:
> 
> 
> On 2024/9/20 14:10, Anshuman Khandual wrote:
>>
>>
>> On 9/20/24 09:25, Kefeng Wang wrote:
>>> Currently the kernel TLBs is flushed page by page if the target
>>> VA range is less than MAX_DVM_OPS * PAGE_SIZE, otherwise we'll
>>> brutally issue a TLBI ALL.
>>>
>>> But we could optimize it when CPU supports TLB range operations,
>>> convert to use __flush_tlb_range_op() like other tlb range flush
>>> to improve performance.
>>>
>>> Co-developed-by: Yicong Yang <yangyicong@hisilicon.com>
>>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>> ---
>>> v2:
>>> - address Catalin's comments and use __flush_tlb_range_op() directly
>>>
>>> arch/arm64/include/asm/tlbflush.h | 24 +++++++++++++++++-------
>>> 1 file changed, 17 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>>> index 95fbc8c05607..42f0ec14fb2c 100644
>>> --- a/arch/arm64/include/asm/tlbflush.h
>>> +++ b/arch/arm64/include/asm/tlbflush.h
>>> @@ -492,19 +492,29 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
>>> static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>>> {
>>> -        unsigned long addr;
>>> +        const unsigned long stride = PAGE_SIZE;
>>> +        unsigned long pages;
>>> +
>>> +        start = round_down(start, stride);
>>> +        end = round_up(end, stride);
>>> +        pages = (end - start) >> PAGE_SHIFT;
>>> -        if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
>>> +        /*
>>> +         * When not uses TLB range ops, we can handle up to
>>> +         * (MAX_DVM_OPS - 1) pages;
>>> +         * When uses TLB range ops, we can handle up to
>>> +         * MAX_TLBI_RANGE_PAGES pages.
>>> +         */
>>> +        if ((!system_supports_tlb_range() &&
>>> +             (end - start) >= (MAX_DVM_OPS * stride)) ||
>>> +            pages > MAX_TLBI_RANGE_PAGES) {
>>>                  flush_tlb_all();
>>>                  return;
>>>          }
>>
>> Could the above conditional check for flush_tlb_all() be factored out
>> in a helper, which can also be used in __flush_tlb_range_nosync() ?
> 
> How about adding this helper, not good at naming,
> 
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index 42f0ec14fb2c..b7043ff0945f 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -431,6 +431,23 @@ do { \
>  #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>          __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
> 
> +static inline int __flush_tlb_range_limit_excess(unsigned long start,
> +                unsigned long end, unsigned long pages, unsigned long stride)

Helper name sounds just fine.

> +{
> +        /*
> +         * When not uses TLB range ops, we can handle up to
> +         * (MAX_DVM_OPS - 1) pages;
> +         * When uses TLB range ops, we can handle up to
> +         * MAX_TLBI_RANGE_PAGES pages.
> +         */

This could be re-worded some what, something like this or
may be you could make it better.

        /*
         * When the system does not support TLB range based flush
         * operation, (MAX_DVM_OPS - 1) pages can be handled. But
         * with TLB range based operation, MAX_TLBI_RANGE_PAGES
         * pages can be handled.
         */

> +        if ((!system_supports_tlb_range() &&
> +             (end - start) >= (MAX_DVM_OPS * stride)) ||
> +            pages > MAX_TLBI_RANGE_PAGES)
> +                return -ERANGE;
> +
> +        return 0;
> +}
> +
>  static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>                                       unsigned long start, unsigned long end,
>                                       unsigned long stride, bool last_level,
> @@ -442,15 +459,7 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>          end = round_up(end, stride);
>          pages = (end - start) >> PAGE_SHIFT;
> 
> -        /*
> -         * When not uses TLB range ops, we can handle up to
> -         * (MAX_DVM_OPS - 1) pages;
> -         * When uses TLB range ops, we can handle up to
> -         * MAX_TLBI_RANGE_PAGES pages.
> -         */
> -        if ((!system_supports_tlb_range() &&
> -             (end - start) >= (MAX_DVM_OPS * stride)) ||
> -            pages > MAX_TLBI_RANGE_PAGES) {
> +        if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
>                  flush_tlb_mm(vma->vm_mm);
>                  return;
>          }
> 

But yes, this factored out helper should now be used in flush_tlb_kernel_range() as well.
On 2024/9/23 12:53, Anshuman Khandual wrote:
> 
> 
> On 9/20/24 16:47, Kefeng Wang wrote:
>>
>>
>> On 2024/9/20 14:10, Anshuman Khandual wrote:
>>>
>>>
>>> On 9/20/24 09:25, Kefeng Wang wrote:
>>>> Currently the kernel TLBs is flushed page by page if the target
>>>> VA range is less than MAX_DVM_OPS * PAGE_SIZE, otherwise we'll
>>>> brutally issue a TLBI ALL.
>>>>
>>>> But we could optimize it when CPU supports TLB range operations,
>>>> convert to use __flush_tlb_range_op() like other tlb range flush
>>>> to improve performance.
>>>>
>>>> Co-developed-by: Yicong Yang <yangyicong@hisilicon.com>
>>>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>> ---
>>>> v2:
>>>> - address Catalin's comments and use __flush_tlb_range_op() directly
>>>>
>>>> arch/arm64/include/asm/tlbflush.h | 24 +++++++++++++++++-------
>>>> 1 file changed, 17 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>>>> index 95fbc8c05607..42f0ec14fb2c 100644
>>>> --- a/arch/arm64/include/asm/tlbflush.h
>>>> +++ b/arch/arm64/include/asm/tlbflush.h
>>>> @@ -492,19 +492,29 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
>>>> static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>>>> {
>>>> -        unsigned long addr;
>>>> +        const unsigned long stride = PAGE_SIZE;
>>>> +        unsigned long pages;
>>>> +
>>>> +        start = round_down(start, stride);
>>>> +        end = round_up(end, stride);
>>>> +        pages = (end - start) >> PAGE_SHIFT;
>>>> -        if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
>>>> +        /*
>>>> +         * When not uses TLB range ops, we can handle up to
>>>> +         * (MAX_DVM_OPS - 1) pages;
>>>> +         * When uses TLB range ops, we can handle up to
>>>> +         * MAX_TLBI_RANGE_PAGES pages.
>>>> +         */
>>>> +        if ((!system_supports_tlb_range() &&
>>>> +             (end - start) >= (MAX_DVM_OPS * stride)) ||
>>>> +            pages > MAX_TLBI_RANGE_PAGES) {
>>>>                  flush_tlb_all();
>>>>                  return;
>>>>          }
>>>
>>> Could the above conditional check for flush_tlb_all() be factored out
>>> in a helper, which can also be used in __flush_tlb_range_nosync() ?
>>
>> How about adding this helper, not good at naming,
>>
>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>> index 42f0ec14fb2c..b7043ff0945f 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -431,6 +431,23 @@ do { \
>>  #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>>          __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
>>
>> +static inline int __flush_tlb_range_limit_excess(unsigned long start,
>> +                unsigned long end, unsigned long pages, unsigned long stride)
> 
> Helper name sounds just fine.
> 
>> +{
>> +        /*
>> +         * When not uses TLB range ops, we can handle up to
>> +         * (MAX_DVM_OPS - 1) pages;
>> +         * When uses TLB range ops, we can handle up to
>> +         * MAX_TLBI_RANGE_PAGES pages.
>> +         */
> 
> This could be re-worded some what, something like this or
> may be you could make it better.
> 
>         /*
>          * When the system does not support TLB range based flush
>          * operation, (MAX_DVM_OPS - 1) pages can be handled. But
>          * with TLB range based operation, MAX_TLBI_RANGE_PAGES
>          * pages can be handled.
>          */
> 

Thanks for your advise, this is better.

>> +        if ((!system_supports_tlb_range() &&
>> +             (end - start) >= (MAX_DVM_OPS * stride)) ||
>> +            pages > MAX_TLBI_RANGE_PAGES)
>> +                return -ERANGE;
>> +
>> +        return 0;
>> +}
>> +
>>  static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>>                                       unsigned long start, unsigned long end,
>>                                       unsigned long stride, bool last_level,
>> @@ -442,15 +459,7 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>>          end = round_up(end, stride);
>>          pages = (end - start) >> PAGE_SHIFT;
>>
>> -        /*
>> -         * When not uses TLB range ops, we can handle up to
>> -         * (MAX_DVM_OPS - 1) pages;
>> -         * When uses TLB range ops, we can handle up to
>> -         * MAX_TLBI_RANGE_PAGES pages.
>> -         */
>> -        if ((!system_supports_tlb_range() &&
>> -             (end - start) >= (MAX_DVM_OPS * stride)) ||
>> -            pages > MAX_TLBI_RANGE_PAGES) {
>> +        if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
>>                  flush_tlb_mm(vma->vm_mm);
>>                  return;
>>          }
>>
> 
> But yes, this factored out helper should now be used in flush_tlb_kernel_range() as well.
> 

Yes, will re-post with the helper.
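For reference, a minimal sketch of what the consolidated flush_tlb_kernel_range() could look like once the __flush_tlb_range_limit_excess() helper is used there as well. This is assembled from the diffs in this thread, not the actual re-posted patch, so the exact form may differ:

static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
        const unsigned long stride = PAGE_SIZE;
        unsigned long pages;

        start = round_down(start, stride);
        end = round_up(end, stride);
        pages = (end - start) >> PAGE_SHIFT;

        /* Range too large for page/range TLBI: invalidate the whole TLB. */
        if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
                flush_tlb_all();
                return;
        }

        dsb(ishst);
        /* Per-page VAALE1IS, or a range-based TLBI when the CPU supports it. */
        __flush_tlb_range_op(vaale1is, start, pages, stride, 0,
                             TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
        dsb(ish);
        isb();
}

Keeping the limit check in one helper also keeps the kernel-range and user-range paths in sync if those limits ever change.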
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 95fbc8c05607..42f0ec14fb2c 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -492,19 +492,29 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 
 static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-        unsigned long addr;
+        const unsigned long stride = PAGE_SIZE;
+        unsigned long pages;
+
+        start = round_down(start, stride);
+        end = round_up(end, stride);
+        pages = (end - start) >> PAGE_SHIFT;
 
-        if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
+        /*
+         * When not uses TLB range ops, we can handle up to
+         * (MAX_DVM_OPS - 1) pages;
+         * When uses TLB range ops, we can handle up to
+         * MAX_TLBI_RANGE_PAGES pages.
+         */
+        if ((!system_supports_tlb_range() &&
+             (end - start) >= (MAX_DVM_OPS * stride)) ||
+            pages > MAX_TLBI_RANGE_PAGES) {
                 flush_tlb_all();
                 return;
         }
 
-        start = __TLBI_VADDR(start, 0);
-        end = __TLBI_VADDR(end, 0);
-
         dsb(ishst);
-        for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
-                __tlbi(vaale1is, addr);
+        __flush_tlb_range_op(vaale1is, start, pages, stride, 0,
+                             TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
         dsb(ish);
         isb();
 }
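As a rough illustration of the flush-strategy decision the patch introduces, here is a small stand-alone C model. The constant values, the helper name pick_strategy() and the enum are assumptions made purely for the example; the real MAX_DVM_OPS and MAX_TLBI_RANGE_PAGES come from the arm64 headers and depend on kernel configuration:

/* Stand-alone model of the flush-strategy choice; constants are illustrative. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT           12
#define PAGE_SIZE            (1UL << PAGE_SHIFT)
#define MAX_DVM_OPS          512UL        /* illustrative value, not the kernel's definition */
#define MAX_TLBI_RANGE_PAGES (1UL << 21)  /* illustrative limit for range-based TLBI */

enum strategy { FLUSH_ALL, FLUSH_PER_PAGE, FLUSH_RANGE_OP };

static enum strategy pick_strategy(unsigned long start, unsigned long end,
                                   bool has_tlb_range)
{
        unsigned long pages = (end - start) >> PAGE_SHIFT;

        /* Same shape as the patch: too large a range falls back to TLBI ALL. */
        if ((!has_tlb_range && (end - start) >= (MAX_DVM_OPS * PAGE_SIZE)) ||
            pages > MAX_TLBI_RANGE_PAGES)
                return FLUSH_ALL;

        return has_tlb_range ? FLUSH_RANGE_OP : FLUSH_PER_PAGE;
}

int main(void)
{
        /* 4 MiB kernel range on a CPU with range TLBI support: uses the range op. */
        printf("%d\n", pick_strategy(0xffff800000000000UL,
                                     0xffff800000400000UL, true));
        return 0;
}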