diff mbox series

[v4,4/5] riscv: rewrite tlb flush for performance

Message ID d60a62cfbbf63382a47e3c2226c5dd6148f8b814.1553647082.git.gary@garyguo.net (mailing list archive)
State New, archived
Headers show
Series TLB/I$ flush cleanups and improvements | expand

Commit Message

Gary Guo March 27, 2019, 12:41 a.m. UTC
From: Gary Guo <gary@garyguo.net>

This patch rewrites the logic related to TLB flushing, both to clean up
the code and to improve performance.

We now use the sfence.vma variant with a specified ASID and virtual
address whenever possible.  Even though only ASID 0 is used, this still
improves performance by preventing global mappings from being flushed
from the TLB.

Signed-off-by: Gary Guo <gary@garyguo.net>
Tested-by: Atish Patra <atish.patra@wdc.com>
---
 .../admin-guide/kernel-parameters.rst         |   1 +
 .../admin-guide/kernel-parameters.txt         |   8 ++
 arch/riscv/include/asm/pgtable.h              |   2 +-
 arch/riscv/include/asm/tlbflush.h             |  76 +++++------
 arch/riscv/mm/Makefile                        |   1 +
 arch/riscv/mm/context.c                       |   8 +-
 arch/riscv/mm/init.c                          |   2 +-
 arch/riscv/mm/tlbflush.c                      | 128 ++++++++++++++++++
 8 files changed, 178 insertions(+), 48 deletions(-)
 create mode 100644 arch/riscv/mm/tlbflush.c

Comments

Christoph Hellwig March 27, 2019, 7:25 a.m. UTC | #1
> @@ -27,53 +19,47 @@ static inline void local_flush_tlb_all(void)
>  	__asm__ __volatile__ ("sfence.vma" : : : "memory");
>  }
>  
> -/* Flush one page from local TLB */
> -static inline void local_flush_tlb_page(unsigned long addr)
> +static inline void local_flush_tlb_mm(struct mm_struct *mm)
>  {
> -	__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
> +	/* Flush ASID 0 so that global mappings are not affected */
> +	__asm__ __volatile__ ("sfence.vma x0, %0" : : "r" (0) : "memory");
>  }
>  
> -#ifndef CONFIG_SMP
> -
> -#define flush_tlb_all() local_flush_tlb_all()
> -#define flush_tlb_page(vma, addr) local_flush_tlb_page(addr)
> +static inline void local_flush_tlb_page(struct vm_area_struct *vma,
> +	unsigned long addr)
> +{
> +	__asm__ __volatile__ ("sfence.vma %0, %1"
> +			      : : "r" (addr), "r" (0)
> +			      : "memory");
> +}

Why do we pass the vma argument here even though it is never used?  That
just seems to create some rather pointless churn.  Also I'd add
local_flush_tlb_mm below local_flush_tlb_page to avoid churn as well,
never mind that it seems the more logical order to me.

> +void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
> +	unsigned long end);
> +void local_flush_tlb_kernel_range(unsigned long start, unsigned long end);

As far as I can tell these are only used for the !SMP case and only
to implement the versions without the local_ prefix.  In that case we
should just drop the local_ prefix and implement those APIs directly,
and only for !SMP builds.

> +
> +#include <linux/mm.h>
> +#include <asm/sbi.h>
> +
> +#define SFENCE_VMA_FLUSH_ALL ((unsigned long) -1)
> +
> +/*
> + * This controls the maximum number of page-level sfence.vma instructions
> + * that the kernel can issue when it needs to flush a range from the TLB.
> + * If the range size exceeds this threshold, a full sfence.vma is issued.
> + *
> + * Increasing this number can negatively impact performance on
> + * implementations where sfence.vma's address operand is ignored and a global
> + * TLB flush is always performed.  On the other hand, implementations with
> + * page-level TLB flush support can benefit from a larger number.
> + */
> +static unsigned long tlbi_range_threshold = PAGE_SIZE;

I really hate having this as a tunable in the kernel code.  I think
the right answer is to have a device tree entry to carry this number
so that the platform can supply it.  Btw, what are examples of
platforms that flush globally vs per-page at the moment?  What is a good
larger value for the latter based on your testing?

Also I wonder if we should split this tunable and the optional
global flush into a separate patch.  That is, have this first patch
just make use of the ASID, and add the threshold for doing the full
flush in another patch.

> +void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
> +			   unsigned long end)
> +{
> +	if (end - start > tlbi_range_threshold) {
> +		local_flush_tlb_mm(vma->vm_mm);
> +		return;
> +	}
> +
> +	while (start < end) {
> +		__asm__ __volatile__ ("sfence.vma %0, %1"
> +				      : : "r" (start), "r" (0)
> +				      : "memory");

I think this should just call local_flush_tlb_page.

> +		start += PAGE_SIZE;
> +	}

And maybe use a for loop to short cut it a bit:

	for (; start < end; start += PAGE_SIZE)
		local_flush_tlb_page(start);

> +void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
> +{
> +	if (end - start > tlbi_range_threshold) {
> +		local_flush_tlb_all();
> +		return;
> +	}
> +
> +	while (start < end) {
> +		__asm__ __volatile__ ("sfence.vma %0"
> +				      : : "r" (start)
> +				      : "memory");
> +		start += PAGE_SIZE;

Same here, just with local_flush_tlb_kernel_page.
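
Putting the two suggestions together, a minimal sketch of what the range
helper could then look like - assuming local_flush_tlb_page() takes just
an address, as suggested above, and tlbi_range_threshold stays as in the
patch:

	void local_flush_tlb_range(struct vm_area_struct *vma,
				   unsigned long start, unsigned long end)
	{
		if (end - start > tlbi_range_threshold) {
			local_flush_tlb_mm(vma->vm_mm);
			return;
		}

		for (; start < end; start += PAGE_SIZE)
			local_flush_tlb_page(start);
	}

local_flush_tlb_kernel_range() would follow the same pattern with
local_flush_tlb_kernel_page().
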
Gary Guo March 27, 2019, 1:56 p.m. UTC | #2
On 27/03/2019 07:25, Christoph Hellwig wrote:
>> @@ -27,53 +19,47 @@ static inline void local_flush_tlb_all(void)
>>   	__asm__ __volatile__ ("sfence.vma" : : : "memory");
>>   }
>>   
>> -/* Flush one page from local TLB */
>> -static inline void local_flush_tlb_page(unsigned long addr)
>> +static inline void local_flush_tlb_mm(struct mm_struct *mm)
>>   {
>> -	__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
>> +	/* Flush ASID 0 so that global mappings are not affected */
>> +	__asm__ __volatile__ ("sfence.vma x0, %0" : : "r" (0) : "memory");
>>   }
>>   
>> -#ifndef CONFIG_SMP
>> -
>> -#define flush_tlb_all() local_flush_tlb_all()
>> -#define flush_tlb_page(vma, addr) local_flush_tlb_page(addr)
>> +static inline void local_flush_tlb_page(struct vm_area_struct *vma,
>> +	unsigned long addr)
>> +{
>> +	__asm__ __volatile__ ("sfence.vma %0, %1"
>> +			      : : "r" (addr), "r" (0)
>> +			      : "memory");
>> +}
> 
> Why do we pass the vma argument here even though it is never used?  That
> just seems to create some rather pointless churn.  Also I'd add
> local_flush_tlb_mm below local_flush_tlb_page to avoid churn as well,
> never mind that it seems the more logical order to me.
This isn't used now, but we need it for ASID support.  It is also more
consistent with the non-SMP flush signature, and more consistent with
the code of other architectures.
> 
>> +void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
>> +	unsigned long end);
>> +void local_flush_tlb_kernel_range(unsigned long start, unsigned long end);
> 
> As far as I can tell these are only used for the !SMP case and only
> to implement the versions without the local_ prefix.  In that case we
> should just drop the local_ prefix and implement those APIs directly,
> and only for !SMP builds.
> 
Ok, in that case I'll also move it to tlbflush.c.
>> +
>> +#include <linux/mm.h>
>> +#include <asm/sbi.h>
>> +
>> +#define SFENCE_VMA_FLUSH_ALL ((unsigned long) -1)
>> +
>> +/*
>> + * This controls the maximum number of page-level sfence.vma instructions
>> + * that the kernel can issue when it needs to flush a range from the TLB.
>> + * If the range size exceeds this threshold, a full sfence.vma is issued.
>> + *
>> + * Increasing this number can negatively impact performance on
>> + * implementations where sfence.vma's address operand is ignored and a global
>> + * TLB flush is always performed.  On the other hand, implementations with
>> + * page-level TLB flush support can benefit from a larger number.
>> + */
>> +static unsigned long tlbi_range_threshold = PAGE_SIZE;
> 
> I really hate having this as a tunable in the kernel code.  I think
> the right answer is to have a device tree entry to carry this number
> so that the platform can supply it.  Btw, what are examples of
> platforms that flush globally vs per-page at the moment?  What is a good
> larger value for the latter based on your testing?
> 
This was discussed in previous versions of this patch, and we arrived at
the conclusion that a boot parameter is the best way to do it for now, as
at the moment we have no other way to get this information.  The actual
value really depends on the implementation.  If the implementation has a
very large TLB where a full invalidation would be very expensive, it
might even want the value to be 511.
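
For example, such a platform could simply pass

	tlbi_max_ops=511

on the kernel command line (511 being the largest value accepted by
setup_tlbi_max_ops() in this patch, since the value must stay below
PTRS_PER_PTE).
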

> Also I wonder if we should split this tunable and the optional
> global flush into a separate patch.  That is, have this first patch
> just make use of the ASID, and add the threshold for doing the full
> flush in another patch.
I don't think we should.  This patch is more like a rewrite of the old
logic than patching things up incrementally.
> 
>> +void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
>> +			   unsigned long end)
>> +{
>> +	if (end - start > tlbi_range_threshold) {
>> +		local_flush_tlb_mm(vma->vm_mm);
>> +		return;
>> +	}
>> +
>> +	while (start < end) {
>> +		__asm__ __volatile__ ("sfence.vma %0, %1"
>> +				      : : "r" (start), "r" (0)
>> +				      : "memory");
> 
> I think this should just call local_flush_tlb_page.
> 
I do this to minimise the changes we need when we add ASID support (in
which case we want to avoid retrieving the ASID from an atomic variable
multiple times).
>> +		start += PAGE_SIZE;
>> +	}
> 
> And maybe use a for loop to short cut it a bit:
> 
> 	for (; start < end; start += PAGE_SIZE)
> 		local_flush_tlb_page(start);
> 
Ok
>> +void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
>> +{
>> +	if (end - start > tlbi_range_threshold) {
>> +		local_flush_tlb_all();
>> +		return;
>> +	}
>> +
>> +	while (start < end) {
>> +		__asm__ __volatile__ ("sfence.vma %0"
>> +				      : : "r" (start)
>> +				      : "memory");
>> +		start += PAGE_SIZE;
> 
> Same here, just with local_flush_tlb_kernel_page.
> 
Ok
Christoph Hellwig March 28, 2019, 4:17 p.m. UTC | #3
On Wed, Mar 27, 2019 at 01:56:28PM +0000, Gary Guo wrote:
> >> +static inline void local_flush_tlb_page(struct vm_area_struct *vma,
> >> +	unsigned long addr)
> >> +{
> >> +	__asm__ __volatile__ ("sfence.vma %0, %1"
> >> +			      : : "r" (addr), "r" (0)
> >> +			      : "memory");
> >> +}
> > 
> > Why do we pass the vma argument here even though it is never used?  That
> > just seems to create some rather pointless churn.  Also I'd add
> > local_flush_tlb_mm below local_flush_tlb_page to avoid churn as well,
> > never mind that it seems the more logical order to me.
> This isn't used now, but we need it for ASID support.  It is also more
> consistent with the non-SMP flush signature, and more consistent with
> the code of other architectures.

I'd rather keep it simple for now.  For ASID support I suspect you'll
only need it to get the asid from the mm_struct pointed to by the
vma, right?  I'd rather pass the asid directly in that case.
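
For illustration, a minimal sketch of that direction - the helper name
and the explicit asid parameter are just an assumption here, not part of
this patch:

	/* Sketch only: flush one page for an explicitly passed ASID. */
	static inline void local_flush_tlb_page_asid(unsigned long addr,
						     unsigned long asid)
	{
		__asm__ __volatile__ ("sfence.vma %0, %1"
				      : : "r" (addr), "r" (asid)
				      : "memory");
	}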

> >> +static unsigned long tlbi_range_threshold = PAGE_SIZE;
> > 
> > I really hate having this as a tunable in the kernel code.  I think
> > the right answer is to have a device tree entry to carry this number
> > so that the platform can supply it.  Btw, what are examples of
> > platforms that flush globally vs per-page at the moment?  What is a good
> > larger value for the latter based on your testing?
> > 
> This was discussed in previous versions of this patch, and we arrived at
> the conclusion that a boot parameter is the best way to do it for now, as
> at the moment we have no other way to get this information.  The actual
> value really depends on the implementation.  If the implementation has a
> very large TLB where a full invalidation would be very expensive, it
> might even want the value to be 511.

Sorry, I might not have been clear above - the tunable is ok for
playing around and benchmarking, but it is not the kind of interface we
should have regular users poke at to get good performance.  So I don't
mind keeping the parameter in, but we also really need to define a way
for the value to be passed through the device tree so that we get
a good default.

And I'd still like an answer to my second question above - what
were the good values for, say, the SiFive U54 and QEMU in your tests?

> > Also I wonder if we should split this tunable and the optional
> > global flush into a separate patch.  That is, have this first patch
> > just make use of the ASID, and add the threshold for doing the full
> > flush in another patch.
> I don't think we should.  This patch is more like a rewrite of the old
> logic than patching things up incrementally.

Well, we have two pretty distinct changes - one is to use a threshold
to do a global(-ish) flush instead of a per-page one, and the other is
to use ASID 0 explicitly.  In Linux we generally try to keep patches to
the smallest logical change.  I'm not going to push hard for this, but
that is just how we normally do it.


> >> +	while (start < end) {
> >> +		__asm__ __volatile__ ("sfence.vma %0, %1"
> >> +				      : : "r" (start), "r" (0)
> >> +				      : "memory");
> > 
> > I think this should just call local_flush_tlb_page.
> > 
> I do this to minimise the changes we need when we add ASID support (in
> which case we want to avoid retrieving the ASID from an atomic variable
> multiple times).

We can take care of that later, preferably with a nice helper that gets
the ASID as an argument (see my local_flush_tlb_page comment above).
Gary Guo March 28, 2019, 4:39 p.m. UTC | #4
On 28/03/2019 16:17, Christoph Hellwig wrote:
> On Wed, Mar 27, 2019 at 01:56:28PM +0000, Gary Guo wrote:
>>>> +static inline void local_flush_tlb_page(struct vm_area_struct *vma,
>>>> +	unsigned long addr)
>>>> +{
>>>> +	__asm__ __volatile__ ("sfence.vma %0, %1"
>>>> +			      : : "r" (addr), "r" (0)
>>>> +			      : "memory");
>>>> +}
>>>
>>> Why do we pass the vma argument here even though it is never used?  That
>>> just seems to create some rather pointless churn.  Also I'd add
>>> local_flush_tlb_mm below local_flush_tlb_page to avoid churn as well,
>>> never mind that it seems the more logical order to me.
>> This isn't used now, but we need it for ASID support.  It is also more
>> consistent with the non-SMP flush signature, and more consistent with
>> the code of other architectures.
> 
> I'd rather keep it simple for now.  For ASID support I suspect you'll
> only need it to get the asid from the mm_struct pointed to by the
> vma, right?  I'd rather pass the asid directly in that case.
> 

Yes, it just takes it from the mm_struct.  But the key point is to keep
it similar to local_flush_tlb_page.

>>>> +static unsigned long tlbi_range_threshold = PAGE_SIZE;
>>>
>>> I really hate having this as a tunable in the kernel code.  I think
>>> the right answer is to have a device tree entry to carry this number
>>> so that the platform can supply it.  Btw, what are examples of
>>> platforms that flush globally vs per-page at the moment?  What is a good
>>> larger value for the latter based on your testing?
>>>
>> This was discussed in previous versions of this patch, and we arrived at
>> the conclusion that a boot parameter is the best way to do it for now, as
>> at the moment we have no other way to get this information.  The actual
>> value really depends on the implementation.  If the implementation has a
>> very large TLB where a full invalidation would be very expensive, it
>> might even want the value to be 511.
> 
> Sorry, I might not have been clear above - the tunable is ok for
> playing around and benchmarking, but it is not the kind of interface we
> should have regular users poke at to get good performance.  So I don't
> mind keeping the parameter in, but we also really need to define a way
> for the value to be passed through the device tree so that we get
> a good default.
> 
> And I'd still like an answer to my second question above - what
> were the good values for, say, the SiFive U54 and QEMU in your tests?
> 
QEMU currently treats all SFENCE.VMA instructions as global.  Technically
QEMU's implementation could be modified to do a page-level flush instead,
but the performance would not differ, as the dominating factor is
resetting the jump cache.

I don't have a SiFive board so I can't tell what's a good value for that.

On a hypothetical platform that I am working on (simulation only) we can
benefit even when setting it to 511 (the maximum allowed, since with a
value >= 512 we don't know whether a non-leaf entry has changed, in which
case the spec mandates a full flush).

So this really depends on the platform.

>>> Also I wonder if we should split this tunable and the optional
>>> global flush into a separate patch.  That is, have this first patch
>>> just make use of the ASID, and add the threshold for doing the full
>>> flush in another patch.
>> I don't think we should.  This patch is more like a rewrite of the old
>> logic than patching things up incrementally.
> 
> Well, we have two pretty distinct changes - one is to use a threshold
> to do a global(-ish) flush instead of a per-page one, and the other is
> to use ASID 0 explicitly.  In Linux we generally try to keep patches to
> the smallest logical change.  I'm not going to push hard for this, but
> that is just how we normally do it.
> 
> 
>>>> +	while (start < end) {
>>>> +		__asm__ __volatile__ ("sfence.vma %0, %1"
>>>> +				      : : "r" (start), "r" (0)
>>>> +				      : "memory");
>>>
>>> I think this should just call local_flush_tlb_page.
>>>
>> I do this to minimise the changes we need when we add ASID support (in
>> which case we want to avoid retrieving the ASID from an atomic variable
>> multiple times).
> 
> We can take care of that later, preferably with a nice helper that gets
> the ASID as an argument (see my local_flush_tlb_page comment above).
>
Christoph Hellwig March 28, 2019, 4:55 p.m. UTC | #5
On Thu, Mar 28, 2019 at 04:39:53PM +0000, Gary Guo wrote:
> > I'd rather keep it simple for now.  For ASID support I suspect you'll
> > only need it to get the asid from the mm_struct pointed to by the
> > vma, right?  I'd rather pass the asid directly in that case.
> > 
> 
> Yes, it just takes it from the mm_struct.  But the key point is to keep
> it similar to local_flush_tlb_page.

And I'd much rather not pass an unused argument that also requires
duplicating the inline assembly code.

> > And I'd still like an answer to my second question above - what
> > were the good values for, say, the SiFive U54 and QEMU in your tests?
> > 
> QEMU currently treats all SFENCE.VMA instructions as global.  Technically
> QEMU's implementation could be modified to do a page-level flush instead,
> but the performance would not differ, as the dominating factor is
> resetting the jump cache.
> 
> I don't have a SiFive board so I can't tell what's a good value for that.
> 
> On a hypothetical platform that I am working on (simulation only) we can
> benefit even when setting it to 511 (the maximum allowed, since with a
> value >= 512 we don't know whether a non-leaf entry has changed, in which
> case the spec mandates a full flush).
> 
> So this really depends on the platform.

Ok, so before moving on we really should figure out a good value for
the currently only supported (well, mostly supported) hardware platform
and figure out a way to pass that through DT.
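
Purely as an illustration of that direction, the default could be picked
up from the device tree along these lines - the "riscv,tlbi-max-ops"
property name is made up here and not an existing binding, and the code
assumes <linux/of.h>:

	static void __init tlbi_threshold_init(void)
	{
		struct device_node *cpus = of_find_node_by_path("/cpus");
		u32 max_ops;

		/* Keep the built-in default if the property is absent. */
		if (cpus &&
		    !of_property_read_u32(cpus, "riscv,tlbi-max-ops", &max_ops) &&
		    max_ops >= 1 && max_ops < PTRS_PER_PTE)
			tlbi_range_threshold = max_ops * PAGE_SIZE;

		of_node_put(cpus);
	}
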
diff mbox series

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index b8d0bc07ed0a..8037db7ab25c 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -139,6 +139,7 @@  parameter is applicable::
 	PS2	Appropriate PS/2 support is enabled.
 	RAM	RAM disk support is enabled.
 	RDT	Intel Resource Director Technology.
+	RV	RISC-V architecture is enabled.
 	S390	S390 architecture is enabled.
 	SCSI	Appropriate SCSI support is enabled.
 			A lot of drivers have their options described inside
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 858b6c0b9a15..7a60edef09d2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4544,6 +4544,14 @@ 
 			Force threading of all interrupt handlers except those
 			marked explicitly IRQF_NO_THREAD.
 
+	tlbi_max_ops=	[RV]
+			Format: <int> (must be >= 1 and < PTRS_PER_PTE)
+			Default: 1
+			Controls the maximum number of page-level sfence.vma
+			instructions that the kernel can issue when a range
+			needs to be flushed.
+			See arch/riscv/mm/tlbflush.c
+
 	tmem		[KNL,XEN]
 			Enable the Transcendent memory driver if built-in.
 
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 1141364d990e..19d1aeb059da 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -279,7 +279,7 @@  static inline void update_mmu_cache(struct vm_area_struct *vma,
 	 * Relying on flush_tlb_fix_spurious_fault would suffice, but
 	 * the extra traps reduce performance.  So, eagerly SFENCE.VMA.
 	 */
-	local_flush_tlb_page(address);
+	local_flush_tlb_page(vma, address);
 }
 
 #define __HAVE_ARCH_PTE_SAME
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 54fee0cadb1e..29a780ca232a 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -1,22 +1,14 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2009 Chen Liqin <liqin.chen@sunplusct.com>
  * Copyright (C) 2012 Regents of the University of California
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Copyright (C) 2019 Gary Guo, University of Cambridge
  */
 
 #ifndef _ASM_RISCV_TLBFLUSH_H
 #define _ASM_RISCV_TLBFLUSH_H
 
 #include <linux/mm_types.h>
-#include <asm/smp.h>
 
 /*
  * Flush entire local TLB.  'sfence.vma' implicitly fences with the instruction
@@ -27,53 +19,47 @@  static inline void local_flush_tlb_all(void)
 	__asm__ __volatile__ ("sfence.vma" : : : "memory");
 }
 
-/* Flush one page from local TLB */
-static inline void local_flush_tlb_page(unsigned long addr)
+static inline void local_flush_tlb_mm(struct mm_struct *mm)
 {
-	__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
+	/* Flush ASID 0 so that global mappings are not affected */
+	__asm__ __volatile__ ("sfence.vma x0, %0" : : "r" (0) : "memory");
 }
 
-#ifndef CONFIG_SMP
-
-#define flush_tlb_all() local_flush_tlb_all()
-#define flush_tlb_page(vma, addr) local_flush_tlb_page(addr)
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
+	unsigned long addr)
+{
+	__asm__ __volatile__ ("sfence.vma %0, %1"
+			      : : "r" (addr), "r" (0)
+			      : "memory");
+}
 
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end)
+static inline void local_flush_tlb_kernel_page(unsigned long addr)
 {
-	local_flush_tlb_all();
+	__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
 }
 
-#define flush_tlb_mm(mm) flush_tlb_all()
+void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end);
+void local_flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
-#else /* CONFIG_SMP */
+#ifdef CONFIG_SMP
 
-#include <asm/sbi.h>
+void flush_tlb_all(void);
+void flush_tlb_mm(struct mm_struct *mm);
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr);
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end);
+void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
-static inline void remote_sfence_vma(struct cpumask *cmask, unsigned long start,
-				     unsigned long size)
-{
-	struct cpumask hmask;
-
-	cpumask_clear(&hmask);
-	riscv_cpuid_to_hartid_mask(cmask, &hmask);
-	sbi_remote_sfence_vma(hmask.bits, start, size);
-}
+#else /* CONFIG_SMP */
 
-#define flush_tlb_all() sbi_remote_sfence_vma(NULL, 0, -1)
-#define flush_tlb_page(vma, addr) flush_tlb_range(vma, addr, 0)
-#define flush_tlb_range(vma, start, end) \
-	remote_sfence_vma(mm_cpumask((vma)->vm_mm), start, (end) - (start))
-#define flush_tlb_mm(mm) \
-	remote_sfence_vma(mm_cpumask(mm), 0, -1)
+#define flush_tlb_all() local_flush_tlb_all()
+#define flush_tlb_mm(mm) local_flush_tlb_mm(mm)
+#define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr)
+#define flush_tlb_range(vma, start, end) local_flush_tlb_range(vma, start, end)
+#define flush_tlb_kernel_range(start, end) \
+	local_flush_tlb_kernel_range(start, end)
 
 #endif /* CONFIG_SMP */
 
-/* Flush a range of kernel pages */
-static inline void flush_tlb_kernel_range(unsigned long start,
-	unsigned long end)
-{
-	flush_tlb_all();
-}
-
 #endif /* _ASM_RISCV_TLBFLUSH_H */
diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
index d75b035786d6..53b68fd3cb45 100644
--- a/arch/riscv/mm/Makefile
+++ b/arch/riscv/mm/Makefile
@@ -4,3 +4,4 @@  obj-y += extable.o
 obj-y += ioremap.o
 obj-y += cacheflush.o
 obj-y += context.o
+obj-y += tlbflush.o
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index fbb1cfe80267..0f787bcd3a7a 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -64,7 +64,13 @@  void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	 * privileged ISA 1.10 yet.
 	 */
 	csr_write(sptbr, virt_to_pfn(next->pgd) | SATP_MODE);
-	local_flush_tlb_all();
+
+	/*
+	 * Issue a sfence.vma after the SATP write.  We flush the MM context
+	 * instead of calling local_flush_tlb_all so that global mappings are
+	 * not affected.
+	 */
+	local_flush_tlb_mm(next);
 
 	flush_icache_deferred(next);
 }
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index b379a75ac6a6..858f55e8b219 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -168,7 +168,7 @@  void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
 		set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot));
 	} else {
 		pte_clear(&init_mm, addr, ptep);
-		local_flush_tlb_page(addr);
+		local_flush_tlb_kernel_page(addr);
 	}
 }
 
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
new file mode 100644
index 000000000000..33083f48a936
--- /dev/null
+++ b/arch/riscv/mm/tlbflush.c
@@ -0,0 +1,128 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Gary Guo, University of Cambridge
+ */
+
+#include <linux/mm.h>
+#include <asm/sbi.h>
+
+#define SFENCE_VMA_FLUSH_ALL ((unsigned long) -1)
+
+/*
+ * This controls the maximum number of page-level sfence.vma instructions
+ * that the kernel can issue when it needs to flush a range from the TLB.
+ * If the range size exceeds this threshold, a full sfence.vma is issued.
+ *
+ * Increasing this number can negatively impact performance on
+ * implementations where sfence.vma's address operand is ignored and a global
+ * TLB flush is always performed.  On the other hand, implementations with
+ * page-level TLB flush support can benefit from a larger number.
+ */
+static unsigned long tlbi_range_threshold = PAGE_SIZE;
+
+static int __init setup_tlbi_max_ops(char *str)
+{
+	int value = 0;
+
+	get_option(&str, &value);
+
+	/*
+	 * This value cannot be greater than or equal to PTRS_PER_PTE, as we
+	 * need to do a full flush for any non-leaf page table change.  The
+	 * value must also be at least 1.
+	 */
+	if (value >= PTRS_PER_PTE || value < 1)
+		return -EINVAL;
+
+	tlbi_range_threshold = value * PAGE_SIZE;
+	return 0;
+}
+early_param("tlbi_max_ops", setup_tlbi_max_ops);
+
+void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+			   unsigned long end)
+{
+	if (end - start > tlbi_range_threshold) {
+		local_flush_tlb_mm(vma->vm_mm);
+		return;
+	}
+
+	while (start < end) {
+		__asm__ __volatile__ ("sfence.vma %0, %1"
+				      : : "r" (start), "r" (0)
+				      : "memory");
+		start += PAGE_SIZE;
+	}
+}
+
+void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	if (end - start > tlbi_range_threshold) {
+		local_flush_tlb_all();
+		return;
+	}
+
+	while (start < end) {
+		__asm__ __volatile__ ("sfence.vma %0"
+				      : : "r" (start)
+				      : "memory");
+		start += PAGE_SIZE;
+	}
+}
+
+#ifdef CONFIG_SMP
+
+static void remote_sfence_vma(unsigned long start, unsigned long size)
+{
+	sbi_remote_sfence_vma(NULL, start, size);
+}
+
+static void remote_sfence_vma_asid(cpumask_t *mask, unsigned long start,
+				   unsigned long size, unsigned long asid)
+{
+	cpumask_t hmask;
+
+	cpumask_clear(&hmask);
+	riscv_cpuid_to_hartid_mask(mask, &hmask);
+	sbi_remote_sfence_vma_asid(hmask.bits, start, size, asid);
+}
+
+
+void flush_tlb_all(void)
+{
+	remote_sfence_vma(0, SFENCE_VMA_FLUSH_ALL);
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	remote_sfence_vma_asid(mm_cpumask(mm), 0, SFENCE_VMA_FLUSH_ALL, 0);
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
+{
+	remote_sfence_vma_asid(mm_cpumask(vma->vm_mm), addr, PAGE_SIZE, 0);
+}
+
+
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+{
+	if (end - start > tlbi_range_threshold) {
+		flush_tlb_mm(vma->vm_mm);
+		return;
+	}
+
+	remote_sfence_vma_asid(mm_cpumask(vma->vm_mm), start, end - start, 0);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	if (end - start > tlbi_range_threshold) {
+		flush_tlb_all();
+		return;
+	}
+
+	remote_sfence_vma(start, end - start);
+}
+
+#endif /* CONFIG_SMP */