
[3/3] arm64: IPI each CPU after invalidating the I-cache for kernel mappings

Message ID 1529412495-17525-4-git-send-email-will.deacon@arm.com (mailing list archive)
State New, archived

Commit Message

Will Deacon June 19, 2018, 12:48 p.m. UTC
When invalidating the instruction cache for a kernel mapping via
flush_icache_range(), it is also necessary to flush the pipeline for
other CPUs so that instructions fetched into the pipeline before the
I-cache invalidation are discarded. For example, if module 'foo' is
unloaded and then module 'bar' is loaded into the same area of memory,
a CPU could end up executing instructions from 'foo' when branching into
'bar' if these instructions were fetched into the pipeline before 'foo'
was unloaded.

Whilst this is highly unlikely to occur in practice, particularly as
any exception acts as a context-synchronizing operation, following the
letter of the architecture requires us to execute an ISB on each CPU
in order for the new instruction stream to be visible.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/cacheflush.h | 13 ++++++++++++-
 arch/arm64/kernel/alternative.c     |  1 -
 arch/arm64/kernel/cpu_errata.c      |  2 +-
 arch/arm64/kernel/insn.c            | 15 ++-------------
 arch/arm64/mm/cache.S               |  4 ++--
 5 files changed, 17 insertions(+), 18 deletions(-)
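
For illustration, a minimal sketch (not part of the patch) of what a caller that
rewrites one word of kernel text gets from the reworked flush_icache_range();
the helper name is made up, the callees are the kernel's own:

/*
 * Sketch only: assumes 'addr' is still writable kernel text, e.g. module
 * text during load, before it is mapped read-only.
 */
static void example_patch_word(u32 *addr, u32 insn)
{
	WRITE_ONCE(*addr, insn);	/* store the new instruction */

	/*
	 * With this patch, flush_icache_range() is __flush_icache_range()
	 * (DC CVAU + IC IVAU + DSB + local ISB) followed by
	 * kick_all_cpus_sync(), which IPIs every online CPU so that the
	 * exception entry/return acts as the required context
	 * synchronization event on each of them.
	 */
	flush_icache_range((unsigned long)addr,
			   (unsigned long)addr + sizeof(insn));
}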

Comments

Mark Rutland June 19, 2018, 1:55 p.m. UTC | #1
On Tue, Jun 19, 2018 at 01:48:15PM +0100, Will Deacon wrote:
> When invalidating the instruction cache for a kernel mapping via
> flush_icache_range(), it is also necessary to flush the pipeline for
> other CPUs so that instructions fetched into the pipeline before the
> I-cache invalidation are discarded. For example, if module 'foo' is
> unloaded and then module 'bar' is loaded into the same area of memory,
> a CPU could end up executing instructions from 'foo' when branching into
> 'bar' if these instructions were fetched into the pipeline before 'foo'
> was unloaded.
> 
> Whilst this is highly unlikely to occur in practice, particularly as
> any exception acts as a context-synchronizing operation, following the
> letter of the architecture requires us to execute an ISB on each CPU
> in order for the new instruction stream to be visible.
> 
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> ---
>  arch/arm64/include/asm/cacheflush.h | 13 ++++++++++++-
>  arch/arm64/kernel/alternative.c     |  1 -
>  arch/arm64/kernel/cpu_errata.c      |  2 +-
>  arch/arm64/kernel/insn.c            | 15 ++-------------
>  arch/arm64/mm/cache.S               |  4 ++--
>  5 files changed, 17 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
> index d264a7274811..a0ec27066e6f 100644
> --- a/arch/arm64/include/asm/cacheflush.h
> +++ b/arch/arm64/include/asm/cacheflush.h
> @@ -71,7 +71,7 @@
>   *		- kaddr  - page address
>   *		- size   - region size
>   */
> -extern void flush_icache_range(unsigned long start, unsigned long end);
> +extern void __flush_icache_range(unsigned long start, unsigned long end);
>  extern int  invalidate_icache_range(unsigned long start, unsigned long end);
>  extern void __flush_dcache_area(void *addr, size_t len);
>  extern void __inval_dcache_area(void *addr, size_t len);
> @@ -81,6 +81,17 @@ extern void __clean_dcache_area_pou(void *addr, size_t len);
>  extern long __flush_cache_user_range(unsigned long start, unsigned long end);
>  extern void sync_icache_aliases(void *kaddr, unsigned long len);
>  
> +static inline void flush_icache_range(unsigned long start, unsigned long end)
> +{
> +	__flush_icache_range(start, end);
> +
> +	/*
> +	 * IPI all online CPUs so that they undergo a context synchronization
> +	 * event and are forced to refetch the new instructions.
> +	 */
> +	kick_all_cpus_sync();
> +}
> +
>  static inline void flush_cache_mm(struct mm_struct *mm)
>  {
>  }
> diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
> index 4f3dcc15a5b2..0ac06560c166 100644
> --- a/arch/arm64/kernel/alternative.c
> +++ b/arch/arm64/kernel/alternative.c
> @@ -205,7 +205,6 @@ static int __apply_alternatives_multi_stop(void *unused)
>  	if (smp_processor_id()) {
>  		while (!READ_ONCE(alternatives_applied))
>  			cpu_relax();
> -		isb();
>  	} else {
>  		BUG_ON(alternatives_applied);
>  		__apply_alternatives(&region, false);
> diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
> index 1d2b6d768efe..0a338a1cd2d7 100644
> --- a/arch/arm64/kernel/cpu_errata.c
> +++ b/arch/arm64/kernel/cpu_errata.c
> @@ -101,7 +101,7 @@ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
>  	for (i = 0; i < SZ_2K; i += 0x80)
>  		memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
>  
> -	flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
> +	__flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
>  }
>  
>  static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
> diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
> index 816d03c4c913..4cc41864f277 100644
> --- a/arch/arm64/kernel/insn.c
> +++ b/arch/arm64/kernel/insn.c
> @@ -249,7 +249,6 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg)
>  	} else {
>  		while (atomic_read(&pp->cpu_count) <= num_online_cpus())
>  			cpu_relax();
> -		isb();
>  	}

Something seems amiss here. 

We call __apply_alternatives_multi_stop() via stop_machine(), and I
thought that ensured that all CPUs had IRQs masked.

If so, the IPI from flush_icache_range() will deadlock.

If not, we can take IRQs, and execute potentially patched code.
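
For reference, kick_all_cpus_sync() was roughly the following (kernel/smp.c);
with wait == 1, smp_call_function() spins until every other CPU has run the
empty handler, which cannot happen if those CPUs sit in stop_machine() with
IRQs masked, hence the deadlock:

static void do_nothing(void *unused)
{
}

void kick_all_cpus_sync(void)
{
	/* Make sure the change is visible before we kick the cpus */
	smp_mb();
	smp_call_function(do_nothing, NULL, 1);		/* wait == 1 */
}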

I think there's also an existing problem here. Even if we have IRQs
masked, we could take SDEI events (or GICv3 pseudo-NMIs, once we have
those). I don't know how we can manage those.

Thanks,
Mark.
Mark Rutland June 19, 2018, 1:59 p.m. UTC | #2
On Tue, Jun 19, 2018 at 02:55:28PM +0100, Mark Rutland wrote:
> On Tue, Jun 19, 2018 at 01:48:15PM +0100, Will Deacon wrote:
> > When invalidating the instruction cache for a kernel mapping via
> > flush_icache_range(), it is also necessary to flush the pipeline for
> > other CPUs so that instructions fetched into the pipeline before the
> > I-cache invalidation are discarded. For example, if module 'foo' is
> > unloaded and then module 'bar' is loaded into the same area of memory,
> > a CPU could end up executing instructions from 'foo' when branching into
> > 'bar' if these instructions were fetched into the pipeline before 'foo'
> > was unloaded.
> > 
> > Whilst this is highly unlikely to occur in practice, particularly as
> > any exception acts as a context-synchronizing operation, following the
> > letter of the architecture requires us to execute an ISB on each CPU
> > in order for the new instruction stream to be visible.
> > 
> > Signed-off-by: Will Deacon <will.deacon@arm.com>
> > ---
> >  arch/arm64/include/asm/cacheflush.h | 13 ++++++++++++-
> >  arch/arm64/kernel/alternative.c     |  1 -
> >  arch/arm64/kernel/cpu_errata.c      |  2 +-
> >  arch/arm64/kernel/insn.c            | 15 ++-------------
> >  arch/arm64/mm/cache.S               |  4 ++--
> >  5 files changed, 17 insertions(+), 18 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
> > index d264a7274811..a0ec27066e6f 100644
> > --- a/arch/arm64/include/asm/cacheflush.h
> > +++ b/arch/arm64/include/asm/cacheflush.h
> > @@ -71,7 +71,7 @@
> >   *		- kaddr  - page address
> >   *		- size   - region size
> >   */
> > -extern void flush_icache_range(unsigned long start, unsigned long end);
> > +extern void __flush_icache_range(unsigned long start, unsigned long end);
> >  extern int  invalidate_icache_range(unsigned long start, unsigned long end);
> >  extern void __flush_dcache_area(void *addr, size_t len);
> >  extern void __inval_dcache_area(void *addr, size_t len);
> > @@ -81,6 +81,17 @@ extern void __clean_dcache_area_pou(void *addr, size_t len);
> >  extern long __flush_cache_user_range(unsigned long start, unsigned long end);
> >  extern void sync_icache_aliases(void *kaddr, unsigned long len);
> >  
> > +static inline void flush_icache_range(unsigned long start, unsigned long end)
> > +{
> > +	__flush_icache_range(start, end);
> > +
> > +	/*
> > +	 * IPI all online CPUs so that they undergo a context synchronization
> > +	 * event and are forced to refetch the new instructions.
> > +	 */
> > +	kick_all_cpus_sync();
> > +}
> > +
> >  static inline void flush_cache_mm(struct mm_struct *mm)
> >  {
> >  }
> > diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
> > index 4f3dcc15a5b2..0ac06560c166 100644
> > --- a/arch/arm64/kernel/alternative.c
> > +++ b/arch/arm64/kernel/alternative.c
> > @@ -205,7 +205,6 @@ static int __apply_alternatives_multi_stop(void *unused)
> >  	if (smp_processor_id()) {
> >  		while (!READ_ONCE(alternatives_applied))
> >  			cpu_relax();
> > -		isb();
> >  	} else {
> >  		BUG_ON(alternatives_applied);
> >  		__apply_alternatives(&region, false);
> > diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
> > index 1d2b6d768efe..0a338a1cd2d7 100644
> > --- a/arch/arm64/kernel/cpu_errata.c
> > +++ b/arch/arm64/kernel/cpu_errata.c
> > @@ -101,7 +101,7 @@ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
> >  	for (i = 0; i < SZ_2K; i += 0x80)
> >  		memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
> >  
> > -	flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
> > +	__flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
> >  }
> >  
> >  static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
> > diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
> > index 816d03c4c913..4cc41864f277 100644
> > --- a/arch/arm64/kernel/insn.c
> > +++ b/arch/arm64/kernel/insn.c
> > @@ -249,7 +249,6 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg)
> >  	} else {
> >  		while (atomic_read(&pp->cpu_count) <= num_online_cpus())
> >  			cpu_relax();
> > -		isb();
> >  	}
> 
> Something seems amiss here. 
> 
> We call __apply_alternatives_multi_stop() via stop_machine(), and I
> thought that ensured that all CPUs had IRQs masked.

Whoops; that should say aarch64_insn_patch_text_cb(), not
__apply_alternatives_multi_stop().

> If so, the IPI from flush_icache_range() will deadlock.
> 
> If not, we can take IRQs, and execute potentially patched code.
> 
> I think there's also an existing problem here. Even if we have IRQs
> masked, we could take SDEI events (or GICv3 pseudo-NMIs, once we have
> those). I don't know how we can manage those.

Mark.
Will Deacon June 19, 2018, 4:50 p.m. UTC | #3
On Tue, Jun 19, 2018 at 02:55:28PM +0100, Mark Rutland wrote:
> On Tue, Jun 19, 2018 at 01:48:15PM +0100, Will Deacon wrote:
> > diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
> > index 816d03c4c913..4cc41864f277 100644
> > --- a/arch/arm64/kernel/insn.c
> > +++ b/arch/arm64/kernel/insn.c
> > @@ -249,7 +249,6 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg)
> >  	} else {
> >  		while (atomic_read(&pp->cpu_count) <= num_online_cpus())
> >  			cpu_relax();
> > -		isb();
> >  	}
> 
> Something seems amiss here. 
> 
> We call __apply_alternatives_multi_stop() via stop_machine(), and I
> thought that ensured that all CPUs had IRQs masked.
> 
> If so, the IPI from flush_icache_range() will deadlock.
> 
> If not, we can take IRQs, and execute potentially patched code.

Yes, I think you're right, and I think this only applies to kprobes (since
it patches arbitrary instructions and requires the stop_machine()). However,
I think that highlights another issue, which is that the "nosync" patching
cases as used by things like jump_labels are still going to want this IPI,
so actually the fastpath stuff can all be ripped out. ftrace can probably
be left as-is, since I doubt it's critical that new CPUs immediately see
dynamic tracepoints.
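
For context, aarch64_insn_patch_text_nosync() looked roughly like this at the
time, so the "nosync" path ends in flush_icache_range() and would pick up the
IPI automatically with this patch:

int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn)
{
	u32 *tp = addr;
	int ret;

	/* A64 instructions must be word aligned */
	if ((uintptr_t)tp & 0x3)
		return -EINVAL;

	ret = aarch64_insn_write(tp, insn);
	if (ret == 0)
		/* Now __flush_icache_range() + kick_all_cpus_sync() */
		flush_icache_range((uintptr_t)tp,
				   (uintptr_t)tp + AARCH64_INSN_SIZE);

	return ret;
}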

I'll cook a patch sorting this out and include it in v2.

> I think there's also an existing problem here. Even if we have IRQs
> masked, we could take SDEI events (or GICv3 pseudo-NMIs, once we have
> those). I don't know how we can manage those.

I guess there are just some places where we can't deal with an SDEI event.
That said, it's fine as long as the SDEI path is careful about what it runs
(and SDEI is masked until the worst of the patching is over during boot).

James?

Will
Alexander Van Brunt June 20, 2018, 4:01 p.m. UTC | #4
> When invalidating the instruction cache for a kernel mapping via
> flush_icache_range(), it is also necessary to flush the pipeline for
> other CPUs so that instructions fetched into the pipeline before the
> I-cache invalidation are discarded. For example, if module 'foo' is
> unloaded and then module 'bar' is loaded into the same area of memory,
> a CPU could end up executing instructions from 'foo' when branching into
> 'bar' if these instructions were fetched into the pipeline before 'foo'
> was unloaded.

I don't think this fixes the problem. If a CPU is executing 'foo', takes
an IPI, and returns to find itself executing in the middle of 'bar' there
is still a problem because the code changed. All this patch does is
synchronize when two CPUs see 'foo' change to 'bar'.
Will Deacon June 20, 2018, 5:01 p.m. UTC | #5
Hi Alex,

On Wed, Jun 20, 2018 at 04:01:29PM +0000, Alexander Van Brunt wrote:
> > When invalidating the instruction cache for a kernel mapping via
> > flush_icache_range(), it is also necessary to flush the pipeline for
> > other CPUs so that instructions fetched into the pipeline before the
> > I-cache invalidation are discarded. For example, if module 'foo' is
> > unloaded and then module 'bar' is loaded into the same area of memory,
> > a CPU could end up executing instructions from 'foo' when branching into
> > 'bar' if these instructions were fetched into the pipeline before 'foo'
> > was unloaded.
> 
> I don't think this fixes the problem. If a CPU is executing 'foo', takes
> an IPI, and returns to find itself executing in the middle of 'bar' there
> is still a problem because the code changed. All this patch does is
> synchronize when two CPUs see 'foo' change to 'bar'.

Right, but that would indicate a catastrophic bug in the module code. There
are two sides to this:

1. Code that manages the lifetime of executable mappings. That should all be
   present in the core code, to make sure that we don't e.g. unmap code that
   is being executed.

2. Ensuring that new instructions in an executable mapping are visible to
   the CPU I-side when that CPU decides to branch into the mapping.

This patch series is all about (2). *If* (1) was implemented exclusively
using RCU, then we could probably avoid the IPI and instead ensure that an
RCU grace period ensured all concurrent executors had gone through a context
synchronization event, but unfortunately we can't rely on that.
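
A minimal sketch of that hypothetical RCU scheme, purely to illustrate the
idea being ruled out; unmap_module_text() is a made-up placeholder and this is
not how module teardown actually works:

/*
 * Hypothetical only: if every executor of the mapping sat inside an RCU
 * read-side critical section, teardown could wait for a grace period
 * instead of sending an IPI, since the grace period ensures that every
 * such executor has since gone through a context-synchronization event.
 */
static void hypothetical_free_module_text(struct module *mod)
{
	unmap_module_text(mod);		/* made-up: stop new executors branching in */
	synchronize_rcu();		/* wait out the existing executors */
	/* the memory can now be reused without kicking other CPUs */
}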

Will
James Morse June 21, 2018, 10:24 a.m. UTC | #6
Hi Mark, Will

On 19/06/18 17:50, Will Deacon wrote:
> On Tue, Jun 19, 2018 at 02:55:28PM +0100, Mark Rutland wrote:
>> On Tue, Jun 19, 2018 at 01:48:15PM +0100, Will Deacon wrote:
>>> diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
>>> index 816d03c4c913..4cc41864f277 100644
>>> --- a/arch/arm64/kernel/insn.c
>>> +++ b/arch/arm64/kernel/insn.c
>>> @@ -249,7 +249,6 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg)
>>>  	} else {
>>>  		while (atomic_read(&pp->cpu_count) <= num_online_cpus())
>>>  			cpu_relax();
>>> -		isb();
>>>  	}
>>
>> Something seems amiss here. 
>>
>> We call __apply_alternatives_multi_stop() via stop_machine(), and I
>> thought that ensured that all CPUs had IRQs masked.
>>
>> If so, the IPI from flush_icache_range() will deadlock.
>>
>> If not, we can take IRQs, and execute potentially patched code.
> 
> Yes, I think you're right, and I think this only applies to kprobes (since
> it patches arbitrary instructions and requires the stop_machine()). However,
> I think that highlights another issue, which is that the "nosync" patching
> cases as used by things like jump_labels are still going to want this IPI,
> so actually the fastpath stuff can all be ripped out. ftrace can probably
> be left as-is, since I doubt it's critical that new CPUs immediately see
> dynamic tracepoints.
>
> I'll cook a patch sorting this out and include it in v2.
> 
>> I think there's also an existing problem here. Even if we have IRQs
>> masked, we could take SDEI events (or GICv3 pseudo-NMIs, once we have
>> those). I don't know how we can manage those.
> 
> I guess there are just some places where we can't deal with an SDEI event.
> That said, it's fine as long as the SDEI path is careful about what it runs
> (and SDEI is masked until the worst of the patching is over during boot).

Yup, it's set up as late as possible, so the cpufeature stuff should be safe.

I think breakpointing code we expect to run in_nmi() is a disaster waiting to
happen. For example, when taking an SDEI event out of a guest, we run the
handler before KVM does its guest exit, so if you tread on a breakpoint, KVM's
el2_sync handler is going to bring the machine down.

The SDEI stuff has __kprobes all over it; I probably need to add some more
'notrace' annotations.


I think forbidding code-patching in NMI code regions is the easiest thing to do.

If this is something we needed to support, we could get stop_machine() to mask
SDEI and GIC pseudo-NMIs as part of its work. We could mask SError and disable
debug exceptions while we're at it.
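
A minimal sketch of that suggestion, assuming sdei_mask_local_cpu() and
sdei_unmask_local_cpu() from drivers/firmware/arm_sdei.c; the callback and the
patching helper are made up, and the pseudo-NMI/SError masking is left out:

#include <linux/arm_sdei.h>

/* Hypothetical stop_machine() callback that masks SDEI around patching */
static int patch_text_masked_cb(void *arg)
{
	sdei_mask_local_cpu();		/* no SDEI events on this CPU from here */

	do_patch_or_spin(arg);		/* made-up placeholder for the real work */

	sdei_unmask_local_cpu();
	return 0;
}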



Thanks,

James

Patch

diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index d264a7274811..a0ec27066e6f 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -71,7 +71,7 @@ 
  *		- kaddr  - page address
  *		- size   - region size
  */
-extern void flush_icache_range(unsigned long start, unsigned long end);
+extern void __flush_icache_range(unsigned long start, unsigned long end);
 extern int  invalidate_icache_range(unsigned long start, unsigned long end);
 extern void __flush_dcache_area(void *addr, size_t len);
 extern void __inval_dcache_area(void *addr, size_t len);
@@ -81,6 +81,17 @@  extern void __clean_dcache_area_pou(void *addr, size_t len);
 extern long __flush_cache_user_range(unsigned long start, unsigned long end);
 extern void sync_icache_aliases(void *kaddr, unsigned long len);
 
+static inline void flush_icache_range(unsigned long start, unsigned long end)
+{
+	__flush_icache_range(start, end);
+
+	/*
+	 * IPI all online CPUs so that they undergo a context synchronization
+	 * event and are forced to refetch the new instructions.
+	 */
+	kick_all_cpus_sync();
+}
+
 static inline void flush_cache_mm(struct mm_struct *mm)
 {
 }
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 4f3dcc15a5b2..0ac06560c166 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -205,7 +205,6 @@  static int __apply_alternatives_multi_stop(void *unused)
 	if (smp_processor_id()) {
 		while (!READ_ONCE(alternatives_applied))
 			cpu_relax();
-		isb();
 	} else {
 		BUG_ON(alternatives_applied);
 		__apply_alternatives(&region, false);
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 1d2b6d768efe..0a338a1cd2d7 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -101,7 +101,7 @@  static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
 	for (i = 0; i < SZ_2K; i += 0x80)
 		memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
 
-	flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
+	__flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
 }
 
 static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 816d03c4c913..4cc41864f277 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -249,7 +249,6 @@  static int __kprobes aarch64_insn_patch_text_cb(void *arg)
 	} else {
 		while (atomic_read(&pp->cpu_count) <= num_online_cpus())
 			cpu_relax();
-		isb();
 	}
 
 	return ret;
@@ -283,18 +282,8 @@  int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt)
 		if (ret)
 			return ret;
 
-		if (aarch64_insn_hotpatch_safe(insn, insns[0])) {
-			/*
-			 * ARMv8 architecture doesn't guarantee all CPUs see
-			 * the new instruction after returning from function
-			 * aarch64_insn_patch_text_nosync(). So send IPIs to
-			 * all other CPUs to achieve instruction
-			 * synchronization.
-			 */
-			ret = aarch64_insn_patch_text_nosync(addrs[0], insns[0]);
-			kick_all_cpus_sync();
-			return ret;
-		}
+		if (aarch64_insn_hotpatch_safe(insn, insns[0]))
+			return aarch64_insn_patch_text_nosync(addrs[0], insns[0]);
 	}
 
 	return aarch64_insn_patch_text_sync(addrs, insns, cnt);
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 30334d81b021..0c22ede52f90 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -35,7 +35,7 @@ 
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-ENTRY(flush_icache_range)
+ENTRY(__flush_icache_range)
 	/* FALLTHROUGH */
 
 /*
@@ -77,7 +77,7 @@  alternative_else_nop_endif
 9:
 	mov	x0, #-EFAULT
 	b	1b
-ENDPROC(flush_icache_range)
+ENDPROC(__flush_icache_range)
 ENDPROC(__flush_cache_user_range)
 
 /*