diff mbox series

[v1,4/6] arm64: broadcast IPIs to pause remote CPUs

Message ID 20241021042218.746659-5-yuzhao@google.com (mailing list archive)
State New
Headers show
Series mm/arm64: re-enable HVO | expand

Commit Message

Yu Zhao Oct. 21, 2024, 4:22 a.m. UTC
Broadcast pseudo-NMI IPIs to pause remote CPUs for a short period of
time, and then reliably resume them when the local CPU exits critical
sections that preclude the execution of remote CPUs.

A typical example of such critical sections is BBM on kernel PTEs.
HugeTLB Vmemmap Optimization (HVO) on arm64 was disabled by
commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable
HUGETLB_PAGE_OPTIMIZE_VMEMMAP") due to the folllowing reason:

  This is deemed UNPREDICTABLE by the Arm architecture without a
  break-before-make sequence (make the PTE invalid, TLBI, write the
  new valid PTE). However, such sequence is not possible since the
  vmemmap may be concurrently accessed by the kernel.

Supporting BBM on kernel PTEs is one of the approaches that can make
HVO theoretically safe on arm64.

Note that it is still possible for the paused CPUs to perform
speculative translations. Such translations would cause spurious
kernel PFs, which should be properly handled by
is_spurious_el1_translation_fault().

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 arch/arm64/include/asm/smp.h |  3 ++
 arch/arm64/kernel/smp.c      | 92 +++++++++++++++++++++++++++++++++---
 2 files changed, 88 insertions(+), 7 deletions(-)

Comments

Marc Zyngier Oct. 22, 2024, 4:15 p.m. UTC | #1
On Mon, 21 Oct 2024 05:22:16 +0100,
Yu Zhao <yuzhao@google.com> wrote:
> 
> Broadcast pseudo-NMI IPIs to pause remote CPUs for a short period of
> time, and then reliably resume them when the local CPU exits critical
> sections that preclude the execution of remote CPUs.
> 
> A typical example of such critical sections is BBM on kernel PTEs.
> HugeTLB Vmemmap Optimization (HVO) on arm64 was disabled by
> commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable
> HUGETLB_PAGE_OPTIMIZE_VMEMMAP") due to the folllowing reason:
> 
>   This is deemed UNPREDICTABLE by the Arm architecture without a
>   break-before-make sequence (make the PTE invalid, TLBI, write the
>   new valid PTE). However, such sequence is not possible since the
>   vmemmap may be concurrently accessed by the kernel.
> 
> Supporting BBM on kernel PTEs is one of the approaches that can make
> HVO theoretically safe on arm64.

Is the safety only theoretical? I would have expected that we'd use an
approach that is absolutely rock-solid.

> 
> Note that it is still possible for the paused CPUs to perform
> speculative translations. Such translations would cause spurious
> kernel PFs, which should be properly handled by
> is_spurious_el1_translation_fault().

Speculative translation faults are never reported, that'd be a CPU
bug. *Spurious* translation faults can be reported if the CPU doesn't
implement FEAT_ETS2, for example, and that has to do with the ordering
of memory access wrt page-table walking for the purpose of translations.

> 
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> ---
>  arch/arm64/include/asm/smp.h |  3 ++
>  arch/arm64/kernel/smp.c      | 92 +++++++++++++++++++++++++++++++++---
>  2 files changed, 88 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> index 2510eec026f7..cffb0cfed961 100644
> --- a/arch/arm64/include/asm/smp.h
> +++ b/arch/arm64/include/asm/smp.h
> @@ -133,6 +133,9 @@ bool cpus_are_stuck_in_kernel(void);
>  extern void crash_smp_send_stop(void);
>  extern bool smp_crash_stop_failed(void);
>  
> +void pause_remote_cpus(void);
> +void resume_remote_cpus(void);
> +
>  #endif /* ifndef __ASSEMBLY__ */
>  
>  #endif /* ifndef __ASM_SMP_H */
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 3b3f6b56e733..68829c6de1b1 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -85,7 +85,12 @@ static int ipi_irq_base __ro_after_init;
>  static int nr_ipi __ro_after_init = NR_IPI;
>  static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
>  
> -static bool crash_stop;
> +enum {
> +	SEND_STOP = BIT(0),
> +	CRASH_STOP = BIT(1),
> +};
> +
> +static unsigned long stop_in_progress;
>  
>  static void ipi_setup(int cpu);
>  
> @@ -917,6 +922,79 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
>  #endif
>  }
>  
> +static DEFINE_SPINLOCK(cpu_pause_lock);

PREEMPT_RT will turn this into a sleeping lock. Is it safe to sleep as
you are dealing with kernel mappings?

> +static cpumask_t paused_cpus;
> +static cpumask_t resumed_cpus;
> +
> +static void pause_local_cpu(void)
> +{
> +	int cpu = smp_processor_id();
> +
> +	cpumask_clear_cpu(cpu, &resumed_cpus);
> +	/*
> +	 * Paired with pause_remote_cpus() to confirm that this CPU not only
> +	 * will be paused but also can be reliably resumed.
> +	 */
> +	smp_wmb();
> +	cpumask_set_cpu(cpu, &paused_cpus);
> +	/* paused_cpus must be set before waiting on resumed_cpus. */
> +	barrier();

I'm not sure what this is trying to enforce. Yes, the compiler won't
reorder the set and the test. But your comment seems to indicate that
also need to make sure the CPU preserves that ordering, and short of a
DMB, the test below could be reordered.

> +	while (!cpumask_test_cpu(cpu, &resumed_cpus))
> +		cpu_relax();
> +	/* A typical example for sleep and wake-up functions. */

I'm not sure this is "typical",...

> +	smp_mb();
> +	cpumask_clear_cpu(cpu, &paused_cpus);
> +}
> +
> +void pause_remote_cpus(void)
> +{
> +	cpumask_t cpus_to_pause;
> +
> +	lockdep_assert_cpus_held();
> +	lockdep_assert_preemption_disabled();
> +
> +	cpumask_copy(&cpus_to_pause, cpu_online_mask);
> +	cpumask_clear_cpu(smp_processor_id(), &cpus_to_pause);

This bitmap is manipulated outside of your cpu_pause_lock. What
guarantees you can't have two CPUs stepping on each other here?

> +
> +	spin_lock(&cpu_pause_lock);
> +
> +	WARN_ON_ONCE(!cpumask_empty(&paused_cpus));
> +
> +	smp_cross_call(&cpus_to_pause, IPI_CPU_STOP_NMI);
> +
> +	while (!cpumask_equal(&cpus_to_pause, &paused_cpus))
> +		cpu_relax();

This can be a lot of things to compare, specially that you are
explicitly mentioning large systems. Why can't this be implemented as
a counter instead?

Overall, this looks like stop_machine() in disguise. Why can't this
use the existing infrastructure?

Thanks,

	M.
Yu Zhao Oct. 28, 2024, 10:11 p.m. UTC | #2
On Tue, Oct 22, 2024 at 10:15 AM Marc Zyngier <maz@kernel.org> wrote:
>
> On Mon, 21 Oct 2024 05:22:16 +0100,
> Yu Zhao <yuzhao@google.com> wrote:
> >
> > Broadcast pseudo-NMI IPIs to pause remote CPUs for a short period of
> > time, and then reliably resume them when the local CPU exits critical
> > sections that preclude the execution of remote CPUs.
> >
> > A typical example of such critical sections is BBM on kernel PTEs.
> > HugeTLB Vmemmap Optimization (HVO) on arm64 was disabled by
> > commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable
> > HUGETLB_PAGE_OPTIMIZE_VMEMMAP") due to the folllowing reason:
> >
> >   This is deemed UNPREDICTABLE by the Arm architecture without a
> >   break-before-make sequence (make the PTE invalid, TLBI, write the
> >   new valid PTE). However, such sequence is not possible since the
> >   vmemmap may be concurrently accessed by the kernel.
> >
> > Supporting BBM on kernel PTEs is one of the approaches that can make
> > HVO theoretically safe on arm64.
>
> Is the safety only theoretical? I would have expected that we'd use an
> approach that is absolutely rock-solid.

We've been trying to construct a repro against the original HVO
(missing BBM), but so far no success. Hopefully a repro does exist,
and then we'd be able to demonstrate the effectiveness of this series,
which is only theoretical at the moment.

> > Note that it is still possible for the paused CPUs to perform
> > speculative translations. Such translations would cause spurious
> > kernel PFs, which should be properly handled by
> > is_spurious_el1_translation_fault().
>
> Speculative translation faults are never reported, that'd be a CPU
> bug.

Right, I meant to say "speculative accesses that cause translations".

> *Spurious* translation faults can be reported if the CPU doesn't
> implement FEAT_ETS2, for example, and that has to do with the ordering
> of memory access wrt page-table walking for the purpose of translations.

Just want to make sure I fully understand: after the local CPU sends
TLBI (followed by DSB & ISB), FEAT_ETS2 would act like DSB & ISB on
remote CPUs when they perform the invalidation, and therefore
speculative accesses are ordered as well on remote CPUs.

Also I'm assuming IPIs on remote CPUs don't act like a full barrier,
and whatever they interrupt still can be speculatively executed even
though the IPI hander itself doesn't access the vmemmap area
undergoing BBM. Is this correct?

> > Signed-off-by: Yu Zhao <yuzhao@google.com>
> > ---
> >  arch/arm64/include/asm/smp.h |  3 ++
> >  arch/arm64/kernel/smp.c      | 92 +++++++++++++++++++++++++++++++++---
> >  2 files changed, 88 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> > index 2510eec026f7..cffb0cfed961 100644
> > --- a/arch/arm64/include/asm/smp.h
> > +++ b/arch/arm64/include/asm/smp.h
> > @@ -133,6 +133,9 @@ bool cpus_are_stuck_in_kernel(void);
> >  extern void crash_smp_send_stop(void);
> >  extern bool smp_crash_stop_failed(void);
> >
> > +void pause_remote_cpus(void);
> > +void resume_remote_cpus(void);
> > +
> >  #endif /* ifndef __ASSEMBLY__ */
> >
> >  #endif /* ifndef __ASM_SMP_H */
> > diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> > index 3b3f6b56e733..68829c6de1b1 100644
> > --- a/arch/arm64/kernel/smp.c
> > +++ b/arch/arm64/kernel/smp.c
> > @@ -85,7 +85,12 @@ static int ipi_irq_base __ro_after_init;
> >  static int nr_ipi __ro_after_init = NR_IPI;
> >  static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
> >
> > -static bool crash_stop;
> > +enum {
> > +     SEND_STOP = BIT(0),
> > +     CRASH_STOP = BIT(1),
> > +};
> > +
> > +static unsigned long stop_in_progress;
> >
> >  static void ipi_setup(int cpu);
> >
> > @@ -917,6 +922,79 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
> >  #endif
> >  }
> >
> > +static DEFINE_SPINLOCK(cpu_pause_lock);
>
> PREEMPT_RT will turn this into a sleeping lock. Is it safe to sleep as
> you are dealing with kernel mappings?

Right, it should be a raw spinlock -- the caller disabled preemption,
which as you said is required when dealing with the kernel mappings.

> > +static cpumask_t paused_cpus;
> > +static cpumask_t resumed_cpus;
> > +
> > +static void pause_local_cpu(void)
> > +{
> > +     int cpu = smp_processor_id();
> > +
> > +     cpumask_clear_cpu(cpu, &resumed_cpus);
> > +     /*
> > +      * Paired with pause_remote_cpus() to confirm that this CPU not only
> > +      * will be paused but also can be reliably resumed.
> > +      */
> > +     smp_wmb();
> > +     cpumask_set_cpu(cpu, &paused_cpus);
> > +     /* paused_cpus must be set before waiting on resumed_cpus. */
> > +     barrier();
>
> I'm not sure what this is trying to enforce. Yes, the compiler won't
> reorder the set and the test.

Sorry I don't follow: does cpumask_set_cpu(), i.e., set_bit(), already
contain a compiler barrier?

My understanding is that the compiler is free to reorder the set and
test on those two independent variables, and make it like this:

  while (!cpumask_test_cpu(cpu, &resumed_cpus))
      cpu_relax();
  cpumask_set_cpu(cpu, &paused_cpus);

So the CPU sent the IPI would keep waiting on paused_cpus being set,
and this CPU would keep waiting on resumed_cpus being set, which would
end up with a deadlock.

> But your comment seems to indicate that
> also need to make sure the CPU preserves that ordering
> and short of a
> DMB, the test below could be reordered.

If this CPU reorders the set and test like above, it wouldn't be a
problem because the set would eventually appear on the other CPU that
sent the IPI.

> > +     while (!cpumask_test_cpu(cpu, &resumed_cpus))
> > +             cpu_relax();
> > +     /* A typical example for sleep and wake-up functions. */
>
> I'm not sure this is "typical",...

Sorry, this full barrier isn't needed. Apparently I didn't properly
fix this from the previous attempt to use wfe()/sev() to make this
function the sleeper for resume_remote_cpus() to wake up.

> > +     smp_mb();
> > +     cpumask_clear_cpu(cpu, &paused_cpus);
> > +}
> > +
> > +void pause_remote_cpus(void)
> > +{
> > +     cpumask_t cpus_to_pause;
> > +
> > +     lockdep_assert_cpus_held();
> > +     lockdep_assert_preemption_disabled();
> > +
> > +     cpumask_copy(&cpus_to_pause, cpu_online_mask);
> > +     cpumask_clear_cpu(smp_processor_id(), &cpus_to_pause);
>
> This bitmap is manipulated outside of your cpu_pause_lock. What
> guarantees you can't have two CPUs stepping on each other here?

Do you mean cpus_to_pause? If so, that's a local bitmap.

> > +
> > +     spin_lock(&cpu_pause_lock);
> > +
> > +     WARN_ON_ONCE(!cpumask_empty(&paused_cpus));
> > +
> > +     smp_cross_call(&cpus_to_pause, IPI_CPU_STOP_NMI);
> > +
> > +     while (!cpumask_equal(&cpus_to_pause, &paused_cpus))
> > +             cpu_relax();
>
> This can be a lot of things to compare, specially that you are
> explicitly mentioning large systems. Why can't this be implemented as
> a counter instead?

Agreed - that'd be sufficient and simpler.

> Overall, this looks like stop_machine() in disguise. Why can't this
> use the existing infrastructure?

This came up during the previous discussion [1]. There are
similarities. The main concern with reusing stop_machine() (or part of
its current implementation) is that it may not meet the performance
requirements. Refactoring might be possible, however, it seems to me
(after checking the code again) that such a refactoring unlikely ends
up cleaner or simpler codebase, especially after I got rid of CPU
masks, as you suggested.

[1] https://lore.kernel.org/all/ZbKjHHeEdFYY1xR5@arm.com/

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 3b3f6b56e733..b672af2441a3 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -917,6 +922,64 @@ static void __noreturn
ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
 #endif
 }

+static DEFINE_RAW_SPINLOCK(cpu_pause_lock);
+static bool __cacheline_aligned_in_smp cpu_paused;
+static atomic_t __cacheline_aligned_in_smp nr_cpus_paused;
+
+static void pause_local_cpu(void)
+{
+ atomic_inc(&nr_cpus_paused);
+
+ while (READ_ONCE(cpu_paused))
+ cpu_relax();
+
+ atomic_dec(&nr_cpus_paused);
+}
+
+void pause_remote_cpus(void)
+{
+ cpumask_t cpus_to_pause;
+ int nr_cpus_to_pause = num_online_cpus() - 1;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_preemption_disabled();
+
+ if (!nr_cpus_to_pause)
+ return;
+
+ cpumask_copy(&cpus_to_pause, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &cpus_to_pause);
+
+ raw_spin_lock(&cpu_pause_lock);
+
+ WARN_ON_ONCE(cpu_paused);
+ WARN_ON_ONCE(atomic_read(&nr_cpus_paused));
+
+ cpu_paused = true;
+
+ smp_cross_call(&cpus_to_pause, IPI_CPU_STOP_NMI);
+
+ while (atomic_read(&nr_cpus_paused) != nr_cpus_to_pause)
+ cpu_relax();
+
+ raw_spin_unlock(&cpu_pause_lock);
+}
+
+void resume_remote_cpus(void)
+{
+ if (!cpu_paused)
+ return;
+
+ raw_spin_lock(&cpu_pause_lock);
+
+ WRITE_ONCE(cpu_paused, false);
+
+ while (atomic_read(&nr_cpus_paused))
+ cpu_relax();
+
+ raw_spin_unlock(&cpu_pause_lock);

+}
+
 static void arm64_backtrace_ipi(cpumask_t *mask)
 {
  __ipi_send_mask(ipi_desc[IPI_CPU_BACKTRACE], mask);
Marc Zyngier Oct. 29, 2024, 7:36 p.m. UTC | #3
On Mon, 28 Oct 2024 22:11:52 +0000,
Yu Zhao <yuzhao@google.com> wrote:
> 
> On Tue, Oct 22, 2024 at 10:15 AM Marc Zyngier <maz@kernel.org> wrote:
> >
> > On Mon, 21 Oct 2024 05:22:16 +0100,
> > Yu Zhao <yuzhao@google.com> wrote:
> > >
> > > Broadcast pseudo-NMI IPIs to pause remote CPUs for a short period of
> > > time, and then reliably resume them when the local CPU exits critical
> > > sections that preclude the execution of remote CPUs.
> > >
> > > A typical example of such critical sections is BBM on kernel PTEs.
> > > HugeTLB Vmemmap Optimization (HVO) on arm64 was disabled by
> > > commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable
> > > HUGETLB_PAGE_OPTIMIZE_VMEMMAP") due to the folllowing reason:
> > >
> > >   This is deemed UNPREDICTABLE by the Arm architecture without a
> > >   break-before-make sequence (make the PTE invalid, TLBI, write the
> > >   new valid PTE). However, such sequence is not possible since the
> > >   vmemmap may be concurrently accessed by the kernel.
> > >
> > > Supporting BBM on kernel PTEs is one of the approaches that can make
> > > HVO theoretically safe on arm64.
> >
> > Is the safety only theoretical? I would have expected that we'd use an
> > approach that is absolutely rock-solid.
> 
> We've been trying to construct a repro against the original HVO
> (missing BBM), but so far no success. Hopefully a repro does exist,
> and then we'd be able to demonstrate the effectiveness of this series,
> which is only theoretical at the moment.

That wasn't my question.

Just because your HW doesn't show you a failure mode doesn't mean the
issue doesn't exist. Or that someone will eventually make use of the
relaxed aspects of the architecture and turn the behaviour you are
relying on into the perfect memory corruption  You absolutely cannot
rely on an implementation-defined behaviour for this stuff.

Hence my question: is your current approach actually safe? In a
non-theoretical manner?

> 
> > > Note that it is still possible for the paused CPUs to perform
> > > speculative translations. Such translations would cause spurious
> > > kernel PFs, which should be properly handled by
> > > is_spurious_el1_translation_fault().
> >
> > Speculative translation faults are never reported, that'd be a CPU
> > bug.
> 
> Right, I meant to say "speculative accesses that cause translations".
> 
> > *Spurious* translation faults can be reported if the CPU doesn't
> > implement FEAT_ETS2, for example, and that has to do with the ordering
> > of memory access wrt page-table walking for the purpose of translations.
> 
> Just want to make sure I fully understand: after the local CPU sends
> TLBI (followed by DSB & ISB), FEAT_ETS2 would act like DSB & ISB on
> remote CPUs when they perform the invalidation, and therefore
> speculative accesses are ordered as well on remote CPUs.

ETS2 has nothing to do with TLBI. It has to do with non-cacheable
(translation, access and address size) faults, and whether an older
update to a translation is visible to a younger memory access without
extra synchronisation. It definitely has nothing to do with remote
CPUs (and the idea of a remote ISB, while cute, doesn't exist).

> Also I'm assuming IPIs on remote CPUs don't act like a full barrier,
> and whatever they interrupt still can be speculatively executed even
> though the IPI hander itself doesn't access the vmemmap area
> undergoing BBM. Is this correct?

Full barrier *for what*? An interrupt is a CSE, which has the effects
described in the ARM ARM, but that's about it.

> 
> > > Signed-off-by: Yu Zhao <yuzhao@google.com>
> > > ---
> > >  arch/arm64/include/asm/smp.h |  3 ++
> > >  arch/arm64/kernel/smp.c      | 92 +++++++++++++++++++++++++++++++++---
> > >  2 files changed, 88 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> > > index 2510eec026f7..cffb0cfed961 100644
> > > --- a/arch/arm64/include/asm/smp.h
> > > +++ b/arch/arm64/include/asm/smp.h
> > > @@ -133,6 +133,9 @@ bool cpus_are_stuck_in_kernel(void);
> > >  extern void crash_smp_send_stop(void);
> > >  extern bool smp_crash_stop_failed(void);
> > >
> > > +void pause_remote_cpus(void);
> > > +void resume_remote_cpus(void);
> > > +
> > >  #endif /* ifndef __ASSEMBLY__ */
> > >
> > >  #endif /* ifndef __ASM_SMP_H */
> > > diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> > > index 3b3f6b56e733..68829c6de1b1 100644
> > > --- a/arch/arm64/kernel/smp.c
> > > +++ b/arch/arm64/kernel/smp.c
> > > @@ -85,7 +85,12 @@ static int ipi_irq_base __ro_after_init;
> > >  static int nr_ipi __ro_after_init = NR_IPI;
> > >  static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
> > >
> > > -static bool crash_stop;
> > > +enum {
> > > +     SEND_STOP = BIT(0),
> > > +     CRASH_STOP = BIT(1),
> > > +};
> > > +
> > > +static unsigned long stop_in_progress;
> > >
> > >  static void ipi_setup(int cpu);
> > >
> > > @@ -917,6 +922,79 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
> > >  #endif
> > >  }
> > >
> > > +static DEFINE_SPINLOCK(cpu_pause_lock);
> >
> > PREEMPT_RT will turn this into a sleeping lock. Is it safe to sleep as
> > you are dealing with kernel mappings?
> 
> Right, it should be a raw spinlock -- the caller disabled preemption,
> which as you said is required when dealing with the kernel mappings.
> 
> > > +static cpumask_t paused_cpus;
> > > +static cpumask_t resumed_cpus;
> > > +
> > > +static void pause_local_cpu(void)
> > > +{
> > > +     int cpu = smp_processor_id();
> > > +
> > > +     cpumask_clear_cpu(cpu, &resumed_cpus);
> > > +     /*
> > > +      * Paired with pause_remote_cpus() to confirm that this CPU not only
> > > +      * will be paused but also can be reliably resumed.
> > > +      */
> > > +     smp_wmb();
> > > +     cpumask_set_cpu(cpu, &paused_cpus);
> > > +     /* paused_cpus must be set before waiting on resumed_cpus. */
> > > +     barrier();
> >
> > I'm not sure what this is trying to enforce. Yes, the compiler won't
> > reorder the set and the test.
> 
> Sorry I don't follow: does cpumask_set_cpu(), i.e., set_bit(), already
> contain a compiler barrier?
> 
> My understanding is that the compiler is free to reorder the set and
> test on those two independent variables, and make it like this:
> 
>   while (!cpumask_test_cpu(cpu, &resumed_cpus))
>       cpu_relax();
>   cpumask_set_cpu(cpu, &paused_cpus);
> 
> So the CPU sent the IPI would keep waiting on paused_cpus being set,
> and this CPU would keep waiting on resumed_cpus being set, which would
> end up with a deadlock.
> 
> > But your comment seems to indicate that
> > also need to make sure the CPU preserves that ordering
> > and short of a
> > DMB, the test below could be reordered.
> 
> If this CPU reorders the set and test like above, it wouldn't be a
> problem because the set would eventually appear on the other CPU that
> sent the IPI.

I don't get it. You are requiring that the compiler doesn't reorder
things, but you're happy that the CPU reorders the same things. Surely
this leads to the same outcome...

> 
> > > +     while (!cpumask_test_cpu(cpu, &resumed_cpus))
> > > +             cpu_relax();
> > > +     /* A typical example for sleep and wake-up functions. */
> >
> > I'm not sure this is "typical",...
> 
> Sorry, this full barrier isn't needed. Apparently I didn't properly
> fix this from the previous attempt to use wfe()/sev() to make this
> function the sleeper for resume_remote_cpus() to wake up.
>
> > > +     smp_mb();
> > > +     cpumask_clear_cpu(cpu, &paused_cpus);
> > > +}
> > > +
> > > +void pause_remote_cpus(void)
> > > +{
> > > +     cpumask_t cpus_to_pause;
> > > +
> > > +     lockdep_assert_cpus_held();
> > > +     lockdep_assert_preemption_disabled();
> > > +
> > > +     cpumask_copy(&cpus_to_pause, cpu_online_mask);
> > > +     cpumask_clear_cpu(smp_processor_id(), &cpus_to_pause);
> >
> > This bitmap is manipulated outside of your cpu_pause_lock. What
> > guarantees you can't have two CPUs stepping on each other here?
> 
> Do you mean cpus_to_pause? If so, that's a local bitmap.

Ah yes, sorry.

> 
> > > +
> > > +     spin_lock(&cpu_pause_lock);
> > > +
> > > +     WARN_ON_ONCE(!cpumask_empty(&paused_cpus));
> > > +
> > > +     smp_cross_call(&cpus_to_pause, IPI_CPU_STOP_NMI);
> > > +
> > > +     while (!cpumask_equal(&cpus_to_pause, &paused_cpus))
> > > +             cpu_relax();
> >
> > This can be a lot of things to compare, specially that you are
> > explicitly mentioning large systems. Why can't this be implemented as
> > a counter instead?
> 
> Agreed - that'd be sufficient and simpler.
> 
> > Overall, this looks like stop_machine() in disguise. Why can't this
> > use the existing infrastructure?
> 
> This came up during the previous discussion [1]. There are
> similarities. The main concern with reusing stop_machine() (or part of
> its current implementation) is that it may not meet the performance
> requirements. Refactoring might be possible, however, it seems to me
> (after checking the code again) that such a refactoring unlikely ends
> up cleaner or simpler codebase, especially after I got rid of CPU
> masks, as you suggested.

I would have expected to see an implementation handling faults during
the break window. IPIs are slow, virtualised extremely badly, and
pNMIs are rarely enabled, due to the long history of issues with it.
At this stage, I'm not sure what you have here is any better than
stop_machine(). On the other hand, faults should be the rarest thing,

	M.
Yu Zhao Oct. 31, 2024, 6:10 p.m. UTC | #4
On Tue, Oct 29, 2024 at 1:36 PM Marc Zyngier <maz@kernel.org> wrote:
>
> On Mon, 28 Oct 2024 22:11:52 +0000,
> Yu Zhao <yuzhao@google.com> wrote:
> >
> > On Tue, Oct 22, 2024 at 10:15 AM Marc Zyngier <maz@kernel.org> wrote:
> > >
> > > On Mon, 21 Oct 2024 05:22:16 +0100,
> > > Yu Zhao <yuzhao@google.com> wrote:
> > > >
> > > > Broadcast pseudo-NMI IPIs to pause remote CPUs for a short period of
> > > > time, and then reliably resume them when the local CPU exits critical
> > > > sections that preclude the execution of remote CPUs.
> > > >
> > > > A typical example of such critical sections is BBM on kernel PTEs.
> > > > HugeTLB Vmemmap Optimization (HVO) on arm64 was disabled by
> > > > commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable
> > > > HUGETLB_PAGE_OPTIMIZE_VMEMMAP") due to the folllowing reason:
> > > >
> > > >   This is deemed UNPREDICTABLE by the Arm architecture without a
> > > >   break-before-make sequence (make the PTE invalid, TLBI, write the
> > > >   new valid PTE). However, such sequence is not possible since the
> > > >   vmemmap may be concurrently accessed by the kernel.
> > > >
> > > > Supporting BBM on kernel PTEs is one of the approaches that can make
> > > > HVO theoretically safe on arm64.
> > >
> > > Is the safety only theoretical? I would have expected that we'd use an
> > > approach that is absolutely rock-solid.
> >
> > We've been trying to construct a repro against the original HVO
> > (missing BBM), but so far no success. Hopefully a repro does exist,
> > and then we'd be able to demonstrate the effectiveness of this series,
> > which is only theoretical at the moment.
>
> That wasn't my question.
>
> Just because your HW doesn't show you a failure mode doesn't mean the
> issue doesn't exist. Or that someone will eventually make use of the
> relaxed aspects of the architecture and turn the behaviour you are
> relying on into the perfect memory corruption  You absolutely cannot
> rely on an implementation-defined behaviour for this stuff.

I agree.

> Hence my question: is your current approach actually safe?

AFAIK, yes.

> In a
> non-theoretical manner?

We are confident enough to try it in our production, but it doesn't
mean it's bug free. It would take more eyeballs and soak time before
it can prove itself.

> > > > Note that it is still possible for the paused CPUs to perform
> > > > speculative translations. Such translations would cause spurious
> > > > kernel PFs, which should be properly handled by
> > > > is_spurious_el1_translation_fault().
> > >
> > > Speculative translation faults are never reported, that'd be a CPU
> > > bug.
> >
> > Right, I meant to say "speculative accesses that cause translations".
> >
> > > *Spurious* translation faults can be reported if the CPU doesn't
> > > implement FEAT_ETS2, for example, and that has to do with the ordering
> > > of memory access wrt page-table walking for the purpose of translations.
> >
> > Just want to make sure I fully understand: after the local CPU sends
> > TLBI (followed by DSB & ISB), FEAT_ETS2 would act like DSB & ISB on
> > remote CPUs when they perform the invalidation, and therefore
> > speculative accesses are ordered as well on remote CPUs.
>
> ETS2 has nothing to do with TLBI. It has to do with non-cacheable
> (translation, access and address size) faults, and whether an older
> update to a translation is visible to a younger memory access without
> extra synchronisation.

Is it correct to say that *without* ETS2, we also wouldn't see
spurious faults on the local CPU from the following BBM sequence?
  clear_pte(); dsb(); isb(); tlbi(); dsb(); isb(); set_valid_pte();
dsb(); isb(); ...

> It definitely has nothing to do with remote
> CPUs (and the idea of a remote ISB, while cute, doesn't exist).
>
> > Also I'm assuming IPIs on remote CPUs don't act like a full barrier,
> > and whatever they interrupt still can be speculatively executed even
> > though the IPI hander itself doesn't access the vmemmap area
> > undergoing BBM. Is this correct?
>
> Full barrier *for what*? An interrupt is a CSE, which has the effects
> described in the ARM ARM, but that's about it.

Sorry for the confusion. I hope the following could explain what's in my mind:

local cpu          remote cpu

send ipi
                   cse (= dsb(); isb() ?)
                   enters ipi handler
                   sets cpu_paused
sees cpu_paused
clear_pte()
dsb();
tlbi();
dsb(); isb()
set_valid_pte()
dsb();
sets cpu_resumed
                  sees cpu_resumed
                  No isb()
                  exits ipi handler

My current understanding is:
Because exception exit doesn't have the CSE and there are no isb()s
within the IPI handler running on the remote CPU, it's possible for
the remote CPU to speculative execute the next instruction before it
sees the valid PTE and trigger a spurious fault.

> > > > Signed-off-by: Yu Zhao <yuzhao@google.com>
> > > > ---
> > > >  arch/arm64/include/asm/smp.h |  3 ++
> > > >  arch/arm64/kernel/smp.c      | 92 +++++++++++++++++++++++++++++++++---
> > > >  2 files changed, 88 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> > > > index 2510eec026f7..cffb0cfed961 100644
> > > > --- a/arch/arm64/include/asm/smp.h
> > > > +++ b/arch/arm64/include/asm/smp.h
> > > > @@ -133,6 +133,9 @@ bool cpus_are_stuck_in_kernel(void);
> > > >  extern void crash_smp_send_stop(void);
> > > >  extern bool smp_crash_stop_failed(void);
> > > >
> > > > +void pause_remote_cpus(void);
> > > > +void resume_remote_cpus(void);
> > > > +
> > > >  #endif /* ifndef __ASSEMBLY__ */
> > > >
> > > >  #endif /* ifndef __ASM_SMP_H */
> > > > diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> > > > index 3b3f6b56e733..68829c6de1b1 100644
> > > > --- a/arch/arm64/kernel/smp.c
> > > > +++ b/arch/arm64/kernel/smp.c
> > > > @@ -85,7 +85,12 @@ static int ipi_irq_base __ro_after_init;
> > > >  static int nr_ipi __ro_after_init = NR_IPI;
> > > >  static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
> > > >
> > > > -static bool crash_stop;
> > > > +enum {
> > > > +     SEND_STOP = BIT(0),
> > > > +     CRASH_STOP = BIT(1),
> > > > +};
> > > > +
> > > > +static unsigned long stop_in_progress;
> > > >
> > > >  static void ipi_setup(int cpu);
> > > >
> > > > @@ -917,6 +922,79 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
> > > >  #endif
> > > >  }
> > > >
> > > > +static DEFINE_SPINLOCK(cpu_pause_lock);
> > >
> > > PREEMPT_RT will turn this into a sleeping lock. Is it safe to sleep as
> > > you are dealing with kernel mappings?
> >
> > Right, it should be a raw spinlock -- the caller disabled preemption,
> > which as you said is required when dealing with the kernel mappings.
> >
> > > > +static cpumask_t paused_cpus;
> > > > +static cpumask_t resumed_cpus;
> > > > +
> > > > +static void pause_local_cpu(void)
> > > > +{
> > > > +     int cpu = smp_processor_id();
> > > > +
> > > > +     cpumask_clear_cpu(cpu, &resumed_cpus);
> > > > +     /*
> > > > +      * Paired with pause_remote_cpus() to confirm that this CPU not only
> > > > +      * will be paused but also can be reliably resumed.
> > > > +      */
> > > > +     smp_wmb();
> > > > +     cpumask_set_cpu(cpu, &paused_cpus);
> > > > +     /* paused_cpus must be set before waiting on resumed_cpus. */
> > > > +     barrier();
> > >
> > > I'm not sure what this is trying to enforce. Yes, the compiler won't
> > > reorder the set and the test.
> >
> > Sorry I don't follow: does cpumask_set_cpu(), i.e., set_bit(), already
> > contain a compiler barrier?
> >
> > My understanding is that the compiler is free to reorder the set and
> > test on those two independent variables, and make it like this:
> >
> >   while (!cpumask_test_cpu(cpu, &resumed_cpus))
> >       cpu_relax();
> >   cpumask_set_cpu(cpu, &paused_cpus);
> >
> > So the CPU sent the IPI would keep waiting on paused_cpus being set,
> > and this CPU would keep waiting on resumed_cpus being set, which would
> > end up with a deadlock.
> >
> > > But your comment seems to indicate that
> > > also need to make sure the CPU preserves that ordering
> > > and short of a
> > > DMB, the test below could be reordered.
> >
> > If this CPU reorders the set and test like above, it wouldn't be a
> > problem because the set would eventually appear on the other CPU that
> > sent the IPI.
>
> I don't get it. You are requiring that the compiler doesn't reorder
> things, but you're happy that the CPU reorders the same things. Surely
> this leads to the same outcome...

I was thinking that the compiler might be able to reorder even if it
can't prove the loop actually terminates. But I now think that
shouldn't happen. So yes, I agree with what you said earlier that "the
compiler won't reorder the set and the test".

> > > > +     while (!cpumask_test_cpu(cpu, &resumed_cpus))
> > > > +             cpu_relax();
> > > > +     /* A typical example for sleep and wake-up functions. */
> > >
> > > I'm not sure this is "typical",...
> >
> > Sorry, this full barrier isn't needed. Apparently I didn't properly
> > fix this from the previous attempt to use wfe()/sev() to make this
> > function the sleeper for resume_remote_cpus() to wake up.
> >
> > > > +     smp_mb();
> > > > +     cpumask_clear_cpu(cpu, &paused_cpus);
> > > > +}
> > > > +
> > > > +void pause_remote_cpus(void)
> > > > +{
> > > > +     cpumask_t cpus_to_pause;
> > > > +
> > > > +     lockdep_assert_cpus_held();
> > > > +     lockdep_assert_preemption_disabled();
> > > > +
> > > > +     cpumask_copy(&cpus_to_pause, cpu_online_mask);
> > > > +     cpumask_clear_cpu(smp_processor_id(), &cpus_to_pause);
> > >
> > > This bitmap is manipulated outside of your cpu_pause_lock. What
> > > guarantees you can't have two CPUs stepping on each other here?
> >
> > Do you mean cpus_to_pause? If so, that's a local bitmap.
>
> Ah yes, sorry.
>
> >
> > > > +
> > > > +     spin_lock(&cpu_pause_lock);
> > > > +
> > > > +     WARN_ON_ONCE(!cpumask_empty(&paused_cpus));
> > > > +
> > > > +     smp_cross_call(&cpus_to_pause, IPI_CPU_STOP_NMI);
> > > > +
> > > > +     while (!cpumask_equal(&cpus_to_pause, &paused_cpus))
> > > > +             cpu_relax();
> > >
> > > This can be a lot of things to compare, specially that you are
> > > explicitly mentioning large systems. Why can't this be implemented as
> > > a counter instead?
> >
> > Agreed - that'd be sufficient and simpler.
> >
> > > Overall, this looks like stop_machine() in disguise. Why can't this
> > > use the existing infrastructure?
> >
> > This came up during the previous discussion [1]. There are
> > similarities. The main concern with reusing stop_machine() (or part of
> > its current implementation) is that it may not meet the performance
> > requirements. Refactoring might be possible, however, it seems to me
> > (after checking the code again) that such a refactoring unlikely ends
> > up cleaner or simpler codebase, especially after I got rid of CPU
> > masks, as you suggested.
>
> I would have expected to see an implementation handling faults during
> the break window. IPIs are slow, virtualised extremely badly, and
> pNMIs are rarely enabled, due to the long history of issues with it.
> At this stage, I'm not sure what you have here is any better than
> stop_machine(). On the other hand, faults should be the rarest thing,

Let me measure stop_machine() and see how that works for our
production. Meanwhile, if you don't mind that we leave the IPI
approach along with the kernel PF approach on the table?

Thanks.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 2510eec026f7..cffb0cfed961 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -133,6 +133,9 @@  bool cpus_are_stuck_in_kernel(void);
 extern void crash_smp_send_stop(void);
 extern bool smp_crash_stop_failed(void);
 
+void pause_remote_cpus(void);
+void resume_remote_cpus(void);
+
 #endif /* ifndef __ASSEMBLY__ */
 
 #endif /* ifndef __ASM_SMP_H */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 3b3f6b56e733..68829c6de1b1 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -85,7 +85,12 @@  static int ipi_irq_base __ro_after_init;
 static int nr_ipi __ro_after_init = NR_IPI;
 static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
 
-static bool crash_stop;
+enum {
+	SEND_STOP = BIT(0),
+	CRASH_STOP = BIT(1),
+};
+
+static unsigned long stop_in_progress;
 
 static void ipi_setup(int cpu);
 
@@ -917,6 +922,79 @@  static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
 #endif
 }
 
+static DEFINE_SPINLOCK(cpu_pause_lock);
+static cpumask_t paused_cpus;
+static cpumask_t resumed_cpus;
+
+static void pause_local_cpu(void)
+{
+	int cpu = smp_processor_id();
+
+	cpumask_clear_cpu(cpu, &resumed_cpus);
+	/*
+	 * Paired with pause_remote_cpus() to confirm that this CPU not only
+	 * will be paused but also can be reliably resumed.
+	 */
+	smp_wmb();
+	cpumask_set_cpu(cpu, &paused_cpus);
+	/* paused_cpus must be set before waiting on resumed_cpus. */
+	barrier();
+	while (!cpumask_test_cpu(cpu, &resumed_cpus))
+		cpu_relax();
+	/* A typical example for sleep and wake-up functions. */
+	smp_mb();
+	cpumask_clear_cpu(cpu, &paused_cpus);
+}
+
+void pause_remote_cpus(void)
+{
+	cpumask_t cpus_to_pause;
+
+	lockdep_assert_cpus_held();
+	lockdep_assert_preemption_disabled();
+
+	cpumask_copy(&cpus_to_pause, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), &cpus_to_pause);
+
+	spin_lock(&cpu_pause_lock);
+
+	WARN_ON_ONCE(!cpumask_empty(&paused_cpus));
+
+	smp_cross_call(&cpus_to_pause, IPI_CPU_STOP_NMI);
+
+	while (!cpumask_equal(&cpus_to_pause, &paused_cpus))
+		cpu_relax();
+	/*
+	 * Paired with pause_local_cpu() to confirm that all CPUs not only will
+	 * be paused but also can be reliably resumed.
+	 */
+	smp_rmb();
+	WARN_ON_ONCE(cpumask_intersects(&cpus_to_pause, &resumed_cpus));
+
+	spin_unlock(&cpu_pause_lock);
+}
+
+void resume_remote_cpus(void)
+{
+	cpumask_t cpus_to_resume;
+
+	lockdep_assert_cpus_held();
+	lockdep_assert_preemption_disabled();
+
+	cpumask_copy(&cpus_to_resume, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), &cpus_to_resume);
+
+	spin_lock(&cpu_pause_lock);
+
+	cpumask_setall(&resumed_cpus);
+	/* A typical example for sleep and wake-up functions. */
+	smp_mb();
+	while (cpumask_intersects(&cpus_to_resume, &paused_cpus))
+		cpu_relax();
+
+	spin_unlock(&cpu_pause_lock);
+}
+
 static void arm64_backtrace_ipi(cpumask_t *mask)
 {
 	__ipi_send_mask(ipi_desc[IPI_CPU_BACKTRACE], mask);
@@ -970,7 +1048,9 @@  static void do_handle_IPI(int ipinr)
 
 	case IPI_CPU_STOP:
 	case IPI_CPU_STOP_NMI:
-		if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop) {
+		if (!test_bit(SEND_STOP, &stop_in_progress)) {
+			pause_local_cpu();
+		} else if (test_bit(CRASH_STOP, &stop_in_progress)) {
 			ipi_cpu_crash_stop(cpu, get_irq_regs());
 			unreachable();
 		} else {
@@ -1142,7 +1222,6 @@  static inline unsigned int num_other_online_cpus(void)
 
 void smp_send_stop(void)
 {
-	static unsigned long stop_in_progress;
 	cpumask_t mask;
 	unsigned long timeout;
 
@@ -1154,7 +1233,7 @@  void smp_send_stop(void)
 		goto skip_ipi;
 
 	/* Only proceed if this is the first CPU to reach this code */
-	if (test_and_set_bit(0, &stop_in_progress))
+	if (test_and_set_bit(SEND_STOP, &stop_in_progress))
 		return;
 
 	/*
@@ -1230,12 +1309,11 @@  void crash_smp_send_stop(void)
 	 * This function can be called twice in panic path, but obviously
 	 * we execute this only once.
 	 *
-	 * We use this same boolean to tell whether the IPI we send was a
+	 * We use the CRASH_STOP bit to tell whether the IPI we send was a
 	 * stop or a "crash stop".
 	 */
-	if (crash_stop)
+	if (test_and_set_bit(CRASH_STOP, &stop_in_progress))
 		return;
-	crash_stop = 1;
 
 	smp_send_stop();