Message ID | 20230508213147.853677542@infradead.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | local_clock() vs noinstr | expand |
On Mon, May 08, 2023 at 11:19:58PM +0200, Peter Zijlstra wrote: > --- a/drivers/clocksource/hyperv_timer.c > +++ b/drivers/clocksource/hyperv_timer.c > @@ -408,9 +408,9 @@ static u64 notrace read_hv_clock_tsc_cs( > return read_hv_clock_tsc(); > } > > -static u64 notrace read_hv_sched_clock_tsc(void) > +static u64 noinstr read_hv_sched_clock_tsc(void) > { > - return (read_hv_clock_tsc() - hv_sched_clock_offset) * > + return (hv_read_tsc_page(hv_get_tsc_page()) - hv_sched_clock_offset) * > (NSEC_PER_SEC / HV_CLOCK_HZ); > } > > --- a/include/clocksource/hyperv_timer.h > +++ b/include/clocksource/hyperv_timer.h > @@ -38,7 +38,7 @@ extern void hv_remap_tsc_clocksource(voi > extern unsigned long hv_get_tsc_pfn(void); > extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); > > -static inline notrace u64 > +static __always_inline notrace u64 > hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) > { > u64 scale, offset; > @@ -85,7 +85,7 @@ hv_read_tsc_page_tsc(const struct ms_hyp > return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; > } > > -static inline notrace u64 > +static __always_inline notrace u64 > hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) > { > u64 cur_tsc; Hyper-V folks! While reviewing all this I found the following 'gem': hv_init_clocksource() hv_setup_sched_clock() paravirt_set_sched_clock(read_hv_sched_clock_msr) read_hv_sched_clock_msr() [notrace] read_hv_clock_msr() [notrace] hv_get_register() *traced* hv_get_non_nested_register() ... hv_ghcb_msr_read() WARN_ON(in_nmi()) ... local_irq_save() Note that: a) sched_clock() is used in NMI context a *LOT* b) sched_clock() is notrace (or even noinstr with these patches) and local_irq_save() implies tracing Can you pretty please: 1) delete all this; or, 2) fix it in a hurry? Thanks!
On Mon, May 08, 2023 at 11:44:19PM +0200, Peter Zijlstra wrote: > On Mon, May 08, 2023 at 11:19:58PM +0200, Peter Zijlstra wrote: > > > --- a/drivers/clocksource/hyperv_timer.c > > +++ b/drivers/clocksource/hyperv_timer.c > > @@ -408,9 +408,9 @@ static u64 notrace read_hv_clock_tsc_cs( > > return read_hv_clock_tsc(); > > } > > > > -static u64 notrace read_hv_sched_clock_tsc(void) > > +static u64 noinstr read_hv_sched_clock_tsc(void) > > { > > - return (read_hv_clock_tsc() - hv_sched_clock_offset) * > > + return (hv_read_tsc_page(hv_get_tsc_page()) - hv_sched_clock_offset) * > > (NSEC_PER_SEC / HV_CLOCK_HZ); > > } > > > > --- a/include/clocksource/hyperv_timer.h > > +++ b/include/clocksource/hyperv_timer.h > > @@ -38,7 +38,7 @@ extern void hv_remap_tsc_clocksource(voi > > extern unsigned long hv_get_tsc_pfn(void); > > extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); > > > > -static inline notrace u64 > > +static __always_inline notrace u64 > > hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) > > { > > u64 scale, offset; > > @@ -85,7 +85,7 @@ hv_read_tsc_page_tsc(const struct ms_hyp > > return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; > > } > > > > -static inline notrace u64 > > +static __always_inline notrace u64 > > hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) > > { > > u64 cur_tsc; > > Hyper-V folks! > > While reviewing all this I found the following 'gem': > > hv_init_clocksource() > hv_setup_sched_clock() > paravirt_set_sched_clock(read_hv_sched_clock_msr) > > read_hv_sched_clock_msr() [notrace] > read_hv_clock_msr() [notrace] > hv_get_register() *traced* > hv_get_non_nested_register() ... > hv_ghcb_msr_read() > WARN_ON(in_nmi()) > ... > local_irq_save() > > > Note that: > > a) sched_clock() is used in NMI context a *LOT* > b) sched_clock() is notrace (or even noinstr with these patches) > and local_irq_save() implies tracing > Tianyu and Michael, what's your thought on this? Is the MSR-based GHCB usable at this point? What other clock source can be used? Thanks, Wei. > > Can you pretty please: > > 1) delete all this; or, > 2) fix it in a hurry? > > Thanks!
On Mon, May 08, 2023 at 11:30:43PM +0000, Wei Liu wrote: > On Mon, May 08, 2023 at 11:44:19PM +0200, Peter Zijlstra wrote: > > On Mon, May 08, 2023 at 11:19:58PM +0200, Peter Zijlstra wrote: > > > > > --- a/drivers/clocksource/hyperv_timer.c > > > +++ b/drivers/clocksource/hyperv_timer.c > > > @@ -408,9 +408,9 @@ static u64 notrace read_hv_clock_tsc_cs( > > > return read_hv_clock_tsc(); > > > } > > > > > > -static u64 notrace read_hv_sched_clock_tsc(void) > > > +static u64 noinstr read_hv_sched_clock_tsc(void) > > > { > > > - return (read_hv_clock_tsc() - hv_sched_clock_offset) * > > > + return (hv_read_tsc_page(hv_get_tsc_page()) - hv_sched_clock_offset) * > > > (NSEC_PER_SEC / HV_CLOCK_HZ); > > > } > > > > > > --- a/include/clocksource/hyperv_timer.h > > > +++ b/include/clocksource/hyperv_timer.h > > > @@ -38,7 +38,7 @@ extern void hv_remap_tsc_clocksource(voi > > > extern unsigned long hv_get_tsc_pfn(void); > > > extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); > > > > > > -static inline notrace u64 > > > +static __always_inline notrace u64 > > > hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) > > > { > > > u64 scale, offset; > > > @@ -85,7 +85,7 @@ hv_read_tsc_page_tsc(const struct ms_hyp > > > return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; > > > } > > > > > > -static inline notrace u64 > > > +static __always_inline notrace u64 > > > hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) > > > { > > > u64 cur_tsc; > > > > Hyper-V folks! > > > > While reviewing all this I found the following 'gem': > > > > hv_init_clocksource() > > hv_setup_sched_clock() > > paravirt_set_sched_clock(read_hv_sched_clock_msr) > > > > read_hv_sched_clock_msr() [notrace] > > read_hv_clock_msr() [notrace] > > hv_get_register() *traced* > > hv_get_non_nested_register() ... > > hv_ghcb_msr_read() > > WARN_ON(in_nmi()) > > ... > > local_irq_save() > > > > > > Note that: > > > > a) sched_clock() is used in NMI context a *LOT* > > b) sched_clock() is notrace (or even noinstr with these patches) > > and local_irq_save() implies tracing > > > > Tianyu and Michael, what's your thought on this? > > Is the MSR-based GHCB usable at this point? > > What other clock source can be used? You do have TSC support -- which is what I fixed for you. It's just the whole MSR thing that is comically broken. You could do a read_hv_clock_msr() implementation using __rdmsr() and add some sanity checking that anything GHCB using (SEV?) *will* use TSC. Anyway, will you guys do that, or should I pull out the chainsaw and fix it for you?
From: Peter Zijlstra <peterz@infradead.org> Sent: Thursday, May 11, 2023 1:24 PM > > On Mon, May 08, 2023 at 11:30:43PM +0000, Wei Liu wrote: > > On Mon, May 08, 2023 at 11:44:19PM +0200, Peter Zijlstra wrote: > > > On Mon, May 08, 2023 at 11:19:58PM +0200, Peter Zijlstra wrote: > > > > > > > --- a/drivers/clocksource/hyperv_timer.c > > > > +++ b/drivers/clocksource/hyperv_timer.c > > > > @@ -408,9 +408,9 @@ static u64 notrace read_hv_clock_tsc_cs( > > > > return read_hv_clock_tsc(); > > > > } > > > > > > > > -static u64 notrace read_hv_sched_clock_tsc(void) > > > > +static u64 noinstr read_hv_sched_clock_tsc(void) > > > > { > > > > - return (read_hv_clock_tsc() - hv_sched_clock_offset) * > > > > + return (hv_read_tsc_page(hv_get_tsc_page()) - hv_sched_clock_offset) * > > > > (NSEC_PER_SEC / HV_CLOCK_HZ); > > > > } > > > > > > > > --- a/include/clocksource/hyperv_timer.h > > > > +++ b/include/clocksource/hyperv_timer.h > > > > @@ -38,7 +38,7 @@ extern void hv_remap_tsc_clocksource(voi > > > > extern unsigned long hv_get_tsc_pfn(void); > > > > extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); > > > > > > > > -static inline notrace u64 > > > > +static __always_inline notrace u64 > > > > hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) > > > > { > > > > u64 scale, offset; > > > > @@ -85,7 +85,7 @@ hv_read_tsc_page_tsc(const struct ms_hyp > > > > return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; > > > > } > > > > > > > > -static inline notrace u64 > > > > +static __always_inline notrace u64 > > > > hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) > > > > { > > > > u64 cur_tsc; > > > > > > Hyper-V folks! > > > > > > While reviewing all this I found the following 'gem': > > > > > > hv_init_clocksource() > > > hv_setup_sched_clock() > > > paravirt_set_sched_clock(read_hv_sched_clock_msr) > > > > > > read_hv_sched_clock_msr() [notrace] > > > read_hv_clock_msr() [notrace] > > > hv_get_register() *traced* > > > hv_get_non_nested_register() ... > > > hv_ghcb_msr_read() > > > WARN_ON(in_nmi()) > > > ... > > > local_irq_save() > > > > > > > > > Note that: > > > > > > a) sched_clock() is used in NMI context a *LOT* > > > b) sched_clock() is notrace (or even noinstr with these patches) > > > and local_irq_save() implies tracing > > > > > > > Tianyu and Michael, what's your thought on this? > > > > Is the MSR-based GHCB usable at this point? > > > > What other clock source can be used? > > You do have TSC support -- which is what I fixed for you. It's just the > whole MSR thing that is comically broken. > > You could do a read_hv_clock_msr() implementation using > __rdmsr() and add some sanity checking that anything GHCB using (SEV?) > *will* use TSC. > > Anyway, will you guys do that, or should I pull out the chainsaw and fix > it for you? Peter -- I'll work on a fix. But it will be the first half of next week before I can do it. Michael
On Thu, May 11, 2023 at 11:11:07PM +0000, Michael Kelley (LINUX) wrote: > From: Peter Zijlstra <peterz@infradead.org> Sent: Thursday, May 11, 2023 1:24 PM > > > Tianyu and Michael, what's your thought on this? > > > > > > Is the MSR-based GHCB usable at this point? > > > > > > What other clock source can be used? > > > > You do have TSC support -- which is what I fixed for you. It's just the > > whole MSR thing that is comically broken. > > > > You could do a read_hv_clock_msr() implementation using > > __rdmsr() and add some sanity checking that anything GHCB using (SEV?) > > *will* use TSC. > > > > Anyway, will you guys do that, or should I pull out the chainsaw and fix > > it for you? > > Peter -- I'll work on a fix. But it will be the first half of next week before > I can do it. OK, Thanks!
From: Peter Zijlstra <peterz@infradead.org> Sent: Monday, May 8, 2023 2:44 PM > > On Mon, May 08, 2023 at 11:19:58PM +0200, Peter Zijlstra wrote: > > > --- a/drivers/clocksource/hyperv_timer.c > > +++ b/drivers/clocksource/hyperv_timer.c > > @@ -408,9 +408,9 @@ static u64 notrace read_hv_clock_tsc_cs( > > return read_hv_clock_tsc(); > > } > > > > -static u64 notrace read_hv_sched_clock_tsc(void) > > +static u64 noinstr read_hv_sched_clock_tsc(void) > > { > > - return (read_hv_clock_tsc() - hv_sched_clock_offset) * > > + return (hv_read_tsc_page(hv_get_tsc_page()) - hv_sched_clock_offset) * > > (NSEC_PER_SEC / HV_CLOCK_HZ); > > } > > > > --- a/include/clocksource/hyperv_timer.h > > +++ b/include/clocksource/hyperv_timer.h > > @@ -38,7 +38,7 @@ extern void hv_remap_tsc_clocksource(voi > > extern unsigned long hv_get_tsc_pfn(void); > > extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); > > > > -static inline notrace u64 > > +static __always_inline notrace u64 > > hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) > > { > > u64 scale, offset; > > @@ -85,7 +85,7 @@ hv_read_tsc_page_tsc(const struct ms_hyp > > return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; > > } > > > > -static inline notrace u64 > > +static __always_inline notrace u64 > > hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) > > { > > u64 cur_tsc; > > Hyper-V folks! > > While reviewing all this I found the following 'gem': > > hv_init_clocksource() > hv_setup_sched_clock() > paravirt_set_sched_clock(read_hv_sched_clock_msr) > > read_hv_sched_clock_msr() [notrace] > read_hv_clock_msr() [notrace] > hv_get_register() *traced* > hv_get_non_nested_register() ... > hv_ghcb_msr_read() > WARN_ON(in_nmi()) > ... > local_irq_save() > > > Note that: > > a) sched_clock() is used in NMI context a *LOT* > b) sched_clock() is notrace (or even noinstr with these patches) > and local_irq_save() implies tracing > > > Can you pretty please: > > 1) delete all this; or, > 2) fix it in a hurry? > Peter -- I've sent you an RFC patch to incorporate into your broader patch set. I think it probably makes sense for all the Hyper-V stuff to be a separate patch. I haven't previously worked with the details of notrace vs. noinstr, but I followed the patterns elsewhere in patch set. Please review to see if it seems correct. One thing: In the cases where I added __always_inline, I dropped any notrace or noinstr annotations. I presume such code always takes on the attributes of the caller. If that's not correct, let me know. Michael
On Wed, May 17, 2023 at 02:26:35AM +0000, Michael Kelley (LINUX) wrote: > Peter -- I've sent you an RFC patch to incorporate into your broader > patch set. I think it probably makes sense for all the Hyper-V > stuff to be a separate patch. Perhaps, it's not that much. > I haven't previously worked with the details of notrace vs. noinstr, > but I followed the patterns elsewhere in patch set. Please review > to see if it seems correct. notrace inhibits the "call __fentry__" at the start of the symbol. The __fentry__ call is mostly for ftrace, there's a few sites where inhibiting tracing is critical -- stuff that happens before the ftrace recursion handling, but mostly it's about performance these days, constantly hitting the recusion code isn't very good. noinstr inhibits any and all compiler generated 'extra' -- it is for the C as a portable assembler usage. This very much includes notrace, but it also covers all the *SAN nonsense. Basically, if it does not directly reflect the code as written, it shouldn't be emitted. Additionally, and for validation purposes, it also ensures all these symbols end up in a special text section. But yeah, you seem to have gotten it right. > One thing: In the cases where I added __always_inline, I dropped > any notrace or noinstr annotations. I presume such code always > takes on the attributes of the caller. If that's not correct, let me know. Correct; noinstr actually has an explicit noinline because compilers suck :/
--- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -71,7 +71,7 @@ static int kvm_set_wallclock(const struc return -ENODEV; } -static noinstr u64 kvm_clock_read(void) +static u64 kvm_clock_read(void) { u64 ret; @@ -88,7 +88,7 @@ static u64 kvm_clock_get_cycles(struct c static noinstr u64 kvm_sched_clock_read(void) { - return kvm_clock_read() - kvm_sched_clock_offset; + return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset; } static inline void kvm_sched_clock_init(bool stable) --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -69,12 +69,10 @@ static int __init tsc_early_khz_setup(ch } early_param("tsc_early_khz", tsc_early_khz_setup); -__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +__always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; - preempt_disable_notrace(); - do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; @@ -86,6 +84,12 @@ __always_inline void cyc2ns_read_begin(s } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } +__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +{ + preempt_disable_notrace(); + __cyc2ns_read(data); +} + __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); @@ -115,18 +119,25 @@ __always_inline void cyc2ns_read_end(voi * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; - cyc2ns_read_begin(&data); + __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - cyc2ns_read_end(); + return ns; +} +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = __cycles_2_ns(cyc); + preempt_enable_notrace(); return ns; } @@ -223,7 +234,7 @@ noinstr u64 native_sched_clock(void) u64 tsc_now = rdtsc(); /* return the value in ns */ - return cycles_2_ns(tsc_now); + return __cycles_2_ns(tsc_now); } /* @@ -250,7 +261,7 @@ u64 native_sched_clock_from_tsc(u64 tsc) /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT -noinstr u64 sched_clock(void) +noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } @@ -260,11 +271,20 @@ bool using_native_sched_clock(void) return static_call_query(pv_sched_clock) == native_sched_clock; } #else -u64 sched_clock(void) __attribute__((alias("native_sched_clock"))); +u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif +notrace u64 sched_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = sched_clock_noinstr(); + preempt_enable_notrace(); + return now; +} + int check_tsc_unstable(void) { return tsc_unstable; --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -66,11 +66,10 @@ static noinstr u64 xen_sched_clock(void) struct pvclock_vcpu_time_info *src; u64 ret; - preempt_disable_notrace(); src = &__this_cpu_read(xen_vcpu)->time; ret = pvclock_clocksource_read_nowd(src); ret -= xen_sched_clock_offset; - preempt_enable_notrace(); + return ret; } --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -408,9 +408,9 @@ static u64 notrace read_hv_clock_tsc_cs( return read_hv_clock_tsc(); } -static u64 notrace read_hv_sched_clock_tsc(void) +static u64 noinstr read_hv_sched_clock_tsc(void) { - return (read_hv_clock_tsc() - hv_sched_clock_offset) * + return (hv_read_tsc_page(hv_get_tsc_page()) - hv_sched_clock_offset) * (NSEC_PER_SEC / HV_CLOCK_HZ); } --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -38,7 +38,7 @@ extern void hv_remap_tsc_clocksource(voi extern unsigned long hv_get_tsc_pfn(void); extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); -static inline notrace u64 +static __always_inline notrace u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) { u64 scale, offset; @@ -85,7 +85,7 @@ hv_read_tsc_page_tsc(const struct ms_hyp return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; } -static inline notrace u64 +static __always_inline notrace u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) { u64 cur_tsc;
With the intent to provide local_clock_noinstr(), a variant of local_clock() that's safe to be called from noinstr code (with the assumption that any such code will already be non-preemptible), prepare for things by providing a noinstr sched_clock_noinstr() function. Specifically, preempt_enable_*() calls out to schedule(), which upsets noinstr validation efforts. vmlinux.o: warning: objtool: native_sched_clock+0x96: call to preempt_schedule_notrace_thunk() leaves .noinstr.text section vmlinux.o: warning: objtool: kvm_clock_read+0x22: call to preempt_schedule_notrace_thunk() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> --- arch/x86/kernel/kvmclock.c | 4 +-- arch/x86/kernel/tsc.c | 38 ++++++++++++++++++++++++++++--------- arch/x86/xen/time.c | 3 -- drivers/clocksource/hyperv_timer.c | 4 +-- include/clocksource/hyperv_timer.h | 4 +-- 5 files changed, 36 insertions(+), 17 deletions(-)