
[v4,30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe

Message ID 4ea47a9238cb726614f36a0aad2a545816442e57.1737511963.git.jpoimboe@kernel.org (mailing list archive)
State New
Series unwind, perf: sframe user space unwinding

Commit Message

Josh Poimboeuf Jan. 22, 2025, 2:31 a.m. UTC
Make unwind_deferred_request() NMI-safe so tracers in NMI context can
call it to get the cookie immediately rather than have to do the fragile
"schedule irq work and then call unwind_deferred_request()" dance.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/entry-common.h          |   1 +
 include/linux/unwind_deferred.h       |   6 ++
 include/linux/unwind_deferred_types.h |   1 +
 kernel/unwind/deferred.c              | 106 ++++++++++++++++++++++----
 4 files changed, 98 insertions(+), 16 deletions(-)
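
For illustration, a rough sketch (not part of this patch; the tracer-side names are hypothetical) of the fragile "schedule irq work" dance the commit message refers to, which this change makes unnecessary for NMI-context callers:

static struct unwind_work tracer_uw;	/* assumed tracer-side state */

static void tracer_iw_fn(struct irq_work *iw)
{
	u64 cookie;

	/* Now in IRQ context, finally allowed to request the deferred unwind... */
	unwind_deferred_request(&tracer_uw, &cookie);
	/* ...but the NMI sample was already emitted without the cookie. */
}

static struct irq_work tracer_iw = IRQ_WORK_INIT(tracer_iw_fn);

static void tracer_nmi_handler(void)
{
	irq_work_queue(&tracer_iw);	/* the cookie is not available here */
}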

Comments

Peter Zijlstra Jan. 22, 2025, 2:15 p.m. UTC | #1
On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:

> diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> index 2f38055cce48..939c94abaa50 100644
> --- a/kernel/unwind/deferred.c
> +++ b/kernel/unwind/deferred.c
> @@ -29,27 +29,49 @@ static u64 ctx_to_cookie(u64 cpu, u64 ctx)
>  
>  /*
>   * Read the task context cookie, first initializing it if this is the first
> - * call to get_cookie() since the most recent entry from user.
> + * call to get_cookie() since the most recent entry from user.  This has to be
> + * done carefully to coordinate with unwind_deferred_request_nmi().
>   */
>  static u64 get_cookie(struct unwind_task_info *info)
>  {
>  	u64 ctx_ctr;
>  	u64 cookie;
> -	u64 cpu;
>  
>  	guard(irqsave)();
>  
> -	cookie = info->cookie;
> +	cookie = READ_ONCE(info->cookie);
>  	if (cookie)
>  		return cookie;
>  
> +	ctx_ctr = __this_cpu_read(unwind_ctx_ctr);
>  
> -	cpu = raw_smp_processor_id();
> -	ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
> -	info->cookie = ctx_to_cookie(cpu, ctx_ctr);
> +	/* Read ctx_ctr before info->nmi_cookie */
> +	barrier();
> +
> +	cookie = READ_ONCE(info->nmi_cookie);
> +	if (cookie) {
> +		/*
> +		 * This is the first call to get_cookie() since an NMI handler
> +		 * first wrote it to info->nmi_cookie.  Sync it.
> +		 */
> +		WRITE_ONCE(info->cookie, cookie);
> +		WRITE_ONCE(info->nmi_cookie, 0);
> +		return cookie;
> +	}
> +
> +	/*
> +	 * Write info->cookie.  It's ok to race with an NMI here.  The value of
> +	 * the cookie is based on ctx_ctr from before the NMI could have
> +	 * incremented it.  The result will be the same even if cookie or
> +	 * ctx_ctr end up getting written twice.
> +	 */
> +	cookie = ctx_to_cookie(raw_smp_processor_id(), ctx_ctr + 1);
> +	WRITE_ONCE(info->cookie, cookie);
> +	WRITE_ONCE(info->nmi_cookie, 0);
> +	barrier();
> +	__this_cpu_write(unwind_ctx_ctr, ctx_ctr + 1);
>  
>  	return cookie;
> -
>  }

Oh gawd. Can we please do something simple like:

	guard(irqsave)();
	cpu = raw_smp_processor_id();
	ctr = __this_cpu_read(unwind_ctx_cnt);
	cookie = READ_ONCE(current->unwind_info.cookie);
	do {
		if (cookie)
			return cookie;
		cookie = ctx_to_cookie(cpu, ctr+1);
	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
	__this_cpu_write(unwind_ctx_ctr, ctr+1);
	return cookie;
Peter Zijlstra Jan. 22, 2025, 2:24 p.m. UTC | #2
On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> +static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
> +{
> +	struct unwind_task_info *info = &current->unwind_info;
> +	bool inited_cookie = false;
> +	int ret;
> +
> +	*cookie = info->cookie;
> +	if (!*cookie) {
> +		/*
> +		 * This is the first unwind request since the most recent entry
> +		 * from user.  Initialize the task cookie.
> +		 *
> +		 * Don't write to info->cookie directly, otherwise it may get
> +		 * cleared if the NMI occurred in the kernel during early entry
> +		 * or late exit before the task work gets to run.  Instead, use
> +		 * info->nmi_cookie which gets synced later by get_cookie().
> +		 */
> +		if (!info->nmi_cookie) {
> +			u64 cpu = raw_smp_processor_id();
> +			u64 ctx_ctr;
> +
> +			ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);

__this_cpu_inc_return() is *NOT* NMI safe IIRC.
Josh Poimboeuf Jan. 22, 2025, 10:49 p.m. UTC | #3
On Wed, Jan 22, 2025 at 03:15:05PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> Oh gawd. Can we please do something simple like:
> 
> 	guard(irqsave)();
> 	cpu = raw_smp_processor_id();
> 	ctr = __this_cpu_read(unwind_ctx_cnt);

Don't you need a compiler barrier here?  __this_cpu_read() doesn't have
one.

> 	cookie = READ_ONCE(current->unwind_info.cookie);
> 	do {
> 		if (cookie)
> 			return cookie;
> 		cookie = ctx_to_cookie(cpu, ctr+1);
> 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
> 	__this_cpu_write(unwind_ctx_ctr, ctr+1);
> 	return cookie;

I was trying to avoid the overhead of the cmpxchg.

But also, the nmi_cookie is still needed for the case where the NMI
arrives before info->cookie gets cleared by early entry-from-user.
Josh Poimboeuf Jan. 22, 2025, 10:52 p.m. UTC | #4
On Wed, Jan 22, 2025 at 03:24:18PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> > +static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
> > +{
> > +	struct unwind_task_info *info = &current->unwind_info;
> > +	bool inited_cookie = false;
> > +	int ret;
> > +
> > +	*cookie = info->cookie;
> > +	if (!*cookie) {
> > +		/*
> > +		 * This is the first unwind request since the most recent entry
> > +		 * from user.  Initialize the task cookie.
> > +		 *
> > +		 * Don't write to info->cookie directly, otherwise it may get
> > +		 * cleared if the NMI occurred in the kernel during early entry
> > +		 * or late exit before the task work gets to run.  Instead, use
> > +		 * info->nmi_cookie which gets synced later by get_cookie().
> > +		 */
> > +		if (!info->nmi_cookie) {
> > +			u64 cpu = raw_smp_processor_id();
> > +			u64 ctx_ctr;
> > +
> > +			ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
> 
> __this_cpu_inc_return() is *NOT* NMI safe IIRC.

Hm, I guess I was only looking at x86.
Peter Zijlstra Jan. 23, 2025, 8:40 a.m. UTC | #5
On Wed, Jan 22, 2025 at 02:49:02PM -0800, Josh Poimboeuf wrote:
> On Wed, Jan 22, 2025 at 03:15:05PM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> > Oh gawd. Can we please do something simple like:
> > 
> > 	guard(irqsave)();
> > 	cpu = raw_smp_processor_id();
> > 	ctr = __this_cpu_read(unwind_ctx_cnt);
> 
> Don't you need a compiler barrier here?  __this_cpu_read() doesn't have
> one.

What for?

> > 	cookie = READ_ONCE(current->unwind_info.cookie);
> > 	do {
> > 		if (cookie)
> > 			return cookie;
> > 		cookie = ctx_to_cookie(cpu, ctr+1);
> > 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
> > 	__this_cpu_write(unwind_ctx_ctr, ctr+1);
> > 	return cookie;
> 
> I was trying to avoid the overhead of the cmpxchg.

We're going to be doing userspace stack unwinding, I don't think
overhead is a real concern.

> But also, the nmi_cookie is still needed for the case where the NMI
> arrives before info->cookie gets cleared by early entry-from-user.

So how about we clear cookie (and set nr_entries to -1) at
return-to-user, after we've done the work loop and have interrupts
disabled until we hit userspace.

Any NMI that hits there will have to cause another entry anyway.
Peter Zijlstra Jan. 23, 2025, 8:42 a.m. UTC | #6
On Wed, Jan 22, 2025 at 02:52:57PM -0800, Josh Poimboeuf wrote:
> On Wed, Jan 22, 2025 at 03:24:18PM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> > > +static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
> > > +{
> > > +	struct unwind_task_info *info = &current->unwind_info;
> > > +	bool inited_cookie = false;
> > > +	int ret;
> > > +
> > > +	*cookie = info->cookie;
> > > +	if (!*cookie) {
> > > +		/*
> > > +		 * This is the first unwind request since the most recent entry
> > > +		 * from user.  Initialize the task cookie.
> > > +		 *
> > > +		 * Don't write to info->cookie directly, otherwise it may get
> > > +		 * cleared if the NMI occurred in the kernel during early entry
> > > +		 * or late exit before the task work gets to run.  Instead, use
> > > +		 * info->nmi_cookie which gets synced later by get_cookie().
> > > +		 */
> > > +		if (!info->nmi_cookie) {
> > > +			u64 cpu = raw_smp_processor_id();
> > > +			u64 ctx_ctr;
> > > +
> > > +			ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
> > 
> > __this_cpu_inc_return() is *NOT* NMI safe IIRC.
> 
> Hm, I guess I was only looking at x86.

:-), right so x86 is rather special here, the various RISC platforms are
what you should be looking at, eg. arm64.
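
To make the non-x86 point concrete (a sketch of my own, not from the thread): on architectures without single-instruction per-CPU ops, __this_cpu_inc_return() compiles to a plain load/add/store, so it cannot coordinate with NMIs touching the same counter; an NMI landing inside an interrupted read-modify-write loses an increment:

    // task or IRQ context
    load  unwind_ctx_ctr -> tmp                  // tmp = 5
    <NMI>
        __this_cpu_inc_return(unwind_ctx_ctr)    // ctr = 6, cookie 6 handed out
    </NMI>
    store tmp + 1 -> unwind_ctx_ctr              // ctr = 6 again: the NMI's
                                                 // increment is lost and cookie 6
                                                 // can be reused

On x86 the whole increment is a single gs-prefixed instruction, which is why it looked safe when only considering that architecture.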
Josh Poimboeuf Jan. 23, 2025, 7:48 p.m. UTC | #7
On Thu, Jan 23, 2025 at 09:40:26AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 22, 2025 at 02:49:02PM -0800, Josh Poimboeuf wrote:
> > On Wed, Jan 22, 2025 at 03:15:05PM +0100, Peter Zijlstra wrote:
> > > On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> > > Oh gawd. Can we please do something simple like:
> > > 
> > > 	guard(irqsave)();
> > > 	cpu = raw_smp_processor_id();
> > > 	ctr = __this_cpu_read(unwind_ctx_cnt);
> > 
> > Don't you need a compiler barrier here?  __this_cpu_read() doesn't have
> > one.
> 
> What for?

Hm, I guess it's not needed for this one.

> > > 	cookie = READ_ONCE(current->unwind_info.cookie);
> > > 	do {
> > > 		if (cookie)
> > > 			return cookie;
> > > 		cookie = ctx_to_cookie(cpu, ctr+1);
> > > 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));

Should not the 2nd argument be &zero?

> > > 	__this_cpu_write(unwind_ctx_ctr, ctr+1);
> > > 	return cookie;
> > But also, the nmi_cookie is still needed for the case where the NMI
> > arrives before info->cookie gets cleared by early entry-from-user.
> 
> So how about we clear cookie (and set nr_entries to -1) at

I think we could set nr_entries to 0 instead of -1?

> return-to-user, after we've done the work loop and have interrupts
> disabled until we hit userspace.
>
> Any NMI that hits there will have to cause another entry anyway.

But there's a cookie mismatch:

    // return-to-user: IRQs disabled
    <NMI>
	current->unwind_info.cookie = 0x1234
    </NMI>
    unwind_exit_to_user_mode()
	current->unwind_info.cookie = 0
    IRET
<IRQ>
    task_work()
        callback(@cookie=WRONG)
Josh Poimboeuf Jan. 23, 2025, 7:54 p.m. UTC | #8
On Thu, Jan 23, 2025 at 11:48:10AM -0800, Josh Poimboeuf wrote:
> On Thu, Jan 23, 2025 at 09:40:26AM +0100, Peter Zijlstra wrote:
> > On Wed, Jan 22, 2025 at 02:49:02PM -0800, Josh Poimboeuf wrote:
> > > But also, the nmi_cookie is still needed for the case where the NMI
> > > arrives before info->cookie gets cleared by early entry-from-user.
> > 
> > So how about we clear cookie (and set nr_entries to -1) at
> 
> I think we could set nr_entries to 0 instead of -1?
> 
> > return-to-user, after we've done the work loop and have interrupts
> > disabled until we hit userspace.
> >
> > Any NMI that hits there will have to cause another entry anyway.
> 
> But there's a cookie mismatch:
> 
>     // return-to-user: IRQs disabled
>     <NMI>
> 	current->unwind_info.cookie = 0x1234
>     </NMI>
>     unwind_exit_to_user_mode()
> 	current->unwind_info.cookie = 0
>     IRET
> <IRQ>
>     task_work()
>         callback(@cookie=WRONG)

Though, assuming we're keeping the unwind_work struct, there's a simpler
alternative to nmi_cookie: store the cookie in the unwind_work.  Then
the task work can just use that instead of current->unwind_info.cookie.
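
A minimal sketch of that alternative (field layout is my assumption based on how the series uses unwind_work elsewhere on this page):

struct unwind_work {
	struct callback_head	work;
	/* ... callback pointer etc. as before ... */
	u64			cookie;		/* replaces 'pending'; 0 means unclaimed */
};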
Peter Zijlstra Jan. 23, 2025, 10:17 p.m. UTC | #9
On Thu, Jan 23, 2025 at 11:48:07AM -0800, Josh Poimboeuf wrote:
> > > > 	cookie = READ_ONCE(current->unwind_info.cookie);
> > > > 	do {
> > > > 		if (cookie)
> > > > 			return cookie;
> > > > 		cookie = ctx_to_cookie(cpu, ctr+1);
> > > > 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
> 
> Should not the 2nd argument be &zero?

This I suppose

	cookie = READ_ONCE(current->unwind_info.cookie);
	do {
		if (cookie)
			return cookie;
	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie,
				ctx_to_cookie(cpu, ctr+1)));
Josh Poimboeuf Jan. 23, 2025, 11:34 p.m. UTC | #10
On Thu, Jan 23, 2025 at 11:17:34PM +0100, Peter Zijlstra wrote:
> On Thu, Jan 23, 2025 at 11:48:07AM -0800, Josh Poimboeuf wrote:
> > > > > 	cookie = READ_ONCE(current->unwind_info.cookie);
> > > > > 	do {
> > > > > 		if (cookie)
> > > > > 			return cookie;
> > > > > 		cookie = ctx_to_cookie(cpu, ctr+1);
> > > > > 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
> > 
> > Should not the 2nd argument be &zero?
> 
> This I suppose
> 
> 	cookie = READ_ONCE(current->unwind_info.cookie);
> 	do {
> 		if (cookie)
> 			return cookie;
> 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie,
> 				ctx_to_cookie(cpu, ctr+1)));

Here's what I have at the moment.  unwind_deferred_request() is vastly
simplified thanks to your suggestions, no more NMI-specific muck.
work->pending is replaced with work->cookie so the task work can use it
in case it crossed the cookie-clearing boundary (return-to-user with
IRQs disabled).

static inline bool nmi_supported(void)
{
	return IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG);
}

/*
 * Read the task context cookie, initializing it if this is the first request
 * since the most recent entry from user.
 */
static u64 get_cookie(void)
{
	struct unwind_task_info *info = &current->unwind_info;
	u64 ctr, cookie, old_cookie = 0;
	unsigned int cpu;

	cookie = READ_ONCE(info->cookie);
	if (cookie)
		return cookie;

	guard(irqsave)();

	cpu = raw_smp_processor_id();
	ctr = __this_cpu_read(unwind_ctx_ctr);

	cookie = ctx_to_cookie(cpu, ++ctr);

	/* Make sure the cookie is non-zero. */
	if (!cookie)
		cookie = ctx_to_cookie(cpu, ++ctr);

	/* Write the task cookie unless an NMI just now swooped in to do so. */
	if (!try_cmpxchg64(&info->cookie, &old_cookie, cookie))
		return old_cookie;

	/* Success, update the context counter. */
	__this_cpu_write(unwind_ctx_ctr, ctr);

	return cookie;
}

/*
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned cookie output is a unique identifier for the current task entry
 * context.  Its value will also be passed to the callback function.  It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context.  Each call will return the same cookie.  If the
 * callback is already pending, an error will be returned along with the
 * cookie.  If the callback is not pending because it has already been
 * previously called for the same entry context, it will be called again with
 * the same stack trace and cookie.
 *
 * There are thus three possible return scenarios:
 *
 *   * return != 0, *cookie == 0: the operation failed, no pending callback.
 *
 *   * return != 0, *cookie != 0: the callback is already pending. The cookie
 *     can still be used to correlate with the pending callback.
 *
 *   * return == 0, *cookie != 0: the callback queued successfully.  The
 *     callback is guaranteed to be called with the given cookie.
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	enum task_work_notify_mode mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
	u64 zero = 0;
	int ret;

	*cookie = 0;

	if (!current->mm || !user_mode(task_pt_regs(current)))
		return -EINVAL;

	if (!nmi_supported() && in_nmi())
		return -EINVAL;

	/* Return a valid @cookie even if the callback is already pending. */
	*cookie = READ_ONCE(work->cookie);
	if (*cookie)
		return -EEXIST;

	*cookie = get_cookie();

	/* Claim the work unless an NMI just now swooped in to do so. */
	if (!try_cmpxchg64(&work->cookie, &zero, *cookie))
		return -EEXIST;

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &work->work, mode);
	if (WARN_ON_ONCE(ret)) {
		WRITE_ONCE(work->cookie, 0);
		return ret;
	}

	return 0;
}
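
And a hedged sketch (my reading of the above, not posted in the thread; the trace type name is taken from the series as I understand it) of the task work side consuming work->cookie instead of current->unwind_info.cookie, so a cookie set by an NMI that raced with the clearing at return-to-user still reaches the callback:

static void unwind_deferred_task_work(struct callback_head *head)
{
	struct unwind_work *work = container_of(head, struct unwind_work, work);
	struct unwind_stacktrace trace;
	u64 cookie;

	cookie = READ_ONCE(work->cookie);
	if (WARN_ON_ONCE(!cookie))
		return;

	/* ... unwind the user stack into @trace as before ... */

	work->func(work, &trace, cookie);

	/* Release the work for the next entry context. */
	WRITE_ONCE(work->cookie, 0);
}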
Peter Zijlstra Jan. 24, 2025, 11:58 a.m. UTC | #11
On Thu, Jan 23, 2025 at 03:34:45PM -0800, Josh Poimboeuf wrote:

> Here's what I have at the moment.  unwind_deferred_request() is vastly
> simplified thanks to your suggestions, no more NMI-specific muck.
> work->pending is replaced with work->cookie so the task work can use it
> in case it crossed the cookie-clearing boundary (return-to-user with
> IRQs disabled).

That looks a lot saner, thanks! I'll be waiting for the next cycle, not
sure I can track all the changes in me head.

Patch

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index fb2b27154fee..e9b8c145f480 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -363,6 +363,7 @@  static __always_inline void exit_to_user_mode(void)
 	lockdep_hardirqs_on_prepare();
 	instrumentation_end();
 
+	unwind_exit_to_user_mode();
 	user_enter_irqoff();
 	arch_exit_to_user_mode();
 	lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 741f409f0d1f..22269f4d2392 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -30,6 +30,11 @@  static __always_inline void unwind_enter_from_user_mode(void)
 	current->unwind_info.cookie = 0;
 }
 
+static __always_inline void unwind_exit_to_user_mode(void)
+{
+	current->unwind_info.cookie = 0;
+}
+
 #else /* !CONFIG_UNWIND_USER */
 
 static inline void unwind_task_init(struct task_struct *task) {}
@@ -40,6 +45,7 @@  static inline int unwind_deferred_request(struct task_struct *task, struct unwin
 static inline bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work) { return false; }
 
 static inline void unwind_enter_from_user_mode(void) {}
+static inline void unwind_exit_to_user_mode(void) {}
 
 #endif /* !CONFIG_UNWIND_USER */
 
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 6f71a06329fb..c535cca6534b 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -11,6 +11,7 @@  struct unwind_cache {
 struct unwind_task_info {
 	struct unwind_cache	cache;
 	u64			cookie;
+	u64			nmi_cookie;
 };
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index 2f38055cce48..939c94abaa50 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -29,27 +29,49 @@  static u64 ctx_to_cookie(u64 cpu, u64 ctx)
 
 /*
  * Read the task context cookie, first initializing it if this is the first
- * call to get_cookie() since the most recent entry from user.
+ * call to get_cookie() since the most recent entry from user.  This has to be
+ * done carefully to coordinate with unwind_deferred_request_nmi().
  */
 static u64 get_cookie(struct unwind_task_info *info)
 {
 	u64 ctx_ctr;
 	u64 cookie;
-	u64 cpu;
 
 	guard(irqsave)();
 
-	cookie = info->cookie;
+	cookie = READ_ONCE(info->cookie);
 	if (cookie)
 		return cookie;
 
+	ctx_ctr = __this_cpu_read(unwind_ctx_ctr);
 
-	cpu = raw_smp_processor_id();
-	ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
-	info->cookie = ctx_to_cookie(cpu, ctx_ctr);
+	/* Read ctx_ctr before info->nmi_cookie */
+	barrier();
+
+	cookie = READ_ONCE(info->nmi_cookie);
+	if (cookie) {
+		/*
+		 * This is the first call to get_cookie() since an NMI handler
+		 * first wrote it to info->nmi_cookie.  Sync it.
+		 */
+		WRITE_ONCE(info->cookie, cookie);
+		WRITE_ONCE(info->nmi_cookie, 0);
+		return cookie;
+	}
+
+	/*
+	 * Write info->cookie.  It's ok to race with an NMI here.  The value of
+	 * the cookie is based on ctx_ctr from before the NMI could have
+	 * incremented it.  The result will be the same even if cookie or
+	 * ctx_ctr end up getting written twice.
+	 */
+	cookie = ctx_to_cookie(raw_smp_processor_id(), ctx_ctr + 1);
+	WRITE_ONCE(info->cookie, cookie);
+	WRITE_ONCE(info->nmi_cookie, 0);
+	barrier();
+	__this_cpu_write(unwind_ctx_ctr, ctx_ctr + 1);
 
 	return cookie;
-
 }
 
 static void unwind_deferred_task_work(struct callback_head *head)
@@ -100,7 +122,52 @@  static void unwind_deferred_task_work(struct callback_head *head)
 
 do_callback:
 	work->func(work, &trace, cookie);
-	work->pending = 0;
+	WRITE_ONCE(work->pending, 0);
+}
+
+static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+	bool inited_cookie = false;
+	int ret;
+
+	*cookie = info->cookie;
+	if (!*cookie) {
+		/*
+		 * This is the first unwind request since the most recent entry
+		 * from user.  Initialize the task cookie.
+		 *
+		 * Don't write to info->cookie directly, otherwise it may get
+		 * cleared if the NMI occurred in the kernel during early entry
+		 * or late exit before the task work gets to run.  Instead, use
+		 * info->nmi_cookie which gets synced later by get_cookie().
+		 */
+		if (!info->nmi_cookie) {
+			u64 cpu = raw_smp_processor_id();
+			u64 ctx_ctr;
+
+			ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
+			info->nmi_cookie = ctx_to_cookie(cpu, ctx_ctr);
+
+			inited_cookie = true;
+		}
+
+		*cookie = info->nmi_cookie;
+
+	} else if (work->pending) {
+		return -EEXIST;
+	}
+
+	ret = task_work_add(current, &work->work, TWA_NMI_CURRENT);
+	if (ret) {
+		if (inited_cookie)
+			info->nmi_cookie = 0;
+		return ret;
+	}
+
+	work->pending = 1;
+
+	return 0;
 }
 
 /*
@@ -131,29 +198,36 @@  static void unwind_deferred_task_work(struct callback_head *head)
 int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 {
 	struct unwind_task_info *info = &current->unwind_info;
+	int pending;
 	int ret;
 
 	*cookie = 0;
 
-	if (WARN_ON_ONCE(in_nmi()))
-		return -EINVAL;
-
 	if (!current->mm || !user_mode(task_pt_regs(current)))
 		return -EINVAL;
 
+	if (in_nmi())
+		return unwind_deferred_request_nmi(work, cookie);
+
 	guard(irqsave)();
 
 	*cookie = get_cookie(info);
 
 	/* callback already pending? */
-	if (work->pending)
+	pending = READ_ONCE(work->pending);
+	if (pending)
 		return -EEXIST;
 
-	ret = task_work_add(current, &work->work, TWA_RESUME);
-	if (WARN_ON_ONCE(ret))
-		return ret;
+	/* Claim the work unless an NMI just now swooped in to do so. */
+	if (!try_cmpxchg(&work->pending, &pending, 1))
+		return -EEXIST;
 
-	work->pending = 1;
+	/* The work has been claimed, now schedule it. */
+	ret = task_work_add(current, &work->work, TWA_RESUME);
+	if (WARN_ON_ONCE(ret)) {
+		WRITE_ONCE(work->pending, 0);
+		return ret;
+	}
 
 	return 0;
 }
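
Finally, a hedged usage sketch (the tracer-side names are hypothetical) of how a requester, in NMI context or not, would consume the API as implemented by this patch: the cookie remains usable even when -EEXIST reports that the callback was already queued for this entry context:

static void tracer_request_unwind(struct unwind_work *uw)	/* hypothetical caller */
{
	u64 cookie;
	int ret;

	ret = unwind_deferred_request(uw, &cookie);
	if (ret && !cookie)
		return;		/* request failed, no callback will run */

	/*
	 * ret == 0: callback newly queued; otherwise it was already pending.
	 * Either way @cookie identifies this entry context and can be emitted
	 * now, to be matched later against the user stack trace passed to the
	 * deferred callback.
	 */
	emit_kernel_sample(cookie);	/* hypothetical tracer hook */
}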