
[v4,28/39] unwind_user/deferred: Add deferred unwinding interface

Message ID 6052e8487746603bdb29b65f4033e739092d9925.1737511963.git.jpoimboe@kernel.org (mailing list archive)
State New
Series unwind, perf: sframe user space unwinding

Commit Message

Josh Poimboeuf Jan. 22, 2025, 2:31 a.m. UTC
Add an interface for scheduling task work to unwind the user space stack
before returning to user space.  This solves several problems for its
callers:

  - Ensure the unwind happens in task context even if the caller may be
    running in NMI or interrupt context.

  - Avoid duplicate unwinds, whether called multiple times by the same
    caller or by different callers.

  - Create a "context cookie" which allows trace post-processing to
    correlate kernel unwinds/traces with the user unwind.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/entry-common.h          |   2 +
 include/linux/sched.h                 |   5 +
 include/linux/unwind_deferred.h       |  46 +++++++
 include/linux/unwind_deferred_types.h |  10 ++
 kernel/fork.c                         |   4 +
 kernel/unwind/Makefile                |   2 +-
 kernel/unwind/deferred.c              | 178 ++++++++++++++++++++++++++
 7 files changed, 246 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/unwind_deferred.h
 create mode 100644 include/linux/unwind_deferred_types.h
 create mode 100644 kernel/unwind/deferred.c
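
For orientation, here is a minimal usage sketch of the interface this patch
adds, based on the signatures in the diff below.  The tracer-side names
(my_unwind_cb, my_unwind_work, my_tracer_*) are made up for illustration;
this is not code from the series.

/*
 * Illustrative sketch only -- not part of this series.
 */
static void my_unwind_cb(struct unwind_work *work,
			 struct unwind_stacktrace *trace, u64 cookie)
{
	/*
	 * Runs in task work just before the task returns to user space.
	 * trace->entries[0..trace->nr) is the user stack; @cookie matches
	 * the value handed out by unwind_deferred_request() below.
	 */
}

static struct unwind_work my_unwind_work;

static void my_tracer_init(void)
{
	unwind_deferred_init(&my_unwind_work, my_unwind_cb);
}

/* From the tracer's sample path (task or IRQ context, not NMI): */
static void my_tracer_sample(void)
{
	u64 cookie;
	int ret;

	ret = unwind_deferred_request(&my_unwind_work, &cookie);
	if (ret && !cookie)
		return;		/* request failed, no callback pending */

	/*
	 * Tag the kernel-side sample with @cookie; the user stack arrives
	 * later via my_unwind_cb(), carrying the same cookie.
	 */
}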

Comments

Peter Zijlstra Jan. 22, 2025, 1:37 p.m. UTC | #1
On Tue, Jan 21, 2025 at 06:31:20PM -0800, Josh Poimboeuf wrote:
> +/*
> + * The context cookie is a unique identifier which allows post-processing to
> + * correlate kernel trace(s) with user unwinds.  The high 12 bits are the CPU

s/12/16/ ?

> + * id; the lower 48 bits are a per-CPU entry counter.
> + */
> +static u64 ctx_to_cookie(u64 cpu, u64 ctx)
> +{
> +	BUILD_BUG_ON(NR_CPUS > 65535);
> +	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);
> +}
Peter Zijlstra Jan. 22, 2025, 1:44 p.m. UTC | #2
On Tue, Jan 21, 2025 at 06:31:20PM -0800, Josh Poimboeuf wrote:

> +/* entry-from-user counter */
> +static DEFINE_PER_CPU(u64, unwind_ctx_ctr);

AFAICT from the below, this thing does *not* count entry-from-user. It
might count a subset, but I need to stare longer.

> +/*
> + * Read the task context cookie, first initializing it if this is the first
> + * call to get_cookie() since the most recent entry from user.
> + */
> +static u64 get_cookie(struct unwind_task_info *info)
> +{
> +	u64 ctx_ctr;
> +	u64 cookie;
> +	u64 cpu;
> +
> +	guard(irqsave)();
> +
> +	cookie = info->cookie;
> +	if (cookie)
> +		return cookie;
> +
> +
> +	cpu = raw_smp_processor_id();
> +	ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
> +	info->cookie = ctx_to_cookie(cpu, ctx_ctr);
> +
> +	return cookie;
> +
> +}
> +
> +static void unwind_deferred_task_work(struct callback_head *head)
> +{

> +	cookie = get_cookie(info);

> +}
> +

> +int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
> +{

> +	*cookie = get_cookie(info);

> +}
Peter Zijlstra Jan. 22, 2025, 2:16 p.m. UTC | #3
On Wed, Jan 22, 2025 at 02:37:30PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 21, 2025 at 06:31:20PM -0800, Josh Poimboeuf wrote:
> > +/*
> > + * The context cookie is a unique identifier which allows post-processing to
> > + * correlate kernel trace(s) with user unwinds.  The high 12 bits are the CPU
> 
> s/12/16/ ?
> 
> > + * id; the lower 48 bits are a per-CPU entry counter.
> > + */
> > +static u64 ctx_to_cookie(u64 cpu, u64 ctx)
> > +{
> > +	BUILD_BUG_ON(NR_CPUS > 65535);
> > +	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);
> > +}

Also, I have to note that 0 is a valid return value here, which will
give a ton of fun.
Mathieu Desnoyers Jan. 22, 2025, 8:13 p.m. UTC | #4
On 2025-01-21 21:31, Josh Poimboeuf wrote:
> Add an interface for scheduling task work to unwind the user space stack
> before returning to user space.  This solves several problems for its
> callers:
> 
>    - Ensure the unwind happens in task context even if the caller may be
>      running in NMI or interrupt context.
> 
>    - Avoid duplicate unwinds, whether called multiple times by the same
>      caller or by different callers.
> 
>    - Create a "context cookie" which allows trace post-processing to
>      correlate kernel unwinds/traces with the user unwind.
> 
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>   include/linux/entry-common.h          |   2 +
>   include/linux/sched.h                 |   5 +
>   include/linux/unwind_deferred.h       |  46 +++++++
>   include/linux/unwind_deferred_types.h |  10 ++
>   kernel/fork.c                         |   4 +
>   kernel/unwind/Makefile                |   2 +-
>   kernel/unwind/deferred.c              | 178 ++++++++++++++++++++++++++
>   7 files changed, 246 insertions(+), 1 deletion(-)
>   create mode 100644 include/linux/unwind_deferred.h
>   create mode 100644 include/linux/unwind_deferred_types.h
>   create mode 100644 kernel/unwind/deferred.c
> 
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index fc61d0205c97..fb2b27154fee 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -12,6 +12,7 @@
>   #include <linux/resume_user_mode.h>
>   #include <linux/tick.h>
>   #include <linux/kmsan.h>
> +#include <linux/unwind_deferred.h>
>   
>   #include <asm/entry-common.h>
>   
> @@ -111,6 +112,7 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
>   
>   	CT_WARN_ON(__ct_state() != CT_STATE_USER);
>   	user_exit_irqoff();
> +	unwind_enter_from_user_mode();
>   
>   	instrumentation_begin();
>   	kmsan_unpoison_entry_regs(regs);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 64934e0830af..042a95f4f6e6 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -46,6 +46,7 @@
>   #include <linux/rv.h>
>   #include <linux/livepatch_sched.h>
>   #include <linux/uidgid_types.h>
> +#include <linux/unwind_deferred_types.h>
>   #include <asm/kmap_size.h>
>   
>   /* task_struct member predeclarations (sorted alphabetically): */
> @@ -1603,6 +1604,10 @@ struct task_struct {
>   	struct user_event_mm		*user_event_mm;
>   #endif
>   
> +#ifdef CONFIG_UNWIND_USER
> +	struct unwind_task_info		unwind_info;
> +#endif
> +
>   	/*
>   	 * New fields for task_struct should be added above here, so that
>   	 * they are included in the randomized portion of task_struct.
> diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
> new file mode 100644
> index 000000000000..741f409f0d1f
> --- /dev/null
> +++ b/include/linux/unwind_deferred.h
> @@ -0,0 +1,46 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_UNWIND_USER_DEFERRED_H
> +#define _LINUX_UNWIND_USER_DEFERRED_H
> +
> +#include <linux/task_work.h>
> +#include <linux/unwind_user.h>
> +#include <linux/unwind_deferred_types.h>
> +
> +struct unwind_work;
> +
> +typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie);
> +
> +struct unwind_work {
> +	struct callback_head		work;
> +	unwind_callback_t		func;
> +	int				pending;
> +};

This is a lot of information to keep around per instance.

I'm not sure it would be OK to have a single unwind_work per perf-event
for perf. I suspect it may need to be per perf-event X per-task if a
perf-event can be associated to more than a single task (not sure ?).

For LTTng, we'd have to consider something similar because of multi-session
support. Either we'd have one unwind_work per-session X per-task, or we'd
need to multiplex this internally within LTTng-modules. None of this is
ideal in terms of memory footprint.

We should look at what part of this information can be made static/global
and what part is task-local, so we minimize the amount of redundant data
per-task (memory footprint).

AFAIU, most of that unwind_work information is global:

   - work,
   - func,

And could be registered dynamically by the tracer when it enables
tracing with an interest on stack walking.

At registration, we can allocate a descriptor ID (with a limited bounded
max number, configurable). This would associate a work+func to a given
ID, and keep track of this in a global table (indexed by ID).

I suspect that the only thing we really want to keep track of per-task
is the pending bit, and what is the ID of the unwind_work associated.
This could be kept, per-task, in either:

- a bitmap of pending bits, indexed by ID, or
- an array of pending IDs.

Unregistration of unwind_work could iterate on all tasks and clear the
pending bit or ID associated with the unregistered work, to make sure
we don't trigger unrelated work after a re-use.
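
Roughly, that would look something like this (illustrative sketch only; the
names and the maximum count are made up):

/* Global, registered once per tracer/session, indexed by ID: */
#define UNWIND_MAX_WORKS	32

struct unwind_work_desc {
	unwind_callback_t	func;
};
static struct unwind_work_desc unwind_works[UNWIND_MAX_WORKS];

/* Per-task: only which registered IDs have an unwind pending. */
struct unwind_task_info {
	unsigned long		*entries;
	u64			cookie;
	unsigned long		pending;	/* bitmap indexed by work ID */
};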


> +
> +#ifdef CONFIG_UNWIND_USER
> +
> +void unwind_task_init(struct task_struct *task);
> +void unwind_task_free(struct task_struct *task);
> +
> +void unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
> +int unwind_deferred_request(struct unwind_work *work, u64 *cookie);
> +bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work);
> +
> +static __always_inline void unwind_enter_from_user_mode(void)
> +{
> +	current->unwind_info.cookie = 0;
> +}
> +
> +#else /* !CONFIG_UNWIND_USER */
> +
> +static inline void unwind_task_init(struct task_struct *task) {}
> +static inline void unwind_task_free(struct task_struct *task) {}
> +
> +static inline void unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) {}
> +static inline int unwind_deferred_request(struct task_struct *task, struct unwind_work *work, u64 *cookie) { return -ENOSYS; }
> +static inline bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work) { return false; }
> +
> +static inline void unwind_enter_from_user_mode(void) {}
> +
> +#endif /* !CONFIG_UNWIND_USER */
> +
> +#endif /* _LINUX_UNWIND_USER_DEFERRED_H */
> diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
> new file mode 100644
> index 000000000000..9749824aea09
> --- /dev/null
> +++ b/include/linux/unwind_deferred_types.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
> +#define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
> +
> +struct unwind_task_info {
> +	unsigned long		*entries;
> +	u64			cookie;
> +};
> +
> +#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 88753f8bbdd3..c9a954af72a1 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -106,6 +106,7 @@
>   #include <linux/pidfs.h>
>   #include <linux/tick.h>
>   #include <linux/sframe.h>
> +#include <linux/unwind_deferred.h>
>   
>   #include <asm/pgalloc.h>
>   #include <linux/uaccess.h>
> @@ -973,6 +974,7 @@ void __put_task_struct(struct task_struct *tsk)
>   	WARN_ON(refcount_read(&tsk->usage));
>   	WARN_ON(tsk == current);
>   
> +	unwind_task_free(tsk);
>   	sched_ext_free(tsk);
>   	io_uring_free(tsk);
>   	cgroup_free(tsk);
> @@ -2370,6 +2372,8 @@ __latent_entropy struct task_struct *copy_process(
>   	p->bpf_ctx = NULL;
>   #endif
>   
> +	unwind_task_init(p);
> +
>   	/* Perform scheduler related setup. Assign this task to a CPU. */
>   	retval = sched_fork(clone_flags, p);
>   	if (retval)
> diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
> index f70380d7a6a6..146038165865 100644
> --- a/kernel/unwind/Makefile
> +++ b/kernel/unwind/Makefile
> @@ -1,2 +1,2 @@
> - obj-$(CONFIG_UNWIND_USER)		+= user.o
> + obj-$(CONFIG_UNWIND_USER)		+= user.o deferred.o
>    obj-$(CONFIG_HAVE_UNWIND_USER_SFRAME)	+= sframe.o
> diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> new file mode 100644
> index 000000000000..f0dbe4069247
> --- /dev/null
> +++ b/kernel/unwind/deferred.c
> @@ -0,0 +1,178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> +* Deferred user space unwinding
> +*/
> +#include <linux/kernel.h>
> +#include <linux/sched.h>
> +#include <linux/sched/task_stack.h>
> +#include <linux/sframe.h>
> +#include <linux/slab.h>
> +#include <linux/task_work.h>
> +#include <linux/mm.h>
> +#include <linux/unwind_deferred.h>
> +
> +#define UNWIND_MAX_ENTRIES 512
> +
> +/* entry-from-user counter */
> +static DEFINE_PER_CPU(u64, unwind_ctx_ctr);
> +
> +/*
> + * The context cookie is a unique identifier which allows post-processing to
> + * correlate kernel trace(s) with user unwinds.  The high 12 bits are the CPU
> + * id; the lower 48 bits are a per-CPU entry counter.
> + */
> +static u64 ctx_to_cookie(u64 cpu, u64 ctx)
> +{
> +	BUILD_BUG_ON(NR_CPUS > 65535);

2^12 = 4k, not 64k. Perhaps you mean to reserve 16 bits
for cpu numbers ?

> +	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);

Perhaps use ilog2(NR_CPUS) instead for the number of bits to use
rather than hard code 12 ?


> +}
> +
> +/*
> + * Read the task context cookie, first initializing it if this is the first
> + * call to get_cookie() since the most recent entry from user.
> + */
> +static u64 get_cookie(struct unwind_task_info *info)
> +{
> +	u64 ctx_ctr;
> +	u64 cookie;
> +	u64 cpu;
> +
> +	guard(irqsave)();
> +
> +	cookie = info->cookie;
> +	if (cookie)
> +		return cookie;
> +
> +
> +	cpu = raw_smp_processor_id();
> +	ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
> +	info->cookie = ctx_to_cookie(cpu, ctx_ctr);
> +
> +	return cookie;
> +
> +}
> +
> +static void unwind_deferred_task_work(struct callback_head *head)
> +{
> +	struct unwind_work *work = container_of(head, struct unwind_work, work);
> +	struct unwind_task_info *info = &current->unwind_info;
> +	struct unwind_stacktrace trace;
> +	u64 cookie;
> +
> +	if (WARN_ON_ONCE(!work->pending))
> +		return;
> +
> +	/*
> +	 * From here on out, the callback must always be called, even if it's
> +	 * just an empty trace.
> +	 */
> +
> +	cookie = get_cookie(info);
> +
> +	/* Check for task exit path. */
> +	if (!current->mm)
> +		goto do_callback;
> +
> +	if (!info->entries) {
> +		info->entries = kmalloc(UNWIND_MAX_ENTRIES * sizeof(long),
> +					GFP_KERNEL);
> +		if (!info->entries)
> +			goto do_callback;
> +	}
> +
> +	trace.entries = info->entries;
> +	trace.nr = 0;
> +	unwind_user(&trace, UNWIND_MAX_ENTRIES);
> +
> +do_callback:
> +	work->func(work, &trace, cookie);
> +	work->pending = 0;
> +}
> +
> +/*
> + * Schedule a user space unwind to be done in task work before exiting the
> + * kernel.
> + *
> + * The returned cookie output is a unique identifer for the current task entry

identifier

Thanks,

Mathieu

> + * context.  Its value will also be passed to the callback function.  It can be
> + * used to stitch kernel and user stack traces together in post-processing.
> + *
> + * It's valid to call this function multiple times for the same @work within
> + * the same task entry context.  Each call will return the same cookie.  If the
> + * callback is already pending, an error will be returned along with the
> + * cookie.  If the callback is not pending because it has already been
> + * previously called for the same entry context, it will be called again with
> + * the same stack trace and cookie.
> + *
> + * Thus there are three possible return scenarios:
> + *
> + *   * return != 0, *cookie == 0: the operation failed, no pending callback.
> + *
> + *   * return != 0, *cookie != 0: the callback is already pending. The cookie
> + *     can still be used to correlate with the pending callback.
> + *
> + *   * return == 0, *cookie != 0: the callback queued successfully.  The
> + *     callback is guaranteed to be called with the given cookie.
> + */
> +int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
> +{
> +	struct unwind_task_info *info = &current->unwind_info;
> +	int ret;
> +
> +	*cookie = 0;
> +
> +	if (WARN_ON_ONCE(in_nmi()))
> +		return -EINVAL;
> +
> +	if (!current->mm || !user_mode(task_pt_regs(current)))
> +		return -EINVAL;
> +
> +	guard(irqsave)();
> +
> +	*cookie = get_cookie(info);
> +
> +	/* callback already pending? */
> +	if (work->pending)
> +		return -EEXIST;
> +
> +	ret = task_work_add(current, &work->work, TWA_RESUME);
> +	if (WARN_ON_ONCE(ret))
> +		return ret;
> +
> +	work->pending = 1;
> +
> +	return 0;
> +}
> +
> +bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work)
> +{
> +	bool ret;
> +
> +	ret = task_work_cancel(task, &work->work);
> +	if (ret)
> +		work->pending = 0;
> +
> +	return ret;
> +}
> +
> +void unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
> +{
> +	memset(work, 0, sizeof(*work));
> +
> +	init_task_work(&work->work, unwind_deferred_task_work);
> +	work->func = func;
> +}
> +
> +void unwind_task_init(struct task_struct *task)
> +{
> +	struct unwind_task_info *info = &task->unwind_info;
> +
> +	memset(info, 0, sizeof(*info));
> +}
> +
> +void unwind_task_free(struct task_struct *task)
> +{
> +	struct unwind_task_info *info = &task->unwind_info;
> +
> +	kfree(info->entries);
> +}
Josh Poimboeuf Jan. 22, 2025, 9:38 p.m. UTC | #5
On Wed, Jan 22, 2025 at 02:37:30PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 21, 2025 at 06:31:20PM -0800, Josh Poimboeuf wrote:
> > +/*
> > + * The context cookie is a unique identifier which allows post-processing to
> > + * correlate kernel trace(s) with user unwinds.  The high 12 bits are the CPU
> 
> s/12/16/ ?

Oops.  Code is right, comment is wrong.

> 
> > + * id; the lower 48 bits are a per-CPU entry counter.
> > + */
> > +static u64 ctx_to_cookie(u64 cpu, u64 ctx)
> > +{
> > +	BUILD_BUG_ON(NR_CPUS > 65535);
> > +	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);
> > +}
Josh Poimboeuf Jan. 22, 2025, 9:52 p.m. UTC | #6
On Wed, Jan 22, 2025 at 02:44:20PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 21, 2025 at 06:31:20PM -0800, Josh Poimboeuf wrote:
> 
> > +/* entry-from-user counter */
> > +static DEFINE_PER_CPU(u64, unwind_ctx_ctr);
> 
> AFAICT from the below, this thing does *not* count entry-from-user. It
> might count a subset, but I need to stare longer.

Right, it's a subset.  Something like so:

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID.  In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u64, unwind_ctx_ctr);
Josh Poimboeuf Jan. 22, 2025, 10:51 p.m. UTC | #7
On Wed, Jan 22, 2025 at 03:16:16PM +0100, Peter Zijlstra wrote:
> On Wed, Jan 22, 2025 at 02:37:30PM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 21, 2025 at 06:31:20PM -0800, Josh Poimboeuf wrote:
> > > +/*
> > > + * The context cookie is a unique identifier which allows post-processing to
> > > + * correlate kernel trace(s) with user unwinds.  The high 12 bits are the CPU
> > 
> > s/12/16/ ?
> > 
> > > + * id; the lower 48 bits are a per-CPU entry counter.
> > > + */
> > > +static u64 ctx_to_cookie(u64 cpu, u64 ctx)
> > > +{
> > > +	BUILD_BUG_ON(NR_CPUS > 65535);
> > > +	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);
> > > +}
> 
> Also, I have to note that 0 is a valid return value here, which will
> give a ton of fun.

The ctx_ctr is always incremented before calling this, so 0 isn't a
valid cookie.
Josh Poimboeuf Jan. 23, 2025, 4:05 a.m. UTC | #8
On Wed, Jan 22, 2025 at 03:13:10PM -0500, Mathieu Desnoyers wrote:
> > +struct unwind_work {
> > +	struct callback_head		work;
> > +	unwind_callback_t		func;
> > +	int				pending;
> > +};
> 
> This is a lot of information to keep around per instance.
> 
> I'm not sure it would be OK to have a single unwind_work per perf-event
> for perf. I suspect it may need to be per perf-event X per-task if a
> perf-event can be associated to more than a single task (not sure ?).

For "perf record -g <command>", it seems to be one event per task.
Incidentally this is the mode where I did my perf testing :-/

But looking at it now, a global "perf record -g" appears to use one
event per CPU.  So if a task requests an unwind and then schedules out
before returning to user space, any subsequent tasks trying to unwind on
that CPU would be blocked until the original task returned to user.  So
yeah, that's definitely a problem.

Actually a per-CPU unwind_work descriptor could conceivably work if we were
able to unwind at schedule() time.

But Steve pointed out that wouldn't work so well if the task isn't in
RUNNING state.

However... would it be a horrible idea for 'next' to unwind 'prev' after
the context switch???

> For LTTng, we'd have to consider something similar because of multi-session
> support. Either we'd have one unwind_work per-session X per-task, or we'd
> need to multiplex this internally within LTTng-modules. None of this is
> ideal in terms of memory footprint.
> 
> We should look at what part of this information can be made static/global
> and what part is task-local, so we minimize the amount of redundant data
> per-task (memory footprint).
> 
> AFAIU, most of that unwind_work information is global:
> 
>   - work,
>   - func,
> 
> And could be registered dynamically by the tracer when it enables
> tracing with an interest on stack walking.
> 
> At registration, we can allocate a descriptor ID (with a limited bounded
> max number, configurable). This would associate a work+func to a given
> ID, and keep track of this in a global table (indexed by ID).
> 
> I suspect that the only thing we really want to keep track of per-task
> is the pending bit, and what is the ID of the unwind_work associated.
> This could be kept, per-task, in either:
> 
> - a bitmap of pending bits, indexed by ID, or
> - an array of pending IDs.

That's basically what I was doing before.  The per-task state also had:

  - 'struct callback_head work' for doing the task work.  A single work
    function was used to multiplex the callbacks, as opposed to the
    current patches where each descriptor gets its own separate
    task_work.

  - 'void *privs[UNWIND_MAX_CALLBACKS]' opaque data pointers.  Maybe
    some callbacks don't need that, but perf needed it for the 'event'
    pointer.  For 32 max callbacks that's 256 bytes per task.

  - 'u64 last_cookies[UNWIND_MAX_CALLBACKS]' to prevent a callback from
    getting called twice.  But actually that may have been overkill, it
    should be fine to call the callback again with the cached stack
    trace.  The tracer could instead have its own policy for how to
    handle dupes.

  - 'unsigned int work_pending' to designate whether the task_work is
    pending.  Also probably not necessary, the pending bits could serve
    the same purpose.

So it had more concurrency to deal with, to handle the extra per-task
state.
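
Pieced together from the list above, the old per-task layout would have
looked roughly like this (illustrative reconstruction, not the actual old
code; the entries/cookie fields are assumed from the current patch):

struct unwind_task_info {
	unsigned long		*entries;
	u64			cookie;
	struct callback_head	work;		/* single multiplexed task_work */
	void			*privs[UNWIND_MAX_CALLBACKS];	/* 32 max */
	u64			last_cookies[UNWIND_MAX_CALLBACKS];
	unsigned long		pending;	/* per-callback pending bits */
	unsigned int		work_pending;	/* task_work queued? */
};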

It also had a global array of callbacks, which used a mutex and SRCU to
coordinate between the register/unregister and the task work.

Another major issue was that it wasn't NMI-safe due to all the shared
state.  So a tracer in NMI would have to schedule an IRQ to call
unwind_deferred_request().  Not only is that a pain for the tracers,
it's problematic in other ways:

  - If the NMI occurred in schedule() with IRQs disabled, the IRQ would
    actually interrupt the 'next' task.  So the caller would have to
    stash a 'task' pointer for the IRQ handler to read and pass to
    unwind_deferred_request().  (similar to the task_work bug I found)

  - Thus the deferred unwind interface would need to handle requests
    from non-current, introducing a new set of concurrency issues.

  - Also, while a tracer in NMI can unwind the kernel stack and send
    that to a ring buffer immediately, it can't store the cookie along
    with it, so there lie more tracer headaches.

Once I changed the interface to get rid of the global nastiness, all
those problems went away.

Of course that now introduces the new problem that each tracer (or
tracing event) needs some kind of per-task state.  But otherwise this
new interface really simplifies things a *lot*.

Anyway, I don't have a good answer at the moment.  Will marinate on it.

Maybe we could do something like allocate the unwind_work (or some
equivalent) on demand at the time of unwind request using GFP_NOWAIT or
GFP_ATOMIC or some such, then free it during the task work?

> Unregistration of unwind_work could iterate on all tasks and clear the
> pending bit or ID associated with the unregistered work, to make sure
> we don't trigger unrelated work after a re-use.

What the old unregister code did was to remove it from the global
callbacks array (with the careful use of mutex+SRCU to coordinate with
the task work).  Then synchronize_srcu() before returning.

> > +/*
> > + * The context cookie is a unique identifier which allows post-processing to
> > + * correlate kernel trace(s) with user unwinds.  The high 12 bits are the CPU
> > + * id; the lower 48 bits are a per-CPU entry counter.
> > + */
> > +static u64 ctx_to_cookie(u64 cpu, u64 ctx)
> > +{
> > +	BUILD_BUG_ON(NR_CPUS > 65535);
> 
> 2^12 = 4k, not 64k. Perhaps you mean to reserve 16 bits
> for cpu numbers ?

Yeah, here the code is right but the comment is wrong.  It actually does
use 16 bits.

> > +	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);
> 
> Perhaps use ilog2(NR_CPUS) instead for the number of bits to use
> rather than hard code 12 ?

I'm thinking I'd rather keep it simple by hard-coding the # of bits, so
as to avoid any surprises caused by edge cases.
Peter Zijlstra Jan. 23, 2025, 8:17 a.m. UTC | #9
On Wed, Jan 22, 2025 at 02:51:27PM -0800, Josh Poimboeuf wrote:
> On Wed, Jan 22, 2025 at 03:16:16PM +0100, Peter Zijlstra wrote:
> > On Wed, Jan 22, 2025 at 02:37:30PM +0100, Peter Zijlstra wrote:
> > > On Tue, Jan 21, 2025 at 06:31:20PM -0800, Josh Poimboeuf wrote:
> > > > +/*
> > > > + * The context cookie is a unique identifier which allows post-processing to
> > > > + * correlate kernel trace(s) with user unwinds.  The high 12 bits are the CPU
> > > 
> > > s/12/16/ ?
> > > 
> > > > + * id; the lower 48 bits are a per-CPU entry counter.
> > > > + */
> > > > +static u64 ctx_to_cookie(u64 cpu, u64 ctx)
> > > > +{
> > > > +	BUILD_BUG_ON(NR_CPUS > 65535);
> > > > +	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);
> > > > +}
> > 
> > Also, I have to note that 0 is a valid return value here, which will
> > give a ton of fun.
> 
> The ctx_ctr is always incremented before calling this, so 0 isn't a
> valid cookie.

Right, so that's the problem. You're considering 0 an invalid cookie,
but ctx_to_cookie(0, 1<<48) will be a 0 cookie.

That thing *will* wrap.
Peter Zijlstra Jan. 23, 2025, 8:25 a.m. UTC | #10
On Wed, Jan 22, 2025 at 08:05:33PM -0800, Josh Poimboeuf wrote:

> However... would it be a horrible idea for 'next' to unwind 'prev' after
> the context switch???

The idea isn't terrible, but it will be all sorts of tricky.

The big immediate problem is that the CPU doing the context switch
loses control over prev at:

  __schedule()
    context_switch()
      finish_task_switch()
        finish_task()
	  smp_store_release(&prev->on_cpu, 0);

And this is before we drop rq->lock.

The instruction after that store another CPU is free to claim the task
and run with it. Notably, another CPU might already be spin waiting on
that state, trying to wake the task back up.

By the time we get to a schedulable context, @prev is completely out of
bounds.
Josh Poimboeuf Jan. 23, 2025, 6:30 p.m. UTC | #11
On Thu, Jan 23, 2025 at 09:17:18AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 22, 2025 at 02:51:27PM -0800, Josh Poimboeuf wrote:
> > On Wed, Jan 22, 2025 at 03:16:16PM +0100, Peter Zijlstra wrote:
> > The ctx_ctr is always incremented before calling this, so 0 isn't a
> > valid cookie.
> 
> Right, so that's the problem. You're considering 0 an invalid cookie,
> but ctx_to_cookie(0, 1<<48) will be a 0 cookie.
> 
> That thing *will* wrap.

Well, yes, after N years of sustained very high syscall activity on CPU
0, with stack tracing enabled, in which multiple tracer unwind requests
happen to occur in the same entry context where ctx_ctr wrapped, one of
the tracers might get an invalid cookie.

I can double-increment the counter when it's (1UL << 48) - 1.  Or use
some other bit for "cookie valid".
Josh Poimboeuf Jan. 23, 2025, 6:43 p.m. UTC | #12
On Thu, Jan 23, 2025 at 09:25:34AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 22, 2025 at 08:05:33PM -0800, Josh Poimboeuf wrote:
> 
> > However... would it be a horrible idea for 'next' to unwind 'prev' after
> > the context switch???
> 
> The idea isn't terrible, but it will be all sorts of tricky.
> 
> The big immediate problem is that the CPU doing the context switch
> looses control over prev at:
> 
>   __schedule()
>     context_switch()
>       finish_task_switch()
>         finish_task()
> 	  smp_store_release(&prev->on_cpu, 0);
> 
> And this is before we drop rq->lock.
> 
> The instruction after that store another CPU is free to claim the task
> and run with it. Notably, another CPU might already be spin waiting on
> that state, trying to wake the task back up.
> 
> By the time we get to a schedulable context, @prev is completely out of
> bounds.

Could unwind_deferred_request() call migrate_disable() or so?

How bad would it be to set some bit in @prev to prevent it from getting
rescheduled until the unwind from @next has been done?  Unfortunately
two tasks would be blocked on the unwind instead of one.

BTW, this might be useful for another reason.  In Steve's sframe meeting
yesterday there was some talk of BPF needing to unwind from
sched-switch, without having to wait indefinitely for @prev to get
rescheduled and return to user.
Peter Zijlstra Jan. 23, 2025, 9:58 p.m. UTC | #13
On Thu, Jan 23, 2025 at 10:30:56AM -0800, Josh Poimboeuf wrote:
> On Thu, Jan 23, 2025 at 09:17:18AM +0100, Peter Zijlstra wrote:
> > On Wed, Jan 22, 2025 at 02:51:27PM -0800, Josh Poimboeuf wrote:
> > > On Wed, Jan 22, 2025 at 03:16:16PM +0100, Peter Zijlstra wrote:
> > > The ctx_ctr is always incremented before calling this, so 0 isn't a
> > > valid cookie.
> > 
> > Right, so that's the problem. You're considering 0 an invalid cookie,
> > but ctx_to_cookie(0, 1<<48) will be a 0 cookie.
> > 
> > That thing *will* wrap.
> 
> Well, yes, after N years of sustained very high syscall activity on CPU
> 0, with stack tracing enabled, in which multiple tracer unwind requests
> happen to occur in the same entry context where ctx_ctr wrapped, one of
> the tracers might get an invalid cookie.
> 
> I can double-increment the counter when it's (1UL << 48) - 1.  Or use
> some other bit for "cookie valid".

Right, steal one bit from the counter and make it always 1. 47-bit
wraparound should be fine.
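
As a sketch of that suggestion (assuming the same bit layout as the patch;
not code posted in this thread):

static u64 ctx_to_cookie(u64 cpu, u64 ctx)
{
	BUILD_BUG_ON(NR_CPUS > 65535);

	/*
	 * Bit 47 is always set, so even a fully wrapped counter can never
	 * produce a zero cookie; the low 47 bits wrap freely.
	 */
	return (ctx & ((1UL << 47) - 1)) | (1UL << 47) | (cpu << 48);
}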
Peter Zijlstra Jan. 23, 2025, 10:13 p.m. UTC | #14
On Thu, Jan 23, 2025 at 10:43:05AM -0800, Josh Poimboeuf wrote:
> On Thu, Jan 23, 2025 at 09:25:34AM +0100, Peter Zijlstra wrote:
> > On Wed, Jan 22, 2025 at 08:05:33PM -0800, Josh Poimboeuf wrote:
> > 
> > > However... would it be a horrible idea for 'next' to unwind 'prev' after
> > > the context switch???
> > 
> > The idea isn't terrible, but it will be all sorts of tricky.
> > 
> > The big immediate problem is that the CPU doing the context switch
> > loses control over prev at:
> > 
> >   __schedule()
> >     context_switch()
> >       finish_task_switch()
> >         finish_task()
> > 	  smp_store_release(&prev->on_cpu, 0);
> > 
> > And this is before we drop rq->lock.
> > 
> > The instruction after that store another CPU is free to claim the task
> > and run with it. Notably, another CPU might already be spin waiting on
> > that state, trying to wake the task back up.
> > 
> > By the time we get to a schedulable context, @prev is completely out of
> > bounds.
> 
> Could unwind_deferred_request() call migrate_disable() or so?

That's pretty vile... and might cause performance issues. You really
don't want things to magically start behaving differently just because
you're tracing.

> How bad would it be to set some bit in @prev to prevent it from getting
> rescheduled until the unwind from @next has been done?  Unfortunately
> two tasks would be blocked on the unwind instead of one.

Yeah, not going to happen. Those paths are complicated enough as is.

> BTW, this might be useful for another reason.  In Steve's sframe meeting
> yesterday there was some talk of BPF needing to unwind from
> sched-switch, without having to wait indefinitely for @prev to get
> rescheduled and return to user.

-EPONIES, you cannot take faults from the middle of schedule(). They can
always use the best effort FP unwind we have today.
Jens Remus Jan. 24, 2025, 4:35 p.m. UTC | #15
On 22.01.2025 03:31, Josh Poimboeuf wrote:

> diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h

> +#ifdef CONFIG_UNWIND_USER
> +
> +void unwind_task_init(struct task_struct *task);
> +void unwind_task_free(struct task_struct *task);
> +
> +void unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
> +int unwind_deferred_request(struct unwind_work *work, u64 *cookie);
> +bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work);
> +
> +static __always_inline void unwind_enter_from_user_mode(void)
> +{
> +	current->unwind_info.cookie = 0;
> +}
> +
> +#else /* !CONFIG_UNWIND_USER */
> +
> +static inline void unwind_task_init(struct task_struct *task) {}
> +static inline void unwind_task_free(struct task_struct *task) {}
> +
> +static inline void unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) {}
> +static inline int unwind_deferred_request(struct task_struct *task, struct unwind_work *work, u64 *cookie) { return -ENOSYS; }

static inline int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { return -ENOSYS; }

Otherwise this does not compile on architectures that do not enable
UNWIND_USER.

> +static inline bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work) { return false; }
> +
> +static inline void unwind_enter_from_user_mode(void) {}
> +
> +#endif /* !CONFIG_UNWIND_USER */

Regards,
Jens
Josh Poimboeuf Jan. 24, 2025, 4:57 p.m. UTC | #16
On Fri, Jan 24, 2025 at 05:35:37PM +0100, Jens Remus wrote:
> On 22.01.2025 03:31, Josh Poimboeuf wrote:
> > +static inline void unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) {}
> > +static inline int unwind_deferred_request(struct task_struct *task, struct unwind_work *work, u64 *cookie) { return -ENOSYS; }
> 
> static inline int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { return -ENOSYS; }
> 
> Otherwise this does not compile on architectures that do not enable
> UNWIND_USER.

Yeah, bots have reported that also, thanks.
Steven Rostedt Jan. 24, 2025, 9:58 p.m. UTC | #17
On Thu, 23 Jan 2025 23:13:26 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> -EPONIES, you cannot take faults from the middle of schedule(). They can
> always use the best effort FP unwind we have today.

Agreed.

Now the only thing I could think of is a flag gets set where the task comes
out of the scheduler and then does the stack trace. It doesn't need to do
the stack trace before it schedules. As it did just schedule, wherever it
scheduled must have been in a schedulable context.

That is, kind of like the task_work flag for entering user space and
exiting the kernel, could we have a sched_work flag to run after being
scheduled back (exiting schedule()). Since the task has been picked to run,
it will not cause latency for other tasks. The work will be done in its
context. This is no different to the tasks accounting than if it does this
going back to user space. Heck, it would only need to do this once if it
didn't go back to user space, as the user space stack would be the same.
That is, if it gets scheduled multiple times, this would only happen on the
first instance until it leaves the kernel.


	[ trigger stack trace - set sched_work ]

	schedule() {
		context_switch() -> CPU runs some other task
				 <- gets scheduled back onto the CPU
		[..]
		/* preemption enabled ... */
		if (sched_work) {
			do stack trace() // can schedule here but
					 // calls a schedule function that does not
					 // do sched_work to prevent recursion
		}
	}

Could something like this work?

-- Steve
Josh Poimboeuf Jan. 24, 2025, 10:46 p.m. UTC | #18
On Fri, Jan 24, 2025 at 04:58:03PM -0500, Steven Rostedt wrote:
> On Thu, 23 Jan 2025 23:13:26 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > -EPONIES, you cannot take faults from the middle of schedule(). They can
> > always use the best effort FP unwind we have today.
> 
> Agreed.
> 
> Now the only thing I could think of is a flag gets set where the task comes
> out of the scheduler and then does the stack trace. It doesn't need to do
> the stack trace before it schedules. As it did just schedule, wherever it
> scheduled must have been in a schedulable context.
> 
> That is, kind of like the task_work flag for entering user space and
> exiting the kernel, could we have a sched_work flag to run after after being
> scheduled back (exiting schedule()). Since the task has been picked to run,
> it will not cause latency for other tasks. The work will be done in its
> context. This is no different to the tasks accounting than if it does this
> going back to user space. Heck, it would only need to do this once if it
> didn't go back to user space, as the user space stack would be the same.
> That is, if it gets scheduled multiple times, this would only happen on the
> first instance until it leaves the kernel.
> 
> 
> 	[ trigger stack trace - set sched_work ]
> 
> 	schedule() {
> 		context_switch() -> CPU runs some other task
> 				 <- gets scheduled back onto the CPU
> 		[..]
> 		/* preemption enabled ... */
> 		if (sched_work) {
> 			do stack trace() // can schedule here but
> 					 // calls a schedule function that does not
> 					 // do sched_work to prevent recursion
> 		}
> 	}
> 
> Could something like this work?

Yeah, this is basically a more fleshed out version of what I was trying
to propose.

One additional wrinkle is that if @prev wakes up on another CPU while
@next is unwinding it, the unwind goes haywire.  So that would maybe
need to be prevented.
Josh Poimboeuf Jan. 24, 2025, 10:50 p.m. UTC | #19
On Fri, Jan 24, 2025 at 02:46:48PM -0800, Josh Poimboeuf wrote:
> On Fri, Jan 24, 2025 at 04:58:03PM -0500, Steven Rostedt wrote:
> > Now the only thing I could think of is a flag gets set where the task comes
> > out of the scheduler and then does the stack trace. It doesn't need to do
> > the stack trace before it schedules. As it did just schedule, wherever it
> > scheduled must have been in a schedulable context.
> > 
> > That is, kind of like the task_work flag for entering user space and
> > exiting the kernel, could we have a sched_work flag to run after being
> > scheduled back (exiting schedule()). Since the task has been picked to run,
> > it will not cause latency for other tasks. The work will be done in its
> > context. This is no different to the tasks accounting than if it does this
> > going back to user space. Heck, it would only need to do this once if it
> > didn't go back to user space, as the user space stack would be the same.
> > That is, if it gets scheduled multiple times, this would only happen on the
> > first instance until it leaves the kernel.
> > 
> > 
> > 	[ trigger stack trace - set sched_work ]
> > 
> > 	schedule() {
> > 		context_switch() -> CPU runs some other task
> > 				 <- gets scheduled back onto the CPU
> > 		[..]
> > 		/* preemption enabled ... */
> > 		if (sched_work) {
> > 			do stack trace() // can schedule here but
> > 					 // calls a schedule function that does not
> > 					 // do sched_work to prevent recursion
> > 		}
> > 	}
> > 
> > Could something like this work?
> 
> Yeah, this is basically a more fleshed out version of what I was trying
> to propose.
> 
> One additional wrinkle is that if @prev wakes up on another CPU while
> @next is unwinding it, the unwind goes haywire.  So that would maybe
> need to be prevented.

Hm, reading this again I'm wondering if you're actually proposing that
the unwind happens on @prev after it gets rescheduled sometime in the
future?  Does that actually solve the issue?  What if it doesn't get
rescheduled within a reasonable amount of time?
Steven Rostedt Jan. 24, 2025, 11:57 p.m. UTC | #20
On Fri, 24 Jan 2025 14:50:11 -0800
Josh Poimboeuf <jpoimboe@kernel.org> wrote:

> Hm, reading this again I'm wondering if you're actually proposing that
> the unwind happens on @prev after it gets rescheduled sometime in the
> future?  Does that actually solve the issue?  What if it doesn't get
> rescheduled within a reasonable amount of time?

Correct, it would be prev that would be doing the unwinding and not next.
But when prev is scheduled back onto the CPU. That way it's only blocking
itself.

The use case that people were doing this with was measuring the time a task
is off the CPU. It can't get that time until the task schedules back
anyway. What the complaint was about was that it could be a very long
system call, with lots of sleeps and they couldn't do the processing.

I can go back and ask, but I'm pretty sure doing the unwind when a task
comes back to the CPU would be sufficient.

-- Steve
Steven Rostedt Jan. 30, 2025, 8:21 p.m. UTC | #21
On Fri, 24 Jan 2025 18:57:44 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 24 Jan 2025 14:50:11 -0800
> Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> 
> > Hm, reading this again I'm wondering if you're actually proposing that
> > the unwind happens on @prev after it gets rescheduled sometime in the
> > future?  Does that actually solve the issue?  What if it doesn't get
> > rescheduled within a reasonable amount of time?  
> 
> Correct, it would be prev that would be doing the unwinding and not next.
> But when prev is scheduled back onto the CPU. That way it's only blocking
> itself.
> 
> The use case that people were doing this with was measuring the time a task
> is off the CPU. It can't get that time until the task schedules back
> anyway. What the complaint was about was that it could be a very long
> system call, with lots of sleeps and they couldn't do the processing.
> 
> I can go back and ask, but I'm pretty sure doing the unwind when a task
> comes back to the CPU would be sufficient.
> 

Coming back to this. It would be fine if we could do the back trace when
we come back from the scheduler, so it should not be an issue if the task
even has to schedule again to fault in the sframe information.

I was also wondering: what if the unwinder doesn't keep track of who requested
the back trace, just that someone did? Then it would just take a single
flag in the task struct to do the back trace. Return the "cookie" to the
tracer that requested the back trace, and when you do the back trace, just
call all callbacks with that cookie. Let the tracer decide if it wants to
record the back trace or ignore it based on the cookie.

That is, each tracer would need to keep track of the cookies that it cares
about: since other tracers may ask for stack traces on tasks that this tracer
doesn't care about, it needs to handle being called for stack traces it isn't
interested in. That said, if you want to trace all tasks, you can just ignore
the cookies and record the traces.
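
Sketching that (illustrative only, not proposed code), the per-task state
would shrink to something like:

struct unwind_task_info {
	unsigned long	*entries;
	u64		cookie;
	int		pending;	/* some tracer wants a back trace */
};

and the task work would walk a global list of registered callbacks, handing
each one the same trace and cookie and leaving the filtering to the tracers.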

-- Steve

Patch

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index fc61d0205c97..fb2b27154fee 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -12,6 +12,7 @@ 
 #include <linux/resume_user_mode.h>
 #include <linux/tick.h>
 #include <linux/kmsan.h>
+#include <linux/unwind_deferred.h>
 
 #include <asm/entry-common.h>
 
@@ -111,6 +112,7 @@  static __always_inline void enter_from_user_mode(struct pt_regs *regs)
 
 	CT_WARN_ON(__ct_state() != CT_STATE_USER);
 	user_exit_irqoff();
+	unwind_enter_from_user_mode();
 
 	instrumentation_begin();
 	kmsan_unpoison_entry_regs(regs);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 64934e0830af..042a95f4f6e6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,6 +46,7 @@ 
 #include <linux/rv.h>
 #include <linux/livepatch_sched.h>
 #include <linux/uidgid_types.h>
+#include <linux/unwind_deferred_types.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
@@ -1603,6 +1604,10 @@  struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_UNWIND_USER
+	struct unwind_task_info		unwind_info;
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
new file mode 100644
index 000000000000..741f409f0d1f
--- /dev/null
+++ b/include/linux/unwind_deferred.h
@@ -0,0 +1,46 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_H
+#define _LINUX_UNWIND_USER_DEFERRED_H
+
+#include <linux/task_work.h>
+#include <linux/unwind_user.h>
+#include <linux/unwind_deferred_types.h>
+
+struct unwind_work;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie);
+
+struct unwind_work {
+	struct callback_head		work;
+	unwind_callback_t		func;
+	int				pending;
+};
+
+#ifdef CONFIG_UNWIND_USER
+
+void unwind_task_init(struct task_struct *task);
+void unwind_task_free(struct task_struct *task);
+
+void unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
+int unwind_deferred_request(struct unwind_work *work, u64 *cookie);
+bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work);
+
+static __always_inline void unwind_enter_from_user_mode(void)
+{
+	current->unwind_info.cookie = 0;
+}
+
+#else /* !CONFIG_UNWIND_USER */
+
+static inline void unwind_task_init(struct task_struct *task) {}
+static inline void unwind_task_free(struct task_struct *task) {}
+
+static inline void unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) {}
+static inline int unwind_deferred_request(struct task_struct *task, struct unwind_work *work, u64 *cookie) { return -ENOSYS; }
+static inline bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work) { return false; }
+
+static inline void unwind_enter_from_user_mode(void) {}
+
+#endif /* !CONFIG_UNWIND_USER */
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_H */
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
new file mode 100644
index 000000000000..9749824aea09
--- /dev/null
+++ b/include/linux/unwind_deferred_types.h
@@ -0,0 +1,10 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+#define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+
+struct unwind_task_info {
+	unsigned long		*entries;
+	u64			cookie;
+};
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 88753f8bbdd3..c9a954af72a1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -106,6 +106,7 @@ 
 #include <linux/pidfs.h>
 #include <linux/tick.h>
 #include <linux/sframe.h>
+#include <linux/unwind_deferred.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -973,6 +974,7 @@  void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(refcount_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	unwind_task_free(tsk);
 	sched_ext_free(tsk);
 	io_uring_free(tsk);
 	cgroup_free(tsk);
@@ -2370,6 +2372,8 @@  __latent_entropy struct task_struct *copy_process(
 	p->bpf_ctx = NULL;
 #endif
 
+	unwind_task_init(p);
+
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
 	if (retval)
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index f70380d7a6a6..146038165865 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1,2 +1,2 @@ 
- obj-$(CONFIG_UNWIND_USER)		+= user.o
+ obj-$(CONFIG_UNWIND_USER)		+= user.o deferred.o
  obj-$(CONFIG_HAVE_UNWIND_USER_SFRAME)	+= sframe.o
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
new file mode 100644
index 000000000000..f0dbe4069247
--- /dev/null
+++ b/kernel/unwind/deferred.c
@@ -0,0 +1,178 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+* Deferred user space unwinding
+*/
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sframe.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+#include <linux/mm.h>
+#include <linux/unwind_deferred.h>
+
+#define UNWIND_MAX_ENTRIES 512
+
+/* entry-from-user counter */
+static DEFINE_PER_CPU(u64, unwind_ctx_ctr);
+
+/*
+ * The context cookie is a unique identifier which allows post-processing to
+ * correlate kernel trace(s) with user unwinds.  The high 12 bits are the CPU
+ * id; the lower 48 bits are a per-CPU entry counter.
+ */
+static u64 ctx_to_cookie(u64 cpu, u64 ctx)
+{
+	BUILD_BUG_ON(NR_CPUS > 65535);
+	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);
+}
+
+/*
+ * Read the task context cookie, first initializing it if this is the first
+ * call to get_cookie() since the most recent entry from user.
+ */
+static u64 get_cookie(struct unwind_task_info *info)
+{
+	u64 ctx_ctr;
+	u64 cookie;
+	u64 cpu;
+
+	guard(irqsave)();
+
+	cookie = info->cookie;
+	if (cookie)
+		return cookie;
+
+
+	cpu = raw_smp_processor_id();
+	ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
+	info->cookie = ctx_to_cookie(cpu, ctx_ctr);
+
+	return cookie;
+
+}
+
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+	struct unwind_work *work = container_of(head, struct unwind_work, work);
+	struct unwind_task_info *info = &current->unwind_info;
+	struct unwind_stacktrace trace;
+	u64 cookie;
+
+	if (WARN_ON_ONCE(!work->pending))
+		return;
+
+	/*
+	 * From here on out, the callback must always be called, even if it's
+	 * just an empty trace.
+	 */
+
+	cookie = get_cookie(info);
+
+	/* Check for task exit path. */
+	if (!current->mm)
+		goto do_callback;
+
+	if (!info->entries) {
+		info->entries = kmalloc(UNWIND_MAX_ENTRIES * sizeof(long),
+					GFP_KERNEL);
+		if (!info->entries)
+			goto do_callback;
+	}
+
+	trace.entries = info->entries;
+	trace.nr = 0;
+	unwind_user(&trace, UNWIND_MAX_ENTRIES);
+
+do_callback:
+	work->func(work, &trace, cookie);
+	work->pending = 0;
+}
+
+/*
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The returned cookie output is a unique identifer for the current task entry
+ * context.  Its value will also be passed to the callback function.  It can be
+ * used to stitch kernel and user stack traces together in post-processing.
+ *
+ * It's valid to call this function multiple times for the same @work within
+ * the same task entry context.  Each call will return the same cookie.  If the
+ * callback is already pending, an error will be returned along with the
+ * cookie.  If the callback is not pending because it has already been
+ * previously called for the same entry context, it will be called again with
+ * the same stack trace and cookie.
+ *
+ * Thus there are three possible return scenarios:
+ *
+ *   * return != 0, *cookie == 0: the operation failed, no pending callback.
+ *
+ *   * return != 0, *cookie != 0: the callback is already pending. The cookie
+ *     can still be used to correlate with the pending callback.
+ *
+ *   * return == 0, *cookie != 0: the callback queued successfully.  The
+ *     callback is guaranteed to be called with the given cookie.
+ */
+int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+	int ret;
+
+	*cookie = 0;
+
+	if (WARN_ON_ONCE(in_nmi()))
+		return -EINVAL;
+
+	if (!current->mm || !user_mode(task_pt_regs(current)))
+		return -EINVAL;
+
+	guard(irqsave)();
+
+	*cookie = get_cookie(info);
+
+	/* callback already pending? */
+	if (work->pending)
+		return -EEXIST;
+
+	ret = task_work_add(current, &work->work, TWA_RESUME);
+	if (WARN_ON_ONCE(ret))
+		return ret;
+
+	work->pending = 1;
+
+	return 0;
+}
+
+bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work)
+{
+	bool ret;
+
+	ret = task_work_cancel(task, &work->work);
+	if (ret)
+		work->pending = 0;
+
+	return ret;
+}
+
+void unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+	memset(work, 0, sizeof(*work));
+
+	init_task_work(&work->work, unwind_deferred_task_work);
+	work->func = func;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_info;
+
+	memset(info, 0, sizeof(*info));
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_info;
+
+	kfree(info->entries);
+}