Message ID | 20220120160822.914418096@infradead.org
---|---
State | New
Series | sched: User Managed Concurrency Groups
On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote: > +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo) > +{ > + struct task_struct *tsk = current; > + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); > + bool worker = tsk->flags & PF_UMCG_WORKER; > + int ret; > + > + if (!self || flags) > + return -EINVAL; > + > + if (worker) { > + tsk->flags &= ~PF_UMCG_WORKER; > + if (timo) > + return -ERANGE; > + } > + > + /* see umcg_sys_{enter,exit}() syscall exceptions */ > + ret = umcg_pin_pages(); > + if (ret) > + goto unblock; > + > + /* > + * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE. > + */ > + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE); > + if (ret) > + goto unpin; > + > + ret = umcg_wake_next(tsk, self); > + if (ret) > + goto unpin; > + > + if (worker) { > + /* > + * If this fails it is possible ::next_tid is already running > + * while this task is not going to block. This violates our > + * constraints. > + * > + * That said, pretty much the only way to make this fail is by > + * force munmap()'ing things. In which case one is most welcome > + * to the pieces. > + */ > + ret = umcg_enqueue_and_wake(tsk); > + if (ret) > + goto unpin; > + } > + > + umcg_unpin_pages(); > + > + ret = umcg_wait(timo); > + switch (ret) { > + case 0: /* all done */ > + case -EINTR: /* umcg_notify_resume() will continue the wait */ So I was playing with the whole worker timeout thing last night and realized this is broken. If we get a signal while we have a timeout, the timeout gets lost. I think the easiest solution is to have umcg_notify_resume() also resume the timeout, but the first pass of that was yuck, so I need to try again. Related, by moving the whole enqueue-and-wake thing into the timeout, we get more 'fun' failure cases :-( Oh well.. > + ret = 0; > + break; > + > + default: > + goto unblock; > + } > +out: > + if (worker) > + tsk->flags |= PF_UMCG_WORKER; > + return ret; > + > +unpin: > + umcg_unpin_pages(); > +unblock: > + umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING); > + goto out; > +}
On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote: > +/* > + * Pinning a page inhibits rmap based unmap for Anon pages. Doing a load > + * through the user mapping ensures the user mapping exists. > + */ > +#define umcg_pin_and_load(_self, _pagep, _member) \ > +({ \ > + __label__ __out; \ > + int __ret = -EFAULT; \ > + \ > + if (pin_user_pages_fast((unsigned long)(_self), 1, 0, &(_pagep)) != 1) \ > + goto __out; \ > + \ > + if (!PageAnon(_pagep) || \ > + get_user(_member, &(_self)->_member)) { \ > + unpin_user_page(_pagep); \ > + goto __out; \ > + } \ > + __ret = 0; \ > +__out: __ret; \ > +}) Per the thread with David, this wants changing like so. --- --- a/kernel/sched/umcg.c +++ b/kernel/sched/umcg.c @@ -34,25 +34,26 @@ static struct task_struct *umcg_get_task } /* - * Pinning a page inhibits rmap based unmap for Anon pages. Doing a load - * through the user mapping ensures the user mapping exists. + * Pinning a page inhibits rmap based unmap for Anon pages. Doing a store + * through the user mapping ensures the user mapping exists and is writable. */ -#define umcg_pin_and_load(_self, _pagep, _member) \ -({ \ - __label__ __out; \ - int __ret = -EFAULT; \ - \ - if (pin_user_pages_fast((unsigned long)(_self), 1, 0, &(_pagep)) != 1) \ - goto __out; \ - \ - if (!PageAnon(_pagep) || \ - get_user(_member, &(_self)->_member)) { \ - unpin_user_page(_pagep); \ - goto __out; \ - } \ - __ret = 0; \ -__out: __ret; \ -}) +static int umcg_pin_page(struct umcg_task __user *self, struct page **pagep) +{ + int ret = -EFAULT; + + if (pin_user_pages_fast((unsigned long)self, 1, FOLL_WRITE, pagep) != 1) + goto out; + + if (!PageAnon(*pagep) || + put_user(0ULL, &self->__zero[0])) { + unpin_user_page(*pagep); + goto out; + } + + ret = 0; +out: + return ret; +} /** * umcg_pin_pages: pin pages containing struct umcg_task of @@ -72,10 +73,13 @@ static int umcg_pin_pages(void) tsk->umcg_server)) return -EBUSY; - ret = umcg_pin_and_load(self, tsk->umcg_page, server_tid); + ret = umcg_pin_page(self, &tsk->umcg_page); if (ret) goto clear_self; + if (get_user(server_tid, &self->server_tid)) + goto unpin_self; + ret = -ESRCH; server = umcg_get_task(server_tid); if (!server) @@ -83,7 +87,7 @@ static int umcg_pin_pages(void) /* must cache due to possible concurrent change */ tsk->umcg_server_task = READ_ONCE(server->umcg_task); - ret = umcg_pin_and_load(tsk->umcg_server_task, tsk->umcg_server_page, server_tid); + ret = umcg_pin_page(tsk->umcg_server_task, &tsk->umcg_server_page); if (ret) goto clear_server; @@ -414,7 +418,7 @@ static int umcg_wait(u64 timo) break; } - ret = umcg_pin_and_load(self, page, state); + ret = umcg_pin_page(self, &page); if (ret) { page = NULL; break;
On Fri, Jan 21, 2022 at 12:47:58PM +0100, Peter Zijlstra wrote: > On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote: > > > +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo) > > +{ > > + struct task_struct *tsk = current; > > + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); > > + bool worker = tsk->flags & PF_UMCG_WORKER; > > + int ret; > > + > > + if (!self || flags) > > + return -EINVAL; > > + > > + if (worker) { > > + tsk->flags &= ~PF_UMCG_WORKER; > > + if (timo) > > + return -ERANGE; > > + } > > + > > + /* see umcg_sys_{enter,exit}() syscall exceptions */ > > + ret = umcg_pin_pages(); > > + if (ret) > > + goto unblock; > > + > > + /* > > + * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE. > > + */ > > + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE); > > + if (ret) > > + goto unpin; > > + > > + ret = umcg_wake_next(tsk, self); > > + if (ret) > > + goto unpin; > > + > > + if (worker) { > > + /* > > + * If this fails it is possible ::next_tid is already running > > + * while this task is not going to block. This violates our > > + * constraints. > > + * > > + * That said, pretty much the only way to make this fail is by > > + * force munmap()'ing things. In which case one is most welcome > > + * to the pieces. > > + */ > > + ret = umcg_enqueue_and_wake(tsk); > > + if (ret) > > + goto unpin; > > + } > > + > > + umcg_unpin_pages(); > > + > > + ret = umcg_wait(timo); > > + switch (ret) { > > + case 0: /* all done */ > > + case -EINTR: /* umcg_notify_resume() will continue the wait */ > > So I was playing with the whole worker timeout thing last night and > realized this is broken. If we get a signal while we have a timeout, the > timeout gets lost. > > I think the easiest solution is to have umcg_notify_resume() also resume > the timeout, but the first pass of that was yuck, so I need to try > again. Something like this, still yuck though. Also still need to write me a test for this. --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1300,12 +1300,14 @@ struct task_struct { clockid_t umcg_clock; struct umcg_task __user *umcg_task; - /* setup by umcg_pin_enter() */ + /* setup by umcg_pin_pages() */ struct page *umcg_page; struct task_struct *umcg_server; struct umcg_task __user *umcg_server_task; struct page *umcg_server_page; + + u64 umcg_timeout; #endif struct tlbflush_unmap_batch tlb_ubc; --- a/kernel/sched/umcg.c +++ b/kernel/sched/umcg.c @@ -232,6 +232,8 @@ static int umcg_update_state(struct task /* Called from syscall enter path and exceptions that can schedule */ void umcg_sys_enter(struct pt_regs *regs, long syscall) { + current->umcg_timeout = 0; + /* avoid recursion vs our own syscalls */ if (syscall == __NR_umcg_wait || syscall == __NR_umcg_ctl) @@ -519,6 +521,7 @@ void umcg_notify_resume(struct pt_regs * struct umcg_task __user *self = tsk->umcg_task; bool worker = tsk->flags & PF_UMCG_WORKER; u32 state; + int ret; /* avoid recursion vs schedule() */ if (worker) @@ -554,12 +557,17 @@ void umcg_notify_resume(struct pt_regs * umcg_unpin_pages(); } - switch (umcg_wait(0)) { + ret = umcg_wait(tsk->umcg_timeout); + switch (ret) { case 0: case -EINTR: /* we will resume the wait after the signal */ break; + case -ETIMEDOUT: + regs_set_return_value(regs, ret); + break; + default: UMCG_DIE("wait"); } @@ -759,6 +767,7 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u switch (ret) { case 0: /* all done */ case -EINTR: /* umcg_notify_resume() will continue the wait */ + tsk->umcg_timeout = timo; ret = 0; break;
On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote: > User Managed Concurrency Groups is an M:N threading toolkit that allows > constructing user space schedulers designed to efficiently manage > heterogeneous in-process workloads while maintaining high CPU > utilization (95%+). > > XXX moar changelog explaining how this is moar awesome than > traditional user-space threading. Awaiting a commit message that I can parse, I'm just looking at the entry bits for now. TBH I have no idea what this is actually trying to do... [...] > --- a/include/linux/entry-common.h > +++ b/include/linux/entry-common.h > @@ -23,6 +23,10 @@ > # define _TIF_UPROBE (0) > #endif > > +#ifndef _TIF_UMCG > +# define _TIF_UMCG (0) > +#endif > + > /* > * SYSCALL_WORK flags handled in syscall_enter_from_user_mode() > */ > @@ -43,11 +47,13 @@ > SYSCALL_WORK_SYSCALL_EMU | \ > SYSCALL_WORK_SYSCALL_AUDIT | \ > SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ > + SYSCALL_WORK_SYSCALL_UMCG | \ > ARCH_SYSCALL_WORK_ENTER) > #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ > SYSCALL_WORK_SYSCALL_TRACE | \ > SYSCALL_WORK_SYSCALL_AUDIT | \ > SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ > + SYSCALL_WORK_SYSCALL_UMCG | \ > SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ > ARCH_SYSCALL_WORK_EXIT) > > @@ -221,8 +227,11 @@ static inline void local_irq_disable_exi > */ > static inline void irqentry_irq_enable(struct pt_regs *regs) > { > - if (!regs_irqs_disabled(regs)) > + if (!regs_irqs_disabled(regs)) { > local_irq_enable(); > + if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) > + umcg_sys_enter(regs, -1); > + } > } Perhaps it would make sense to have separate umcg_sys_enter(regs) and umcg_sys_enter_syscall(regs, syscallno)? Even if the former is just a wrapper, to make the entry/exit bits clearly correspond for all the !syscall cases? Also, is the syscall case meant to nest within this, or syscall entry paths not supposed to call irqentry_irq_enable() ? > > /** > @@ -232,8 +241,11 @@ static inline void irqentry_irq_enable(s > */ > static inline void irqentry_irq_disable(struct pt_regs *regs) > { > - if (!regs_irqs_disabled(regs)) > + if (!regs_irqs_disabled(regs)) { > + if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) > + umcg_sys_exit(regs); > local_irq_disable(); > + } > } Do the umcg_sys_{enter,exit}() calls need to happen with IRQs unmasked? * If not (and this nests): for arm64 these can live in our enter_from_user_mode() and exit_to_user_mode() helpers. * If so (or this doesn't nest): for arm64 we'd need to rework our local_daif_{inherit,restore,mask}() calls to handle this, though I've been meaning to do that anyway to handle pseudo-NMI better. Either way, it looks like we'd need helpers along the lines of: | static __always_inline void umcg_enter_from_user(struct pt_regs *regs) | { | if (current->flags & PF_UMCG_WORKER) | umcg_sys_enter(regs, -1); | } | | static __always_inline void umcg_exit_to_user(struct pt_regs *regs) | { | if (current->flags & PF_UMCG_WORKER) | umcg_sys_exit(regs); | } Thanks, Mark.
On Fri, Jan 21, 2022 at 04:57:29PM +0000, Mark Rutland wrote:
> On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote:
> > User Managed Concurrency Groups is an M:N threading toolkit that allows
> > constructing user space schedulers designed to efficiently manage
> > heterogeneous in-process workloads while maintaining high CPU
> > utilization (95%+).
> >
> > XXX moar changelog explaining how this is moar awesome than
> > traditional user-space threading.
>
> Awaiting a commit message that I can parse, I'm just looking at the entry bits
> for now. TBH I have no idea what this is actually trying to do...

Ha! yes.. I knew I was going to have to do that eventually :-)

It's basically a user-space scheduler that is subservient to the kernel
scheduler (hierarchical scheduling where a user task is a server for other
user tasks), where a server thread is in charge of selecting which of its
worker threads gets to run. The original idea was that each server only
ever runs a single worker, but PeterO is currently trying to reconsider
that.

The *big* feature here, over traditional N:M scheduling, is that threads
can block, while traditional userspace threading is limited to non-blocking
system calls (and, per later, page-faults).

In order to make that happen we must obviously hook schedule() for these
worker threads and inform userspace (the server thread) when this happens
such that it can select another worker thread to go vroom.

Meanwhile, a worker task getting woken from schedule() must not continue
running; instead it must enter the server's ready-queue and await its turn
again. Instead of dealing with arbitrary delays deep inside the kernel
block chain, we punt and let the task complete until return-to-user and
block it there. The time between schedule() and return-to-user is
unmanaged time.

Now, since we can't readily poke at userspace memory from schedule() (we
could be holding mmap_sem etc.), we pin the worker and server page on
sys-enter such that when we hit schedule() we can update state and then
unpin the pages, such that page pin time is from sys-enter to the first
schedule() or sys-exit, whichever comes first. This ensures the page-pin
is *short* term.

Additionally we must deal with signals :-(. The current approach is to let
them bust boundaries and run them as unmanaged time. UMCG userspace can
obviously control this by using pthread_sigmask() and friends.

Now, the reason for irqentry_irq_enable() is mostly #PF. When a worker
faults and blocks we want the same things to happen.

Anyway, so workers have 3 layers of hooks:

  sys_enter
  schedule()
  sys_exit
  return-to-user

There's a bunch of paths through this:

 - sys_enter -> sys_exit: no blocking; nothing changes:

   - sys_enter:
     * pin pages

   - sys_exit:
     * unpin pages

 - sys_enter -> schedule() -> sys_exit: we did block:

   - sys_enter:
     * pin pages

   - schedule():
     * mark worker BLOCKED
     * wake server (it will observe its current worker !RUNNING and
       select a new worker or idles)
     * unpin pages

   - sys_exit():
     * mark worker RUNNABLE
     * enqueue worker on server's runnable_list
     * wake server (which will observe a new runnable task, add it to
       whatever and if it was idle goes run, otherwise goes back to
       sleep to let its current worker finish)
     * block until RUNNING

 - sys_enter -> schedule() -> sys_exit -> return_to_user:

   As above; except now we got a signal while !RUNNING. sys_exit()
   terminates and return-to-user takes over running the signal and on
   return from the signal we'll again block until RUNNING, or do the
   whole signal dance again if so required.
Does this clarify things a little?
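To make the server/worker flow above a bit more concrete, here is a rough userspace sketch of the server side, pieced together from the uapi fragments later in the thread (sys_umcg_ctl()/sys_umcg_wait(), ::state, ::next_tid, ::runnable_workers_ptr). It is not part of the series: the syscall numbers are the x86-64 ones from the posted syscall table, but the flag/state values, the struct layout and the worker_tid bookkeeping are stand-ins.

```c
/* umcg_server_sketch.c - hypothetical, single-worker server loop. */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdatomic.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <time.h>

/* x86-64 syscall numbers from the syscall table in this series. */
#define __NR_umcg_ctl		450
#define __NR_umcg_wait		451

/*
 * The names below follow the series; the numeric values and the struct
 * layout are stand-ins, the authoritative definitions live in
 * include/uapi/linux/umcg.h of the posted patches.
 */
#define UMCG_CTL_REGISTER	0x0001
#define UMCG_TASK_RUNNABLE	2
#define UMCG_TF_COND_WAIT	0x0100

struct umcg_task {
	_Atomic uint32_t state;
	uint32_t next_tid;
	uint32_t server_tid;
	_Atomic uint64_t runnable_workers_ptr;
} __attribute__((aligned(64)));

static struct umcg_task server_self;
static uint32_t worker_tid;	/* hypothetical bookkeeping: the worker
				 * stores its gettid() here at startup */

static long umcg_ctl(uint32_t flags, struct umcg_task *self, clockid_t clk)
{
	return syscall(__NR_umcg_ctl, flags, self, clk);
}

static long umcg_wait(uint32_t flags, uint64_t abs_timeout)
{
	return syscall(__NR_umcg_wait, flags, abs_timeout);
}

static void server_loop(void)
{
	umcg_ctl(UMCG_CTL_REGISTER, &server_self, CLOCK_MONOTONIC);

	for (;;) {
		/*
		 * Detach the whole runnable list; per the series, userspace
		 * must take a worker off this list before its ->state is
		 * changed again.
		 */
		uint64_t head = atomic_exchange(&server_self.runnable_workers_ptr, 0);

		/*
		 * Single-worker sketch: if anything was queued it can only
		 * be our one worker, so hand the running context back to it.
		 */
		server_self.next_tid = head ? worker_tid : 0;

		/*
		 * Announce the cooperative wait, then wake ::next_tid (if
		 * set) and sleep until a worker blocks or becomes runnable.
		 */
		atomic_store(&server_self.state,
			     UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT);
		umcg_wait(0, 0);
	}
}
```

A real scheduler would of course walk the whole detached list and keep its own run-queue; the single-worker case just keeps the sketch short.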
On Fri, Jan 21, 2022 at 04:57:29PM +0000, Mark Rutland wrote:

> > @@ -221,8 +227,11 @@ static inline void local_irq_disable_exi
> >   */
> >  static inline void irqentry_irq_enable(struct pt_regs *regs)
> >  {
> > -	if (!regs_irqs_disabled(regs))
> > +	if (!regs_irqs_disabled(regs)) {
> >  		local_irq_enable();
> > +		if (user_mode(regs) && (current->flags & PF_UMCG_WORKER))
> > +			umcg_sys_enter(regs, -1);
> > +	}
> >  }
>
> Perhaps it would make sense to have separate umcg_sys_enter(regs) and
> umcg_sys_enter_syscall(regs, syscallno)? Even if the former is just a wrapper,
> to make the entry/exit bits clearly correspond for all the !syscall cases?

Can do I suppose.

> Also, is the syscall case meant to nest within this, or syscall entry paths not
> supposed to call irqentry_irq_enable() ?

No nesting, syscall_ vs irqentry_. And you can't have a syscall and an
exception both be from user at the same time :-)

> >  /**
> > @@ -232,8 +241,11 @@ static inline void irqentry_irq_enable(s
> >   */
> >  static inline void irqentry_irq_disable(struct pt_regs *regs)
> >  {
> > -	if (!regs_irqs_disabled(regs))
> > +	if (!regs_irqs_disabled(regs)) {
> > +		if (user_mode(regs) && (current->flags & PF_UMCG_WORKER))
> > +			umcg_sys_exit(regs);
> >  		local_irq_disable();
> > +	}
> >  }
>
> Do the umcg_sys_{enter,exit}() calls need to happen with IRQs unmasked?

Yes; both can end up blocking.

> * If not (and this nests): for arm64 these can live in our
>   enter_from_user_mode() and exit_to_user_mode() helpers.
>
> * If so (or this doesn't nest): for arm64 we'd need to rework our
>   local_daif_{inherit,restore,mask}() calls to handle this, though I've been
>   meaning to do that anyway to handle pseudo-NMI better.
>
> Either way, it looks like we'd need helpers along the lines of:
>
> | static __always_inline void umcg_enter_from_user(struct pt_regs *regs)
> | {
> | 	if (current->flags & PF_UMCG_WORKER)
> | 		umcg_sys_enter(regs, -1);
> | }
> |
> | static __always_inline void umcg_exit_to_user(struct pt_regs *regs)
> | {
> | 	if (current->flags & PF_UMCG_WORKER)
> | 		umcg_sys_exit(regs);
> | }

Would something like:

#ifndef arch_irqentry_irq_enter
static __always_inline bool arch_irqentry_irq_enter(struct pt_regs *regs)
{
	if (!regs_irqs_disabled(regs)) {
		local_irq_enable();
		return true;
	}
	return false;
}
#endif

static __always_inline void irqentry_irq_enter(struct pt_regs *regs)
{
	if (arch_irqentry_irq_inherit(regs)) {
		if (user_mode(regs) && (current->flags & PF_UMCG_WORKER))
			umcg_sys_enter(regs, -1);
	}
}

Work? Then arm64 can do:

static __always_inline bool arch_irqentry_irq_enter(struct pt_regs *regs)
{
	local_daif_inherit();
	return interrupts_enabled(regs);
}

or somesuch...
On Mon, Jan 24, 2022 at 11:03:06AM +0100, Peter Zijlstra wrote: > > Either way, it looks like we'd need helpers along the lines of: > > > > | static __always_inline void umcg_enter_from_user(struct pt_regs *regs) > > | { > > | if (current->flags & PF_UMCG_WORKER) > > | umcg_sys_enter(regs, -1); > > | } > > | > > | static __always_inline void umcg_exit_to_user(struct pt_regs *regs) > > | { > > | if (current->flags & PF_UMCG_WORKER) > > | umcg_sys_exit(regs); > > | } > > Would something like: > > #ifndef arch_irqentry_irq_enter > static __always_inline bool arch_irqentry_irq_enter(struct pt_regs *regs) > { > if (!regs_irqs_disabled(regs)) { > local_irq_enable(); > return true; > } > return false; > } > #endif > > static __always_inline void irqentry_irq_enter(struct pt_regs *regs) > { > if (arch_irqentry_irq_inherit(regs)) { > if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) > umcg_sys_enter(regs, -1); > } > } > > Work? Then arm64 can do: > > static __always_inline bool arch_irqentry_irq_enter(struct pt_regs *regs) > { > local_daif_inherit(); > return interrupts_enabled(regs); > } > > or somesuch... Ah,.. just read your other email, so your concern is about the user_mode() thing due to ARM64 taking a different exception path for from-user vs from-kernel ? I don't mind too much if arm64 decides to open-code the umcg hooks, but please do it such that's hard to forget a spot.
On Mon, Jan 24, 2022 at 11:07:04AM +0100, Peter Zijlstra wrote: > On Mon, Jan 24, 2022 at 11:03:06AM +0100, Peter Zijlstra wrote: > > > > Either way, it looks like we'd need helpers along the lines of: > > > > > > | static __always_inline void umcg_enter_from_user(struct pt_regs *regs) > > > | { > > > | if (current->flags & PF_UMCG_WORKER) > > > | umcg_sys_enter(regs, -1); > > > | } > > > | > > > | static __always_inline void umcg_exit_to_user(struct pt_regs *regs) > > > | { > > > | if (current->flags & PF_UMCG_WORKER) > > > | umcg_sys_exit(regs); > > > | } > > > > Would something like: > > > > #ifndef arch_irqentry_irq_enter > > static __always_inline bool arch_irqentry_irq_enter(struct pt_regs *regs) > > { > > if (!regs_irqs_disabled(regs)) { > > local_irq_enable(); > > return true; > > } > > return false; > > } > > #endif > > > > static __always_inline void irqentry_irq_enter(struct pt_regs *regs) > > { > > if (arch_irqentry_irq_inherit(regs)) { > > if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) > > umcg_sys_enter(regs, -1); > > } > > } > > > > Work? Then arm64 can do: > > > > static __always_inline bool arch_irqentry_irq_enter(struct pt_regs *regs) > > { > > local_daif_inherit(); > > return interrupts_enabled(regs); > > } > > > > or somesuch... > > Ah,.. just read your other email, so your concern is about the > user_mode() thing due to ARM64 taking a different exception path for > from-user vs from-kernel ? Yup; it's two-fold: 1) We have separate vectors for entry from-user and from-kernel, and I'd like to avoid the conditionality (e.g. the user_mode(regs) checks) where possible. Having that unconditional and explicit in the from-user code avoids redundant work and is much easier to see that it's correct and balanced. We have separate irqentry_from_user() and irqentry_from_kernel() helpers today for this. 2) Due to the way we nest classes of exception, on the entry path we manipulate the flags differently depending on which specific exception we've taken. On the return path we always mask everything (necessary due to the way exception return works architecturally). Luckily exceptions from-user don't nest, so those cases are simpler than exceptions from-kernel. > I don't mind too much if arm64 decides to open-code the umcg hooks, but > please do it such that's hard to forget a spot. I'll see what I can do. :) Thanks, Mark.
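For illustration only, the open-coded arm64 variant Mark mentions could look something like the sketch below, built around the umcg_enter_from_user()/umcg_exit_to_user() helpers proposed earlier in the thread; the handler name, the helper signatures and the exact interaction with the DAIF manipulation are assumptions, not part of the series.

```c
/*
 * Hypothetical arm64 shape, only to show where the hooks could sit on the
 * dedicated from-user paths; el0_some_exception() is a stand-in rather
 * than a real vector handler, and the ordering vs. local_daif_*() is
 * exactly the open question above.
 */
static __always_inline void umcg_enter_from_user(struct pt_regs *regs)
{
	if (current->flags & PF_UMCG_WORKER)
		umcg_sys_enter(regs, -1);	/* not a syscall */
}

static __always_inline void umcg_exit_to_user(struct pt_regs *regs)
{
	if (current->flags & PF_UMCG_WORKER)
		umcg_sys_exit(regs);
}

static void el0_some_exception(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
	local_daif_restore(DAIF_PROCCTX);	/* IRQs unmasked: the hook may block */
	umcg_enter_from_user(regs);

	/* ... handle the exception proper ... */

	umcg_exit_to_user(regs);		/* may block, so before re-masking */
	exit_to_user_mode(regs);		/* masks everything for the return, as above */
}
```

Since these live only on the from-user paths, no user_mode(regs) check is needed, which is the point Mark makes above.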
On Fri, Jan 21, 2022 at 12:47:58PM +0100, Peter Zijlstra wrote: > On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote: > > > +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo) > > +{ > > + struct task_struct *tsk = current; > > + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); > > + bool worker = tsk->flags & PF_UMCG_WORKER; > > + int ret; > > + > > + if (!self || flags) > > + return -EINVAL; > > + > > + if (worker) { > > + tsk->flags &= ~PF_UMCG_WORKER; > > + if (timo) > > + return -ERANGE; > > + } > > + > > + /* see umcg_sys_{enter,exit}() syscall exceptions */ > > + ret = umcg_pin_pages(); > > + if (ret) > > + goto unblock; > > + > > + /* > > + * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE. > > + */ > > + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE); > > + if (ret) > > + goto unpin; > > + > > + ret = umcg_wake_next(tsk, self); > > + if (ret) > > + goto unpin; > > + > > + if (worker) { > > + /* > > + * If this fails it is possible ::next_tid is already running > > + * while this task is not going to block. This violates our > > + * constraints. > > + * > > + * That said, pretty much the only way to make this fail is by > > + * force munmap()'ing things. In which case one is most welcome > > + * to the pieces. > > + */ > > + ret = umcg_enqueue_and_wake(tsk); > > + if (ret) > > + goto unpin; > > + } > > + > > + umcg_unpin_pages(); > > + > > + ret = umcg_wait(timo); > > + switch (ret) { > > + case 0: /* all done */ > > + case -EINTR: /* umcg_notify_resume() will continue the wait */ > > So I was playing with the whole worker timeout thing last night and > realized this is broken. If we get a signal while we have a timeout, the > timeout gets lost. > > I think the easiest solution is to have umcg_notify_resume() also resume > the timeout, but the first pass of that was yuck, so I need to try > again. > > Related, by moving the whole enqueue-and-wake thing into the timeout, we > get more 'fun' failure cases :-( This is the best I can come up with,... but it's a hot mess :-( Still, let me go try this. 
--- --- a/include/uapi/linux/umcg.h +++ b/include/uapi/linux/umcg.h @@ -127,6 +127,14 @@ struct umcg_task { } __attribute__((packed, aligned(UMCG_TASK_ALIGN))); /** + * enum umcg_wait_flag - flags to pass to sys_umcg_wait + * @UMCG_WAIT_ENQUEUE: Enqueue the task on runnable_workers_ptr before waiting + */ +enum umcg_wait_flag { + UMCG_WAIT_ENQUEUE = 0x0001, +}; + +/** * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl * @UMCG_CTL_REGISTER: register the current task as a UMCG task * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task --- a/kernel/sched/umcg.c +++ b/kernel/sched/umcg.c @@ -227,7 +227,6 @@ static int umcg_update_state(struct task #define UMCG_DIE(reason) __UMCG_DIE(,reason) #define UMCG_DIE_PF(reason) __UMCG_DIE(pagefault_enable(), reason) -#define UMCG_DIE_UNPIN(reason) __UMCG_DIE(umcg_unpin_pages(), reason) /* Called from syscall enter path and exceptions that can schedule */ void umcg_sys_enter(struct pt_regs *regs, long syscall) @@ -371,15 +370,23 @@ static int umcg_enqueue_runnable(struct static int umcg_enqueue_and_wake(struct task_struct *tsk) { - int ret; - - ret = umcg_enqueue_runnable(tsk); + int ret = umcg_enqueue_runnable(tsk); if (!ret) ret = umcg_wake_server(tsk); return ret; } +static int umcg_pin_enqueue_and_wake(struct task_struct *tsk) +{ + int ret = umcg_pin_pages(); + if (!ret) { + ret = umcg_enqueue_and_wake(tsk); + umcg_unpin_pages(); + } + return ret; +} + /* * umcg_wait: Wait for ->state to become RUNNING * @@ -469,16 +476,11 @@ static void umcg_unblock_and_wait(void) /* avoid recursion vs schedule() */ tsk->flags &= ~PF_UMCG_WORKER; - if (umcg_pin_pages()) - UMCG_DIE("pin"); - if (umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE)) - UMCG_DIE_UNPIN("state"); + UMCG_DIE("state"); - if (umcg_enqueue_and_wake(tsk)) - UMCG_DIE_UNPIN("enqueue-wake"); - - umcg_unpin_pages(); + if (umcg_pin_enqueue_and_wake(tsk)) + UMCG_DIE("pin-enqueue-wake"); switch (umcg_wait(0)) { case 0: @@ -544,18 +546,13 @@ void umcg_notify_resume(struct pt_regs * goto done; if (state & UMCG_TF_PREEMPT) { - if (umcg_pin_pages()) - UMCG_DIE("pin"); - if (umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_RUNNABLE)) - UMCG_DIE_UNPIN("state"); + UMCG_DIE("state"); - if (umcg_enqueue_and_wake(tsk)) - UMCG_DIE_UNPIN("enqueue-wake"); - - umcg_unpin_pages(); + if (umcg_pin_enqueue_and_wake(tsk)) + UMCG_DIE("pin-enqueue-wake"); } if (WARN_ON_ONCE(timeout && syscall_get_nr(tsk, regs) != __NR_umcg_wait)) @@ -570,6 +567,13 @@ void umcg_notify_resume(struct pt_regs * case -ETIMEDOUT: regs_set_return_value(regs, ret); + if (worker) { + ret = umcg_pin_enqueue_and_wake(tsk); + if (ret) { + umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING); + regs_set_return_value(regs, ret); + } + } break; default: @@ -710,7 +714,6 @@ static int umcg_wake_next(struct task_st * Returns: * 0 - OK; * -ETIMEDOUT - the timeout expired; - * -ERANGE - the timeout is out of range (worker); * -EAGAIN - ::state wasn't RUNNABLE, concurrent wakeup; * -EFAULT - failed accessing struct umcg_task __user of the current * task, the server or next; @@ -725,48 +728,40 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u bool worker = tsk->flags & PF_UMCG_WORKER; int ret; - if (!self || flags) + if (!self || (flags & ~(UMCG_WAIT_ENQUEUE))) return -EINVAL; - if (worker) { - tsk->flags &= ~PF_UMCG_WORKER; - if (timo) - return -ERANGE; - } + if ((flags & UMCG_WAIT_ENQUEUE) && (timo || !worker)) + return -EINVAL; - /* see umcg_sys_{enter,exit}() syscall exceptions */ - ret = umcg_pin_pages(); 
- if (ret) - goto unblock; + if (worker) + tsk->flags &= ~PF_UMCG_WORKER; /* * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE. */ ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE); if (ret) - goto unpin; + goto unblock; ret = umcg_wake_next(tsk, self); if (ret) - goto unpin; + goto unblock; - if (worker) { + if (flags & UMCG_WAIT_ENQUEUE) { /* * If this fails it is possible ::next_tid is already running * while this task is not going to block. This violates our * constraints. * - * That said, pretty much the only way to make this fail is by - * force munmap()'ing things. In which case one is most welcome - * to the pieces. + * Userspace can detect this case by looking at: ::next_tid & + * TID_RUNNING. */ - ret = umcg_enqueue_and_wake(tsk); + ret = umcg_pin_enqueue_and_wake(tsk); if (ret) - goto unpin; + goto unblock; } - umcg_unpin_pages(); - ret = umcg_wait(timo); switch (ret) { case 0: /* all done */ @@ -775,6 +770,26 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u ret = 0; break; + case -ETIMEDOUT: + if (worker) { + /* + * See the UMCG_WAIT_ENQUEUE case above; except this is + * even more complicated because we *did* wait and + * things might have progressed lots. + * + * Still, abort the wait because otherwise nobody would + * ever find us again. Hopefully userspace can make + * sense of things. + */ + ret = umcg_pin_enqueue_and_wake(tsk); + if (ret) + goto unblock; + + ret = -ETIMEDOUT; + break; + } + goto unblock; + default: goto unblock; } @@ -783,8 +798,6 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u tsk->flags |= PF_UMCG_WORKER; return ret; -unpin: - umcg_unpin_pages(); unblock: umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING); goto out;
On Fri, Jan 21, 2022 at 04:18:46PM +0100, Peter Zijlstra wrote: > Something like this, still yuck though. Also still need to write me a > test for this. > > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1300,12 +1300,14 @@ struct task_struct { > clockid_t umcg_clock; > struct umcg_task __user *umcg_task; > > - /* setup by umcg_pin_enter() */ > + /* setup by umcg_pin_pages() */ > struct page *umcg_page; > > struct task_struct *umcg_server; > struct umcg_task __user *umcg_server_task; > struct page *umcg_server_page; > + > + u64 umcg_timeout; > #endif > > struct tlbflush_unmap_batch tlb_ubc; > --- a/kernel/sched/umcg.c > +++ b/kernel/sched/umcg.c > @@ -232,6 +232,8 @@ static int umcg_update_state(struct task > /* Called from syscall enter path and exceptions that can schedule */ > void umcg_sys_enter(struct pt_regs *regs, long syscall) > { > + current->umcg_timeout = 0; > + > /* avoid recursion vs our own syscalls */ > if (syscall == __NR_umcg_wait || > syscall == __NR_umcg_ctl) > @@ -519,6 +521,7 @@ void umcg_notify_resume(struct pt_regs * > struct umcg_task __user *self = tsk->umcg_task; > bool worker = tsk->flags & PF_UMCG_WORKER; > u32 state; > + int ret; > > /* avoid recursion vs schedule() */ > if (worker) > @@ -554,12 +557,17 @@ void umcg_notify_resume(struct pt_regs * > umcg_unpin_pages(); > } > > - switch (umcg_wait(0)) { > + ret = umcg_wait(tsk->umcg_timeout); Oh how I hate signals... this can get scribbled by a syscall/fault from sigcontext :/ Maybe I can recover the timo argument from the original syscall pt_regs.. let me try. > + switch (ret) { > case 0: > case -EINTR: > /* we will resume the wait after the signal */ > break; > > + case -ETIMEDOUT: > + regs_set_return_value(regs, ret); > + break; > + > default: > UMCG_DIE("wait"); > } > @@ -759,6 +767,7 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u > switch (ret) { > case 0: /* all done */ > case -EINTR: /* umcg_notify_resume() will continue the wait */ > + tsk->umcg_timeout = timo; > ret = 0; > break; >
Hi Peter, On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote: [...] > +/* pre-schedule() */ > +void umcg_wq_worker_sleeping(struct task_struct *tsk) > +{ > + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); > + int ret; > + > + if (!tsk->umcg_server) { > + /* > + * Already blocked before, the pages are unpinned. > + */ > + return; > + } > + > + /* Must not fault, mmap_sem might be held. */ > + pagefault_disable(); > + > + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_BLOCKED); > + if (ret == -EAGAIN) { > + /* > + * Consider: > + * > + * self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT; > + * ... > + * sys_umcg_wait(); > + * > + * and the '...' code doing a blocking syscall/fault. This > + * ensures that returns with UMCG_TASK_RUNNING, which will make /UMCG_TASK_RUNNING/UMCG_TASK_RUNNABLE/ > + * sys_umcg_wait() return with -EAGAIN. > + */ > + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_BLOCKED); > + } > + if (ret) > + UMCG_DIE_PF("state"); > + > + if (umcg_wake_server(tsk)) > + UMCG_DIE_PF("wake"); > + > + pagefault_enable(); > + > + /* > + * We're going to sleep, make sure to unpin the pages, this ensures > + * the pins are temporary. Also see umcg_sys_exit(). > + */ > + umcg_unpin_pages(); > +} [...] > +/* Called from syscall exit path and exceptions that can schedule */ > +void umcg_sys_exit(struct pt_regs *regs) > +{ > + struct task_struct *tsk = current; > + long syscall = syscall_get_nr(tsk, regs); > + > + if (syscall == __NR_umcg_wait || > + syscall == __NR_umcg_ctl) > + return; > + > + if (tsk->umcg_server) { > + /* > + * Didn't block, we done. > + */ > + umcg_unpin_pages(); > + return; > + } > + > + umcg_unblock_and_wait(); umcg_unblock_and_wait() -> umcg_enqueue_and_wake() -> umcg_wake_server() -> umcg_wake_task(tsk->umcg_server, ...) tsk->umcg_server is NULL here and umcg_wake_task() use it to update state in umcg_update_state(NULL, ...), that means tsk->umcg_clock will happen something i do not know. There are two places to call umcg_unblock_and_wait(). One is in umcg_register() where the server is set. Another one is in umcg_sys_exit() where the server is not set. May use a bool to indicate if the server is set. > +} [...] > +/** > + * sys_umcg_wait: transfer running context > + * > + * Called like: > + * > + * self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT; > + * ... > + * sys_umcg_wait(0, time); > + * > + * The syscall will clear TF_COND_WAIT and wait until state becomes RUNNING. > + * The code '...' must not contain syscalls > + * > + * If self->next_tid is set and indicates a valid UMCG task with RUNNABLE state > + * that task will be made RUNNING and woken -- transfering the running context > + * to that task. In this case self->next_tid is modified with TID_RUNNING to > + * indicate self->next_tid is consumed. > + * > + * If self->next has TID_RUNNING set, it is validated the related task has /self->next/self->next_tid/ Things are not clear to me even they are clear now. Nice. Thanks, Tao
On Mon, Jan 24, 2022 at 03:29:56PM +0100, Peter Zijlstra wrote: > On Fri, Jan 21, 2022 at 04:18:46PM +0100, Peter Zijlstra wrote: > > Something like this, still yuck though. Also still need to write me a > > test for this. > > > > --- a/kernel/sched/umcg.c > > +++ b/kernel/sched/umcg.c > > @@ -232,6 +232,8 @@ static int umcg_update_state(struct task > > /* Called from syscall enter path and exceptions that can schedule */ > > void umcg_sys_enter(struct pt_regs *regs, long syscall) > > { > > + current->umcg_timeout = 0; > > + > > /* avoid recursion vs our own syscalls */ > > if (syscall == __NR_umcg_wait || > > syscall == __NR_umcg_ctl) > > @@ -519,6 +521,7 @@ void umcg_notify_resume(struct pt_regs * > > struct umcg_task __user *self = tsk->umcg_task; > > bool worker = tsk->flags & PF_UMCG_WORKER; > > u32 state; > > + int ret; > > > > /* avoid recursion vs schedule() */ > > if (worker) > > @@ -554,12 +557,17 @@ void umcg_notify_resume(struct pt_regs * > > umcg_unpin_pages(); > > } > > > > - switch (umcg_wait(0)) { > > + ret = umcg_wait(tsk->umcg_timeout); > > Oh how I hate signals... this can get scribbled by a syscall/fault from > sigcontext :/ > > Maybe I can recover the timo argument from the original syscall > pt_regs.. let me try. Urgh, recursive hell... If the signal does *anything* that tickles notify-resume it'll find RUNNABLE and go wait there --- ad infinitum. I need to go cook dinner, I'll prod more at this later
On Mon, Jan 24, 2022 at 8:44 AM Peter Zijlstra <peterz@infradead.org> wrote: [...] > > > > Oh how I hate signals... this can get scribbled by a syscall/fault from > > sigcontext :/ > > > > Maybe I can recover the timo argument from the original syscall > > pt_regs.. let me try. I don't think we need to do anything special with timeouts if a signal happens - just normally return -EINTR (after a resume from a server) and let the userspace figure things out. Alternatively, in my version of the patchset UMCG tasks ignore non-fatal signals. :) [...]
On Mon, Jan 24, 2022 at 03:29:56PM +0100, Peter Zijlstra wrote: > Oh how I hate signals... this can get scribbled by a syscall/fault from > sigcontext :/ OK, the below seems to work. I'll see if I can clean it up some. --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -94,28 +94,44 @@ static inline int syscall_get_arch(struc #else /* CONFIG_X86_64 */ -static inline void syscall_get_arguments(struct task_struct *task, - struct pt_regs *regs, - unsigned long *args) +static inline unsigned long +syscall_get_argument(struct task_struct *task, struct pt_regs *regs, int nr) { -# ifdef CONFIG_IA32_EMULATION +#ifdef CONFIG_IA32_EMULATION if (task->thread_info.status & TS_COMPAT) { - *args++ = regs->bx; - *args++ = regs->cx; - *args++ = regs->dx; - *args++ = regs->si; - *args++ = regs->di; - *args = regs->bp; + switch (nr) { + case 0: return regs->bx; + case 1: return regs->cx; + case 2: return regs->dx; + case 3: return regs->si; + case 4: return regs->di; + case 5: return regs->bp; + } } else -# endif +#endif { - *args++ = regs->di; - *args++ = regs->si; - *args++ = regs->dx; - *args++ = regs->r10; - *args++ = regs->r8; - *args = regs->r9; + switch (nr) { + case 0: return regs->di; + case 1: return regs->si; + case 2: return regs->dx; + case 3: return regs->r10; + case 4: return regs->r8; + case 5: return regs->r9; + } } + + WARN_ON_ONCE(1); + return 0; +} + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + int i; + + for (i = 0; i < 6; i++) + *args++ = syscall_get_argument(task, regs, i); } static inline int syscall_get_arch(struct task_struct *task) --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1307,6 +1307,9 @@ struct task_struct { struct task_struct *umcg_server; struct umcg_task __user *umcg_server_task; struct page *umcg_server_page; + + unsigned long umcg_stack_pointer; + unsigned int umcg_worker; #endif struct tlbflush_unmap_batch tlb_ubc; --- a/kernel/sched/umcg.c +++ b/kernel/sched/umcg.c @@ -459,7 +459,7 @@ static int umcg_wait(u64 timo) /* * Blocked case for umcg_sys_exit(), shared with sys_umcg_ctl(). */ -static void umcg_unblock_and_wait(void) +static void umcg_unblock(void) { struct task_struct *tsk = current; struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); @@ -478,15 +478,7 @@ static void umcg_unblock_and_wait(void) umcg_unpin_pages(); - switch (umcg_wait(0)) { - case 0: - case -EINTR: - /* notify_resume will continue the wait after the signal */ - break; - - default: - UMCG_DIE("wait"); - } + /* notify-resume will wait */ tsk->flags |= PF_UMCG_WORKER; } @@ -509,7 +501,7 @@ void umcg_sys_exit(struct pt_regs *regs) return; } - umcg_unblock_and_wait(); + umcg_unblock(); } /* return-to-user path */ @@ -518,11 +510,47 @@ void umcg_notify_resume(struct pt_regs * struct task_struct *tsk = current; struct umcg_task __user *self = tsk->umcg_task; bool worker = tsk->flags & PF_UMCG_WORKER; + u64 timeout = 0; u32 state; + int ret; + + /* + * Unix signals are horrible, but we have to handle them somehow. + * + * - simply discarding a signal breaks userspace so is not an option. + * + * - returning -EINTR and have userspace deal with it is not an option + * since we can be blocked here due to !syscall reasons (page-faults + * for example). But it's also not permissible to have random + * syscalls return -EINTR that didn't before. 
+ * + * - subjecting signal handlers to UMCG would render existing signal + * handler code subject to the whims and latencies of UMCG; given that + * most signal hander code is short and time sensitive, this seems + * undesirable (consider ^C not working because it got delivered to a + * blocked task). + * + * Therefore the chosen path is to exclude signal context from UMCG + * entirely and treat it as unmanaged time. + */ + if (tsk->umcg_stack_pointer) { + if (tsk->umcg_stack_pointer != user_stack_pointer(regs)) + return; + + tsk->umcg_stack_pointer = 0; + worker = tsk->umcg_worker; + tsk->umcg_worker = 0; + + if (worker) { + set_syscall_work(SYSCALL_UMCG); + /* and PF_UMCG_SYSCALL at done */ + } + goto resume; + } /* avoid recursion vs schedule() */ if (worker) - current->flags &= ~PF_UMCG_WORKER; + tsk->flags &= ~PF_UMCG_WORKER; if (get_user(state, &self->state)) UMCG_DIE("get-state"); @@ -554,10 +582,31 @@ void umcg_notify_resume(struct pt_regs * umcg_unpin_pages(); } - switch (umcg_wait(0)) { +resume: + /* + * Hack alert! Since the return-to-user path must resume waiting it + * needs access to the timeout argument and set the return value. + */ + if (syscall_get_nr(tsk, regs) == __NR_umcg_wait) + timeout = syscall_get_argument(tsk, regs, 1); + + ret = umcg_wait(timeout); + switch (ret) { case 0: + break; + case -EINTR: /* we will resume the wait after the signal */ + WARN_ON_ONCE(tsk->umcg_stack_pointer); + tsk->umcg_stack_pointer = user_stack_pointer(regs); + tsk->umcg_worker = worker; + clear_task_syscall_work(tsk, SYSCALL_UMCG); + /* implicitly clears PF_UMCG_WORKER with the early exit */ + return; + + case -ETIMEDOUT: + /* must be __NR_umcg_wait */ + regs_set_return_value(regs, ret); break; default: @@ -566,7 +615,7 @@ void umcg_notify_resume(struct pt_regs * done: if (worker) - current->flags |= PF_UMCG_WORKER; + tsk->flags |= PF_UMCG_WORKER; } /** @@ -755,16 +804,7 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u umcg_unpin_pages(); - ret = umcg_wait(timo); - switch (ret) { - case 0: /* all done */ - case -EINTR: /* umcg_notify_resume() will continue the wait */ - ret = 0; - break; - - default: - goto unblock; - } + /* notify-resume will wait */ out: if (worker) tsk->flags |= PF_UMCG_WORKER; @@ -831,7 +871,7 @@ static int umcg_register(struct umcg_tas set_syscall_work(SYSCALL_UMCG); /* hook syscall */ set_thread_flag(TIF_UMCG); /* hook return-to-user */ - umcg_unblock_and_wait(); + umcg_unblock(); } else { if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_RUNNING)
On Mon, Jan 24, 2022 at 10:46:17PM +0800, Tao Zhou wrote:
> Hi Peter,
>
> On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote:
>
> [...]
>
> > +/* pre-schedule() */
> > +void umcg_wq_worker_sleeping(struct task_struct *tsk)
> > +{
> > +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> > +	int ret;
> > +
> > +	if (!tsk->umcg_server) {
> > +		/*
> > +		 * Already blocked before, the pages are unpinned.
> > +		 */
> > +		return;
> > +	}
> > +
> > +	/* Must not fault, mmap_sem might be held. */
> > +	pagefault_disable();
> > +
> > +	ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_BLOCKED);
> > +	if (ret == -EAGAIN) {
> > +		/*
> > +		 * Consider:
> > +		 *
> > +		 *   self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT;
> > +		 *   ...
> > +		 *   sys_umcg_wait();
> > +		 *
> > +		 * and the '...' code doing a blocking syscall/fault. This
> > +		 * ensures that returns with UMCG_TASK_RUNNING, which will make
>
> /UMCG_TASK_RUNNING/UMCG_TASK_RUNNABLE/

So the issue is that:

	self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT;

	<#PF>
	  umcg_sys_enter()
	    umcg_pin_user_page()

	  schedule()
	    sched_submit_work()
	      umcg_wq_worker_sleeping()
	        umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_BLOCKED) // -EAGAIN
	        UMCG_DIE()

Which is clearly not desirable.

So this additional thing ensures that:

	        umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_BLOCKED) // 0

	  umcg_sys_exit()
	    umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE);
	    umcg_enqueue_and_wake()

	  umcg_notify_resume()
	    umcg_wait()

	  // must be UMCG_TASK_RUNNING here
	</#PF>

So when the pagefault finally does return, it will have: UMCG_TASK_RUNNING.

Which will then make sys_umcg_wait() return -EAGAIN and around we go.

> > +		 * sys_umcg_wait() return with -EAGAIN.
> > +		 */
> > +		ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_BLOCKED);
> > +	}
> > +	if (ret)
> > +		UMCG_DIE_PF("state");
> > +
> > +	if (umcg_wake_server(tsk))
> > +		UMCG_DIE_PF("wake");
> > +
> > +	pagefault_enable();
> > +
> > +	/*
> > +	 * We're going to sleep, make sure to unpin the pages, this ensures
> > +	 * the pins are temporary. Also see umcg_sys_exit().
> > +	 */
> > +	umcg_unpin_pages();
> > +}
On Mon, Jan 24, 2022 at 10:46:17PM +0800, Tao Zhou wrote: > > +/* Called from syscall exit path and exceptions that can schedule */ > > +void umcg_sys_exit(struct pt_regs *regs) > > +{ > > + struct task_struct *tsk = current; > > + long syscall = syscall_get_nr(tsk, regs); > > + > > + if (syscall == __NR_umcg_wait || > > + syscall == __NR_umcg_ctl) > > + return; > > + > > + if (tsk->umcg_server) { > > + /* > > + * Didn't block, we done. > > + */ > > + umcg_unpin_pages(); > > + return; > > + } > > + > > + umcg_unblock_and_wait(); > > umcg_unblock_and_wait() -> umcg_enqueue_and_wake() -> > umcg_wake_server() -> umcg_wake_task(tsk->umcg_server, ...) > > tsk->umcg_server is NULL here and umcg_wake_task() use it to update > state in umcg_update_state(NULL, ...), that means tsk->umcg_clock > will happen something i do not know. I think umcg_unblock_and_wait() will repin, at which point we should have tsk->umcg_server again. > There are two places to call umcg_unblock_and_wait(). One is in > umcg_register() where the server is set. Another one is in > umcg_sys_exit() where the server is not set. May use a bool to > indicate if the server is set. I'm not sure what you're on about, but I absolutely hate redundant state, that only leads to problems.
On Mon, Jan 24, 2022 at 10:46:17PM +0800, Tao Zhou wrote: > > +/** > > + * sys_umcg_wait: transfer running context > > + * > > + * Called like: > > + * > > + * self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT; > > + * ... > > + * sys_umcg_wait(0, time); > > + * > > + * The syscall will clear TF_COND_WAIT and wait until state becomes RUNNING. > > + * The code '...' must not contain syscalls > > + * > > + * If self->next_tid is set and indicates a valid UMCG task with RUNNABLE state > > + * that task will be made RUNNING and woken -- transfering the running context > > + * to that task. In this case self->next_tid is modified with TID_RUNNING to > > + * indicate self->next_tid is consumed. > > + * > > + * If self->next has TID_RUNNING set, it is validated the related task has > > /self->next/self->next_tid/ Yeah, there's more of that, I'll be sure to go re-read all the comments.
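For completeness, the worker-to-worker case described in the quoted sys_umcg_wait() documentation and in umcg_wake_next()'s comment would look roughly like this from userspace; it reuses the stand-in definitions from the server sketch earlier in the thread and is illustrative only.

```c
/*
 * Hypothetical worker-to-worker switch, reusing the stand-in struct
 * umcg_task, constants and umcg_wait() wrapper from the server sketch
 * above.  Per the quoted comment, when a *worker* transfers the running
 * context to another worker, userspace also has to re-point the server's
 * ::next_tid at the new current worker; the kernel only consumes
 * self->next_tid.
 */
static void switch_to_worker(struct umcg_task *self,
			     struct umcg_task *server,
			     uint32_t next_worker_tid)
{
	__atomic_store_n(&self->next_tid, next_worker_tid, __ATOMIC_RELAXED);
	__atomic_store_n(&server->next_tid, next_worker_tid, __ATOMIC_RELAXED);

	/*
	 * Enter the cooperative wait; sys_umcg_wait() clears TF_COND_WAIT,
	 * wakes ::next_tid and blocks us until we are made RUNNING again.
	 */
	atomic_store(&self->state, UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT);
	umcg_wait(0, 0);
}
```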
On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote: Another iterator of the patch, some nits below. [...] > +/* > + * Pinning a page inhibits rmap based unmap for Anon pages. Doing a load > + * through the user mapping ensures the user mapping exists. > + */ > +#define umcg_pin_and_load(_self, _pagep, _member) \ > +({ \ > + __label__ __out; \ > + int __ret = -EFAULT; \ > + \ > + if (pin_user_pages_fast((unsigned long)(_self), 1, 0, &(_pagep)) != 1) \ > + goto __out; \ > + \ > + if (!PageAnon(_pagep) || \ > + get_user(_member, &(_self)->_member)) { \ Here should be 'get_user(_member, &(_self)->##_member))' if I am not wrong. > + unpin_user_page(_pagep); \ > + goto __out; \ > + } \ > + __ret = 0; \ > +__out: __ret; \ > +}) [...] > + > +/* > + * Enqueue @tsk on it's server's runnable list > + * > + * Must be called in umcg_pin_pages() context, relies on tsk->umcg_server. > + * > + * cmpxchg based single linked list add such that list integrity is never > + * violated. Userspace *MUST* remove it from the list before changing ->state. > + * As such, we must change state to RUNNABLE before enqueue. > + * > + * Returns: > + * 0: success > + * -EFAULT > + */ > +static int umcg_enqueue_runnable(struct task_struct *tsk) > +{ > + struct umcg_task __user *server = tsk->umcg_server_task; > + struct umcg_task __user *self = tsk->umcg_task; > + u64 first_ptr, *head = &server->runnable_workers_ptr; > + u64 self_ptr = (unsigned long)self; Why not 'u64 self_ptr = (u64)self;' ? > + /* > + * umcg_pin_pages() did access_ok() on both pointers, use self here > + * only because __user_access_begin() isn't available in generic code. > + */ > + if (!user_access_begin(self, sizeof(*self))) > + return -EFAULT; > + > + unsafe_get_user(first_ptr, head, Efault); > + do { > + unsafe_put_user(first_ptr, &self->runnable_workers_ptr, Efault); > + } while (!unsafe_try_cmpxchg_user(head, &first_ptr, self_ptr, Efault)); > + > + user_access_end(); > + return 0; > + > +Efault: > + user_access_end(); > + return -EFAULT; > +} [...] > +/* > + * Handles ::next_tid as per sys_umcg_wait(). > + * > + * ::next_tid - return > + * ----------------------------- > + * 0 - 0 (success) > + * > + * tid - -ESRCH (no such task, or not of this UMCG) > + * - -EAGAIN (next::state != RUNNABLE) > + * - 0 (success, ::next_tid |= RUNNING) > + * > + * tid|RUNNING - -EAGAIN (next::state != RUNNING) > + * - 0 (success) > + * > + * Returns: > + * 0: success > + * -EFAULT > + * -ESRCH > + * -EAGAIN > + */ > +static int umcg_wake_next(struct task_struct *tsk, struct umcg_task __user *self) @tsk is not used in function. > +{ > + struct umcg_task __user *next_task; > + struct task_struct *next; > + u32 next_tid, state; > + int ret; > + > + if (get_user(next_tid, &self->next_tid)) > + return -EFAULT; > + > + if (!next_tid) > + return 0; > + > + next = umcg_get_task(next_tid); > + if (!next) > + return -ESRCH; > + > + next_task = READ_ONCE(next->umcg_task); > + > + if (next_tid & UMCG_TID_RUNNING) { > + ret = -EFAULT; > + if (get_user(state, &next_task->state)) > + goto put_next; > + > + ret = 0; > + if ((state & UMCG_TASK_MASK) != UMCG_TASK_RUNNING) > + ret = -EAGAIN; > + > + } else { > + ret = umcg_wake_task(next, next_task); > + if (ret) > + goto put_next; > + > + ret = -EFAULT; > + if (put_user(next_tid | UMCG_TID_RUNNING, &self->next_tid)) > + goto put_next; > + > + /* > + * If this is a worker doing sys_umcg_wait() switching to > + * another worker, userspace has the responsibility to update > + * server::next_tid. 
> + */ > + > + ret = 0; > + } > + > +put_next: > + put_task_struct(next); > + return ret; > +} > +
On Thu, Jan 27, 2022 at 01:19:43PM +0100, Peter Zijlstra wrote: > On Mon, Jan 24, 2022 at 10:46:17PM +0800, Tao Zhou wrote: > > Hi Peter, > > > > On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote: > > > > [...] > > > > > +/* pre-schedule() */ > > > +void umcg_wq_worker_sleeping(struct task_struct *tsk) > > > +{ > > > + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); > > > + int ret; > > > + > > > + if (!tsk->umcg_server) { > > > + /* > > > + * Already blocked before, the pages are unpinned. > > > + */ > > > + return; > > > + } > > > + > > > + /* Must not fault, mmap_sem might be held. */ > > > + pagefault_disable(); > > > + > > > + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_BLOCKED); > > > + if (ret == -EAGAIN) { > > > + /* > > > + * Consider: > > > + * > > > + * self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT; > > > + * ... > > > + * sys_umcg_wait(); > > > + * > > > + * and the '...' code doing a blocking syscall/fault. This > > > + * ensures that returns with UMCG_TASK_RUNNING, which will make > > > > /UMCG_TASK_RUNNING/UMCG_TASK_RUNNABLE/ > > So the issue is that: > > self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT; > > <#PF> > umcg_sys_enter() > umcg_pin_user_page() > schedule() > sched_submit_work() > umcg_wq_worker_sleeping() > umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_BLOCKED) // -EAGAIN > UMCG_DIE() > > Which is clearly not desirable. > > So this additinoal thing ensures that: > > umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_BLOCKED) // 0 > > umcg_sys_exit() > umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE); > umcg_enqueue_and_wake() > > umcg_notify_resume() > umcg_wait() > > // must be UMCG_TASK_RUNNING here > </#PF> > > So when the pagefault finally does return, it will have: > UMCG_TASK_RUNNING. > > Which will then make sys_umcg_wait() return -EAGAIN and around we go. Thank you, Peter. > > > + * sys_umcg_wait() return with -EAGAIN. > > > + */ > > > + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_BLOCKED); > > > + } > > > + if (ret) > > > + UMCG_DIE_PF("state"); > > > + > > > + if (umcg_wake_server(tsk)) > > > + UMCG_DIE_PF("wake"); > > > + > > > + pagefault_enable(); > > > + > > > + /* > > > + * We're going to sleep, make sure to unpin the pages, this ensures > > > + * the pins are temporary. Also see umcg_sys_exit(). > > > + */ > > > + umcg_unpin_pages(); > > > +}
On Thu, Jan 27, 2022 at 01:25:36PM +0100, Peter Zijlstra wrote: > On Mon, Jan 24, 2022 at 10:46:17PM +0800, Tao Zhou wrote: > > > > +/* Called from syscall exit path and exceptions that can schedule */ > > > +void umcg_sys_exit(struct pt_regs *regs) > > > +{ > > > + struct task_struct *tsk = current; > > > + long syscall = syscall_get_nr(tsk, regs); > > > + > > > + if (syscall == __NR_umcg_wait || > > > + syscall == __NR_umcg_ctl) > > > + return; > > > + > > > + if (tsk->umcg_server) { > > > + /* > > > + * Didn't block, we done. > > > + */ > > > + umcg_unpin_pages(); > > > + return; > > > + } > > > + > > > + umcg_unblock_and_wait(); > > > > umcg_unblock_and_wait() -> umcg_enqueue_and_wake() -> > > umcg_wake_server() -> umcg_wake_task(tsk->umcg_server, ...) > > > > tsk->umcg_server is NULL here and umcg_wake_task() use it to update > > state in umcg_update_state(NULL, ...), that means tsk->umcg_clock > > will happen something i do not know. > > I think umcg_unblock_and_wait() will repin, at which point we should > have tsk->umcg_server again. That's right, I miss that. > > There are two places to call umcg_unblock_and_wait(). One is in > > umcg_register() where the server is set. Another one is in > > umcg_sys_exit() where the server is not set. May use a bool to > > indicate if the server is set. > > I'm not sure what you're on about, but I absolutely hate redundant > state, that only leads to problems. But, it's my noise though.
--- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -248,6 +248,7 @@ config X86 select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_UMCG if X86_64 select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_VDSO select HOTPLUG_SMT if SMP --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,6 +371,9 @@ 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common umcg_ctl sys_umcg_ctl +451 common umcg_wait sys_umcg_wait +452 common umcg_kick sys_umcg_kick # # Due to a historical design error, certain syscalls are numbered differently --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,6 +83,7 @@ struct thread_info { #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_SSBD 5 /* Speculative store bypass disable */ +#define TIF_UMCG 6 /* UMCG return to user hook */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ @@ -107,6 +108,7 @@ struct thread_info { #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) +#define _TIF_UMCG (1 << TIF_UMCG) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) #define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) --- a/fs/exec.c +++ b/fs/exec.c @@ -1838,6 +1838,7 @@ static int bprm_execve(struct linux_binp current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); + umcg_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -23,6 +23,10 @@ # define _TIF_UPROBE (0) #endif +#ifndef _TIF_UMCG +# define _TIF_UMCG (0) +#endif + /* * SYSCALL_WORK flags handled in syscall_enter_from_user_mode() */ @@ -43,11 +47,13 @@ SYSCALL_WORK_SYSCALL_EMU | \ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ + SYSCALL_WORK_SYSCALL_UMCG | \ ARCH_SYSCALL_WORK_ENTER) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ + SYSCALL_WORK_SYSCALL_UMCG | \ SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) @@ -221,8 +227,11 @@ static inline void local_irq_disable_exi */ static inline void irqentry_irq_enable(struct pt_regs *regs) { - if (!regs_irqs_disabled(regs)) + if (!regs_irqs_disabled(regs)) { local_irq_enable(); + if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) + umcg_sys_enter(regs, -1); + } } /** @@ -232,8 +241,11 @@ static inline void irqentry_irq_enable(s */ static inline void irqentry_irq_disable(struct pt_regs *regs) { - if (!regs_irqs_disabled(regs)) + if (!regs_irqs_disabled(regs)) { + if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) + umcg_sys_exit(regs); local_irq_disable(); + } } /** --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -67,6 +67,7 @@ struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct umcg_task; /* * Task state bitmask. NOTE! 
These bits are also @@ -1294,6 +1295,19 @@ struct task_struct { unsigned long rseq_event_mask; #endif +#ifdef CONFIG_UMCG + /* setup by sys_umcg_ctrl() */ + clockid_t umcg_clock; + struct umcg_task __user *umcg_task; + + /* setup by umcg_pin_enter() */ + struct page *umcg_page; + + struct task_struct *umcg_server; + struct umcg_task __user *umcg_server_task; + struct page *umcg_server_page; +#endif + struct tlbflush_unmap_batch tlb_ubc; union { @@ -1687,6 +1701,13 @@ extern struct pid *cad_pid; #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ + +#ifdef CONFIG_UMCG +#define PF_UMCG_WORKER 0x01000000 /* UMCG worker */ +#else +#define PF_UMCG_WORKER 0x00000000 +#endif + #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ @@ -2294,6 +2315,67 @@ static inline void rseq_execve(struct ta { } +#endif + +#ifdef CONFIG_UMCG + +extern void umcg_sys_enter(struct pt_regs *regs, long syscall); +extern void umcg_sys_exit(struct pt_regs *regs); +extern void umcg_notify_resume(struct pt_regs *regs); +extern void umcg_worker_exit(void); +extern void umcg_clear_child(struct task_struct *tsk); + +/* Called by bprm_execve() in fs/exec.c. */ +static inline void umcg_execve(struct task_struct *tsk) +{ + if (tsk->umcg_task) + umcg_clear_child(tsk); +} + +/* Called by do_exit() in kernel/exit.c. */ +static inline void umcg_handle_exit(void) +{ + if (current->flags & PF_UMCG_WORKER) + umcg_worker_exit(); +} + +/* + * umcg_wq_worker_[sleeping|running] are called in core.c by + * sched_submit_work() and sched_update_worker(). 
+ */ +extern void umcg_wq_worker_sleeping(struct task_struct *tsk); +extern void umcg_wq_worker_running(struct task_struct *tsk); + +#else /* CONFIG_UMCG */ + +static inline void umcg_sys_enter(struct pt_regs *regs, long syscall) +{ +} + +static inline void umcg_sys_exit(struct pt_regs *regs) +{ +} + +static inline void umcg_notify_resume(struct pt_regs *regs) +{ +} + +static inline void umcg_clear_child(struct task_struct *tsk) +{ +} +static inline void umcg_execve(struct task_struct *tsk) +{ +} +static inline void umcg_handle_exit(void) +{ +} +static inline void umcg_wq_worker_sleeping(struct task_struct *tsk) +{ +} +static inline void umcg_wq_worker_running(struct task_struct *tsk) +{ +} + #endif #ifdef CONFIG_DEBUG_RSEQ --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -72,6 +72,7 @@ struct open_how; struct mount_attr; struct landlock_ruleset_attr; enum landlock_rule_type; +struct umcg_task; #include <linux/types.h> #include <linux/aio_abi.h> @@ -1057,6 +1058,9 @@ asmlinkage long sys_landlock_add_rule(in const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); asmlinkage long sys_memfd_secret(unsigned int flags); +asmlinkage long sys_umcg_ctl(u32 flags, struct umcg_task __user *self, clockid_t which_clock); +asmlinkage long sys_umcg_wait(u32 flags, u64 abs_timeout); +asmlinkage long sys_umcg_kick(u32 flags, pid_t tid); /* * Architecture-specific system calls --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -46,6 +46,7 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_AUDIT, SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP, + SYSCALL_WORK_BIT_SYSCALL_UMCG, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) @@ -55,6 +56,7 @@ enum syscall_work_bit { #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) +#define SYSCALL_WORK_SYSCALL_UMCG BIT(SYSCALL_WORK_BIT_SYSCALL_UMCG) #endif #include <asm/thread_info.h> --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -883,8 +883,15 @@ __SYSCALL(__NR_process_mrelease, sys_pro #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_umcg_ctl 450 +__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl) +#define __NR_umcg_wait 451 +__SYSCALL(__NR_umcg_wait, sys_umcg_wait) +#define __NR_umcg_kick 452 +__SYSCALL(__NR_umcg_kick, sys_umcg_kick) + #undef __NR_syscalls -#define __NR_syscalls 450 +#define __NR_syscalls 453 /* * 32 bit systems traditionally used different --- /dev/null +++ b/include/uapi/linux/umcg.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_UMCG_H +#define _UAPI_LINUX_UMCG_H + +#include <linux/types.h> + +/* + * UMCG: User Managed Concurrency Groups. + * + * Syscalls (see kernel/sched/umcg.c): + * sys_umcg_ctl() - register/unregister UMCG tasks; + * sys_umcg_wait() - wait/wake/context-switch. + * sys_umcg_kick() - prod a UMCG task + * + * struct umcg_task (below): controls the state of UMCG tasks. + */ + +/* + * UMCG task states, the first 8 bits of struct umcg_task.state. + * + * ,--------(TF_PREEMPT + notify_resume)-------. ,----------. 
+ * | v | | + * RUNNING -(schedule)-> BLOCKED -(sys_exit)-> RUNNABLE (signal + notify_resume) + * ^ ^ | | ^ | + * | `-----(schedule)-----' | | | + * `--------------(sys_umcg_wait)----------------' `----------' + * + */ +#define UMCG_TASK_NONE 0x0000U +#define UMCG_TASK_RUNNING 0x0001U +#define UMCG_TASK_RUNNABLE 0x0002U +#define UMCG_TASK_BLOCKED 0x0003U + +#define UMCG_TASK_MASK 0x00ffU + +/* + * UMCG_TF_PREEMPT: userspace indicates the worker should be preempted. + * + * Must only be set on UMCG_TASK_RUNNING; once set, any subsequent + * return-to-user (eg sys_umcg_kick()) will perform the equivalent of + * sys_umcg_wait() on it. That is, it will wake next_tid/server_tid, transfer + * to RUNNABLE and enqueue on the server's runnable list. + */ +#define UMCG_TF_PREEMPT 0x0100U +/* + * UMCG_TF_COND_WAIT: indicate the task *will* call sys_umcg_wait() + * + * Enables server loops like (vs umcg_sys_exit()): + * + * for(;;) { + * self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT; + * + * // smp_mb() implied by xchg() + * runnable_ptr = (void *)xchg(self->runnable_workers_ptr, NULL); + * while (runnable_ptr) { + * next = (void *)runnable_ptr->runnable_workers_ptr; + * umcg_server_add_runnable(self, runnable_ptr); + * runnable_ptr = next; + * } + * + * self->next_tid = umcg_server_pick_next(self); + * sys_umcg_wait(0, 0); + * } + * + * without a signal or interrupt in between setting umcg_task::state and + * sys_umcg_wait() resulting in an infinite wait in umcg_notify_resume(). + */ +#define UMCG_TF_COND_WAIT 0x0200U + +#define UMCG_TF_MASK 0xff00U + +#define UMCG_TASK_ALIGN 64 + +/** + * struct umcg_task - controls the state of UMCG tasks. + * + * The struct is aligned at 64 bytes to ensure that it fits into + * a single cache line. + */ +struct umcg_task { + /** + * @state: the current state of the UMCG task described by + * this struct. + * + * Readable/writable by both the kernel and the userspace. + * + * UMCG task state: + * bits 0 - 7: task state; + * bits 8 - 15: state flags; + * bits 16 - 31: for userspace use; + */ + __u32 state; /* r/w */ + +#define UMCG_TID_RUNNING 0x80000000U +#define UMCG_TID_MASK 0x3fffffffU + /** + * @next_tid: the TID of the UMCG task that should be context-switched + * into in sys_umcg_wait(). Can be zero. + * + * @server_tid: the TID of the UMCG server that hosts this task, + * when RUNNABLE this task will get added to it's + * runnable_workers_ptr list. + * + * Read-only for the kernel, read/write for the userspace. + */ + __u32 next_tid; /* r */ + __u32 server_tid; /* r */ + + __u32 __hole[1]; + + /* + * Timestamps for when last we became BLOCKED, RUNNABLE. + */ + __u64 blocked_ts; /* w */ + __u64 runnable_ts; /* w */ + + /** + * @runnable_workers_ptr: a single-linked list of runnable workers. + * + * Readable/writable by both the kernel and the userspace: the + * kernel adds items to the list, userspace removes them. + */ + __u64 runnable_workers_ptr; /* r/w */ + + __u64 __zero[3]; + +} __attribute__((packed, aligned(UMCG_TASK_ALIGN))); + +/** + * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl + * @UMCG_CTL_REGISTER: register the current task as a UMCG task + * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task + * @UMCG_CTL_WORKER: register the current task as a UMCG worker + */ +enum umcg_ctl_flag { + UMCG_CTL_REGISTER = 0x00001, + UMCG_CTL_UNREGISTER = 0x00002, + UMCG_CTL_WORKER = 0x10000, +}; + +#endif /* _UAPI_LINUX_UMCG_H */ --- a/init/Kconfig +++ b/init/Kconfig @@ -1686,6 +1686,21 @@ config MEMBARRIER If unsure, say Y. 
+config HAVE_UMCG + bool + +config UMCG + bool "Enable User Managed Concurrency Groups API" + depends on 64BIT + depends on GENERIC_ENTRY + depends on HAVE_UMCG + default n + help + Enable User Managed Concurrency Groups API, which form the basis + for an in-process M:N userspace scheduling framework. + At the moment this is an experimental/RFC feature that is not + guaranteed to be backward-compatible. + config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT default y --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -6,6 +6,7 @@ #include <linux/livepatch.h> #include <linux/audit.h> #include <linux/tick.h> +#include <linux/sched.h> #include "common.h" @@ -76,6 +77,9 @@ static long syscall_trace_enter(struct p if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall); + if (work & SYSCALL_WORK_SYSCALL_UMCG) + umcg_sys_enter(regs, syscall); + syscall_enter_audit(regs, syscall); return ret ? : syscall; @@ -155,8 +159,7 @@ static unsigned long exit_to_user_mode_l * Before returning to user space ensure that all pending work * items have been completed. */ - while (ti_work & EXIT_TO_USER_MODE_WORK) { - + do { local_irq_enable_exit_to_user(ti_work); if (ti_work & _TIF_NEED_RESCHED) @@ -168,6 +171,10 @@ static unsigned long exit_to_user_mode_l if (ti_work & _TIF_PATCH_PENDING) klp_update_patch_state(current); + /* must be before handle_signal_work(); terminates on sigpending */ + if (ti_work & _TIF_UMCG) + umcg_notify_resume(regs); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) handle_signal_work(regs, ti_work); @@ -188,7 +195,7 @@ static unsigned long exit_to_user_mode_l tick_nohz_user_enter_prepare(); ti_work = read_thread_flags(); - } + } while (ti_work & EXIT_TO_USER_MODE_WORK); /* Return the latest work state for arch_exit_to_user_mode() */ return ti_work; @@ -203,7 +210,7 @@ static void exit_to_user_mode_prepare(st /* Flush pending rcuog wakeup before the last need_resched() check */ tick_nohz_user_enter_prepare(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + if (unlikely(ti_work & (EXIT_TO_USER_MODE_WORK | _TIF_UMCG))) ti_work = exit_to_user_mode_loop(regs, ti_work); arch_exit_to_user_mode_prepare(regs, ti_work); @@ -253,6 +260,9 @@ static void syscall_exit_work(struct pt_ step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); + + if (work & SYSCALL_WORK_SYSCALL_UMCG) + umcg_sys_exit(regs); } /* --- a/kernel/exit.c +++ b/kernel/exit.c @@ -749,6 +749,10 @@ void __noreturn do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + /* Turn off UMCG sched hooks. 
*/ + if (unlikely(tsk->flags & PF_UMCG_WORKER)) + tsk->flags &= ~PF_UMCG_WORKER; + /* * If do_exit is called because this processes oopsed, it's possible * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before @@ -786,6 +790,7 @@ void __noreturn do_exit(long code) io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ + umcg_handle_exit(); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -41,3 +41,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o obj-$(CONFIG_SCHED_CORE) += core_sched.o +obj-$(CONFIG_UMCG) += umcg.o --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4272,6 +4272,7 @@ static void __sched_fork(unsigned long c p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; #endif + umcg_clear_child(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -6330,9 +6331,11 @@ static inline void sched_submit_work(str * If a worker goes to sleep, notify and ask workqueue whether it * wants to wake up a task to maintain concurrency. */ - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) { if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); + else if (task_flags & PF_UMCG_WORKER) + umcg_wq_worker_sleeping(tsk); else io_wq_worker_sleeping(tsk); } @@ -6350,9 +6353,11 @@ static inline void sched_submit_work(str static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) { if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); + else if (tsk->flags & PF_UMCG_WORKER) + umcg_wq_worker_running(tsk); else io_wq_worker_running(tsk); } --- /dev/null +++ b/kernel/sched/umcg.c @@ -0,0 +1,954 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * User Managed Concurrency Groups (UMCG). + * + */ + +#include <linux/syscalls.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/umcg.h> +#include <linux/mm.h> + +#include <asm/syscall.h> +#include <asm/ptrace.h> + +#include "sched.h" + +static struct task_struct *umcg_get_task(u32 tid) +{ + struct task_struct *tsk = NULL; + + if (tid) { + rcu_read_lock(); + tsk = find_task_by_vpid(tid & UMCG_TID_MASK); + if (tsk && current->mm == tsk->mm && tsk->umcg_task) + get_task_struct(tsk); + else + tsk = NULL; + rcu_read_unlock(); + } + + return tsk; +} + +/* + * Pinning a page inhibits rmap based unmap for Anon pages. Doing a load + * through the user mapping ensures the user mapping exists. + */ +#define umcg_pin_and_load(_self, _pagep, _member) \ +({ \ + __label__ __out; \ + int __ret = -EFAULT; \ + \ + if (pin_user_pages_fast((unsigned long)(_self), 1, 0, &(_pagep)) != 1) \ + goto __out; \ + \ + if (!PageAnon(_pagep) || \ + get_user(_member, &(_self)->_member)) { \ + unpin_user_page(_pagep); \ + goto __out; \ + } \ + __ret = 0; \ +__out: __ret; \ +}) + +/** + * umcg_pin_pages: pin pages containing struct umcg_task of + * this task and its server (possibly this task again). 
+ */ +static int umcg_pin_pages(void) +{ + struct task_struct *server = NULL, *tsk = current; + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); + int server_tid; + int ret; + + /* must not have stale state */ + if (WARN_ON_ONCE(tsk->umcg_page || + tsk->umcg_server_page || + tsk->umcg_server_task || + tsk->umcg_server)) + return -EBUSY; + + ret = umcg_pin_and_load(self, tsk->umcg_page, server_tid); + if (ret) + goto clear_self; + + ret = -ESRCH; + server = umcg_get_task(server_tid); + if (!server) + goto unpin_self; + + /* must cache due to possible concurrent change */ + tsk->umcg_server_task = READ_ONCE(server->umcg_task); + ret = umcg_pin_and_load(tsk->umcg_server_task, tsk->umcg_server_page, server_tid); + if (ret) + goto clear_server; + + tsk->umcg_server = server; + + return 0; + +clear_server: + tsk->umcg_server_task = NULL; + tsk->umcg_server_page = NULL; + +unpin_self: + unpin_user_page(tsk->umcg_page); +clear_self: + tsk->umcg_page = NULL; + + return ret; +} + +static void umcg_unpin_pages(void) +{ + struct task_struct *tsk = current; + + if (tsk->umcg_server) { + unpin_user_page(tsk->umcg_page); + tsk->umcg_page = NULL; + + unpin_user_page(tsk->umcg_server_page); + tsk->umcg_server_page = NULL; + tsk->umcg_server_task = NULL; + + put_task_struct(tsk->umcg_server); + tsk->umcg_server = NULL; + } +} + +static void umcg_clear_task(struct task_struct *tsk) +{ + /* + * This is either called for the current task, or for a newly forked + * task that is not yet running, so we don't need strict atomicity + * below. + */ + if (tsk->umcg_task) { + WRITE_ONCE(tsk->umcg_task, NULL); + tsk->umcg_page = NULL; + + tsk->umcg_server = NULL; + tsk->umcg_server_page = NULL; + tsk->umcg_server_task = NULL; + + tsk->flags &= ~PF_UMCG_WORKER; + clear_task_syscall_work(tsk, SYSCALL_UMCG); + clear_tsk_thread_flag(tsk, TIF_UMCG); + } +} + +/* Called for a forked or execve-ed child. */ +void umcg_clear_child(struct task_struct *tsk) +{ + umcg_clear_task(tsk); +} + +/* Called both by normally (unregister) and abnormally exiting workers. */ +void umcg_worker_exit(void) +{ + umcg_unpin_pages(); + umcg_clear_task(current); +} + +/* + * Do a state transition: @from -> @to. + * + * Will clear UMCG_TF_PREEMPT, UMCG_TF_COND_WAIT. + * + * When @to == {BLOCKED,RUNNABLE}, update timestamps. 
+ * + * Returns: + * 0: success + * -EAGAIN: when self->state != @from + * -EFAULT + */ +static int umcg_update_state(struct task_struct *tsk, + struct umcg_task __user *self, + u32 from, u32 to) +{ + u32 old, new; + u64 now; + + if (to >= UMCG_TASK_RUNNABLE) { + switch (tsk->umcg_clock) { + case CLOCK_REALTIME: now = ktime_get_real_ns(); break; + case CLOCK_MONOTONIC: now = ktime_get_ns(); break; + case CLOCK_BOOTTIME: now = ktime_get_boottime_ns(); break; + case CLOCK_TAI: now = ktime_get_clocktai_ns(); break; + } + } + + if (!user_access_begin(self, sizeof(*self))) + return -EFAULT; + + unsafe_get_user(old, &self->state, Efault); + do { + if ((old & UMCG_TASK_MASK) != from) + goto fail; + + new = old & ~(UMCG_TASK_MASK | + UMCG_TF_PREEMPT | UMCG_TF_COND_WAIT); + new |= to & UMCG_TASK_MASK; + + } while (!unsafe_try_cmpxchg_user(&self->state, &old, new, Efault)); + + if (to == UMCG_TASK_BLOCKED) + unsafe_put_user(now, &self->blocked_ts, Efault); + if (to == UMCG_TASK_RUNNABLE) + unsafe_put_user(now, &self->runnable_ts, Efault); + + user_access_end(); + return 0; + +fail: + user_access_end(); + return -EAGAIN; + +Efault: + user_access_end(); + return -EFAULT; +} + +#define __UMCG_DIE(stmt, reason) do { \ + stmt; \ + pr_warn_ratelimited("%s: killing task %s/%d because: " reason "\n",\ + __func__, current->comm, current->pid); \ + force_sig(SIGKILL); \ + return; \ +} while (0) + +#define UMCG_DIE(reason) __UMCG_DIE(,reason) +#define UMCG_DIE_PF(reason) __UMCG_DIE(pagefault_enable(), reason) +#define UMCG_DIE_UNPIN(reason) __UMCG_DIE(umcg_unpin_pages(), reason) + +/* Called from syscall enter path and exceptions that can schedule */ +void umcg_sys_enter(struct pt_regs *regs, long syscall) +{ + /* avoid recursion vs our own syscalls */ + if (syscall == __NR_umcg_wait || + syscall == __NR_umcg_ctl) + return; + + /* avoid recursion vs schedule() */ + current->flags &= ~PF_UMCG_WORKER; + + /* + * Pin all the state on sys_enter() such that we can rely on it + * from dodgy contexts. It is either unpinned from pre-schedule() + * or sys_exit(), whichever comes first, thereby ensuring the pin + * is temporary. + */ + if (umcg_pin_pages()) + UMCG_DIE("pin"); + + current->flags |= PF_UMCG_WORKER; +} + +static int umcg_wake_task(struct task_struct *tsk, struct umcg_task __user *self) +{ + int ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING); + if (ret) + return ret; + + try_to_wake_up(tsk, TASK_NORMAL, WF_CURRENT_CPU); + return 0; +} + +static int umcg_wake_server(struct task_struct *tsk) +{ + int ret = umcg_wake_task(tsk->umcg_server, tsk->umcg_server_task); + if (ret == -EAGAIN) { + /* + * Server could have timed-out or already be running + * due to a runnable enqueue. See umcg_sys_exit(). + */ + ret = 0; + } + return ret; +} + +/* pre-schedule() */ +void umcg_wq_worker_sleeping(struct task_struct *tsk) +{ + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); + int ret; + + if (!tsk->umcg_server) { + /* + * Already blocked before, the pages are unpinned. + */ + return; + } + + /* Must not fault, mmap_sem might be held. */ + pagefault_disable(); + + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_BLOCKED); + if (ret == -EAGAIN) { + /* + * Consider: + * + * self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT; + * ... + * sys_umcg_wait(); + * + * and the '...' code doing a blocking syscall/fault. This + * ensures that returns with UMCG_TASK_RUNNING, which will make + * sys_umcg_wait() return with -EAGAIN. 
+ */ + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_BLOCKED); + } + if (ret) + UMCG_DIE_PF("state"); + + if (umcg_wake_server(tsk)) + UMCG_DIE_PF("wake"); + + pagefault_enable(); + + /* + * We're going to sleep, make sure to unpin the pages, this ensures + * the pins are temporary. Also see umcg_sys_exit(). + */ + umcg_unpin_pages(); +} + +/* post-schedule() */ +void umcg_wq_worker_running(struct task_struct *tsk) +{ + /* nothing here, see umcg_sys_exit() */ +} + +/* + * Enqueue @tsk on it's server's runnable list + * + * Must be called in umcg_pin_pages() context, relies on tsk->umcg_server. + * + * cmpxchg based single linked list add such that list integrity is never + * violated. Userspace *MUST* remove it from the list before changing ->state. + * As such, we must change state to RUNNABLE before enqueue. + * + * Returns: + * 0: success + * -EFAULT + */ +static int umcg_enqueue_runnable(struct task_struct *tsk) +{ + struct umcg_task __user *server = tsk->umcg_server_task; + struct umcg_task __user *self = tsk->umcg_task; + u64 first_ptr, *head = &server->runnable_workers_ptr; + u64 self_ptr = (unsigned long)self; + + /* + * umcg_pin_pages() did access_ok() on both pointers, use self here + * only because __user_access_begin() isn't available in generic code. + */ + if (!user_access_begin(self, sizeof(*self))) + return -EFAULT; + + unsafe_get_user(first_ptr, head, Efault); + do { + unsafe_put_user(first_ptr, &self->runnable_workers_ptr, Efault); + } while (!unsafe_try_cmpxchg_user(head, &first_ptr, self_ptr, Efault)); + + user_access_end(); + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +static int umcg_enqueue_and_wake(struct task_struct *tsk) +{ + int ret; + + ret = umcg_enqueue_runnable(tsk); + if (!ret) + ret = umcg_wake_server(tsk); + + return ret; +} + +/* + * umcg_wait: Wait for ->state to become RUNNING + * + * Returns: + * 0 - success + * -EINTR - pending signal + * -EINVAL - ::state is not {RUNNABLE,RUNNING} + * -ETIMEDOUT + * -EFAULT + */ +static int umcg_wait(u64 timo) +{ + struct task_struct *tsk = current; + struct umcg_task __user *self = tsk->umcg_task; + struct page *page = NULL; + u32 state; + int ret; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + ret = -EINTR; + if (signal_pending(current)) + break; + + /* + * Faults can block and scribble our wait state. + */ + pagefault_disable(); + if (get_user(state, &self->state)) { + pagefault_enable(); + __set_current_state(TASK_RUNNING); + + ret = -EFAULT; + if (page) { + unpin_user_page(page); + page = NULL; + break; + } + + ret = umcg_pin_and_load(self, page, state); + if (ret) { + page = NULL; + break; + } + + continue; + } + + if (page) { + unpin_user_page(page); + page = NULL; + } + pagefault_enable(); + + state &= UMCG_TASK_MASK; + if (state != UMCG_TASK_RUNNABLE) { + ret = 0; + if (state == UMCG_TASK_RUNNING) + break; + + ret = -EINVAL; + break; + } + + if (!schedule_hrtimeout_range_clock(timo ? &timo : NULL, + tsk->timer_slack_ns, + HRTIMER_MODE_ABS, + tsk->umcg_clock)) { + ret = -ETIMEDOUT; + break; + } + } + __set_current_state(TASK_RUNNING); + + return ret; +} + +/* + * Blocked case for umcg_sys_exit(), shared with sys_umcg_ctl(). 
+ */ +static void umcg_unblock_and_wait(void) +{ + struct task_struct *tsk = current; + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); + + /* avoid recursion vs schedule() */ + tsk->flags &= ~PF_UMCG_WORKER; + + if (umcg_pin_pages()) + UMCG_DIE("pin"); + + if (umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE)) + UMCG_DIE_UNPIN("state"); + + if (umcg_enqueue_and_wake(tsk)) + UMCG_DIE_UNPIN("enqueue-wake"); + + umcg_unpin_pages(); + + switch (umcg_wait(0)) { + case 0: + case -EINTR: + /* notify_resume will continue the wait after the signal */ + break; + + default: + UMCG_DIE("wait"); + } + + tsk->flags |= PF_UMCG_WORKER; +} + +/* Called from syscall exit path and exceptions that can schedule */ +void umcg_sys_exit(struct pt_regs *regs) +{ + struct task_struct *tsk = current; + long syscall = syscall_get_nr(tsk, regs); + + if (syscall == __NR_umcg_wait || + syscall == __NR_umcg_ctl) + return; + + if (tsk->umcg_server) { + /* + * Didn't block, we done. + */ + umcg_unpin_pages(); + return; + } + + umcg_unblock_and_wait(); +} + +/* return-to-user path */ +void umcg_notify_resume(struct pt_regs *regs) +{ + struct task_struct *tsk = current; + struct umcg_task __user *self = tsk->umcg_task; + bool worker = tsk->flags & PF_UMCG_WORKER; + u32 state; + + /* avoid recursion vs schedule() */ + if (worker) + current->flags &= ~PF_UMCG_WORKER; + + if (get_user(state, &self->state)) + UMCG_DIE("get-state"); + + state &= UMCG_TASK_MASK | UMCG_TF_MASK; + if (state == UMCG_TASK_RUNNING) + goto done; + + /* + * See comment at UMCG_TF_COND_WAIT; TL;DR: user *will* call + * sys_umcg_wait() and signals/interrupts shouldn't block + * return-to-user. + */ + if (state == (UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT)) + goto done; + + if (state & UMCG_TF_PREEMPT) { + if (umcg_pin_pages()) + UMCG_DIE("pin"); + + if (umcg_update_state(tsk, self, + UMCG_TASK_RUNNING, + UMCG_TASK_RUNNABLE)) + UMCG_DIE_UNPIN("state"); + + if (umcg_enqueue_and_wake(tsk)) + UMCG_DIE_UNPIN("enqueue-wake"); + + umcg_unpin_pages(); + } + + switch (umcg_wait(0)) { + case 0: + case -EINTR: + /* we will resume the wait after the signal */ + break; + + default: + UMCG_DIE("wait"); + } + +done: + if (worker) + current->flags |= PF_UMCG_WORKER; +} + +/** + * sys_umcg_kick: makes a UMCG task cycle through umcg_notify_resume() + * + * Returns: + * 0 - Ok; + * -ESRCH - not a related UMCG task + * -EINVAL - another error happened (unknown flags, etc..) + */ +SYSCALL_DEFINE2(umcg_kick, u32, flags, pid_t, tid) +{ + struct task_struct *task = umcg_get_task(tid); + if (!task) + return -ESRCH; + + if (flags) + return -EINVAL; + +#ifdef CONFIG_SMP + smp_send_reschedule(task_cpu(task)); +#endif + + return 0; +} + +/* + * Handles ::next_tid as per sys_umcg_wait(). 
+ * + * ::next_tid - return + * ----------------------------- + * 0 - 0 (success) + * + * tid - -ESRCH (no such task, or not of this UMCG) + * - -EAGAIN (next::state != RUNNABLE) + * - 0 (success, ::next_tid |= RUNNING) + * + * tid|RUNNING - -EAGAIN (next::state != RUNNING) + * - 0 (success) + * + * Returns: + * 0: success + * -EFAULT + * -ESRCH + * -EAGAIN + */ +static int umcg_wake_next(struct task_struct *tsk, struct umcg_task __user *self) +{ + struct umcg_task __user *next_task; + struct task_struct *next; + u32 next_tid, state; + int ret; + + if (get_user(next_tid, &self->next_tid)) + return -EFAULT; + + if (!next_tid) + return 0; + + next = umcg_get_task(next_tid); + if (!next) + return -ESRCH; + + next_task = READ_ONCE(next->umcg_task); + + if (next_tid & UMCG_TID_RUNNING) { + ret = -EFAULT; + if (get_user(state, &next_task->state)) + goto put_next; + + ret = 0; + if ((state & UMCG_TASK_MASK) != UMCG_TASK_RUNNING) + ret = -EAGAIN; + + } else { + ret = umcg_wake_task(next, next_task); + if (ret) + goto put_next; + + ret = -EFAULT; + if (put_user(next_tid | UMCG_TID_RUNNING, &self->next_tid)) + goto put_next; + + /* + * If this is a worker doing sys_umcg_wait() switching to + * another worker, userspace has the responsibility to update + * server::next_tid. + */ + + ret = 0; + } + +put_next: + put_task_struct(next); + return ret; +} + +/** + * sys_umcg_wait: transfer running context + * + * Called like: + * + * self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT; + * ... + * sys_umcg_wait(0, time); + * + * The syscall will clear TF_COND_WAIT and wait until state becomes RUNNING. + * The code '...' must not contain syscalls + * + * If self->next_tid is set and indicates a valid UMCG task with RUNNABLE state + * that task will be made RUNNING and woken -- transfering the running context + * to that task. In this case self->next_tid is modified with TID_RUNNING to + * indicate self->next_tid is consumed. + * + * If self->next has TID_RUNNING set, it is validated the related task has + * RUNNING state, otherwise -EAGAIN is returned to indicate a new task needs to + * be selected. + * + * If the caller is a worker: + * + * - it will be enqueued on the associated server's runnable_workers_ptr list + * and the server will be woken. + * + * - when ::next_tid is used to affect a worker-to-worker transfer, it is up + * to userspace to keep server::next_tid consistent. + * + * The corrolary is that a server setting ::next_tid to 0 will idle. + * + * Returns: + * 0 - OK; + * -ETIMEDOUT - the timeout expired; + * -ERANGE - the timeout is out of range (worker); + * -EAGAIN - ::state wasn't RUNNABLE, concurrent wakeup; + * -EFAULT - failed accessing struct umcg_task __user of the current + * task, the server or next; + * -ESRCH - the task to wake not found or not a UMCG task; + * -EINVAL - another error happened (e.g. the current task is not a + * UMCG task, etc.) + */ +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo) +{ + struct task_struct *tsk = current; + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); + bool worker = tsk->flags & PF_UMCG_WORKER; + int ret; + + if (!self || flags) + return -EINVAL; + + if (worker) { + tsk->flags &= ~PF_UMCG_WORKER; + if (timo) + return -ERANGE; + } + + /* see umcg_sys_{enter,exit}() syscall exceptions */ + ret = umcg_pin_pages(); + if (ret) + goto unblock; + + /* + * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE. 
+ */ + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE); + if (ret) + goto unpin; + + ret = umcg_wake_next(tsk, self); + if (ret) + goto unpin; + + if (worker) { + /* + * If this fails it is possible ::next_tid is already running + * while this task is not going to block. This violates our + * constraints. + * + * That said, pretty much the only way to make this fail is by + * force munmap()'ing things. In which case one is most welcome + * to the pieces. + */ + ret = umcg_enqueue_and_wake(tsk); + if (ret) + goto unpin; + } + + umcg_unpin_pages(); + + ret = umcg_wait(timo); + switch (ret) { + case 0: /* all done */ + case -EINTR: /* umcg_notify_resume() will continue the wait */ + ret = 0; + break; + + default: + goto unblock; + } +out: + if (worker) + tsk->flags |= PF_UMCG_WORKER; + return ret; + +unpin: + umcg_unpin_pages(); +unblock: + umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING); + goto out; +} + +static int umcg_register(struct umcg_task __user *self, u32 flags, clockid_t which_clock) +{ + struct task_struct *server; + struct umcg_task ut; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + case CLOCK_TAI: + current->umcg_clock = which_clock; + break; + + default: + return -EINVAL; + } + + if (current->umcg_task || !self) + return -EINVAL; + + if (copy_from_user(&ut, self, sizeof(ut))) + return -EFAULT; + + if (ut.next_tid || ut.__hole[0] || ut.__zero[0] || ut.__zero[1] || ut.__zero[2]) + return -EINVAL; + + rcu_read_lock(); + server = find_task_by_vpid(ut.server_tid); + if (server && server->mm == current->mm) { + if (flags == UMCG_CTL_WORKER) { + if (!server->umcg_task || + (server->flags & PF_UMCG_WORKER)) + server = NULL; + } else { + if (server != current) + server = NULL; + } + } else { + server = NULL; + } + rcu_read_unlock(); + + if (!server) + return -ESRCH; + + if (flags == UMCG_CTL_WORKER) { + if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_BLOCKED) + return -EINVAL; + + WRITE_ONCE(current->umcg_task, self); + current->flags |= PF_UMCG_WORKER; /* hook schedule() */ + set_syscall_work(SYSCALL_UMCG); /* hook syscall */ + set_thread_flag(TIF_UMCG); /* hook return-to-user */ + + umcg_unblock_and_wait(); + + } else { + if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_RUNNING) + return -EINVAL; + + WRITE_ONCE(current->umcg_task, self); + set_thread_flag(TIF_UMCG); /* hook return-to-user */ + + /* umcg_notify_resume() would block if not RUNNING */ + } + + return 0; +} + +static int umcg_unregister(struct umcg_task __user *self, u32 flags) +{ + bool worker = current->flags & PF_UMCG_WORKER; + int ret; + + if (!self || self != current->umcg_task) + return -EINVAL; + + if (!worker != !(flags & UMCG_CTL_WORKER)) + return -EINVAL; + + current->flags &= ~PF_UMCG_WORKER; + + ret = umcg_pin_pages(); + if (ret) { + if (worker) + current->flags |= PF_UMCG_WORKER; + return ret; + } + + ret = umcg_update_state(current, self, UMCG_TASK_RUNNING, UMCG_TASK_NONE); + if (ret) { + if (worker) + current->flags |= PF_UMCG_WORKER; + return ret; + } + + if (worker) + umcg_wake_server(current); + + umcg_unpin_pages(); + umcg_clear_task(current); + return 0; +} + +#define UMCG_CTL_CMD 0xff + +/** + * sys_umcg_ctl: (un)register the current task as a UMCG task. + * @flags: ORed values from enum umcg_ctl_flag; see below; + * @self: a pointer to struct umcg_task that describes this + * task and governs the behavior of sys_umcg_wait. 
+ * @which_clock: clockid to use for timestamps and timeouts + * + * @flags & UMCG_CTL_REGISTER: register a UMCG task: + * + * UMCG workers: + * - @flags & UMCG_CTL_WORKER + * - self->state must be UMCG_TASK_BLOCKED + * + * UMCG servers: + * - !(@flags & UMCG_CTL_WORKER) + * - self->state must be UMCG_TASK_RUNNING + * + * All tasks: + * - self->server_tid must be a valid server + * - self->next_tid must be zero + * + * If the conditions above are met, sys_umcg_ctl() immediately returns + * if the registered task is a server. If the registered task is a + * worker it will be added to it's server's runnable_workers_ptr list + * and the server will be woken. + * + * @flags & UMCG_CTL_UNREGISTER: unregister a UMCG task. + * + * UMCG workers: + * - @flags & UMCG_CTL_WORKER + * + * UMCG servers: + * - !(@flags & UMCG_CTL_WORKER) + * + * All tasks: + * - self must match with UMCG_CTL_REGISTER + * - self->state must be UMCG_TASK_RUNNING + * - self->server_tid must be a valid server + * + * If the conditions above are met, sys_umcg_ctl() will change state to + * UMCG_TASK_NONE, and for workers, wake either next or server. + * + * Return: + * 0 - success + * -EFAULT - failed to read @self + * -EINVAL - some other error occurred + * -ESRCH - no such server_tid + */ +SYSCALL_DEFINE3(umcg_ctl, u32, flags, struct umcg_task __user *, self, clockid_t, which_clock) +{ + int cmd = flags & UMCG_CTL_CMD; + + if ((unsigned long)self % UMCG_TASK_ALIGN) + return -EINVAL; + + flags &= ~UMCG_CTL_CMD; + + if (flags & ~(UMCG_CTL_WORKER)) + return -EINVAL; + + switch (cmd) { + case UMCG_CTL_REGISTER: + return umcg_register(self, flags, which_clock); + + case UMCG_CTL_UNREGISTER: + return umcg_unregister(self, flags); + + default: + break; + } + + return -EINVAL; +} --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -273,6 +273,11 @@ COND_SYSCALL(landlock_create_ruleset); COND_SYSCALL(landlock_add_rule); COND_SYSCALL(landlock_restrict_self); +/* kernel/sched/umcg.c */ +COND_SYSCALL(umcg_ctl); +COND_SYSCALL(umcg_wait); +COND_SYSCALL(umcg_kick); + /* arch/example/kernel/sys_example.c */ /* mm/fadvise.c */
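To make the ABI above a bit more concrete, here is a minimal userspace sketch of the server side, modelled on the UMCG_TF_COND_WAIT comment in the uapi header. It is not part of the patch: the umcg_ctl()/umcg_wait() wrappers, the pick_next_tid() stub and the empty drain loop are illustrative stand-ins for a real scheduling policy, and it assumes the <linux/umcg.h> header and the x86-64 syscall numbers added by this series are installed.

#define _GNU_SOURCE
#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/umcg.h>			/* uapi header added by this patch */

#ifndef __NR_umcg_ctl			/* x86-64 numbers from this series */
#define __NR_umcg_ctl	450
#define __NR_umcg_wait	451
#endif

/* Alignment comes from the uapi struct; a static object is zero-initialized. */
static struct umcg_task server_task;

static long umcg_ctl(uint32_t flags, struct umcg_task *self, clockid_t clk)
{
	return syscall(__NR_umcg_ctl, flags, self, clk);
}

static long umcg_wait(uint32_t flags, uint64_t timo)
{
	return syscall(__NR_umcg_wait, flags, timo);
}

/* Illustrative stub; a real scheduler keeps a run queue and picks from it. */
static uint32_t pick_next_tid(void)
{
	return 0;			/* 0: nothing to run, just idle */
}

int main(void)
{
	/* A server registers itself RUNNING, with server_tid == its own tid. */
	server_task.state = UMCG_TASK_RUNNING;
	server_task.server_tid = (uint32_t)syscall(SYS_gettid);

	if (umcg_ctl(UMCG_CTL_REGISTER, &server_task, CLOCK_MONOTONIC))
		return 1;

	for (;;) {
		/* Announce the upcoming wait; see the UMCG_TF_COND_WAIT comment. */
		server_task.state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT;

		/* smp_mb() implied by the exchange; detach the runnable list. */
		uint64_t ptr = __atomic_exchange_n(&server_task.runnable_workers_ptr,
						   0, __ATOMIC_SEQ_CST);
		while (ptr) {
			struct umcg_task *w = (struct umcg_task *)(uintptr_t)ptr;

			/* A real scheduler would move @w onto its run queue here. */
			ptr = w->runnable_workers_ptr;
		}

		server_task.next_tid = pick_next_tid();

		/* Sleep until a worker needs us, or context-switch to ::next_tid. */
		umcg_wait(0, 0);
	}
}

A real server would park each drained worker on its own run queue and later feed one of them back through ::next_tid instead of always idling.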
User Managed Concurrency Groups is an M:N threading toolkit that allows constructing user space schedulers designed to efficiently manage heterogeneous in-process workloads while maintaining high CPU utilization (95%+).

XXX moar changelog explaining how this is moar awesome than traditional user-space threading.

The big thing that's still missing is the SMP wake-to-remote-idle.

Originally-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211214205358.701701555@infradead.org
---
 arch/x86/Kconfig                       |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    3 
 arch/x86/include/asm/thread_info.h     |    2 
 fs/exec.c                              |    1 
 include/linux/entry-common.h           |   16 
 include/linux/sched.h                  |   82 ++
 include/linux/syscalls.h               |    4 
 include/linux/thread_info.h            |    2 
 include/uapi/asm-generic/unistd.h      |    9 
 include/uapi/linux/umcg.h              |  141 ++++
 init/Kconfig                           |   15 
 kernel/entry/common.c                  |   18 
 kernel/exit.c                          |    5 
 kernel/sched/Makefile                  |    1 
 kernel/sched/core.c                    |    9 
 kernel/sched/umcg.c                    |  954 +++++++++++++++++++++++++++++++++
 kernel/sys_ni.c                        |    5 
 17 files changed, 1259 insertions(+), 9 deletions(-)
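For completeness, the matching worker side is even smaller. Again only a sketch, assuming <linux/umcg.h> and the syscall number from this series, and that @server_tid names a thread that has already registered itself as a server as in the sketch above; the caller is assumed to provide a zeroed, 64-byte aligned struct umcg_task that stays alive for the lifetime of the worker.

#define _GNU_SOURCE
#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/umcg.h>			/* uapi header added by this patch */

#ifndef __NR_umcg_ctl
#define __NR_umcg_ctl	450		/* x86-64 number from this series */
#endif

/*
 * Register the calling thread as a UMCG worker of @server_tid.  Per the
 * sys_umcg_ctl() documentation, the initial state must be UMCG_TASK_BLOCKED;
 * the call enqueues the worker on the server's runnable_workers_ptr list and
 * only returns to userspace once a server has made it RUNNING.
 *
 * @self: zero-initialized, UMCG_TASK_ALIGN-aligned, one per worker thread.
 */
static long umcg_worker_register(struct umcg_task *self, uint32_t server_tid)
{
	self->state = UMCG_TASK_BLOCKED;
	self->server_tid = server_tid;

	return syscall(__NR_umcg_ctl, UMCG_CTL_REGISTER | UMCG_CTL_WORKER,
		       self, CLOCK_MONOTONIC);
}

Unregistering mirrors this: calling sys_umcg_ctl() with UMCG_CTL_UNREGISTER (plus UMCG_CTL_WORKER for a worker) while the task is RUNNING moves it to UMCG_TASK_NONE and, for workers, wakes the server.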