Message ID: 20211214204445.665580974@infradead.org (mailing list archive)
Series: sched: User Managed Concurrency Groups
On Tue, Dec 14, 2021 at 09:44:45PM +0100, Peter Zijlstra wrote:

> I'll post my test-hack as a reply, but basically it does co-operative and
> preemptive UP-like user scheduling.

It's pretty rough, but seems to work. Defaults to co-operative and switches
to preemptive when run with an (any!) argument.

---
// gcc -Itools/include/ -o umcg umcg.c -lpthread

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <time.h>
#include <errno.h>

#ifndef __NR_umcg_ctl
#define __NR_umcg_ctl	450
#define __NR_umcg_wait	451
#define __NR_umcg_kick	452
#endif

#include <linux/list.h>
#include "include/uapi/linux/umcg.h"

/* syscall wrappers */

static inline int sys_umcg_ctl(u32 flags, struct umcg_task *self, clockid_t which_clock)
{
	return syscall(__NR_umcg_ctl, flags, self, which_clock);
}

static inline int sys_umcg_wait(u32 flags, u64 timo)
{
	return syscall(__NR_umcg_wait, flags, timo);
}

static inline int sys_umcg_kick(u32 flags, pid_t tid)
{
	return syscall(__NR_umcg_kick, flags, tid);
}

/* the 'foo' scheduler */

struct foo_task {
	struct umcg_task	task;
	struct list_head	node;
	pid_t			tid;
};

struct foo_server {
	struct umcg_task	task;
	struct list_head	node;
	pid_t			tid;
	struct foo_task		*cur;
};

void foo_add(struct foo_server *server, struct umcg_task *t)
{
	struct foo_task *foo = container_of(t, struct foo_task, task);

	t->runnable_workers_ptr = 0ULL;
	list_add_tail(&foo->node, &server->node);
}

struct foo_task *foo_pick_next(struct foo_server *server)
{
	struct foo_task *first = NULL;

	if (list_empty(&server->node))
		return first;

	first = list_first_entry(&server->node, struct foo_task, node);
	list_del(&first->node);

	return first;
}

#define NSEC_PER_SEC	1000000000ULL

u64 foo_time(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

void foo_yield(struct umcg_task *self)
{
	self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT;
	sys_umcg_wait(0, 0);
}

#define TICK_NSEC	NSEC_PER_SEC

volatile bool foo_preemptible = false;

/* our workers */

/* always running worker */
void *worker_fn0(void *arg)
{
	struct foo_server *server = arg;
	struct foo_task task = { };
	unsigned long i = 0;
	int ret;

	task.tid = gettid();
	task.task.server_tid = server->tid;
	task.task.state = UMCG_TASK_BLOCKED;

	printf("A == %d\n", gettid());

	ret = sys_umcg_ctl(UMCG_CTL_REGISTER|UMCG_CTL_WORKER, &task.task, CLOCK_MONOTONIC);
	if (ret) {
		perror("umcg_ctl(A): ");
		exit(-1);
	}

	for (;;) {
		int x = i++;

		if (!(x % 1000000)) {
			putchar('.');
			fflush(stdout);
		}

		/* co-operative or preemptible */
		if (!foo_preemptible && !(x % 10000000))
			foo_yield(&task.task);
	}

	return NULL;
}

/* event driven worker */
void *worker_fn1(void *arg)
{
	struct foo_server *server = arg;
	struct foo_task task = { };
	int ret;

	task.tid = gettid();
	task.task.server_tid = server->tid;
	task.task.state = UMCG_TASK_BLOCKED;

	printf("B == %d\n", gettid());

	ret = sys_umcg_ctl(UMCG_CTL_REGISTER|UMCG_CTL_WORKER, &task.task, CLOCK_MONOTONIC);
	if (ret) {
		perror("umcg_ctl(B): ");
		exit(-1);
	}

	for (;;) {
		printf("B\n");
		fflush(stdout);
		sleep(2);
	}

	return NULL;
}

void *worker_fn2(void *arg)
{
	struct foo_server *server = arg;
	struct foo_task task = { };
	int ret;

	task.tid = gettid();
	task.task.server_tid = server->tid;
	task.task.state = UMCG_TASK_BLOCKED;

	printf("C == %d\n", gettid());

	ret = sys_umcg_ctl(UMCG_CTL_REGISTER|UMCG_CTL_WORKER, &task.task, CLOCK_MONOTONIC);
	if (ret) {
		perror("umcg_ctl(C): ");
		exit(-1);
	}

	for (;;) {
		printf("C\n");
		fflush(stdout);
		sleep(3);
	}

	return NULL;
}

/* the server */

int main(int argc, char **argv)
{
	struct umcg_task *runnable_ptr, *next;
	struct foo_server server = { };
	pthread_t worker[3];
	u64 timeout = 0;
	int ret;

	printf("server == %d\n", gettid());
	fflush(stdout);

	server.tid = gettid();
	INIT_LIST_HEAD(&server.node);
	server.task.server_tid = gettid();
	server.task.state = UMCG_TASK_RUNNING;

	ret = sys_umcg_ctl(UMCG_CTL_REGISTER, &server.task, CLOCK_MONOTONIC);
	if (ret) {
		perror("umcg_ctl: ");
		exit(-1);
	}

	pthread_create(&worker[0], NULL, worker_fn0, &server);
	pthread_create(&worker[1], NULL, worker_fn1, &server);
	pthread_create(&worker[2], NULL, worker_fn2, &server);

	if (argc > 1) {
		foo_preemptible = true;

		/*
		 * setup preemption tick
		 */
		timeout = foo_time() + TICK_NSEC;
	}

	for (;;) {
		/*
		 * Mark the server as runnable first, so we can detect
		 * additions to the runnable list after we read it.
		 */
		server.task.state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT;

		/*
		 * consume the runnable notification list and add
		 * the tasks to our local runqueue.
		 */
		runnable_ptr = (void *)__atomic_exchange_n(&server.task.runnable_workers_ptr,
							   NULL, __ATOMIC_SEQ_CST);
		while (runnable_ptr) {
			next = (void *)runnable_ptr->runnable_workers_ptr;
			foo_add(&server, runnable_ptr);
			runnable_ptr = next;
		}

		/*
		 * If we've got a current running task, the server might have
		 * gotten a 'spurious' wakeup to pick up new runnable tasks.
		 *
		 * In this case, don't pick a new task (possible
		 * wakeup-preemption point, not implemented here).
		 *
		 * Note: even though this RUNNING test is racy, if it blocks
		 * after we'll get a RUNNABLE notification which will clear our
		 * RUNNABLE state and sys_umcg_wait() will -EAGAIN.
		 */
		if (server.cur && server.cur->task.state == UMCG_TASK_RUNNING) {
			/*
			 * Assert ::next_tid is clear, it should have been
			 * consumed.
			 */
			if (server.task.next_tid) {
				printf("current running, but still have next_tid\n");
				exit(-1);
			}
			putchar('x');
			fflush(stdout);
		} else {
			/*
			 * Pick the next task...
			 */
			server.cur = foo_pick_next(&server);
			server.task.next_tid = server.cur ? server.cur->tid : 0;
			printf("pick: %d\n", server.task.next_tid);
			fflush(stdout);
		}

		/*
		 * And switch...
		 */
		ret = sys_umcg_wait(0, timeout);

		/*
		 * If we did set ::next_tid but it hasn't been consumed by the
		 * syscall due to failure, make sure to put the task back on
		 * the runqueue, lest we leak it.
		 */
		if (server.task.next_tid) {
			foo_add(&server, &server.cur->task);
			server.cur = NULL;
			server.task.next_tid = 0;
		}

		if (!ret)
			continue;

		switch (errno) {
		case EAGAIN:
			/*
			 * Got a wakeup, try again.
			 */
			continue;

		case ETIMEDOUT:
			/*
			 * timeout: drive preemption
			 */
			putchar('t');
			fflush(stdout);

			/*
			 * Next tick..
			 */
			timeout += TICK_NSEC;

			/*
			 * If we have a current, cmpxchg set TF_PREEMPT and on success
			 * send it a signal to kick it into the kernel such that
			 * it might re-report itself runnable.
			 */
			if (server.cur) {
				struct foo_task *t = server.cur;
				u32 val = UMCG_TASK_RUNNING;
				u32 new = UMCG_TASK_RUNNING | UMCG_TF_PREEMPT;

				if (__atomic_compare_exchange_n(&t->task.state, &val, new,
								false, __ATOMIC_SEQ_CST,
								__ATOMIC_SEQ_CST)) {
					sys_umcg_kick(0, t->tid);
				}
			}
			/*
			 * Either way around, if the cmpxchg
			 * failed the task will have blocked
			 * and we should re-start the loop.
			 */
			continue;

		default:
			printf("errno: %d\n", errno);
			perror("wait:");
			exit(-1);
		}
	}

	return 0;
}
On Tue, Dec 14, 2021 at 12:55 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Hi,
>
> This is actually tested code; but still missing the SMP wake-to-idle machinery.
> I still need to think about that.

Thanks, Peter!

At a first glance, your main patch does not look much smaller than mine, and I thought the whole point of re-doing it was to throw away extra features and make things smaller/simpler...

Anyway, I'll test your patchset over the next week or so and let you know if anything really needed is missing (other than waking an idle server if there is one on a worker wakeup; this piece is definitely needed).

> I'll post my test-hack as a reply, but basically it does co-operative and
> preemptive UP-like user scheduling.
>
> Patches go on top of tip/master as they rely on the .fixup removal
> recently merged in tip/x86/core.
>
> Also, I still need to audit a bunch of mm code, because I'm not sure things are
> actually as well behaved as this code supposes they are.
>
On Tue, Dec 14, 2021 at 07:46:25PM -0800, Peter Oskolkov wrote:
> On Tue, Dec 14, 2021 at 12:55 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Hi,
> >
> > This is actually tested code; but still missing the SMP wake-to-idle machinery.
> > I still need to think about that.
>
> Thanks, Peter!
>
> At a first glance, your main patch does not look much smaller than
> mine, and I thought the whole point of re-doing it was to throw away
> extra features and make things smaller/simpler...

Well, simpler was the goal. I didn't really focus on size much. It isn't really big to begin with.

But yes, it has 5 hooks now, 3 syscalls and lots of comments and all that under 900 lines, not bad I'd say.

Also I think you wanted something like this? I'm not sure of the LAZY name, but I can't seem to come up with anything saner atm.

---
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1297,6 +1297,7 @@ struct task_struct {
 
 #ifdef CONFIG_UMCG
 	/* setup by sys_umcg_ctrl() */
+	u32			umcg_flags;
 	clockid_t		umcg_clock;
 	struct umcg_task __user	*umcg_task;
 
--- a/include/uapi/linux/umcg.h
+++ b/include/uapi/linux/umcg.h
@@ -133,11 +133,13 @@ struct umcg_task {
  * @UMCG_CTL_REGISTER:   register the current task as a UMCG task
  * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task
  * @UMCG_CTL_WORKER:     register the current task as a UMCG worker
+ * @UMCG_CTL_LAZY:       don't wake server on runnable enqueue
  */
 enum umcg_ctl_flag {
 	UMCG_CTL_REGISTER	= 0x00001,
 	UMCG_CTL_UNREGISTER	= 0x00002,
 	UMCG_CTL_WORKER		= 0x10000,
+	UMCG_CTL_LAZY		= 0x20000,
 };
 
 #endif /* _UAPI_LINUX_UMCG_H */
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -416,6 +416,27 @@ static int umcg_enqueue_runnable(struct
 }
 
 /*
+ * Enqueue tsk to it's server's runnable list and wake the server for pickup if
+ * so desired. Notable LAZY workers will not wake the server and rely on the
+ * server to do pickup whenever it naturally runs next.
+ *
+ * Returns:
+ * 0: success
+ * -EFAULT
+ */
+static int umcg_enqueue_and_wake(struct task_struct *tsk, bool force)
+{
+	int ret = umcg_enqueue_runnable(tsk);
+	if (ret)
+		return ret;
+
+	if (force || !(tsk->umcg_flags & UMCG_CTL_LAZY))
+		ret = umcg_wake_server(tsk);
+
+	return ret;
+}
+
+/*
  * umcg_wait: Wait for ->state to become RUNNING
  *
  * Returns:
@@ -522,12 +543,8 @@ void umcg_sys_exit(struct pt_regs *regs)
 	if (umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE))
 		UMCG_DIE_UNPIN("state");
 
-	if (umcg_enqueue_runnable(tsk))
-		UMCG_DIE_UNPIN("enqueue");
-
-	/* Server might not be RUNNABLE, means it's already running */
-	if (umcg_wake_server(tsk))
-		UMCG_DIE_UNPIN("wake-server");
+	if (umcg_enqueue_and_wake(tsk, false))
+		UMCG_DIE_UNPIN("enqueue-and-wake");
 
 	umcg_unpin_pages();
 
@@ -582,15 +599,11 @@ void umcg_notify_resume(struct pt_regs *
 				      UMCG_TASK_RUNNABLE))
 		UMCG_DIE_UNPIN("state");
 
-	if (umcg_enqueue_runnable(tsk))
-		UMCG_DIE_UNPIN("enqueue");
-
 	/*
-	 * XXX do we want a preemption consuming ::next_tid ?
-	 * I'm currently leaning towards no.
+	 * Preemption relies on waking the server on enqueue.
 	 */
-	if (umcg_wake_server(tsk))
-		UMCG_DIE_UNPIN("wake-server");
+	if (umcg_enqueue_and_wake(tsk, true))
+		UMCG_DIE_UNPIN("enqueue-and-wake");
 
 	umcg_unpin_pages();
 }
@@ -686,23 +699,15 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u
 		goto unpin;
 
 	if (worker) {
-		ret = umcg_enqueue_runnable(tsk);
+		ret = umcg_enqueue_and_wake(tsk, !tsk->umcg_next);
 		if (ret)
 			goto unpin;
 	}
 
-	if (worker)
-		ret = umcg_wake(tsk);
-	else if (tsk->umcg_next)
+	if (tsk->umcg_next) {
 		ret = umcg_wake_next(tsk);
-
-	if (ret) {
-		/*
-		 * XXX already enqueued ourself on ::server_tid; failing now
-		 * leaves the lot in an inconsistent state since it'll also
-		 * unblock self in order to return the error. !?!?
-		 */
-		goto unpin;
+		if (ret)
+			goto unpin;
 	}
 
 	umcg_unpin_pages();
@@ -783,7 +788,8 @@ SYSCALL_DEFINE3(umcg_ctl, u32, flags, st
 
 	if (flags & ~(UMCG_CTL_REGISTER |
 		      UMCG_CTL_UNREGISTER |
-		      UMCG_CTL_WORKER))
+		      UMCG_CTL_WORKER |
+		      UMCG_CTL_LAZY))
 		return -EINVAL;
 
 	if (flags == UMCG_CTL_UNREGISTER) {
@@ -827,7 +833,7 @@ SYSCALL_DEFINE3(umcg_ctl, u32, flags, st
 	rcu_read_lock();
 	server = find_task_by_vpid(ut.server_tid);
 	if (server && server->mm == current->mm) {
-		if (flags == UMCG_CTL_WORKER) {
+		if (flags & UMCG_CTL_WORKER) {
 			if (!server->umcg_task ||
 			    (server->flags & PF_UMCG_WORKER))
 				server = NULL;
@@ -843,10 +849,11 @@ SYSCALL_DEFINE3(umcg_ctl, u32, flags, st
 	if (!server)
 		return -ESRCH;
 
-	if (flags == UMCG_CTL_WORKER) {
+	if (flags & UMCG_CTL_WORKER) {
 		if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_BLOCKED)
 			return -EINVAL;
 
+		current->umcg_flags = flags & UMCG_CTL_LAZY;
 		WRITE_ONCE(current->umcg_task, self);
 		current->flags |= PF_UMCG_WORKER;	/* hook schedule() */
 		set_syscall_work(SYSCALL_UMCG);		/* hook syscall */
@@ -858,6 +865,7 @@ SYSCALL_DEFINE3(umcg_ctl, u32, flags, st
 		if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_RUNNING)
 			return -EINVAL;
 
+		current->umcg_flags = 0;
 		WRITE_ONCE(current->umcg_task, self);
 		set_thread_flag(TIF_UMCG);		/* hook return-to-user */
On Tue, Dec 14, 2021 at 07:46:25PM -0800, Peter Oskolkov wrote:

> Anyway, I'll test your patchset over the next week or so and let you know if anything really needed is missing (other than waking an idle server if there is one on a worker wakeup; this piece is definitely needed).

Right, so the problem I'm having is that a single idle server ptr like before can trivially miss waking another idle server.

Suppose:

	umcg::idle_server_tid_ptr

Then the enqueue_and_wake() thing from the last patch would:

	idle_server_tid = xchg((pid_t __user *)self->idle_server_tid_ptr, 0);

to consume the tid, and then use that to enqueue and wake. But what if a second wakeup happens right after that? There might be a second idle server, but we'll never find it, because userspace hasn't had time to update the field again.

Alternatively, we do a linked list of servers, but then every such wakeup needs to iterate the whole list, looking for one that has UMCG_TF_IDLE set, or something like that, but that lookup is bad for performance.

So I'm really not sure what way to go yet.
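To make the lost-wakeup window concrete, here is a small self-contained mock of the single-slot consume being described (names and tids are made up for illustration; this is not code from the patches). The first wakeup consumes the published tid; a second wakeup racing in before userspace republishes finds nothing to wake, even though another idle server exists.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int idle_server_tid;	/* 0 == no idle server published */

static void consume_and_wake(const char *who)
{
	int tid = atomic_exchange(&idle_server_tid, 0);

	if (tid)
		printf("%s: wake server %d\n", who, tid);
	else
		printf("%s: no idle server visible, worker enqueued but nobody woken\n", who);
}

int main(void)
{
	atomic_store(&idle_server_tid, 103);	/* one idle server (say S3) published */
	consume_and_wake("wakeup A");		/* consumes S3 */
	consume_and_wake("wakeup B");		/* races in before userspace republishes: lost */
	return 0;
}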
On Wed, Dec 15, 2021 at 11:06:20AM +0100, Peter Zijlstra wrote:
> On Tue, Dec 14, 2021 at 07:46:25PM -0800, Peter Oskolkov wrote:
> > On Tue, Dec 14, 2021 at 12:55 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > Hi,
> > >
> > > This is actually tested code; but still missing the SMP wake-to-idle machinery.
> > > I still need to think about that.
> >
> > Thanks, Peter!
> >
> > At a first glance, your main patch does not look much smaller than mine, and I thought the whole point of re-doing it was to throw away extra features and make things smaller/simpler...
>
> Well, simpler was the goal. I didn't really focus on size much. It isn't really big to begin with.
>
> But yes, it has 5 hooks now, 3 syscalls and lots of comments and all that under 900 lines, not bad I'd say.
>
> Also I think you wanted something like this? I'm not sure of the LAZY name, but I can't seem to come up with anything saner atm.
>
> ---
> [...]
> --- a/kernel/sched/umcg.c
> +++ b/kernel/sched/umcg.c
> @@ -416,6 +416,27 @@ static int umcg_enqueue_runnable(struct
>  }
>  
>  /*
> + * Enqueue tsk to it's server's runnable list and wake the server for pickup if
> + * so desired. Notable LAZY workers will not wake the server and rely on the
> + * server to do pickup whenever it naturally runs next.
> + *
> + * Returns:
> + * 0: success
> + * -EFAULT
> + */
> +static int umcg_enqueue_and_wake(struct task_struct *tsk, bool force)
> +{
> +	int ret = umcg_enqueue_runnable(tsk);
> +	if (ret)
> +		return ret;
> +
> +	if (force || !(tsk->umcg_flags & UMCG_CTL_LAZY))
> +		ret = umcg_wake_server(tsk);
> +
> +	return ret;
> +}

Aah, this has a problem when the server is otherwise idle. I think we need that TF_IDLE thing for this too.

Let me go write a test-case for all this.
On Wed, Dec 15, 2021 at 11:44:49AM +0100, Peter Zijlstra wrote:
> On Tue, Dec 14, 2021 at 07:46:25PM -0800, Peter Oskolkov wrote:
>
> > Anyway, I'll test your patchset over the next week or so and let you know if anything really needed is missing (other than waking an idle server if there is one on a worker wakeup; this piece is definitely needed).
>
> Right, so the problem I'm having is that a single idle server ptr like before can trivially miss waking annother idle server.
>
> Suppose:
>
> 	umcg::idle_server_tid_ptr
>
> Then the enqueue_and_wake() thing from the last patch would:
>
> 	idle_server_tid = xchg((pid_t __user *)self->idle_server_tid_ptr, 0);
>
> to consume the tid, and then use that to enqueue and wake. But what if a second wakeup happens right after that? There might be a second idle server, but we'll never find it, because userspace hasn't had time to update the field again.
>
> Alternatively, we do a linked list of servers, but then every such wakeup needs to iterate the whole list, looking for one that has UMCG_TF_IDLE set, or something like that, but that lookup is bad for performance.
>
> So I'm really not sure what way to go yet.

1. Linked lists are fugly and bad for the CPU.

2. I'm not sure how big the 'N' in 'M:N' is supposed to be. Might be one per hardware thread? So it could be hundreds-to-thousands, depending on the scale of system.

3. The interface between user-kernel could be an array of idle tids, maybe 16 entries long (16 * 4 = 64 bytes, just one cacheline). As a server finishes work, it looks for a 0 tid in the batch and stores its tid in the slot (cmpxchg, I guess, since the array will be shared between processes). If there are no free slots in the array, then we definitely have 16 threads already waiting for work, so it can park itself in whatever data structure userspace wants to use to manage idle servers. It's up to userspace to decide when to repopulate the array of available servers from its data structure of idle servers.
On Wed, Dec 15, 2021 at 01:49:28PM +0000, Matthew Wilcox wrote:
> On Wed, Dec 15, 2021 at 11:44:49AM +0100, Peter Zijlstra wrote:
> > On Tue, Dec 14, 2021 at 07:46:25PM -0800, Peter Oskolkov wrote:
> >
> > > Anyway, I'll test your patchset over the next week or so and let you know if anything really needed is missing (other than waking an idle server if there is one on a worker wakeup; this piece is definitely needed).
> >
> > Right, so the problem I'm having is that a single idle server ptr like before can trivially miss waking annother idle server.
> >
> > Suppose:
> >
> > 	umcg::idle_server_tid_ptr
> >
> > Then the enqueue_and_wake() thing from the last patch would:
> >
> > 	idle_server_tid = xchg((pid_t __user *)self->idle_server_tid_ptr, 0);
> >
> > to consume the tid, and then use that to enqueue and wake. But what if a second wakeup happens right after that? There might be a second idle server, but we'll never find it, because userspace hasn't had time to update the field again.
> >
> > Alternatively, we do a linked list of servers, but then every such wakeup needs to iterate the whole list, looking for one that has UMCG_TF_IDLE set, or something like that, but that lookup is bad for performance.
> >
> > So I'm really not sure what way to go yet.
>
> 1. Linked lists are fugly and bad for the CPU.

Absolutely.. although a stack might work, except for that ABA issue (and contention).

> 2. I'm not sure how big the 'N' in 'M:N' is supposed to be. Might be one per hardware thread? So it could be hundreds-to-thousands, depending on the scale of system.

Typically yes, one server task per hardware thread. Now, I'm also fairly sure you don't want excessive cross-node traffic for this stuff, so that puts a limit on things as well.

> 3. The interface between user-kernel could be an array of idle tids, maybe 16 entries long (16 * 4 = 64 bytes, just one cacheline). As a server finishes work, it looks for a 0 tid in the batch and stores its tid in the slot (cmpxchg, I guess, since the array will be shared between processes). If there are no free slots in the array, then we definitely have 16 threads already waiting for work, so it can park itself in whatever data structure userspace wants to use to manage idle servers. It's up to userspace to decide when to repopulate the array of available servers from its data structure of idle servers.

Right, a tid array might work. Could even have userspace specify the length, then it can do the trade-offs all on its own. Either a fixed location for each server and a larger array, or clever things, whatever they want.

I suppose I'll code up the variable length array, we have space for that.
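A minimal sketch of what such a fixed-size idle-server array could look like, assuming the 16-entry layout suggested above; the struct and function names are illustrative, not from any posted patch. The publish side is what a server would run when it goes idle; the consume side models what the wakeup path would do when it needs to hand a runnable worker to an idle server.

#include <stdatomic.h>
#include <stdbool.h>
#include <sys/types.h>

#define IDLE_SERVER_SLOTS	16		/* 16 * 4 bytes = one cacheline */

struct umcg_idle_servers {
	_Atomic pid_t tid[IDLE_SERVER_SLOTS];	/* 0 == free slot */
};

/*
 * Server side: try to publish ourselves in a free slot. If the array is
 * full there are already IDLE_SERVER_SLOTS servers waiting for work, so
 * the caller parks the server in its own userspace structure instead.
 */
static bool idle_server_publish(struct umcg_idle_servers *a, pid_t self)
{
	for (int i = 0; i < IDLE_SERVER_SLOTS; i++) {
		pid_t expected = 0;

		if (atomic_compare_exchange_strong(&a->tid[i], &expected, self))
			return true;
	}
	return false;
}

/*
 * Wakeup side (the kernel, in the real proposal): consume any published
 * idle server; returns 0 if none is visible.
 */
static pid_t idle_server_consume(struct umcg_idle_servers *a)
{
	for (int i = 0; i < IDLE_SERVER_SLOTS; i++) {
		pid_t tid = atomic_exchange(&a->tid[i], 0);

		if (tid)
			return tid;
	}
	return 0;
}

A userspace-specified length, as suggested, would only change IDLE_SERVER_SLOTS from a constant into a per-process value.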
On Wed, Dec 15, 2021 at 2:06 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Dec 14, 2021 at 07:46:25PM -0800, Peter Oskolkov wrote:
> > On Tue, Dec 14, 2021 at 12:55 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > Hi,
> > >
> > > This is actually tested code; but still missing the SMP wake-to-idle machinery.
> > > I still need to think about that.
> >
> > Thanks, Peter!
> >
> > At a first glance, your main patch does not look much smaller than mine, and I thought the whole point of re-doing it was to throw away extra features and make things smaller/simpler...
>
> Well, simpler was the goal. I didn't really focus on size much. It isn't really big to begin with.
>
> But yes, it has 5 hooks now, 3 syscalls and lots of comments and all that under 900 lines, not bad I'd say.

My patchset had three hooks and two syscalls, and fewer new fields added to task_struct... And similarly around 900 lines on the kernel side in the main patch. So I am not sure why you believe that your approach is simpler, unless there was something fundamentally wrong with my approach. But tglx@ looked into it, and his remarks were more about comments and the commit message and smaller things at a function level, like an unneeded goto, than about the overall design...

> Also I think you wanted something like this? I'm not sure of the LAZY name, but I can't seem to come up with anything saner atm.

[...]

> /*
> + * Enqueue tsk to it's server's runnable list and wake the server for pickup if
> + * so desired. Notable LAZY workers will not wake the server and rely on the
> + * server to do pickup whenever it naturally runs next.

No, I never suggested we needed per-server runnable queues: in all my patchsets I had a single list of idle (runnable) workers.

[...]

From another message:

>> Anyway, I'll test your patchset over the next week or so and let you know if anything really needed is missing (other than waking an idle server if there is one on a worker wakeup; this piece is definitely needed).

> Right, so the problem I'm having is that a single idle server ptr like before can trivially miss waking annother idle server.

I believe the approach I used in my patchset, suggested by Thierry Delisle, works.

In short, there is a single idle server ptr for the kernel to work with. The userspace maintains a list of idle servers. If the ptr is NULL, the list is empty. When the kernel wakes the idle server it sees, the server reaps the runnable worker list and wakes another idle server from the userspace list, if available. This newly woken idle server repoints the ptr to itself, checks the runnable worker list, to avoid missing a woken worker, then goes to sleep.

Why do you think this approach is not OK?
On Wed, Dec 15, 2021 at 09:56:06AM -0800, Peter Oskolkov wrote:

> > Right, so the problem I'm having is that a single idle server ptr like before can trivially miss waking annother idle server.
>
> I believe the approach I used in my patchset, suggested by Thierry Delisle, works.
>
> In short, there is a single idle server ptr for the kernel to work with. The userspace maintains a list of idle servers. If the ptr is NULL, the list is empty. When the kernel wakes the idle server it sees, the server reaps the runnable worker list and wakes another idle server from the userspace list, if available. This newly woken idle server repoints the ptr to itself, checks the runnable worker list, to avoid missing a woken worker, then goes to sleep.
>
> Why do you think this approach is not OK?

Suppose at least 4 servers, 2 idle, 2 working.

Now, the first of the working servers (let's call it S0) gets a wakeup (say Ta), it finds the idle server (say S3) and consumes it, sticking Ta on S3 and kicking it alive.

Concurrently, and losing the race, the other working server (S1) gets a wake-up from Tb; like said, it lost, no idle server, so Tb goes on the queue of S1.

So then S3 wakes, finds Ta and they live happily ever after.

While S2 and Tb fail to meet one another and both linger in sadness.
On Wed, Dec 15, 2021 at 09:56:06AM -0800, Peter Oskolkov wrote:
> On Wed, Dec 15, 2021 at 2:06 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > /*
> > + * Enqueue tsk to it's server's runnable list and wake the server for pickup if
> > + * so desired. Notable LAZY workers will not wake the server and rely on the
> > + * server to do pickup whenever it naturally runs next.
>
> No, I never suggested we needed per-server runnable queues: in all my patchsets I had a single list of idle (runnable) workers.

This is not about the idle servers..

So without the LAZY thing on, a previously blocked task hitting sys_exit will enqueue itself on the runnable list and wake the server for pickup.

IIRC you didn't like the server waking while it was still running another task, but instead preferred to have it pick up the newly enqueued task when next it ran.

LAZY enables that.. *however* it does need to wake the server when it is idle, otherwise they'll all sit there waiting for one another.
On Wed, Dec 15, 2021 at 10:19 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 15, 2021 at 09:56:06AM -0800, Peter Oskolkov wrote:
> > > Right, so the problem I'm having is that a single idle server ptr like before can trivially miss waking annother idle server.
> >
> > I believe the approach I used in my patchset, suggested by Thierry Delisle, works.
> >
> > In short, there is a single idle server ptr for the kernel to work with. The userspace maintains a list of idle servers. If the ptr is NULL, the list is empty. When the kernel wakes the idle server it sees, the server reaps the runnable worker list and wakes another idle server from the userspace list, if available. This newly woken idle server repoints the ptr to itself, checks the runnable worker list, to avoid missing a woken worker, then goes to sleep.
> >
> > Why do you think this approach is not OK?
>
> Suppose at least 4 servers, 2 idle, 2 working.
>
> Now, the first of the working servers (lets call it S0) gets a wakeup (say Ta), it finds the idle server (say S3) and consumes it, sticking Ta on S3 and kicking it alive.

TL;DR: our models are different here. In your model a single server can have a bunch of workers interacting with it; in my model only a single RUNNING worker is assigned to a server, which it wakes when it blocks.

More details:

"Working servers" cannot get wakeups, because a "working server" has a single RUNNING worker attached to it. When a worker blocks, it wakes its attached server and becomes a detached blocked worker (same is true if the worker is "preempted": it blocks and wakes its assigned server).

Blocked workers upon wakeup do this, in order:
- always add themselves to the runnable worker list (the list is shared among ALL servers, it is NOT per server);
- wake a server pointed to by idle_server_ptr, if not NULL;
- sleep, waiting for a wakeup from a server;

Server S, upon becoming IDLE (no worker to run, or woken on idle server list) does this, in order, in userspace (simplified, see umcg_get_idle_worker() in https://lore.kernel.org/lkml/20211122211327.5931-5-posk@google.com/):
- take a userspace (spin) lock (so the steps below are all within a single critical section):
  - compare_xchg(idle_server_ptr, NULL, S);
  - if failed, there is another server in idle_server_ptr, so S adds itself to the userspace idle server list, releases the lock, goes to sleep;
  - if succeeded:
    - check the runnable worker list;
    - if empty, release the lock, sleep;
    - if not empty:
      - get the list;
      - xchg(idle_server_ptr, NULL) (either S removes itself, or a worker in the kernel does it first, does not matter);
      - release the lock;
      - wake server S1 on idle server list. S1 goes through all of these steps.

The protocol above serializes the userspace dealing with the idle server ptr/list. Wakeups in the kernel will be caught if there are idle servers. Yes, the protocol in the userspace is complicated (more complicated than outlined above, as the reaped idle/runnable worker list from the kernel is added to the userspace idle/runnable worker list), but the kernel side is very simple. I've tested this interaction extensively, I'm reasonably sure that no worker wakeups are lost.

> Concurrently and loosing the race the other working server (S1) gets a wake-up from Tb, like said, it lost, no idle server, so Tb goes on the queue of S1.
>
> So then S3 wakes, finds Ta and they live happily ever after.
>
> While S2 and Tb fail to meet one another and both linger in sadness.
>
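For reference, a compilable sketch of the server-side steps just listed, with stub types and helpers standing in for the real scheduler state; all names here are illustrative, and the actual implementation is umcg_get_idle_worker() in the lib patch linked above.

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct server;					/* one userspace scheduler thread */

struct scheduler {
	pthread_mutex_t		lock;			/* the userspace lock above        */
	struct server * _Atomic	idle_server_ptr;	/* single slot visible to the kernel */
	struct server		*idle_list;		/* userspace list of idle servers  */
	void * _Atomic		runnable_workers;	/* kernel-filled runnable list     */
};

struct server {
	struct server *next;
};

/* Stubs standing in for the real sleep/wake/run primitives. */
static void server_sleep(struct server *s) { (void)s; }
static void server_wake(struct server *s)  { (void)s; }
static void run_workers(struct server *s, void *list) { (void)s; (void)list; }

static void server_becomes_idle(struct scheduler *sched, struct server *s)
{
	pthread_mutex_lock(&sched->lock);

	struct server *expected = NULL;
	if (!atomic_compare_exchange_strong(&sched->idle_server_ptr, &expected, s)) {
		/* Another server is already advertised: park on the idle list. */
		s->next = sched->idle_list;
		sched->idle_list = s;
		pthread_mutex_unlock(&sched->lock);
		server_sleep(s);
		return;
	}

	/* We are the advertised server: check for work we may have raced with. */
	void *runnable = atomic_exchange(&sched->runnable_workers, NULL);
	if (!runnable) {
		pthread_mutex_unlock(&sched->lock);
		server_sleep(s);	/* the kernel wakes us through idle_server_ptr */
		return;
	}

	/*
	 * Work found: stop advertising (a kernel-side waker may have cleared
	 * the slot already, either order is fine), hand the baton to the next
	 * idle server, and go run the workers we just reaped.
	 */
	atomic_exchange(&sched->idle_server_ptr, NULL);
	struct server *next = sched->idle_list;
	if (next)
		sched->idle_list = next->next;
	pthread_mutex_unlock(&sched->lock);

	if (next)
		server_wake(next);	/* it repeats these same steps */
	run_workers(s, runnable);
}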
On Wed, Dec 15, 2021 at 10:25 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 15, 2021 at 09:56:06AM -0800, Peter Oskolkov wrote:
> > On Wed, Dec 15, 2021 at 2:06 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > /*
> > > + * Enqueue tsk to it's server's runnable list and wake the server for pickup if
> > > + * so desired. Notable LAZY workers will not wake the server and rely on the
> > > + * server to do pickup whenever it naturally runs next.
> >
> > No, I never suggested we needed per-server runnable queues: in all my patchsets I had a single list of idle (runnable) workers.
>
> This is not about the idle servers..
>
> So without the LAZY thing on, a previously blocked task hitting sys_exit will enqueue itself on the runnable list and wake the server for pickup.

How can a blocked task hit sys_exit()? Shouldn't it be RUNNING?

Anyway, servers and workers are supposed to unregister before exiting, so if they call sys_exit() they break the agreement; in my patch I just clear all umcg-related state and proceed, without waking the server: the user broke the protocol, let them figure out what happened:

+static void umcg_clear_task(struct task_struct *tsk)
+{
+	/*
+	 * This is either called for the current task, or for a newly forked
+	 * task that is not yet running, so we don't need strict atomicity
+	 * below.
+	 */
+	if (tsk->umcg_task) {
+		WRITE_ONCE(tsk->umcg_task, NULL);
+
+		/* These can be simple writes - see the comment above. */
+		tsk->pinned_umcg_worker_page = NULL;
+		tsk->pinned_umcg_server_page = NULL;
+		tsk->flags &= ~PF_UMCG_WORKER;
+	}
+}
+
+/* Called both by normally (unregister) and abnormally exiting workers. */
+void umcg_handle_exiting_worker(void)
+{
+	umcg_unpin_pages();
+	umcg_clear_task(current);
+}

> IIRC you didn't like the server waking while it was still running another task, but instead preferred to have it pick up the newly enqueued task when next it ran.

Yes, this is the model I have, as I outlined in another email. I understand that having queues per-CPU/per-server is how it is done in the kernel, both for historical reasons (before multiprocessing there was a single queue/cpu) and for throughput (per-cpu runqueues are individually faster than a global one). However, this model is known to lag in presence of load spikes (long per-cpu queues with some CPUs idle), and is not really easy to work with given the use cases this whole userspace scheduling effort is trying to address: multiple priorities and work isolation: these are easy to address directly with a scheduler that has a global view rather than multiple per-cpu/per-server schedulers/queues that try to coordinate. I can even claim (without proof, just a hunch, based on how I would code this) that strict scheduling policies around priority and isolation (e.g. never run work item A if work item B becomes runnable, unless work item A is already running) cannot be enforced without a global scheduler, so per-cpu/per-server queues do not really fit the use case here...

> LAZY enables that.. *however* it does need to wake the server when it is idle, otherwise they'll all sit there waiting for one another.

If all servers are busy running workers, then it is not up to the kernel to "preempt" them in my model: the userspace can set up another thread/task to preempt a misbehaving worker, which will wake the server attached to it. But in practice there are always workers blocking in the kernel, which wakes their servers, which then reap the woken/runnable workers list, so well-behaving code does not need this.

Yes, sometimes the code does not behave well, e.g. a worker grabs a spinlock, blocks in the kernel, its server runs another worker that starts spinning on the spinlock; but this is fixable by making the spinlock aware of our stuff: either the worker who got the lock is marked as LOCKED and so does not release its server (one of the reasons I have this flag), or the lock itself becomes sleepable (e.g. after spinning a bit it calls into a futex wait).

And so we need to figure out this high-level thing first: do we go with the per-server worker queues/lists, or do we go with the approach I use in my patchset? It seems to me that the kernel-side code in my patchset is not more complicated than your patchset is shaping up to be, and some things are actually easier to accomplish, like having a single idle_server_ptr vs this LAZY and/or server "preemption" behavior that you have.

Again, I'm OK with having it your way if all needed features are covered, but I think we should be explicit about why per-server/per-cpu model is chosen vs the one I proposed, especially as it seems the kernel side code is not really simpler in the end.
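A minimal sketch of the "spin a bit, then futex wait" lock mentioned above, following the well-known futex mutex pattern (0 = unlocked, 1 = locked, 2 = locked with waiters); this is a generic illustration, not code from either patchset, and the function names are made up.

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

/* lock word: 0 = unlocked, 1 = locked, 2 = locked with (possible) waiters */
static void spin_then_futex_lock(uint32_t *lock)
{
	uint32_t c = 0;

	/* Spin briefly in the hope the holder releases the lock soon... */
	for (int i = 0; i < 100; i++) {
		c = 0;
		if (__atomic_compare_exchange_n(lock, &c, 1, 0,
						__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
			return;
	}

	/* ...then mark the lock contended and sleep in the kernel instead. */
	if (c != 2)
		c = __atomic_exchange_n(lock, 2, __ATOMIC_ACQUIRE);
	while (c != 0) {
		sys_futex(lock, FUTEX_WAIT_PRIVATE, 2);
		c = __atomic_exchange_n(lock, 2, __ATOMIC_ACQUIRE);
	}
}

static void spin_then_futex_unlock(uint32_t *lock)
{
	/* If there may be waiters (state 2), wake one of them. */
	if (__atomic_exchange_n(lock, 0, __ATOMIC_RELEASE) == 2)
		sys_futex(lock, FUTEX_WAKE_PRIVATE, 1);
}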
On Wed, Dec 15, 2021 at 11:49:51AM -0800, Peter Oskolkov wrote:

> TL;DR: our models are different here. In your model a single server can have a bunch of workers interacting with it; in my model only a single RUNNING worker is assigned to a server, which it wakes when it blocks.

So part of the problem is that none of that was evident from the code. It is also completely different from the scheduler code it lives in, making it double confusing.

After having read the code, I still had no clue whatsoever how it was supposed to be used. Which is where my reverse engineering started :/

> More details:
>
> "Working servers" cannot get wakeups, because a "working server" has a single RUNNING worker attached to it. When a worker blocks, it wakes its attached server and becomes a detached blocked worker (same is true if the worker is "preempted": it blocks and wakes its assigned server).

But who would do the preemption if the server isn't allowed to run?

> Blocked workers upon wakeup do this, in order:
>
> - always add themselves to the runnable worker list (the list is shared among ALL servers, it is NOT per server);

That seems like a scalability issue. And, as said, it is completely alien when compared to the way Linux itself does scheduling.

> - wake a server pointed to by idle_server_ptr, if not NULL;
> - sleep, waiting for a wakeup from a server;
>
> Server S, upon becoming IDLE (no worker to run, or woken on idle server list) does this, in order, in userspace (simplified, see umcg_get_idle_worker() in https://lore.kernel.org/lkml/20211122211327.5931-5-posk@google.com/):
> - take a userspace (spin) lock (so the steps below are all within a single critical section):

Don't ever suggest userspace spinlocks, they're horrible crap.

> - compare_xchg(idle_server_ptr, NULL, S);
> - if failed, there is another server in idle_server_ptr, so S adds itself to the userspace idle server list, releases the lock, goes to sleep;
> - if succeeded:
>   - check the runnable worker list;
>   - if empty, release the lock, sleep;
>   - if not empty:
>     - get the list;
>     - xchg(idle_server_ptr, NULL) (either S removes itself, or a worker in the kernel does it first, does not matter);
>     - release the lock;
>     - wake server S1 on idle server list. S1 goes through all of these steps.
>
> The protocol above serializes the userspace dealing with the idle server ptr/list. Wakeups in the kernel will be caught if there are idle servers. Yes, the protocol in the userspace is complicated (more complicated than outlined above, as the reaped idle/runnable worker list from the kernel is added to the userspace idle/runnable worker list), but the kernel side is very simple. I've tested this interaction extensively, I'm reasonably sure that no worker wakeups are lost.

Sure, but also seems somewhat congestion prone :/
On Wed, Dec 15, 2021 at 01:04:33PM -0800, Peter Oskolkov wrote:
> On Wed, Dec 15, 2021 at 10:25 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Dec 15, 2021 at 09:56:06AM -0800, Peter Oskolkov wrote:
> > > On Wed, Dec 15, 2021 at 2:06 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > /*
> > > > + * Enqueue tsk to it's server's runnable list and wake the server for pickup if
> > > > + * so desired. Notable LAZY workers will not wake the server and rely on the
> > > > + * server to do pickup whenever it naturally runs next.
> > >
> > > No, I never suggested we needed per-server runnable queues: in all my patchsets I had a single list of idle (runnable) workers.
> >
> > This is not about the idle servers..
> >
> > So without the LAZY thing on, a previously blocked task hitting sys_exit will enqueue itself on the runnable list and wake the server for pickup.
>
> How can a blocked task hit sys_exit()? Shouldn't it be RUNNING?

Task was RUNNING, hits schedule() after passing through sys_enter(). This marks it BLOCKED. Task wakes again and proceeds to sys_exit(), at which point it's marked RUNNABLE and put on the runnable list. After which it'll kick the server to process said list.

> Anyway, servers and workers are supposed to unregister before exiting, so if they call sys_exit() they break the agreement; in my patch I just clear all umcg-related state and proceed, without waking the server: the user broke the protocol, let them figure out what happened:

No violation of anything there. The time between returning from schedule() and sys_exit() is unmanaged. sys_exit() is the spot where we regain control.

> > IIRC you didn't like the server waking while it was still running another task, but instead preferred to have it pick up the newly enqueued task when next it ran.
>
> Yes, this is the model I have, as I outlined in another email. I understand that having queues per-CPU/per-server is how it is done in the kernel, both for historical reasons (before multiprocessing there was a single queue/cpu) and for throughput (per-cpu runqueues are individually faster than a global one). However, this model is known to lag in presence of load spikes (long per-cpu queues with some CPUs idle), and is not really easy to work with given the use cases this whole userspace scheduling effort is trying to address:

Well, that's *your* use-case. I'm fairly sure there's more people that want to use this thing.

> multiple priorities and work isolation: these are easy to address directly with a scheduler that has a global view rather than multiple per-cpu/per-server schedulers/queues that try to coordinate.

You can trivially create this, even if the underlying thing is per-server. Simply have a lock and shared data structure between the servers.

Even in the kernel, it should be mostly trivial to create a global policy. The only tricky bit (in the kernel) is the whole affinity muck, but userspace doesn't *need* to do even that.

> > LAZY enables that.. *however* it does need to wake the server when it is idle, otherwise they'll all sit there waiting for one another.
>
> If all servers are busy running workers, then it is not up to the kernel to "preempt" them in my model: the userspace can set up another thread/task to preempt a misbehaving worker, which will wake the server attached to it.

So the way I'm seeing things is that the server *is* the 'CPU'. A UP machine cannot rely on another CPU to make preemption happen.

Also, preemption is very much not about misbehaviour. Wakeup can cause a preemption event if the woken task is deemed higher priority than the current running one for example.

And time based preemption is definitely also a thing wrt resource distribution.

> But in practice there are always workers blocking in the kernel, which wakes their servers, which then reap the woken/runnable workers list, so well-behaving code does not need this.

This seems to discount pure computational workloads.

> And so we need to figure out this high-level thing first: do we go with the per-server worker queues/lists, or do we go with the approach I use in my patchset? It seems to me that the kernel-side code in my patchset is not more complicated than your patchset is shaping up to be, and some things are actually easier to accomplish, like having a single idle_server_ptr vs this LAZY and/or server "preemption" behavior that you have.
>
> Again, I'm OK with having it your way if all needed features are covered, but I think we should be explicit about why per-server/per-cpu model is chosen vs the one I proposed, especially as it seems the kernel side code is not really simpler in the end.

So I went with a UP first approach. I made single server preemption driven scheduling work first (both tick and wakeup-preemption are supported).

The whole LAZY thing is only meant to suppress some of that (notably wakeup preemption), but you're right in that it's not really nice. I got it working, but I'm not particularly happy with it either.

Having the sys_enter/sys_exit hooks also made the page pins short lived, and signals much simpler to handle. You're destroying signals IIUC.

So I see no fundamental reason why userspace cannot do something like:

	struct umcg_task *current = NULL;

	for (;;) {
		self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT;

		runnable_ptr = (void *)__atomic_exchange_n(&self->runnable_workers_ptr,
							   NULL, __ATOMIC_SEQ_CST);

		pthread_mutex_lock(&global_queue.lock);
		while (runnable_ptr) {
			next = (void *)runnable_ptr->runnable_workers_ptr;
			enqueue_task(&global_queue, runnable_ptr);
			runnable_ptr = next;
		}

		/* complicated bit about current already running goes here */

		current = pick_task(&global_queue);
		self->next_tid = current ? current->tid : 0;
unlock:
		pthread_mutex_unlock(&global_queue.lock);

		ret = sys_umcg_wait(0, 0);

		pthread_mutex_lock(&global_queue.lock);
		/* umcg_wait() didn't switch, make sure to return the task */
		if (self->next_tid) {
			enqueue_task(&global_queue, current);
			current = NULL;
		}
		pthread_mutex_unlock(&global_queue.lock);

		// do something with @ret
	}

to get global scheduling and all the contention^Wgoodness related to it. Except, of course, it's more complicated, but I think the idea's clear enough.
On Wed, Dec 15, 2021 at 2:25 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 15, 2021 at 11:49:51AM -0800, Peter Oskolkov wrote:
>
> > TL;DR: our models are different here. In your model a single server can have a bunch of workers interacting with it; in my model only a single RUNNING worker is assigned to a server, which it wakes when it blocks.
>
> So part of the problem is that none of that was evident from the code. It is also completely different from the scheduler code it lives in, making it double confusing.
>
> After having read the code, I still had no clue what so ever how it was supposed to be used. Which is where my reverse engineering started :/

I posted a doc patch:
https://lore.kernel.org/lkml/20211122211327.5931-6-posk@google.com/

a lib patch with userspace code:
https://lore.kernel.org/lkml/20211122211327.5931-5-posk@google.com/

and a doc patch for the lib/userspace code:
https://lore.kernel.org/lkml/20211122211327.5931-7-posk@google.com/

I spent at least two weeks polishing the lib patch and the docs, much more if previous patchsets are to be taken into account. Yes, they are confusing, and most likely answer all of the wrong questions, but I did try to make my approach as clear as possible... I apologize if that was not very successful...

> > More details:
> >
> > "Working servers" cannot get wakeups, because a "working server" has a single RUNNING worker attached to it. When a worker blocks, it wakes its attached server and becomes a detached blocked worker (same is true if the worker is "preempted": it blocks and wakes its assigned server).
>
> But who would do the preemption if the server isn't allowed to run?
>
> > Blocked workers upon wakeup do this, in order:
> >
> > - always add themselves to the runnable worker list (the list is shared among ALL servers, it is NOT per server);
>
> That seems like a scalability issue. And, as said, it is completely alien when compared to the way Linux itself does scheduling.
>
> > - wake a server pointed to by idle_server_ptr, if not NULL;
> > - sleep, waiting for a wakeup from a server;
> >
> > Server S, upon becoming IDLE (no worker to run, or woken on idle server list) does this, in order, in userspace (simplified, see umcg_get_idle_worker() in https://lore.kernel.org/lkml/20211122211327.5931-5-posk@google.com/):
> > - take a userspace (spin) lock (so the steps below are all within a single critical section):
>
> Don't ever suggest userspace spinlocks, they're horrible crap.

This can easily be a mutex, not really important (although for very short critical sections with only memory reads/writes, like here, spin locks often perform better, in our experience).

> > - compare_xchg(idle_server_ptr, NULL, S);
> > - if failed, there is another server in idle_server_ptr, so S adds itself to the userspace idle server list, releases the lock, goes to sleep;
> > - if succeeded:
> >   - check the runnable worker list;
> >   - if empty, release the lock, sleep;
> >   - if not empty:
> >     - get the list;
> >     - xchg(idle_server_ptr, NULL) (either S removes itself, or a worker in the kernel does it first, does not matter);
> >     - release the lock;
> >     - wake server S1 on idle server list. S1 goes through all of these steps.
> >
> > The protocol above serializes the userspace dealing with the idle server ptr/list. Wakeups in the kernel will be caught if there are idle servers. Yes, the protocol in the userspace is complicated (more complicated than outlined above, as the reaped idle/runnable worker list from the kernel is added to the userspace idle/runnable worker list), but the kernel side is very simple. I've tested this interaction extensively, I'm reasonably sure that no worker wakeups are lost.
>
> Sure, but also seems somewhat congestion prone :/

The whole critical section under the lock is just several memory read/write operations, so very short. And workers are removed from the kernel's list of runnable/woken workers all at once; and the server processing the runnable worker list knows how many of them are now available to run, so the appropriate number of idle servers can be woken (not yet implemented in my lib patch). So yes, this can be a bottleneck, but there are ways to make it less and less likely (by making the userspace more complicated; but this is not a concern).
On Wed, Dec 15, 2021 at 3:16 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 15, 2021 at 01:04:33PM -0800, Peter Oskolkov wrote:
> > On Wed, Dec 15, 2021 at 10:25 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Dec 15, 2021 at 09:56:06AM -0800, Peter Oskolkov wrote:
> > > > On Wed, Dec 15, 2021 at 2:06 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > /*
> > > > > + * Enqueue tsk to it's server's runnable list and wake the server for pickup if
> > > > > + * so desired. Notable LAZY workers will not wake the server and rely on the
> > > > > + * server to do pickup whenever it naturally runs next.
> > > >
> > > > No, I never suggested we needed per-server runnable queues: in all my patchsets I had a single list of idle (runnable) workers.
> > >
> > > This is not about the idle servers..
> > >
> > > So without the LAZY thing on, a previously blocked task hitting sys_exit will enqueue itself on the runnable list and wake the server for pickup.
> >
> > How can a blocked task hit sys_exit()? Shouldn't it be RUNNING?
>
> Task was RUNNING, hits schedule() after passing through sys_enter(). this marks it BLOCKED. Task wakes again and proceeds to sys_exit(), at which point it's marked RUNNABLE and put on the runnable list. After which it'll kick the server to process said list.
>

Ah, you are talking about sys_exit hook; sorry, I thought you talked about the exit() syscall.

[...]

> Well, that's *your* use-case. I'm fairly sure there's more people that want to use this thing.
>
> > multiple priorities and work isolation: these are easy to address directly with a scheduler that has a global view rather than multiple per-cpu/per-server schedulers/queues that try to coordinate.
>
> You can trivially create this, even if the underlying thing is per-server. Simply have a lock and shared data structure between the servers.
>
> Even in the kernel, it should be mostly trivial to create a global policy. The only tricky bit (in the kernel) is the whole affinity muck, but userspace doesn't *need* to do even that.
>
> > > LAZY enables that.. *however* it does need to wake the server when it is idle, otherwise they'll all sit there waiting for one another.
> >
> > If all servers are busy running workers, then it is not up to the kernel to "preempt" them in my model: the userspace can set up another thread/task to preempt a misbehaving worker, which will wake the server attached to it.
>
> So the way I'm seeing things is that the server *is* the 'CPU'. A UP machine cannot rely on another CPU to make preemption happen.
>
> Also, preemption is very much not about misbehaviour. Wakeup can cause a preemption event if the woken task is deemed higher priority than the current running one for example.
>
> And time based preemption is definitely also a thing wrt resource distribution.
>
> > But in practice there are always workers blocking in the kernel, which wakes their servers, which then reap the woken/runnable workers list, so well-behaving code does not need this.
>
> This seems to discount pure computational workloads.
>
> > And so we need to figure out this high-level thing first: do we go with the per-server worker queues/lists, or do we go with the approach I use in my patchset? It seems to me that the kernel-side code in my patchset is not more complicated than your patchset is shaping up to be, and some things are actually easier to accomplish, like having a single idle_server_ptr vs this LAZY and/or server "preemption" behavior that you have.
> >
> > Again, I'm OK with having it your way if all needed features are covered, but I think we should be explicit about why per-server/per-cpu model is chosen vs the one I proposed, especially as it seems the kernel side code is not really simpler in the end.
>
> So I went with a UP first approach. I made single server preemption driven scheduling work first (both tick and wakeup-preemption are supported).

I agree that the UP approach is better than the LAZY one if we have per-server/per-cpu worker queues.

> The whole LAZY thing is only meant to supress some of that (notably wakeup preemption), but you're right in that it's not really nice. I got it working, but I'm not particularly happy with it either.
>
> Having the sys_enter/sys_exit hooks also made the page pins short lived, and signals much simpler to handle. You're destroying signals IIUC.
>
> So I see no fundamental reason why userspace cannot do something like:
>
> 	struct umcg_task *current = NULL;
>
> 	for (;;) {
> 		self->state = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT;
>
> 		runnable_ptr = (void *)__atomic_exchange_n(&self->runnable_workers_ptr,
> 							   NULL, __ATOMIC_SEQ_CST);
>
> 		pthread_mutex_lock(&global_queue.lock);
> 		while (runnable_ptr) {
> 			next = (void *)runnable_ptr->runnable_workers_ptr;
> 			enqueue_task(&global_queue, runnable_ptr);
> 			runnable_ptr = next;
> 		}
>
> 		/* complicated bit about current already running goes here */
>
> 		current = pick_task(&global_queue);
> 		self->next_tid = current ? current->tid : 0;
> unlock:
> 		pthread_mutex_unlock(&global_queue.lock);
>
> 		ret = sys_umcg_wait(0, 0);
>
> 		pthread_mutex_lock(&global_queue.lock);
> 		/* umcg_wait() didn't switch, make sure to return the task */
> 		if (self->next_tid) {
> 			enqueue_task(&global_queue, current);
> 			current = NULL;
> 		}
> 		pthread_mutex_unlock(&global_queue.lock);
>
> 		// do something with @ret
> 	}
>
> to get global scheduling and all the contention^Wgoodness related to it. Except, of course, it's more complicated, but I think the idea's clear enough.

Let me spend some time and see if I can make all of this work together beyond simple tests. With the upcoming holidays and some other things I am busy with, this may take more than a week, I'm afraid...
Peter,

On Wed, Dec 15 2021 at 15:26, Peter Oskolkov wrote:
> On Wed, Dec 15, 2021 at 2:25 PM Peter Zijlstra <peterz@infradead.org> wrote:
>> > - take a userspace (spin) lock (so the steps below are all within a single critical section):
>>
>> Don't ever suggest userspace spinlocks, they're horrible crap.
>
> This can easily be a mutex, not really important (although for very short critical sections with only memory reads/writes, like here, spin locks often perform better, in our experience).

Performance may be better, but user space spinlocks have a fundamental problem: they are prone to live locks. That's completely independent of the length of the critical section; it can even be empty.

There are ways to avoid that, but that needs a very careful design on the application/library level and at the system configuration level (priorities, affinities ...). And even then, there are trivial ways to break that, e.g. via CPU hotplug.

So no, for something of general use, they are a complete NONO. People who think they know what they are doing have the source and can replace them if they feel the need to do so.

Thanks,

        tglx