Message ID: 20230721105744.434742902@infradead.org
State: New
Series: futex: More futex2 bits
On Fri, Jul 21, 2023 at 12:22:48PM +0200, Peter Zijlstra wrote:

> @@ -217,32 +259,55 @@ static u64 get_inode_sequence_number(str
>   *
>   * lock_page() might sleep, the caller should not hold a spinlock.
>   */
> -int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
> +int get_futex_key(void __user *uaddr, unsigned int flags, union futex_key *key,
>  		  enum futex_access rw)
>  {
>  	unsigned long address = (unsigned long)uaddr;
>  	struct mm_struct *mm = current->mm;
>  	struct page *page, *tail;
>  	struct address_space *mapping;
> +	int node, err, size, ro = 0;
>  	bool fshared;
>
>  	fshared = flags & FLAGS_SHARED;
> +	size = futex_size(flags);
>
>  	/*
>  	 * The futex address must be "naturally" aligned.
>  	 */
>  	key->both.offset = address % PAGE_SIZE;
> +	if (unlikely((address % size) != 0))
>  		return -EINVAL;

This enforces u32 alignment for:

	struct futex_numa_32 {
		u32 val;
		u32 node;
	};

Or do we want to enforce u64 alignment for that?

>  	address -= key->both.offset;
>
> +	if (flags & FLAGS_NUMA)
> +		size *= 2;
> +
> +	if (unlikely(!access_ok(uaddr, size)))
>  		return -EFAULT;
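For readers skimming the thread, a stand-alone sketch of the two alignment rules being debated here; the helper names are invented for illustration and nothing below is part of the patch:

	#include <stdint.h>
	#include <stdbool.h>

	struct futex_numa_32 {
		uint32_t val;
		uint32_t node;
	};

	/* Rule as posted: only the futex word itself must be naturally aligned. */
	static bool aligned_to_word(unsigned long address)
	{
		return (address % sizeof(uint32_t)) == 0;
	}

	/*
	 * Alternative being asked about: align to the full two-word object,
	 * which also guarantees val and node never straddle a page boundary.
	 */
	static bool aligned_to_pair(unsigned long address)
	{
		return (address % sizeof(struct futex_numa_32)) == 0;
	}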
On Fri, Jul 21 2023 at 12:22, Peter Zijlstra wrote:
>  struct futex_hash_bucket *futex_hash(union futex_key *key)
>  {
> -	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
> +	u32 hash = jhash2((u32 *)key,
> +			  offsetof(typeof(*key), both.offset) / sizeof(u32),
>  			  key->both.offset);
> +	int node = key->both.node;
>
> -	return &futex_queues[hash & (futex_hashsize - 1)];
> +	if (node == -1) {
> +		/*
> +		 * In case of !FLAGS_NUMA, use some unused hash bits to pick a
> +		 * node -- this ensures regular futexes are interleaved across
> +		 * the nodes and avoids having to allocate multiple
> +		 * hash-tables.
> +		 *
> +		 * NOTE: this isn't perfectly uniform, but it is fast and
> +		 * handles sparse node masks.
> +		 */
> +		node = (hash >> futex_hashshift) % nr_node_ids;

Is nr_node_ids guaranteed to be stable after init? It's marked
__read_mostly, but not __ro_after_init.

> +		if (!node_possible(node)) {
> +			node = find_next_bit_wrap(node_possible_map.bits,
> +						  nr_node_ids, node);
> +		}
> +	}
> +
> +	return &futex_queues[node][hash & (futex_hashsize - 1)];
>  }

>  	fshared = flags & FLAGS_SHARED;
> +	size = futex_size(flags);
>
>  	/*
>  	 * The futex address must be "naturally" aligned.
>  	 */
>  	key->both.offset = address % PAGE_SIZE;
> -	if (unlikely((address % sizeof(u32)) != 0))
> +	if (unlikely((address % size) != 0))
>  		return -EINVAL;

Hmm. Shouldn't that have changed with the allowance of the 1 and 2 byte
futexes?

>  	address -= key->both.offset;
>
> -	if (unlikely(!access_ok(uaddr, sizeof(u32))))
> +	if (flags & FLAGS_NUMA)
> +		size *= 2;
> +
> +	if (unlikely(!access_ok(uaddr, size)))
>  		return -EFAULT;
>
>  	if (unlikely(should_fail_futex(fshared)))
>  		return -EFAULT;
>
> +	key->both.node = -1;

Please put this into an else path.

> +	if (flags & FLAGS_NUMA) {
> +		void __user *naddr = uaddr + size/2;

  size / 2;

> +
> +		if (futex_get_value(&node, naddr, flags))
> +			return -EFAULT;
> +
> +		if (node == -1) {
> +			node = numa_node_id();
> +			if (futex_put_value(node, naddr, flags))
> +				return -EFAULT;
> +		}
> +
> +		if (node >= MAX_NUMNODES || !node_possible(node))
> +			return -EINVAL;

That's clearly an else path too. No point in checking whether
numa_node_id() is valid.

> +		key->both.node = node;
> +	}
>
> +static inline unsigned int futex_size(unsigned int flags)
> +{
> +	unsigned int size = flags & FLAGS_SIZE_MASK;
> +	return 1 << size; /* {0,1,2,3} -> {1,2,4,8} */
> +}
> +
>  static inline bool futex_flags_valid(unsigned int flags)
>  {
>  	/* Only 64bit futexes for 64bit code */
> @@ -77,13 +83,19 @@ static inline bool futex_flags_valid(uns
>  	if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
>  		return false;
>
> -	return true;
> -}
> +	/*
> +	 * Must be able to represent both NUMA_NO_NODE and every valid nodeid
> +	 * in a futex word.
> +	 */
> +	if (flags & FLAGS_NUMA) {
> +		int bits = 8 * futex_size(flags);
> +		u64 max = ~0ULL;
> +		max >>= 64 - bits;

Your newline key is broken, right?

> +		if (nr_node_ids >= max)
> +			return false;
> +	}

Thanks,

        tglx
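As a stand-alone illustration of the size encoding and the node-id representability check quoted above, with kernel types replaced by standard ones; the FLAGS_SIZE_MASK value is assumed here for illustration only:

	#include <stdint.h>
	#include <stdbool.h>

	#define FLAGS_SIZE_MASK	0x3	/* assumed encoding: {0,1,2,3} -> {1,2,4,8} bytes */

	static unsigned int futex_size(unsigned int flags)
	{
		return 1 << (flags & FLAGS_SIZE_MASK);
	}

	/*
	 * A FUTEX2_NUMA futex word must hold the all-ones "no node" pattern
	 * plus every valid node id, so nr_node_ids has to stay below the
	 * maximum value representable in that word (255 for an 8-bit futex).
	 */
	static bool numa_node_fits(unsigned int flags, unsigned int nr_node_ids)
	{
		unsigned int bits = 8 * futex_size(flags);
		uint64_t max = ~0ULL >> (64 - bits);

		return nr_node_ids < max;
	}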
On Mon, Jul 31, 2023 at 07:36:21PM +0200, Thomas Gleixner wrote: > On Fri, Jul 21 2023 at 12:22, Peter Zijlstra wrote: > > struct futex_hash_bucket *futex_hash(union futex_key *key) > > { > > - u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, > > + u32 hash = jhash2((u32 *)key, > > + offsetof(typeof(*key), both.offset) / sizeof(u32), > > key->both.offset); > > + int node = key->both.node; > > > > - return &futex_queues[hash & (futex_hashsize - 1)]; > > + if (node == -1) { > > + /* > > + * In case of !FLAGS_NUMA, use some unused hash bits to pick a > > + * node -- this ensures regular futexes are interleaved across > > + * the nodes and avoids having to allocate multiple > > + * hash-tables. > > + * > > + * NOTE: this isn't perfectly uniform, but it is fast and > > + * handles sparse node masks. > > + */ > > + node = (hash >> futex_hashshift) % nr_node_ids; > > Is nr_node_ids guaranteed to be stable after init? It's marked > __read_mostly, but not __ro_after_init. AFAICT it is only ever written to in setup_nr_node_ids() and that is all __init code. So I'm thinking this could/should indeed be __ro_after_init. Esp. so since it is an exported variable. Mike? > > + if (!node_possible(node)) { > > + node = find_next_bit_wrap(node_possible_map.bits, > > + nr_node_ids, node); > > + } > > + } > > + > > + return &futex_queues[node][hash & (futex_hashsize - 1)]; > > } > > fshared = flags & FLAGS_SHARED; > > + size = futex_size(flags); > > > > /* > > * The futex address must be "naturally" aligned. > > */ > > key->both.offset = address % PAGE_SIZE; > > - if (unlikely((address % sizeof(u32)) != 0)) > > + if (unlikely((address % size) != 0)) > > return -EINVAL; > > Hmm. Shouldn't that have changed with the allowance of the 1 and 2 byte > futexes? That patches comes after this.. :-) But I do have an open question here; do we want FUTEX2_NUMA futexes aligned at futex_size or double that? That is, what do we want the alignment of: struct futex_numa_32 { u32 val; u32 node; }; to be? Having that u64 aligned will guarantee these two values end up in the same page, having them u32 aligned (as per this patch) allows for them to be split. The current paths don't care, we don't hold locks, but perhaps it makes sense to be conservative. > > address -= key->both.offset; > > > > - if (unlikely(!access_ok(uaddr, sizeof(u32)))) > > + if (flags & FLAGS_NUMA) > > + size *= 2; > > + > > + if (unlikely(!access_ok(uaddr, size))) > > return -EFAULT; > > > > if (unlikely(should_fail_futex(fshared))) > > return -EFAULT; > > > > + key->both.node = -1; > > Please put this into an else path. Can do, but I figured the compiler could figure it out through dead store elimitation or somesuch pass. > > + if (flags & FLAGS_NUMA) { > > + void __user *naddr = uaddr + size/2; > > size / 2; > > > + > > + if (futex_get_value(&node, naddr, flags)) > > + return -EFAULT; > > + > > + if (node == -1) { > > + node = numa_node_id(); > > + if (futex_put_value(node, naddr, flags)) > > + return -EFAULT; > > + } > > + > > + if (node >= MAX_NUMNODES || !node_possible(node)) > > + return -EINVAL; > > That's clearly an else path too. No point in checking whether > numa_node_id() is valid. No, this also checks if the value we read from userspace is valid. Only when the value we read from userspace is -1 do we set numa_node_id(), otherwise we take the value as read, which then must be a valid value. 
> > + key->both.node = node; > > + } > > > > +static inline unsigned int futex_size(unsigned int flags) > > +{ > > + unsigned int size = flags & FLAGS_SIZE_MASK; > > + return 1 << size; /* {0,1,2,3} -> {1,2,4,8} */ > > +} > > + > > static inline bool futex_flags_valid(unsigned int flags) > > { > > /* Only 64bit futexes for 64bit code */ > > @@ -77,13 +83,19 @@ static inline bool futex_flags_valid(uns > > if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) > > return false; > > > > - return true; > > -} > > + /* > > + * Must be able to represent both NUMA_NO_NODE and every valid nodeid > > + * in a futex word. > > + */ > > + if (flags & FLAGS_NUMA) { > > + int bits = 8 * futex_size(flags); > > + u64 max = ~0ULL; > > + max >>= 64 - bits; > Your newline key is broken, right? Yes :-) > > + if (nr_node_ids >= max) > > + return false; > > + }
On Mon, Jul 31 2023 at 20:03, Peter Zijlstra wrote: > On Mon, Jul 31, 2023 at 07:36:21PM +0200, Thomas Gleixner wrote: >> Hmm. Shouldn't that have changed with the allowance of the 1 and 2 byte >> futexes? > > That patches comes after this.. :-) Futexes are really cursed :) > But I do have an open question here; do we want FUTEX2_NUMA futexes > aligned at futex_size or double that? That is, what do we want the > alignment of: > > struct futex_numa_32 { > u32 val; > u32 node; > }; > > to be? Having that u64 aligned will guarantee these two values end up in > the same page, having them u32 aligned (as per this patch) allows for > them to be split. Same page and same cacheline. > The current paths don't care, we don't hold locks, but perhaps it makes > sense to be conservative. I think it makes sense. >> > address -= key->both.offset; >> > >> > - if (unlikely(!access_ok(uaddr, sizeof(u32)))) >> > + if (flags & FLAGS_NUMA) >> > + size *= 2; >> > + >> > + if (unlikely(!access_ok(uaddr, size))) >> > return -EFAULT; >> > >> > if (unlikely(should_fail_futex(fshared))) >> > return -EFAULT; >> > >> > + key->both.node = -1; >> >> Please put this into an else path. > > Can do, but I figured the compiler could figure it out through dead > store elimitation or somesuch pass. Sure, but taste disagrees and it simply makes the code more obvious. >> > + if (flags & FLAGS_NUMA) { >> > + void __user *naddr = uaddr + size/2; >> >> size / 2; >> >> > + >> > + if (futex_get_value(&node, naddr, flags)) >> > + return -EFAULT; >> > + >> > + if (node == -1) { >> > + node = numa_node_id(); >> > + if (futex_put_value(node, naddr, flags)) >> > + return -EFAULT; >> > + } >> > + >> > + if (node >= MAX_NUMNODES || !node_possible(node)) >> > + return -EINVAL; >> >> That's clearly an else path too. No point in checking whether >> numa_node_id() is valid. > > No, this also checks if the value we read from userspace is valid. > > Only when the value we read from userspace is -1 do we set > numa_node_id(), otherwise we take the value as read, which then must be > a valid value. Right, but: if (node == -1) { node = numa_node_id(); if (futex_put_value(node, naddr, flags)) return -EFAULT; } else if (node >= MAX_NUMNODES || !node_possible(node)) { return -EINVAL; } makes it clear that the path where @node read from user space is != -1 needs to be validated, while your version checks the result of node = numa_node_id(); too, which does not make sense to me. Yes, it works, but ... Thanks, tglx
On Mon, 31 Jul 2023, Peter Zijlstra wrote:

>> Is nr_node_ids guaranteed to be stable after init? It's marked
>> __read_mostly, but not __ro_after_init.
>
> AFAICT it is only ever written to in setup_nr_node_ids() and that is all
> __init code. So I'm thinking this could/should indeed be
> __ro_after_init. Esp. so since it is an exported variable.
>
> Mike?

It's stable, and lots of other components depend on it, like f.e. the size
of cpumasks.
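If the annotation is indeed safe, the change under discussion would amount to something like the following on the mm side (a sketch only; the actual definition lives in mm code and may differ in detail):

	#include <linux/cache.h>
	#include <linux/export.h>
	#include <linux/nodemask.h>

	/* Written once from __init code (setup_nr_node_ids()), read-only afterwards. */
	unsigned int nr_node_ids __ro_after_init = MAX_NUMNODES;
	EXPORT_SYMBOL(nr_node_ids);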
On Fri, 21 Jul 2023, Peter Zijlstra wrote:

> Extend the futex2 interface to be numa aware.

Sorry to be chiming in this late, but it seems that this is useful to
mitigate NUMA issues also for our platform.

> When FUTEX2_NUMA is not set, the node is simply an extension of the
> hash, such that traditional futexes are still interleaved over the
> nodes.

Could we follow NUMA policies like with other metadata allocations during
system call processing? If there is no NUMA task policy then the futex
should be placed on the local NUMA node.

That way the placement of the futex can be controlled by the task's memory
policy. We could skip the FUTEX2_NUMA option.

> @@ -114,10 +137,29 @@ late_initcall(fail_futex_debugfs);
>   */
>  struct futex_hash_bucket *futex_hash(union futex_key *key)
>  {
> -	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
> +	u32 hash = jhash2((u32 *)key,
> +			  offsetof(typeof(*key), both.offset) / sizeof(u32),
>  			  key->both.offset);
> +	int node = key->both.node;
>
> -	return &futex_queues[hash & (futex_hashsize - 1)];
> +	if (node == -1) {
> +		/*
> +		 * In case of !FLAGS_NUMA, use some unused hash bits to pick a
> +		 * node -- this ensures regular futexes are interleaved across
> +		 * the nodes and avoids having to allocate multiple
> +		 * hash-tables.
> +		 *
> +		 * NOTE: this isn't perfectly uniform, but it is fast and
> +		 * handles sparse node masks.
> +		 */
> +		node = (hash >> futex_hashshift) % nr_node_ids;
> +		if (!node_possible(node)) {
> +			node = find_next_bit_wrap(node_possible_map.bits,
> +						  nr_node_ids, node);
> +		}

Use memory allocation policies here instead?
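For comparison with that suggestion, this is roughly what the hash-bit scheme quoted above does, rewritten as a stand-alone function; the possible-node bitmap handling is simplified and is not the kernel's nodemask API:

	#include <stdint.h>

	static int pick_node_from_hash(uint32_t hash, unsigned int hashshift,
				       unsigned int nr_node_ids, uint64_t possible_mask)
	{
		/*
		 * Use hash bits above the bucket index, so the node choice is
		 * deterministic and WAIT and WAKE always agree on it.
		 */
		int node = (hash >> hashshift) % nr_node_ids;

		/* Wrap forward to the next possible node when the mask is sparse. */
		while (!(possible_mask & (1ULL << node)))
			node = (node + 1) % nr_node_ids;

		return node;
	}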
On Wed, Jun 12, 2024 at 10:23:00AM -0700, Christoph Lameter (Ampere) wrote:
> On Fri, 21 Jul 2023, Peter Zijlstra wrote:
>
> > Extend the futex2 interface to be numa aware.
>
> Sorry to be chiming in this late, but it seems that this is useful to
> mitigate NUMA issues also for our platform.

I read this like: I tested it and it works for me. Is that a correct
reading of your statement?

If so, I'll look at bumping this on the priority list and I'll look at the
placement suggestion you had when I respin the patches.

Thanks!
On Wed, Jun 12, 2024 at 10:23:00AM -0700, Christoph Lameter (Ampere) wrote:

> > When FUTEX2_NUMA is not set, the node is simply an extension of the
> > hash, such that traditional futexes are still interleaved over the
> > nodes.
>
> Could we follow NUMA policies like with other metadata allocations during
> system call processing?

I had a quick look at this, and since the mempolicy stuff is per vma, and
we don't have the vma, this is going to be terribly expensive -- mmap_lock
and all that.

Once lockless vma lookups land (soonish, perhaps), this could be
reconsidered. But for now there just isn't a sane way to do this.

Using memory policies is probably okay -- but still risky, since you get
the extra failure case where if you change the mempolicy between WAIT and
WAKE things will not match and sadness happens, but that *SHOULD*
hopefully not happen a lot. Mempolicies are typically fairly static.

> If there is no NUMA task policy then the futex should be placed on the
> local NUMA node. That way the placement of the futex can be controlled
> by the task's memory policy. We could skip the FUTEX2_NUMA option.

That doesn't work. If we don't have storage for the node across WAIT/WAKE,
then the node must be deterministic per futex_hash(). Otherwise wake has
no chance of finding the entry.

Consider our random unbound task with no policies etc. (default state)
doing FUTEX_WAIT and going to sleep while on node-0; its sibling thread,
which happens to run on node-1, issues FUTEX_WAKE. If they disagree on
determining 'node', then they will not find a match and the wakeup doesn't
happen and userspace gets really sad.

The current scheme where we determine node based on hash bits is fully
deterministic and WAIT/WAKE will agree on which node-hash to use. The
interleave is no worse than the global hash today -- OTOH it also isn't
better.
Sorry, saw this after the other email.

On Fri, 25 Oct 2024, Peter Zijlstra wrote:

> > Could we follow NUMA policies like with other metadata allocations during
> > system call processing?
>
> I had a quick look at this, and since the mempolicy stuff is per vma,
> and we don't have the vma, this is going to be terribly expensive --
> mmap_lock and all that.

There is a memory policy for the task as a whole that is used for slab
allocations and allocations that are not vma bound in current->mempolicy.
Use that.

> Using memory policies is probably okay -- but still risky, since you get
> the extra failure case where if you change the mempolicy between WAIT
> and WAKE things will not match and sadness happens, but that *SHOULD*
> hopefully not happen a lot. Mempolicies are typically fairly static.

Right.

> > That way the placement of the futex can be controlled by the task's memory
> > policy. We could skip the FUTEX2_NUMA option.
>
> That doesn't work. If we don't have storage for the node across
> WAIT/WAKE, then the node must be deterministic per futex_hash().
> Otherwise wake has no chance of finding the entry.

You can get a node number following the current task's mempolicy by
calling mempolicy_slab_node() and keep using that node for the future.

It is also possible to check if the policy is interleave and then follow
the distributed hash scheme.

> The current scheme where we determine node based on hash bits is fully
> deterministic and WAIT/WAKE will agree on which node-hash to use. The
> interleave is no worse than the global hash today -- OTOH it also isn't
> better.

This is unexpected, strange behavior for those familiar with NUMA. We have
tools to set memory policies for tasks and those policies should be used
throughout.
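A rough sketch of what that suggestion might look like, assuming mempolicy_slab_node() and current->mempolicy are usable in this context; it is untested and keeps the hash-based spread for interleave policies as described above. Note it still carries the WAIT/WAKE consistency caveat raised earlier: waiter and waker only agree if their policies agree.

	#include <linux/mempolicy.h>
	#include <linux/nodemask.h>
	#include <linux/sched.h>

	static int futex_policy_node(u32 hash, unsigned int hashshift)
	{
		struct mempolicy *pol = current->mempolicy;

		/* Interleave policy: fall back to the deterministic hash-based spread. */
		if (pol && pol->mode == MPOL_INTERLEAVE)
			return (hash >> hashshift) % nr_node_ids;

		/* Otherwise follow the task policy, like slab allocations do. */
		return mempolicy_slab_node();
	}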
On Fri, Oct 25, 2024 at 12:36:28PM -0700, Christoph Lameter (Ampere) wrote:
>
> Sorry, saw this after the other email.
>
> On Fri, 25 Oct 2024, Peter Zijlstra wrote:
>
> > > Could we follow NUMA policies like with other metadata allocations during
> > > system call processing?
> >
> > I had a quick look at this, and since the mempolicy stuff is per vma,
> > and we don't have the vma, this is going to be terribly expensive --
> > mmap_lock and all that.
>
> There is a memory policy for the task as a whole that is used for slab
> allocations and allocations that are not vma bound in current->mempolicy.
> Use that.

> You can get a node number following the current task's mempolicy by calling
> mempolicy_slab_node() and keep using that node for the future.

I'll look into the per task thing, which I'm hoping means per-process. We
need something that is mm wide consistent.

But since futexes play in the address space, I was really rather thinking
we ought to use the vma policy.
On Sat, 26 Oct 2024, Peter Zijlstra wrote:

> I'll look into the per task thing, which I'm hoping means per-process.
> We need something that is mm wide consistent.

Each thread can modify its policy, and that is used f.e. to control memory
allocations for syscalls. For example, a thread wants to allocate kernel
metadata on a specific node: the policy would be set to that node, the
syscall is done, and then the task resets the policy to the default.

mm wide memory policies are set at a VMA level and are associated with
addresses.

> But since futexes play in the address space, I was really rather
> thinking we ought to use the vma policy.

If they are associated with an address then you can use the address space
policy.
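In userspace terms, the per-thread pattern described above looks roughly like this (error handling omitted; the target node is an arbitrary example):

	#include <numaif.h>

	static void do_syscall_with_metadata_on(int node)
	{
		unsigned long nodemask = 1UL << node;

		/* Bind this thread's allocations (including kernel-side metadata) to 'node'. */
		set_mempolicy(MPOL_BIND, &nodemask, 8 * sizeof(nodemask));

		/* ... issue the syscall whose kernel allocations should land on 'node' ... */

		/* Restore the default policy for subsequent allocations. */
		set_mempolicy(MPOL_DEFAULT, NULL, 0);
	}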
--- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -34,6 +34,7 @@ union futex_key { u64 i_seq; unsigned long pgoff; unsigned int offset; + /* unsigned int node; */ } shared; struct { union { @@ -42,11 +43,13 @@ union futex_key { }; unsigned long address; unsigned int offset; + /* unsigned int node; */ } private; struct { u64 ptr; unsigned long word; unsigned int offset; + unsigned int node; /* NOT hashed! */ } both; }; --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -34,7 +34,8 @@ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> -#include <linux/memblock.h> +#include <linux/gfp.h> +#include <linux/vmalloc.h> #include <linux/fault-inject.h> #include <linux/slab.h> @@ -47,12 +48,14 @@ * reside in the same cacheline. */ static struct { - struct futex_hash_bucket *queues; unsigned long hashsize; + unsigned int hashshift; + struct futex_hash_bucket *queues[MAX_NUMNODES]; } __futex_data __read_mostly __aligned(2*sizeof(long)); -#define futex_queues (__futex_data.queues) -#define futex_hashsize (__futex_data.hashsize) +#define futex_hashsize (__futex_data.hashsize) +#define futex_hashshift (__futex_data.hashshift) +#define futex_queues (__futex_data.queues) /* * Fault injections for futexes. @@ -105,6 +108,26 @@ late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAIL_FUTEX */ +static int futex_get_value(u32 *val, u32 __user *from, unsigned int flags) +{ + switch (futex_size(flags)) { + case 1: return __get_user(*val, (u8 __user *)from); + case 2: return __get_user(*val, (u16 __user *)from); + case 4: return __get_user(*val, (u32 __user *)from); + default: BUG(); + } +} + +static int futex_put_value(u32 val, u32 __user *to, unsigned int flags) +{ + switch (futex_size(flags)) { + case 1: return __put_user(val, (u8 __user *)to); + case 2: return __put_user(val, (u16 __user *)to); + case 4: return __put_user(val, (u32 __user *)to); + default: BUG(); + } +} + /** * futex_hash - Return the hash bucket in the global hash * @key: Pointer to the futex key for which the hash is calculated @@ -114,10 +137,29 @@ late_initcall(fail_futex_debugfs); */ struct futex_hash_bucket *futex_hash(union futex_key *key) { - u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, + u32 hash = jhash2((u32 *)key, + offsetof(typeof(*key), both.offset) / sizeof(u32), key->both.offset); + int node = key->both.node; - return &futex_queues[hash & (futex_hashsize - 1)]; + if (node == -1) { + /* + * In case of !FLAGS_NUMA, use some unused hash bits to pick a + * node -- this ensures regular futexes are interleaved across + * the nodes and avoids having to allocate multiple + * hash-tables. + * + * NOTE: this isn't perfectly uniform, but it is fast and + * handles sparse node masks. + */ + node = (hash >> futex_hashshift) % nr_node_ids; + if (!node_possible(node)) { + node = find_next_bit_wrap(node_possible_map.bits, + nr_node_ids, node); + } + } + + return &futex_queues[node][hash & (futex_hashsize - 1)]; } @@ -217,32 +259,55 @@ static u64 get_inode_sequence_number(str * * lock_page() might sleep, the caller should not hold a spinlock. 
*/ -int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, +int get_futex_key(void __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page, *tail; struct address_space *mapping; - int err, ro = 0; + int node, err, size, ro = 0; bool fshared; fshared = flags & FLAGS_SHARED; + size = futex_size(flags); /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; - if (unlikely((address % sizeof(u32)) != 0)) + if (unlikely((address % size) != 0)) return -EINVAL; address -= key->both.offset; - if (unlikely(!access_ok(uaddr, sizeof(u32)))) + if (flags & FLAGS_NUMA) + size *= 2; + + if (unlikely(!access_ok(uaddr, size))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; + key->both.node = -1; + if (flags & FLAGS_NUMA) { + void __user *naddr = uaddr + size/2; + + if (futex_get_value(&node, naddr, flags)) + return -EFAULT; + + if (node == -1) { + node = numa_node_id(); + if (futex_put_value(node, naddr, flags)) + return -EFAULT; + } + + if (node >= MAX_NUMNODES || !node_possible(node)) + return -EINVAL; + + key->both.node = node; + } + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -1125,27 +1190,42 @@ void futex_exit_release(struct task_stru static int __init futex_init(void) { - unsigned int futex_shift; - unsigned long i; + unsigned int order, n; + unsigned long size, i; #if CONFIG_BASE_SMALL futex_hashsize = 16; #else - futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + futex_hashsize = 256 * num_possible_cpus(); + futex_hashsize /= num_possible_nodes(); + futex_hashsize = roundup_pow_of_two(futex_hashsize); #endif + futex_hashshift = ilog2(futex_hashsize); + size = sizeof(struct futex_hash_bucket) * futex_hashsize; + order = get_order(size); + + for_each_node(n) { + struct futex_hash_bucket *table; + + if (order > MAX_ORDER) + table = vmalloc_huge_node(size, GFP_KERNEL, n); + else + table = alloc_pages_exact_nid(n, size, GFP_KERNEL); + + BUG_ON(!table); + + for (i = 0; i < futex_hashsize; i++) { + atomic_set(&table[i].waiters, 0); + spin_lock_init(&table[i].lock); + plist_head_init(&table[i].chain); + } - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, - futex_hashsize < 256 ? 
HASH_SMALL : 0, - &futex_shift, NULL, - futex_hashsize, futex_hashsize); - futex_hashsize = 1UL << futex_shift; - - for (i = 0; i < futex_hashsize; i++) { - atomic_set(&futex_queues[i].waiters, 0); - plist_head_init(&futex_queues[i].chain); - spin_lock_init(&futex_queues[i].lock); + futex_queues[n] = table; } + pr_info("futex hash table, %d nodes, %ld entries (order: %d, %lu bytes)\n", + num_possible_nodes(), + futex_hashsize, order, + sizeof(struct futex_hash_bucket) * futex_hashsize); return 0; } --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -65,6 +65,12 @@ static inline unsigned int futex2_to_fla return flags; } +static inline unsigned int futex_size(unsigned int flags) +{ + unsigned int size = flags & FLAGS_SIZE_MASK; + return 1 << size; /* {0,1,2,3} -> {1,2,4,8} */ +} + static inline bool futex_flags_valid(unsigned int flags) { /* Only 64bit futexes for 64bit code */ @@ -77,13 +83,19 @@ static inline bool futex_flags_valid(uns if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; - return true; -} + /* + * Must be able to represent both NUMA_NO_NODE and every valid nodeid + * in a futex word. + */ + if (flags & FLAGS_NUMA) { + int bits = 8 * futex_size(flags); + u64 max = ~0ULL; + max >>= 64 - bits; + if (nr_node_ids >= max) + return false; + } -static inline unsigned int futex_size(unsigned int flags) -{ - unsigned int size = flags & FLAGS_SIZE_MASK; - return 1 << size; /* {0,1,2,3} -> {1,2,4,8} */ + return true; } static inline bool futex_validate_input(unsigned int flags, u64 val) @@ -182,7 +194,7 @@ enum futex_access { FUTEX_WRITE }; -extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, +extern int get_futex_key(void __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); extern struct hrtimer_sleeper * --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -179,7 +179,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uad return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } -#define FUTEX2_MASK (FUTEX2_64 | FUTEX2_PRIVATE) +#define FUTEX2_MASK (FUTEX2_64 | FUTEX2_NUMA | FUTEX2_PRIVATE) /** * futex_parse_waitv - Parse a waitv array from userspace
Extend the futex2 interface to be numa aware.

When FUTEX2_NUMA is specified for a futex, the user value is extended to
two words (of the same size). The first is the user value we all know, the
second one will be the node to place this futex on.

	struct futex_numa_32 {
		u32 val;
		u32 node;
	};

When node is set to ~0, WAIT will set it to the current node_id such that
WAKE knows where to find it. If userspace corrupts the node value between
WAIT and WAKE, the futex will not be found and no wakeup will happen.

When FUTEX2_NUMA is not set, the node is simply an extension of the hash,
such that traditional futexes are still interleaved over the nodes. This
is done to avoid having to have a separate !numa hash-table.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/futex.h   |    3 +
 kernel/futex/core.c     |  128 +++++++++++++++++++++++++++++++++++++++---------
 kernel/futex/futex.h    |   26 +++++++--
 kernel/futex/syscalls.c |    2
 4 files changed, 127 insertions(+), 32 deletions(-)
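From the userspace side, the layout described in this changelog would be used roughly as follows; this is an illustration only, and the syscall invocation itself is omitted since the futex2 entry points are still being shaped in this series:

	#include <stdint.h>

	struct futex_numa_32 {
		uint32_t val;	/* the ordinary futex value */
		uint32_t node;	/* node id, or ~0u to have WAIT pick the current node */
	};

	static void futex_numa_init(struct futex_numa_32 *f, uint32_t initial)
	{
		f->val  = initial;
		f->node = ~0u;	/* kernel stores numa_node_id() here on the first WAIT */
	}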