| Message ID | 1344961490-4068-3-git-send-email-levinsasha928@gmail.com (mailing list archive) |
| --- | --- |
| State | New, archived |
Sasha Levin <levinsasha928@gmail.com> writes:

> Switch user_ns to use the new hashtable implementation. This reduces the amount of
> generic unrelated code in user_ns.

Two concerns here.

1) When adding a new entry you recompute the hash where previously that
   was not done.  I believe that will slow down adding of new entries.

2) Using hash_32 for uids is an interesting choice.  hash_32 discards
   the low bits.  Last I checked for uids the low bits were the bits
   that were most likely to be different and had the most entropy.

   I'm not certain how multiplying by the GOLDEN_RATIO_PRIME_32 will
   affect things but I would be surprised if it shifted all of the
   randomness from the low bits to the high bits.

And just a nit.  struct user is essentially orthogonal to the user
namespace at this point, making the description of the patch a little
weird.

Eric

> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> ---
>  kernel/user.c |   33 +++++++++++++--------------------
>  1 files changed, 13 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/user.c b/kernel/user.c
> index b815fef..d10c484 100644
> --- a/kernel/user.c
> +++ b/kernel/user.c
> @@ -16,6 +16,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/export.h>
>  #include <linux/user_namespace.h>
> +#include <linux/hashtable.h>
>
>  /*
>   * userns count is 1 for root user, 1 for init_uts_ns,
> @@ -52,13 +53,9 @@ EXPORT_SYMBOL_GPL(init_user_ns);
>   */
>
>  #define UIDHASH_BITS  (CONFIG_BASE_SMALL ? 3 : 7)
> -#define UIDHASH_SZ    (1 << UIDHASH_BITS)
> -#define UIDHASH_MASK  (UIDHASH_SZ - 1)
> -#define __uidhashfn(uid)      (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
> -#define uidhashentry(uid)     (uidhash_table + __uidhashfn((__kuid_val(uid))))
>
>  static struct kmem_cache *uid_cachep;
> -struct hlist_head uidhash_table[UIDHASH_SZ];
> +static DEFINE_HASHTABLE(uidhash_table, UIDHASH_BITS)
>
>  /*
>   * The uidhash_lock is mostly taken from process context, but it is
> @@ -84,22 +81,22 @@ struct user_struct root_user = {
>  /*
>   * These routines must be called with the uidhash spinlock held!
>   */
> -static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
> +static void uid_hash_insert(struct user_struct *up)
>  {
> -        hlist_add_head(&up->uidhash_node, hashent);
> +        hash_add(uidhash_table, &up->uidhash_node, __kuid_val(up->uid));
>  }
>
>  static void uid_hash_remove(struct user_struct *up)
>  {
> -        hlist_del_init(&up->uidhash_node);
> +        hash_del(&up->uidhash_node);
>  }
>
> -static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
> +static struct user_struct *uid_hash_find(kuid_t uid)
>  {
>          struct user_struct *user;
>          struct hlist_node *h;
>
> -        hlist_for_each_entry(user, h, hashent, uidhash_node) {
> +        hash_for_each_possible(uidhash_table, user, h, uidhash_node, __kuid_val(uid)) {
>                  if (uid_eq(user->uid, uid)) {
>                          atomic_inc(&user->__count);
>                          return user;
> @@ -135,7 +132,7 @@ struct user_struct *find_user(kuid_t uid)
>          unsigned long flags;
>
>          spin_lock_irqsave(&uidhash_lock, flags);
> -        ret = uid_hash_find(uid, uidhashentry(uid));
> +        ret = uid_hash_find(uid);
>          spin_unlock_irqrestore(&uidhash_lock, flags);
>          return ret;
>  }
> @@ -156,11 +153,10 @@ void free_uid(struct user_struct *up)
>
>  struct user_struct *alloc_uid(kuid_t uid)
>  {
> -        struct hlist_head *hashent = uidhashentry(uid);
>          struct user_struct *up, *new;
>
>          spin_lock_irq(&uidhash_lock);
> -        up = uid_hash_find(uid, hashent);
> +        up = uid_hash_find(uid);
>          spin_unlock_irq(&uidhash_lock);
>
>          if (!up) {
> @@ -176,13 +172,13 @@ struct user_struct *alloc_uid(kuid_t uid)
>           * on adding the same user already..
>           */
>          spin_lock_irq(&uidhash_lock);
> -        up = uid_hash_find(uid, hashent);
> +        up = uid_hash_find(uid);
>          if (up) {
>                  key_put(new->uid_keyring);
>                  key_put(new->session_keyring);
>                  kmem_cache_free(uid_cachep, new);
>          } else {
> -                uid_hash_insert(new, hashent);
> +                uid_hash_insert(new);
>                  up = new;
>          }
>          spin_unlock_irq(&uidhash_lock);
> @@ -196,17 +192,14 @@ out_unlock:
>
>  static int __init uid_cache_init(void)
>  {
> -        int n;
> -
>          uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
>                          0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
>
> -        for(n = 0; n < UIDHASH_SZ; ++n)
> -                INIT_HLIST_HEAD(uidhash_table + n);
> +        hash_init(uidhash_table);
>
>          /* Insert the root user immediately (init already runs as root) */
>          spin_lock_irq(&uidhash_lock);
> -        uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
> +        uid_hash_insert(&root_user);
>          spin_unlock_irq(&uidhash_lock);
>
>          return 0;
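
For context, the hash_32() helper being discussed is a multiplicative hash. The snippet below is a standalone paraphrase of the 3.x-era include/linux/hash.h definition (an approximation for illustration, not a verbatim copy of the kernel source); it shows why the low-bits concern arises: the bucket index comes from the top bits of the product.

#include <stdint.h>

/* Constant the kernel used for 32-bit hashing at the time:
 * 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
#define GOLDEN_RATIO_PRIME_32 0x9e370001UL

static inline uint32_t hash_32(uint32_t val, unsigned int bits)
{
        uint32_t hash = val * GOLDEN_RATIO_PRIME_32;

        /* Only the top 'bits' bits of the product are kept, so with
         * UIDHASH_BITS == 7 the bucket is picked from bits 31..25.
         * Whether the uid's low-bit entropy survives depends entirely
         * on how well the multiply mixes it up into those bits. */
        return hash >> (32 - bits);
}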
On 08/15/2012 01:52 AM, Eric W. Biederman wrote:
> Sasha Levin <levinsasha928@gmail.com> writes:
>
>> Switch user_ns to use the new hashtable implementation. This reduces the amount of
>> generic unrelated code in user_ns.
>
> Two concerns here.
>
> 1) When adding a new entry you recompute the hash where previously that
>    was not done.  I believe that will slow down adding of new entries.

I figured that the price for the extra hashing isn't significant since hash_32
is just a multiplication and a shift.

I'll modify the code to calculate the key just once.

> 2) Using hash_32 for uids is an interesting choice.  hash_32 discards
>    the low bits.  Last I checked for uids the low bits were the bits
>    that were most likely to be different and had the most entropy.
>
>    I'm not certain how multiplying by the GOLDEN_RATIO_PRIME_32 will
>    affect things but I would be surprised if it shifted all of the
>    randomness from the low bits to the high bits.

"Is hash_* good enough for our purpose?" - I was actually surprised that no one
raised that question during the RFC and assumed it was because everybody agreed
that it's indeed good enough.

I can offer the following: I'll write a small module that will hash 1...10000
into a hashtable which uses 7 bits (just like user_ns) and post the distribution
we'll get.

If the results of the above are satisfactory we can avoid the discussion about
which hash function we should really be using. If not, I guess now is a good
time for that :)
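
The test module itself is not included in this thread; a minimal userspace approximation of the experiment Sasha describes (running a uid range through the hash_32 formula into a 128-bucket table and printing the spread) could look like the following sketch. It is illustrative only, not the code he actually ran.

/* Build roughly: cc -O2 uidhash_dist.c -lm */
#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
#define UIDHASH_BITS 7
#define UIDHASH_SZ   (1 << UIDHASH_BITS)

static uint32_t hash_32(uint32_t val, unsigned int bits)
{
        return (uint32_t)(val * GOLDEN_RATIO_PRIME_32) >> (32 - bits);
}

int main(void)
{
        unsigned int buckets[UIDHASH_SZ] = { 0 };
        unsigned int uid, i, min, max;
        double avg = 0.0, var = 0.0;

        /* Change the range to 1-100, 1000-1100, etc. to mimic the other runs. */
        for (uid = 0; uid < 10000; uid++)
                buckets[hash_32(uid, UIDHASH_BITS)]++;

        min = max = buckets[0];
        for (i = 0; i < UIDHASH_SZ; i++) {
                avg += buckets[i];
                if (buckets[i] < min)
                        min = buckets[i];
                if (buckets[i] > max)
                        max = buckets[i];
        }
        avg /= UIDHASH_SZ;

        for (i = 0; i < UIDHASH_SZ; i++)
                var += (buckets[i] - avg) * (buckets[i] - avg);
        var /= UIDHASH_SZ;

        printf("avg %.4f  stddev %.4f  min %u  max %u\n",
               avg, sqrt(var), min, max);
        return 0;
}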
Sasha Levin <levinsasha928@gmail.com> writes:

> On 08/15/2012 01:52 AM, Eric W. Biederman wrote:
>> Sasha Levin <levinsasha928@gmail.com> writes:
>>
>>> Switch user_ns to use the new hashtable implementation. This reduces the amount of
>>> generic unrelated code in user_ns.
>>
>> Two concerns here.
>>
>> 1) When adding a new entry you recompute the hash where previously that
>>    was not done.  I believe that will slow down adding of new entries.
>
> I figured that the price for the extra hashing isn't significant since hash_32
> is just a multiplication and a shift.
>
> I'll modify the code to calculate the key just once.

Honestly I don't know either way, but it seemed a shame to give up a common
and trivial optimization.

>> 2) Using hash_32 for uids is an interesting choice.  hash_32 discards
>>    the low bits.  Last I checked for uids the low bits were the bits
>>    that were most likely to be different and had the most entropy.
>>
>>    I'm not certain how multiplying by the GOLDEN_RATIO_PRIME_32 will
>>    affect things but I would be surprised if it shifted all of the
>>    randomness from the low bits to the high bits.
>
> "Is hash_* good enough for our purpose?" - I was actually surprised that no one
> raised that question during the RFC and assumed it was because everybody agreed
> that it's indeed good enough.
>
> I can offer the following: I'll write a small module that will hash 1...10000
> into a hashtable which uses 7 bits (just like user_ns) and post the distribution
> we'll get.

That won't hurt.  I think 1-100 then 1000-1100 may actually be more
representative.  Not that I would mind seeing the larger range.
Especially since I am in the process of encouraging the use of more
uids.

> If the results of the above are satisfactory we can avoid the discussion about
> which hash function we should really be using. If not, I guess now is a good
> time for that :)

Yes.  A small empirical test sounds good.

Eric
On 08/15/2012 03:08 AM, Eric W. Biederman wrote:
>> I can offer the following: I'll write a small module that will hash 1...10000
>> into a hashtable which uses 7 bits (just like user_ns) and post the distribution
>> we'll get.
>
> That won't hurt.  I think 1-100 then 1000-1100 may actually be more
> representative.  Not that I would mind seeing the larger range.
> Especially since I am in the process of encouraging the use of more
> uids.

Alrighty, the results are in (numbers are objects in bucket):

For the 0...10000 range:

Average: 78.125
Std dev: 1.4197704151
Min: 75
Max: 80

For the 1...100 range:

Average: 0.78125
Std dev: 0.5164613088
Min: 0
Max: 2

For the 1000...1100 range:

Average: 0.7890625
Std dev: 0.4964812206
Min: 0
Max: 2

Looks like hash_32 is pretty good with small numbers.
Sasha Levin <levinsasha928@gmail.com> writes:

> On 08/15/2012 03:08 AM, Eric W. Biederman wrote:
>>> I can offer the following: I'll write a small module that will hash 1...10000
>>> into a hashtable which uses 7 bits (just like user_ns) and post the distribution
>>> we'll get.
>>
>> That won't hurt.  I think 1-100 then 1000-1100 may actually be more
>> representative.  Not that I would mind seeing the larger range.
>> Especially since I am in the process of encouraging the use of more
>> uids.
>
> Alrighty, the results are in (numbers are objects in bucket):
>
> For the 0...10000 range:
>
> Average: 78.125
> Std dev: 1.4197704151
> Min: 75
> Max: 80
>
> For the 1...100 range:
>
> Average: 0.78125
> Std dev: 0.5164613088
> Min: 0
> Max: 2
>
> For the 1000...1100 range:
>
> Average: 0.7890625
> Std dev: 0.4964812206
> Min: 0
> Max: 2
>
> Looks like hash_32 is pretty good with small numbers.

Yes, hash_32 seems reasonable for the uid hash.  With those long hash
chains I wouldn't like to be on a machine with 10,000 processes, each
with a different uid, and processes calling setuid in the fast path.

The uid hash we are playing with is one where I sort of wish the hash
table could grow in size, so that we could scale up better.

Ah well.  Most of the time we only have a very small number of uids
in play, so it doesn't matter at this point.

Eric
* Eric W. Biederman (ebiederm@xmission.com) wrote:
> Sasha Levin <levinsasha928@gmail.com> writes:
>
>> On 08/15/2012 03:08 AM, Eric W. Biederman wrote:
>>>> I can offer the following: I'll write a small module that will hash 1...10000
>>>> into a hashtable which uses 7 bits (just like user_ns) and post the distribution
>>>> we'll get.
>>>
>>> That won't hurt.  I think 1-100 then 1000-1100 may actually be more
>>> representative.  Not that I would mind seeing the larger range.
>>> Especially since I am in the process of encouraging the use of more
>>> uids.
>>
>> Alrighty, the results are in (numbers are objects in bucket):
>>
>> For the 0...10000 range:
>>
>> Average: 78.125
>> Std dev: 1.4197704151
>> Min: 75
>> Max: 80
>>
>> For the 1...100 range:
>>
>> Average: 0.78125
>> Std dev: 0.5164613088
>> Min: 0
>> Max: 2
>>
>> For the 1000...1100 range:
>>
>> Average: 0.7890625
>> Std dev: 0.4964812206
>> Min: 0
>> Max: 2
>>
>> Looks like hash_32 is pretty good with small numbers.
>
> Yes, hash_32 seems reasonable for the uid hash.  With those long hash
> chains I wouldn't like to be on a machine with 10,000 processes, each
> with a different uid, and processes calling setuid in the fast path.
>
> The uid hash we are playing with is one where I sort of wish the hash
> table could grow in size, so that we could scale up better.

Hi Eric,

If you want to try out something that has more features than a basic
hash table, already exists and is available for you to play with, you
might want to have a look at the RCU lock-free resizable hash table.
It's initially done in userspace, but shares the same RCU semantics as
the kernel, and has chunk-based kernel-friendly index backends (thanks
to Lai Jiangshan), very useful to integrate with the kernel page
allocator.

It has the following properties that might make this container a good
fit for uid hashing:

- Real-time friendly lookups: lookups are RCU and wait-free.
- Fast and real-time friendly updates: uses cmpxchg for updates, and
  RCU to deal with ABA.
- Resize (expand/shrink) to each power-of-two size, performed
  concurrently with ongoing updates and lookups.
- Has add_unique (uniquify), add_replace, and also duplicate semantics.
- Provides uniqueness guarantees for RCU traversals of the hash table
  with respect to add_unique and add_replace.

So if you are looking for a fast, RT-friendly, resizable hash table to
play with, you might want to have a look at the userspace RCU
implementation, which now features this hash table:

https://lttng.org/urcu

See urcu/rculfhash.h for the API.

Best regards,

Mathieu

> Ah well.  Most of the time we only have a very small number of uids
> in play, so it doesn't matter at this point.
>
> Eric
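
As a rough illustration of the API Mathieu points to, a userspace uid lookup built on rculfhash could look something like the sketch below. The structure and function names follow the urcu/rculfhash.h documentation; exact signatures and link flags (roughly -lurcu -lurcu-cds) may differ between liburcu versions, so treat this as an approximation rather than a reference.

#include <stdio.h>
#include <stdlib.h>
#include <urcu.h>               /* default RCU flavour */
#include <urcu/compiler.h>      /* caa_container_of() */
#include <urcu/rculfhash.h>

struct uid_entry {
        unsigned int uid;
        struct cds_lfht_node node;      /* hash table linkage, embedded */
};

/* Key comparison callback used by lookups and add_unique. */
static int uid_match(struct cds_lfht_node *node, const void *key)
{
        const struct uid_entry *e = caa_container_of(node, struct uid_entry, node);

        return e->uid == *(const unsigned int *)key;
}

int main(void)
{
        struct cds_lfht *ht;
        struct cds_lfht_iter iter;
        struct uid_entry *e;
        unsigned int key = 1000;

        rcu_register_thread();

        /* 128 initial buckets, unbounded automatic resize. */
        ht = cds_lfht_new(128, 128, 0, CDS_LFHT_AUTO_RESIZE, NULL);
        if (!ht)
                return 1;

        e = calloc(1, sizeof(*e));
        e->uid = 1000;
        cds_lfht_node_init(&e->node);

        /* Lookups and updates must run under the RCU read lock.
         * The uid itself is used as the hash value for brevity. */
        rcu_read_lock();
        cds_lfht_add_unique(ht, e->uid, uid_match, &e->uid, &e->node);

        cds_lfht_lookup(ht, key, uid_match, &key, &iter);
        if (cds_lfht_iter_get_node(&iter))
                printf("uid %u found\n", key);
        rcu_read_unlock();

        rcu_unregister_thread();
        return 0;
}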
> Yes, hash_32 seems reasonable for the uid hash.  With those long hash
> chains I wouldn't like to be on a machine with 10,000 processes, each
> with a different uid, and processes calling setuid in the fast path.
>
> The uid hash we are playing with is one where I sort of wish the hash
> table could grow in size, so that we could scale up better.

Since uids are likely to be allocated in dense blocks, an unhashed
multi-level lookup scheme might be appropriate.

Index an array with the low 8 (say) bits of the uid.
Each item can be either:

1) NULL => free entry.
2) a pointer to a uid structure (check the uid value).
3) a pointer to an array to index with the next 8 bits.

(2) and (3) can be differentiated by the low address bit.
I think that is updateable with cmpxchg.

Clearly this is a bad algorithm if uids are all multiples of 2^24,
but that is true of any hash function.

	David
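
A lookup-only sketch of the scheme David outlines might look like the following. The names are made up for illustration, there is no locking or RCU, and the cmpxchg-based update path he mentions is omitted.

#include <stddef.h>

struct user_struct {
        unsigned int uid;
        /* ... */
};

#define LEVEL_BITS  8
#define LEVEL_SIZE  (1u << LEVEL_BITS)
#define LEVEL_MASK  (LEVEL_SIZE - 1)

/*
 * Each slot is a tagged pointer: NULL means a free entry, an even
 * pointer is a struct user_struct *, and a pointer with the low bit
 * set points at the next level's slot array.
 */
static void *uid_root[LEVEL_SIZE];

static int slot_is_table(void *slot)
{
        return (unsigned long)slot & 1;
}

static void **slot_table(void *slot)
{
        return (void **)((unsigned long)slot & ~1UL);
}

static struct user_struct *uid_lookup(unsigned int uid)
{
        void **table = uid_root;
        unsigned int key = uid;

        for (;;) {
                void *slot = table[key & LEVEL_MASK];

                if (!slot)
                        return NULL;                    /* free entry */
                if (!slot_is_table(slot)) {
                        struct user_struct *up = slot;  /* leaf: verify the uid */

                        return up->uid == uid ? up : NULL;
                }
                table = slot_table(slot);               /* descend one level */
                key >>= LEVEL_BITS;
        }
}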
On 08/15/2012 05:31 AM, Mathieu Desnoyers wrote:
> * Eric W. Biederman (ebiederm@xmission.com) wrote:
>> Sasha Levin <levinsasha928@gmail.com> writes:
>>
>>> On 08/15/2012 03:08 AM, Eric W. Biederman wrote:
>>>>> I can offer the following: I'll write a small module that will hash 1...10000
>>>>> into a hashtable which uses 7 bits (just like user_ns) and post the distribution
>>>>> we'll get.
>>>>
>>>> That won't hurt.  I think 1-100 then 1000-1100 may actually be more
>>>> representative.  Not that I would mind seeing the larger range.
>>>> Especially since I am in the process of encouraging the use of more
>>>> uids.
>>>
>>> Alrighty, the results are in (numbers are objects in bucket):
>>>
>>> For the 0...10000 range:
>>>
>>> Average: 78.125
>>> Std dev: 1.4197704151
>>> Min: 75
>>> Max: 80
>>>
>>> For the 1...100 range:
>>>
>>> Average: 0.78125
>>> Std dev: 0.5164613088
>>> Min: 0
>>> Max: 2
>>>
>>> For the 1000...1100 range:
>>>
>>> Average: 0.7890625
>>> Std dev: 0.4964812206
>>> Min: 0
>>> Max: 2
>>>
>>> Looks like hash_32 is pretty good with small numbers.
>>
>> Yes, hash_32 seems reasonable for the uid hash.  With those long hash
>> chains I wouldn't like to be on a machine with 10,000 processes, each
>> with a different uid, and processes calling setuid in the fast path.
>>
>> The uid hash we are playing with is one where I sort of wish the hash
>> table could grow in size, so that we could scale up better.
>
> Hi Eric,
>
> If you want to try out something that has more features than a basic
> hash table, already exists and is available for you to play with, you
> might want to have a look at the RCU lock-free resizable hash table.
> It's initially done in userspace, but shares the same RCU semantics as
> the kernel, and has chunk-based kernel-friendly index backends (thanks
> to Lai Jiangshan), very useful to integrate with the kernel page
> allocator.

I'm guessing that once this static hashtable is stable, a
DEFINE_DYNAMIC_HASHTABLE() will get introduced which will evolve into
something similar to what Mathieu has pointed out in urcu.
* David Laight (David.Laight@ACULAB.COM) wrote:
>> Yes, hash_32 seems reasonable for the uid hash.  With those long hash
>> chains I wouldn't like to be on a machine with 10,000 processes, each
>> with a different uid, and processes calling setuid in the fast path.
>>
>> The uid hash we are playing with is one where I sort of wish the hash
>> table could grow in size, so that we could scale up better.
>
> Since uids are likely to be allocated in dense blocks, an unhashed
> multi-level lookup scheme might be appropriate.
>
> Index an array with the low 8 (say) bits of the uid.
> Each item can be either:
>
> 1) NULL => free entry.
> 2) a pointer to a uid structure (check the uid value).
> 3) a pointer to an array to index with the next 8 bits.
>
> (2) and (3) can be differentiated by the low address bit.

I'm currently experimenting with "Judy arrays", which would likely be a
good fit for this kind of use-case.  It's basically a 256-ary trie, with
a fixed depth that depends on the key size, which uses various encoding
(compaction) schemes to compress internal nodes depending on their
density.

The original implementation made by HP has been criticised as somewhat
too complex (20k lines of code), but I'm currently working (in my spare
time) on a more elegant solution that supports RCU lookups and
distributed locking, uses much simpler node compaction schemes, and
focuses on having good cache locality (and a minimal number of cache
line hits) for lookups.

I'll be presenting my ongoing work at Plumbers, if you are interested.

Best regards,

Mathieu

> I think that is updateable with cmpxchg.
>
> Clearly this is a bad algorithm if uids are all multiples of 2^24,
> but that is true of any hash function.
>
> David
diff --git a/kernel/user.c b/kernel/user.c
index b815fef..d10c484 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
+#include <linux/hashtable.h>

 /*
  * userns count is 1 for root user, 1 for init_uts_ns,
@@ -52,13 +53,9 @@ EXPORT_SYMBOL_GPL(init_user_ns);
  */

 #define UIDHASH_BITS  (CONFIG_BASE_SMALL ? 3 : 7)
-#define UIDHASH_SZ    (1 << UIDHASH_BITS)
-#define UIDHASH_MASK  (UIDHASH_SZ - 1)
-#define __uidhashfn(uid)      (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(uid)     (uidhash_table + __uidhashfn((__kuid_val(uid))))

 static struct kmem_cache *uid_cachep;
-struct hlist_head uidhash_table[UIDHASH_SZ];
+static DEFINE_HASHTABLE(uidhash_table, UIDHASH_BITS)

 /*
  * The uidhash_lock is mostly taken from process context, but it is
@@ -84,22 +81,22 @@ struct user_struct root_user = {
 /*
  * These routines must be called with the uidhash spinlock held!
  */
-static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
+static void uid_hash_insert(struct user_struct *up)
 {
-        hlist_add_head(&up->uidhash_node, hashent);
+        hash_add(uidhash_table, &up->uidhash_node, __kuid_val(up->uid));
 }

 static void uid_hash_remove(struct user_struct *up)
 {
-        hlist_del_init(&up->uidhash_node);
+        hash_del(&up->uidhash_node);
 }

-static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
+static struct user_struct *uid_hash_find(kuid_t uid)
 {
         struct user_struct *user;
         struct hlist_node *h;

-        hlist_for_each_entry(user, h, hashent, uidhash_node) {
+        hash_for_each_possible(uidhash_table, user, h, uidhash_node, __kuid_val(uid)) {
                 if (uid_eq(user->uid, uid)) {
                         atomic_inc(&user->__count);
                         return user;
@@ -135,7 +132,7 @@ struct user_struct *find_user(kuid_t uid)
         unsigned long flags;

         spin_lock_irqsave(&uidhash_lock, flags);
-        ret = uid_hash_find(uid, uidhashentry(uid));
+        ret = uid_hash_find(uid);
         spin_unlock_irqrestore(&uidhash_lock, flags);
         return ret;
 }
@@ -156,11 +153,10 @@ void free_uid(struct user_struct *up)

 struct user_struct *alloc_uid(kuid_t uid)
 {
-        struct hlist_head *hashent = uidhashentry(uid);
         struct user_struct *up, *new;

         spin_lock_irq(&uidhash_lock);
-        up = uid_hash_find(uid, hashent);
+        up = uid_hash_find(uid);
         spin_unlock_irq(&uidhash_lock);

         if (!up) {
@@ -176,13 +172,13 @@ struct user_struct *alloc_uid(kuid_t uid)
          * on adding the same user already..
          */
         spin_lock_irq(&uidhash_lock);
-        up = uid_hash_find(uid, hashent);
+        up = uid_hash_find(uid);
         if (up) {
                 key_put(new->uid_keyring);
                 key_put(new->session_keyring);
                 kmem_cache_free(uid_cachep, new);
         } else {
-                uid_hash_insert(new, hashent);
+                uid_hash_insert(new);
                 up = new;
         }
         spin_unlock_irq(&uidhash_lock);
@@ -196,17 +192,14 @@ out_unlock:

 static int __init uid_cache_init(void)
 {
-        int n;
-
         uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
                         0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

-        for(n = 0; n < UIDHASH_SZ; ++n)
-                INIT_HLIST_HEAD(uidhash_table + n);
+        hash_init(uidhash_table);

         /* Insert the root user immediately (init already runs as root) */
         spin_lock_irq(&uidhash_lock);
-        uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
+        uid_hash_insert(&root_user);
         spin_unlock_irq(&uidhash_lock);

         return 0;
Switch user_ns to use the new hashtable implementation. This reduces the amount of
generic unrelated code in user_ns.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 kernel/user.c |   33 +++++++++++++--------------------
 1 files changed, 13 insertions(+), 20 deletions(-)