Message ID | 87woyego2u.fsf_-_@xmission.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 03/14/2018 08:49 PM, Eric W. Biederman wrote: > The define IPCMNI was originally the size of a statically sized array in > the kernel and that has long since been removed. Therefore there is no > fundamental reason for IPCMNI. > > The only remaining use IPCMNI serves is as a convoluted way to format > the ipc id to userspace. It does not appear that anything except for > the CHECKPOINT_RESTORE code even cares about this variety of assignment > and the CHECKPOINT_RESTORE code only cares about this weirdness because > it has to restore these peculiar ids. > > Therefore make the assignment of ipc ids match the description in > Advanced Programming in the Unix Environment and assign the next id > until INT_MAX is hit then loop around to the lower ids. > > This can be implemented trivially with the current code using idr_alloc_cyclic. > > To make it possible to keep checkpoint/restore working I have renamed > the sysctls from xxx_next_id to xxx_nextid. That is enough change that > a smart CRIU implementation can see that what is exported has changed, > and act accordingly. New kernels will be able to restore the old id's. > > This code still needs some real world testing to verify my assumptions. > And some work with the CRIU implementations to actually add the code > that deals with the new for of id assignment. > > Updates: 03f595668017 ("ipc: add sysctl to specify desired next object id") > Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> > --- > > Waiman please take a look at this and run it through some tests etc, > I am pretty certain something like this patch is all you need to do > to sort out ipc assignment. Not messing with sysctls needed. > > include/linux/ipc.h | 2 -- > include/linux/ipc_namespace.h | 1 - > ipc/ipc_sysctl.c | 6 ++-- > ipc/namespace.c | 11 ++---- > ipc/util.c | 80 ++++++++++--------------------------------- > ipc/util.h | 11 +----- > 6 files changed, 25 insertions(+), 86 deletions(-) > > diff --git a/include/linux/ipc.h b/include/linux/ipc.h > index 821b2f260992..6cc2df7f7ac9 100644 > --- a/include/linux/ipc.h > +++ b/include/linux/ipc.h > @@ -8,8 +8,6 @@ > #include <uapi/linux/ipc.h> > #include <linux/refcount.h> > > -#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ > - > /* used by in-kernel data structures */ > struct kern_ipc_perm { > spinlock_t lock; > diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h > index b5630c8eb2f3..cab33b6a8236 100644 > --- a/include/linux/ipc_namespace.h > +++ b/include/linux/ipc_namespace.h > @@ -15,7 +15,6 @@ struct user_namespace; > > struct ipc_ids { > int in_use; > - unsigned short seq; > bool tables_initialized; > struct rw_semaphore rwsem; > struct idr ipcs_idr; > diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c > index 8ad93c29f511..a599963d58bf 100644 > --- a/ipc/ipc_sysctl.c > +++ b/ipc/ipc_sysctl.c > @@ -176,7 +176,7 @@ static struct ctl_table ipc_kern_table[] = { > }, > #ifdef CONFIG_CHECKPOINT_RESTORE > { > - .procname = "sem_next_id", > + .procname = "sem_nextid", > .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, > .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), > .mode = 0644, > @@ -185,7 +185,7 @@ static struct ctl_table ipc_kern_table[] = { > .extra2 = &int_max, > }, > { > - .procname = "msg_next_id", > + .procname = "msg_nextid", > .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, > .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), > .mode = 0644, > @@ -194,7 +194,7 @@ static struct ctl_table ipc_kern_table[] = { > .extra2 = &int_max, > }, > { > - .procname = "shm_next_id", > + .procname = "shm_nextid", > .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, > .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), > .mode = 0644, So you are changing the names of existing sysctl parameters. Will it be better to add new sysctl to indicate that the rule has changed instead? > diff --git a/ipc/namespace.c b/ipc/namespace.c > index f59a89966f92..84eaeba9e96c 100644 > --- a/ipc/namespace.c > +++ b/ipc/namespace.c > @@ -109,20 +109,13 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, > { > struct kern_ipc_perm *perm; > int next_id; > - int total, in_use; > > down_write(&ids->rwsem); > - > - in_use = ids->in_use; > - > - for (total = 0, next_id = 0; total < in_use; next_id++) { > - perm = idr_find(&ids->ipcs_idr, next_id); > - if (perm == NULL) > - continue; > + next_id = 0; > + while ((perm = idr_get_next(&ids->ipcs_idr, &next_id))) { > rcu_read_lock(); > ipc_lock_object(perm); > free(ns, perm); > - total++; > } > up_write(&ids->rwsem); > } > diff --git a/ipc/util.c b/ipc/util.c > index 4ed5a17dd06f..ce6bf18e54df 100644 > --- a/ipc/util.c > +++ b/ipc/util.c > @@ -118,7 +118,6 @@ int ipc_init_ids(struct ipc_ids *ids) > { > int err; > ids->in_use = 0; > - ids->seq = 0; > init_rwsem(&ids->rwsem); > err = rhashtable_init(&ids->key_ht, &ipc_kht_params); > if (err) > @@ -192,46 +191,18 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) > return NULL; > } > > -#ifdef CONFIG_CHECKPOINT_RESTORE > -/* > - * Specify desired id for next allocated IPC object. > - */ > -#define ipc_idr_alloc(ids, new) \ > - idr_alloc(&(ids)->ipcs_idr, (new), \ > - (ids)->next_id < 0 ? 0 : ipcid_to_idx((ids)->next_id),\ > - 0, GFP_NOWAIT) > > -static inline int ipc_buildid(int id, struct ipc_ids *ids, > - struct kern_ipc_perm *new) > +static int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) > { > - if (ids->next_id < 0) { /* default, behave as !CHECKPOINT_RESTORE */ > - new->seq = ids->seq++; > - if (ids->seq > IPCID_SEQ_MAX) > - ids->seq = 0; > - } else { > - new->seq = ipcid_to_seqx(ids->next_id); > +#ifdef CONFIG_CHECKPOINT_RESTORE > + if (ids->next_id >= 0) { > + idr_set_cursor(&ids->ipcs_idr, ids->next_id); > ids->next_id = -1; > } > - > - return SEQ_MULTIPLIER * new->seq + id; > -} > - > -#else > -#define ipc_idr_alloc(ids, new) \ > - idr_alloc(&(ids)->ipcs_idr, (new), 0, 0, GFP_NOWAIT) > - > -static inline int ipc_buildid(int id, struct ipc_ids *ids, > - struct kern_ipc_perm *new) > -{ > - new->seq = ids->seq++; > - if (ids->seq > IPCID_SEQ_MAX) > - ids->seq = 0; > - > - return SEQ_MULTIPLIER * new->seq + id; > +#endif > + return idr_alloc_cyclic(&ids->ipcs_idr, (new), 0, 0, GFP_NOWAIT); > } > > -#endif /* CONFIG_CHECKPOINT_RESTORE */ > - > /** > * ipc_addid - add an ipc identifier > * @ids: ipc identifier set > @@ -251,9 +222,6 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) > kgid_t egid; > int id, err; > > - if (limit > IPCMNI) > - limit = IPCMNI; > - > if (!ids->tables_initialized || ids->in_use >= limit) > return -ENOSPC; > > @@ -290,7 +258,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) > if (id > ids->max_id) > ids->max_id = id; > > - new->id = ipc_buildid(id, ids, new); > + new->id = id; > > return id; > } > @@ -430,7 +398,7 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) > */ > void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) > { > - int lid = ipcid_to_idx(ipcp->id); > + int lid = ipcp->id; > > idr_remove(&ids->ipcs_idr, lid); > ipc_kht_remove(ids, ipcp); > @@ -563,7 +531,7 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) > struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) > { > struct kern_ipc_perm *out; > - int lid = ipcid_to_idx(id); > + int lid = id; > > if (unlikely(!ids->tables_initialized)) > return ERR_PTR(-EINVAL); > @@ -757,30 +725,20 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, > loff_t *new_pos) > { > struct kern_ipc_perm *ipc; > - int total, id; > - > - total = 0; > - for (id = 0; id < pos && total < ids->in_use; id++) { > - ipc = idr_find(&ids->ipcs_idr, id); > - if (ipc != NULL) > - total++; > - } > + int id; I think you need to initialize id to pos. Right? > > - if (total >= ids->in_use) > + /* Out of range - return NULL to terminate iteration */ > + if (pos > INT_MAX) > return NULL; > > - for (; pos < IPCMNI; pos++) { > - ipc = idr_find(&ids->ipcs_idr, pos); > - if (ipc != NULL) { > - *new_pos = pos + 1; > - rcu_read_lock(); > - ipc_lock_object(ipc); > - return ipc; > - } > - } > + ipc = idr_get_next(&ids->ipcs_idr, &id); > + if (!ipc) > + return NULL; > > - /* Out of range - return NULL to terminate iteration */ > - return NULL; > + *new_pos = id + 1; > + rcu_read_lock(); > + ipc_lock_object(ipc); > + return ipc; > } > > static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) > diff --git a/ipc/util.h b/ipc/util.h > index 89b8ec176fc4..de8e27367f0c 100644 > --- a/ipc/util.h > +++ b/ipc/util.h > @@ -15,8 +15,6 @@ > #include <linux/err.h> > #include <linux/ipc_namespace.h> > > -#define SEQ_MULTIPLIER (IPCMNI) > - > int sem_init(void); > int msg_init(void); > void shm_init(void); > @@ -93,10 +91,6 @@ void __init ipc_init_proc_interface(const char *path, const char *header, > #define IPC_MSG_IDS 1 > #define IPC_SHM_IDS 2 > > -#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) > -#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) > -#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX) > - > /* must be called with ids->rwsem acquired for writing */ > int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); > > @@ -120,9 +114,6 @@ static inline int ipc_get_maxid(struct ipc_ids *ids) > if (ids->in_use == 0) > return -1; > > - if (ids->in_use == IPCMNI) > - return IPCMNI - 1; > - > return ids->max_id; > } > > @@ -163,7 +154,7 @@ extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); > > static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) > { > - return uid / SEQ_MULTIPLIER != ipcp->seq; > + return uid != ipcp->seq; > } > > static inline void ipc_lock_object(struct kern_ipc_perm *perm) I don't know the history why the id management of SysV IPC was designed in such a convoluted way, but the patch does make sense to me. Cheers, Longman
Waiman Long <longman@redhat.com> writes: > On 03/14/2018 08:49 PM, Eric W. Biederman wrote: >> The define IPCMNI was originally the size of a statically sized array in >> the kernel and that has long since been removed. Therefore there is no >> fundamental reason for IPCMNI. >> >> The only remaining use IPCMNI serves is as a convoluted way to format >> the ipc id to userspace. It does not appear that anything except for >> the CHECKPOINT_RESTORE code even cares about this variety of assignment >> and the CHECKPOINT_RESTORE code only cares about this weirdness because >> it has to restore these peculiar ids. >> >> Therefore make the assignment of ipc ids match the description in >> Advanced Programming in the Unix Environment and assign the next id >> until INT_MAX is hit then loop around to the lower ids. >> >> This can be implemented trivially with the current code using idr_alloc_cyclic. >> >> To make it possible to keep checkpoint/restore working I have renamed >> the sysctls from xxx_next_id to xxx_nextid. That is enough change that >> a smart CRIU implementation can see that what is exported has changed, >> and act accordingly. New kernels will be able to restore the old id's. >> >> This code still needs some real world testing to verify my assumptions. >> And some work with the CRIU implementations to actually add the code >> that deals with the new for of id assignment. >> >> Updates: 03f595668017 ("ipc: add sysctl to specify desired next object id") >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> >> --- >> >> Waiman please take a look at this and run it through some tests etc, >> I am pretty certain something like this patch is all you need to do >> to sort out ipc assignment. Not messing with sysctls needed. >> >> include/linux/ipc.h | 2 -- >> include/linux/ipc_namespace.h | 1 - >> ipc/ipc_sysctl.c | 6 ++-- >> ipc/namespace.c | 11 ++---- >> ipc/util.c | 80 ++++++++++--------------------------------- >> ipc/util.h | 11 +----- >> 6 files changed, 25 insertions(+), 86 deletions(-) >> >> diff --git a/include/linux/ipc.h b/include/linux/ipc.h >> index 821b2f260992..6cc2df7f7ac9 100644 >> --- a/include/linux/ipc.h >> +++ b/include/linux/ipc.h >> @@ -8,8 +8,6 @@ >> #include <uapi/linux/ipc.h> >> #include <linux/refcount.h> >> >> -#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ >> - >> /* used by in-kernel data structures */ >> struct kern_ipc_perm { >> spinlock_t lock; >> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h >> index b5630c8eb2f3..cab33b6a8236 100644 >> --- a/include/linux/ipc_namespace.h >> +++ b/include/linux/ipc_namespace.h >> @@ -15,7 +15,6 @@ struct user_namespace; >> >> struct ipc_ids { >> int in_use; >> - unsigned short seq; >> bool tables_initialized; >> struct rw_semaphore rwsem; >> struct idr ipcs_idr; >> diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c >> index 8ad93c29f511..a599963d58bf 100644 >> --- a/ipc/ipc_sysctl.c >> +++ b/ipc/ipc_sysctl.c >> @@ -176,7 +176,7 @@ static struct ctl_table ipc_kern_table[] = { >> }, >> #ifdef CONFIG_CHECKPOINT_RESTORE >> { >> - .procname = "sem_next_id", >> + .procname = "sem_nextid", >> .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, >> .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), >> .mode = 0644, >> @@ -185,7 +185,7 @@ static struct ctl_table ipc_kern_table[] = { >> .extra2 = &int_max, >> }, >> { >> - .procname = "msg_next_id", >> + .procname = "msg_nextid", >> .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, >> .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), >> .mode = 0644, >> @@ -194,7 +194,7 @@ static struct ctl_table ipc_kern_table[] = { >> .extra2 = &int_max, >> }, >> { >> - .procname = "shm_next_id", >> + .procname = "shm_nextid", >> .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, >> .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), >> .mode = 0644, > > So you are changing the names of existing sysctl parameters. Will it be > better to add new sysctl to indicate that the rule has changed > instead? In practice I am replacing one set of sysctls with another, that work very similarly but not quite the same. As we can't keep the existing semantics removing the old sysctl seems correct. Likewise adding a new sysctl with slightly changed semantics seems correct. This needs an accompanying patch to CRIU to see which sysctls are available and to change it's behavior based upon that. The practical question is what makes it easiest not to confuse CRIU. Not having the sysctl should be something that CRIU detects today and the old versions should fail gracefully. But testing is needed. Adding a new sysctl to say the behavior has changed and reusing the old names won't have the same effect of disabling existing versions of CRIU. >> diff --git a/ipc/util.c b/ipc/util.c >> index 4ed5a17dd06f..ce6bf18e54df 100644 >> --- a/ipc/util.c >> +++ b/ipc/util.c >> @@ -118,7 +118,6 @@ int ipc_init_ids(struct ipc_ids *ids) >> { >> int err; >> ids->in_use = 0; >> - ids->seq = 0; >> init_rwsem(&ids->rwsem); >> err = rhashtable_init(&ids->key_ht, &ipc_kht_params); >> if (err) >> @@ -192,46 +191,18 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) >> return NULL; >> } >> >> -#ifdef CONFIG_CHECKPOINT_RESTORE >> -/* >> - * Specify desired id for next allocated IPC object. >> - */ >> -#define ipc_idr_alloc(ids, new) \ >> - idr_alloc(&(ids)->ipcs_idr, (new), \ >> - (ids)->next_id < 0 ? 0 : ipcid_to_idx((ids)->next_id),\ >> - 0, GFP_NOWAIT) >> >> -static inline int ipc_buildid(int id, struct ipc_ids *ids, >> - struct kern_ipc_perm *new) >> +static int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) >> { >> - if (ids->next_id < 0) { /* default, behave as !CHECKPOINT_RESTORE */ >> - new->seq = ids->seq++; >> - if (ids->seq > IPCID_SEQ_MAX) >> - ids->seq = 0; >> - } else { >> - new->seq = ipcid_to_seqx(ids->next_id); >> +#ifdef CONFIG_CHECKPOINT_RESTORE >> + if (ids->next_id >= 0) { >> + idr_set_cursor(&ids->ipcs_idr, ids->next_id); >> ids->next_id = -1; >> } >> - >> - return SEQ_MULTIPLIER * new->seq + id; >> -} >> - >> -#else >> -#define ipc_idr_alloc(ids, new) \ >> - idr_alloc(&(ids)->ipcs_idr, (new), 0, 0, GFP_NOWAIT) >> - >> -static inline int ipc_buildid(int id, struct ipc_ids *ids, >> - struct kern_ipc_perm *new) >> -{ >> - new->seq = ids->seq++; >> - if (ids->seq > IPCID_SEQ_MAX) >> - ids->seq = 0; >> - >> - return SEQ_MULTIPLIER * new->seq + id; >> +#endif >> + return idr_alloc_cyclic(&ids->ipcs_idr, (new), 0, 0, GFP_NOWAIT); >> } >> >> -#endif /* CONFIG_CHECKPOINT_RESTORE */ >> - >> /** >> * ipc_addid - add an ipc identifier >> * @ids: ipc identifier set >> @@ -251,9 +222,6 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) >> kgid_t egid; >> int id, err; >> >> - if (limit > IPCMNI) >> - limit = IPCMNI; >> - >> if (!ids->tables_initialized || ids->in_use >= limit) >> return -ENOSPC; >> >> @@ -290,7 +258,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) >> if (id > ids->max_id) >> ids->max_id = id; >> >> - new->id = ipc_buildid(id, ids, new); >> + new->id = id; >> >> return id; >> } >> @@ -430,7 +398,7 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) >> */ >> void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) >> { >> - int lid = ipcid_to_idx(ipcp->id); >> + int lid = ipcp->id; >> >> idr_remove(&ids->ipcs_idr, lid); >> ipc_kht_remove(ids, ipcp); >> @@ -563,7 +531,7 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) >> struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) >> { >> struct kern_ipc_perm *out; >> - int lid = ipcid_to_idx(id); >> + int lid = id; >> >> if (unlikely(!ids->tables_initialized)) >> return ERR_PTR(-EINVAL); >> @@ -757,30 +725,20 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, >> loff_t *new_pos) >> { >> struct kern_ipc_perm *ipc; >> - int total, id; >> - >> - total = 0; >> - for (id = 0; id < pos && total < ids->in_use; id++) { >> - ipc = idr_find(&ids->ipcs_idr, id); >> - if (ipc != NULL) >> - total++; >> - } >> + int id; > > I think you need to initialize id to pos. Right? > Yes. I have not done more than compiled tested this. It would probably be wise to move this bit to a prepatch. That just changes the implementation to something more efficient, and removes a use of IPCMNI at the same time. >> >> - if (total >= ids->in_use) >> + /* Out of range - return NULL to terminate iteration */ >> + if (pos > INT_MAX) >> return NULL; >> >> - for (; pos < IPCMNI; pos++) { >> - ipc = idr_find(&ids->ipcs_idr, pos); >> - if (ipc != NULL) { >> - *new_pos = pos + 1; >> - rcu_read_lock(); >> - ipc_lock_object(ipc); >> - return ipc; >> - } >> - } >> + ipc = idr_get_next(&ids->ipcs_idr, &id); >> + if (!ipc) >> + return NULL; >> >> - /* Out of range - return NULL to terminate iteration */ >> - return NULL; >> + *new_pos = id + 1; >> + rcu_read_lock(); >> + ipc_lock_object(ipc); >> + return ipc; >> } >> >> static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) >> diff --git a/ipc/util.h b/ipc/util.h >> index 89b8ec176fc4..de8e27367f0c 100644 >> --- a/ipc/util.h >> +++ b/ipc/util.h >> @@ -15,8 +15,6 @@ >> #include <linux/err.h> >> #include <linux/ipc_namespace.h> >> >> -#define SEQ_MULTIPLIER (IPCMNI) >> - >> int sem_init(void); >> int msg_init(void); >> void shm_init(void); >> @@ -93,10 +91,6 @@ void __init ipc_init_proc_interface(const char *path, const char *header, >> #define IPC_MSG_IDS 1 >> #define IPC_SHM_IDS 2 >> >> -#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) >> -#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) >> -#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX) >> - >> /* must be called with ids->rwsem acquired for writing */ >> int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); >> >> @@ -120,9 +114,6 @@ static inline int ipc_get_maxid(struct ipc_ids *ids) >> if (ids->in_use == 0) >> return -1; >> >> - if (ids->in_use == IPCMNI) >> - return IPCMNI - 1; >> - >> return ids->max_id; >> } >> >> @@ -163,7 +154,7 @@ extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); >> >> static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) >> { >> - return uid / SEQ_MULTIPLIER != ipcp->seq; >> + return uid != ipcp->seq; >> } >> >> static inline void ipc_lock_object(struct kern_ipc_perm *perm) > > I don't know the history why the id management of SysV IPC was designed > in such a convoluted way, but the patch does make sense to me. I don't have the full history and we might wind up finding more as we run this patch through it's paces. The earliest history I know is what I read in Advanced Programming in the Unix Environment (which predates linux). It described the ipc ids as assigned from a counter that wraps. I thought like my patch implements. On closer reading it has a counter that increases each time the slot is used, and then wraps. Exactly like Linux before my patch. *Grrr* The existing structure of the bifurcated is present in Linux 1.0. At that time SHMMNI was 256. SHMMNI was the size of a static array of shm segments. The high 24 bits held a sequence number that was incremented when a segment was removed at the time. Presumably the upper bits were incremented to avoid swiftly reusing the same shm ids. Hmm. I took a quick look at FreeBSD10 and it has the exact same split in the id. So userspace may actually depend upon that split. Which comes down to the fundamental question what depends upon what. How do other operating systems like Solaris handle this? Does any nix flavor support more that 16bits worth of shm segments? The API has been deprecated for the last 20 years and we are still keeping it alive. Sigh. Still there is fundamentally only one thing the kernel can do if we wish to increase the number of shm segments. Please take my patch and test it out and see if you can find anything that cares about the change. Except for needing id reuse to be infrequent I can not imagine that there is anything that cares. It could very reasonably be argued that my when shmmni is < INT_MAX my patch implements a version of the existing algorithm. As we go through all of the possible ids before we reuse any of them. Eric
On Wed, Mar 14, 2018 at 07:49:29PM -0500, Eric W. Biederman wrote: > @@ -109,20 +109,13 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, > { > struct kern_ipc_perm *perm; > int next_id; > - int total, in_use; > > down_write(&ids->rwsem); > - > - in_use = ids->in_use; > - > - for (total = 0, next_id = 0; total < in_use; next_id++) { > - perm = idr_find(&ids->ipcs_idr, next_id); > - if (perm == NULL) > - continue; > + next_id = 0; > + while ((perm = idr_get_next(&ids->ipcs_idr, &next_id))) { > rcu_read_lock(); > ipc_lock_object(perm); > free(ns, perm); > - total++; > } > up_write(&ids->rwsem); > } We have a helper for this: idr_for_each_entry(&ids->ipcs_idr, perm, next_id) { rcu_read_lock(); ipc_lock_object(perm); free(ns, perm); } (using idr_get_next() is tricky because you have to remember to increment next_id yourself, and you didn't). > +static int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) > { > +#ifdef CONFIG_CHECKPOINT_RESTORE > + if (ids->next_id >= 0) { > + idr_set_cursor(&ids->ipcs_idr, ids->next_id); > ids->next_id = -1; > } > +#endif > + return idr_alloc_cyclic(&ids->ipcs_idr, (new), 0, 0, GFP_NOWAIT); > } > > -#endif /* CONFIG_CHECKPOINT_RESTORE */ That seems a little convoluted; is there a reason to not call idr_set_cursor() instead of assigning to ids->next_id? > @@ -757,30 +725,20 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, > loff_t *new_pos) > { > struct kern_ipc_perm *ipc; > + int id; > > + /* Out of range - return NULL to terminate iteration */ > + if (pos > INT_MAX) > return NULL; > > + ipc = idr_get_next(&ids->ipcs_idr, &id); > + if (!ipc) > + return NULL; > > + *new_pos = id + 1; > + rcu_read_lock(); > + ipc_lock_object(ipc); > + return ipc; > } I'm no expert on the IPC locking, but I would have thought you'd want to call rcu_read_lock() before calling idr_get_next() to avoid a simultaneous delete from freeing 'ipc'. Oh, I see. proc_start takes the rwsem for read and proc_stop releases it. The locking here seems quite shabby and in need of renovation.
On 03/15/2018 03:00 PM, Eric W. Biederman wrote: > Waiman Long <longman@redhat.com> writes: > >> On 03/14/2018 08:49 PM, Eric W. Biederman wrote: >>> The define IPCMNI was originally the size of a statically sized array in >>> the kernel and that has long since been removed. Therefore there is no >>> fundamental reason for IPCMNI. >>> >>> The only remaining use IPCMNI serves is as a convoluted way to format >>> the ipc id to userspace. It does not appear that anything except for >>> the CHECKPOINT_RESTORE code even cares about this variety of assignment >>> and the CHECKPOINT_RESTORE code only cares about this weirdness because >>> it has to restore these peculiar ids. >>> >>> Therefore make the assignment of ipc ids match the description in >>> Advanced Programming in the Unix Environment and assign the next id >>> until INT_MAX is hit then loop around to the lower ids. >>> >>> This can be implemented trivially with the current code using idr_alloc_cyclic. >>> >>> To make it possible to keep checkpoint/restore working I have renamed >>> the sysctls from xxx_next_id to xxx_nextid. That is enough change that >>> a smart CRIU implementation can see that what is exported has changed, >>> and act accordingly. New kernels will be able to restore the old id's. >>> >>> This code still needs some real world testing to verify my assumptions. >>> And some work with the CRIU implementations to actually add the code >>> that deals with the new for of id assignment. >>> >>> Updates: 03f595668017 ("ipc: add sysctl to specify desired next object id") >>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> >>> --- >>> >>> Waiman please take a look at this and run it through some tests etc, >>> I am pretty certain something like this patch is all you need to do >>> to sort out ipc assignment. Not messing with sysctls needed. >>> >>> include/linux/ipc.h | 2 -- >>> include/linux/ipc_namespace.h | 1 - >>> ipc/ipc_sysctl.c | 6 ++-- >>> ipc/namespace.c | 11 ++---- >>> ipc/util.c | 80 ++++++++++--------------------------------- >>> ipc/util.h | 11 +----- >>> 6 files changed, 25 insertions(+), 86 deletions(-) >>> >>> diff --git a/include/linux/ipc.h b/include/linux/ipc.h >>> index 821b2f260992..6cc2df7f7ac9 100644 >>> --- a/include/linux/ipc.h >>> +++ b/include/linux/ipc.h >>> @@ -8,8 +8,6 @@ >>> #include <uapi/linux/ipc.h> >>> #include <linux/refcount.h> >>> >>> -#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ >>> - >>> /* used by in-kernel data structures */ >>> struct kern_ipc_perm { >>> spinlock_t lock; >>> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h >>> index b5630c8eb2f3..cab33b6a8236 100644 >>> --- a/include/linux/ipc_namespace.h >>> +++ b/include/linux/ipc_namespace.h >>> @@ -15,7 +15,6 @@ struct user_namespace; >>> >>> struct ipc_ids { >>> int in_use; >>> - unsigned short seq; >>> bool tables_initialized; >>> struct rw_semaphore rwsem; >>> struct idr ipcs_idr; >>> diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c >>> index 8ad93c29f511..a599963d58bf 100644 >>> --- a/ipc/ipc_sysctl.c >>> +++ b/ipc/ipc_sysctl.c >>> @@ -176,7 +176,7 @@ static struct ctl_table ipc_kern_table[] = { >>> }, >>> #ifdef CONFIG_CHECKPOINT_RESTORE >>> { >>> - .procname = "sem_next_id", >>> + .procname = "sem_nextid", >>> .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, >>> .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), >>> .mode = 0644, >>> @@ -185,7 +185,7 @@ static struct ctl_table ipc_kern_table[] = { >>> .extra2 = &int_max, >>> }, >>> { >>> - .procname = "msg_next_id", >>> + .procname = "msg_nextid", >>> .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, >>> .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), >>> .mode = 0644, >>> @@ -194,7 +194,7 @@ static struct ctl_table ipc_kern_table[] = { >>> .extra2 = &int_max, >>> }, >>> { >>> - .procname = "shm_next_id", >>> + .procname = "shm_nextid", >>> .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, >>> .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), >>> .mode = 0644, >> So you are changing the names of existing sysctl parameters. Will it be >> better to add new sysctl to indicate that the rule has changed >> instead? > In practice I am replacing one set of sysctls with another, that work > very similarly but not quite the same. As we can't keep the existing > semantics removing the old sysctl seems correct. Likewise adding > a new sysctl with slightly changed semantics seems correct. > > This needs an accompanying patch to CRIU to see which sysctls are > available and to change it's behavior based upon that. The practical > question is what makes it easiest not to confuse CRIU. > > Not having the sysctl should be something that CRIU detects today > and the old versions should fail gracefully. But testing is needed. > Adding a new sysctl to say the behavior has changed and reusing the > old names won't have the same effect of disabling existing versions > of CRIU. That is fine as long as CRIU is the only user. > >> I don't know the history why the id management of SysV IPC was designed >> in such a convoluted way, but the patch does make sense to me. > I don't have the full history and we might wind up finding more as we > run this patch through it's paces. > > The earliest history I know is what I read in Advanced Programming in > the Unix Environment (which predates linux). It described the ipc ids > as assigned from a counter that wraps. I thought like my patch > implements. On closer reading it has a counter that increases each time > the slot is used, and then wraps. Exactly like Linux before my patch. > *Grrr* > > The existing structure of the bifurcated is present in Linux 1.0. At > that time SHMMNI was 256. SHMMNI was the size of a static array of shm > segments. The high 24 bits held a sequence number that was incremented > when a segment was removed at the time. Presumably the upper bits were > incremented to avoid swiftly reusing the same shm ids. > > Hmm. I took a quick look at FreeBSD10 and it has the exact same split > in the id. So userspace may actually depend upon that split. Backward compatibility is the part that I am most worry about this patch. That is also the reason I asked why the ID is generated in such a way. My original thinking was to have an extended mode where the IPCMNI becomes 8M from 32k. That will reduce the sequence number from 16 bits to 8 bits. The extended mode is enabled by adding, for example, a boot option. So this will be an opt-in feature instead of as a default. > > Which comes down to the fundamental question what depends upon what. > How do other operating systems like Solaris handle this? I don't know how Solaris handle this, but I know they support up to 2^24 shm segments. > > Does any nix flavor support more that 16bits worth of shm segments? > > The API has been deprecated for the last 20 years and we are still > keeping it alive. Sigh. > > Still there is fundamentally only one thing the kernel can do if we wish > to increase the number of shm segments. > > Please take my patch and test it out and see if you can find anything > that cares about the change. Except for needing id reuse to be > infrequent I can not imagine that there is anything that cares. > > It could very reasonably be argued that my when shmmni is < INT_MAX > my patch implements a version of the existing algorithm. As we go > through all of the possible ids before we reuse any of them. > > Eric > Thanks for the patch, I am still thinking about what is the best way to handle this. Cheers, Longman
Cc'ing mtk, Manfred and linux-api. See below. On Thu, 15 Mar 2018, Waiman Long wrote: >On 03/15/2018 03:00 PM, Eric W. Biederman wrote: >> Waiman Long <longman@redhat.com> writes: >> >>> On 03/14/2018 08:49 PM, Eric W. Biederman wrote: >>>> The define IPCMNI was originally the size of a statically sized array in >>>> the kernel and that has long since been removed. Therefore there is no >>>> fundamental reason for IPCMNI. >>>> >>>> The only remaining use IPCMNI serves is as a convoluted way to format >>>> the ipc id to userspace. It does not appear that anything except for >>>> the CHECKPOINT_RESTORE code even cares about this variety of assignment >>>> and the CHECKPOINT_RESTORE code only cares about this weirdness because >>>> it has to restore these peculiar ids. >>>> >>>> Therefore make the assignment of ipc ids match the description in >>>> Advanced Programming in the Unix Environment and assign the next id >>>> until INT_MAX is hit then loop around to the lower ids. >>>> >>>> This can be implemented trivially with the current code using idr_alloc_cyclic. >>>> >>>> To make it possible to keep checkpoint/restore working I have renamed >>>> the sysctls from xxx_next_id to xxx_nextid. That is enough change that >>>> a smart CRIU implementation can see that what is exported has changed, >>>> and act accordingly. New kernels will be able to restore the old id's. >>>> >>>> This code still needs some real world testing to verify my assumptions. >>>> And some work with the CRIU implementations to actually add the code >>>> that deals with the new for of id assignment. >>>> >>>> Updates: 03f595668017 ("ipc: add sysctl to specify desired next object id") >>>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> >>>> --- >>>> >>>> Waiman please take a look at this and run it through some tests etc, >>>> I am pretty certain something like this patch is all you need to do >>>> to sort out ipc assignment. Not messing with sysctls needed. >>>> >>>> include/linux/ipc.h | 2 -- >>>> include/linux/ipc_namespace.h | 1 - >>>> ipc/ipc_sysctl.c | 6 ++-- >>>> ipc/namespace.c | 11 ++---- >>>> ipc/util.c | 80 ++++++++++--------------------------------- >>>> ipc/util.h | 11 +----- >>>> 6 files changed, 25 insertions(+), 86 deletions(-) >>>> >>>> diff --git a/include/linux/ipc.h b/include/linux/ipc.h >>>> index 821b2f260992..6cc2df7f7ac9 100644 >>>> --- a/include/linux/ipc.h >>>> +++ b/include/linux/ipc.h >>>> @@ -8,8 +8,6 @@ >>>> #include <uapi/linux/ipc.h> >>>> #include <linux/refcount.h> >>>> >>>> -#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ >>>> - >>>> /* used by in-kernel data structures */ >>>> struct kern_ipc_perm { >>>> spinlock_t lock; >>>> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h >>>> index b5630c8eb2f3..cab33b6a8236 100644 >>>> --- a/include/linux/ipc_namespace.h >>>> +++ b/include/linux/ipc_namespace.h >>>> @@ -15,7 +15,6 @@ struct user_namespace; >>>> >>>> struct ipc_ids { >>>> int in_use; >>>> - unsigned short seq; >>>> bool tables_initialized; >>>> struct rw_semaphore rwsem; >>>> struct idr ipcs_idr; >>>> diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c >>>> index 8ad93c29f511..a599963d58bf 100644 >>>> --- a/ipc/ipc_sysctl.c >>>> +++ b/ipc/ipc_sysctl.c >>>> @@ -176,7 +176,7 @@ static struct ctl_table ipc_kern_table[] = { >>>> }, >>>> #ifdef CONFIG_CHECKPOINT_RESTORE >>>> { >>>> - .procname = "sem_next_id", >>>> + .procname = "sem_nextid", >>>> .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, >>>> .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), >>>> .mode = 0644, >>>> @@ -185,7 +185,7 @@ static struct ctl_table ipc_kern_table[] = { >>>> .extra2 = &int_max, >>>> }, >>>> { >>>> - .procname = "msg_next_id", >>>> + .procname = "msg_nextid", >>>> .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, >>>> .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), >>>> .mode = 0644, >>>> @@ -194,7 +194,7 @@ static struct ctl_table ipc_kern_table[] = { >>>> .extra2 = &int_max, >>>> }, >>>> { >>>> - .procname = "shm_next_id", >>>> + .procname = "shm_nextid", >>>> .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, >>>> .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), >>>> .mode = 0644, >>> So you are changing the names of existing sysctl parameters. Will it be >>> better to add new sysctl to indicate that the rule has changed >>> instead? >> In practice I am replacing one set of sysctls with another, that work >> very similarly but not quite the same. As we can't keep the existing >> semantics removing the old sysctl seems correct. Likewise adding >> a new sysctl with slightly changed semantics seems correct. >> >> This needs an accompanying patch to CRIU to see which sysctls are >> available and to change it's behavior based upon that. The practical >> question is what makes it easiest not to confuse CRIU. >> >> Not having the sysctl should be something that CRIU detects today >> and the old versions should fail gracefully. But testing is needed. >> Adding a new sysctl to say the behavior has changed and reusing the >> old names won't have the same effect of disabling existing versions >> of CRIU. > >That is fine as long as CRIU is the only user. > >> >>> I don't know the history why the id management of SysV IPC was designed >>> in such a convoluted way, but the patch does make sense to me. >> I don't have the full history and we might wind up finding more as we >> run this patch through it's paces. >> >> The earliest history I know is what I read in Advanced Programming in >> the Unix Environment (which predates linux). It described the ipc ids >> as assigned from a counter that wraps. I thought like my patch >> implements. On closer reading it has a counter that increases each time >> the slot is used, and then wraps. Exactly like Linux before my patch. >> *Grrr* >> >> The existing structure of the bifurcated is present in Linux 1.0. At >> that time SHMMNI was 256. SHMMNI was the size of a static array of shm >> segments. The high 24 bits held a sequence number that was incremented >> when a segment was removed at the time. Presumably the upper bits were >> incremented to avoid swiftly reusing the same shm ids. >> >> Hmm. I took a quick look at FreeBSD10 and it has the exact same split >> in the id. So userspace may actually depend upon that split. > >Backward compatibility is the part that I am most worry about this >patch. That is also the reason I asked why the ID is generated in such a >way. I share these fears. Thanks, Davidlohr > >My original thinking was to have an extended mode where the IPCMNI >becomes 8M from 32k. That will reduce the sequence number from 16 bits >to 8 bits. The extended mode is enabled by adding, for example, a boot >option. So this will be an opt-in feature instead of as a default. > >> >> Which comes down to the fundamental question what depends upon what. >> How do other operating systems like Solaris handle this? > >I don't know how Solaris handle this, but I know they support up to 2^24 >shm segments. > >> >> Does any nix flavor support more that 16bits worth of shm segments? >> >> The API has been deprecated for the last 20 years and we are still >> keeping it alive. Sigh. >> >> Still there is fundamentally only one thing the kernel can do if we wish >> to increase the number of shm segments. >> >> Please take my patch and test it out and see if you can find anything >> that cares about the change. Except for needing id reuse to be >> infrequent I can not imagine that there is anything that cares. >> >> It could very reasonably be argued that my when shmmni is < INT_MAX >> my patch implements a version of the existing algorithm. As we go >> through all of the possible ids before we reuse any of them. >> >> Eric >> >Thanks for the patch, I am still thinking about what is the best way to >handle this. > >Cheers, >Longman > >
Hello together, On 03/29/2018 04:14 AM, Davidlohr Bueso wrote: > Cc'ing mtk, Manfred and linux-api. > > See below. > > On Thu, 15 Mar 2018, Waiman Long wrote: > >> On 03/15/2018 03:00 PM, Eric W. Biederman wrote: >>> Waiman Long <longman@redhat.com> writes: >>> >>>> On 03/14/2018 08:49 PM, Eric W. Biederman wrote: >>>>> The define IPCMNI was originally the size of a statically sized >>>>> array in >>>>> the kernel and that has long since been removed. Therefore there >>>>> is no >>>>> fundamental reason for IPCMNI. >>>>> >>>>> The only remaining use IPCMNI serves is as a convoluted way to format >>>>> the ipc id to userspace. It does not appear that anything except for >>>>> the CHECKPOINT_RESTORE code even cares about this variety of >>>>> assignment >>>>> and the CHECKPOINT_RESTORE code only cares about this weirdness >>>>> because >>>>> it has to restore these peculiar ids. >>>>> My assumption is that if an array is recreated, it should get a different id. a=semget(1234,,); semctl(a,,IPC_RMID); b=semget(1234,,); now a!=b. Rational: semop() calls only refer to the array by the id. If there is a stale process in the system that tries to access the "old" array and the new array has the same id, then the locking gets corrupted. >>>>> Therefore make the assignment of ipc ids match the description in >>>>> Advanced Programming in the Unix Environment and assign the next id >>>>> until INT_MAX is hit then loop around to the lower ids. >>>>> Ok, sounds good. That way we really cycle through INT_MAX, right now a==b would happen after 128k RMID calls. >>>>> This can be implemented trivially with the current code using >>>>> idr_alloc_cyclic. >>>>> Is there a performance impact? Right now, the idr tree is only large if there are lots of objects. What happens if we have only 1 object, with id=INT_MAX-1? semop() that do not sleep are fairly fast. The same applies for msgsnd/msgrcv, if the message is small enough. @Davidlohr: Do you know if there are application that frequently call semop() and it doesn't have to sleep? From the scalability that was pushed into the kernel, I assume that this exists. I have myself only checked postgresql, and postgresql always sleeps. (and this was long ago) >>>>> To make it possible to keep checkpoint/restore working I have renamed >>>>> the sysctls from xxx_next_id to xxx_nextid. That is enough change >>>>> that >>>>> a smart CRIU implementation can see that what is exported has >>>>> changed, >>>>> and act accordingly. New kernels will be able to restore the old >>>>> id's. >>>>> >>>>> This code still needs some real world testing to verify my >>>>> assumptions. >>>>> And some work with the CRIU implementations to actually add the code >>>>> that deals with the new for of id assignment. >>>>> It means that all existing checkpoint/restore application will not work with a new kernel. Everyone must first update the checkpoint/restore application, then update the kernel. Is this acceptable? -- Manfred
On Thu, Mar 29, 2018 at 10:47:45AM +0200, Manfred Spraul wrote: > > > > > > This can be implemented trivially with the current code > > > > > > using idr_alloc_cyclic. > > Is there a performance impact? > Right now, the idr tree is only large if there are lots of objects. > What happens if we have only 1 object, with id=INT_MAX-1? The radix tree uses a branching factor of 64 entries (6 bits) per level. The maximum ID is 31 bits (positive signed 32-bit integer). So the worst case for a single object is 6 pointer dereferences to find the object anywhere in the range (INT_MAX/2 - INT_MAX]. That will read 12 cachelines. If we were to constrain ourselves to a maximum of INT_MAX/2 (30 bits), we'd reduce that to 5 pointer dereferences and 10 cachelines. I have plans to make the representation more efficient and bring the specific case of one element with a high ID down to one pointer dereference and 2 cachelines, but I have not yet had time to implement those plans. From a memory consumption point of view, 6 layers of tree will consume 6/7 of a page on a 64-bit x86 kernel. I'm aiming to bring that down to 1/7 of a page. We get 7 radix_tree_nodes per 4kB page. (The old IDR tree had 256 entries per level which would have taken only four layers to get us to 31 bits, but the cost was getting only 3 layers per 8kB order-1 page, so we'd've taken 2 + 2/3 page to accomplish the same goal).
Hello Mathew, On 03/29/2018 12:56 PM, Matthew Wilcox wrote: > On Thu, Mar 29, 2018 at 10:47:45AM +0200, Manfred Spraul wrote: >>>>>>> This can be implemented trivially with the current code >>>>>>> using idr_alloc_cyclic. >> Is there a performance impact? >> Right now, the idr tree is only large if there are lots of objects. >> What happens if we have only 1 object, with id=INT_MAX-1? > The radix tree uses a branching factor of 64 entries (6 bits) per level. > The maximum ID is 31 bits (positive signed 32-bit integer). So the > worst case for a single object is 6 pointer dereferences to find the > object anywhere in the range (INT_MAX/2 - INT_MAX]. That will read 12 > cachelines. If we were to constrain ourselves to a maximum of INT_MAX/2 > (30 bits), we'd reduce that to 5 pointer dereferences and 10 cachelines. I'm concerned about the up to 6 branches. But this is just guessing, we need a test with a realistic workload. -- Manfred
Manfred Spraul <manfred@colorfullife.com> writes: > Hello Mathew, > > On 03/29/2018 12:56 PM, Matthew Wilcox wrote: >> On Thu, Mar 29, 2018 at 10:47:45AM +0200, Manfred Spraul wrote: >>>>>>>> This can be implemented trivially with the current code >>>>>>>> using idr_alloc_cyclic. >>> Is there a performance impact? >>> Right now, the idr tree is only large if there are lots of objects. >>> What happens if we have only 1 object, with id=INT_MAX-1? >> The radix tree uses a branching factor of 64 entries (6 bits) per level. >> The maximum ID is 31 bits (positive signed 32-bit integer). So the >> worst case for a single object is 6 pointer dereferences to find the >> object anywhere in the range (INT_MAX/2 - INT_MAX]. That will read 12 >> cachelines. If we were to constrain ourselves to a maximum of INT_MAX/2 >> (30 bits), we'd reduce that to 5 pointer dereferences and 10 cachelines. > I'm concerned about the up to 6 branches. > But this is just guessing, we need a test with a realistic workload. Yes. My primary purpose with the patch was to show that the issues with the current limits could be resolved in a straght forward manner. I really don't know if idrs are the appropriate data structure. It is possible rbtrees are a better fit. I think my algorithm I proposed for generating identifiers is as likely as any to be a good one. It does need testing on a wide variety of applications to see what applications actually care about and for that I think my proposed patch is more than sufficient. By not keeping a generation counters in the slots themselves linux already differs substantially from traditional implementations. Doing something to free us from using a fixed number of bits for the counter and a fixed number of bits to encode the slot we can support much larger use of this API. Eric
On Thu, Mar 29, 2018 at 08:07:44PM +0200, Manfred Spraul wrote: > Hello Mathew, > > On 03/29/2018 12:56 PM, Matthew Wilcox wrote: > > On Thu, Mar 29, 2018 at 10:47:45AM +0200, Manfred Spraul wrote: > > > > > > > > This can be implemented trivially with the current code > > > > > > > > using idr_alloc_cyclic. > > > Is there a performance impact? > > > Right now, the idr tree is only large if there are lots of objects. > > > What happens if we have only 1 object, with id=INT_MAX-1? > > The radix tree uses a branching factor of 64 entries (6 bits) per level. > > The maximum ID is 31 bits (positive signed 32-bit integer). So the > > worst case for a single object is 6 pointer dereferences to find the > > object anywhere in the range (INT_MAX/2 - INT_MAX]. That will read 12 > > cachelines. If we were to constrain ourselves to a maximum of INT_MAX/2 > > (30 bits), we'd reduce that to 5 pointer dereferences and 10 cachelines. > I'm concerned about the up to 6 branches. > But this is just guessing, we need a test with a realistic workload. Yes, and once there's a realistic workload, I'll be happy to prioritise adapting the data structure to reduce the pointer chases. FWIW, the plan is this: There's currently an unused 32-bit field (on 64-bit machines) which I plan to make the 'base' field. So at each step down the tree, one subtracts that field from the index in order to decide which slot to look at next. Something like this: index=0x3000'0000 (root) -> order=24 offset=48 -> order=18 offset=0 -> order=12 offset=0 -> order=6 offset=0 -> order=0 offset=0 -> data compresses to a single node: (root) -> order=0 base=3000'0000 offset=0 -> data If one has one entry at 5 and another entry at 0x3000'0000, the tree looks like this (three nodes): (root) -> order=24 base=0 offset=0 -> order=0 base=0 offset=5 -> entry1 offset=48 -> order=0 base=0 offset=0 -> entry2 The trick is making sure that looking up offset 0x300'1000 returns NULL and not entry2, but I can make that work. An alternative is to go to something a little more libJudy and have mutating internal nodes in the tree that can represent this kind of situation in a more compact form. There's a tradeoff to be made between simplicity of implementation, cost of insertion, cost of lookup and memory consumption. I don't know where the right balance point is yet.
diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 821b2f260992..6cc2df7f7ac9 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -8,8 +8,6 @@ #include <uapi/linux/ipc.h> #include <linux/refcount.h> -#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ - /* used by in-kernel data structures */ struct kern_ipc_perm { spinlock_t lock; diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index b5630c8eb2f3..cab33b6a8236 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -15,7 +15,6 @@ struct user_namespace; struct ipc_ids { int in_use; - unsigned short seq; bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 8ad93c29f511..a599963d58bf 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -176,7 +176,7 @@ static struct ctl_table ipc_kern_table[] = { }, #ifdef CONFIG_CHECKPOINT_RESTORE { - .procname = "sem_next_id", + .procname = "sem_nextid", .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), .mode = 0644, @@ -185,7 +185,7 @@ static struct ctl_table ipc_kern_table[] = { .extra2 = &int_max, }, { - .procname = "msg_next_id", + .procname = "msg_nextid", .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), .mode = 0644, @@ -194,7 +194,7 @@ static struct ctl_table ipc_kern_table[] = { .extra2 = &int_max, }, { - .procname = "shm_next_id", + .procname = "shm_nextid", .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), .mode = 0644, diff --git a/ipc/namespace.c b/ipc/namespace.c index f59a89966f92..84eaeba9e96c 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -109,20 +109,13 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, { struct kern_ipc_perm *perm; int next_id; - int total, in_use; down_write(&ids->rwsem); - - in_use = ids->in_use; - - for (total = 0, next_id = 0; total < in_use; next_id++) { - perm = idr_find(&ids->ipcs_idr, next_id); - if (perm == NULL) - continue; + next_id = 0; + while ((perm = idr_get_next(&ids->ipcs_idr, &next_id))) { rcu_read_lock(); ipc_lock_object(perm); free(ns, perm); - total++; } up_write(&ids->rwsem); } diff --git a/ipc/util.c b/ipc/util.c index 4ed5a17dd06f..ce6bf18e54df 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -118,7 +118,6 @@ int ipc_init_ids(struct ipc_ids *ids) { int err; ids->in_use = 0; - ids->seq = 0; init_rwsem(&ids->rwsem); err = rhashtable_init(&ids->key_ht, &ipc_kht_params); if (err) @@ -192,46 +191,18 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) return NULL; } -#ifdef CONFIG_CHECKPOINT_RESTORE -/* - * Specify desired id for next allocated IPC object. - */ -#define ipc_idr_alloc(ids, new) \ - idr_alloc(&(ids)->ipcs_idr, (new), \ - (ids)->next_id < 0 ? 0 : ipcid_to_idx((ids)->next_id),\ - 0, GFP_NOWAIT) -static inline int ipc_buildid(int id, struct ipc_ids *ids, - struct kern_ipc_perm *new) +static int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { - if (ids->next_id < 0) { /* default, behave as !CHECKPOINT_RESTORE */ - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; - } else { - new->seq = ipcid_to_seqx(ids->next_id); +#ifdef CONFIG_CHECKPOINT_RESTORE + if (ids->next_id >= 0) { + idr_set_cursor(&ids->ipcs_idr, ids->next_id); ids->next_id = -1; } - - return SEQ_MULTIPLIER * new->seq + id; -} - -#else -#define ipc_idr_alloc(ids, new) \ - idr_alloc(&(ids)->ipcs_idr, (new), 0, 0, GFP_NOWAIT) - -static inline int ipc_buildid(int id, struct ipc_ids *ids, - struct kern_ipc_perm *new) -{ - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; - - return SEQ_MULTIPLIER * new->seq + id; +#endif + return idr_alloc_cyclic(&ids->ipcs_idr, (new), 0, 0, GFP_NOWAIT); } -#endif /* CONFIG_CHECKPOINT_RESTORE */ - /** * ipc_addid - add an ipc identifier * @ids: ipc identifier set @@ -251,9 +222,6 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) kgid_t egid; int id, err; - if (limit > IPCMNI) - limit = IPCMNI; - if (!ids->tables_initialized || ids->in_use >= limit) return -ENOSPC; @@ -290,7 +258,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) if (id > ids->max_id) ids->max_id = id; - new->id = ipc_buildid(id, ids, new); + new->id = id; return id; } @@ -430,7 +398,7 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) */ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { - int lid = ipcid_to_idx(ipcp->id); + int lid = ipcp->id; idr_remove(&ids->ipcs_idr, lid); ipc_kht_remove(ids, ipcp); @@ -563,7 +531,7 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out; - int lid = ipcid_to_idx(id); + int lid = id; if (unlikely(!ids->tables_initialized)) return ERR_PTR(-EINVAL); @@ -757,30 +725,20 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, loff_t *new_pos) { struct kern_ipc_perm *ipc; - int total, id; - - total = 0; - for (id = 0; id < pos && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) - total++; - } + int id; - if (total >= ids->in_use) + /* Out of range - return NULL to terminate iteration */ + if (pos > INT_MAX) return NULL; - for (; pos < IPCMNI; pos++) { - ipc = idr_find(&ids->ipcs_idr, pos); - if (ipc != NULL) { - *new_pos = pos + 1; - rcu_read_lock(); - ipc_lock_object(ipc); - return ipc; - } - } + ipc = idr_get_next(&ids->ipcs_idr, &id); + if (!ipc) + return NULL; - /* Out of range - return NULL to terminate iteration */ - return NULL; + *new_pos = id + 1; + rcu_read_lock(); + ipc_lock_object(ipc); + return ipc; } static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) diff --git a/ipc/util.h b/ipc/util.h index 89b8ec176fc4..de8e27367f0c 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -15,8 +15,6 @@ #include <linux/err.h> #include <linux/ipc_namespace.h> -#define SEQ_MULTIPLIER (IPCMNI) - int sem_init(void); int msg_init(void); void shm_init(void); @@ -93,10 +91,6 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define IPC_MSG_IDS 1 #define IPC_SHM_IDS 2 -#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) -#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX) - /* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); @@ -120,9 +114,6 @@ static inline int ipc_get_maxid(struct ipc_ids *ids) if (ids->in_use == 0) return -1; - if (ids->in_use == IPCMNI) - return IPCMNI - 1; - return ids->max_id; } @@ -163,7 +154,7 @@ extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) { - return uid / SEQ_MULTIPLIER != ipcp->seq; + return uid != ipcp->seq; } static inline void ipc_lock_object(struct kern_ipc_perm *perm)
The define IPCMNI was originally the size of a statically sized array in the kernel and that has long since been removed. Therefore there is no fundamental reason for IPCMNI. The only remaining use IPCMNI serves is as a convoluted way to format the ipc id to userspace. It does not appear that anything except for the CHECKPOINT_RESTORE code even cares about this variety of assignment and the CHECKPOINT_RESTORE code only cares about this weirdness because it has to restore these peculiar ids. Therefore make the assignment of ipc ids match the description in Advanced Programming in the Unix Environment and assign the next id until INT_MAX is hit then loop around to the lower ids. This can be implemented trivially with the current code using idr_alloc_cyclic. To make it possible to keep checkpoint/restore working I have renamed the sysctls from xxx_next_id to xxx_nextid. That is enough change that a smart CRIU implementation can see that what is exported has changed, and act accordingly. New kernels will be able to restore the old id's. This code still needs some real world testing to verify my assumptions. And some work with the CRIU implementations to actually add the code that deals with the new for of id assignment. Updates: 03f595668017 ("ipc: add sysctl to specify desired next object id") Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> --- Waiman please take a look at this and run it through some tests etc, I am pretty certain something like this patch is all you need to do to sort out ipc assignment. Not messing with sysctls needed. include/linux/ipc.h | 2 -- include/linux/ipc_namespace.h | 1 - ipc/ipc_sysctl.c | 6 ++-- ipc/namespace.c | 11 ++---- ipc/util.c | 80 ++++++++++--------------------------------- ipc/util.h | 11 +----- 6 files changed, 25 insertions(+), 86 deletions(-)