Message ID | 20190910194126.21144-1-idryomov@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | libceph: use ceph_kvmalloc() for osdmap arrays | expand |
On Tue, 2019-09-10 at 21:41 +0200, Ilya Dryomov wrote: > osdmap has a bunch of arrays that grow linearly with the number of > OSDs. osd_state, osd_weight and osd_primary_affinity take 4 bytes per > OSD. osd_addr takes 136 bytes per OSD because of sockaddr_storage. > The CRUSH workspace area also grows linearly with the number of OSDs. > > Normally these arrays are allocated at client startup. The osdmap is > usually updated in small incrementals, but once in a while a full map > may need to be processed. For a cluster with 10000 OSDs, this means > a bunch of 40K allocations followed by a 1.3M allocation, all of which > are currently required to be physically contiguous. This results in > sporadic ENOMEM errors, hanging the client. > > Go back to manually (re)allocating arrays and use ceph_kvmalloc() to > fall back to non-contiguous allocation when necessary. > > Link: https://tracker.ceph.com/issues/40481 > Signed-off-by: Ilya Dryomov <idryomov@gmail.com> > --- > net/ceph/osdmap.c | 69 +++++++++++++++++++++++++++++------------------ > 1 file changed, 43 insertions(+), 26 deletions(-) > > diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c > index 90437906b7bc..4e0de14f80bb 100644 > --- a/net/ceph/osdmap.c > +++ b/net/ceph/osdmap.c > @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) > struct ceph_pg_pool_info, node); > __remove_pg_pool(&map->pg_pools, pi); > } > - kfree(map->osd_state); > - kfree(map->osd_weight); > - kfree(map->osd_addr); > - kfree(map->osd_primary_affinity); > - kfree(map->crush_workspace); > + kvfree(map->osd_state); > + kvfree(map->osd_weight); > + kvfree(map->osd_addr); > + kvfree(map->osd_primary_affinity); > + kvfree(map->crush_workspace); > kfree(map); > } > > @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) > * > * The new elements are properly initialized. > */ > -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) > +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) > { > u32 *state; > u32 *weight; > struct ceph_entity_addr *addr; > + u32 to_copy; > int i; > > - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); > - if (!state) > - return -ENOMEM; > - map->osd_state = state; > + dout("%s old %u new %u\n", __func__, map->max_osd, max); > + if (max == map->max_osd) > + return 0; > > - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); > - if (!weight) > + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); > + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); > + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); Is GFP_NOFS sufficient here, given that this may be called from rbd? Should we be using NOIO instead (or maybe the PF_MEMALLOC_* equivalent)? > + if (!state || !weight || !addr) { > + kvfree(state); > + kvfree(weight); > + kvfree(addr); > return -ENOMEM; > - map->osd_weight = weight; > + } > > - addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); > - if (!addr) > - return -ENOMEM; > - map->osd_addr = addr; > + to_copy = min(map->max_osd, max); > + if (map->osd_state) { > + memcpy(state, map->osd_state, to_copy * sizeof(*state)); > + memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); > + memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); > + kvfree(map->osd_state); > + kvfree(map->osd_weight); > + kvfree(map->osd_addr); > + } > > + map->osd_state = state; > + map->osd_weight = weight; > + map->osd_addr = addr; > for (i = map->max_osd; i < max; i++) { > map->osd_state[i] = 0; > map->osd_weight[i] = CEPH_OSD_OUT; > @@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) > if (map->osd_primary_affinity) { > u32 *affinity; > > - affinity = krealloc(map->osd_primary_affinity, > - max*sizeof(*affinity), GFP_NOFS); > + affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), > + GFP_NOFS); > if (!affinity) > return -ENOMEM; > - map->osd_primary_affinity = affinity; > > + memcpy(affinity, map->osd_primary_affinity, > + to_copy * sizeof(*affinity)); > + kvfree(map->osd_primary_affinity); > + > + map->osd_primary_affinity = affinity; > for (i = map->max_osd; i < max; i++) > map->osd_primary_affinity[i] = > CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; > @@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) > > work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); > dout("%s work_size %zu bytes\n", __func__, work_size); > - workspace = kmalloc(work_size, GFP_NOIO); > + workspace = ceph_kvmalloc(work_size, GFP_NOIO); > if (!workspace) { > crush_destroy(crush); > return -ENOMEM; > @@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) > > if (map->crush) > crush_destroy(map->crush); > - kfree(map->crush_workspace); > + kvfree(map->crush_workspace); > map->crush = crush; > map->crush_workspace = workspace; > return 0; > @@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) > if (!map->osd_primary_affinity) { > int i; > > - map->osd_primary_affinity = kmalloc_array(map->max_osd, > - sizeof(u32), > - GFP_NOFS); > + map->osd_primary_affinity = ceph_kvmalloc( > + array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), > + GFP_NOFS); > if (!map->osd_primary_affinity) > return -ENOMEM; > > @@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end, > > ceph_decode_32_safe(p, end, len, e_inval); > if (len == 0) { > - kfree(map->osd_primary_affinity); > + kvfree(map->osd_primary_affinity); > map->osd_primary_affinity = NULL; > return 0; > }
On Wed, Sep 11, 2019 at 4:54 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Tue, 2019-09-10 at 21:41 +0200, Ilya Dryomov wrote: > > osdmap has a bunch of arrays that grow linearly with the number of > > OSDs. osd_state, osd_weight and osd_primary_affinity take 4 bytes per > > OSD. osd_addr takes 136 bytes per OSD because of sockaddr_storage. > > The CRUSH workspace area also grows linearly with the number of OSDs. > > > > Normally these arrays are allocated at client startup. The osdmap is > > usually updated in small incrementals, but once in a while a full map > > may need to be processed. For a cluster with 10000 OSDs, this means > > a bunch of 40K allocations followed by a 1.3M allocation, all of which > > are currently required to be physically contiguous. This results in > > sporadic ENOMEM errors, hanging the client. > > > > Go back to manually (re)allocating arrays and use ceph_kvmalloc() to > > fall back to non-contiguous allocation when necessary. > > > > Link: https://tracker.ceph.com/issues/40481 > > Signed-off-by: Ilya Dryomov <idryomov@gmail.com> > > --- > > net/ceph/osdmap.c | 69 +++++++++++++++++++++++++++++------------------ > > 1 file changed, 43 insertions(+), 26 deletions(-) > > > > diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c > > index 90437906b7bc..4e0de14f80bb 100644 > > --- a/net/ceph/osdmap.c > > +++ b/net/ceph/osdmap.c > > @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) > > struct ceph_pg_pool_info, node); > > __remove_pg_pool(&map->pg_pools, pi); > > } > > - kfree(map->osd_state); > > - kfree(map->osd_weight); > > - kfree(map->osd_addr); > > - kfree(map->osd_primary_affinity); > > - kfree(map->crush_workspace); > > + kvfree(map->osd_state); > > + kvfree(map->osd_weight); > > + kvfree(map->osd_addr); > > + kvfree(map->osd_primary_affinity); > > + kvfree(map->crush_workspace); > > kfree(map); > > } > > > > @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) > > * > > * The new elements are properly initialized. > > */ > > -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) > > +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) > > { > > u32 *state; > > u32 *weight; > > struct ceph_entity_addr *addr; > > + u32 to_copy; > > int i; > > > > - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); > > - if (!state) > > - return -ENOMEM; > > - map->osd_state = state; > > + dout("%s old %u new %u\n", __func__, map->max_osd, max); > > + if (max == map->max_osd) > > + return 0; > > > > - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); > > - if (!weight) > > + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); > > + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); > > + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); > > Is GFP_NOFS sufficient here, given that this may be called from rbd? > Should we be using NOIO instead (or maybe the PF_MEMALLOC_* equivalent)? It should be NOIO, but it has been this way forever, so I kept it (keeping the future conversion to scopes that I mentioned in another email in mind). Thanks, Ilya
On Wed, 2019-09-11 at 17:08 +0200, Ilya Dryomov wrote: > On Wed, Sep 11, 2019 at 4:54 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Tue, 2019-09-10 at 21:41 +0200, Ilya Dryomov wrote: > > > osdmap has a bunch of arrays that grow linearly with the number of > > > OSDs. osd_state, osd_weight and osd_primary_affinity take 4 bytes per > > > OSD. osd_addr takes 136 bytes per OSD because of sockaddr_storage. > > > The CRUSH workspace area also grows linearly with the number of OSDs. > > > > > > Normally these arrays are allocated at client startup. The osdmap is > > > usually updated in small incrementals, but once in a while a full map > > > may need to be processed. For a cluster with 10000 OSDs, this means > > > a bunch of 40K allocations followed by a 1.3M allocation, all of which > > > are currently required to be physically contiguous. This results in > > > sporadic ENOMEM errors, hanging the client. > > > > > > Go back to manually (re)allocating arrays and use ceph_kvmalloc() to > > > fall back to non-contiguous allocation when necessary. > > > > > > Link: https://tracker.ceph.com/issues/40481 > > > Signed-off-by: Ilya Dryomov <idryomov@gmail.com> > > > --- > > > net/ceph/osdmap.c | 69 +++++++++++++++++++++++++++++------------------ > > > 1 file changed, 43 insertions(+), 26 deletions(-) > > > > > > diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c > > > index 90437906b7bc..4e0de14f80bb 100644 > > > --- a/net/ceph/osdmap.c > > > +++ b/net/ceph/osdmap.c > > > @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) > > > struct ceph_pg_pool_info, node); > > > __remove_pg_pool(&map->pg_pools, pi); > > > } > > > - kfree(map->osd_state); > > > - kfree(map->osd_weight); > > > - kfree(map->osd_addr); > > > - kfree(map->osd_primary_affinity); > > > - kfree(map->crush_workspace); > > > + kvfree(map->osd_state); > > > + kvfree(map->osd_weight); > > > + kvfree(map->osd_addr); > > > + kvfree(map->osd_primary_affinity); > > > + kvfree(map->crush_workspace); > > > kfree(map); > > > } > > > > > > @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) > > > * > > > * The new elements are properly initialized. > > > */ > > > -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) > > > +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) > > > { > > > u32 *state; > > > u32 *weight; > > > struct ceph_entity_addr *addr; > > > + u32 to_copy; > > > int i; > > > > > > - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); > > > - if (!state) > > > - return -ENOMEM; > > > - map->osd_state = state; > > > + dout("%s old %u new %u\n", __func__, map->max_osd, max); > > > + if (max == map->max_osd) > > > + return 0; > > > > > > - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); > > > - if (!weight) > > > + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); > > > + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); > > > + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); > > > > Is GFP_NOFS sufficient here, given that this may be called from rbd? > > Should we be using NOIO instead (or maybe the PF_MEMALLOC_* equivalent)? > > It should be NOIO, but it has been this way forever, so I kept it > (keeping the future conversion to scopes that I mentioned in another > email in mind). > Fair enough then. You can add my Reviewed-by:
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 90437906b7bc..4e0de14f80bb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) struct ceph_pg_pool_info, node); __remove_pg_pool(&map->pg_pools, pi); } - kfree(map->osd_state); - kfree(map->osd_weight); - kfree(map->osd_addr); - kfree(map->osd_primary_affinity); - kfree(map->crush_workspace); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + kvfree(map->osd_primary_affinity); + kvfree(map->crush_workspace); kfree(map); } @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) * * The new elements are properly initialized. */ -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) { u32 *state; u32 *weight; struct ceph_entity_addr *addr; + u32 to_copy; int i; - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); - if (!state) - return -ENOMEM; - map->osd_state = state; + dout("%s old %u new %u\n", __func__, map->max_osd, max); + if (max == map->max_osd) + return 0; - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); - if (!weight) + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + if (!state || !weight || !addr) { + kvfree(state); + kvfree(weight); + kvfree(addr); return -ENOMEM; - map->osd_weight = weight; + } - addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); - if (!addr) - return -ENOMEM; - map->osd_addr = addr; + to_copy = min(map->max_osd, max); + if (map->osd_state) { + memcpy(state, map->osd_state, to_copy * sizeof(*state)); + memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); + memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + } + map->osd_state = state; + map->osd_weight = weight; + map->osd_addr = addr; for (i = map->max_osd; i < max; i++) { map->osd_state[i] = 0; map->osd_weight[i] = CEPH_OSD_OUT; @@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = krealloc(map->osd_primary_affinity, - max*sizeof(*affinity), GFP_NOFS); + affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + GFP_NOFS); if (!affinity) return -ENOMEM; - map->osd_primary_affinity = affinity; + memcpy(affinity, map->osd_primary_affinity, + to_copy * sizeof(*affinity)); + kvfree(map->osd_primary_affinity); + + map->osd_primary_affinity = affinity; for (i = map->max_osd; i < max; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; @@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - workspace = kmalloc(work_size, GFP_NOIO); + workspace = ceph_kvmalloc(work_size, GFP_NOIO); if (!workspace) { crush_destroy(crush); return -ENOMEM; @@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) if (map->crush) crush_destroy(map->crush); - kfree(map->crush_workspace); + kvfree(map->crush_workspace); map->crush = crush; map->crush_workspace = workspace; return 0; @@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = kmalloc_array(map->max_osd, - sizeof(u32), - GFP_NOFS); + map->osd_primary_affinity = ceph_kvmalloc( + array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), + GFP_NOFS); if (!map->osd_primary_affinity) return -ENOMEM; @@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end, ceph_decode_32_safe(p, end, len, e_inval); if (len == 0) { - kfree(map->osd_primary_affinity); + kvfree(map->osd_primary_affinity); map->osd_primary_affinity = NULL; return 0; }
osdmap has a bunch of arrays that grow linearly with the number of OSDs. osd_state, osd_weight and osd_primary_affinity take 4 bytes per OSD. osd_addr takes 136 bytes per OSD because of sockaddr_storage. The CRUSH workspace area also grows linearly with the number of OSDs. Normally these arrays are allocated at client startup. The osdmap is usually updated in small incrementals, but once in a while a full map may need to be processed. For a cluster with 10000 OSDs, this means a bunch of 40K allocations followed by a 1.3M allocation, all of which are currently required to be physically contiguous. This results in sporadic ENOMEM errors, hanging the client. Go back to manually (re)allocating arrays and use ceph_kvmalloc() to fall back to non-contiguous allocation when necessary. Link: https://tracker.ceph.com/issues/40481 Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- net/ceph/osdmap.c | 69 +++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 26 deletions(-)