[2/4] libceph: a per-osdc crush scratch buffer

Message ID 1391435228-27874-3-git-send-email-ilya.dryomov@inktank.com (mailing list archive)
State New, archived

Commit Message

Ilya Dryomov Feb. 3, 2014, 1:47 p.m. UTC
With the addition of erasure coding support in the future, scratch
variable-length array in crush_do_rule_ary() is going to grow to at
least 200 bytes on average, on top of another 128 bytes consumed by
rawosd/osd arrays in the call chain.  Replace it with a buffer inside
struct osdmap and a mutex.  This shouldn't result in any contention,
because all osd requests were already serialized by request_mutex at
that point; the only unlocked caller was ceph_ioctl_get_dataloc().

Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
---
 include/linux/ceph/osdmap.h |    3 +++
 net/ceph/osdmap.c           |   25 ++++++++++++++++---------
 2 files changed, 19 insertions(+), 9 deletions(-)
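
In isolation, the change boils down to a common pattern: instead of allocating a variable-length scratch array on the stack for every call, keep one fixed-size scratch buffer inside the long-lived structure and take a mutex while using it. A minimal userspace C sketch of that pattern follows; the struct, function names and pthread locking are hypothetical stand-ins, not the kernel code in the patch.

	#include <pthread.h>
	#include <string.h>

	#define MAX_RESULTS 16			/* stand-in for CEPH_PG_MAX_SIZE */

	struct mapper {
		/* ... long-lived mapping state would live here ... */
		pthread_mutex_t scratch_mutex;	/* serializes use of scratch[] */
		int scratch[MAX_RESULTS * 3];	/* preallocated working space */
	};

	static void mapper_init(struct mapper *m)
	{
		pthread_mutex_init(&m->scratch_mutex, NULL);
	}

	/*
	 * Before: "int scratch[result_max * 3];" -- a variable-length array
	 * on the caller's stack.  After: borrow the per-object buffer under
	 * the mutex instead, so the stack footprint stays constant.
	 */
	static int mapper_compute(struct mapper *m, int *result, int result_max)
	{
		int r;

		if (result_max > MAX_RESULTS)	/* stay within the fixed buffer */
			return -1;

		pthread_mutex_lock(&m->scratch_mutex);
		memset(m->scratch, 0, sizeof(m->scratch));
		/* ... the real computation would use m->scratch as working space ... */
		memcpy(result, m->scratch, (size_t)result_max * sizeof(int));
		r = result_max;			/* placeholder: number of results */
		pthread_mutex_unlock(&m->scratch_mutex);

		return r;
	}

A caller initializes the mutex once when the object is created (mirroring the mutex_init() the patch adds to osdmap_decode()) and can then invoke the compute path from multiple threads; as the commit message notes, no real contention is expected when callers are already serialized further up the stack.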

Comments

Sage Weil Feb. 3, 2014, 4:27 p.m. UTC | #1
On Mon, 3 Feb 2014, Ilya Dryomov wrote:
> With the addition of erasure coding support in the future, scratch
> variable-length array in crush_do_rule_ary() is going to grow to at
> least 200 bytes on average, on top of another 128 bytes consumed by
> rawosd/osd arrays in the call chain.  Replace it with a buffer inside
> struct osdmap and a mutex.  This shouldn't result in any contention,
> because all osd requests were already serialized by request_mutex at
> that point; the only unlocked caller was ceph_ioctl_get_dataloc().
> 
> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
> ---
>  include/linux/ceph/osdmap.h |    3 +++
>  net/ceph/osdmap.c           |   25 ++++++++++++++++---------
>  2 files changed, 19 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
> index 49ff69f0746b..8c8b3cefc28b 100644
> --- a/include/linux/ceph/osdmap.h
> +++ b/include/linux/ceph/osdmap.h
> @@ -84,6 +84,9 @@ struct ceph_osdmap {
>  	/* the CRUSH map specifies the mapping of placement groups to
>  	 * the list of osds that store+replicate them. */
>  	struct crush_map *crush;
> +
> +	struct mutex crush_scratch_mutex;
> +	int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];

There are no users for CEPH_PG_MAX_SIZE (16) in the userland code.  I 
would stick in a BUG_ON or WARN_ON just to make sure we don't get an 
OSDMap with more OSDs than that.

>  };
>  
>  static inline void ceph_oid_set_name(struct ceph_object_id *oid,
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index aade4a5c1c07..9d1aaa24def6 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -698,7 +698,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
>  	map = kzalloc(sizeof(*map), GFP_NOFS);
>  	if (map == NULL)
>  		return ERR_PTR(-ENOMEM);
> +
>  	map->pg_temp = RB_ROOT;
> +	mutex_init(&map->crush_scratch_mutex);
>  
>  	ceph_decode_16_safe(p, end, version, bad);
>  	if (version > 6) {
> @@ -1142,14 +1144,20 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
>  }
>  EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
>  
> -static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x,
> -			     int *result, int result_max,
> -			     const __u32 *weight, int weight_max)
> +static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
> +		    int *result, int result_max,
> +		    const __u32 *weight, int weight_max)
>  {
> -	int scratch[result_max * 3];
> +	int r;
> +
> +	BUG_ON(result_max > CEPH_PG_MAX_SIZE);
> +
> +	mutex_lock(&map->crush_scratch_mutex);
> +	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
> +			  weight, weight_max, map->crush_scratch_ary);
> +	mutex_unlock(&map->crush_scratch_mutex);
>  
> -	return crush_do_rule(map, ruleno, x, result, result_max,
> -			     weight, weight_max, scratch);
> +	return r;
>  }
>  
>  /*
> @@ -1205,9 +1213,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
>  				      pool->pgp_num_mask) +
>  			(unsigned)pgid.pool;
>  	}
> -	r = crush_do_rule_ary(osdmap->crush, ruleno, pps,
> -			      osds, min_t(int, pool->size, *num),
> -			      osdmap->osd_weight, osdmap->max_osd);
> +	r = do_crush(osdmap, ruleno, pps, osds, min_t(int, pool->size, *num),
> +		     osdmap->osd_weight, osdmap->max_osd);
>  	if (r < 0) {
>  		pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
>  		       " size %d\n", r, pgid.pool, pool->crush_ruleset,
> -- 
> 1.7.10.4
> 
Ilya Dryomov Feb. 3, 2014, 4:53 p.m. UTC | #2
On Mon, Feb 3, 2014 at 6:27 PM, Sage Weil <sage@inktank.com> wrote:
> On Mon, 3 Feb 2014, Ilya Dryomov wrote:
>> With the addition of erasure coding support in the future, scratch
>> variable-length array in crush_do_rule_ary() is going to grow to at
>> least 200 bytes on average, on top of another 128 bytes consumed by
>> rawosd/osd arrays in the call chain.  Replace it with a buffer inside
>> struct osdmap and a mutex.  This shouldn't result in any contention,
>> because all osd requests were already serialized by request_mutex at
>> that point; the only unlocked caller was ceph_ioctl_get_dataloc().
>>
>> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
>> ---
>>  include/linux/ceph/osdmap.h |    3 +++
>>  net/ceph/osdmap.c           |   25 ++++++++++++++++---------
>>  2 files changed, 19 insertions(+), 9 deletions(-)
>>
>> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
>> index 49ff69f0746b..8c8b3cefc28b 100644
>> --- a/include/linux/ceph/osdmap.h
>> +++ b/include/linux/ceph/osdmap.h
>> @@ -84,6 +84,9 @@ struct ceph_osdmap {
>>       /* the CRUSH map specifies the mapping of placement groups to
>>        * the list of osds that store+replicate them. */
>>       struct crush_map *crush;
>> +
>> +     struct mutex crush_scratch_mutex;
>> +     int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
>
> There are no users for CEPH_PG_MAX_SIZE (16) in the userland code.  I
> would stick in a BUG_ON or WARN_ON just to make sure we don't get an
> OSDMap with more OSDs than that.

That IMO should be a separate patch.  This patch doesn't change
existing semantics: CEPH_PG_MAX_SIZE is used throughout libceph to size
osd arrays.  I'm not sure what happens if we get an osdmap wider than
16, but this buffer isn't overflowed.  See the BUG_ON below.

>
>>  };
>>
>>  static inline void ceph_oid_set_name(struct ceph_object_id *oid,
>> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
>> index aade4a5c1c07..9d1aaa24def6 100644
>> --- a/net/ceph/osdmap.c
>> +++ b/net/ceph/osdmap.c
>> @@ -698,7 +698,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
>>       map = kzalloc(sizeof(*map), GFP_NOFS);
>>       if (map == NULL)
>>               return ERR_PTR(-ENOMEM);
>> +
>>       map->pg_temp = RB_ROOT;
>> +     mutex_init(&map->crush_scratch_mutex);
>>
>>       ceph_decode_16_safe(p, end, version, bad);
>>       if (version > 6) {
>> @@ -1142,14 +1144,20 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
>>  }
>>  EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
>>
>> -static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x,
>> -                          int *result, int result_max,
>> -                          const __u32 *weight, int weight_max)
>> +static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
>> +                 int *result, int result_max,
>> +                 const __u32 *weight, int weight_max)
>>  {
>> -     int scratch[result_max * 3];
>> +     int r;
>> +
>> +     BUG_ON(result_max > CEPH_PG_MAX_SIZE);

This BUG_ON.

Thanks,

                Ilya
Sage Weil Feb. 3, 2014, 5 p.m. UTC | #3
On Mon, 3 Feb 2014, Ilya Dryomov wrote:
> On Mon, Feb 3, 2014 at 6:27 PM, Sage Weil <sage@inktank.com> wrote:
> > On Mon, 3 Feb 2014, Ilya Dryomov wrote:
> >> With the addition of erasure coding support in the future, scratch
> >> variable-length array in crush_do_rule_ary() is going to grow to at
> >> least 200 bytes on average, on top of another 128 bytes consumed by
> >> rawosd/osd arrays in the call chain.  Replace it with a buffer inside
> >> struct osdmap and a mutex.  This shouldn't result in any contention,
> >> because all osd requests were already serialized by request_mutex at
> >> that point; the only unlocked caller was ceph_ioctl_get_dataloc().
> >>
> >> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
> >> ---
> >>  include/linux/ceph/osdmap.h |    3 +++
> >>  net/ceph/osdmap.c           |   25 ++++++++++++++++---------
> >>  2 files changed, 19 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
> >> index 49ff69f0746b..8c8b3cefc28b 100644
> >> --- a/include/linux/ceph/osdmap.h
> >> +++ b/include/linux/ceph/osdmap.h
> >> @@ -84,6 +84,9 @@ struct ceph_osdmap {
> >>       /* the CRUSH map specifies the mapping of placement groups to
> >>        * the list of osds that store+replicate them. */
> >>       struct crush_map *crush;
> >> +
> >> +     struct mutex crush_scratch_mutex;
> >> +     int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
> >
> > There are no users for CEPH_PG_MAX_SIZE (16) in the userland code.  I
> > would stick in a BUG_ON or WARN_ON just to make sure we don't get an
> > OSDMap with more OSDs than that.
> 
> That IMO should be a separate patch.  This patch doesn't change
> existing semantics: CEPH_PG_MAX_SIZE is used throughout libceph to size
> osd arrays.  I'm not sure what happens if we get an osdmap wider than
> 16, but this buffer isn't overflowed.  See the BUG_ON below.
> 
> >
> >>  };
> >>
> >>  static inline void ceph_oid_set_name(struct ceph_object_id *oid,
> >> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> >> index aade4a5c1c07..9d1aaa24def6 100644
> >> --- a/net/ceph/osdmap.c
> >> +++ b/net/ceph/osdmap.c
> >> @@ -698,7 +698,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
> >>       map = kzalloc(sizeof(*map), GFP_NOFS);
> >>       if (map == NULL)
> >>               return ERR_PTR(-ENOMEM);
> >> +
> >>       map->pg_temp = RB_ROOT;
> >> +     mutex_init(&map->crush_scratch_mutex);
> >>
> >>       ceph_decode_16_safe(p, end, version, bad);
> >>       if (version > 6) {
> >> @@ -1142,14 +1144,20 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
> >>  }
> >>  EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
> >>
> >> -static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x,
> >> -                          int *result, int result_max,
> >> -                          const __u32 *weight, int weight_max)
> >> +static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
> >> +                 int *result, int result_max,
> >> +                 const __u32 *weight, int weight_max)
> >>  {
> >> -     int scratch[result_max * 3];
> >> +     int r;
> >> +
> >> +     BUG_ON(result_max > CEPH_PG_MAX_SIZE);
> 
> This BUG_ON.

Ah, I think we're okay then.

Thanks!
sage
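
For comparison with the BUG_ON the thread settles on, a rough sketch of the warn-and-clamp alternative Sage floated, again in userspace C with hypothetical names; this is purely illustrative and not part of the posted patch:

	#include <stdio.h>

	#define MAX_RESULTS 16	/* stand-in for CEPH_PG_MAX_SIZE */

	/*
	 * Warn-and-clamp variant of the bound check: an over-wide request is
	 * truncated to what the fixed scratch buffer can hold and logged,
	 * instead of bringing the caller down the way a hard BUG_ON would.
	 */
	static int clamp_result_max(int result_max)
	{
		if (result_max > MAX_RESULTS) {
			fprintf(stderr,
				"warning: result_max %d exceeds %d, clamping\n",
				result_max, MAX_RESULTS);
			return MAX_RESULTS;
		}
		return result_max;
	}

The posted patch keeps the stricter BUG_ON, on the reasoning above that CEPH_PG_MAX_SIZE already bounds the osd arrays throughout libceph.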

Patch

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 49ff69f0746b..8c8b3cefc28b 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -84,6 +84,9 @@  struct ceph_osdmap {
 	/* the CRUSH map specifies the mapping of placement groups to
 	 * the list of osds that store+replicate them. */
 	struct crush_map *crush;
+
+	struct mutex crush_scratch_mutex;
+	int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
 };
 
 static inline void ceph_oid_set_name(struct ceph_object_id *oid,
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index aade4a5c1c07..9d1aaa24def6 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -698,7 +698,9 @@  struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	map = kzalloc(sizeof(*map), GFP_NOFS);
 	if (map == NULL)
 		return ERR_PTR(-ENOMEM);
+
 	map->pg_temp = RB_ROOT;
+	mutex_init(&map->crush_scratch_mutex);
 
 	ceph_decode_16_safe(p, end, version, bad);
 	if (version > 6) {
@@ -1142,14 +1144,20 @@  int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
 }
 EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
 
-static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x,
-			     int *result, int result_max,
-			     const __u32 *weight, int weight_max)
+static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
+		    int *result, int result_max,
+		    const __u32 *weight, int weight_max)
 {
-	int scratch[result_max * 3];
+	int r;
+
+	BUG_ON(result_max > CEPH_PG_MAX_SIZE);
+
+	mutex_lock(&map->crush_scratch_mutex);
+	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
+			  weight, weight_max, map->crush_scratch_ary);
+	mutex_unlock(&map->crush_scratch_mutex);
 
-	return crush_do_rule(map, ruleno, x, result, result_max,
-			     weight, weight_max, scratch);
+	return r;
 }
 
 /*
@@ -1205,9 +1213,8 @@  static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 				      pool->pgp_num_mask) +
 			(unsigned)pgid.pool;
 	}
-	r = crush_do_rule_ary(osdmap->crush, ruleno, pps,
-			      osds, min_t(int, pool->size, *num),
-			      osdmap->osd_weight, osdmap->max_osd);
+	r = do_crush(osdmap, ruleno, pps, osds, min_t(int, pool->size, *num),
+		     osdmap->osd_weight, osdmap->max_osd);
 	if (r < 0) {
 		pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
 		       " size %d\n", r, pgid.pool, pool->crush_ruleset,