diff mbox

[2/6] libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op

Message ID 1393008946-7931-3-git-send-email-ilya.dryomov@inktank.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ilya Dryomov Feb. 21, 2014, 6:55 p.m. UTC
This is primarily for rbd's benefit and is supposed to combat
fragmentation:

"... knowing that rbd images have a 4m size, librbd can pass a hint
that will let the osd do the xfs allocation size ioctl on new files so
that they are allocated in 1m or 4m chunks.  We've seen cases where
users with rbd workloads have very high levels of fragmentation in xfs
and this would mitigate that and probably have a pretty nice
performance benefit."

SETALLOCHINT is considered advisory, so our backwards compatibility
mechanism here is to set FAILOK flag for all SETALLOCHINT ops.

Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
---
 include/linux/ceph/osd_client.h |    9 +++++++++
 include/linux/ceph/rados.h      |    8 ++++++++
 net/ceph/osd_client.c           |   30 ++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+)

Comments

Sage Weil Feb. 23, 2014, 4:03 p.m. UTC | #1
On Fri, 21 Feb 2014, Ilya Dryomov wrote:
> This is primarily for rbd's benefit and is supposed to combat
> fragmentation:
> 
> "... knowing that rbd images have a 4m size, librbd can pass a hint
> that will let the osd do the xfs allocation size ioctl on new files so
> that they are allocated in 1m or 4m chunks.  We've seen cases where
> users with rbd workloads have very high levels of fragmentation in xfs
> and this would mitigate that and probably have a pretty nice
> performance benefit."
> 
> SETALLOCHINT is considered advisory, so our backwards compatibility
> mechanism here is to set FAILOK flag for all SETALLOCHINT ops.
> 
> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
> ---
>  include/linux/ceph/osd_client.h |    9 +++++++++
>  include/linux/ceph/rados.h      |    8 ++++++++
>  net/ceph/osd_client.c           |   30 ++++++++++++++++++++++++++++++
>  3 files changed, 47 insertions(+)
> 
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index e94f5da251d6..6bfcb0eca8ab 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -103,6 +103,11 @@ struct ceph_osd_req_op {
>  			u32 timeout;
>  			__u8 flag;
>  		} watch;
> +		struct {
> +			u64 expected_size;
> +			u64 expected_write_size;
> +			__u8 expected_size_probability;
> +		} hint;

s/hint/alloc_hint/ ?

>  	};
>  };
>  
> @@ -294,6 +299,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
>  extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
>  					unsigned int which, u16 opcode,
>  					u64 cookie, u64 version, int flag);
> +extern void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
> +				 unsigned int which, u16 opcode,
> +				 u64 expected_size, u64 expected_write_size,
> +				 u8 expected_size_probability);
>  
>  extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>  					       struct ceph_snap_context *snapc,
> diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> index 8f9bf4570215..b8e2dd11f186 100644
> --- a/include/linux/ceph/rados.h
> +++ b/include/linux/ceph/rados.h
> @@ -227,6 +227,9 @@ enum {
>  	CEPH_OSD_OP_OMAPRMKEYS    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
>  	CEPH_OSD_OP_OMAP_CMP      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
>  
> +	/* hints */
> +	CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
> +
>  	/** multi **/
>  	CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
>  	CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
> @@ -416,6 +419,11 @@ struct ceph_osd_op {
>  			__le64 offset, length;
>  			__le64 src_offset;
>  		} __attribute__ ((packed)) clonerange;
> +		struct {
> +			__le64 expected_size;
> +			__le64 expected_write_size;
> +			__u8 expected_size_probability;
> +		} __attribute__ ((packed)) hint;

s/hint/alloc_hint/, I think.  Just made the same comment on the user space 
side.

>  	};
>  	__le32 payload_len;
>  } __attribute__ ((packed));
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 5d7fd0b8c1c8..4090f6e8db3a 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
>  	case CEPH_OSD_OP_OMAPCLEAR:
>  	case CEPH_OSD_OP_OMAPRMKEYS:
>  	case CEPH_OSD_OP_OMAP_CMP:
> +	case CEPH_OSD_OP_SETALLOCHINT:
>  	case CEPH_OSD_OP_CLONERANGE:
>  	case CEPH_OSD_OP_ASSERT_SRC_VERSION:
>  	case CEPH_OSD_OP_SRC_CMPXATTR:
> @@ -591,6 +592,28 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
>  }
>  EXPORT_SYMBOL(osd_req_op_watch_init);
>  
> +void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
> +			  unsigned int which, u16 opcode,
> +			  u64 expected_size, u64 expected_write_size,
> +			  u8 expected_size_probability)
> +{
> +	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
> +
> +	BUG_ON(opcode != CEPH_OSD_OP_SETALLOCHINT);

I would just drop the opcode argument all together.  And 
s/hint/alloc_hint/ in the function name...  I wouldn't expect that any 
other type of hint would have these same arguments.

> +
> +	op->hint.expected_size = expected_size;
> +	op->hint.expected_write_size = expected_write_size;
> +	op->hint.expected_size_probability = expected_size_probability;
> +
> +	/*
> +	 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
> +	 * not worth a feature bit.  Set FAILOK per-op flag to make
> +	 * sure older osds don't trip over an unsupported opcode.
> +	 */
> +	op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
> +}
> +EXPORT_SYMBOL(osd_req_op_hint_init);
> +
>  static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
>  				struct ceph_osd_data *osd_data)
>  {
> @@ -681,6 +704,13 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
>  		dst->watch.ver = cpu_to_le64(src->watch.ver);
>  		dst->watch.flag = src->watch.flag;
>  		break;
> +	case CEPH_OSD_OP_SETALLOCHINT:
> +		dst->hint.expected_size = cpu_to_le64(src->hint.expected_size);
> +		dst->hint.expected_write_size =
> +		    cpu_to_le64(src->hint.expected_write_size);
> +		dst->hint.expected_size_probability =
> +		    src->hint.expected_size_probability;
> +		break;
>  	default:
>  		pr_err("unsupported osd opcode %s\n",
>  			ceph_osd_op_name(src->op));
> -- 
> 1.7.10.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Elder Feb. 24, 2014, 2:59 p.m. UTC | #2
On 02/21/2014 12:55 PM, Ilya Dryomov wrote:
> This is primarily for rbd's benefit and is supposed to combat
> fragmentation:
> 
> "... knowing that rbd images have a 4m size, librbd can pass a hint
> that will let the osd do the xfs allocation size ioctl on new files so
> that they are allocated in 1m or 4m chunks.  We've seen cases where
> users with rbd workloads have very high levels of fragmentation in xfs
> and this would mitigate that and probably have a pretty nice
> performance benefit."
> 
> SETALLOCHINT is considered advisory, so our backwards compatibility
> mechanism here is to set FAILOK flag for all SETALLOCHINT ops.
> 
> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>

I have a few small comments for you to consider but this
looks good to me.

Reviewed-by: Alex Elder <elder@linaro.org>

> ---
>  include/linux/ceph/osd_client.h |    9 +++++++++
>  include/linux/ceph/rados.h      |    8 ++++++++
>  net/ceph/osd_client.c           |   30 ++++++++++++++++++++++++++++++
>  3 files changed, 47 insertions(+)
> 
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index e94f5da251d6..6bfcb0eca8ab 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -103,6 +103,11 @@ struct ceph_osd_req_op {
>  			u32 timeout;
>  			__u8 flag;
>  		} watch;
> +		struct {
> +			u64 expected_size;

Maybe "expected_object_size"?  I wasn't sure until I read some
of the e-mail discussion what size this was referring to, and
wondered whether it was for reads or something.  This should
be done on the over-the-wire and server side structure
definitions too (if it's done at all).

The other thing is that the expected size is limited by
rbd_image_header->obj_order, which is a single byte.  I
think you should encode this the same way.  Even if the
hint were for more than RBD, this level of granularity
may be sufficient.

> +			u64 expected_write_size;

Probably the same thing here, a byte might be enough
to encode this by using log2(expected_write_size).

> +			__u8 expected_size_probability;

I'm not sure why these single-byte values use __u8 while the
multi-byte values use (e.g.) u32.  The leading underscores
are supposed to be used for values exposed to user space (or
something like that).  Anyway, not a problem with your change,
but something maybe you could check into.

> +		} hint;
>  	};
>  };
>  
> @@ -294,6 +299,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
>  extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
>  					unsigned int which, u16 opcode,
>  					u64 cookie, u64 version, int flag);
> +extern void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
> +				 unsigned int which, u16 opcode,
> +				 u64 expected_size, u64 expected_write_size,
> +				 u8 expected_size_probability);
>  
>  extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>  					       struct ceph_snap_context *snapc,
> diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> index 8f9bf4570215..b8e2dd11f186 100644
> --- a/include/linux/ceph/rados.h
> +++ b/include/linux/ceph/rados.h
> @@ -227,6 +227,9 @@ enum {
>  	CEPH_OSD_OP_OMAPRMKEYS    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
>  	CEPH_OSD_OP_OMAP_CMP      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
>  
> +	/* hints */
> +	CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
> +
>  	/** multi **/
>  	CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
>  	CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
> @@ -416,6 +419,11 @@ struct ceph_osd_op {
>  			__le64 offset, length;
>  			__le64 src_offset;
>  		} __attribute__ ((packed)) clonerange;
> +		struct {
> +			__le64 expected_size;
> +			__le64 expected_write_size;
> +			__u8 expected_size_probability;
> +		} __attribute__ ((packed)) hint;
>  	};
>  	__le32 payload_len;
>  } __attribute__ ((packed));
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 5d7fd0b8c1c8..4090f6e8db3a 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
>  	case CEPH_OSD_OP_OMAPCLEAR:
>  	case CEPH_OSD_OP_OMAPRMKEYS:
>  	case CEPH_OSD_OP_OMAP_CMP:
> +	case CEPH_OSD_OP_SETALLOCHINT:
>  	case CEPH_OSD_OP_CLONERANGE:
>  	case CEPH_OSD_OP_ASSERT_SRC_VERSION:
>  	case CEPH_OSD_OP_SRC_CMPXATTR:
> @@ -591,6 +592,28 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
>  }
>  EXPORT_SYMBOL(osd_req_op_watch_init);
>  
> +void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
> +			  unsigned int which, u16 opcode,
> +			  u64 expected_size, u64 expected_write_size,
> +			  u8 expected_size_probability)
> +{
> +	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
> +
> +	BUG_ON(opcode != CEPH_OSD_OP_SETALLOCHINT);
> +
> +	op->hint.expected_size = expected_size;
> +	op->hint.expected_write_size = expected_write_size;
> +	op->hint.expected_size_probability = expected_size_probability;
> +
> +	/*
> +	 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
> +	 * not worth a feature bit.  Set FAILOK per-op flag to make
> +	 * sure older osds don't trip over an unsupported opcode.
> +	 */
> +	op->flags |= CEPH_OSD_OP_FLAG_FAILOK;

I think this is reasonable.  The only thing I wonder about is
whether there could be any other failure that could occur on
an OSD server that actually supports the op that we would care
about.  It's still advisory though, so functionally it won't
matter.

> +}
> +EXPORT_SYMBOL(osd_req_op_hint_init);
> +
>  static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
>  				struct ceph_osd_data *osd_data)
>  {
> @@ -681,6 +704,13 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
>  		dst->watch.ver = cpu_to_le64(src->watch.ver);
>  		dst->watch.flag = src->watch.flag;
>  		break;
> +	case CEPH_OSD_OP_SETALLOCHINT:
> +		dst->hint.expected_size = cpu_to_le64(src->hint.expected_size);
> +		dst->hint.expected_write_size =
> +		    cpu_to_le64(src->hint.expected_write_size);
> +		dst->hint.expected_size_probability =
> +		    src->hint.expected_size_probability;
> +		break;
>  	default:
>  		pr_err("unsupported osd opcode %s\n",
>  			ceph_osd_op_name(src->op));
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ilya Dryomov Feb. 25, 2014, 12:52 p.m. UTC | #3
On Mon, Feb 24, 2014 at 4:59 PM, Alex Elder <elder@ieee.org> wrote:
> On 02/21/2014 12:55 PM, Ilya Dryomov wrote:
>> This is primarily for rbd's benefit and is supposed to combat
>> fragmentation:
>>
>> "... knowing that rbd images have a 4m size, librbd can pass a hint
>> that will let the osd do the xfs allocation size ioctl on new files so
>> that they are allocated in 1m or 4m chunks.  We've seen cases where
>> users with rbd workloads have very high levels of fragmentation in xfs
>> and this would mitigate that and probably have a pretty nice
>> performance benefit."
>>
>> SETALLOCHINT is considered advisory, so our backwards compatibility
>> mechanism here is to set FAILOK flag for all SETALLOCHINT ops.
>>
>> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
>
> I have a few small comments for you to consider but this
> looks good to me.
>
> Reviewed-by: Alex Elder <elder@linaro.org>
>
>> ---
>>  include/linux/ceph/osd_client.h |    9 +++++++++
>>  include/linux/ceph/rados.h      |    8 ++++++++
>>  net/ceph/osd_client.c           |   30 ++++++++++++++++++++++++++++++
>>  3 files changed, 47 insertions(+)
>>
>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>> index e94f5da251d6..6bfcb0eca8ab 100644
>> --- a/include/linux/ceph/osd_client.h
>> +++ b/include/linux/ceph/osd_client.h
>> @@ -103,6 +103,11 @@ struct ceph_osd_req_op {
>>                       u32 timeout;
>>                       __u8 flag;
>>               } watch;
>> +             struct {
>> +                     u64 expected_size;
>
> Maybe "expected_object_size"?  I wasn't sure until I read some
> of the e-mail discussion what size this was referring to, and
> wondered whether it was for reads or something.  This should
> be done on the over-the-wire and server side structure
> definitions too (if it's done at all).

Done, on both sides.

>
> The other thing is that the expected size is limited by
> rbd_image_header->obj_order, which is a single byte.  I
> think you should encode this the same way.  Even if the
> hint were for more than RBD, this level of granularity
> may be sufficient.
>
>> +                     u64 expected_write_size;
>
> Probably the same thing here, a byte might be enough
> to encode this by using log2(expected_write_size).
>
>> +                     __u8 expected_size_probability;

I think in the interest of genericity expected_object_size should be an
arbitrary, not limited to a power-of-2 value.  Now, given the current
90M object size limit 64 bits might seem a lot, but extent offset and
length are 64 bit values and to be future-proof I followed that here.

expected_size_probability is currently unused.  It was supposed to be
a 0-100 value, indicating whether we expect the object to be smaller
than expected_object_size or not and how strong that expectation is.

I'm not sure if you've seen it, but this (both userspace and kernel
sides) were implemented according to the design laid out by Sage in the
"rados io hints" thread on ceph-devel about a month ago.  There wasn't
any discussion that I'm aware of in the interim, that is until the
recent "wip-hint" thread, which was triggered by my PR.

>
> I'm not sure why these single-byte values use __u8 while the
> multi-byte values use (e.g.) u32.  The leading underscores
> are supposed to be used for values exposed to user space (or
> something like that).  Anyway, not a problem with your change,
> but something maybe you could check into.

I'm not sure either.  I vaguely assumed it's related to the fact that
single-byte ceph_osd_req_op fields are directly assigned to the
corresponding ceph_osd_op fields which are exported to userspace,
whereas the multi-byte values go through the endianness conversion
macros, which change the type to __le*.

>
>> +             } hint;
>>       };
>>  };
>>
>> @@ -294,6 +299,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
>>  extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
>>                                       unsigned int which, u16 opcode,
>>                                       u64 cookie, u64 version, int flag);
>> +extern void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
>> +                              unsigned int which, u16 opcode,
>> +                              u64 expected_size, u64 expected_write_size,
>> +                              u8 expected_size_probability);
>>
>>  extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>>                                              struct ceph_snap_context *snapc,
>> diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
>> index 8f9bf4570215..b8e2dd11f186 100644
>> --- a/include/linux/ceph/rados.h
>> +++ b/include/linux/ceph/rados.h
>> @@ -227,6 +227,9 @@ enum {
>>       CEPH_OSD_OP_OMAPRMKEYS    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
>>       CEPH_OSD_OP_OMAP_CMP      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
>>
>> +     /* hints */
>> +     CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
>> +
>>       /** multi **/
>>       CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
>>       CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
>> @@ -416,6 +419,11 @@ struct ceph_osd_op {
>>                       __le64 offset, length;
>>                       __le64 src_offset;
>>               } __attribute__ ((packed)) clonerange;
>> +             struct {
>> +                     __le64 expected_size;
>> +                     __le64 expected_write_size;
>> +                     __u8 expected_size_probability;
>> +             } __attribute__ ((packed)) hint;
>>       };
>>       __le32 payload_len;
>>  } __attribute__ ((packed));
>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> index 5d7fd0b8c1c8..4090f6e8db3a 100644
>> --- a/net/ceph/osd_client.c
>> +++ b/net/ceph/osd_client.c
>> @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
>>       case CEPH_OSD_OP_OMAPCLEAR:
>>       case CEPH_OSD_OP_OMAPRMKEYS:
>>       case CEPH_OSD_OP_OMAP_CMP:
>> +     case CEPH_OSD_OP_SETALLOCHINT:
>>       case CEPH_OSD_OP_CLONERANGE:
>>       case CEPH_OSD_OP_ASSERT_SRC_VERSION:
>>       case CEPH_OSD_OP_SRC_CMPXATTR:
>> @@ -591,6 +592,28 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
>>  }
>>  EXPORT_SYMBOL(osd_req_op_watch_init);
>>
>> +void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
>> +                       unsigned int which, u16 opcode,
>> +                       u64 expected_size, u64 expected_write_size,
>> +                       u8 expected_size_probability)
>> +{
>> +     struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
>> +
>> +     BUG_ON(opcode != CEPH_OSD_OP_SETALLOCHINT);
>> +
>> +     op->hint.expected_size = expected_size;
>> +     op->hint.expected_write_size = expected_write_size;
>> +     op->hint.expected_size_probability = expected_size_probability;
>> +
>> +     /*
>> +      * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
>> +      * not worth a feature bit.  Set FAILOK per-op flag to make
>> +      * sure older osds don't trip over an unsupported opcode.
>> +      */
>> +     op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
>
> I think this is reasonable.  The only thing I wonder about is
> whether there could be any other failure that could occur on
> an OSD server that actually supports the op that we would care
> about.  It's still advisory though, so functionally it won't
> matter.

Currently there isn't any such failure.  In fact, the only thing that
this FAILOK hides is the EOPNOTSUPP from an OSD that doesn't support
the new op.  Everything else is done on the backend, and there all
errors are explicitly ignored because this is an advisory op and it
would bring down the OSD otherwise.

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Elder Feb. 25, 2014, 1:05 p.m. UTC | #4
On 02/25/2014 06:52 AM, Ilya Dryomov wrote:
> On Mon, Feb 24, 2014 at 4:59 PM, Alex Elder <elder@ieee.org> wrote:
>> On 02/21/2014 12:55 PM, Ilya Dryomov wrote:
>>> This is primarily for rbd's benefit and is supposed to combat
>>> fragmentation:
>>>
>>> "... knowing that rbd images have a 4m size, librbd can pass a hint
>>> that will let the osd do the xfs allocation size ioctl on new files so
>>> that they are allocated in 1m or 4m chunks.  We've seen cases where
>>> users with rbd workloads have very high levels of fragmentation in xfs
>>> and this would mitigate that and probably have a pretty nice
>>> performance benefit."
>>>
>>> SETALLOCHINT is considered advisory, so our backwards compatibility
>>> mechanism here is to set FAILOK flag for all SETALLOCHINT ops.
>>>
>>> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
>>
>> I have a few small comments for you to consider but this
>> looks good to me.
>>
>> Reviewed-by: Alex Elder <elder@linaro.org>

. . .

>> The other thing is that the expected size is limited by
>> rbd_image_header->obj_order, which is a single byte.  I
>> think you should encode this the same way.  Even if the
>> hint were for more than RBD, this level of granularity
>> may be sufficient.
>>
>>> +                     u64 expected_write_size;
>>
>> Probably the same thing here, a byte might be enough
>> to encode this by using log2(expected_write_size).
>>
>>> +                     __u8 expected_size_probability;
> 
> I think in the interest of genericity expected_object_size should be an
> arbitrary, not limited to a power-of-2 value.  Now, given the current
> 90M object size limit 64 bits might seem a lot, but extent offset and
> length are 64 bit values and to be future-proof I followed that here.

I have no objection to the 64-bit size but I still think
a byte representing log2(size) is sufficient.  Power-of-2
granularity is most likely fine (and might even be what
such a hint gets converted to anyway) for file systems
or other backing store.  But again, you can do that with
a 64 bit value as well.

> expected_size_probability is currently unused.  It was supposed to be
> a 0-100 value, indicating whether we expect the object to be smaller
> than expected_object_size or not and how strong that expectation is.
> 
> I'm not sure if you've seen it, but this (both userspace and kernel
> sides) were implemented according to the design laid out by Sage in the
> "rados io hints" thread on ceph-devel about a month ago.  There wasn't
> any discussion that I'm aware of in the interim, that is until the
> recent "wip-hint" thread, which was triggered by my PR.

I'm aware of it---I saw the discussion go by and skimmed
through it but did not look at it very closely.  I can't
keep up with all the traffic but I do try to pay attention
to code for review.

>> I'm not sure why these single-byte values use __u8 while the
>> multi-byte values use (e.g.) u32.  The leading underscores
>> are supposed to be used for values exposed to user space (or
>> something like that).  Anyway, not a problem with your change,
>> but something maybe you could check into.
> 
> I'm not sure either.  I vaguely assumed it's related to the fact that
> single-byte ceph_osd_req_op fields are directly assigned to the
> corresponding ceph_osd_op fields which are exported to userspace,
> whereas the multi-byte values go through the endianness conversion
> macros, which change the type to __le*.

Not a big deal regardless.

>>> +             } hint;
>>>       };
>>>  };

. . .

>>> +     /*
>>> +      * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
>>> +      * not worth a feature bit.  Set FAILOK per-op flag to make
>>> +      * sure older osds don't trip over an unsupported opcode.
>>> +      */
>>> +     op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
>>
>> I think this is reasonable.  The only thing I wonder about is
>> whether there could be any other failure that could occur on
>> an OSD server that actually supports the op that we would care
>> about.  It's still advisory though, so functionally it won't
>> matter.
> 
> Currently there isn't any such failure.  In fact, the only thing that
> this FAILOK hides is the EOPNOTSUPP from an OSD that doesn't support
> the new op.  Everything else is done on the backend, and there all
> errors are explicitly ignored because this is an advisory op and it
> would bring down the OSD otherwise.

Sounds good.  Thanks for the explanations.

					-Alex

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ilya Dryomov Feb. 25, 2014, 1:38 p.m. UTC | #5
On Tue, Feb 25, 2014 at 3:05 PM, Alex Elder <elder@ieee.org> wrote:
>>> The other thing is that the expected size is limited by
>>> rbd_image_header->obj_order, which is a single byte.  I
>>> think you should encode this the same way.  Even if the
>>> hint were for more than RBD, this level of granularity
>>> may be sufficient.
>>>
>>>> +                     u64 expected_write_size;
>>>
>>> Probably the same thing here, a byte might be enough
>>> to encode this by using log2(expected_write_size).
>>>
>>>> +                     __u8 expected_size_probability;
>>
>> I think in the interest of genericity expected_object_size should be an
>> arbitrary, not limited to a power-of-2 value.  Now, given the current
>> 90M object size limit 64 bits might seem a lot, but extent offset and
>> length are 64 bit values and to be future-proof I followed that here.
>
> I have no objection to the 64-bit size but I still think
> a byte representing log2(size) is sufficient.  Power-of-2
> granularity is most likely fine (and might even be what
> such a hint gets converted to anyway) for file systems
> or other backing store.  But again, you can do that with
> a 64 bit value as well.

Filesystems of course round it, but probably not to a power-of-2.
I guess it's adjusted to a multiple of block size and then capped with
some value that the allocator can cope with.  xfs for example would
happily take on say 5M.  power-of-2 is sufficient for rbd, but
_probably_ not in general.

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sage Weil Feb. 25, 2014, 5:12 p.m. UTC | #6
On Tue, 25 Feb 2014, Ilya Dryomov wrote:
> >> +                     u64 expected_write_size;
> >
> > Probably the same thing here, a byte might be enough
> > to encode this by using log2(expected_write_size).
> >
> >> +                     __u8 expected_size_probability;
> 
> I think in the interest of genericity expected_object_size should be an
> arbitrary, not limited to a power-of-2 value.  Now, given the current
> 90M object size limit 64 bits might seem a lot, but extent offset and
> length are 64 bit values and to be future-proof I followed that here.
> 
> expected_size_probability is currently unused.  It was supposed to be
> a 0-100 value, indicating whether we expect the object to be smaller
> than expected_object_size or not and how strong that expectation is.
> 
> I'm not sure if you've seen it, but this (both userspace and kernel
> sides) were implemented according to the design laid out by Sage in the
> "rados io hints" thread on ceph-devel about a month ago.  There wasn't
> any discussion that I'm aware of in the interim, that is until the
> recent "wip-hint" thread, which was triggered by my PR.
> 
> >
> > I'm not sure why these single-byte values use __u8 while the
> > multi-byte values use (e.g.) u32.  The leading underscores
> > are supposed to be used for values exposed to user space (or
> > something like that).  Anyway, not a problem with your change,
> > but something maybe you could check into.
> 
> I'm not sure either.  I vaguely assumed it's related to the fact that
> single-byte ceph_osd_req_op fields are directly assigned to the
> corresponding ceph_osd_op fields which are exported to userspace,
> whereas the multi-byte values go through the endianness conversion
> macros, which change the type to __le*.

Oh, good catch.  Yeah, these need to be __le64 so that on the kernel side 
sparse knows what is going on and so on the userspace side the magic __le* 
classes do the endian conversion.

sage
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sage Weil Feb. 25, 2014, 5:12 p.m. UTC | #7
On Tue, 25 Feb 2014, Ilya Dryomov wrote:
> On Tue, Feb 25, 2014 at 3:05 PM, Alex Elder <elder@ieee.org> wrote:
> >>> The other thing is that the expected size is limited by
> >>> rbd_image_header->obj_order, which is a single byte.  I
> >>> think you should encode this the same way.  Even if the
> >>> hint were for more than RBD, this level of granularity
> >>> may be sufficient.
> >>>
> >>>> +                     u64 expected_write_size;
> >>>
> >>> Probably the same thing here, a byte might be enough
> >>> to encode this by using log2(expected_write_size).
> >>>
> >>>> +                     __u8 expected_size_probability;
> >>
> >> I think in the interest of genericity expected_object_size should be an
> >> arbitrary, not limited to a power-of-2 value.  Now, given the current
> >> 90M object size limit 64 bits might seem a lot, but extent offset and
> >> length are 64 bit values and to be future-proof I followed that here.
> >
> > I have no objection to the 64-bit size but I still think
> > a byte representing log2(size) is sufficient.  Power-of-2
> > granularity is most likely fine (and might even be what
> > such a hint gets converted to anyway) for file systems
> > or other backing store.  But again, you can do that with
> > a 64 bit value as well.
> 
> Filesystems of course round it, but probably not to a power-of-2.
> I guess it's adjusted to a multiple of block size and then capped with
> some value that the allocator can cope with.  xfs for example would
> happily take on say 5M.  power-of-2 is sufficient for rbd, but
> _probably_ not in general.

I agree; let's stick with u64 here.

Thanks!
sage
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index e94f5da251d6..6bfcb0eca8ab 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -103,6 +103,11 @@  struct ceph_osd_req_op {
 			u32 timeout;
 			__u8 flag;
 		} watch;
+		struct {
+			u64 expected_size;
+			u64 expected_write_size;
+			__u8 expected_size_probability;
+		} hint;
 	};
 };
 
@@ -294,6 +299,10 @@  extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
 extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
 					unsigned int which, u16 opcode,
 					u64 cookie, u64 version, int flag);
+extern void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
+				 unsigned int which, u16 opcode,
+				 u64 expected_size, u64 expected_write_size,
+				 u8 expected_size_probability);
 
 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 8f9bf4570215..b8e2dd11f186 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -227,6 +227,9 @@  enum {
 	CEPH_OSD_OP_OMAPRMKEYS    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
 	CEPH_OSD_OP_OMAP_CMP      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
 
+	/* hints */
+	CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
+
 	/** multi **/
 	CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
 	CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
@@ -416,6 +419,11 @@  struct ceph_osd_op {
 			__le64 offset, length;
 			__le64 src_offset;
 		} __attribute__ ((packed)) clonerange;
+		struct {
+			__le64 expected_size;
+			__le64 expected_write_size;
+			__u8 expected_size_probability;
+		} __attribute__ ((packed)) hint;
 	};
 	__le32 payload_len;
 } __attribute__ ((packed));
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5d7fd0b8c1c8..4090f6e8db3a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -436,6 +436,7 @@  static bool osd_req_opcode_valid(u16 opcode)
 	case CEPH_OSD_OP_OMAPCLEAR:
 	case CEPH_OSD_OP_OMAPRMKEYS:
 	case CEPH_OSD_OP_OMAP_CMP:
+	case CEPH_OSD_OP_SETALLOCHINT:
 	case CEPH_OSD_OP_CLONERANGE:
 	case CEPH_OSD_OP_ASSERT_SRC_VERSION:
 	case CEPH_OSD_OP_SRC_CMPXATTR:
@@ -591,6 +592,28 @@  void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
 }
 EXPORT_SYMBOL(osd_req_op_watch_init);
 
+void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
+			  unsigned int which, u16 opcode,
+			  u64 expected_size, u64 expected_write_size,
+			  u8 expected_size_probability)
+{
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+
+	BUG_ON(opcode != CEPH_OSD_OP_SETALLOCHINT);
+
+	op->hint.expected_size = expected_size;
+	op->hint.expected_write_size = expected_write_size;
+	op->hint.expected_size_probability = expected_size_probability;
+
+	/*
+	 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
+	 * not worth a feature bit.  Set FAILOK per-op flag to make
+	 * sure older osds don't trip over an unsupported opcode.
+	 */
+	op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
+}
+EXPORT_SYMBOL(osd_req_op_hint_init);
+
 static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
 				struct ceph_osd_data *osd_data)
 {
@@ -681,6 +704,13 @@  static u64 osd_req_encode_op(struct ceph_osd_request *req,
 		dst->watch.ver = cpu_to_le64(src->watch.ver);
 		dst->watch.flag = src->watch.flag;
 		break;
+	case CEPH_OSD_OP_SETALLOCHINT:
+		dst->hint.expected_size = cpu_to_le64(src->hint.expected_size);
+		dst->hint.expected_write_size =
+		    cpu_to_le64(src->hint.expected_write_size);
+		dst->hint.expected_size_probability =
+		    src->hint.expected_size_probability;
+		break;
 	default:
 		pr_err("unsupported osd opcode %s\n",
 			ceph_osd_op_name(src->op));