Message ID | 1438161835-27960-1-git-send-email-mchristi@redhat.com (mailing list archive)
---|---
State | New, archived
On 07/29/2015 04:23 AM, mchristi@redhat.com wrote:
> From: Mike Christie <michaelc@cs.wisc.edu>
>
> LIO uses scatterlist for its page/data management. This patch
> adds a scatterlist messenger data type, so LIO can pass its sg
> down directly to rbd.
>
> Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>

I'm not going to be able to review all of these, and this
isn't even a complete review. But it's something...

You're clearly on the right track, but I want to provide a
meaningful review for correctness and design, so I'm looking
for a bit more information.

> ---
>  include/linux/ceph/messenger.h  | 13 ++++++
>  include/linux/ceph/osd_client.h | 12 +++++-
>  net/ceph/messenger.c            | 96 +++++++++++++++++++++++++++++++++++++++++
>  net/ceph/osd_client.c           | 26 +++++++++++
>  4 files changed, 146 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 3775327..bc1bde8 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -79,6 +79,7 @@ enum ceph_msg_data_type {
>  #ifdef CONFIG_BLOCK
>  	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
>  #endif /* CONFIG_BLOCK */
> +	CEPH_MSG_DATA_SG,	/* data source/destination is a scatterlist */
>  };
>
>  static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
> @@ -90,6 +91,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
>  #ifdef CONFIG_BLOCK
>  	case CEPH_MSG_DATA_BIO:
>  #endif /* CONFIG_BLOCK */
> +	case CEPH_MSG_DATA_SG:
>  		return true;
>  	default:
>  		return false;
> @@ -112,6 +114,11 @@ struct ceph_msg_data {
>  			unsigned int	alignment;	/* first page */
>  		};
>  		struct ceph_pagelist	*pagelist;
> +		struct {
> +			struct scatterlist *sgl;
> +			unsigned int	sgl_init_offset;
> +			u64		sgl_length;
> +		};

Can you supply a short explanation of what these fields represent?
It seems sgl_init_offset is the offset of the starting byte in the
sgl, but is its purpose page offset calculation, or does it
represent an offset into the total length of the sgl? Or, put
another way, does sgl_init_offset represent some portion of
sgl_length that has already been consumed (so the initial residual
length is sgl_length - sgl_init_offset)?

>  	};
>  };
>
> @@ -139,6 +146,10 @@ struct ceph_msg_data_cursor {
>  			struct page	*page;		/* page from list */
>  			size_t		offset;		/* bytes from list */
>  		};
> +		struct {
> +			struct scatterlist *sg;		/* curr sg */

	/* current sg */

> +			unsigned int	sg_consumed;

Here too, what does sg_consumed represent with respect to the
initial offset and the length?

I guess I'm going to stop with that. It'll be a lot easier for me
to review this if I'm sure I understand what these represent.

Thanks.

					-Alex

> +		};
>  	};
>  };
> . . .
On 07/29/2015 08:34 AM, Alex Elder wrote:
> On 07/29/2015 04:23 AM, mchristi@redhat.com wrote:
>> From: Mike Christie <michaelc@cs.wisc.edu>
>>
>> LIO uses scatterlist for its page/data management. This patch
>> adds a scatterlist messenger data type, so LIO can pass its sg
>> down directly to rbd.
>>
>> Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
>
> I'm not going to be able to review all of these, and this
> isn't even a complete review. But it's something...

No problem. Thanks for any comments.

>
> You're clearly on the right track, but I want to provide a
> meaningful review for correctness and design, so I'm looking
> for a bit more information.
>
>> ---
>>  include/linux/ceph/messenger.h  | 13 ++++++
>>  include/linux/ceph/osd_client.h | 12 +++++-
>>  net/ceph/messenger.c            | 96 +++++++++++++++++++++++++++++++++++++++++
>>  net/ceph/osd_client.c           | 26 +++++++++++
>>  4 files changed, 146 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
>> index 3775327..bc1bde8 100644
>> --- a/include/linux/ceph/messenger.h
>> +++ b/include/linux/ceph/messenger.h
>> @@ -79,6 +79,7 @@ enum ceph_msg_data_type {
>>  #ifdef CONFIG_BLOCK
>>  	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
>>  #endif /* CONFIG_BLOCK */
>> +	CEPH_MSG_DATA_SG,	/* data source/destination is a scatterlist */
>>  };
>>
>>  static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
>> @@ -90,6 +91,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
>>  #ifdef CONFIG_BLOCK
>>  	case CEPH_MSG_DATA_BIO:
>>  #endif /* CONFIG_BLOCK */
>> +	case CEPH_MSG_DATA_SG:
>>  		return true;
>>  	default:
>>  		return false;
>> @@ -112,6 +114,11 @@ struct ceph_msg_data {
>>  			unsigned int	alignment;	/* first page */
>>  		};
>>  		struct ceph_pagelist	*pagelist;
>> +		struct {
>> +			struct scatterlist *sgl;
>> +			unsigned int	sgl_init_offset;
>> +			u64		sgl_length;
>> +		};
>
> Can you supply a short explanation of what these fields represent?
> It seems sgl_init_offset is the offset of the starting byte in the
> sgl, but is its purpose page offset calculation, or does it
> represent an offset into the total length of the sgl? Or, put
> another way, does sgl_init_offset represent some portion of
> sgl_length that has already been consumed (so the initial residual
> length is sgl_length - sgl_init_offset)?

sgl - starting scatterlist entry we are going to send/receive to/from.

sgl_init_offset - byte offset in the sgl above that we will start
executing from. It is for cases where a LIO command crossed
segment/object boundaries, so we had to break it up, and the first obj
request ended up in the middle of a scatterlist entry. For the second
obj request we set the sgl to the sg we ended on in the first request,
and then set the sgl_init_offset to where we left off in the first
request.

So it basically allows me to avoid cloning the list the way the bio
code does. However, if we did clone it, then I could just manipulate
the cloned sg's sg->offset instead of adding the sgl_init_offset field.

sgl_length - number of bytes in the sgl we are going to send/receive.
This also is for the case where we broke up the LIO command into
multiple obj requests.
>
>>  	};
>>  };
>>
>> @@ -139,6 +146,10 @@ struct ceph_msg_data_cursor {
>>  			struct page	*page;		/* page from list */
>>  			size_t		offset;		/* bytes from list */
>>  		};
>> +		struct {
>> +			struct scatterlist *sg;		/* curr sg */
>
> 	/* current sg */
>
>> +			unsigned int	sg_consumed;
>
> Here too, what does sg_consumed represent with respect to the
> initial offset and the length?

It is the number of bytes in the sgl we have sent/received. It is used
by the messenger advance code to track if we need to advance to the
next sg or if we still have data left from the current one.
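To make those semantics concrete, here is a hypothetical sketch (not part of the patch) of how a LIO command backed by two 4K scatterlist entries, straddling an object boundary 6144 bytes in, would be handed to the messenger as two object requests. The sizes, the boundary, and the msg1/msg2 message pointers are invented for illustration; only ceph_msg_data_add_sg() comes from the patch itself.

#include <linux/scatterlist.h>
#include <linux/ceph/messenger.h>

/* Hypothetical illustration: sg[] has two 4096-byte entries (8K total)
 * and the object boundary falls 6144 bytes into the command. */
static void example_split_command(struct ceph_msg *msg1, struct ceph_msg *msg2,
				  struct scatterlist *sg)
{
	/* First object request: all of sg[0] plus the first 2048 bytes
	 * of sg[1]. sgl_init_offset is 0 because we start at the very
	 * beginning of the first entry. */
	ceph_msg_data_add_sg(msg1, &sg[0], 0, 6144);

	/* Second object request: resume 2048 bytes into sg[1], the entry
	 * the first request ended on. sgl points at that entry and
	 * sgl_init_offset records how far into it we already got, so the
	 * scatterlist never has to be cloned or edited. */
	ceph_msg_data_add_sg(msg2, &sg[1], 2048, 2048);
}

With that starting state, the cursor seeds sg_consumed from sgl_init_offset and only moves to the next entry once sg_consumed reaches the entry's length, which is exactly the bookkeeping described above.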
On Wed, Jul 29, 2015 at 04:23:38AM -0500, mchristi@redhat.com wrote:
> From: Mike Christie <michaelc@cs.wisc.edu>
>
> LIO uses scatterlist for its page/data management. This patch
> adds a scatterlist messenger data type, so LIO can pass its sg
> down directly to rbd.

Just as I mentioned for David's patches, this is the wrong way to
attack your problem. The block layer already supports WRITE SAME, and
COMPARE AND WRITE needs to be supported at that level too instead of
creating artificial bypasses.
On 07/29/2015 12:55 PM, Christoph Hellwig wrote:
> On Wed, Jul 29, 2015 at 04:23:38AM -0500, mchristi@redhat.com wrote:
>> From: Mike Christie <michaelc@cs.wisc.edu>
>>
>> LIO uses scatterlist for its page/data management. This patch
>> adds a scatterlist messenger data type, so LIO can pass its sg
>> down directly to rbd.
>
> Just as I mentioned for David's patches, this is the wrong way to
> attack your problem. The block layer already supports WRITE SAME, and
> COMPARE AND WRITE needs to be supported at that level too instead of
> creating artificial bypasses.

Why do I have to use the block layer? I just want to map the se_cmd to
a ceph request and then put it back on the wire. I don't think I need
any of the block layer services. We will do things like io scheduling
on the OSD side. We just want to use LIO as more of a passthrough.
On 07/29/2015 05:59 PM, Mike Christie wrote:
> On 07/29/2015 12:55 PM, Christoph Hellwig wrote:
>> On Wed, Jul 29, 2015 at 04:23:38AM -0500, mchristi@redhat.com wrote:
>>> From: Mike Christie <michaelc@cs.wisc.edu>
>>>
>>> LIO uses scatterlist for its page/data management. This patch
>>> adds a scatterlist messenger data type, so LIO can pass its sg
>>> down directly to rbd.
>>
>> Just as I mentioned for David's patches, this is the wrong way to
>> attack your problem. The block layer already supports WRITE SAME, and
>> COMPARE AND WRITE needs to be supported at that level too instead of
>> creating artificial bypasses.
>
> Why do I have to use the block layer? I just want to map the se_cmd to
> a ceph request and then put it back on the wire. I don't think I need
> any of the block layer services. We will do things like io scheduling
> on the OSD side. We just want to use LIO as more of a passthrough.

Maybe I misunderstood you.

I guess I was viewing this similar to cephfs, where it does not use rbd
and the block layer. It just makes ceph/rados calls directly using
libceph. I am using rbd.c for its helper/wrapper functions around the
libceph ones, but I could just make libceph calls directly too.

Were you saying that because for LIO support we need to do more block
layer'ish operations like write same, compare and write, etc. than
cephfs, I should not do the LIO backend and we should always go through
rbd for LIO support?

Is that for all operations? For distributed TMFs and PRs, are you
thinking I should make those more block layer based (some sort of queue
or block device callouts or REQ_ types), or should those still have
some sort of LIO callouts which could call different locking/cluster
APIs like libceph?
Hey Mike & HCH,

On Wed, 2015-07-29 at 18:40 -0500, Mike Christie wrote:
> On 07/29/2015 05:59 PM, Mike Christie wrote:
>> On 07/29/2015 12:55 PM, Christoph Hellwig wrote:
>>> On Wed, Jul 29, 2015 at 04:23:38AM -0500, mchristi@redhat.com wrote:
>>>> From: Mike Christie <michaelc@cs.wisc.edu>
>>>>
>>>> LIO uses scatterlist for its page/data management. This patch
>>>> adds a scatterlist messenger data type, so LIO can pass its sg
>>>> down directly to rbd.
>>>
>>> Just as I mentioned for David's patches, this is the wrong way to
>>> attack your problem. The block layer already supports WRITE SAME, and
>>> COMPARE AND WRITE needs to be supported at that level too instead of
>>> creating artificial bypasses.
>>
>> Why do I have to use the block layer? I just want to map the se_cmd to
>> a ceph request and then put it back on the wire. I don't think I need
>> any of the block layer services. We will do things like io scheduling
>> on the OSD side. We just want to use LIO as more of a passthrough.
>
> Maybe I misunderstood you.
>
> I guess I was viewing this similar to cephfs, where it does not use rbd
> and the block layer. It just makes ceph/rados calls directly using
> libceph. I am using rbd.c for its helper/wrapper functions around the
> libceph ones, but I could just make libceph calls directly too.
>
> Were you saying that because for LIO support we need to do more block
> layer'ish operations like write same, compare and write, etc. than
> cephfs, I should not do the LIO backend and we should always go through
> rbd for LIO support?

If we're using common request_queue function pointers, it would avoid
the need to maintain an extra backend driver, and be a generic offload
interface for other make_request_fn() based block drivers using
target-core to utilize.

In the WRITE_SAME + COMPARE_AND_WRITE pass-through cases, IBLOCK se_cmd
pass-through should be invoking the driver-provided VAAI callbacks
directly if they exist, and disabling local target-core CDB emulation.

For EXTENDED_COPY pass-through, target-core copy-manager still needs to
be responsible for config_group dependencies across multiple se_device
backends, with an IBLOCK pass-through providing both block_device
*src_bd + *dst_bd pointers into a request_queue callback for different
PUSH/PULL offload models. It should also be able to fall back to local
copy if source + destination devices do not both support the same type
of copy-offload pass-through.

> Is that for all operations? For distributed TMFs and PRs, are you
> thinking I should make those more block layer based (some sort of queue
> or block device callouts or REQ_ types), or should those still have
> some sort of LIO callouts which could call different locking/cluster
> APIs like libceph?

The PR-OUT logic for REGISTER w/ remote I_PORT and remote PREEMPT-*
currently obtains the necessary config_group dependencies, and would
need to be considered for request_queue based PR pass-through too.

Exposing target TMF pass-through into request_queue is a different
beast...
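For readers unfamiliar with the callout shape being discussed, the following is a purely hypothetical sketch of a request_queue-level copy-offload callout that receives both block devices; the struct and function names are invented here and do not correspond to an existing kernel interface.

#include <linux/blkdev.h>

/* Hypothetical only: a callout taking both the source and destination
 * block devices, letting the driver choose a PUSH or PULL offload
 * model, and returning an error so target-core can fall back to its
 * local copy path when offload is not possible. */
struct example_copy_offload_ops {
	int (*copy_range)(struct block_device *src_bd, sector_t src_lba,
			  struct block_device *dst_bd, sector_t dst_lba,
			  sector_t nr_sects);
};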
On Wed, Jul 29, 2015 at 06:40:01PM -0500, Mike Christie wrote:
> I guess I was viewing this similar to cephfs, where it does not use rbd
> and the block layer. It just makes ceph/rados calls directly using
> libceph. I am using rbd.c for its helper/wrapper functions around the
> libceph ones, but I could just make libceph calls directly too.
>
> Were you saying that because for LIO support we need to do more block
> layer'ish operations like write same, compare and write, etc. than
> cephfs, I should not do the LIO backend and we should always go through
> rbd for LIO support?

I'd really prefer that. We have other users for these facilities as
well, and I'd much prefer having block layer support rather than
working around it.

> Is that for all operations? For distributed TMFs and PRs, are you
> thinking I should make those more block layer based (some sort of queue
> or block device callouts or REQ_ types), or should those still have
> some sort of LIO callouts which could call different locking/cluster
> APIs like libceph?

Yes. FYI, I've pushed out my WIP work for PRs here:

http://git.infradead.org/users/hch/scsi.git/shortlog/refs/heads/pr-api

TMFs are a bit of a borderline case, but instead of needing special
bypasses I'd rather find a way to add them. For example, we already
have TMF ioctls for SCSI, so we might as well pull this up to the
block layer.
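Along the same lines, a block-layer persistent-reservation interface could be a small table of callouts on the block device. The sketch below is hypothetical (names and signatures invented here, not taken from the pr-api branch linked above), but it shows the general shape a driver like rbd could implement by translating each callout into cluster operations via libceph.

#include <linux/blkdev.h>

/* Hypothetical sketch of per-device persistent reservation callouts. */
struct example_pr_ops {
	int (*pr_register)(struct block_device *bdev, u64 old_key,
			   u64 new_key, u32 flags);
	int (*pr_reserve)(struct block_device *bdev, u64 key, u32 type,
			  u32 flags);
	int (*pr_release)(struct block_device *bdev, u64 key, u32 type);
	int (*pr_preempt)(struct block_device *bdev, u64 old_key,
			  u64 new_key, u32 type, bool abort);
	int (*pr_clear)(struct block_device *bdev, u64 key);
};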
On Wed, 29 Jul 2015 04:23:38 -0500, mchristi@redhat.com wrote:

> From: Mike Christie <michaelc@cs.wisc.edu>
>
> LIO uses scatterlist for its page/data management. This patch
> adds a scatterlist messenger data type, so LIO can pass its sg
> down directly to rbd.
>
> Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>

...

>  /*
> + * For a sg data item, a piece is whatever remains of the next
> + * entry in the current sg entry, or the first entry in the next
> + * sg in the list.
> + */
> +static void ceph_msg_data_sg_cursor_init(struct ceph_msg_data_cursor *cursor,
> +					 size_t length)
> +{
> +	struct ceph_msg_data *data = cursor->data;
> +	struct scatterlist *sg;
> +
> +	BUG_ON(data->type != CEPH_MSG_DATA_SG);
> +
> +	sg = data->sgl;
> +	BUG_ON(!sg);
> +
> +	cursor->resid = min_t(u64, length, data->sgl_length);
> +	cursor->sg = sg;
> +	cursor->sg_consumed = data->sgl_init_offset;
> +	cursor->last_piece = cursor->resid <= sg->length;
> +}

Just in case the CEPH_MSG_DATA_SG changes are picked up: the
cursor->last_piece calculation here needs to take into account the
data->sgl_init_offset:

	if (cursor->resid <= (sg->length - data->sgl_init_offset))
		cursor->last_piece = true;

...

Cheers, David
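Folding that adjustment into the function would look roughly like the sketch below; everything other than the last_piece calculation is unchanged from the version in the patch.

static void ceph_msg_data_sg_cursor_init(struct ceph_msg_data_cursor *cursor,
					 size_t length)
{
	struct ceph_msg_data *data = cursor->data;
	struct scatterlist *sg;

	BUG_ON(data->type != CEPH_MSG_DATA_SG);

	sg = data->sgl;
	BUG_ON(!sg);

	cursor->resid = min_t(u64, length, data->sgl_length);
	cursor->sg = sg;
	cursor->sg_consumed = data->sgl_init_offset;
	/* only sg->length - sgl_init_offset bytes are available in the
	 * first entry, so compare the residual against that */
	cursor->last_piece =
		cursor->resid <= sg->length - data->sgl_init_offset;
}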
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 3775327..bc1bde8 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -79,6 +79,7 @@ enum ceph_msg_data_type {
 #ifdef CONFIG_BLOCK
 	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
 #endif /* CONFIG_BLOCK */
+	CEPH_MSG_DATA_SG,	/* data source/destination is a scatterlist */
 };
 
 static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
@@ -90,6 +91,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
 #ifdef CONFIG_BLOCK
 	case CEPH_MSG_DATA_BIO:
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_SG:
 		return true;
 	default:
 		return false;
@@ -112,6 +114,11 @@ struct ceph_msg_data {
 			unsigned int	alignment;	/* first page */
 		};
 		struct ceph_pagelist	*pagelist;
+		struct {
+			struct scatterlist *sgl;
+			unsigned int	sgl_init_offset;
+			u64		sgl_length;
+		};
 	};
 };
 
@@ -139,6 +146,10 @@ struct ceph_msg_data_cursor {
 			struct page	*page;		/* page from list */
 			size_t		offset;		/* bytes from list */
 		};
+		struct {
+			struct scatterlist *sg;		/* curr sg */
+			unsigned int	sg_consumed;
+		};
 	};
 };
 
@@ -294,6 +305,8 @@ extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
 extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
 				size_t length);
 #endif /* CONFIG_BLOCK */
+extern void ceph_msg_data_add_sg(struct ceph_msg *msg, struct scatterlist *sgl,
+				 unsigned int sgl_init_offset, u64 length);
 
 extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
 				     bool can_fail);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 0890167..2152f06 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -52,6 +52,7 @@ enum ceph_osd_data_type {
 #ifdef CONFIG_BLOCK
 	CEPH_OSD_DATA_TYPE_BIO,
 #endif /* CONFIG_BLOCK */
+	CEPH_OSD_DATA_TYPE_SG,
 };
 
 struct ceph_osd_data {
@@ -70,6 +71,11 @@ struct ceph_osd_data {
 			struct bio	*bio;		/* list of bios */
 			size_t		bio_length;	/* total in list */
 		};
+		struct {
+			struct scatterlist *sgl;
+			size_t		sgl_length;
+			unsigned int	sgl_init_offset;
+		};
 #endif /* CONFIG_BLOCK */
 	};
 };
@@ -313,7 +319,11 @@ extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *,
 					unsigned int which,
 					struct bio *bio, size_t bio_length);
 #endif /* CONFIG_BLOCK */
-
+extern void osd_req_op_extent_osd_data_sg(struct ceph_osd_request *,
+					unsigned int which,
+					struct scatterlist *sgl,
+					unsigned int init_sg_offset,
+					u64 length);
 extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
 					unsigned int which,
 					struct ceph_pagelist *pagelist);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e3be1d2..08d39fb 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -893,6 +893,75 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
 #endif /* CONFIG_BLOCK */
 
 /*
+ * For a sg data item, a piece is whatever remains of the next
+ * entry in the current sg entry, or the first entry in the next
+ * sg in the list.
+ */
+static void ceph_msg_data_sg_cursor_init(struct ceph_msg_data_cursor *cursor,
+					 size_t length)
+{
+	struct ceph_msg_data *data = cursor->data;
+	struct scatterlist *sg;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_SG);
+
+	sg = data->sgl;
+	BUG_ON(!sg);
+
+	cursor->resid = min_t(u64, length, data->sgl_length);
+	cursor->sg = sg;
+	cursor->sg_consumed = data->sgl_init_offset;
+	cursor->last_piece = cursor->resid <= sg->length;
+}
+
+static struct page *ceph_msg_data_sg_next(struct ceph_msg_data_cursor *cursor,
+					  size_t *page_offset, size_t *length)
+{
+	struct ceph_msg_data *data = cursor->data;
+	struct scatterlist *sg;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_SG);
+
+	sg = cursor->sg;
+	BUG_ON(!sg);
+
+	*page_offset = sg->offset + cursor->sg_consumed;
+
+	if (cursor->last_piece)
+		*length = cursor->resid;
+	else
+		*length = sg->length - cursor->sg_consumed;
+
+	/* currently support non clustered sg pages */
+	return sg_page(sg);
+}
+
+static bool ceph_msg_data_sg_advance(struct ceph_msg_data_cursor *cursor,
+				     size_t bytes)
+{
+	BUG_ON(cursor->data->type != CEPH_MSG_DATA_SG);
+
+	/* Advance the cursor offset */
+	BUG_ON(cursor->resid < bytes);
+	cursor->resid -= bytes;
+	cursor->sg_consumed += bytes;
+
+	if (!bytes || cursor->sg_consumed < cursor->sg->length)
+		return false;	/* more bytes to process in the current page */
+
+	if (!cursor->resid)
+		return false;	/* no more data */
+
+	/* For WRITE_SAME we have a single sg that is written over and over */
+	if (sg_next(cursor->sg))
+		cursor->sg = sg_next(cursor->sg);
+	cursor->sg_consumed = 0;
+
+	cursor->last_piece = cursor->resid <= cursor->sg->length;
+	return true;
+}
+
+/*
  * For a page array, a piece comes from the first page in the array
  * that has not already been fully consumed.
  */
@@ -1075,6 +1144,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
 		ceph_msg_data_bio_cursor_init(cursor, length);
 		break;
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_SG:
+		ceph_msg_data_sg_cursor_init(cursor, length);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		/* BUG(); */
@@ -1123,6 +1195,9 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
 		page = ceph_msg_data_bio_next(cursor, page_offset, length);
 		break;
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_SG:
+		page = ceph_msg_data_sg_next(cursor, page_offset, length);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		page = NULL;
@@ -1159,6 +1234,9 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
 		new_piece = ceph_msg_data_bio_advance(cursor, bytes);
 		break;
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_SG:
+		new_piece = ceph_msg_data_sg_advance(cursor, bytes);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		BUG();
@@ -3182,6 +3260,24 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
 EXPORT_SYMBOL(ceph_msg_data_add_bio);
 #endif /* CONFIG_BLOCK */
 
+void ceph_msg_data_add_sg(struct ceph_msg *msg, struct scatterlist *sgl,
+			  unsigned int sgl_init_offset, u64 length)
+{
+	struct ceph_msg_data *data;
+
+	BUG_ON(!sgl);
+
+	data = ceph_msg_data_create(CEPH_MSG_DATA_SG);
+	BUG_ON(!data);
+	data->sgl = sgl;
+	data->sgl_length = length;
+	data->sgl_init_offset = sgl_init_offset;
+
+	list_add_tail(&data->links, &msg->data);
+	msg->data_length += length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_sg);
+
 /*
  * construct a new message with given type, size
  * the new msg has a ref count of 1.
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index f8178b7..fd0a52e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -128,6 +128,16 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
 }
 #endif /* CONFIG_BLOCK */
 
+static void ceph_osd_data_sg_init(struct ceph_osd_data *osd_data,
+				  struct scatterlist *sgl,
+				  unsigned int init_sg_offset, u64 length)
+{
+	osd_data->type = CEPH_OSD_DATA_TYPE_SG;
+	osd_data->sgl = sgl;
+	osd_data->sgl_length = length;
+	osd_data->sgl_init_offset = init_sg_offset;
+}
+
 #define osd_req_op_data(oreq, whch, typ, fld)	\
 ({						\
 	BUG_ON(whch >= (oreq)->r_num_ops);	\
@@ -206,6 +216,17 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
 #endif /* CONFIG_BLOCK */
 
+void osd_req_op_extent_osd_data_sg(struct ceph_osd_request *osd_req,
+			unsigned int which, struct scatterlist *sgl,
+			unsigned int init_sg_offset, u64 length)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+	ceph_osd_data_sg_init(osd_data, sgl, init_sg_offset, length);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_sg);
+
 static void osd_req_op_cls_request_info_pagelist(
 			struct ceph_osd_request *osd_req,
 			unsigned int which, struct ceph_pagelist *pagelist)
@@ -317,6 +338,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
 	case CEPH_OSD_DATA_TYPE_BIO:
 		return (u64)osd_data->bio_length;
 #endif /* CONFIG_BLOCK */
+	case CEPH_OSD_DATA_TYPE_SG:
+		return osd_data->sgl_length;
 	default:
 		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
 		return 0;
@@ -727,6 +750,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
 	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
 		ceph_msg_data_add_bio(msg, osd_data->bio, length);
 #endif
+	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_SG) {
+		ceph_msg_data_add_sg(msg, osd_data->sgl,
+				     osd_data->sgl_init_offset, length);
 	} else {
 		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
 	}
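As a usage illustration of the new calls above, here is a hypothetical sketch of attaching a single-page scatterlist to the extent op of an already-allocated OSD request. The request, the scatterlist storage, and the page are assumed to come from the caller's normal rbd/libceph setup, and a real caller would need to keep the scatterlist alive until the request completes.

#include <linux/scatterlist.h>
#include <linux/ceph/osd_client.h>

static void example_attach_sg(struct ceph_osd_request *req,
			      struct scatterlist *sg, struct page *page)
{
	/* one-entry scatterlist covering a full page */
	sg_init_table(sg, 1);
	sg_set_page(sg, page, PAGE_SIZE, 0);

	/* op index 0, no initial byte offset into the sg, PAGE_SIZE bytes */
	osd_req_op_extent_osd_data_sg(req, 0, sg, 0, PAGE_SIZE);
}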