Message ID | 20181009104806.6821-3-lhenriques@suse.com (mailing list archive) |
---|---|
State | New, archived |
Series | copy_file_range in cephfs kernel client |
On Tue, Oct 9, 2018 at 12:47 PM Luis Henriques <lhenriques@suse.com> wrote:
>
> Add support for performing remote object copies using the 'copy-from'
> operation.
>
> Signed-off-by: Luis Henriques <lhenriques@suse.com>
> ---
>  include/linux/ceph/osd_client.h | 17 ++++++++
>  include/linux/ceph/rados.h      | 19 +++++++++
>  net/ceph/osd_client.c           | 72 +++++++++++++++++++++++++++++++++
>  3 files changed, 108 insertions(+)

Change the title to "libceph: ...".

>
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 02096da01845..f25b4aaaab09 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -136,6 +136,13 @@ struct ceph_osd_req_op {
>                         u64 expected_object_size;
>                         u64 expected_write_size;
>                 } alloc_hint;
> +               struct {
> +                       u64 snapid;
> +                       u64 src_version;
> +                       u8 flags;
> +                       u32 src_fadvise_flags;
> +                       struct ceph_osd_data osd_data;
> +               } copy_from;
>         };
>  };
>
> @@ -511,6 +518,16 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
>                                 struct timespec64 *mtime,
>                                 struct page **pages, int nr_pages);
>
> +extern int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
> +                              u64 src_snapid, u64 src_version,
> +                              struct ceph_object_id *src_oid,
> +                              struct ceph_object_locator *src_oloc,
> +                              u32 src_fadvise_flags,
> +                              u64 dst_snapid,
> +                              struct ceph_object_id *dst_oid,
> +                              struct ceph_object_locator *dst_oloc,
> +                              u8 dst_fadvise_flags);

Drop extern here.

> +
>  /* watch/notify */
>  struct ceph_osd_linger_request *
>  ceph_osdc_watch(struct ceph_osd_client *osdc,
> diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> index f1988387c5ad..d47540986eff 100644
> --- a/include/linux/ceph/rados.h
> +++ b/include/linux/ceph/rados.h
> @@ -410,6 +410,14 @@ enum {
>  enum {
>         CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
>         CEPH_OSD_OP_FLAG_FAILOK = 2,    /* continue despite failure */
> +       CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */
> +       CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
> +       CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in
> +                                                   the near future */
> +       CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed
> +                                                   in the near future */
> +       CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40,/* data will be accessed only
> +                                                  once by this client */
>  };
>
>  #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
> @@ -497,6 +505,17 @@ struct ceph_osd_op {
>                         __le64 expected_object_size;
>                         __le64 expected_write_size;
>                 } __attribute__ ((packed)) alloc_hint;
> +               struct {
> +                       __le64 snapid;
> +                       __le64 src_version;
> +                       __u8 flags;

Add /* CEPH_OSD_COPY_FROM_FLAG_* */ and pull in that enum.

> +                       /*
> +                        * __le32 flags: CEPH_OSD_OP_FLAG_FADVISE_: mean the
> +                        * fadvise flags for dest object src_fadvise_flags mean
> +                        * the fadvise flags for src object
> +                        */
> +                       __le32 src_fadvise_flags;

This comment is super confusing.  It makes it look like a commented out
field, but it actually refers to op.flags.  This seems to have confused
you into thinking that dst_fadvise_flags go into the above __u8 flags.

Change it to

/*
 * CEPH_OSD_OP_FLAG_FADVISE_*
 *
 * These are fadvise flags for src object, fadvise flags for dest
 * object are in ceph_osd_op::flags.
 */

I'll send a PR to get this clarified on the userspace side as well.
> +               } __attribute__ ((packed)) copy_from;
>         };
>         __le32 payload_len;
> } __attribute__ ((packed));
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 60934bd8796c..9fd19ff556af 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -955,6 +955,14 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
>         case CEPH_OSD_OP_CREATE:
>         case CEPH_OSD_OP_DELETE:
>                 break;
> +       case CEPH_OSD_OP_COPY_FROM:
> +               dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
> +               dst->copy_from.src_version =
> +                       cpu_to_le64(src->copy_from.src_version);
> +               dst->copy_from.flags = src->copy_from.flags;
> +               dst->copy_from.src_fadvise_flags =
> +                       cpu_to_le32(src->copy_from.src_fadvise_flags);
> +               break;
>         default:
>                 pr_err("unsupported osd opcode %s\n",
>                        ceph_osd_op_name(src->op));
> @@ -1908,6 +1916,10 @@ static void setup_request_data(struct ceph_osd_request *req,
>                         ceph_osdc_msg_data_add(req->r_reply,
>                                                &op->notify.response_data);
>                         break;
> +               case CEPH_OSD_OP_COPY_FROM:
> +                       ceph_osdc_msg_data_add(msg,
> +                                              &op->copy_from.osd_data);
> +                       break;

This is a request data item.  Move it up, near CEPH_OSD_OP_NOTIFY_ACK.

>                 }
>
>                 data_len += op->indata_len;
> @@ -5168,6 +5180,66 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
>  }
>  EXPORT_SYMBOL(ceph_osdc_writepages);
>
> +int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
> +                       u64 src_snapid, u64 src_version,
> +                       struct ceph_object_id *src_oid,
> +                       struct ceph_object_locator *src_oloc,
> +                       u32 src_fadvise_flags,
> +                       u64 dst_snapid,

I don't see any snap context around, why does it need dst_snapid?  Also,
libceph will WARN if you try to write to a snapshot.

> +                       struct ceph_object_id *dst_oid,
> +                       struct ceph_object_locator *dst_oloc,
> +                       u8 dst_fadvise_flags)

We should probably add u8 copy_from_flags for generality.

> +{
> +       struct ceph_osd_request *req = NULL;
> +       struct ceph_options *opts = osdc->client->options;
> +       struct ceph_osd_req_op *op;
> +       struct page **pages;
> +       void *p, *end;
> +       int ret;
> +
> +       pages = ceph_alloc_page_vector(1, GFP_NOIO);
> +       if (IS_ERR(pages))
> +               return PTR_ERR(pages);
> +       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
> +       if (!req) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +       req->r_flags = CEPH_OSD_FLAG_WRITE;
> +       op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, 0);
> +       op->copy_from.snapid = src_snapid;
> +       op->copy_from.src_version = src_version;
> +       op->copy_from.flags = dst_fadvise_flags;

This should be either copy_from_flags or 0.

I'd factor this code out, similar to osd_req_op_notify_ack_init().
This function would then only be concerned with req itself.

> +       op->copy_from.src_fadvise_flags = src_fadvise_flags;
> +
> +       ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
> +       ceph_oid_copy(&req->r_t.base_oid, dst_oid);
> +
> +       p = page_address(pages[0]);
> +       end = p + PAGE_SIZE;
> +       ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
> +       encode_oloc(&p, end, src_oloc);
> +       op->indata_len = PAGE_SIZE - (end - p);
> +
> +       ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
> +                                op->indata_len, 0,
> +                                false, true);
> +       req->r_snapid = dst_snapid;
> +       req->r_data_offset = 0; /* XXX dst_off? */
> +
> +       ret = ceph_osdc_alloc_messages(req, GFP_NOFS);

You are using GFP_NOIO for pages and req and GFP_NOFS here.  Does this
operation need either?  If it's only going to be called in normal
syscall context, GFP_KERNEL should be good enough.
> +       if (ret)
> +               goto out;
> +       ceph_osdc_start_request(osdc, req, false);
> +       ret = wait_request_timeout(req, opts->mount_timeout);

I don't think the timeout is needed here.  CephFS uses mount_timeout for
MDS operations, OSD requests aren't timed out in general.

Thanks,

                Ilya
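For reference, the CEPH_OSD_COPY_FROM_FLAG_* enum mentioned above would look
roughly like the following.  The names, values and comments are assumed to
match the userspace src/include/rados.h definition and should be verified
against it before being pulled into the kernel header:

enum {
        CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1,          /* part of a flush op */
        CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */
        CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4,   /* ignore OSD cache logic */
        CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to clone id */
        CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16,     /* order with write */
};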
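A possible shape for the factoring-out Ilya suggests, in the style of
osd_req_op_notify_ack_init(): the helper owns the op and its data page,
while ceph_osdc_copy_from() would keep only the request setup and
submission.  The helper name osd_req_op_copy_from_init() and the
copy_from_flags parameter are illustrative; the calls themselves are the
ones already used in the patch:

static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
                                     u64 src_snapid, u64 src_version,
                                     struct ceph_object_id *src_oid,
                                     struct ceph_object_locator *src_oloc,
                                     u32 src_fadvise_flags,
                                     u32 dst_fadvise_flags,
                                     u8 copy_from_flags)
{
        struct ceph_osd_req_op *op;
        struct page **pages;
        void *p, *end;

        /* a single page holds the encoded source oid + oloc payload */
        pages = ceph_alloc_page_vector(1, GFP_KERNEL);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

        /* dest fadvise flags go into ceph_osd_op::flags, per the comment fix */
        op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags);
        op->copy_from.snapid = src_snapid;
        op->copy_from.src_version = src_version;
        op->copy_from.flags = copy_from_flags;
        op->copy_from.src_fadvise_flags = src_fadvise_flags;

        p = page_address(pages[0]);
        end = p + PAGE_SIZE;
        ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
        encode_oloc(&p, end, src_oloc);
        op->indata_len = PAGE_SIZE - (end - p);

        ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
                                 op->indata_len, 0, false, true);
        return 0;
}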
On Tue, Oct 9, 2018 at 4:10 PM Ilya Dryomov <idryomov@gmail.com> wrote:
>
> On Tue, Oct 9, 2018 at 12:47 PM Luis Henriques <lhenriques@suse.com> wrote:
> >
> > Add support for performing remote object copies using the 'copy-from'
> > operation.
> >
> > Signed-off-by: Luis Henriques <lhenriques@suse.com>

[...]

> This comment is super confusing.  It makes it look like a commented out
> field, but it actually refers to op.flags.  This seems to have confused
> you into thinking that dst_fadvise_flags go into the above __u8 flags.
>
> Change it to
>
> /*
>  * CEPH_OSD_OP_FLAG_FADVISE_*
>  *
>  * These are fadvise flags for src object, fadvise flags for dest
>  * object are in ceph_osd_op::flags.
>  */
>
> I'll send a PR to get this clarified on the userspace side as well.

https://github.com/ceph/ceph/pull/24497

Thanks,

                Ilya
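Taken together, the two rados.h comments above would leave the new on-wire
struct reading roughly as follows; the CEPH_OSD_COPY_FROM_FLAG_* reference
assumes that enum gets pulled in from userspace as suggested:

                struct {
                        __le64 snapid;
                        __le64 src_version;
                        __u8 flags;             /* CEPH_OSD_COPY_FROM_FLAG_* */
                        /*
                         * CEPH_OSD_OP_FLAG_FADVISE_*
                         *
                         * These are fadvise flags for src object, fadvise
                         * flags for dest object are in ceph_osd_op::flags.
                         */
                        __le32 src_fadvise_flags;
                } __attribute__ ((packed)) copy_from;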
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 02096da01845..f25b4aaaab09 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -136,6 +136,13 @@ struct ceph_osd_req_op {
                        u64 expected_object_size;
                        u64 expected_write_size;
                } alloc_hint;
+               struct {
+                       u64 snapid;
+                       u64 src_version;
+                       u8 flags;
+                       u32 src_fadvise_flags;
+                       struct ceph_osd_data osd_data;
+               } copy_from;
        };
 };
 
@@ -511,6 +518,16 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
                                struct timespec64 *mtime,
                                struct page **pages, int nr_pages);
 
+extern int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
+                              u64 src_snapid, u64 src_version,
+                              struct ceph_object_id *src_oid,
+                              struct ceph_object_locator *src_oloc,
+                              u32 src_fadvise_flags,
+                              u64 dst_snapid,
+                              struct ceph_object_id *dst_oid,
+                              struct ceph_object_locator *dst_oloc,
+                              u8 dst_fadvise_flags);
+
 /* watch/notify */
 struct ceph_osd_linger_request *
 ceph_osdc_watch(struct ceph_osd_client *osdc,
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index f1988387c5ad..d47540986eff 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -410,6 +410,14 @@ enum {
 enum {
        CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
        CEPH_OSD_OP_FLAG_FAILOK = 2,    /* continue despite failure */
+       CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */
+       CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
+       CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in
+                                                   the near future */
+       CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed
+                                                   in the near future */
+       CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40,/* data will be accessed only
+                                                  once by this client */
 };
 
 #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
@@ -497,6 +505,17 @@ struct ceph_osd_op {
                        __le64 expected_object_size;
                        __le64 expected_write_size;
                } __attribute__ ((packed)) alloc_hint;
+               struct {
+                       __le64 snapid;
+                       __le64 src_version;
+                       __u8 flags;
+                       /*
+                        * __le32 flags: CEPH_OSD_OP_FLAG_FADVISE_: mean the
+                        * fadvise flags for dest object src_fadvise_flags mean
+                        * the fadvise flags for src object
+                        */
+                       __le32 src_fadvise_flags;
+               } __attribute__ ((packed)) copy_from;
        };
        __le32 payload_len;
 } __attribute__ ((packed));
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 60934bd8796c..9fd19ff556af 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -955,6 +955,14 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
        case CEPH_OSD_OP_CREATE:
        case CEPH_OSD_OP_DELETE:
                break;
+       case CEPH_OSD_OP_COPY_FROM:
+               dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
+               dst->copy_from.src_version =
+                       cpu_to_le64(src->copy_from.src_version);
+               dst->copy_from.flags = src->copy_from.flags;
+               dst->copy_from.src_fadvise_flags =
+                       cpu_to_le32(src->copy_from.src_fadvise_flags);
+               break;
        default:
                pr_err("unsupported osd opcode %s\n",
                       ceph_osd_op_name(src->op));
@@ -1908,6 +1916,10 @@ static void setup_request_data(struct ceph_osd_request *req,
                        ceph_osdc_msg_data_add(req->r_reply,
                                               &op->notify.response_data);
                        break;
+               case CEPH_OSD_OP_COPY_FROM:
+                       ceph_osdc_msg_data_add(msg,
+                                              &op->copy_from.osd_data);
+                       break;
                }
 
                data_len += op->indata_len;
@@ -5168,6 +5180,66 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 }
 EXPORT_SYMBOL(ceph_osdc_writepages);
 
+int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
+                       u64 src_snapid, u64 src_version,
+                       struct ceph_object_id *src_oid,
+                       struct ceph_object_locator *src_oloc,
+                       u32 src_fadvise_flags,
+                       u64 dst_snapid,
+                       struct ceph_object_id *dst_oid,
+                       struct ceph_object_locator *dst_oloc,
+                       u8 dst_fadvise_flags)
+{
+       struct ceph_osd_request *req = NULL;
+       struct ceph_options *opts = osdc->client->options;
+       struct ceph_osd_req_op *op;
+       struct page **pages;
+       void *p, *end;
+       int ret;
+
+       pages = ceph_alloc_page_vector(1, GFP_NOIO);
+       if (IS_ERR(pages))
+               return PTR_ERR(pages);
+       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+       if (!req) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       req->r_flags = CEPH_OSD_FLAG_WRITE;
+       op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, 0);
+       op->copy_from.snapid = src_snapid;
+       op->copy_from.src_version = src_version;
+       op->copy_from.flags = dst_fadvise_flags;
+       op->copy_from.src_fadvise_flags = src_fadvise_flags;
+
+       ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
+       ceph_oid_copy(&req->r_t.base_oid, dst_oid);
+
+       p = page_address(pages[0]);
+       end = p + PAGE_SIZE;
+       ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
+       encode_oloc(&p, end, src_oloc);
+       op->indata_len = PAGE_SIZE - (end - p);
+
+       ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
+                                op->indata_len, 0,
+                                false, true);
+       req->r_snapid = dst_snapid;
+       req->r_data_offset = 0; /* XXX dst_off? */
+
+       ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
+       if (ret)
+               goto out;
+       ceph_osdc_start_request(osdc, req, false);
+       ret = wait_request_timeout(req, opts->mount_timeout);
+out:
+       ceph_release_page_vector(pages, 1);
+       if (req)
+               ceph_osdc_put_request(req);
+       return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_copy_from);
+
 int __init ceph_osdc_setup(void)
 {
        size_t size = sizeof(struct ceph_osd_request) +
Add support for performing remote object copies using the 'copy-from'
operation.

Signed-off-by: Luis Henriques <lhenriques@suse.com>
---
 include/linux/ceph/osd_client.h | 17 ++++++++
 include/linux/ceph/rados.h      | 19 +++++++++
 net/ceph/osd_client.c           | 72 +++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+)
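For context, a caller such as the cephfs copy_file_range path this series
targets would issue one ceph_osdc_copy_from() call per object.  The sketch
below uses the signature as posted (before the review changes above); the
helper name copy_one_object and the src_pool/src_ino/src_objnum style
parameters are placeholders, not part of the patch:

/* Illustrative only: copy one whole RADOS object from src to dst. */
static int copy_one_object(struct ceph_osd_client *osdc,
                           s64 src_pool, u64 src_ino, u64 src_objnum,
                           u64 src_snapid, u64 src_version,
                           s64 dst_pool, u64 dst_ino, u64 dst_objnum)
{
        struct ceph_object_locator src_oloc = { .pool = src_pool };
        struct ceph_object_locator dst_oloc = { .pool = dst_pool };
        struct ceph_object_id src_oid, dst_oid;
        int err;

        ceph_oid_init(&src_oid);
        ceph_oid_init(&dst_oid);
        /* cephfs-style object names: <inode hex>.<object number> */
        ceph_oid_printf(&src_oid, "%llx.%08llx", src_ino, src_objnum);
        ceph_oid_printf(&dst_oid, "%llx.%08llx", dst_ino, dst_objnum);

        /* dst snapid is CEPH_NOSNAP: write to the head object, not a snapshot */
        err = ceph_osdc_copy_from(osdc, src_snapid, src_version,
                                  &src_oid, &src_oloc,
                                  CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL,
                                  CEPH_NOSNAP, &dst_oid, &dst_oloc,
                                  CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL);

        ceph_oid_destroy(&src_oid);
        ceph_oid_destroy(&dst_oid);
        return err;
}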