Message ID | 447c0db42cc71e92b1c2209f80e0e419628c3a9c.1432177493.git.liwang@ubuntukylin.com (mailing list archive) |
---|---|
State | New, archived |
On Thu, May 21, 2015 at 12:19 PM, Li Wang <liwang@ubuntukylin.com> wrote:
> From: Min Chen <minchen@ubuntukylin.com>
>
> Signed-off-by: Min Chen <minchen@ubuntukylin.com>
> Signed-off-by: Li Wang <liwang@ubuntukylin.com>
> Signed-off-by: Yunchuan Wen <yunchuanwen@ubuntukylin.com>
> ---
>  drivers/block/rbd.c | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 183 insertions(+), 3 deletions(-)

[...]

Hi,

Sorry for the late feedback, I thought I had sent this..

I have a number of high-level issues with this.

First and foremost, the lifetime rules are unclear. It looks to me like
there is nothing preventing a bunch of async copyup requests from hanging
around after rbd unmap completes. These copyups bump the img_request
refcount, which means that img_requests, along with rbd_device, rbd_client
and ceph_client, will also hang in there after rbd unmap, waiting for late
replies. This is wrong: if there are no images mapped, there should be no
rbd or libceph state around. I'm pretty sure async copyup requests don't
need an osd_client callback at all - you can just send and forget.

Second, I think sync (copy-on-write) and async (copy-on-read) copyups
should be coordinated with each other. If there is an async copyup in
progress, a sync copyup can just wait for it to complete.

Related to the above is the copyup request list. I think it should be a
per-image rather than a per-img_request thing - copyup_list in struct
rbd_img_request doesn't make much sense to me. What exactly is its use
there?

Fourth, unless I'm missing something, the following

    rbd_img_parent_read(obj_request);
    rbd_assert(obj_request->img_request);
    if(is_copy_on_read(obj_request->img_request->rbd_dev))
            rbd_img_copyup_start(...);

in rbd_osd_read_callback() means that async copyup requests will be issued
for objects that don't exist in the parent. I think the correct way to
handle this is to wait for the parent read request to complete and issue a
copyup only if it completes successfully.

Thanks,

                Ilya
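As a rough illustration of the first point, the sketch below shows what a copyup write that pins no rbd state might look like. It only reuses the osd_client calls already visible in the patch; rbd_async_copyup_done() and rbd_async_copyup_write() are hypothetical names, and it assumes that passing own_pages as true to osd_req_op_cls_request_data_pages() makes libceph free the pages together with the request, so the completion handler has nothing rbd-side to touch. Whether even that minimal callback can be dropped entirely depends on libceph's request lifetime rules.

/* Sketch only: a detached async copyup write that holds no rbd references. */
static void rbd_async_copyup_done(struct ceph_osd_request *osd_req,
                                  struct ceph_msg *msg)
{
        /* No rbd state to update; just drop the submitter's reference. */
        ceph_osdc_put_request(osd_req);
}

static int rbd_async_copyup_write(struct ceph_osd_client *osdc,
                                  struct ceph_snap_context *snapc,
                                  u64 pool_id, const char *object_name,
                                  struct page **pages, u64 object_size)
{
        struct timespec mtime = CURRENT_TIME;
        struct ceph_osd_request *osd_req;
        int ret;

        osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
        if (!osd_req)
                return -ENOMEM;

        osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
        /* own_pages == true: the pages are released with the request. */
        osd_req_op_cls_request_data_pages(osd_req, 0, pages, object_size,
                                          0, false, true);
        osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
        osd_req->r_callback = rbd_async_copyup_done;
        osd_req->r_base_oloc.pool = pool_id;
        ceph_oid_set_name(&osd_req->r_base_oid, object_name);

        ceph_osdc_build_request(osd_req, 0, snapc, CEPH_NOSNAP, &mtime);
        ret = ceph_osdc_start_request(osdc, osd_req, false);
        if (ret)
                ceph_osdc_put_request(osd_req);
        return ret;
}

The key property is that nothing here calls rbd_img_request_get() or otherwise ties the in-flight copyup to the mapping, so an unmap does not have to wait for late replies.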
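For the second and third points, one possible shape is to key in-flight copyups by object number on the device and let the copy-on-write path wait on a matching entry instead of issuing its own copyup. Everything below is a sketch with hypothetical names: copyup_list and copyup_lock in struct rbd_device, the pared-down struct rbd_copyup_request fields, and the rbd_copyup_find/wait/finish helpers are not in the patch.

/* Sketch only: per-image (not per-img_request) copyup tracking. */
struct rbd_copyup_request {
        u64                     object_no;
        struct list_head        node;           /* on rbd_dev->copyup_list */
        struct completion       done;
        struct kref             kref;
};

static void rbd_copyup_release(struct kref *kref)
{
        kfree(container_of(kref, struct rbd_copyup_request, kref));
}

/* Look up an in-flight copyup for object_no, taking a reference on it. */
static struct rbd_copyup_request *
rbd_copyup_find(struct rbd_device *rbd_dev, u64 object_no)
{
        struct rbd_copyup_request *cur;

        spin_lock(&rbd_dev->copyup_lock);
        list_for_each_entry(cur, &rbd_dev->copyup_list, node) {
                if (cur->object_no == object_no) {
                        kref_get(&cur->kref);
                        spin_unlock(&rbd_dev->copyup_lock);
                        return cur;
                }
        }
        spin_unlock(&rbd_dev->copyup_lock);
        return NULL;
}

/* Copy-on-write path: wait for an async copyup instead of duplicating it. */
static bool rbd_copyup_wait(struct rbd_device *rbd_dev, u64 object_no)
{
        struct rbd_copyup_request *req = rbd_copyup_find(rbd_dev, object_no);

        if (!req)
                return false;                   /* nothing in flight */
        wait_for_completion(&req->done);
        kref_put(&req->kref, rbd_copyup_release);
        return true;                            /* object has been copied up */
}

/* Copy-on-read completion: unlink the entry and wake any waiters. */
static void rbd_copyup_finish(struct rbd_device *rbd_dev,
                              struct rbd_copyup_request *req)
{
        spin_lock(&rbd_dev->copyup_lock);
        list_del(&req->node);
        spin_unlock(&rbd_dev->copyup_lock);
        complete_all(&req->done);
        kref_put(&req->kref, rbd_copyup_release);
}

The copy-on-read path is assumed to init_completion()/kref_init() the entry and add it to rbd_dev->copyup_list under the same lock before sending the parent read, so the duplicate check now done in rbd_img_copyup_start() naturally becomes per image; propagating an async copyup failure back to the waiting sync path is omitted here for brevity.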
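For the fourth point, the gist is a result check in the parent-read completion path rather than a call right next to rbd_img_parent_read(). In this sketch, is_copy_on_read() and rbd_img_copyup_start() are the patch's own helpers and result/xferred/object_name are existing obj_request fields, while rbd_parent_read_done() and the exact hook point are assumptions.

/* Sketch only: start a copy-on-read copyup once the parent read succeeded. */
static void rbd_parent_read_done(struct rbd_obj_request *obj_request)
{
        struct rbd_img_request *img_request = obj_request->img_request;

        if (!img_request)
                return;

        /* Parent object absent or read failed: nothing to copy up. */
        if (obj_request->result < 0 || !obj_request->xferred)
                return;

        if (is_copy_on_read(img_request->rbd_dev))
                rbd_img_copyup_start(img_request, obj_request->object_name);
}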
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 99a3a556..51d8398 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1851,12 +1851,16 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
                 obj_request, img_request, obj_request->result,
                 obj_request->xferred, obj_request->length);
         if (layered && obj_request->result == -ENOENT &&
-                        obj_request->img_offset < rbd_dev->parent_overlap)
+                        obj_request->img_offset < rbd_dev->parent_overlap) {
                 rbd_img_parent_read(obj_request);
-        else if (img_request)
+                rbd_assert(obj_request->img_request);
+                if(is_copy_on_read(obj_request->img_request->rbd_dev))
+                        rbd_img_copyup_start(obj_request->img_request, obj_request->object_name);
+        } else if (img_request) {
                 rbd_img_obj_request_read_callback(obj_request);
-        else
+        } else {
                 obj_request_done_set(obj_request);
+        }
 }
 
 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
@@ -2915,6 +2919,182 @@ out_err:
         return result;
 }
 
+static void rbd_img_copyup_end(struct rbd_copyup_request *copyup_request)
+{
+        struct rbd_img_request *img_request = NULL;
+        rbd_assert(copyup_request);
+        img_request = copyup_request->img_request;
+        rbd_img_copyup_request_del(img_request, copyup_request);
+        rbd_copyup_request_destroy(&copyup_request->kref);
+        rbd_img_request_put(img_request);
+}
+
+static void rbd_osd_req_copyup_callback(struct ceph_osd_request *osd_req,
+                                struct ceph_msg *msg)
+{
+        struct rbd_copyup_request *copyup_request = NULL;
+        rbd_assert(osd_req);
+        copyup_request = osd_req->r_priv;
+        copyup_request->result = osd_req->r_result;
+        if(copyup_request->callback)
+                copyup_request->callback(copyup_request);
+        else
+                complete_all(&copyup_request->completion);
+}
+
+static void rbd_img_copyup_write_async(struct rbd_copyup_request *copyup_request)
+{
+        struct rbd_img_request *img_request = NULL;
+        struct ceph_snap_context *snapc = NULL;
+        struct ceph_osd_request *osd_req = NULL;
+        struct ceph_osd_client *osdc = NULL;
+        struct rbd_device *rbd_dev = NULL;
+        struct page **pages = NULL;
+        struct timespec mtime = CURRENT_TIME;
+        u32 page_count = 0;
+        u64 object_size = 0;
+        int result = 0;
+
+        /* if copyup_request read from parent failed, just end it */
+        if (copyup_request->result < 0) {
+                rbd_img_copyup_end(copyup_request);
+                goto out;
+        }
+
+        img_request = copyup_request->img_request;
+        rbd_assert(img_request);
+        rbd_dev = img_request->rbd_dev;
+        rbd_assert(rbd_dev);
+        osdc = &rbd_dev->rbd_client->client->osdc;
+        rbd_assert(osdc);
+        snapc = rbd_dev->header.snapc;
+
+        ceph_osdc_put_request(copyup_request->osd_req);
+
+        copyup_request->osd_req = NULL;
+        osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
+        if (!osd_req)
+                goto out;
+
+        pages = copyup_request->copyup_pages;
+        page_count = copyup_request->copyup_page_count;
+        object_size = (u64)1 << rbd_dev->header.obj_order;
+
+        /* Initialize copyup op */
+        osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
+        osd_req_op_cls_request_data_pages(osd_req, 0, pages, object_size, 0, false, false);
+        osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+        osd_req->r_callback = rbd_osd_req_copyup_callback;
+        osd_req->r_priv = copyup_request;
+
+        osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+        ceph_oid_set_name(&osd_req->r_base_oid, copyup_request->object_name);
+
+        copyup_request->osd_req = osd_req;
+        copyup_request->callback = rbd_img_copyup_end;
+
+        ceph_osdc_build_request(osd_req, 0, snapc, CEPH_NOSNAP, &mtime);
+        result = ceph_osdc_start_request(osdc, osd_req, false);
+        if(!result)
+                goto out;
+
+        ceph_osdc_put_request(osd_req);
+out:
+        return;
+}
+
+static void rbd_img_copyup_start(struct rbd_img_request *img_request,
+                                const char *object_name)
+{
+        struct rbd_copyup_request *copyup_request = NULL;
+        struct rbd_device *rbd_dev = NULL;
+        struct ceph_snap_context *snapc = NULL;
+        struct ceph_osd_client *osdc = NULL;
+        struct ceph_osd_request *osd_req = NULL;
+        const char *parent_object_name = NULL;
+
+        int result = 0;
+        u64 object_no = (u64)-1;
+        u64 object_size = 0;
+        u64 snap_id = 0;
+        __u8 obj_order = 0;
+        bool is_read = false;
+
+        rbd_assert(img_request != NULL);
+        rbd_assert(object_name != NULL);
+
+        rbd_dev = img_request->rbd_dev;
+        rbd_assert(rbd_dev != NULL);
+
+        is_read = !img_request_write_test(img_request) &&
+                        !img_request_discard_test(img_request);
+
+        object_no = rbd_object_no(rbd_dev, object_name);
+        obj_order = rbd_dev->header.obj_order;
+        object_size = (u64)1 << obj_order;
+
+        spin_lock(&img_request->copyup_list_lock);
+        /* Find if object_no exists in copyup_list */
+        for_each_copyup_request(img_request, copyup_request) {
+                /* Found, just return */
+                if(copyup_request->object_no == object_no) {
+                        spin_unlock(&img_request->copyup_list_lock);
+                        return;
+                }
+        }
+        spin_unlock(&img_request->copyup_list_lock);
+
+        /* Not found, send new copyup request */
+        copyup_request = NULL;
+        osdc = &rbd_dev->rbd_client->client->osdc;
+        parent_object_name = rbd_segment_name(rbd_dev->parent, object_no << obj_order);
+        if (!parent_object_name)
+                goto out;
+        osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
+        if (!osd_req)
+                goto out;
+        copyup_request = rbd_copyup_request_create(object_name, rbd_dev);
+        if (!copyup_request) {
+                ceph_osdc_put_request(osd_req);
+                goto out;
+        }
+
+        /* Init osd_req */
+        osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ, 0, object_size, 0, 0);
+        osd_req_op_extent_osd_data_pages(osd_req, 0, copyup_request->copyup_pages, object_size,
+                                        0, false, false);
+
+        osd_req->r_flags = CEPH_OSD_FLAG_READ;
+        osd_req->r_callback = rbd_osd_req_copyup_callback;
+        osd_req->r_priv = copyup_request;
+
+        osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->parent->layout);
+        ceph_oid_set_name(&osd_req->r_base_oid, parent_object_name);
+        rbd_segment_name_free(parent_object_name);
+
+        /* Init copyup request */
+        rbd_assert(copyup_request->osd_req == NULL);
+        copyup_request->osd_req = osd_req;
+        copyup_request->callback = rbd_img_copyup_write_async;
+
+        /* Encode osd_req data */
+        snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
+        ceph_osdc_build_request(osd_req, 0, NULL, snap_id, NULL);
+
+        /* Add copyup request to img_request->copyup_list */
+        rbd_img_copyup_request_add(img_request, copyup_request);
+
+        rbd_img_request_get(img_request);
+
+        /* Send osd_req */
+        result = ceph_osdc_start_request(osdc, osd_req, false);
+        if (!result)
+                goto out;
+out:
+        return;
+}
+
+
 static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
 {
         struct rbd_obj_request *orig_request;