@@ -34,10 +34,12 @@
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
+#include <linux/ceph/journaler.h>
#include <linux/parser.h>
#include <linux/bsearch.h>
#include <linux/kernel.h>
+#include <linux/bio.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
@@ -115,12 +117,14 @@ static int atomic_dec_return_safe(atomic_t *v)
#define RBD_FEATURE_LAYERING (1ULL<<0)
#define RBD_FEATURE_STRIPINGV2 (1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2)
+#define RBD_FEATURE_JOURNALING (1ULL<<6)
#define RBD_FEATURE_DATA_POOL (1ULL<<7)
#define RBD_FEATURE_OPERATIONS (1ULL<<8)
#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
RBD_FEATURE_STRIPINGV2 | \
RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_JOURNALING | \
RBD_FEATURE_DATA_POOL | \
RBD_FEATURE_OPERATIONS)
@@ -293,6 +297,8 @@ struct rbd_img_request {
u32 obj_request_count;
u32 pending_count;
+ struct ceph_journaler_future *journal_future;
+
struct kref kref;
};
@@ -375,6 +381,8 @@ struct rbd_device {
atomic_t parent_ref;
struct rbd_device *parent;
+ struct rbd_journal *journal;
+
/* Block layer tags. */
struct blk_mq_tag_set tag_set;
@@ -402,6 +410,14 @@ enum rbd_dev_flags {
RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};
+struct rbd_journal {
+ struct ceph_journaler *journaler;
+
+ struct ceph_journaler_client *client;
+
+ uint64_t tag_tid;
+};
+
static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
static LIST_HEAD(rbd_dev_list); /* devices */
@@ -2562,12 +2578,19 @@ static void rbd_img_end_child_request(struct rbd_img_request *img_req)
static void rbd_img_end_request(struct rbd_img_request *img_req)
{
rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
- rbd_assert((!img_req->result &&
- img_req->xferred == blk_rq_bytes(img_req->rq)) ||
- (img_req->result < 0 && !img_req->xferred));
- blk_mq_end_request(img_req->rq,
- errno_to_blk_status(img_req->result));
+ if (img_req->rq) {
+ rbd_assert((!img_req->result &&
+ img_req->xferred == blk_rq_bytes(img_req->rq)) ||
+ (img_req->result < 0 && !img_req->xferred));
+ blk_mq_end_request(img_req->rq,
+ errno_to_blk_status(img_req->result));
+ }
+
+ if (img_req->journal_future) {
+ ceph_journaler_client_committed(img_req->rbd_dev->journal->journaler, img_req->journal_future->commit_tid);
+ }
+
rbd_img_request_put(img_req);
}
@@ -2576,8 +2599,9 @@ static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
struct rbd_img_request *img_req;
again:
- if (!__rbd_obj_handle_request(obj_req))
+ if (!__rbd_obj_handle_request(obj_req)) {
return;
+ }
img_req = obj_req->img_request;
spin_lock(&img_req->completion_lock);
@@ -3602,6 +3626,10 @@ static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
return ret;
}
+static int rbd_journal_append(struct rbd_device *rbd_dev, struct bio *bio,
+ u64 offset, u64 length, enum obj_operation_type op_type,
+ struct ceph_journaler_future **journal_future);
+
static void rbd_queue_workfn(struct work_struct *work)
{
struct request *rq = blk_mq_rq_from_pdu(work);
@@ -3707,6 +3735,13 @@ static void rbd_queue_workfn(struct work_struct *work)
if (result)
goto err_img_request;
+ if (rbd_dev->header.features & RBD_FEATURE_JOURNALING) {
+ result = rbd_journal_append(rbd_dev, rq->bio, offset, length, op_type, &img_request->journal_future);
+ if (result) {
+ goto err_unlock;
+ }
+ }
+
rbd_img_request_submit(img_request);
if (must_be_locked)
up_read(&rbd_dev->lock_rwsem);
@@ -5507,6 +5542,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
if (ret)
goto err_out_disk;
+
set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
@@ -5548,8 +5584,320 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
return ret;
}
+enum rbd_journal_event_type {
+ EVENT_TYPE_AIO_DISCARD = 0,
+ EVENT_TYPE_AIO_WRITE = 1,
+ EVENT_TYPE_AIO_FLUSH = 2,
+ EVENT_TYPE_OP_FINISH = 3,
+ EVENT_TYPE_SNAP_CREATE = 4,
+ EVENT_TYPE_SNAP_REMOVE = 5,
+ EVENT_TYPE_SNAP_RENAME = 6,
+ EVENT_TYPE_SNAP_PROTECT = 7,
+ EVENT_TYPE_SNAP_UNPROTECT = 8,
+ EVENT_TYPE_SNAP_ROLLBACK = 9,
+ EVENT_TYPE_RENAME = 10,
+ EVENT_TYPE_RESIZE = 11,
+ EVENT_TYPE_FLATTEN = 12,
+ EVENT_TYPE_DEMOTE_PROMOTE = 13,
+ EVENT_TYPE_SNAP_LIMIT = 14,
+ EVENT_TYPE_UPDATE_FEATURES = 15,
+ EVENT_TYPE_METADATA_SET = 16,
+ EVENT_TYPE_METADATA_REMOVE = 17,
+ EVENT_TYPE_AIO_WRITESAME = 18,
+ EVENT_TYPE_AIO_COMPARE_AND_WRITE = 19,
+};
+
+static struct bio_vec *setup_write_bvecs(void *buf, u64 offset, u64 length)
+{
+ u32 i;
+ struct bio_vec *bvecs = NULL;
+ u32 bvec_count = 0;
+
+ bvec_count = calc_pages_for(offset, length);
+ bvecs = kcalloc(bvec_count, sizeof(*bvecs), GFP_NOIO);
+ if (!bvecs)
+ goto err;
+
+ offset = offset % PAGE_SIZE;
+ for (i = 0; i < bvec_count; i++) {
+ unsigned int len = min(length, (u64)PAGE_SIZE - offset);
+
+ bvecs[i].bv_page = alloc_page(GFP_NOIO);
+ if (!bvecs[i].bv_page)
+ goto free_bvecs;
+
+ bvecs[i].bv_offset = offset;
+ bvecs[i].bv_len = len;
+ memcpy(page_address(bvecs[i].bv_page) + bvecs[i].bv_offset, buf, bvecs[i].bv_len);
+ length -= len;
+ buf += len;
+ offset = 0;
+ }
+
+ rbd_assert(!length);
+
+ return bvecs;
+
+free_bvecs:
+err:
+ return NULL;
+}
+
+static int rbd_journal_handle_aio_discard(struct rbd_device *rbd_dev, void **p, void *end, u8 struct_v)
+{
+ uint64_t offset;
+ uint64_t length;
+ int result = 0;
+ enum obj_operation_type op_type;
+ struct rbd_img_request *img_request;
+ struct ceph_snap_context *snapc = NULL;
+
+ offset = ceph_decode_64(p);
+ length = ceph_decode_64(p);
+
+ snapc = rbd_dev->header.snapc;
+ ceph_get_snap_context(snapc);
+ op_type = OBJ_OP_WRITE;
+
+ img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
+ if (!img_request) {
+ result = -ENOMEM;
+ goto err;
+ }
+
+ result = rbd_img_fill_nodata(img_request, offset, length);
+ if (result)
+ goto err;
+
+ rbd_img_request_submit(img_request);
+
+err:
+ return result;
+}
+
+static int rbd_journal_handle_aio_write(struct rbd_device *rbd_dev, void **p, void *end, u8 struct_v)
+{
+ uint64_t offset;
+ uint64_t length;
+ char *data;
+ ssize_t data_len;
+ int result = 0;
+ enum obj_operation_type op_type;
+ struct ceph_snap_context *snapc = NULL;
+ struct rbd_img_request *img_request;
+
+ struct ceph_file_extent ex;
+ struct bio_vec *bvecs;
+
+ offset = ceph_decode_64(p);
+ length = ceph_decode_64(p);
+
+ data = ceph_extract_encoded_string(p, end, &data_len, GFP_NOIO);
+ if (!data)
+ return -ENOMEM;
+
+ snapc = rbd_dev->header.snapc;
+ ceph_get_snap_context(snapc);
+ op_type = OBJ_OP_WRITE;
+
+ img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
+ if (!img_request) {
+ result = -ENOMEM;
+ goto err;
+ }
+
+ snapc = NULL; /* img_request consumes a ref */
+
+ ex.fe_off = offset;
+ ex.fe_len = length;
+
+ bvecs = setup_write_bvecs(data, offset, length);
+ if (!bvecs)
+ pr_err("failed to alloc bvecs.");
+ result = rbd_img_fill_from_bvecs(img_request,
+ &ex, 1, bvecs);
+
+ if (result)
+ goto err;
+
+ rbd_img_request_submit(img_request);
+
+err:
+ return result;
+}
+
+static int rbd_journal_replay(void *entry_handler, struct ceph_journaler_entry *entry)
+{
+ struct rbd_device *rbd_dev = entry_handler;
+ void **p = (void **)(&entry->data);
+ void *end = *p + entry->data_len;
+ uint32_t event_type;
+ u8 struct_v;
+ u32 struct_len;
+ int ret = 0;
+
+ ret = ceph_start_decoding(p, end, 1, "rbd_decode_entry",
+ &struct_v, &struct_len);
+ if (ret)
+ return -EINVAL;
+
+ event_type = ceph_decode_32(p);
+
+ switch (event_type) {
+ case EVENT_TYPE_AIO_WRITE:
+ rbd_journal_handle_aio_write(rbd_dev, p, end, struct_v);
+ break;
+ case EVENT_TYPE_AIO_DISCARD:
+ rbd_journal_handle_aio_discard(rbd_dev, p, end, struct_v);
+ break;
+ default:
+ pr_err("unknown event_type: %u", event_type);
+ }
+ return 0;
+}
+
+static int rbd_dev_open_journal(struct rbd_device *rbd_dev)
+{
+ int ret = 0;
+ struct ceph_journaler *journaler = NULL;
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+
+ rbd_dev->journal = kmalloc(sizeof(struct rbd_journal), GFP_KERNEL);
+ if (!rbd_dev->journal)
+ return -ENOMEM;
+
+ journaler = ceph_journaler_create(osdc, rbd_dev->spec->image_id, &rbd_dev->header_oloc);
+ if (!journaler) {
+ ret = -ENOMEM;
+ goto err_free_journal;
+ }
+
+ journaler->entry_handler = rbd_dev;
+ journaler->handle_entry = rbd_journal_replay;
+
+ ret = ceph_journaler_open(journaler);
+ if (ret)
+ goto err_destroy_journaler;
+
+
+ rbd_dev->journal->journaler = journaler;
+
+ start_replay(journaler);
+
+ return ret;
+
+err_destroy_journaler:
+ ceph_journaler_destroy(journaler);
+err_free_journal:
+ kfree(rbd_dev->journal);
+ rbd_dev->journal = NULL;
+ return ret;
+}
+
+static void rbd_dev_close_journal(struct rbd_device *rbd_dev)
+{
+ struct ceph_journaler *journaler = NULL;
+
+ if (!rbd_dev->journal)
+ return;
+
+ journaler = rbd_dev->journal->journaler;
+ ceph_journaler_close(journaler);
+ ceph_journaler_destroy(journaler);
+ kfree(rbd_dev->journal);
+ rbd_dev->journal = NULL;
+}
+
+#define LOCAL_MIRROR_UUID ""
+
+typedef struct rbd_journal_tag_predecessor {
+ bool commit_valid;
+ uint64_t tag_tid;
+ uint64_t entry_tid;
+ uint32_t uuid_len;
+ char *mirror_uuid;
+} rbd_journal_tag_predecessor;
+
+typedef struct rbd_journal_tag_data {
+ struct rbd_journal_tag_predecessor predecessor;
+ uint32_t uuid_len;
+ char *mirror_uuid;
+} rbd_journal_tag_data;
+
+static uint32_t tag_data_encoding_size(struct rbd_journal_tag_data *tag_data)
+{
+ // sizeof(uuid_len) 4 + uuid_len + 1 commit_valid + 8 tag_tid + 8 entry_tid + 4 sizeof(uuid_len) + uuid_len
+ return (4 + tag_data->uuid_len + 1 + 8 + 8 + 4 + tag_data->predecessor.uuid_len);
+}
+
+static void predecessor_encode(void **p, void *end, struct rbd_journal_tag_predecessor *predecessor)
+{
+ ceph_encode_string(p, end, predecessor->mirror_uuid, predecessor->uuid_len);
+ ceph_encode_8(p, predecessor->commit_valid);
+ ceph_encode_64(p, predecessor->tag_tid);
+ ceph_encode_64(p, predecessor->entry_tid);
+}
+
+static int rbd_journal_encode_tag_data(void **p, void *end, struct rbd_journal_tag_data *tag_data)
+{
+ struct rbd_journal_tag_predecessor *predecessor = &tag_data->predecessor;
+
+ ceph_encode_string(p, end, tag_data->mirror_uuid, tag_data->uuid_len);
+ predecessor_encode(p, end, predecessor);
+
+ return 0;
+}
+
+static int rbd_dev_allocate_journal_tag(struct rbd_device *rbd_dev)
+{
+ struct ceph_journaler_tag tag;
+ struct ceph_journaler *journaler = rbd_dev->journal->journaler;
+ struct rbd_journal_tag_predecessor *predecessor;
+ struct ceph_journaler_object_pos *position;
+ struct rbd_journal_tag_data tag_data;
+ void *buf = NULL, *p = NULL, *end = NULL;
+ uint32_t buf_len;
+ int ret = 0;
+
+ predecessor = &tag_data.predecessor;
+
+ position = list_first_entry(&journaler->client->object_positions, struct ceph_journaler_object_pos, node);
+
+ predecessor->commit_valid = true;
+ predecessor->tag_tid = position->tag_tid;
+ predecessor->entry_tid = position->entry_tid;
+ predecessor->uuid_len = 0;
+ predecessor->mirror_uuid = LOCAL_MIRROR_UUID;
+
+ tag_data.uuid_len = 0;
+ tag_data.mirror_uuid = LOCAL_MIRROR_UUID;
+
+ buf_len = tag_data_encoding_size(&tag_data);
+
+ p = kmalloc(buf_len, GFP_KERNEL);
+ if (!p) {
+ pr_err("failed to allocate tag data");
+ return -ENOMEM;
+ }
+
+ end = p + buf_len;
+ buf = p;
+ ret = rbd_journal_encode_tag_data(&p, end, &tag_data);
+ if (ret) {
+ pr_err("error in tag data");
+ return ret;
+ }
+
+ ret = ceph_journaler_allocate_tag(journaler, 0, buf, buf_len, &tag);
+
+ rbd_dev->journal->tag_tid = tag.tid;
+
+ return ret;
+}
+
static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
+ rbd_dev_close_journal(rbd_dev);
rbd_dev_unprobe(rbd_dev);
if (rbd_dev->opts)
rbd_unregister_watch(rbd_dev);
@@ -5558,6 +5906,117 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev)
rbd_dev->spec->image_id = NULL;
}
+struct AioWriteEvent {
+ enum rbd_journal_event_type type;
+ uint64_t offset;
+ uint64_t length;
+ char data[0];
+};
+
+static uint32_t EVENT_FIXED_SIZE = CEPH_ENCODING_START_BLK_LEN + 4; /// version encoding, type
+//static uint32_t METADATA_FIXED_SIZE = CEPH_ENCODING_START_BLK_LEN + 8; /// version encoding, timestamp
+
+static int copy_data_from_bio(char *data, struct bio *bio, u64 length)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ char *buf = NULL;
+ u64 offset = 0;
+
+next:
+ bio_for_each_segment(bv, bio, iter) {
+ buf = page_address(bv.bv_page) + bv.bv_offset;
+ memcpy(data + offset, buf, bv.bv_len);
+ offset += bv.bv_len;
+ }
+
+ if (bio->bi_next) {
+ bio = bio->bi_next;
+ goto next;
+ }
+
+ return 0;
+}
+
+static int rbd_journal_append_write_event(struct rbd_device *rbd_dev, struct bio *bio,
+ u64 offset, u64 length, struct ceph_journaler_future **journal_future)
+{
+ char *data = NULL;
+
+ void *p = NULL;
+ void *end = NULL;
+ void *buf = NULL;
+ uint32_t buf_len = 0;
+
+ data = kmalloc(length, GFP_KERNEL);
+ if (!data) {
+ pr_err("failed to alloc memeory for data");
+ return -ENOMEM;
+ }
+
+ copy_data_from_bio(data, bio, length);
+
+ buf_len = EVENT_FIXED_SIZE + 8 + 8 + 4 + length;
+
+ p = kmalloc(buf_len, GFP_KERNEL);
+ end = p + buf_len;
+ buf = p;
+
+ ceph_start_encoding(&p, 1, 1, buf_len - 6);
+
+ ceph_encode_32(&p, EVENT_TYPE_AIO_WRITE);
+
+ ceph_encode_64(&p, offset);
+ ceph_encode_64(&p, length);
+
+ ceph_encode_string(&p, end, data, length);
+
+ ceph_journaler_append(rbd_dev->journal->journaler, rbd_dev->journal->tag_tid, buf, buf_len, journal_future);
+ kfree(data);
+ kfree(buf);
+
+ return 0;
+}
+
+static int rbd_journal_append_discard_event(struct rbd_device *rbd_dev, struct bio *bio,
+ u64 offset, u64 length, struct ceph_journaler_future **journal_future)
+{
+ void *p = NULL;
+ void *buf = NULL;
+ uint32_t buf_len = 0;
+
+ buf_len = EVENT_FIXED_SIZE + 8 + 8;
+
+ p = kmalloc(buf_len, GFP_KERNEL);
+ buf = p;
+
+ ceph_start_encoding(&p, 1, 1, buf_len - 6);
+
+ ceph_encode_32(&p, EVENT_TYPE_AIO_DISCARD);
+
+ ceph_encode_64(&p, offset);
+ ceph_encode_64(&p, length);
+
+ ceph_journaler_append(rbd_dev->journal->journaler, rbd_dev->journal->tag_tid, buf, buf_len, journal_future);
+ kfree(buf);
+
+ return 0;
+}
+
+static int rbd_journal_append(struct rbd_device *rbd_dev, struct bio *bio,
+ u64 offset, u64 length, enum obj_operation_type op_type,
+ struct ceph_journaler_future **journal_future)
+{
+ switch (op_type) {
+ case OBJ_OP_WRITE:
+ return rbd_journal_append_write_event(rbd_dev, bio, offset, length, journal_future);
+ case OBJ_OP_DISCARD:
+ return rbd_journal_append_discard_event(rbd_dev, bio, offset, length, journal_future);
+ default:
+ return 0;
+ }
+}
+
/*
* Probe for the existence of the header object for the given rbd
* device. If this image is the one being mapped (i.e., not a
@@ -5634,6 +6093,13 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
if (ret)
goto err_out_probe;
+ if (rbd_dev->header.features & RBD_FEATURE_JOURNALING) {
+ ret = rbd_dev_open_journal(rbd_dev);
+ if (ret)
+ goto err_out_probe;
+ rbd_dev_allocate_journal_tag(rbd_dev);
+ }
+
dout("discovered format %u image, header name is %s\n",
rbd_dev->image_format, rbd_dev->header_oid.name);
return 0;
Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn> --- drivers/block/rbd.c | 478 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 472 insertions(+), 6 deletions(-)