From patchwork Mon Aug 14 22:04:56 2017
X-Patchwork-Submitter: Shaohua Li
X-Patchwork-Id: 9900251
From: Shaohua Li
To: linux-block@vger.kernel.org, linux-raid@vger.kernel.org
Cc: kernel-team@fb.com, Kyungchan Koh, Shaohua Li
Subject: [PATCH V2 5/9] nullb: support memory backed store
Date: Mon, 14 Aug 2017 15:04:56 -0700

This adds a memory backed store to nullb. Users enable it by setting the
'memory_backed' configfs attribute; by default a nullb disk does not use
the memory backed store.
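For illustration only, not part of the patch: a minimal user-space sketch of
how the attribute is meant to be driven through configfs. It assumes configfs
is mounted at /sys/kernel/config, that the nullb configfs interface from the
earlier patches in this series is loaded, and that the directory name
"nullb0" is an arbitrary user choice.

/* Hypothetical usage sketch; mount point and device name are assumptions. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_attr(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) != (ssize_t)strlen(val)) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* Creating the directory is what allocates the nullb_device. */
	if (mkdir("/sys/kernel/config/nullb/nullb0", 0755) && errno != EEXIST) {
		perror("mkdir");
		return 1;
	}
	/* Enable the memory backed store before powering the disk on. */
	if (write_attr("/sys/kernel/config/nullb/nullb0/memory_backed", "1"))
		perror("memory_backed");
	/* Writing 1 to 'power' instantiates the actual block device. */
	if (write_attr("/sys/kernel/config/nullb/nullb0/power", "1"))
		perror("power");
	return 0;
}

Both attribute names appear in nullb_device_attrs[] in the diff below.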
Based on original patch from Kyungchan Koh

Signed-off-by: Kyungchan Koh
Signed-off-by: Shaohua Li
---
 drivers/block/null_blk.c | 339 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 330 insertions(+), 9 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 2f66627..45e0b56 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -15,6 +15,14 @@
 #include
 #include

+#define SECTOR_SHIFT		9
+#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_SIZE		(1 << SECTOR_SHIFT)
+#define SECTOR_MASK		(PAGE_SECTORS - 1)
+
+#define FREE_BATCH		16
+
 struct nullb_cmd {
 	struct list_head list;
 	struct llist_node ll_list;
@@ -24,6 +32,7 @@ struct nullb_cmd {
 	unsigned int tag;
 	struct nullb_queue *nq;
 	struct hrtimer timer;
+	blk_status_t error;
 };

 struct nullb_queue {
@@ -46,9 +55,23 @@ enum nullb_device_flags {
 	NULLB_DEV_FL_UP = 1,
 };

+/*
+ * nullb_page is a page in memory for nullb devices.
+ *
+ * @page:	The page holding the data.
+ * @bitmap:	The bitmap represents which sector in the page has data.
+ *		Each bit represents one block size. For example, sector 8
+ *		will use the 7th bit
+ */
+struct nullb_page {
+	struct page *page;
+	unsigned long bitmap;
+};
+
 struct nullb_device {
 	struct nullb *nullb;
 	struct config_item item;
+	struct radix_tree_root data; /* data stored in the disk */
 	unsigned long flags; /* device flags */
 	unsigned long size; /* device size in MB */
@@ -64,6 +87,7 @@ struct nullb_device {
 	bool blocking; /* blocking blk-mq device */
 	bool use_per_node_hctx; /* use per-node allocation for hardware context */
 	bool power; /* power on/off the device */
+	bool memory_backed; /* if data is stored in memory */
 };

 struct nullb {
@@ -197,6 +221,7 @@ static struct nullb_device *null_alloc_dev(void);
 static void null_free_dev(struct nullb_device *dev);
 static void null_del_dev(struct nullb *nullb);
 static int null_add_dev(struct nullb_device *dev);
+static void null_free_device_storage(struct nullb_device *dev);

 static inline struct nullb_device *to_nullb_device(struct config_item *item)
 {
@@ -292,6 +317,7 @@ NULLB_DEVICE_ATTR(index, uint);
 NULLB_DEVICE_ATTR(use_lightnvm, bool);
 NULLB_DEVICE_ATTR(blocking, bool);
 NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
+NULLB_DEVICE_ATTR(memory_backed, bool);

 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
@@ -346,12 +372,16 @@ static struct configfs_attribute *nullb_device_attrs[] = {
 	&nullb_device_attr_blocking,
 	&nullb_device_attr_use_per_node_hctx,
 	&nullb_device_attr_power,
+	&nullb_device_attr_memory_backed,
 	NULL,
 };

 static void nullb_device_release(struct config_item *item)
 {
-	null_free_dev(to_nullb_device(item));
+	struct nullb_device *dev = to_nullb_device(item);
+
+	null_free_device_storage(dev);
+	null_free_dev(dev);
 }

 static struct configfs_item_operations nullb_device_ops = {
@@ -395,7 +425,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)

 static ssize_t memb_group_features_show(struct config_item *item, char *page)
 {
-	return snprintf(page, PAGE_SIZE, "\n");
+	return snprintf(page, PAGE_SIZE, "memory_backed\n");
 }

 CONFIGFS_ATTR_RO(memb_group_, features);
@@ -432,6 +462,7 @@ static struct nullb_device *null_alloc_dev(void)
 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev)
 		return NULL;
+	INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
 	dev->size = g_gb * 1024;
 	dev->completion_nsec = g_completion_nsec;
 	dev->submit_queues = g_submit_queues;
@@ -532,13 +563,14 @@ static void end_cmd(struct nullb_cmd *cmd)
 	switch (queue_mode) {
 	case NULL_Q_MQ:
-		blk_mq_end_request(cmd->rq, BLK_STS_OK);
+		blk_mq_end_request(cmd->rq, cmd->error);
 		return;
 	case NULL_Q_RQ:
 		INIT_LIST_HEAD(&cmd->rq->queuelist);
-		blk_end_request_all(cmd->rq, BLK_STS_OK);
+		blk_end_request_all(cmd->rq, cmd->error);
 		break;
 	case NULL_Q_BIO:
+		cmd->bio->bi_status = cmd->error;
 		bio_endio(cmd->bio);
 		break;
 	}
@@ -579,12 +611,297 @@ static void null_softirq_done_fn(struct request *rq)
 	end_cmd(rq->special);
 }

-static inline void null_handle_cmd(struct nullb_cmd *cmd)
+static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
+{
+	struct nullb_page *t_page;
+
+	t_page = kmalloc(sizeof(struct nullb_page), gfp_flags);
+	if (!t_page)
+		goto out;
+
+	t_page->page = alloc_pages(gfp_flags, 0);
+	if (!t_page->page)
+		goto out_freepage;
+
+	t_page->bitmap = 0;
+	return t_page;
+out_freepage:
+	kfree(t_page);
+out:
+	return NULL;
+}
+
+static void null_free_page(struct nullb_page *t_page)
+{
+	__free_page(t_page->page);
+	kfree(t_page);
+}
+
+static void null_free_sector(struct nullb *nullb, sector_t sector)
+{
+	unsigned int sector_bit;
+	u64 idx;
+	struct nullb_page *t_page, *ret;
+	struct radix_tree_root *root;
+
+	root = &nullb->dev->data;
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	sector_bit = (sector & SECTOR_MASK);
+
+	t_page = radix_tree_lookup(root, idx);
+	if (t_page) {
+		__clear_bit(sector_bit, &t_page->bitmap);
+
+		if (!t_page->bitmap) {
+			ret = radix_tree_delete_item(root, idx, t_page);
+			WARN_ON(ret != t_page);
+			null_free_page(ret);
+		}
+	}
+}
+
+static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
+	struct nullb_page *t_page)
 {
+	struct radix_tree_root *root;
+
+	root = &nullb->dev->data;
+
+	if (radix_tree_insert(root, idx, t_page)) {
+		null_free_page(t_page);
+		t_page = radix_tree_lookup(root, idx);
+		WARN_ON(!t_page || t_page->page->index != idx);
+	}
+
+	return t_page;
+}
+
+static void null_free_device_storage(struct nullb_device *dev)
+{
+	unsigned long pos = 0;
+	int nr_pages;
+	struct nullb_page *ret, *t_pages[FREE_BATCH];
+	struct radix_tree_root *root;
+
+	root = &dev->data;
+
+	do {
+		int i;
+
+		nr_pages = radix_tree_gang_lookup(root,
+				(void **)t_pages, pos, FREE_BATCH);
+
+		for (i = 0; i < nr_pages; i++) {
+			pos = t_pages[i]->page->index;
+			ret = radix_tree_delete_item(root, pos, t_pages[i]);
+			WARN_ON(ret != t_pages[i]);
+			null_free_page(ret);
+		}
+
+		pos++;
+	} while (nr_pages == FREE_BATCH);
+}
+
+static struct nullb_page *null_lookup_page(struct nullb *nullb,
+	sector_t sector, bool for_write)
+{
+	unsigned int sector_bit;
+	u64 idx;
+	struct nullb_page *t_page;
+
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	sector_bit = (sector & SECTOR_MASK);
+
+	t_page = radix_tree_lookup(&nullb->dev->data, idx);
+	WARN_ON(t_page && t_page->page->index != idx);
+
+	if (t_page && (for_write || test_bit(sector_bit, &t_page->bitmap)))
+		return t_page;
+
+	return NULL;
+}
+
+static struct nullb_page *null_insert_page(struct nullb *nullb,
+	sector_t sector)
+{
+	u64 idx;
+	struct nullb_page *t_page;
+
+	t_page = null_lookup_page(nullb, sector, true);
+	if (t_page)
+		return t_page;
+
+	spin_unlock_irq(&nullb->lock);
+
+	t_page = null_alloc_page(GFP_NOIO);
+	if (!t_page)
+		goto out_lock;
+
+	if (radix_tree_preload(GFP_NOIO))
+		goto out_freepage;
+
+	spin_lock_irq(&nullb->lock);
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	t_page->page->index = idx;
+	t_page = null_radix_tree_insert(nullb, idx, t_page);
+	radix_tree_preload_end();
+
+	return t_page;
+out_freepage:
+	null_free_page(t_page);
+out_lock:
+	spin_lock_irq(&nullb->lock);
+	return null_lookup_page(nullb, sector, true);
+}
+
+static int copy_to_nullb(struct nullb *nullb, struct page *source,
+	unsigned int off, sector_t sector, size_t n)
+{
+	size_t temp, count = 0;
+	unsigned int offset;
+	struct nullb_page *t_page;
+	void *dst, *src;
+
+	while (count < n) {
+		temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+		t_page = null_insert_page(nullb, sector);
+		if (!t_page)
+			return -ENOSPC;
+
+		src = kmap_atomic(source);
+		dst = kmap_atomic(t_page->page);
+		memcpy(dst + offset, src + off + count, temp);
+		kunmap_atomic(dst);
+		kunmap_atomic(src);
+
+		__set_bit(sector & SECTOR_MASK, &t_page->bitmap);
+
+		count += temp;
+		sector += temp >> SECTOR_SHIFT;
+	}
+	return 0;
+}
+
+static int copy_from_nullb(struct nullb *nullb, struct page *dest,
+	unsigned int off, sector_t sector, size_t n)
+{
+	size_t temp, count = 0;
+	unsigned int offset;
+	struct nullb_page *t_page;
+	void *dst, *src;
+
+	while (count < n) {
+		temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+		t_page = null_lookup_page(nullb, sector, false);
+
+		dst = kmap_atomic(dest);
+		if (!t_page) {
+			memset(dst + off + count, 0, temp);
+			goto next;
+		}
+		src = kmap_atomic(t_page->page);
+		memcpy(dst + off + count, src + offset, temp);
+		kunmap_atomic(src);
+next:
+		kunmap_atomic(dst);
+
+		count += temp;
+		sector += temp >> SECTOR_SHIFT;
+	}
+	return 0;
+}
+
+static int null_transfer(struct nullb *nullb, struct page *page,
+	unsigned int len, unsigned int off, bool is_write, sector_t sector)
+{
+	int err = 0;
+
+	if (!is_write) {
+		err = copy_from_nullb(nullb, page, off, sector, len);
+		flush_dcache_page(page);
+	} else {
+		flush_dcache_page(page);
+		err = copy_to_nullb(nullb, page, off, sector, len);
+	}
+
+	return err;
+}
+
+static int null_handle_rq(struct nullb_cmd *cmd)
+{
+	struct request *rq = cmd->rq;
+	struct nullb *nullb = cmd->nq->dev->nullb;
+	int err;
+	unsigned int len;
+	sector_t sector;
+	struct req_iterator iter;
+	struct bio_vec bvec;
+
+	sector = blk_rq_pos(rq);
+
+	spin_lock_irq(&nullb->lock);
+	rq_for_each_segment(bvec, rq, iter) {
+		len = bvec.bv_len;
+		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+				     op_is_write(req_op(rq)), sector);
+		if (err) {
+			spin_unlock_irq(&nullb->lock);
+			return err;
+		}
+		sector += len >> SECTOR_SHIFT;
+	}
+	spin_unlock_irq(&nullb->lock);
+
+	return 0;
+}
+
+static int null_handle_bio(struct nullb_cmd *cmd)
+{
+	struct bio *bio = cmd->bio;
+	struct nullb *nullb = cmd->nq->dev->nullb;
+	int err;
+	unsigned int len;
+	sector_t sector;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	sector = bio->bi_iter.bi_sector;
+
+	spin_lock_irq(&nullb->lock);
+	bio_for_each_segment(bvec, bio, iter) {
+		len = bvec.bv_len;
+		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+				     op_is_write(bio_op(bio)), sector);
+		if (err) {
+			spin_unlock_irq(&nullb->lock);
+			return err;
+		}
+		sector += len >> SECTOR_SHIFT;
+	}
+	spin_unlock_irq(&nullb->lock);
+	return 0;
+}
+
+static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	int err = 0;
+
+	if (dev->memory_backed) {
+		if (dev->queue_mode == NULL_Q_BIO)
+			err = null_handle_bio(cmd);
+		else
+			err = null_handle_rq(cmd);
+	}
+	cmd->error = errno_to_blk_status(err);
+
 	/* Complete IO by inline, softirq or timer */
-	switch (cmd->nq->dev->irqmode) {
+	switch (dev->irqmode) {
 	case NULL_IRQ_SOFTIRQ:
-		switch (cmd->nq->dev->queue_mode) {
+		switch (dev->queue_mode) {
 		case NULL_Q_MQ:
 			blk_mq_complete_request(cmd->rq);
 			break;
@@ -606,6 +923,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
 		null_cmd_end_timer(cmd);
 		break;
 	}
+	return BLK_STS_OK;
 }

 static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
@@ -678,8 +996,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,

 	blk_mq_start_request(bd->rq);

-	null_handle_cmd(cmd);
-	return BLK_STS_OK;
+	return null_handle_cmd(cmd);
 }

 static const struct blk_mq_ops null_mq_ops = {
@@ -1050,6 +1367,10 @@ static void null_validate_conf(struct nullb_device *dev)
 		dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);

 	dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
+
+	/* Do memory allocation, so set blocking */
+	if (dev->memory_backed)
+		dev->blocking = true;
 }

 static int null_add_dev(struct nullb_device *dev)
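For illustration only, not part of the patch: a stand-alone sketch of the
sector bookkeeping used by null_lookup_page()/null_insert_page() above. Each
backing page is indexed in the radix tree by sector >> PAGE_SECTORS_SHIFT,
and the sector's slot in that page's bitmap is sector & SECTOR_MASK. The
PAGE_SHIFT value of 12 (4K pages) is an assumption for the example output.

/* Hypothetical user-space sketch; PAGE_SHIFT is assumed to be 12 here. */
#include <stdio.h>

#define PAGE_SHIFT		12
#define SECTOR_SHIFT		9
#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
#define SECTOR_MASK		(PAGE_SECTORS - 1)

int main(void)
{
	unsigned long long sector;

	/* Mirror the math in null_lookup_page()/null_free_sector(). */
	for (sector = 0; sector <= PAGE_SECTORS; sector++) {
		unsigned long long idx = sector >> PAGE_SECTORS_SHIFT;
		unsigned int sector_bit = sector & SECTOR_MASK;

		printf("sector %llu -> radix index %llu, bitmap bit %u\n",
		       sector, idx, sector_bit);
	}
	return 0;
}

With these numbers one nullb_page covers eight 512-byte sectors, which is why
null_free_sector() can drop the whole page once its bitmap goes to zero.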