@@ -5,13 +5,42 @@
#include <linux/module.h>
+#include "mpool_printk.h"
+
+#include "pd.h"
+
+/*
+ * Module params...
+ */
+unsigned int rsvd_bios_max __read_mostly = 16;
+module_param(rsvd_bios_max, uint, 0444);
+MODULE_PARM_DESC(rsvd_bios_max, "max reserved bios in mpool bioset");
+
+unsigned int chunk_size_kb __read_mostly = 128;
+module_param(chunk_size_kb, uint, 0644);
+MODULE_PARM_DESC(chunk_size_kb, "Chunk size (in KiB) for device I/O");
+
+static void mpool_exit_impl(void)
+{
+ pd_exit();
+}
+
static __init int mpool_init(void)
{
- return 0;
+ int rc;
+
+ rc = pd_init();
+ if (rc) {
+ mp_pr_err("pd init failed", rc);
+ mpool_exit_impl();
+ }
+
+ return rc;
}
static __exit void mpool_exit(void)
{
+ mpool_exit_impl();
}
module_init(mpool_init);
new file mode 100644
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2015-2020 Micron Technology, Inc. All rights reserved.
+ */
+
+#ifndef MPOOL_INIT_H
+#define MPOOL_INIT_H
+
+extern unsigned int rsvd_bios_max;
+extern unsigned int chunk_size_kb;
+
+#endif /* MPOOL_INIT_H */
new file mode 100644
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015-2020 Micron Technology, Inc. All rights reserved.
+ */
+/*
+ * Pool drive module with backing block devices.
+ *
+ * Defines functions for probing, reading, and writing drives in an mpool.
+ * IO is done using kernel BIO facilities.
+ */
+
+#define _LARGEFILE64_SOURCE
+
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/blk_types.h>
+
+#include "mpool_printk.h"
+#include "assert.h"
+
+#include "init.h"
+#include "omf_if.h"
+#include "pd.h"
+
+#ifndef SECTOR_SHIFT
+#define SECTOR_SHIFT 9
+#endif
+
+static struct bio_set mpool_bioset;
+
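+/* Exclusive-open mode and holder for blkdev_get_by_path()/blkdev_put(). */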
+static const fmode_t pd_bio_fmode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
+static char *pd_bio_holder = "mpool";
+
+int pd_dev_open(const char *path, struct pd_dev_parm *dparm, struct pd_prop *pd_prop)
+{
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_path(path, pd_bio_fmode, pd_bio_holder);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ dparm->dpr_dev_private = bdev;
+ dparm->dpr_prop = *pd_prop;
+
+ if ((pd_prop->pdp_devtype != PD_DEV_TYPE_BLOCK_STD) &&
+ (pd_prop->pdp_devtype != PD_DEV_TYPE_BLOCK_NVDIMM)) {
+ int rc = -EINVAL;
+
+ mp_pr_err("unsupported PD type %d", rc, pd_prop->pdp_devtype);
+ return rc;
+ }
+
+ return 0;
+}
+
+int pd_dev_close(struct pd_dev_parm *dparm)
+{
+ struct block_device *bdev = dparm->dpr_dev_private;
+
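+ /* Flush dirty pages and drop cached pages before releasing our claim. */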
+ if (bdev) {
+ dparm->dpr_dev_private = NULL;
+ sync_blockdev(bdev);
+ invalidate_bdev(bdev);
+ blkdev_put(bdev, pd_bio_fmode);
+ }
+
+ return bdev ? 0 : -EINVAL;
+}
+
+int pd_dev_flush(struct pd_dev_parm *dparm)
+{
+ struct block_device *bdev;
+ int rc;
+
+ bdev = dparm->dpr_dev_private;
+ if (!bdev) {
+ rc = -EINVAL;
+ mp_pr_err("bdev %s not registered", rc, dparm->dpr_name);
+ return rc;
+ }
+
+ rc = blkdev_issue_flush(bdev, GFP_NOIO);
+ if (rc)
+ mp_pr_err("bdev %s, flush failed", rc, dparm->dpr_name);
+
+ return rc;
+}
+
+/**
+ * pd_bio_discard() - issue discard command to erase a byte-aligned region
+ * @dparm: drive parameters
+ * @off:   byte offset of the region on the device
+ * @len:   length of the region in bytes
+ */
+static int pd_bio_discard(struct pd_dev_parm *dparm, u64 off, size_t len)
+{
+ struct block_device *bdev;
+ int rc;
+
+ bdev = dparm->dpr_dev_private;
+ if (!bdev) {
+ rc = -EINVAL;
+ mp_pr_err("bdev %s not registered", rc, dparm->dpr_name);
+ return rc;
+ }
+
+ /* Validate I/O offset is sector-aligned */
+ if (off & PD_SECTORMASK(&dparm->dpr_prop)) {
+ rc = -EINVAL;
+ mp_pr_err("bdev %s, offset 0x%lx not multiple of sec size %u",
+ rc, dparm->dpr_name, (ulong)off, (1 << PD_SECTORSZ(&dparm->dpr_prop)));
+ return rc;
+ }
+
+ if (off > PD_LEN(&dparm->dpr_prop)) {
+ rc = -EINVAL;
+ mp_pr_err("bdev %s, offset 0x%lx past end 0x%lx",
+ rc, dparm->dpr_name, (ulong)off, (ulong)PD_LEN(&dparm->dpr_prop));
+ return rc;
+ }
+
+ rc = blkdev_issue_discard(bdev, off >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_NOIO, 0);
+ if (rc)
+ mp_pr_err("bdev %s, offset 0x%lx len 0x%lx, discard faiure",
+ rc, dparm->dpr_name, (ulong)off, (ulong)len);
+
+ return rc;
+}
+
+/**
+ * pd_zone_erase() - issue write-zeros or discard commands to erase PD
+ * @dparm:        drive parameters
+ * @zaddr:        starting zone address
+ * @zonecnt:      number of zones to erase, 0 meaning through the last zone
+ * @reads_erased: whether the caller will read the erased blocks
+ */
+int pd_zone_erase(struct pd_dev_parm *dparm, u64 zaddr, u32 zonecnt, bool reads_erased)
+{
+ int rc = 0;
+ u64 cmdopt;
+
+ /* Validate args against zone param */
+ if (zaddr >= dparm->dpr_zonetot)
+ return -EINVAL;
+
+ if (zonecnt == 0)
+ zonecnt = dparm->dpr_zonetot - zaddr;
+
+ if (zonecnt > (dparm->dpr_zonetot - zaddr))
+ return -EINVAL;
+
+ if (zonecnt == 0)
+ return 0;
+
+ /*
+ * When both DIF and SED are enabled, a read from a discarded block
+ * would fail, so we can't discard blocks if both DIF and SED are
+ * enabled AND we need to read blocks after erase.
+ */
+ cmdopt = dparm->dpr_cmdopt;
+ if ((cmdopt & PD_CMD_DISCARD) &&
+ !(reads_erased && (cmdopt & PD_CMD_DIF_ENABLED) && (cmdopt & PD_CMD_SED_ENABLED))) {
+ size_t zlen;
+
+ zlen = dparm->dpr_zonepg << PAGE_SHIFT;
+ rc = pd_bio_discard(dparm, zaddr * zlen, zonecnt * zlen);
+ }
+
+ return rc;
+}
+
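+/* Prepare a freshly allocated bio: set op/flags, starting sector, and device. */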
+static void pd_bio_init(struct bio *bio, struct block_device *bdev, int rw, loff_t off, int flags)
+{
+ bio_set_op_attrs(bio, rw, flags);
+ bio->bi_iter.bi_sector = off >> SECTOR_SHIFT;
+ bio_set_dev(bio, bdev);
+}
+
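+/*
+ * Allocate a bio from the mpool bioset. If @target is non-NULL, chain it
+ * to the new bio and submit @target asynchronously. If allocation fails,
+ * wait for @target to complete and release it, then return NULL so the
+ * caller sees -ENOMEM.
+ */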
+static struct bio *pd_bio_chain(struct bio *target, unsigned int nr_pages, gfp_t gfp)
+{
+ struct bio *new;
+
+ new = bio_alloc_bioset(gfp, nr_pages, &mpool_bioset);
+
+ if (!target)
+ return new;
+
+ if (new) {
+ bio_chain(target, new);
+ submit_bio(target);
+ } else {
+ submit_bio_wait(target);
+ bio_put(target);
+ }
+
+ return new;
+}
+
+/**
+ * pd_bio_rw() - read from or write to a drive via the BIO interface
+ * @dparm:   drive parameters
+ * @iov:     list of kvecs to transfer
+ * @iovcnt:  number of kvecs
+ * @off:     offset in bytes on disk
+ * @rw:      REQ_OP_READ or REQ_OP_WRITE
+ * @opflags: request flags (e.g., REQ_FUA)
+ *
+ * pd_bio_rw() expects a list of kvecs wherein each base ptr is sector
+ * aligned and each length is a multiple of the sector size.
+ *
+ * If the I/O is bigger than 1 MiB (BIO_MAX_PAGES pages) or chunk_size_kb,
+ * it is split into several BIOs.
+ */
+static int pd_bio_rw(struct pd_dev_parm *dparm, const struct kvec *iov,
+ int iovcnt, loff_t off, int rw, int opflags)
+{
+ struct block_device *bdev;
+ struct page *page;
+ struct bio *bio;
+ u64 iov_base, sector_mask;
+ u32 tot_pages, tot_len, len, iov_len, left;
+ u32 iolimit;
+ int i, cc;
+ int rc = 0;
+
+ if (iovcnt < 1)
+ return 0;
+
+ bdev = dparm->dpr_dev_private;
+ if (!bdev) {
+ rc = -EINVAL;
+ mp_pr_err("bdev %s not registered", rc, dparm->dpr_name);
+ return rc;
+ }
+
+ sector_mask = PD_SECTORMASK(&dparm->dpr_prop);
+ if (off & sector_mask) {
+ rc = -EINVAL;
+ mp_pr_err("bdev %s, %s offset 0x%lx not multiple of sector size %u",
+ rc, dparm->dpr_name, (rw == REQ_OP_READ) ? "read" : "write",
+ (ulong)off, (1 << PD_SECTORSZ(&dparm->dpr_prop)));
+ return rc;
+ }
+
+ if (off > PD_LEN(&dparm->dpr_prop)) {
+ rc = -EINVAL;
+ mp_pr_err("bdev %s, %s offset 0x%lx past device end 0x%lx",
+ rc, dparm->dpr_name, (rw == REQ_OP_READ) ? "read" : "write",
+ (ulong)off, (ulong)PD_LEN(&dparm->dpr_prop));
+ return rc;
+ }
+
+ tot_pages = 0;
+ tot_len = 0;
+ for (i = 0; i < iovcnt; i++) {
+ if (!PAGE_ALIGNED((uintptr_t)iov[i].iov_base) || (iov[i].iov_len & sector_mask)) {
+ rc = -EINVAL;
+ mp_pr_err("bdev %s, %s off 0x%lx, misaligned kvec, base 0x%lx, len 0x%lx",
+ rc, dparm->dpr_name, (rw == REQ_OP_READ) ? "read" : "write",
+ (ulong)off, (ulong)iov[i].iov_base, (ulong)iov[i].iov_len);
+ return rc;
+ }
+
+ iov_len = iov[i].iov_len;
+ tot_len += iov_len;
+ while (iov_len > 0) {
+ len = min_t(size_t, PAGE_SIZE, iov_len);
+ iov_len -= len;
+ tot_pages++;
+ }
+ }
+
+ if (off + tot_len > PD_LEN(&dparm->dpr_prop)) {
+ rc = -EINVAL;
+ mp_pr_err("bdev %s, %s I/O end past device end 0x%lx, 0x%lx:0x%x",
+ rc, dparm->dpr_name, (rw == REQ_OP_READ) ? "read" : "write",
+ (ulong)PD_LEN(&dparm->dpr_prop), (ulong)off, tot_len);
+ return rc;
+ }
+
+ if (tot_len == 0)
+ return 0;
+
+ /*
+ * The I/O size for each bio is bounded by the chunk size, e.g.
+ * chunk_size_kb = 128 allows 32 pages per bio with 4 KiB pages.
+ */
+ iolimit = chunk_size_kb >> (PAGE_SHIFT - 10);
+ iolimit = clamp_t(u32, iolimit, 32, BIO_MAX_PAGES);
+
+ left = 0;
+ bio = NULL;
+
+ for (i = 0; i < iovcnt; i++) {
+ iov_base = (u64)iov[i].iov_base;
+ iov_len = iov[i].iov_len;
+
+ while (iov_len > 0) {
+ if (left == 0) {
+ left = min_t(size_t, tot_pages, iolimit);
+
+ bio = pd_bio_chain(bio, left, GFP_NOIO);
+ if (!bio)
+ return -ENOMEM;
+
+ pd_bio_init(bio, bdev, rw, off, opflags);
+ }
+
+ len = min_t(size_t, PAGE_SIZE, iov_len);
+ page = virt_to_page(iov_base);
+ cc = -1;
+
+ if (page)
+ cc = bio_add_page(bio, page, len, 0);
+
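+ /*
+ * bio_add_page() returns the number of bytes added. Zero with
+ * pages already in the bio means the bio is full: force a new
+ * bio via pd_bio_chain() and retry this page.
+ */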
+ if (cc != len) {
+ if (cc == 0 && bio->bi_vcnt > 0) {
+ left = 0;
+ continue;
+ }
+
+ bio_io_error(bio);
+ bio_put(bio);
+ return -ENOTRECOVERABLE;
+ }
+
+ iov_len -= len;
+ iov_base += len;
+ off += len;
+ left--;
+ tot_pages--;
+ }
+ }
+
+ ASSERT(bio);
+ ASSERT(tot_pages == 0);
+
+ rc = submit_bio_wait(bio);
+ bio_put(bio);
+
+ return rc;
+}
+
+int pd_zone_pwritev(struct pd_dev_parm *dparm, const struct kvec *iov,
+ int iovcnt, u64 zaddr, loff_t boff, int opflags)
+{
+ loff_t woff;
+
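+ /* A zone is dpr_zonepg pages; convert (zaddr, boff) to a device byte offset. */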
+ woff = ((u64)dparm->dpr_zonepg << PAGE_SHIFT) * zaddr + boff;
+
+ return pd_bio_rw(dparm, iov, iovcnt, woff, REQ_OP_WRITE, opflags);
+}
+
+int pd_zone_pwritev_sync(struct pd_dev_parm *dparm, const struct kvec *iov,
+ int iovcnt, u64 zaddr, loff_t boff)
+{
+ struct block_device *bdev;
+ int rc;
+
+ rc = pd_zone_pwritev(dparm, iov, iovcnt, zaddr, boff, REQ_FUA);
+ if (rc)
+ return rc;
+
+ /*
+ * Syncing and invalidating the bdev ensures that data written from
+ * the kernel is immediately visible to user space.
+ */
+ bdev = dparm->dpr_dev_private;
+ if (bdev) {
+ sync_blockdev(bdev);
+ invalidate_bdev(bdev);
+ }
+
+ return 0;
+}
+
+int pd_zone_preadv(struct pd_dev_parm *dparm, const struct kvec *iov,
+ int iovcnt, u64 zaddr, loff_t boff)
+{
+ loff_t roff;
+
+ roff = ((u64)dparm->dpr_zonepg << PAGE_SHIFT) * zaddr + boff;
+
+ return pd_bio_rw(dparm, iov, iovcnt, roff, REQ_OP_READ, 0);
+}
+
+void pd_dev_set_unavail(struct pd_dev_parm *dparm, struct omf_devparm_descriptor *omf_devparm)
+{
+ struct pd_prop *pd_prop = &(dparm->dpr_prop);
+
+ /*
+ * Fill in dparm for an unavailable drive: set the zone parameters
+ * and the other PD properties we keep in metadata. No ops vector is
+ * set because the device must be available to determine it
+ * (discovery provides it).
+ */
+ strncpy(dparm->dpr_prop.pdp_didstr, PD_DEV_ID_PDUNAVAILABLE, PD_DEV_ID_LEN);
+ pd_prop->pdp_devstate = PD_DEV_STATE_UNAVAIL;
+ pd_prop->pdp_cmdopt = PD_CMD_NONE;
+
+ pd_prop->pdp_zparam.dvb_zonepg = omf_devparm->odp_zonepg;
+ pd_prop->pdp_zparam.dvb_zonetot = omf_devparm->odp_zonetot;
+ pd_prop->pdp_mclassp = omf_devparm->odp_mclassp;
+ pd_prop->pdp_phys_if = 0;
+ pd_prop->pdp_sectorsz = omf_devparm->odp_sectorsz;
+ pd_prop->pdp_devsz = omf_devparm->odp_devsz;
+}
+
+int pd_init(void)
+{
+ int rc;
+
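+ /* Clamp module parameters to their supported ranges. */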
+ chunk_size_kb = clamp_t(uint, chunk_size_kb, 128, 1024);
+
+ rsvd_bios_max = clamp_t(uint, rsvd_bios_max, 1, 1024);
+
+ rc = bioset_init(&mpool_bioset, rsvd_bios_max, 0, BIOSET_NEED_BVECS);
+ if (rc)
+ mp_pr_err("mpool bioset init failed", rc);
+
+ return rc;
+}
+
+void pd_exit(void)
+{
+ bioset_exit(&mpool_bioset);
+}