@@ -88,6 +88,7 @@ xfs-y += xfs_aops.o \
xfs_super.o \
xfs_symlink.o \
xfs_sysfs.o \
+ xfs_thin.o \
xfs_trans.o \
xfs_xattr.o \
kmem.o \
@@ -147,6 +147,11 @@ typedef struct xfs_mount {
* to various other kinds of pain inflicted on the pNFS server.
*/
__uint32_t m_generation;
+
+ bool m_thin_reserve;
+ struct mutex m_thin_res_lock;
+ uint32_t m_thin_sectpb;
+ sector_t m_thin_res;
} xfs_mount_t;
/*
new file mode 100644
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2016 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_fsops.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_sysfs.h"
+/* XXX: above copied from xfs_mount.c */
+#include "xfs_thin.h"
+
+/*
+ * Notes/Issues:
+ *
+ * - Reservation support depends on the '-o discard' mount option so freed
+ * extents are returned to the pool.
+ * - The absolute reservation value API is potentially racy. We can cover our
+ * own reservations/provisions with a mutex, but a delta reservation API might
+ * be better.
+ * - Local reservation accounting is not necessarily correct/accurate.
+ * Reservation leakage has been reproduced, particularly in ENOSPC conditions.
+ * The discard mechanism to return blocks to dm-thin has not been totally
+ * reliable either, which means filling, removing and filling an fs causes
+ * some space to be lost. This can be worked around with fstrim for the time
+ * being.
+ * - The locking in xfs_mod_fdblocks() is not quite correct/safe. "Sleeping in
+ * invalid context" BUG()s are expected. Needs to be reworked.
+ * - Worst case reservation means each XFS filesystem block is considered a new
+ * dm block allocation. This translates to a significant amount of space given
+ * larger dm block sizes. For example, 4k XFS blocks to 64k dm blocks means
+ * we'll hit ENOSPC sooner and more frequently than typically expected.
+ * - The above also means large fallocate requests are problematic. Need to find
+ * a workaround for this. Perhaps a reduced reservation is safe for known
+ * contiguous extents? E.g., xfs_bmapi_write() w/ nimaps = 1;
+ * - The xfs_mod_fdblocks() implementation means the XFS reserve pool blocks are
+ * also reserved from the thin pool. XFS defaults to 8192 reserve pool blocks
+ * in most cases, which with 64k dm blocks translates to 512MB of reserved
+ * space. This can be tuned with: 'xfs_io -xc "resblks <blks>" <mnt>'. Note
+ * that insufficient reserves will result in errors in unexpected areas of
+ * code (e.g., page discards on writeback, inode unlinked list removal
+ * failures, etc.).
+ * - The existing xfs_reserve_blocks() implementation is flaky and does not
+ * correctly reserve in the event of xfs_mod_fdblocks() failure. This will
+ * likely require some fixes independent of this feature. It also may depend
+ * on some kind of (currently undefined) "query available reservation" or
+ * "perform partial reservation" API to support partial XFS reserved blocks
+ * allocation.
+ */
+
+/*
+ * Translate a count of filesystem blocks into the number of sectors to
+ * reserve, charging m_thin_sectpb sectors for every filesystem block.
+ */
+static inline sector_t
+XFS_FSB_TO_SECT(
+	struct xfs_mount	*mp,
+	xfs_fsblock_t		fsb)
+{
+	return fsb * mp->m_thin_sectpb;
+}
+
+/*
+ * Grow the block device reservation by enough sectors to cover @fsb
+ * filesystem blocks.
+ *
+ * The device is asked for the new absolute total (current m_thin_res plus the
+ * additional sectors); m_thin_res is only bumped when the device accepts it.
+ * The request and the counter update are serialized by m_thin_res_lock.
+ *
+ * Returns 0 on success or the error reported by blk_reserve_space().
+ */
+int
+xfs_thin_reserve(
+	struct xfs_mount	*mp,
+	xfs_fsblock_t		fsb)
+{
+	sector_t		sects = XFS_FSB_TO_SECT(mp, fsb);
+	int			ret;
+
+	mutex_lock(&mp->m_thin_res_lock);
+
+	ret = blk_reserve_space(mp->m_ddev_targp->bt_bdev,
+				mp->m_thin_res + sects);
+	if (!ret) {
+		trace_xfs_thin_reserve(mp, mp->m_thin_res, sects);
+		mp->m_thin_res += sects;
+	} else if (ret == -ENOSPC) {
+		/* only the out-of-space failure gets its own tracepoint */
+		trace_xfs_thin_reserve_enospc(mp, mp->m_thin_res, sects);
+	}
+
+	mutex_unlock(&mp->m_thin_res_lock);
+	return ret;
+}
+
+/*
+ * Lower the block device reservation by @bb sectors.
+ *
+ * The device is handed the new absolute reservation (current total minus @bb)
+ * and the local m_thin_res counter is only updated once the device has
+ * accepted it. A request larger than the current reservation triggers a
+ * warning and is clamped so the total cannot wrap below zero.
+ *
+ * Both callers in this file invoke this with m_thin_res_lock held.
+ *
+ * Returns 0 on success or the error from blk_reserve_space().
+ */
+static int
+__xfs_thin_unreserve(
+	struct xfs_mount	*mp,
+	sector_t		bb)
+{
+	int			error;
+
+	if (bb > mp->m_thin_res) {
+		/* sector_t width is config dependent; always print as 64-bit */
+		WARN(1, "unres (%llu) exceeds current res (%llu)\n",
+		     (unsigned long long)bb,
+		     (unsigned long long)mp->m_thin_res);
+		bb = mp->m_thin_res;
+	}
+
+	error = blk_reserve_space(mp->m_ddev_targp->bt_bdev,
+				  mp->m_thin_res - bb);
+	if (error)
+		return error;
+
+	trace_xfs_thin_unreserve(mp, mp->m_thin_res, bb);
+	mp->m_thin_res -= bb;
+
+	return 0;
+}
+
+/*
+ * Hand the reservation covering @fsb filesystem blocks back to the block
+ * device. Takes m_thin_res_lock around the locked helper and returns whatever
+ * that helper returns.
+ */
+int
+xfs_thin_unreserve(
+	struct xfs_mount	*mp,
+	xfs_fsblock_t		fsb)
+{
+	sector_t		sects = XFS_FSB_TO_SECT(mp, fsb);
+	int			ret;
+
+	mutex_lock(&mp->m_thin_res_lock);
+	ret = __xfs_thin_unreserve(mp, sects);
+	mutex_unlock(&mp->m_thin_res_lock);
+
+	return ret;
+}
+
+/*
+ * Given a recently allocated extent, ask the block device to provision the
+ * underlying space.
+ *
+ * @offset and @len describe the extent in filesystem blocks. The sector range
+ * handed to the device is the extent widened to m_thin_sectpb boundaries on
+ * both ends. Once the device reports how much it actually allocated, that
+ * amount is dropped from the local reservation and whatever is left of the
+ * worst-case reservation for @len blocks is released.
+ *
+ * Returns 0 on success, or a negative error from blk_provision_space() or
+ * from releasing the leftover reservation.
+ */
+int
+xfs_thin_provision(
+	struct xfs_mount	*mp,
+	xfs_fsblock_t		offset,
+	xfs_fsblock_t		len)
+{
+	sector_t		bbres;
+	sector_t		bbstart, bbend, bblen;
+	sector_t		used;
+	int			count;
+	int			error;
+
+	/*
+	 * Align the start down and the *end* up. Rounding the length up on its
+	 * own would leave the tail of the extent outside the provisioned range
+	 * whenever the start is not already aligned.
+	 */
+	bbstart = XFS_FSB_TO_DADDR(mp, offset);
+	bbend = bbstart + XFS_FSB_TO_BB(mp, len);
+	bbstart = round_down(bbstart, mp->m_thin_sectpb);
+	bbend = round_up(bbend, mp->m_thin_sectpb);
+	bblen = bbend - bbstart;
+
+	/* worst-case reservation that was taken for this many fs blocks */
+	bbres = XFS_FSB_TO_SECT(mp, len);
+
+	mutex_lock(&mp->m_thin_res_lock);
+
+	WARN_ON(bblen > mp->m_thin_res);
+
+	/*
+	 * XXX: alloc count here is kind of a hack. Need to find a local
+	 * mechanism. Pass res to blk_provision_space?
+	 */
+	count = blk_provision_space(mp->m_ddev_targp->bt_bdev, bbstart, bblen);
+	if (count < 0) {
+		error = count;
+		goto out;
+	}
+
+	trace_xfs_thin_provision(mp, count, bbres);
+
+	/*
+	 * Update the local reservation based on the blocks that were actually
+	 * allocated and release the rest of the unused reservation. The
+	 * counters are unsigned, so clamp rather than let a larger than
+	 * expected count wrap them around (a wrapped bbres would make the
+	 * unreserve below drop the entire outstanding reservation).
+	 */
+	used = count;
+	if (WARN_ON(used > mp->m_thin_res))
+		used = mp->m_thin_res;
+	mp->m_thin_res -= used;
+
+	if (bbres > used)
+		bbres -= used;
+	else
+		bbres = 0;
+	error = __xfs_thin_unreserve(mp, bbres);
+out:
+	mutex_unlock(&mp->m_thin_res_lock);
+	return error;
+}
+
+/*
+ * Probe the data device for reservation support and set up the per-mount
+ * thin reservation state.
+ *
+ * Reservation is only attempted when the filesystem is mounted with discard
+ * enabled and the device's optimal I/O size is a whole number of basic blocks
+ * no smaller than the filesystem block size; that size is then used as the
+ * reservation granularity (m_thin_sectpb). Support is detected by reserving
+ * and releasing a single filesystem block and reading the device's reserved
+ * total after each step.
+ *
+ * Any probe failure simply leaves m_thin_reserve false; this function always
+ * returns 0.
+ */
+int
+xfs_thin_init(
+	struct xfs_mount	*mp)
+{
+	sector_t		res1 = 0, res2 = 0;
+	int			error = 0;
+	unsigned int		io_opt;
+
+	mp->m_thin_reserve = false;
+
+	/* initialize unconditionally so the lock is valid on every exit path */
+	mutex_init(&mp->m_thin_res_lock);
+
+	if (!(mp->m_flags & XFS_MOUNT_DISCARD))
+		goto out;
+
+	/* use optimal I/O size as dm-thin block size */
+	io_opt = bdev_io_opt(mp->m_super->s_bdev);
+	if ((io_opt % BBSIZE) || (io_opt < mp->m_sb.sb_blocksize))
+		goto out;
+	mp->m_thin_sectpb = io_opt / BBSIZE;
+
+	/*
+	 * Run some test calls to determine whether the block device has
+	 * support. Note: res is in 512b sector units.
+	 */
+	error = xfs_thin_reserve(mp, 1);
+	if (error)
+		goto out;
+
+	error = blk_get_reserved_space(mp->m_ddev_targp->bt_bdev, &res1);
+	if (error) {
+		/* best effort: don't leave the probe reservation behind */
+		xfs_thin_unreserve(mp, 1);
+		goto out;
+	}
+
+	error = xfs_thin_unreserve(mp, 1);
+	if (error)
+		goto out;
+
+	error = blk_get_reserved_space(mp->m_ddev_targp->bt_bdev, &res2);
+	if (error)
+		goto out;
+
+	/*
+	 * ASSERT() only fires on debug builds, so also refuse to enable the
+	 * feature at runtime when the device reports inconsistent totals.
+	 */
+	ASSERT(res1 >= 1 && res2 == 0);
+	if (res1 < 1 || res2 != 0)
+		goto out;
+
+	mp->m_thin_reserve = true;
+out:
+	xfs_notice(mp, "Thin pool reservation %s", mp->m_thin_reserve ?
+			"enabled" : "disabled");
+	if (mp->m_thin_reserve)
+		xfs_notice(mp, "Thin reserve blocksize: %u sectors",
+			   mp->m_thin_sectpb);
+	return 0;
+}
new file mode 100644
@@ -0,0 +1,9 @@
+#ifndef __XFS_THIN_H__
+#define __XFS_THIN_H__
+
+/* Probe the block device for reservation support; always returns 0. */
+int xfs_thin_init(struct xfs_mount *mp);
+
+/* Reserve / release device space for a count of filesystem blocks. */
+int xfs_thin_reserve(struct xfs_mount *mp, xfs_fsblock_t fsb);
+int xfs_thin_unreserve(struct xfs_mount *mp, xfs_fsblock_t fsb);
+
+/* Provision the device space backing an extent (offset/len in fs blocks). */
+int xfs_thin_provision(struct xfs_mount *mp, xfs_fsblock_t offset,
+		       xfs_fsblock_t len);
+
+#endif /* __XFS_THIN_H__ */
@@ -2185,6 +2185,33 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
DEFINE_DISCARD_EVENT(xfs_discard_exclude);
DEFINE_DISCARD_EVENT(xfs_discard_busy);
+/*
+ * Event class shared by the thin reservation tracepoints. Each event records
+ * the device plus two sector counts: "total" and "res". The callers in
+ * xfs_thin.c pass the current local reservation (or, for the provision event,
+ * the count returned by the device) as "total" and the size of the request as
+ * "res".
+ */
+DECLARE_EVENT_CLASS(xfs_thin_class,
+	TP_PROTO(struct xfs_mount *mp, sector_t total, sector_t res),
+	TP_ARGS(mp, total, res),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(sector_t, total)
+		__field(sector_t, res)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->total = total;
+		__entry->res = res;
+	),
+	/* sector_t width is config dependent; always print it as 64-bit */
+	TP_printk("dev %d:%d total %llu res %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->total,
+		  (unsigned long long)__entry->res)
+)
+
+/*
+ * Instantiate one tracepoint of xfs_thin_class per thin reservation
+ * operation; each takes the mount and the two sector counts of the class.
+ */
+#define DEFINE_THIN_EVENT(name) \
+DEFINE_EVENT(xfs_thin_class, name, \
+ TP_PROTO(struct xfs_mount *mp, sector_t total, sector_t res), \
+ TP_ARGS(mp, total, res))
+DEFINE_THIN_EVENT(xfs_thin_reserve);
+DEFINE_THIN_EVENT(xfs_thin_reserve_enospc);
+DEFINE_THIN_EVENT(xfs_thin_unreserve);
+DEFINE_THIN_EVENT(xfs_thin_provision);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
Add block device reservation infrastructure to XFS. This primarily consists of wrappers around the associated block device functions. This mechanism provides the ability to reserve, release and provision a set of blocks in the underlying block device. The mechanism enables the filesystem to adopt a block reservation model with the underlying device. In turn, this allows the filesystem to identify when the underlying device is out of space and propagate an error (-ENOSPC) gracefully before the device itself must handle the condition. The latter typically involves a read-only state transition and thus requires administrator intervention to resolve. Signed-off-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/Makefile | 1 + fs/xfs/xfs_mount.h | 5 + fs/xfs/xfs_thin.c | 273 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_thin.h | 9 ++ fs/xfs/xfs_trace.h | 27 ++++++ 5 files changed, 315 insertions(+) create mode 100644 fs/xfs/xfs_thin.c create mode 100644 fs/xfs/xfs_thin.h