@@ -138,7 +138,8 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
# xfs_rtbitmap is shared with libxfs
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
- xfs_zone_alloc.o
+ xfs_zone_alloc.o \
+ xfs_zone_space_resv.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
@@ -40,6 +40,7 @@
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -4788,12 +4789,18 @@ xfs_bmap_del_extent_delay(
da_diff = da_old - da_new;
fdblocks = da_diff;
- if (bflags & XFS_BMAPI_REMAP)
+ if (bflags & XFS_BMAPI_REMAP) {
;
- else if (isrt)
- xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
- else
+ } else if (isrt) {
+ xfs_rtbxlen_t rtxlen;
+
+ rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_add_available(mp, rtxlen);
+ xfs_add_frextents(mp, rtxlen);
+ } else {
fdblocks += del->br_blockcount;
+ }
xfs_add_fdblocks(mp, fdblocks);
xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -244,12 +244,22 @@ enum xfs_free_counter {
*/
XC_FREE_RTEXTENTS,
+ /*
+ * Number of available for use RT extents.
+ *
+ * This counter only exists for zoned RT device and indicates the number
+ * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS
+ * also includes blocks that have been written previously and freed, but
+ * sit in a rtgroup that still needs a zone reset.
+ */
+ XC_FREE_RTAVAILABLE,
XC_FREE_NR,
};
#define XFS_FREECOUNTER_STR \
{ XC_FREE_BLOCKS, "blocks" }, \
- { XC_FREE_RTEXTENTS, "rtextents" }
+ { XC_FREE_RTEXTENTS, "rtextents" }, \
+ { XC_FREE_RTAVAILABLE, "rtavailable" }
/*
* Type verifier functions
@@ -468,6 +468,7 @@ xfs_mount_reset_sbqflags(
static const char *const xfs_free_pool_name[] = {
[XC_FREE_BLOCKS] = "free blocks",
[XC_FREE_RTEXTENTS] = "free rt extents",
+ [XC_FREE_RTAVAILABLE] = "available rt extents",
};
uint64_t
@@ -475,22 +476,27 @@ xfs_default_resblks(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
- uint64_t resblks;
-
- if (ctr == XC_FREE_RTEXTENTS)
+ switch (ctr) {
+ case XC_FREE_BLOCKS:
+ /*
+ * Default to 5% or 8192 FSBs of space reserved, whichever is
+ * smaller.
+ *
+ * This is intended to cover concurrent allocation transactions
+ * when we initially hit ENOSPC. These each require a 4 block
+ * reservation. Hence by default we cover roughly 2000
+ * concurrent allocation reservations.
+ */
+ return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
+ case XC_FREE_RTEXTENTS:
+ case XC_FREE_RTAVAILABLE:
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
+ return xfs_zoned_default_resblks(mp, ctr);
return 0;
-
- /*
- * We default to 5% or 8192 fsbs of space reserved, whichever is
- * smaller. This is intended to cover concurrent allocation
- * transactions when we initially hit enospc. These each require a 4
- * block reservation. Hence by default we cover roughly 2000 concurrent
- * allocation reservations.
- */
- resblks = mp->m_sb.sb_dblocks;
- do_div(resblks, 20);
- resblks = min_t(uint64_t, resblks, 8192);
- return resblks;
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
/* Ensure the summary counts are correct. */
@@ -363,6 +363,28 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \
TP_ARGS(oz, rgbno, len))
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+
+TRACE_EVENT(xfs_zones_mount,
+ TP_PROTO(struct xfs_mount *mp),
+ TP_ARGS(mp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgcount)
+ __field(uint32_t, blocks)
+ __field(unsigned int, max_open_zones)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rgcount = mp->m_sb.sb_rgcount;
+ __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks;
+ __entry->max_open_zones = mp->m_max_open_zones;
+ ),
+ TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgcount,
+ __entry->blocks,
+ __entry->max_open_zones)
+);
#endif /* CONFIG_XFS_RT */
TRACE_EVENT(xfs_inodegc_worker,
@@ -5766,6 +5788,7 @@ TRACE_EVENT(xfs_growfs_check_rtgeom,
TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
+TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);
DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class,
TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
@@ -922,6 +922,7 @@ xfs_mount_zones(
xfs_info(mp, "%u zones of %u blocks size (%u max open)",
mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
mp->m_max_open_zones);
+ trace_xfs_zones_mount(mp);
if (bdev_is_zoned(bt->bt_bdev)) {
error = blkdev_report_zones(bt->bt_bdev,
@@ -939,6 +940,7 @@ xfs_mount_zones(
}
}
+ xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
iz.available + iz.reclaimable);
return 0;
@@ -5,6 +5,30 @@
struct iomap_ioend;
struct xfs_open_zone;
+struct xfs_zone_alloc_ctx {
+ struct xfs_open_zone *open_zone;
+ xfs_filblks_t reserved_blocks;
+};
+
+/*
+ * Grab any available space, even if it is less than what the caller asked for.
+ */
+#define XFS_ZR_GREEDY (1U << 0)
+/*
+ * Only grab instantly available space, don't wait or GC.
+ */
+#define XFS_ZR_NOWAIT (1U << 1)
+/*
+ * Dip into the reserved pool.
+ */
+#define XFS_ZR_RESERVED (1U << 2)
+
+int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+ unsigned int flags, struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+
void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
struct xfs_open_zone **oz);
int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
@@ -18,6 +42,9 @@ void xfs_zoned_wake_all(struct xfs_mount *mp);
bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
+uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
+
#ifdef CONFIG_XFS_RT
int xfs_mount_zones(struct xfs_mount *mp);
void xfs_unmount_zones(struct xfs_mount *mp);
@@ -86,4 +86,6 @@ struct xfs_zone_info {
struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
+void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
+
#endif /* _XFS_ZONE_PRIV_H */
new file mode 100644
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+
+/*
+ * Note: the zoned allocator does not support a rtextsize > 1, so this code and
+ * the allocator itself uses file system blocks interchangeable with realtime
+ * extents without doing the otherwise required conversions.
+ */
+
+/*
+ * Per-task space reservation.
+ *
+ * Tasks that need to wait for GC to free up space allocate one of these
+ * on-stack and adds it to the per-mount zi_reclaim_reservations lists.
+ * The GC thread will then wake the tasks in order when space becomes available.
+ */
+struct xfs_zone_reservation {
+ struct list_head entry;
+ struct task_struct *task;
+ xfs_filblks_t count_fsb;
+};
+
+/*
+ * Calculate the number of reserved blocks.
+ *
+ * XC_FREE_RTEXTENTS counts the user available capacity, to which the file
+ * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
+ * available for writes without waiting for GC.
+ *
+ * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
+ * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
+ * is further restricted by at least one zone as well as the optional
+ * persistently reserved blocks. This allows the allocator to run more
+ * smoothly by not always triggering GC.
+ */
+uint64_t
+xfs_zoned_default_resblks(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ switch (ctr) {
+ case XC_FREE_RTEXTENTS:
+ return (uint64_t)XFS_RESERVED_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks +
+ mp->m_sb.sb_rtreserved;
+ case XC_FREE_RTAVAILABLE:
+ return (uint64_t)XFS_GC_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks;
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+void
+xfs_zoned_resv_wake_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
+ wake_up_process(reservation->task);
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+ struct xfs_mount *mp,
+ xfs_filblks_t count_fsb)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ if (list_empty_careful(&zi->zi_reclaim_reservations)) {
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ return;
+ }
+
+ spin_lock(&zi->zi_reservation_lock);
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
+ if (reservation->count_fsb > count_fsb)
+ break;
+ wake_up_process(reservation->task);
+ count_fsb -= reservation->count_fsb;
+
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+ struct xfs_mount *mp)
+{
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ return 0;
+}
+
+static int
+xfs_zoned_reserve_available(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation reservation = {
+ .task = current,
+ .count_fsb = count_fsb,
+ };
+ int error;
+
+ /*
+ * If there are no waiters, try to directly grab the available blocks
+ * from the percpu counter.
+ *
+ * If the caller wants to dip into the reserved pool also bypass the
+ * wait list. This relies on the fact that we have a very graciously
+ * sized reserved pool that always has enough space. If the reserved
+ * allocations fail we're in trouble.
+ */
+ if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
+ (flags & XFS_ZR_RESERVED))) {
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ return error;
+ }
+
+ if (flags & XFS_ZR_NOWAIT)
+ return -EAGAIN;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
+ while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
+ set_current_state(TASK_KILLABLE);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ break;
+
+ spin_unlock(&zi->zi_reservation_lock);
+ schedule();
+ spin_lock(&zi->zi_reservation_lock);
+ }
+ list_del(&reservation.entry);
+ spin_unlock(&zi->zi_reservation_lock);
+
+ __set_current_state(TASK_RUNNING);
+ return error;
+}
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+ struct xfs_inode *ip,
+ xfs_filblks_t *count_fsb,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ s64 len = *count_fsb;
+ int error = -ENOSPC;
+
+ spin_lock(&zi->zi_reservation_lock);
+ len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ if (len > 0) {
+ *count_fsb = len;
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
+ flags & XFS_ZR_RESERVED);
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+ return error;
+}
+
+int
+xfs_zoned_space_reserve(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ ASSERT(ac->reserved_blocks == 0);
+ ASSERT(ac->open_zone == NULL);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
+ error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
+ if (error)
+ return error;
+
+ error = xfs_zoned_reserve_available(ip, count_fsb, flags);
+ if (error) {
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
+ return error;
+ }
+ ac->reserved_blocks = count_fsb;
+ return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+ struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ if (ac->reserved_blocks > 0) {
+ struct xfs_mount *mp = ip->i_mount;
+
+ xfs_zoned_add_available(mp, ac->reserved_blocks);
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
+ }
+ if (ac->open_zone)
+ xfs_open_zone_put(ac->open_zone);
+}