@@ -28,6 +28,7 @@
#include <assert.h>
#include <stddef.h>
#include <linux/types.h>
+#include <linux/kernel.h>
#include <stdint.h>
#include <features.h>
@@ -358,6 +359,7 @@ do { \
/* Alignment check */
#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0)
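+/* Round up x to the next multiple of a; a must be a power of two */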
+#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))
static inline int is_power_of_2(unsigned long n)
{
@@ -162,6 +162,8 @@ struct alloc_chunk_ctl {
u64 max_chunk_size;
int total_devs;
u64 dev_offset;
+ int nparity;
+ int ncopies;
};
struct stripe {
@@ -457,6 +459,8 @@ int btrfs_scan_one_device(int fd, const char *path,
static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
+ u64 zone_size;
+
switch (device->fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
/*
@@ -465,11 +469,72 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
* make sure to start at an offset of at least 1MB.
*/
return max(start, BTRFS_BLOCK_RESERVED_1M_FOR_SUPER);
+ case BTRFS_CHUNK_ALLOC_ZONED:
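+		/*
+		 * A device extent on a zoned device must start at a zone
+		 * boundary; skip at least the first zone. Superblock log
+		 * zones are excluded later when the hole is checked.
+		 */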
+ zone_size = device->zone_info->zone_size;
+ return ALIGN(max_t(u64, start, zone_size), zone_size);
default:
BUG();
}
}
+static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
+ u64 *hole_start, u64 *hole_size,
+ u64 num_bytes)
+{
+ u64 zone_size = device->zone_info->zone_size;
+ u64 pos;
+ bool changed = false;
+
+ ASSERT(IS_ALIGNED(*hole_start, zone_size));
+
+ while (*hole_size > 0) {
+ pos = btrfs_find_allocatable_zones(device, *hole_start,
+ *hole_start + *hole_size,
+ num_bytes);
+ if (pos != *hole_start) {
+ *hole_size = *hole_start + *hole_size - pos;
+ *hole_start = pos;
+ changed = true;
+ if (*hole_size < num_bytes)
+ break;
+ }
+
+		/* The hole now starts at an allocatable region, stop here */
+		if (btrfs_check_allocatable_zones(device, pos, num_bytes))
+			break;
+
+		/* Skip the unusable zone and check the rest of the hole */
+		*hole_start += zone_size;
+		*hole_size -= zone_size;
+		changed = true;
+ }
+
+ return changed;
+}
+
+/**
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
+ * @device: the device that contains the hole
+ * @hole_start: starting position of the hole
+ * @hole_size: the size of the hole
+ * @num_bytes: the size of the free space that we need
+ *
+ * This function may modify @hole_start and @hole_size to reflect the suitable
+ * position for allocation. Returns true if hole position is updated, false
+ * otherwise.
+ */
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
+ u64 *hole_size, u64 num_bytes)
+{
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /* No check */
+ break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ return dev_extent_hole_check_zoned(device, hole_start,
+ hole_size, num_bytes);
+ default:
+ BUG();
+ }
+
+ return false;
+}
+
/*
* find_free_dev_extent_start - find free space in the specified device
* @device: the device which we search the free space in
@@ -507,6 +572,10 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
int ret;
int slot;
struct extent_buffer *l;
+ u64 zone_size = 0;
+
+ if (device->zone_info)
+ zone_size = device->zone_info->zone_size;
search_start = dev_extent_search_start(device, search_start);
@@ -517,6 +586,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
max_hole_start = search_start;
max_hole_size = 0;
+again:
if (search_start >= search_end) {
ret = -ENOSPC;
goto out;
@@ -562,11 +632,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
if (key.offset > search_start) {
hole_size = key.offset - search_start;
+ dev_extent_hole_check(device, &search_start, &hole_size,
+ num_bytes);
- /*
- * Have to check before we set max_hole_start, otherwise
- * we could end up sending back this offset anyway.
- */
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
@@ -603,6 +671,12 @@ next:
* search_end may be smaller than search_start.
*/
if (search_end > search_start) {
		hole_size = search_end - search_start;
+		if (dev_extent_hole_check(device, &search_start, &hole_size,
+					  num_bytes)) {
+			btrfs_release_path(path);
+			goto again;
+		}
+
if (hole_size > max_hole_size) {
@@ -618,6 +692,7 @@ next:
ret = 0;
out:
+ ASSERT(zone_size == 0 || IS_ALIGNED(max_hole_start, zone_size));
btrfs_free_path(path);
*start = max_hole_start;
if (len)
@@ -646,6 +721,11 @@ int btrfs_insert_dev_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
+ /* Check alignment to zone for a zoned block device */
+ ASSERT(!device->zone_info ||
+ device->zone_info->model != ZONED_HOST_MANAGED ||
+ IS_ALIGNED(start, device->zone_info->zone_size));
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1052,6 +1132,38 @@ static void init_alloc_chunk_ctl_policy_regular(struct btrfs_fs_info *info,
ctl->max_chunk_size = min(percent_max, ctl->max_chunk_size);
}
+static void init_alloc_chunk_ctl_policy_zoned(struct btrfs_fs_info *info,
+ struct alloc_chunk_ctl *ctl)
+{
+ u64 type = ctl->type;
+ u64 zone_size = info->zone_size;
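+	/*
+	 * With one zone per stripe, the smallest possible chunk spans one
+	 * zone for each data stripe at the profile's minimum stripe count.
+	 */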
+ int min_num_stripes = ctl->min_stripes * ctl->num_stripes;
+ int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
+ u64 min_chunk_size = min_data_stripes * zone_size;
+
+ ctl->stripe_size = zone_size;
+ ctl->min_stripe_size = zone_size;
+ if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ctl->max_chunk_size = SZ_16M;
+ ctl->max_stripes = BTRFS_MAX_DEVS_SYS_CHUNK;
+ } else if (type & BTRFS_BLOCK_GROUP_DATA) {
+ ctl->max_chunk_size = 10ULL * SZ_1G;
+ ctl->max_stripes = BTRFS_MAX_DEVS(info);
+ } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+ /* for larger filesystems, use larger metadata chunks */
+ if (info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ ctl->max_chunk_size = SZ_1G;
+ else
+ ctl->max_chunk_size = SZ_256M;
+ ctl->max_stripes = BTRFS_MAX_DEVS(info);
+ }
+ }
+
+ ctl->max_chunk_size = round_down(ctl->max_chunk_size, zone_size);
+ ctl->max_chunk_size = max(ctl->max_chunk_size, min_chunk_size);
+}
+
static void init_alloc_chunk_ctl(struct btrfs_fs_info *info,
struct alloc_chunk_ctl *ctl)
{
@@ -1066,11 +1178,16 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_info *info,
ctl->max_chunk_size = 4 * ctl->stripe_size;
ctl->total_devs = btrfs_super_num_devices(info->super_copy);
ctl->dev_offset = 0;
+ ctl->nparity = btrfs_raid_array[type].nparity;
+ ctl->ncopies = btrfs_raid_array[type].ncopies;
switch (info->fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
init_alloc_chunk_ctl_policy_regular(info, ctl);
break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ init_alloc_chunk_ctl_policy_zoned(info, ctl);
+ break;
default:
BUG();
}
@@ -1113,12 +1230,27 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl)
return 0;
}
+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl)
+{
+ if (chunk_bytes_by_type(ctl) > ctl->max_chunk_size) {
+ /* stripe_size is fixed in ZONED. Reduce num_stripes instead. */
+ ctl->num_stripes = ctl->max_chunk_size * ctl->ncopies /
+ ctl->stripe_size;
+ if (ctl->num_stripes < ctl->min_stripes)
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
static int decide_stripe_size(struct btrfs_fs_info *info,
struct alloc_chunk_ctl *ctl)
{
switch (info->fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
return decide_stripe_size_regular(ctl);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ return decide_stripe_size_zoned(ctl);
default:
BUG();
}
@@ -1140,6 +1272,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
int index;
struct btrfs_key key;
u64 offset;
+ u64 zone_size = info->zone_size;
if (!ctl->start) {
ret = find_next_chunk(info, &offset);
@@ -1192,6 +1325,8 @@ static int create_chunk(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
+ ASSERT(!zone_size || IS_ALIGNED(dev_offset, zone_size));
+
device->bytes_used += ctl->stripe_size;
ret = btrfs_update_device(trans, device);
if (ret < 0)
@@ -74,6 +74,7 @@ struct btrfs_device {
enum btrfs_chunk_allocation_policy {
BTRFS_CHUNK_ALLOC_REGULAR,
+ BTRFS_CHUNK_ALLOC_ZONED,
};
struct btrfs_fs_devices {
@@ -512,6 +512,144 @@ size_t btrfs_sb_io(int fd, void *buf, off_t offset, int rw)
return ret_sz;
}
+/*
+ * btrfs_check_allocatable_zones - check if the specified region is
+ * suitable for allocation
+ * @device: the device to allocate a region on
+ * @pos: the position of the region
+ * @num_bytes: the size of the region
+ *
+ * On a non-zoned device, any region is suitable for allocation. On a
+ * zoned device, check that
+ * 1) the region does not cover any non-empty sequential zone,
+ * 2) all zones in the region have the same zone type, and
+ * 3) the region does not contain a superblock location.
+ */
+bool btrfs_check_allocatable_zones(struct btrfs_device *device, u64 pos,
+ u64 num_bytes)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ u64 nzones, begin, end;
+ u64 sb_pos;
+ bool is_sequential;
+ int shift;
+ int i;
+
+ if (!zinfo || zinfo->model == ZONED_NONE)
+ return true;
+
+ nzones = num_bytes / zinfo->zone_size;
+ begin = pos / zinfo->zone_size;
+ end = begin + nzones;
+
+ ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+ if (end > zinfo->nr_zones)
+ return false;
+
+ shift = ilog2(zinfo->zone_size);
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ sb_pos = sb_zone_number(shift, i);
+		if (!(end <= sb_pos || sb_pos + BTRFS_NR_SB_LOG_ZONES <= begin))
+ return false;
+ }
+
+ is_sequential = btrfs_dev_is_sequential(device, pos);
+
+ while (num_bytes) {
+ if (is_sequential && !btrfs_dev_is_empty_zone(device, pos))
+ return false;
+ if (is_sequential != btrfs_dev_is_sequential(device, pos))
+ return false;
+
+ pos += zinfo->zone_size;
+ num_bytes -= zinfo->zone_size;
+ }
+
+ return true;
+}
+
+/**
+ * btrfs_find_allocatable_zones - find allocatable zones within a given region
+ *
+ * @device: the device to allocate a region on
+ * @hole_start: the position of the hole to allocate the region in
+ * @hole_end: the end of the hole
+ * @num_bytes: the size of the wanted region
+ * @return: position of allocatable zones
+ *
+ * Allocatable region should not contain any superblock locations.
+ */
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+ u64 hole_end, u64 num_bytes)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int shift = ilog2(zinfo->zone_size);
+ u64 nzones = num_bytes >> shift;
+ u64 pos = hole_start;
+ u64 begin, end;
+ bool is_sequential;
+ bool have_sb;
+ int i;
+
+ ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+ while (pos < hole_end) {
+ begin = pos >> shift;
+ end = begin + nzones;
+
+ if (end > zinfo->nr_zones)
+ return hole_end;
+
+ /*
+ * The zones must be all sequential (and empty), or
+ * conventional zones
+ */
+		is_sequential = btrfs_dev_is_sequential(device, pos);
+		for (i = 0; i < end - begin; i++) {
+			u64 zone_offset = pos + ((u64)i << shift);
+
+			if ((is_sequential &&
+			     !btrfs_dev_is_empty_zone(device, zone_offset)) ||
+			    (is_sequential !=
+			     btrfs_dev_is_sequential(device, zone_offset)))
+				break;
+		}
+		if (i < end - begin) {
+			/* Skip the unusable zone and rescan from the next one */
+			pos += zinfo->zone_size;
+			continue;
+		}
+
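+		/*
+		 * The candidate region must not overlap any superblock log
+		 * zones or the regular superblock offsets checked below.
+		 */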
+ have_sb = false;
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ u32 sb_zone;
+ u64 sb_pos;
+
+ sb_zone = sb_zone_number(shift, i);
+ if (!(end <= sb_zone ||
+ sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
+ have_sb = true;
+ pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
+ break;
+ }
+
+ /* We also need to exclude regular superblock positions */
+ sb_pos = btrfs_sb_offset(i);
+ if (!(pos + num_bytes <= sb_pos ||
+ sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
+ have_sb = true;
+ pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
+ zinfo->zone_size);
+ break;
+ }
+ }
+ if (!have_sb)
+ break;
+ }
+
+ return pos;
+}
+
#endif
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
@@ -691,6 +829,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
fs_info->zone_size = zone_size;
fs_info->max_zone_append_size = max_zone_append_size;
+ fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
out:
return ret;
@@ -7,6 +7,7 @@
#include <stdbool.h>
#include "kerncompat.h"
+#include "kernel-shared/volumes.h"
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
@@ -56,7 +57,34 @@ static inline size_t sbwrite(int fd, void *buf, off_t offset)
{
return btrfs_sb_io(fd, buf, offset, WRITE);
}
+
+static inline bool zone_is_sequential(struct btrfs_zoned_device_info *zinfo,
+ u64 bytenr)
+{
+ unsigned int zno;
+
+ if (!zinfo || zinfo->model == ZONED_NONE)
+ return false;
+
+ zno = bytenr / zinfo->zone_size;
+ return zinfo->zones[zno].type == BLK_ZONE_TYPE_SEQWRITE_REQ;
+}
+
+static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ unsigned int zno;
+
+ if (!zone_is_sequential(zinfo, pos))
+ return true;
+
+ zno = pos / zinfo->zone_size;
+ return zinfo->zones[zno].cond == BLK_ZONE_COND_EMPTY;
+}
+
int btrfs_reset_dev_zone(int fd, struct blk_zone *zone);
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+				 u64 hole_end, u64 num_bytes);
+bool btrfs_check_allocatable_zones(struct btrfs_device *device, u64 pos,
+				   u64 num_bytes);
#else
#define sbread(fd, buf, offset) \
pread64(fd, buf, BTRFS_SUPER_INFO_SIZE, offset)
@@ -68,6 +96,29 @@ static inline int btrfs_reset_dev_zone(int fd, struct blk_zone *zone)
return 0;
}
+static inline bool zone_is_sequential(struct btrfs_zoned_device_info *zinfo,
+ u64 bytenr)
+{
+ return false;
+}
+
+static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device,
+ u64 hole_start, u64 hole_end,
+ u64 num_bytes)
+{
+ return hole_start;
+}
+
+static inline bool btrfs_check_allocatable_zones(struct btrfs_device *device,
+						 u64 pos, u64 num_bytes)
+{
+	return true;
+}
+
+static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos)
+{
+ return true;
+}
+
#endif /* BTRFS_ZONED */
+static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
+{
+ return zone_is_sequential(device->zone_info, pos);
+}
+
#endif /* __BTRFS_ZONED_H__ */
Implement a zoned chunk and device extent allocator. One device zone
becomes a device extent so that a zone reset affects only this device
extent and does not change the state of blocks in the neighboring
device extents.

To implement the allocator, we need to extend the following functions
for a zoned filesystem:

- init_alloc_chunk_ctl
- dev_extent_search_start
- dev_extent_hole_check
- decide_stripe_size

Here, dev_extent_hole_check() is newly introduced to check the
validity of a found hole.

init_alloc_chunk_ctl_zoned() is mostly the same as the regular one. It
always sets the stripe_size to the zone size and aligns the parameters
to the zone size.

dev_extent_search_start() only aligns the start offset to zone
boundaries. We don't care about the first 1MB as in a regular
filesystem because we reserve the first two zones for superblock
logging anyway.

dev_extent_hole_check_zoned() checks if the zones in a given hole are
either conventional or empty sequential zones. It also skips zones
reserved for superblock logging. With the change to the hole, the new
hole may now contain pending extents, so in that case we loop again to
check it.

Finally, decide_stripe_size_zoned() shrinks the number of devices
instead of the stripe size because we need to honor stripe_size ==
zone_size.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 kerncompat.h            |   2 +
 kernel-shared/volumes.c | 143 ++++++++++++++++++++++++++++++++++++++--
 kernel-shared/volumes.h |   1 +
 kernel-shared/zoned.c   | 139 ++++++++++++++++++++++++++++++++++++++
 kernel-shared/zoned.h   |  51 ++++++++++++++
 5 files changed, 332 insertions(+), 4 deletions(-)
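For reference, here is a minimal standalone sketch of the arithmetic
described above. It is not code from this patch: align_up(), the sample
sizes, and the use of num_stripes * stripe_size / ncopies as a stand-in
for chunk_bytes_by_type() are illustrative assumptions. It shows how the
search start is rounded up to a zone boundary past the first zone, and
how a chunk that would exceed max_chunk_size sheds stripes instead of
shrinking the zone-sized stripe.

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Round x up to the next multiple of the power-of-two a, like ALIGN() */
  static uint64_t align_up(uint64_t x, uint64_t a)
  {
  	return (x + a - 1) & ~(a - 1);
  }

  int main(void)
  {
  	const uint64_t zone_size = 256ULL << 20;	/* 256MiB zones */

  	/* dev_extent_search_start(): at least one zone in, zone aligned */
  	uint64_t hint = 1ULL << 20;			/* 1MiB hint */
  	uint64_t search_start =
  		align_up(hint > zone_size ? hint : zone_size, zone_size);
  	assert(search_start == zone_size);

  	/*
  	 * decide_stripe_size_zoned(): stripe_size is pinned to zone_size,
  	 * so the stripe count is reduced when the chunk would exceed
  	 * max_chunk_size (chunk bytes approximated here as
  	 * num_stripes * stripe_size / ncopies).
  	 */
  	uint64_t stripe_size = zone_size;
  	uint64_t max_chunk_size = 1ULL << 30;		/* 1GiB cap */
  	uint64_t ncopies = 2;				/* e.g. RAID1 */
  	uint64_t num_stripes = 16;

  	if (num_stripes * stripe_size / ncopies > max_chunk_size)
  		num_stripes = max_chunk_size * ncopies / stripe_size;

  	/* Prints: search_start=256 MiB, num_stripes=8 */
  	printf("search_start=%llu MiB, num_stripes=%llu\n",
  	       (unsigned long long)(search_start >> 20),
  	       (unsigned long long)num_stripes);
  	return 0;
  }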