@@ -26,6 +26,7 @@
#include <dirent.h>
#include <blkid/blkid.h>
#include <linux/limits.h>
+#include <linux/fs.h>
#include <limits.h>
#include "kernel-lib/sizes.h"
#include "kernel-shared/disk-io.h"
@@ -76,7 +77,7 @@ int device_discard_blocks(int fd, u64 start, u64 len)
/*
* Write zeros to the given range [start, start + len)
*/
-int device_zero_blocks(int fd, off_t start, size_t len)
+int device_zero_blocks(int fd, off_t start, size_t len, const bool direct)
{
char *buf = malloc(len);
int ret = 0;
@@ -85,7 +86,7 @@ int device_zero_blocks(int fd, off_t start, size_t len)
if (!buf)
return -ENOMEM;
memset(buf, 0, len);
- written = pwrite(fd, buf, len, start);
+ written = btrfs_pwrite(fd, buf, len, start, direct);
if (written != len)
ret = -EIO;
free(buf);
@@ -115,7 +116,7 @@ static int zero_dev_clamped(int fd, struct btrfs_zoned_device_info *zinfo,
if (zinfo && zinfo->model == ZONED_HOST_MANAGED)
return zero_zone_blocks(fd, zinfo, start, end - start);
- return device_zero_blocks(fd, start, end - start);
+ return device_zero_blocks(fd, start, end - start, false);
}
/*
@@ -157,8 +158,10 @@ static int btrfs_wipe_existing_sb(int fd, struct btrfs_zoned_device_info *zinfo)
len = sizeof(buf);
if (!zone_is_sequential(zinfo, offset)) {
+ const bool direct = zinfo && zinfo->model == ZONED_HOST_MANAGED;
+
memset(buf, 0, len);
- ret = pwrite(fd, buf, len, offset);
+ ret = btrfs_pwrite(fd, buf, len, offset, direct);
if (ret < 0) {
error("cannot wipe existing superblock: %m");
ret = -1;
@@ -491,3 +494,68 @@ out:
close(sysfs_fd);
return ret;
}
+
+/*
+ * Positional read or write for a descriptor that needs block-aligned I/O.
+ *
+ * The required alignment is queried from @fd: BLKSSZGET for a block device,
+ * FIGETBSZ for anything else. When both @buf and @count are already aligned
+ * the request is passed straight to pwrite()/pread(). Otherwise an aligned
+ * bounce buffer of round_up(@count, alignment) bytes is used; a write whose
+ * @count is not aligned cannot be fixed up and is rejected.
+ *
+ * @rw:     READ or WRITE
+ * @fd:     descriptor to operate on
+ * @buf:    source (WRITE) or destination (READ) buffer of at least @count bytes
+ * @count:  number of bytes requested
+ * @offset: file offset of the request
+ *
+ * Returns the pwrite()/pread() result, capped at @count for reads. Returns 0
+ * if the request could not be set up (fstat or ioctl failure, unaligned write
+ * size, bounce buffer allocation failure).
+ */
+ssize_t btrfs_direct_pio(int rw, int fd, void *buf, size_t count, off_t offset)
+{
+	int alignment;
+	size_t iosize;
+	void *bounce_buf = NULL;
+	struct stat stat_buf;
+	unsigned long req;
+	int ret;
+	ssize_t ret_rw;
+
+	ASSERT(rw == READ || rw == WRITE);
+
+	if (fstat(fd, &stat_buf) == -1) {
+		error("fstat failed (%m)");
+		return 0;
+	}
+
+	if ((stat_buf.st_mode & S_IFMT) == S_IFBLK)
+		req = BLKSSZGET;
+	else
+		req = FIGETBSZ;
+
+	if (ioctl(fd, req, &alignment)) {
+		error("failed to get block size: %m");
+		return 0;
+	}
+
+	/* Fast path: nothing to fix up, no bounce buffer needed */
+	if (IS_ALIGNED((size_t)buf, alignment) && IS_ALIGNED(count, alignment)) {
+		if (rw == WRITE)
+			return pwrite(fd, buf, count, offset);
+		else
+			return pread(fd, buf, count, offset);
+	}
+
+	/* Cannot do anything if the write size is not aligned */
+	if (rw == WRITE && !IS_ALIGNED(count, alignment)) {
+		error("%zu is not aligned to %d", count, alignment);
+		return 0;
+	}
+
+	iosize = round_up(count, alignment);
+
+	ret = posix_memalign(&bounce_buf, alignment, iosize);
+	if (ret) {
+		/*
+		 * posix_memalign() reports failure through its return value and
+		 * does not set errno, so set it before %m is expanded.
+		 */
+		errno = ret;
+		error("failed to allocate bounce buffer: %m");
+		/* error() may have clobbered errno, restore it for the caller */
+		errno = ret;
+		return 0;
+	}
+
+	if (rw == WRITE) {
+		ASSERT(iosize == count);
+		memcpy(bounce_buf, buf, count);
+		ret_rw = pwrite(fd, bounce_buf, iosize, offset);
+	} else {
+		ret_rw = pread(fd, bounce_buf, iosize, offset);
+		/*
+		 * Rule out errors before comparing with the unsigned count:
+		 * -1 would otherwise convert to a huge size_t and be taken
+		 * for a full read.
+		 */
+		if (ret_rw > 0) {
+			/* The bounce read may cover more than was asked for */
+			if ((size_t)ret_rw > count)
+				ret_rw = count;
+			/* Hand back exactly the bytes that are reported as read */
+			memcpy(buf, bounce_buf, ret_rw);
+		}
+	}
+
+	free(bounce_buf);
+	return ret_rw;
+}
@@ -17,6 +17,8 @@
#ifndef __DEVICE_UTILS_H__
#define __DEVICE_UTILS_H__
+#include <stdbool.h>
+#include <unistd.h>
#include "kerncompat.h"
#include "sys/stat.h"
@@ -35,7 +37,7 @@
* Generic block device helpers
*/
int device_discard_blocks(int fd, u64 start, u64 len);
-int device_zero_blocks(int fd, off_t start, size_t len);
+int device_zero_blocks(int fd, off_t start, size_t len, const bool direct);
u64 device_get_partition_size(const char *dev);
u64 device_get_partition_size_fd(int fd);
int device_get_queue_param(const char *file, const char *param, char *buf, size_t len);
@@ -47,5 +49,20 @@ u64 device_get_zone_size(int fd, const char *name);
u64 btrfs_device_size(int fd, struct stat *st);
int btrfs_prepare_device(int fd, const char *file, u64 *block_count_ret,
u64 max_block_count, unsigned opflags);
+ssize_t btrfs_direct_pio(int rw, int fd, void *buf, size_t count, off_t offset);
+
+/*
+ * pwrite() wrapper taking an extra @direct flag.
+ *
+ * With BTRFS_ZONED defined: a false @direct is a plain pwrite(); a true
+ * @direct routes the write through btrfs_direct_pio(WRITE, ...).
+ * Without BTRFS_ZONED: @direct is evaluated and discarded, and the call is
+ * always a plain pwrite().
+ */
+#ifdef BTRFS_ZONED
+static inline ssize_t btrfs_pwrite(int fd, void *buf, size_t count,
+ off_t offset, bool direct)
+{
+ if (!direct)
+ return pwrite(fd, buf, count, offset);
+
+ return btrfs_direct_pio(WRITE, fd, buf, count, offset);
+}
+#else
+#define btrfs_pwrite(fd, buf, count, offset, direct) \
+ ({ (void)(direct); pwrite(fd, buf, count, offset); })
+#endif
#endif
@@ -29,6 +29,7 @@
#include "kernel-shared/ctree.h"
#include "kernel-shared/volumes.h"
#include "common/utils.h"
+#include "common/device-utils.h"
#include "common/internal.h"
void extent_io_tree_init(struct extent_io_tree *tree)
@@ -809,7 +810,8 @@ out:
int write_extent_to_disk(struct extent_buffer *eb)
{
int ret;
- ret = pwrite(eb->fd, eb->data, eb->len, eb->dev_bytenr);
+ ret = btrfs_pwrite(eb->fd, eb->data, eb->len, eb->dev_bytenr,
+ eb->fs_info->zoned);
if (ret < 0)
goto out;
if (ret != eb->len) {
@@ -932,7 +934,8 @@ int write_data_to_disk(struct btrfs_fs_info *info, void *buf, u64 offset,
this_len = min(this_len, bytes_left);
dev_nr++;
- ret = pwrite(device->fd, buf + total_write, this_len, dev_bytenr);
+ ret = btrfs_pwrite(device->fd, buf + total_write,
+ this_len, dev_bytenr, info->zoned);
if (ret != this_len) {
if (ret < 0) {
fprintf(stderr, "Error writing to "
@@ -424,7 +424,7 @@ int zero_zone_blocks(int fd, struct btrfs_zoned_device_info *zinfo, off_t start,
count = zone_len - (ofst & (zone_len - 1));
if (!zone_is_sequential(zinfo, ofst)) {
- ret = device_zero_blocks(fd, ofst, count);
+ ret = device_zero_blocks(fd, ofst, count, true);
if (ret != 0)
return ret;
}
@@ -595,7 +595,7 @@ size_t btrfs_sb_io(int fd, void *buf, off_t offset, int rw)
if (rw == READ)
ret_sz = pread64(fd, buf, count, mapped);
else
- ret_sz = pwrite64(fd, buf, count, mapped);
+ ret_sz = btrfs_pwrite(fd, buf, count, mapped, true);
if (ret_sz != count)
return ret_sz;
@@ -54,7 +54,7 @@ static int btrfs_write_empty_tree(int fd, struct btrfs_mkfs_config *cfg,
btrfs_set_header_nritems(buf, 0);
csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
cfg->csum_type);
- ret = pwrite(fd, buf->data, cfg->nodesize, block);
+ ret = btrfs_pwrite(fd, buf->data, cfg->nodesize, block, cfg->zone_size);
if (ret != cfg->nodesize)
return ret < 0 ? -errno : -EIO;
return 0;
@@ -134,7 +134,8 @@ static int btrfs_create_tree_root(int fd, struct btrfs_mkfs_config *cfg,
cfg->csum_type);
/* write back root tree */
- ret = pwrite(fd, buf->data, cfg->nodesize, cfg->blocks[MKFS_ROOT_TREE]);
+ ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
+ cfg->blocks[MKFS_ROOT_TREE], cfg->zone_size);
if (ret != cfg->nodesize)
return (ret < 0 ? -errno : -EIO);
@@ -422,7 +423,8 @@ int make_btrfs(int fd, struct btrfs_mkfs_config *cfg)
btrfs_set_header_nritems(buf, nritems);
csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
cfg->csum_type);
- ret = pwrite(fd, buf->data, cfg->nodesize, cfg->blocks[MKFS_EXTENT_TREE]);
+ ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
+ cfg->blocks[MKFS_EXTENT_TREE], cfg->zone_size);
if (ret != cfg->nodesize) {
ret = (ret < 0 ? -errno : -EIO);
goto out;
@@ -510,7 +512,8 @@ int make_btrfs(int fd, struct btrfs_mkfs_config *cfg)
btrfs_set_header_nritems(buf, nritems);
csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
cfg->csum_type);
- ret = pwrite(fd, buf->data, cfg->nodesize, cfg->blocks[MKFS_CHUNK_TREE]);
+ ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
+ cfg->blocks[MKFS_CHUNK_TREE], cfg->zone_size);
if (ret != cfg->nodesize) {
ret = (ret < 0 ? -errno : -EIO);
goto out;
@@ -550,7 +553,8 @@ int make_btrfs(int fd, struct btrfs_mkfs_config *cfg)
btrfs_set_header_nritems(buf, nritems);
csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
cfg->csum_type);
- ret = pwrite(fd, buf->data, cfg->nodesize, cfg->blocks[MKFS_DEV_TREE]);
+ ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
+ cfg->blocks[MKFS_DEV_TREE], cfg->zone_size);
if (ret != cfg->nodesize) {
ret = (ret < 0 ? -errno : -EIO);
goto out;
Wrap pwrite() with btrfs_pwrite(). It simply calls pwrite() on non-zoned btrfs (= opened without O_DIRECT). In zoned mode (= opened with O_DIRECT), it allocates an aligned bounce buffer, copies the contents into it and uses it for the direct-IO write.

Writes in device_zero_blocks() and btrfs_wipe_existing_sb() are a little tricky. We don't have fs_info at hand there, so use zinfo to determine whether the device is zoned or not.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 common/device-utils.c     | 76 ++++++++++++++++++++++++++++++++++++---
 common/device-utils.h     | 19 +++++++++-
 kernel-shared/extent_io.c |  7 ++--
 kernel-shared/zoned.c     |  4 +--
 mkfs/common.c             | 14 +++++---
 5 files changed, 106 insertions(+), 14 deletions(-)