@@ -48,6 +48,10 @@ uint zuf_prepare_symname(struct zufs_ioc_new_inode *ioc_new_inode,
/* rw.c */
+ssize_t zuf_rw_read_iter(struct super_block *sb, struct inode *inode,
+ struct kiocb *kiocb, struct iov_iter *ii);
+ssize_t zuf_rw_write_iter(struct super_block *sb, struct inode *inode,
+ struct kiocb *kiocb, struct iov_iter *ii);
int zuf_trim_edge(struct inode *inode, ulong filepos, uint len);
/* super.c */
@@ -13,14 +13,443 @@
* Sagi Manole <sagim@netapp.com>"
*/
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/falloc.h>
+#include <linux/mman.h>
+#include <linux/fadvise.h>
+#include <linux/delay.h>
#include "zuf.h"
+/*
+ * zuf_fallocate - ->fallocate() handler.
+ *
+ * @file:   open file whose inode is operated on
+ * @mode:   FALLOC_FL_* flags; forwarded to the server as ioc_range.opflags
+ * @offset: byte offset of the start of the range
+ * @len:    length of the range in bytes
+ *
+ * Runs under the inode's zuf write lock. For FALLOC_FL_ZERO_RANGE and
+ * FALLOC_FL_PUNCH_HOLE the partial blocks at both edges of the range are
+ * passed to zuf_trim_edge(); whatever length remains is dispatched to the
+ * server as ZUFS_OP_FALLOCATE.
+ *
+ * Return: 0 on success, -EINVAL if the inode is not a regular file, or the
+ * error returned by inode_newsize_ok(), zuf_trim_edge() or zufc_dispatch().
+ */
+static long zuf_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct zuf_inode_info *zii = ZUII(inode);
+ struct zufs_ioc_range ioc_range = {
+ .hdr.in_len = sizeof(ioc_range),
+ .hdr.operation = ZUFS_OP_FALLOCATE,
+ .zus_ii = ZUII(inode)->zus_ii,
+ .offset = offset,
+ .length = len,
+ .opflags = mode,
+ };
+ enum {FALLOC_RETRY = 7};
+ int retry = 0;
+ int err = 0;
+
+ zuf_dbg_vfs("[%ld] mode=0x%x offset=0x%llx len=0x%llx\n",
+ inode->i_ino, mode, offset, len);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ zuf_w_lock(zii);
+
+ /* The call would grow the file: validate the new size first */
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (i_size_read(inode) < offset + len)) {
+ err = inode_newsize_ok(inode, offset + len);
+ if (unlikely(err))
+ goto out;
+ }
+
+ zus_inode_cmtime_now(inode, zii->zi);
+
+ if (mode & (FALLOC_FL_ZERO_RANGE | FALLOC_FL_PUNCH_HOLE)) {
+ /* ASSUMING FS supports these two */
+ struct super_block *sb = inode->i_sb;
+ /* off1/off2: offsets of range start/end inside their blocks */
+ ulong off1 = offset & (sb->s_blocksize - 1);
+ ulong off2 = (offset + len) & (sb->s_blocksize - 1);
+
+ if (md_o2p(offset) == md_o2p(offset + len)) {
+ /* Same block. Just nullify the range and goto out */
+ err = zuf_trim_edge(inode, offset, off2 - off1);
+ goto out_update;
+ }
+ if (off1) {
+ /* Unaligned head: trim from offset to the end of its block */
+ uint l = sb->s_blocksize - off1;
+
+ err = zuf_trim_edge(inode, offset, l);
+ if (unlikely(err))
+ goto out;
+ /*
+ * Only ZERO_RANGE shrinks the range sent to the server;
+ * for PUNCH_HOLE the full range is still dispatched.
+ */
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ioc_range.offset += l;
+ ioc_range.length -= l;
+ }
+ }
+ if (off2) {
+ /* Unaligned tail: trim the first off2 bytes of the last block */
+ err = zuf_trim_edge(inode, (offset + len) - off2, off2);
+ if (unlikely(err))
+ goto out;
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ ioc_range.length -= off2;
+ }
+ }
+
+ /* no length remains, but size might have changed in trim_edge */
+ if (!ioc_range.length)
+ goto out_update;
+
+again:
+ err = zufc_dispatch(ZUF_ROOT(SBI(inode->i_sb)), &ioc_range.hdr,
+ NULL, 0);
+ if (unlikely(err)) {
+ /*
+ * -EZUFS_RETRY: re-dispatch the same request. Once the retry
+ * count has passed FALLOC_RETRY each further attempt first
+ * sleeps, one millisecond longer every time.
+ * NOTE(review): there is no upper bound on the number of
+ * retries - confirm that the server cannot answer RETRY forever.
+ */
+ if (err == -EZUFS_RETRY) {
+ if (FALLOC_RETRY < retry++) {
+ zuf_dbg_err("[%ld] retry=%d\n",
+ inode->i_ino, retry);
+ msleep(retry - FALLOC_RETRY);
+ }
+ goto again;
+ }
+ zuf_dbg_err("[%ld] zufc_dispatch failed => %d\n",
+ inode->i_ino, err);
+ }
+
+ /* Reached on success and on dispatch failure: resync VFS inode from zi */
+out_update:
+ i_size_write(inode, le64_to_cpu(zii->zi->i_size));
+ inode->i_blocks = le64_to_cpu(zii->zi->i_blocks);
+
+out:
+ zuf_w_unlock(zii);
+
+ return err;
+}
+
+/*
+ * zuf_llseek - ->llseek() handler.
+ *
+ * Every @whence other than SEEK_DATA/SEEK_HOLE is handed to
+ * generic_file_llseek(). SEEK_DATA and SEEK_HOLE are answered under the
+ * inode's zuf read lock: the trivial cases are resolved here, anything else
+ * is dispatched to the server as ZUFS_OP_LLSEEK and the offset_out it
+ * returns is stored in file->f_pos.
+ *
+ * Return: the resulting offset, -EINVAL for an out-of-range @offset, -ENXIO
+ * when @offset is at or past i_size or when SEEK_DATA is asked of an inode
+ * with no blocks, or the error from zufc_dispatch().
+ */
+static loff_t zuf_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct zuf_inode_info *zii = ZUII(inode);
+ struct zufs_ioc_seek ioc_seek = {
+ .hdr.in_len = sizeof(ioc_seek),
+ .hdr.out_len = sizeof(ioc_seek),
+ .hdr.operation = ZUFS_OP_LLSEEK,
+ .zus_ii = zii->zus_ii,
+ .offset_in = offset,
+ .whence = whence,
+ };
+ int err = 0;
+
+ zuf_dbg_vfs("[%ld] offset=0x%llx whence=%d\n",
+ inode->i_ino, offset, whence);
+
+ if (whence != SEEK_DATA && whence != SEEK_HOLE)
+ return generic_file_llseek(file, offset, whence);
+
+ zuf_r_lock(zii);
+
+ if ((offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) ||
+ offset > inode->i_sb->s_maxbytes) {
+ err = -EINVAL;
+ goto out;
+ } else if (inode->i_size <= offset) {
+ err = -ENXIO;
+ goto out;
+ } else if (!inode->i_blocks) {
+ /*
+ * No blocks on this inode: SEEK_HOLE reports i_size and
+ * SEEK_DATA fails, both without asking the server.
+ * NOTE(review): this path returns without updating
+ * file->f_pos, unlike the dispatch path below - confirm
+ * that this is intended.
+ */
+ if (whence == SEEK_HOLE)
+ ioc_seek.offset_out = i_size_read(inode);
+ else
+ err = -ENXIO;
+ goto out;
+ }
+
+ err = zufc_dispatch(ZUF_ROOT(SBI(inode->i_sb)), &ioc_seek.hdr, NULL, 0);
+ if (unlikely(err)) {
+ zuf_dbg_err("zufc_dispatch failed => %d\n", err);
+ goto out;
+ }
+
+ /* Commit the server's answer as the new file position */
+ if (ioc_seek.offset_out != file->f_pos) {
+ file->f_pos = ioc_seek.offset_out;
+ file->f_version = 0;
+ }
+
+out:
+ zuf_r_unlock(zii);
+
+ return err ?: ioc_seek.offset_out;
+}
+
+/*
+ * ->flush() handler (invoked when a file is closed).
+ * Only traces the inode number; there is nothing to flush here.
+ */
+static int zuf_flush(struct file *file, fl_owner_t id)
+{
+	zuf_dbg_vfs("[%ld]\n", file_inode(file)->i_ino);
+	return 0;
+}
+
+/*
+ * ->fiemap() handler. Placeholder: after validating the request flags it
+ * takes and drops the inode's read lock and reports -EOPNOTSUPP.
+ *
+ * Return: -EBADR when fiemap_check_flags() rejects the flags, otherwise
+ * -EOPNOTSUPP.
+ */
+static int tozu_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 offset, u64 len)
+{
+	struct zuf_inode_info *zii = ZUII(inode);
+	ulong first_pg = md_o2p(offset);
+	ulong last_pg = md_o2p_up(offset + len);
+	int ret;
+
+	zuf_dbg_vfs(
+		"[%ld] offset=0x%llx len=0x%llx i-start=0x%lx i-end=0x%lx\n",
+		inode->i_ino, offset, len, first_pg, last_pg);
+
+	if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC) != 0)
+		return -EBADR;
+
+	zuf_r_lock(zii);
+	/* TODO: ZUS fiemap (&msi)*/
+	ret = -EOPNOTSUPP;
+	zuf_r_unlock(zii);
+
+	return ret;
+}
+
+/*
+ * Write-lock two zuf inodes in a stable order.
+ *
+ * The lower-addressed zii is always locked first so that two concurrent
+ * callers working on the same pair in opposite directions (A->B and B->A)
+ * cannot deadlock against each other. If both arguments are the same zii
+ * it is locked only once. The second lock is taken with the _nested
+ * variant.
+ */
+static void _lock_two_ziis(struct zuf_inode_info *zii1,
+			   struct zuf_inode_info *zii2)
+{
+	/* Normalise the order: zii1 becomes the lower address */
+	if (zii1 > zii2)
+		swap(zii1, zii2);
+
+	zuf_w_lock(zii1);
+	if (zii1 != zii2)
+		zuf_w_lock_nested(zii2);
+}
+
+/*
+ * Release the write locks taken on a pair of zuf inodes.
+ *
+ * The pair is put in address order first, then the higher-addressed zii is
+ * released before the lower-addressed one. If both arguments are the same
+ * zii it is unlocked only once.
+ */
+static void _unlock_two_ziis(struct zuf_inode_info *zii1,
+			     struct zuf_inode_info *zii2)
+{
+	/* Normalise the order: zii1 becomes the lower address */
+	if (zii1 > zii2)
+		swap(zii1, zii2);
+
+	if (zii1 != zii2)
+		zuf_w_unlock(zii2);
+	zuf_w_unlock(zii1);
+}
+
+/*
+ * _clone_file_range - send a clone-type request to the server.
+ *
+ * @src_inode: inode data is taken from
+ * @pos_in:    byte offset in @src_inode
+ * @dst_inode: inode data is placed into
+ * @pos_out:   byte offset in @dst_inode
+ * @len:       length in bytes (0 unmaps to end-of-file, see below)
+ * @len_up:    passed through to the server unchanged as ioc_clone.len_up
+ * @operation: ZUFS_OP_* code placed in the request header
+ *
+ * Both inodes are write-locked for the duration of the call and their page
+ * mappings over the affected ranges are torn down before dispatching. After
+ * the server has answered, the destination's VFS i_size and i_blocks are
+ * refreshed from its zus_inode.
+ *
+ * An -EINTR result is not logged and still refreshes the destination
+ * size/blocks, but it is returned to the caller like any other error.
+ *
+ * Return: 0 on success or the error from zufc_dispatch().
+ */
+static int _clone_file_range(struct inode *src_inode, loff_t pos_in,
+			     struct inode *dst_inode, loff_t pos_out,
+			     u64 len, u64 len_up, int operation)
+{
+	struct zuf_inode_info *src_zii = ZUII(src_inode);
+	struct zuf_inode_info *dst_zii = ZUII(dst_inode);
+	struct zus_inode *dst_zi = dst_zii->zi;
+	struct super_block *sb = src_inode->i_sb;
+	struct zufs_ioc_clone ioc_clone = {
+		.hdr.in_len = sizeof(ioc_clone),
+		.hdr.out_len = sizeof(ioc_clone),
+		.hdr.operation = operation,
+		.src_zus_ii = src_zii->zus_ii,
+		.dst_zus_ii = dst_zii->zus_ii,
+		.pos_in = pos_in,
+		.pos_out = pos_out,
+		.len = len,
+		.len_up = len_up,
+	};
+	int err;
+
+	_lock_two_ziis(src_zii, dst_zii);
+
+	/* NOTE: len==0 means to-end-of-file which is what we want */
+	unmap_mapping_range(src_inode->i_mapping, pos_in, len, 0);
+	unmap_mapping_range(dst_inode->i_mapping, pos_out, len, 0);
+
+	zus_inode_cmtime_now(dst_inode, dst_zi);
+	err = zufc_dispatch(ZUF_ROOT(SBI(sb)), &ioc_clone.hdr, NULL, 0);
+	if (unlikely(err && err != -EINTR)) {
+		zuf_err("failed to clone %ld -> %ld ; err=%d\n",
+			src_inode->i_ino, dst_inode->i_ino, err);
+		goto out;
+	}
+
+	/*
+	 * zus_inode fields are little-endian: convert i_size exactly like
+	 * i_blocks, otherwise big-endian hosts get a byte-swapped file size.
+	 */
+	dst_inode->i_blocks = le64_to_cpu(dst_zi->i_blocks);
+	i_size_write(dst_inode, le64_to_cpu(dst_zi->i_size));
+
+out:
+	_unlock_two_ziis(src_zii, dst_zii);
+
+	return err;
+}
+
+/*
+ * zuf_clone_file_range - ->remap_file_range() handler.
+ *
+ * Validates the request and hands it to _clone_file_range() as a
+ * ZUFS_OP_CLONE operation.
+ *
+ * Rejected with -EINVAL:
+ *  - any @remap_flags bit outside REMAP_FILE_ADVISORY,
+ *  - overlapping source/destination ranges inside the same inode,
+ *  - @pos_in or @pos_out not aligned to the block size,
+ *  - a block-unaligned @len that ends before EOF of either file.
+ *
+ * @len == 0 means "from @pos_in to the end of the source file".
+ *
+ * Return: the number of bytes cloned (0 when source and destination are the
+ * very same range), or a negative error.
+ */
+static loff_t zuf_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, uint remap_flags)
+{
+ struct inode *src_inode = file_inode(file_in);
+ struct inode *dst_inode = file_inode(file_out);
+ ulong src_size = i_size_read(src_inode);
+ ulong dst_size = i_size_read(dst_inode);
+ struct super_block *sb = src_inode->i_sb;
+ ulong len_up = len;
+ int err;
+
+ zuf_dbg_vfs(
+ "ino-in=%ld ino-out=%ld pos_in=0x%llx pos_out=0x%llx length=0x%llx\n",
+ src_inode->i_ino, dst_inode->i_ino, pos_in, pos_out, len);
+
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
+ return -EINVAL;
+
+ /* Same inode: identical range is a no-op, overlapping ranges are refused */
+ if (src_inode == dst_inode) {
+ if (pos_in == pos_out) {
+ zuf_dbg_err("[%ld] Clone nothing!!\n",
+ src_inode->i_ino);
+ return 0;
+ }
+ if (pos_in < pos_out) {
+ if (pos_in + len > pos_out) {
+ zuf_dbg_err(
+ "[%ld] overlapping pos_in < pos_out?? => EINVAL\n",
+ src_inode->i_ino);
+ return -EINVAL;
+ }
+ } else {
+ if (pos_out + len > pos_in) {
+ zuf_dbg_err("[%ld] overlapping pos_out < pos_in?? => EINVAL\n",
+ src_inode->i_ino);
+ return -EINVAL;
+ }
+ }
+ }
+
+ /* Both start offsets must sit on a block boundary */
+ if ((pos_in & (sb->s_blocksize - 1)) ||
+ (pos_out & (sb->s_blocksize - 1))) {
+ zuf_err("[%ld] Not aligned len=0x%llx pos_in=0x%llx "
+ "pos_out=0x%llx src-size=0x%llx dst-size=0x%llx\n",
+ src_inode->i_ino, len, pos_in, pos_out,
+ i_size_read(src_inode), i_size_read(dst_inode));
+ return -EINVAL;
+ }
+
+ /* STD says that len==0 means up to end of SRC */
+ if (!len)
+ len_up = len = src_size - pos_in;
+
+ /*
+ * len_up is the length handed to the server next to the exact len:
+ *  - 0 when both offsets are 0, the range reaches the end of the
+ *    source and the destination is not larger than the source;
+ *  - len rounded up to a whole block when len is unaligned, which is
+ *    only accepted if the range reaches EOF in both files;
+ *  - len itself otherwise.
+ */
+ if (!pos_in && !pos_out && (src_size <= pos_in + len) &&
+ (dst_size <= src_size)) {
+ len_up = 0;
+ } else if (len & (sb->s_blocksize - 1)) {
+ /* un-aligned len, see if it is beyond EOF */
+ if ((src_size > pos_in + len) ||
+ (dst_size > pos_out + len)) {
+ zuf_err("[%ld] Not aligned len=0x%llx pos_in=0x%llx "
+ "pos_out=0x%llx src-size=0x%lx dst-size=0x%lx\n",
+ src_inode->i_ino, len, pos_in, pos_out,
+ src_size, dst_size);
+ return -EINVAL;
+ }
+ len_up = md_p2o(md_o2p_up(len));
+ }
+
+ err = _clone_file_range(src_inode, pos_in, dst_inode, pos_out, len,
+ len_up, ZUFS_OP_CLONE);
+ if (unlikely(err))
+ zuf_err("_clone_file_range failed => %d\n", err);
+
+ return err ? err : len;
+}
+
+/*
+ * ->copy_file_range() handler, implemented on top of
+ * zuf_clone_file_range() with REMAP_FILE_ADVISORY as the remap flags.
+ *
+ * Return: whatever non-zero value the clone produced (byte count or
+ * negative error); @len when the clone returned 0.
+ */
+static ssize_t zuf_copy_file_range(struct file *file_in, loff_t pos_in,
+				   struct file *file_out, loff_t pos_out,
+				   size_t len, uint flags)
+{
+	struct inode *src_inode = file_inode(file_in);
+	struct inode *dst_inode = file_inode(file_out);
+	ssize_t cloned;
+
+	zuf_dbg_vfs("ino-in=%ld ino-out=%ld pos_in=0x%llx pos_out=0x%llx length=0x%lx\n",
+		    src_inode->i_ino, dst_inode->i_ino, pos_in, pos_out, len);
+
+	cloned = zuf_clone_file_range(file_in, pos_in, file_out, pos_out, len,
+				      REMAP_FILE_ADVISORY);
+	if (cloned)
+		return cloned;
+
+	return len;
+}
+
+/*
+ * ->release() handler.
+ * Nothing is torn down yet; a file that still carries private_data is
+ * only reported through zuf_err(). Always returns 0.
+ */
+static int zuf_file_release(struct inode *inode, struct file *filp)
+{
+	if (likely(!filp->private_data))
+		return 0;
+
+	zuf_err("not yet\n");
+	return 0;
+}
+
+/*
+ * ->read_iter() handler.
+ * Marks the file accessed, then performs the read through
+ * zuf_rw_read_iter() while holding the inode's zuf read lock.
+ *
+ * Return: the value returned by zuf_rw_read_iter().
+ */
+static ssize_t zuf_read_iter(struct kiocb *kiocb, struct iov_iter *ii)
+{
+	struct file *filp = kiocb->ki_filp;
+	struct inode *inode = file_inode(filp);
+	struct zuf_inode_info *zii = ZUII(inode);
+	ssize_t nread;
+
+	zuf_dbg_vfs("[%ld] ppos=0x%llx len=0x%zx\n",
+		    inode->i_ino, kiocb->ki_pos, iov_iter_count(ii));
+
+	file_accessed(filp);
+
+	zuf_r_lock(zii);
+	nread = zuf_rw_read_iter(inode->i_sb, inode, kiocb, ii);
+	zuf_r_unlock(zii);
+
+	zuf_dbg_vfs("[%ld] => 0x%lx\n", inode->i_ino, nread);
+
+	return nread;
+}
+
+/*
+ * ->write_iter() handler.
+ *
+ * After generic_write_checks() the write runs under the inode's zuf read
+ * lock: privileges are dropped with file_remove_privs(), the inode times
+ * are stamped, and the data is written through zuf_rw_write_iter(). On
+ * success the VFS i_size is raised to the zus_inode's size when that is not
+ * smaller, and i_blocks is refreshed.
+ *
+ * Return: the value returned by zuf_rw_write_iter(), or the negative error
+ * from generic_write_checks() / file_remove_privs().
+ */
+static ssize_t zuf_write_iter(struct kiocb *kiocb, struct iov_iter *ii)
+{
+	struct file *filp = kiocb->ki_filp;
+	struct inode *inode = file_inode(filp);
+	struct zuf_inode_info *zii = ZUII(inode);
+	ssize_t ret;
+
+	ret = generic_write_checks(kiocb, ii);
+	if (unlikely(ret < 0)) {
+		zuf_dbg_vfs("[%ld] generic_write_checks => 0x%lx\n",
+			    inode->i_ino, ret);
+		return ret;
+	}
+
+	zuf_r_lock(zii);
+
+	ret = file_remove_privs(filp);
+	if (likely(ret >= 0)) {
+		zus_inode_cmtime_now(inode, zii->zi);
+		ret = zuf_rw_write_iter(inode->i_sb, inode, kiocb, ii);
+	}
+
+	/* Either step above failing skips the size/blocks resync */
+	if (likely(ret >= 0)) {
+		if (i_size_read(inode) <= le64_to_cpu(zii->zi->i_size))
+			i_size_write(inode, le64_to_cpu(zii->zi->i_size));
+
+		inode->i_blocks = le64_to_cpu(zii->zi->i_blocks);
+	}
+
+	zuf_r_unlock(zii);
+
+	zuf_dbg_vfs("[%ld] => 0x%lx\n", inode->i_ino, ret);
+	return ret;
+}
+
+/*
+ * file_operations table: wires the zuf_* handlers of this file into the
+ * VFS; ->open stays with generic_file_open().
+ */
const struct file_operations zuf_file_operations = {
+ .llseek = zuf_llseek,
+ .read_iter = zuf_read_iter,
+ .write_iter = zuf_write_iter,
 .open = generic_file_open,
+ .flush = zuf_flush,
+ .release = zuf_file_release,
+ .fallocate = zuf_fallocate,
+ .copy_file_range = zuf_copy_file_range,
+ .remap_file_range = zuf_clone_file_range,
};
+/*
+ * inode_operations table paired with zuf_file_operations.
+ * ->fiemap is wired to tozu_fiemap(), which currently always fails.
+ */
const struct inode_operations zuf_file_inode_operations = {
 .setattr = zuf_setattr,
 .getattr = zuf_getattr,
 .update_time = zuf_update_time,
+ .fiemap = tozu_fiemap,
};
@@ -23,3 +23,15 @@ int zuf_trim_edge(struct inode *inode, ulong filepos, uint len)
{
return -EIO;
}
+
+/*
+ * zuf_rw_read_iter - placeholder for the read path.
+ * Ignores all of its arguments and always fails with -EIO.
+ */
+ssize_t zuf_rw_read_iter(struct super_block *sb, struct inode *inode,
+ struct kiocb *kiocb, struct iov_iter *ii)
+{
+ return -EIO;
+}
+
+/*
+ * zuf_rw_write_iter - placeholder for the write path.
+ * Ignores all of its arguments and always fails with -EIO.
+ */
+ssize_t zuf_rw_write_iter(struct super_block *sb, struct inode *inode,
+ struct kiocb *kiocb, struct iov_iter *ii)
+{
+ return -EIO;
+}
@@ -774,8 +774,12 @@ const char *zuf_op_name(enum e_zufs_operation op)
CASE_ENUM_NAME(ZUFS_OP_REMOVE_DENTRY );
CASE_ENUM_NAME(ZUFS_OP_RENAME );
CASE_ENUM_NAME(ZUFS_OP_READDIR );
+ CASE_ENUM_NAME(ZUFS_OP_CLONE );
+ CASE_ENUM_NAME(ZUFS_OP_COPY );
CASE_ENUM_NAME(ZUFS_OP_GET_SYMLINK );
CASE_ENUM_NAME(ZUFS_OP_SETATTR );
+ CASE_ENUM_NAME(ZUFS_OP_FALLOCATE );
+ CASE_ENUM_NAME(ZUFS_OP_LLSEEK );
CASE_ENUM_NAME(ZUFS_OP_BREAK );
default:
return "UNKNOWN";
@@ -337,9 +337,13 @@ enum e_zufs_operation {
ZUFS_OP_REMOVE_DENTRY,
ZUFS_OP_RENAME,
ZUFS_OP_READDIR,
+ ZUFS_OP_CLONE,
+ ZUFS_OP_COPY,
ZUFS_OP_GET_SYMLINK,
ZUFS_OP_SETATTR,
+ ZUFS_OP_FALLOCATE,
+ ZUFS_OP_LLSEEK,
ZUFS_OP_BREAK, /* Kernel telling Server to exit */
ZUFS_OP_MAX_OPT,
@@ -528,6 +532,47 @@ struct zufs_ioc_attr {
__u32 pad;
};
+/*
+ * Flag bits for range operations.
+ * NOTE(review): presumably carried in zufs_ioc_range.ioc_flags - confirm.
+ */
+enum ZUFS_RANGE_FLAGS {
+ ZUFS_RF_DONTNEED = 0x00000001,
+};
+
+/* ZUFS_OP_ISYNC, ZUFS_OP_FALLOCATE */
+/*
+ * Request describing a byte range of one inode.
+ * For ZUFS_OP_FALLOCATE the kernel fills zus_ii, offset, length and
+ * opflags (the fallocate mode bits).
+ */
+struct zufs_ioc_range {
+ struct zufs_ioc_hdr hdr;
+ /* IN */
+ struct zus_inode_info *zus_ii; /* server-side inode handle */
+ __u64 offset, length; /* byte range */
+ __u32 opflags; /* fallocate(2) mode for ZUFS_OP_FALLOCATE */
+ __u32 ioc_flags; /* presumably ZUFS_RANGE_FLAGS - TODO confirm */
+
+ /* OUT */
+ __u64 write_unmapped;
+};
+
+/* ZUFS_OP_CLONE */
+/*
+ * Request to clone a byte range from one inode into another.
+ * hdr.in_len and hdr.out_len are both set to the size of this struct by
+ * the kernel caller.
+ */
+struct zufs_ioc_clone {
+ struct zufs_ioc_hdr hdr;
+ /* IN */
+ struct zus_inode_info *src_zus_ii; /* server handle of the source inode */
+ struct zus_inode_info *dst_zus_ii; /* server handle of the destination */
+ __u64 pos_in, pos_out; /* start offsets in source / destination */
+ __u64 len; /* exact length in bytes */
+ /*
+ * Companion length computed by the kernel: len rounded up to a whole
+ * block when len is unaligned, 0 when the clone starts at offset 0 in
+ * both files, reaches the end of the source and the destination is
+ * not larger than the source, and equal to len otherwise.
+ */
+ __u64 len_up;
+};
+
+/* ZUFS_OP_LLSEEK */
+/*
+ * Request asking the server to resolve a SEEK_DATA / SEEK_HOLE lseek.
+ * The kernel installs the returned offset_out as the file position.
+ */
+struct zufs_ioc_seek {
+ struct zufs_ioc_hdr hdr;
+ /* IN */
+ struct zus_inode_info *zus_ii; /* server-side inode handle */
+ __u64 offset_in; /* offset the search starts from */
+ __u32 whence; /* SEEK_DATA or SEEK_HOLE */
+ __u32 pad;
+
+ /* OUT */
+ __u64 offset_out; /* resulting offset */
+};
+
/* Allocate a special_file that will be a dual-port communication buffer with
* user mode.
* Server will access the buffer via the mmap of this file.