From patchwork Thu Sep 13 09:16:21 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Luis Henriques X-Patchwork-Id: 10599043 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 2E54113B6 for ; Thu, 13 Sep 2018 09:15:35 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 1C91D2A932 for ; Thu, 13 Sep 2018 09:15:35 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 1124F2A936; Thu, 13 Sep 2018 09:15:35 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 56ADB2A932 for ; Thu, 13 Sep 2018 09:15:34 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727690AbeIMOYH (ORCPT ); Thu, 13 Sep 2018 10:24:07 -0400 Received: from mx2.suse.de ([195.135.220.15]:54342 "EHLO mx1.suse.de" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1726862AbeIMOYH (ORCPT ); Thu, 13 Sep 2018 10:24:07 -0400 X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id B452FAD17; Thu, 13 Sep 2018 09:15:30 +0000 (UTC) From: Luis Henriques To: "Yan, Zheng" , Sage Weil , Ilya Dryomov , Gregory Farnum Cc: ceph-devel@vger.kernel.org, Luis Henriques Subject: [RFC PATCH v4 3/4] ceph: support copy_file_range file operation Date: Thu, 13 Sep 2018 10:16:21 +0100 Message-Id: <20180913091622.28489-4-lhenriques@suse.com> In-Reply-To: <20180913091622.28489-1-lhenriques@suse.com> 
References: <20180913091622.28489-1-lhenriques@suse.com> Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP This commit implements support for the copy_file_range syscall in cephfs. It is implemented using the RADOS 'copy-from' operation, which allows doing a remote object copy, without the need to download/upload data from/to the OSDs. Some manual copy may however be required if the source/destination file offsets aren't object aligned or if the copy length is smaller than the object size. Signed-off-by: Luis Henriques --- fs/ceph/file.c | 272 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 271 insertions(+), 1 deletion(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 92ab20433682..cf2d6fb4735e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include @@ -1829,6 +1830,275 @@ static long ceph_fallocate(struct file *file, int mode, return ret; } +/* + * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for + * src_ci. Two attempts are made to obtain both caps, and an error is returned if + * this fails; zero is returned on success. + */ +static int get_rd_wr_caps(struct ceph_inode_info *src_ci, + loff_t src_endoff, int *src_got, + struct ceph_inode_info *dst_ci, + loff_t dst_endoff, int *dst_got) +{ + int ret = 0; + bool retrying = false; + +retry_caps: + ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + dst_endoff, dst_got, NULL); + if (ret < 0) + return ret; + + /* + * Since we're already holding the FILE_WR capability for the dst file, + * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some + * retry dance instead to try to get both capabilities. 
+ */ + ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + false, src_got); + if (ret <= 0) { + /* Start by dropping dst_ci caps and getting src_ci caps */ + ceph_put_cap_refs(dst_ci, *dst_got); + if (retrying) { + if (!ret) + /* ceph_try_get_caps masks EAGAIN */ + ret = -EAGAIN; + return ret; + } + ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, src_endoff, + src_got, NULL); + if (ret < 0) + return ret; + /*... drop src_ci caps too, and retry */ + ceph_put_cap_refs(src_ci, *src_got); + retrying = true; + goto retry_caps; + } + return ret; +} + +static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, + struct ceph_inode_info *dst_ci, int dst_got) +{ + ceph_put_cap_refs(src_ci, src_got); + ceph_put_cap_refs(dst_ci, dst_got); +} + +static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + struct ceph_inode_info *src_ci = ceph_inode(src_inode); + struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(src_inode)->client->osdc; + struct ceph_cap_flush *prealloc_cf; + struct ceph_object_locator src_oloc, dst_oloc; + loff_t endoff = 0; + loff_t size; + ssize_t ret = -EIO; + int src_got = 0; + int dst_got = 0; + int err, dirty; + + if (src_inode == dst_inode) + return -EINVAL; + if (ceph_snap(dst_inode) != CEPH_NOSNAP) + return -EROFS; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + /* Start by sync'ing the source file */ + ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); + if (ret < 0) + goto out; + + /* + * If something fails we just return -EOPNOTSUPP and fallback to the VFS + * default copy_file_range implementation. 
+ */ + ret = -EOPNOTSUPP; + + /* + * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other + * clients may have dirty data in their caches. And OSDs know nothing + * about caps, so they can't safely do the remote object copies. + */ + err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, + dst_ci, (dst_off + len), &dst_got); + if (err < 0) { + dout("get_rd_wr_caps returned %d\n", err); + goto out; + } + + size = i_size_read(src_inode); + /* + * Don't copy beyond source file EOF. Instead of simply setting length + * to (size - src_off), just drop to VFS default implementation, as the + * local i_size may be stale due to other clients writing to the source + * inode. + */ + if (src_off + len > size) { + dout("Copy beyond EOF (%llu + %ld > %llu)\n", + src_off, len, size); + goto out_caps; + } + size = i_size_read(dst_inode); + + endoff = dst_off + len; + ret = inode_newsize_ok(dst_inode, endoff); + if (ret) + goto out_caps; + + if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) { + ret = -EDQUOT; + goto out_caps; + } + + /* Drop dst file cached pages */ + ret = invalidate_inode_pages2_range(dst_inode->i_mapping, + dst_off >> PAGE_SHIFT, + endoff >> PAGE_SHIFT); + if (ret < 0) { + dout("Failed to invalidate inode pages (%ld)\n", ret); + ret = 0; /* XXX */ + } + src_oloc.pool = src_ci->i_layout.pool_id; + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); + dst_oloc.pool = dst_ci->i_layout.pool_id; + dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); + + while (len > 0) { + struct ceph_object_id src_oid, dst_oid; + u64 src_objnum, dst_objnum, objoff; + u32 objlen; + size_t copy_len = min_t(size_t, src_ci->i_layout.object_size, len); + + ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, + copy_len, &src_objnum, &objoff, + &objlen); + + /* Do manual copy if: + * - source file offset isn't object aligned, or + * - copy length is smaller than object size + */ + if (objoff || (copy_len < src_ci->i_layout.object_size)) { 
+ /* Do not copy beyond this object */ + if (copy_len > objlen) + copy_len = objlen; + /* we need to temporarily drop all caps as we'll be + * calling {read,write}_iter, which will get caps + * again. + */ + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + err = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, copy_len, flags); + if (err < 0) { + dout("do_splice_direct returned %d\n", err); + goto out; + } + len -= err; + ret += err; + err = get_rd_wr_caps(src_ci, (src_off + len), + &src_got, dst_ci, + (dst_off + len), &dst_got); + if (err < 0) { + goto out; + } + continue; + } + + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, + copy_len, &dst_objnum, &objoff, + &objlen); + /* Again... do a manual copy if: + * - destination file offset isn't object aligned, or + * - copy length is smaller than object size + * (although the object size should be the same for different + * files in the same filesystem...) + */ + if (objoff || (copy_len < dst_ci->i_layout.object_size)) { + if (copy_len > objlen) + copy_len = objlen; + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + err = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, copy_len, flags); + if (err < 0) { + dout("do_splice_direct returned %d\n", err); + goto out; + } + len -= err; + ret += err; + err = get_rd_wr_caps(src_ci, (src_off + len), + &src_got, dst_ci, + (dst_off + len), &dst_got); + if (err < 0) { + goto out; + } + continue; + } + ceph_oid_init(&src_oid); + ceph_oid_printf(&src_oid, "%llx.%08llx", + src_ci->i_vino.ino, src_objnum); + ceph_oid_init(&dst_oid); + ceph_oid_printf(&dst_oid, "%llx.%08llx", + dst_ci->i_vino.ino, dst_objnum); + /* Finally... do an object remote copy */ + err = ceph_osdc_copy_from( + osdc, src_ci->i_vino.snap, 0, /* XXX src_ci->i_version? 
*/ + &src_oid, &src_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED, + dst_ci->i_vino.snap, &dst_oid, &dst_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + ceph_oid_destroy(&src_oid); + ceph_oid_destroy(&dst_oid); + if (err) { + dout("ceph_osdc_copy_from returned %d\n", err); + goto out_caps; + } + len -= copy_len; + src_off += copy_len; + dst_off += copy_len; + ret += copy_len; + } + file_update_time(dst_file); + if (endoff > size) { + int caps_flags = CHECK_CAPS_AUTHONLY; + + /* Let the MDS know about dst file size change */ + if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff)) + caps_flags |= CHECK_CAPS_NODELAY; + if (ceph_inode_set_size(dst_inode, endoff)) + caps_flags |= CHECK_CAPS_AUTHONLY; + if (caps_flags) + ceph_check_caps(dst_ci, caps_flags, NULL); + } + /* Mark Fw dirty */ + spin_lock(&dst_ci->i_ceph_lock); + dst_ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&dst_ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(dst_inode, dirty); + +out_caps: + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); +out: + ceph_free_cap_flush(prealloc_cf); + + if (!ret) + ret = err; + + return ret; +} + const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, @@ -1844,5 +2114,5 @@ const struct file_operations ceph_file_fops = { .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, .fallocate = ceph_fallocate, + .copy_file_range = ceph_copy_file_range, }; -