[V2,3/5] Orangefs: hooks and call-outs

Message ID	1421787111-27162-4-git-send-email-root@logtruck.clemson.edu (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-fsdevel-owner@kernel.org> From: Mike Marshall <hubcap@omnibond.com> To: viro@zeniv.linux.org.uk Cc: Mike Marshall <hubcap@omnibond.com>, linux-fsdevel@vger.kernel.org Subject: [PATCH V2 3/5] Orangefs: hooks and call-outs Date: Tue, 20 Jan 2015 15:51:49 -0500 Message-Id: <1421787111-27162-4-git-send-email-root@logtruck.clemson.edu> In-Reply-To: <1421787111-27162-1-git-send-email-root@logtruck.clemson.edu> References: <1421787111-27162-1-git-send-email-root@logtruck.clemson.edu> Sender: linux-fsdevel-owner@vger.kernel.org Precedence: bulk

diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
new file mode 100644
index 0000000..af18cf1
--- /dev/null
+++ b/fs/orangefs/acl.c
@@ -0,0 +1,176 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/fs_struct.h>
+
+/*
+ * Fetch the POSIX ACL of the given type for an inode by reading the
+ * matching extended attribute and converting it to the in-memory form.
+ *
+ * Returns the ACL, NULL when no ACL is stored (attribute missing, empty,
+ * or xattrs unsupported), or an ERR_PTR() on failure.
+ */
+struct posix_acl *pvfs2_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	int ret;
+	char *key = NULL, *value = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		key = PVFS2_XATTR_NAME_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		key = PVFS2_XATTR_NAME_ACL_DEFAULT;
+		break;
+	default:
+		gossip_err("pvfs2_get_acl: bogus value of type %d\n", type);
+		return ERR_PTR(-EINVAL);
+	}
+	/*
+	 * Rather than incurring a network call just to determine the exact
+	 * length of the attribute, I just allocate a max length to save on
+	 * the network call. Conceivably, we could pass NULL to
+	 * pvfs2_inode_getxattr() to probe the length of the value, but
+	 * I don't do that for now.
+	 */
+	value = kmalloc(PVFS_MAX_XATTR_VALUELEN, GFP_KERNEL);
+	if (value == NULL) {
+		gossip_err("pvfs2_get_acl: Could not allocate value ptr\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "inode %pU, key %s, type %d\n",
+		     get_khandle_from_ino(inode),
+		     key,
+		     type);
+	ret = pvfs2_inode_getxattr(inode,
+				   "",
+				   key,
+				   value,
+				   PVFS_MAX_XATTR_VALUELEN);
+	/* if the key exists, convert it to an in-memory rep */
+	if (ret > 0) {
+		acl = posix_acl_from_xattr(&init_user_ns, value, ret);
+	} else if (ret == 0 || ret == -ENODATA || ret == -ENOSYS) {
+		/*
+		 * An empty value is "no ACL", not a failure: previously this
+		 * fell through to the error branch and logged "error 0"
+		 * before returning ERR_PTR(0), i.e. NULL, anyway.
+		 */
+		acl = NULL;
+	} else {
+		gossip_err("inode %pU retrieving acl's failed with error %d\n",
+			   get_khandle_from_ino(inode),
+			   ret);
+		acl = ERR_PTR(ret);
+	}
+	/* value was allocated above and is no longer needed */
+	kfree(value);
+	return acl;
+}
+
+/*
+ * Store (or, when acl is NULL, remove) the POSIX ACL of the given type
+ * as an extended attribute and update the cached ACL on success.
+ *
+ * For ACL_TYPE_ACCESS the mode bits of the inode are updated from the
+ * ACL first; an ACL that is fully expressible as mode bits is not stored.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int pvfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	int error = 0;
+	void *value = NULL;
+	size_t size = 0;
+	const char *name = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = PVFS2_XATTR_NAME_ACL_ACCESS;
+		if (acl) {
+			umode_t mode = inode->i_mode;
+			/*
+			 * can we represent this with the traditional file
+			 * mode permission bits?
+			 */
+			error = posix_acl_equiv_mode(acl, &mode);
+			if (error < 0) {
+				gossip_err("%s: posix_acl_equiv_mode err: %d\n",
+					   __func__,
+					   error);
+				return error;
+			}
+
+			if (inode->i_mode != mode)
+				SetModeFlag(pvfs2_inode);
+			inode->i_mode = mode;
+			mark_inode_dirty_sync(inode);
+			/* 0 means the mode bits alone describe the ACL */
+			if (error == 0)
+				acl = NULL;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = PVFS2_XATTR_NAME_ACL_DEFAULT;
+		break;
+	default:
+		gossip_err("%s: invalid type %d!\n", __func__, type);
+		return -EINVAL;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: inode %pU, key %s type %d\n",
+		     __func__, get_khandle_from_ino(inode),
+		     name,
+		     type);
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_KERNEL);
+		if (!value)
+			return -ENOMEM;
+
+		error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+		if (error < 0)
+			goto out;
+	}
+
+	/* size is a size_t, so it is printed with %zu */
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: name %s, value %p, size %zu, acl %p\n",
+		     __func__, name, value, size, acl);
+	/*
+	 * Go ahead and set the extended attribute now. NOTE: Suppose acl
+	 * was NULL, then value will be NULL and size will be 0 and that
+	 * will xlate to a removexattr. However, we don't want removexattr
+	 * to complain if the attribute does not exist.
+	 */
+	error = pvfs2_inode_setxattr(inode, "", name, value, size, 0);
+
+out:
+	kfree(value);
+	if (!error)
+		set_cached_acl(inode, type, acl);
+	return error;
+}
+
+/*
+ * Set up the ACLs of a new inode from its parent directory: apply the
+ * default and access ACLs produced by posix_acl_create() and push the
+ * resulting mode to the inode if it changed.
+ *
+ * Returns 0 on success or the first negative errno encountered.
+ */
+int pvfs2_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	struct posix_acl *default_acl, *acl;
+	umode_t mode = inode->i_mode;
+	int error = 0;
+
+	ClearModeFlag(pvfs2_inode);
+
+	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (error)
+		return error;
+
+	if (default_acl) {
+		error = pvfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		posix_acl_release(default_acl);
+	}
+
+	if (acl) {
+		/* always drop the reference, but skip the set after an error */
+		if (!error)
+			error = pvfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		posix_acl_release(acl);
+	}
+
+	/* If mode of the inode was changed, then do a forcible ->setattr */
+	if (mode != inode->i_mode) {
+		SetModeFlag(pvfs2_inode);
+		inode->i_mode = mode;
+		pvfs2_flush_inode(inode);
+	}
+
+	return error;
+}
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
new file mode 100644
index 0000000..91d1e62
--- /dev/null
+++ b/fs/orangefs/dir.c
@@ -0,0 +1,395 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/* Per-call state of one decoded readdir reply. */
+struct readdir_handle_t {
+	int buffer_index;
+	struct pvfs2_readdir_response readdir_response;
+	void *dents_buf;
+};
+
+/*
+ * decode routine needed by kmod to make sense of the shared page for readdirs.
+ */
+static long decode_dirents(char *ptr, struct pvfs2_readdir_response *readdir)
+{
+	int i;
+	struct pvfs2_readdir_response *rd =
+		(struct pvfs2_readdir_response *) ptr;
+	char *buf = ptr;
+	char **pptr = &buf;
+
+	readdir->token = rd->token;
+	readdir->pvfs_dirent_outcount = rd->pvfs_dirent_outcount;
+	/*
+	 * The count comes from the reply buffer; kcalloc() checks the
+	 * count * size multiplication for overflow, which an open-coded
+	 * kmalloc(count * size) does not.
+	 */
+	readdir->dirent_array = kcalloc(readdir->pvfs_dirent_outcount,
+					sizeof(*readdir->dirent_array),
+					GFP_KERNEL);
+	if (readdir->dirent_array == NULL)
+		return -ENOMEM;
+	*pptr += offsetof(struct pvfs2_readdir_response, dirent_array);
+	for (i = 0; i < readdir->pvfs_dirent_outcount; i++) {
+		dec_string(pptr, &readdir->dirent_array[i].d_name,
+			   &readdir->dirent_array[i].d_length);
+		readdir->dirent_array[i].khandle =
+			*(struct pvfs2_khandle *) *pptr;
+		/* NOTE(review): 16 is presumably sizeof(khandle) - confirm */
+		*pptr += 16;
+	}
+	/* number of bytes consumed from the buffer */
+	return (unsigned long)*pptr - (unsigned long)ptr;
+}
+
+/*
+ * Bind a reply buffer and its buffer index to rhandle and decode the
+ * directory entries. Returns the number of bytes decoded or a negative
+ * errno; on decode failure the buffer is vfree()d here.
+ */
+static long readdir_handle_ctor(struct readdir_handle_t *rhandle, void *buf,
+				int buffer_index)
+{
+	long ret;
+
+	if (buf == NULL) {
+		gossip_err
+		    ("Invalid NULL buffer specified in readdir_handle_ctor\n");
+		return -ENOMEM;
+	}
+	if (buffer_index < 0) {
+		gossip_err
+		    ("Invalid buffer index specified in readdir_handle_ctor\n");
+		return -EINVAL;
+	}
+	rhandle->buffer_index = buffer_index;
+	rhandle->dents_buf = buf;
+	ret = decode_dirents(buf, &rhandle->readdir_response);
+	if (ret < 0) {
+		gossip_err("Could not decode readdir from buffer %ld\n", ret);
+		rhandle->buffer_index = -1;
+		gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", buf);
+		vfree(buf);
+		rhandle->dents_buf = NULL;
+	}
+	return ret;
+}
+
+/*
+ * Release everything a readdir handle owns: the decoded entry array,
+ * the buffer index and the reply buffer.
+ */
+static void readdir_handle_dtor(struct pvfs2_bufmap *bufmap,
+				struct readdir_handle_t *rhandle)
+{
+	if (rhandle == NULL)
+		return;
+
+	/* kfree(NULL) is safe */
+	kfree(rhandle->readdir_response.dirent_array);
+	rhandle->readdir_response.dirent_array = NULL;
+
+	if (rhandle->buffer_index >= 0) {
+		readdir_index_put(bufmap, rhandle->buffer_index);
+		rhandle->buffer_index = -1;
+	}
+	if (rhandle->dents_buf) {
+		gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n",
+			     rhandle->dents_buf);
+		vfree(rhandle->dents_buf);
+		rhandle->dents_buf = NULL;
+	}
+	return;
+}
+
+/*
+ * Read directory entries from an instance of an open directory.
+ *
+ * \note This routine was converted for the readdir to iterate change
+ *       in "struct file_operations". "converted" mostly amounts to
+ *       changing occurrences of "readdir" and "filldir" in the
+ *       comments to "iterate" and "dir_emit". Also filldir calls
+ *       were changed to dir_emit calls.
+ *
+ * \param ctx directory context; dir_emit() is called on it for each entry.
+ *
+ * \retval <0 on error
+ * \retval 0 otherwise, including when dir_emit() stopped us early
+ *
+ * \note dir_emit() returns a bool: true when the entry was accepted and
+ *       false when the caller has had enough. It never returns a
+ *       negative value, so it must not be tested with "< 0".
+ */
+static int pvfs2_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct pvfs2_bufmap *bufmap = NULL;
+	int ret = 0;
+	int buffer_index;
+	uint64_t *ptoken = file->private_data;
+	uint64_t pos = 0;
+	ino_t ino = 0;
+	struct dentry *dentry = file->f_path.dentry;
+	struct pvfs2_kernel_op *new_op = NULL;
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(dentry->d_inode);
+	int buffer_full = 0;
+	struct readdir_handle_t rhandle;
+	int i = 0;
+	int len = 0;
+	ino_t current_ino = 0;
+	char *current_entry = NULL;
+	long bytes_decoded;
+
+	gossip_ldebug(GOSSIP_DIR_DEBUG,
+		      "%s: ctx->pos:%lld, token = %llu\n",
+		      __func__,
+		      lld(ctx->pos),
+		      llu(*ptoken));
+
+	pos = (uint64_t) ctx->pos;
+
+	/* are we done? */
+	if (pos == PVFS_READDIR_END) {
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "Skipping to termination path\n");
+		return 0;
+	}
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "pvfs2_readdir called on %s (pos=%llu)\n",
+		     dentry->d_name.name, llu(pos));
+
+	rhandle.buffer_index = -1;
+	rhandle.dents_buf = NULL;
+	memset(&rhandle.readdir_response, 0, sizeof(rhandle.readdir_response));
+
+	new_op = op_alloc(PVFS2_VFS_OP_READDIR);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->uses_shared_memory = 1;
+	new_op->upcall.req.readdir.refn = pvfs2_inode->refn;
+	new_op->upcall.req.readdir.max_dirent_count = MAX_DIRENT_COUNT_READDIR;
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "%s: upcall.req.readdir.refn.khandle: %pU\n",
+		     __func__,
+		     &new_op->upcall.req.readdir.refn.khandle);
+
+	/*
+	 * NOTE: the position we send to the readdir upcall is out of
+	 * sync with ctx->pos since:
+	 * 1. pvfs2 doesn't include the "." and ".." entries that are
+	 *    added below.
+	 * 2. the introduction of distributed directory logic makes token no
+	 *    longer be related to f_pos and pos. Instead an independent
+	 *    variable is used inside the function and stored in the
+	 *    private_data of the file structure.
+	 */
+	new_op->upcall.req.readdir.token = *ptoken;
+
+get_new_buffer_index:
+	ret = readdir_index_get(&bufmap, &buffer_index);
+	if (ret < 0) {
+		gossip_lerr("pvfs2_readdir: readdir_index_get() failure (%d)\n",
+			    ret);
+		goto out_free_op;
+	}
+	new_op->upcall.req.readdir.buf_index = buffer_index;
+
+	ret = service_operation(new_op,
+				"pvfs2_readdir",
+				get_interruptible_flag(dentry->d_inode));
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "Readdir downcall status is %d.  ret:%d\n",
+		     new_op->downcall.status,
+		     ret);
+
+	if (ret == -EAGAIN && op_state_purged(new_op)) {
+		/*
+		 * readdir shared memory area has been wiped due to
+		 * pvfs2-client-core restarting, so we must get a new
+		 * index into the shared memory.
+		 */
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			"%s: Getting new buffer_index for retry of readdir..\n",
+			 __func__);
+		readdir_index_put(bufmap, buffer_index);
+		goto get_new_buffer_index;
+	}
+
+	if (ret == -EIO && op_state_purged(new_op)) {
+		gossip_err("%s: Client is down. Aborting readdir call.\n",
+			__func__);
+		readdir_index_put(bufmap, buffer_index);
+		goto out_free_op;
+	}
+
+	if (ret < 0 || new_op->downcall.status != 0) {
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "Readdir request failed.  Status:%d\n",
+			     new_op->downcall.status);
+		readdir_index_put(bufmap, buffer_index);
+		if (ret >= 0)
+			ret = new_op->downcall.status;
+		goto out_free_op;
+	}
+
+	bytes_decoded =
+		readdir_handle_ctor(&rhandle,
+				    new_op->downcall.trailer_buf,
+				    buffer_index);
+	if (bytes_decoded < 0) {
+		/* record the decode error first so the log shows it */
+		ret = bytes_decoded;
+		gossip_err("pvfs2_readdir: Could not decode trailer buffer into a readdir response %d\n",
+			ret);
+		readdir_index_put(bufmap, buffer_index);
+		goto out_free_op;
+	}
+
+	if (bytes_decoded != new_op->downcall.trailer_size) {
+		gossip_err("pvfs2_readdir: # bytes decoded (%ld) != trailer size (%ld)\n",
+			bytes_decoded,
+			(long)new_op->downcall.trailer_size);
+		ret = -EINVAL;
+		goto out_destroy_handle;
+	}
+
+	if (pos == 0) {
+		ino = get_ino_from_khandle(dentry->d_inode);
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: calling dir_emit of \".\" with pos = %llu\n",
+			     __func__,
+			     llu(pos));
+		/* false means the caller's buffer is full: stop, no error */
+		if (!dir_emit(ctx, ".", 1, ino, DT_DIR)) {
+			ret = 0;
+			goto out_destroy_handle;
+		}
+		ctx->pos++;
+		gossip_ldebug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+		pos++;
+	}
+
+	if (pos == 1) {
+		ino = get_parent_ino_from_dentry(dentry);
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: calling dir_emit of \"..\" with pos = %llu\n",
+			     __func__,
+			     llu(pos));
+		if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) {
+			ret = 0;
+			goto out_destroy_handle;
+		}
+		ctx->pos++;
+		gossip_ldebug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+		pos++;
+	}
+
+	for (i = 0; i < rhandle.readdir_response.pvfs_dirent_outcount; i++) {
+		bool emitted;
+
+		len = rhandle.readdir_response.dirent_array[i].d_length;
+		current_entry = rhandle.readdir_response.dirent_array[i].d_name;
+		current_ino = pvfs2_khandle_to_ino(
+			&(rhandle.readdir_response.dirent_array[i].khandle));
+
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "calling dir_emit for %s with len %d, pos %ld\n",
+			     current_entry,
+			     len,
+			     (unsigned long)pos);
+		emitted =
+		    dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
+		if (!emitted) {
+			gossip_debug(GOSSIP_DIR_DEBUG,
+				     "dir_emit() failed. ret:%d\n",
+				     (int)emitted);
+			if (i < 2) {
+				gossip_err("dir_emit failed on one of the first two true PVFS directory entries.\n");
+				gossip_err("Duplicate entries may appear.\n");
+			}
+			buffer_full = 1;
+			break;
+		}
+		ctx->pos++;
+		gossip_ldebug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+
+		pos++;
+	}
+
+	/* this means that all of the dir_emit calls succeeded */
+	if (i == rhandle.readdir_response.pvfs_dirent_outcount) {
+		/* update token */
+		*ptoken = rhandle.readdir_response.token;
+	} else {
+		/*
+		 * this means a dir_emit call failed
+		 *
+		 * NOTE(review): with the old "ret < 0" test this branch was
+		 * never taken. The rewind arithmetic below is kept as
+		 * written and should be re-checked now that it can run.
+		 */
+		if (rhandle.readdir_response.token == PVFS_READDIR_END) {
+			/*
+			 * If PVFS hit end of directory, then there
+			 * is no way to do math on the token that it
+			 * returned. Instead we go by ctx->pos but
+			 * back up to account for the artificial .
+			 * and .. entries.
+			 */
+			ctx->pos -= 3;
+		} else {
+			/*
+			 * this means a dir_emit call failed. !!! need to set
+			 * back to previous ctx->pos, no middle value allowed
+			 */
+			pos -= (i - 1);
+			ctx->pos -= (i - 1);
+		}
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			"at least one dir_emit call failed. Setting ctx->pos to: %lld\n",
+			lld(ctx->pos));
+	}
+
+	/*
+	 * Did we hit the end of the directory?
+	 */
+	if (rhandle.readdir_response.token == PVFS_READDIR_END &&
+	    !buffer_full) {
+		gossip_debug(GOSSIP_DIR_DEBUG, "End of dir detected; setting ctx->pos to PVFS_READDIR_END.\n");
+		ctx->pos = PVFS_READDIR_END;
+	}
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "pos = %llu, token = %llu"
+		     ", ctx->pos should have been %lld\n",
+		     llu(pos),
+		     llu(*ptoken),
+		     lld(ctx->pos));
+
+	/* a full caller buffer is not an error */
+	ret = 0;
+
+out_destroy_handle:
+	readdir_handle_dtor(bufmap, &rhandle);
+out_free_op:
+	op_release(new_op);
+	gossip_debug(GOSSIP_DIR_DEBUG, "pvfs2_readdir returning %d\n", ret);
+	return ret;
+}
+
+/* Allocate the per-open readdir token and start it at the beginning. */
+static int pvfs2_dir_open(struct inode *inode, struct file *file)
+{
+	uint64_t *ptoken;
+
+	file->private_data = kmalloc(sizeof(uint64_t), GFP_KERNEL);
+	if (!file->private_data)
+		return -ENOMEM;
+
+	ptoken = file->private_data;
+	*ptoken = PVFS_READDIR_START;
+	return 0;
+}
+
+/* Flush the inode and free the readdir token allocated at open. */
+static int pvfs2_dir_release(struct inode *inode, struct file *file)
+{
+	pvfs2_flush_inode(inode);
+	kfree(file->private_data);
+	return 0;
+}
+
+/** PVFS2 implementation of VFS directory operations */
+const struct file_operations pvfs2_dir_operations = {
+	.read = generic_read_dir,
+	.iterate = pvfs2_readdir,
+	.open = pvfs2_dir_open,
+	.release = pvfs2_dir_release,
+};
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
new file mode 100644
index 0000000..5da2a20
--- /dev/null
+++ b/fs/orangefs/inode.c
@@ -0,0 +1,468 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Linux VFS inode operations.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * Fill one page with file data read through pvfs2_inode_read(), zero
+ * whatever was not read, update the page flags and unlock the page.
+ *
+ * Returns 0 on success or the negative value returned by the read.
+ */
+static int read_one_page(struct page *page)
+{
+	void *page_data;
+	int ret;
+	int max_block;
+	ssize_t bytes_read = 0;
+	struct inode *inode = page->mapping->host;
+	const uint32_t blocksize = PAGE_CACHE_SIZE;	/* inode->i_blksize */
+	const uint32_t blockbits = PAGE_CACHE_SHIFT;	/* inode->i_blkbits */
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		    "pvfs2_readpage called with page %p\n",
+		     page);
+	page_data = pvfs2_kmap(page);
+
+	max_block = ((inode->i_size / blocksize) + 1);
+
+	/* pages at or beyond max_block are not read, only zero-filled */
+	if (page->index < max_block) {
+		loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
+		bytes_read = pvfs2_inode_read(inode,
+					      page_data,
+					      blocksize,
+					      &blockptr_offset,
+					      inode->i_size);
+	}
+	/* only zero remaining unread portions of the page data */
+	if (bytes_read > 0)
+		memset(page_data + bytes_read, 0, blocksize - bytes_read);
+	else
+		memset(page_data, 0, blocksize);
+	/* takes care of potential aliasing */
+	flush_dcache_page(page);
+	if (bytes_read < 0) {
+		ret = bytes_read;
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+		ret = 0;
+	}
+	pvfs2_kunmap(page);
+	/* unlock the page after the ->readpage() routine completes */
+	unlock_page(page);
+	return ret;
+}
+
+/* ->readpage: read a single page. */
+static int pvfs2_readpage(struct file *file, struct page *page)
+{
+	return read_one_page(page);
+}
+
+/*
+ * ->readpages: take each page off the list, add it to the page cache
+ * and read it. Pages that cannot be added are released. Always
+ * returns 0; per-page read errors are only logged.
+ */
+static int pvfs2_readpages(struct file *file,
+			   struct address_space *mapping,
+			   struct list_head *pages,
+			   unsigned nr_pages)
+{
+	int page_idx;
+	int ret;
+
+	gossip_debug(GOSSIP_INODE_DEBUG, "pvfs2_readpages called\n");
+
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page;
+		page = list_entry(pages->prev, struct page, lru);
+		list_del(&page->lru);
+		if (!add_to_page_cache(page,
+				       mapping,
+				       page->index,
+				       GFP_KERNEL)) {
+			/*
+			 * add_to_page_cache() returned 0, so the page was
+			 * added; the old message wrongly called this a
+			 * failure.
+			 */
+			ret = read_one_page(page);
+			gossip_debug(GOSSIP_INODE_DEBUG,
+				"page added to cache, read_one_page returned: %d\n",
+				ret);
+		} else {
+			page_cache_release(page);
+		}
+	}
+	BUG_ON(!list_empty(pages));
+	return 0;
+}
+
+/* ->invalidatepage: mark the page neither up to date nor mapped to disk. */
+static void pvfs2_invalidatepage(struct page *page,
+				 unsigned int offset,
+				 unsigned int length)
+{
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_invalidatepage called on page %p "
+		     "(offset is %u)\n",
+		     page,
+		     offset);
+
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+	return;
+
+}
+
+/* ->releasepage: only logs the call and returns 0. */
+static int pvfs2_releasepage(struct page *page, gfp_t foo)
+{
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_releasepage called on page %p\n",
+		     page);
+	return 0;
+}
+
+/*
+ * Having a direct_IO entry point in the address_space_operations
+ * struct causes the kernel to allow us to use O_DIRECT on
+ * open. Nothing will ever call this thing, but in the future we
+ * will need to be able to use O_DIRECT on open in order to support
+ * AIO. Modeled after NFS, they do this too.
+ */
+/*
+static ssize_t pvfs2_direct_IO(int rw,
+			struct kiocb *iocb,
+			struct iov_iter *iter,
+			loff_t offset)
+{
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_direct_IO: %s\n",
+		     iocb->ki_filp->f_path.dentry->d_name.name);
+
+	return -EINVAL;
+}
+*/
+
+struct backing_dev_info pvfs2_backing_dev_info = {
+	.name = "pvfs2",
+	.ra_pages = 0,
+	.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+/** PVFS2 implementation of address space operations */
+const struct address_space_operations pvfs2_address_operations = {
+	.readpage = pvfs2_readpage,
+	.readpages = pvfs2_readpages,
+	.invalidatepage = pvfs2_invalidatepage,
+	.releasepage = pvfs2_releasepage,
+/*	.direct_IO = pvfs2_direct_IO */
+};
+
+/*
+ * Change the size of a file: update the in-core inode, then send a
+ * truncate request and, when the size really changed, ask for a
+ * c/mtime update.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int pvfs2_setattr_size(struct inode *inode, struct iattr *iattr)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	struct pvfs2_kernel_op *new_op;
+	loff_t orig_size = i_size_read(inode);
+	int ret = -EINVAL;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
+		     __func__,
+		     get_khandle_from_ino(inode),
+		     &pvfs2_inode->refn.khandle,
+		     pvfs2_inode->refn.fs_id,
+		     iattr->ia_size);
+
+	/*
+	 * Allocate the op before touching the inode: if the allocation
+	 * fails, the local size must not already have been changed with
+	 * no truncate request ever sent.
+	 */
+	new_op = op_alloc(PVFS2_VFS_OP_TRUNCATE);
+	if (!new_op)
+		return -ENOMEM;
+
+	truncate_setsize(inode, iattr->ia_size);
+
+	new_op->upcall.req.truncate.refn = pvfs2_inode->refn;
+	new_op->upcall.req.truncate.size = (int64_t) iattr->ia_size;
+
+	ret = service_operation(new_op, __func__,
+				get_interruptible_flag(inode));
+
+	/*
+	 * the truncate has no downcall members to retrieve, but
+	 * the status value tells us if it went through ok or not
+	 */
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2: pvfs2_truncate got return value of %d\n",
+		     ret);
+
+	op_release(new_op);
+
+	if (ret != 0)
+		return ret;
+
+	/*
+	 * Only change the c/mtime if we are changing the size or we are
+	 * explicitly asked to change it.  This handles the semantic difference
+	 * between truncate() and ftruncate() as implemented in the VFS.
+	 *
+	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+	 * special case where we need to update the times despite not having
+	 * these flags set.  For all other operations the VFS set these flags
+	 * explicitly if it wants a timestamp update.
+	 */
+	if (orig_size != i_size_read(inode) &&
+	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
+		iattr->ia_ctime = iattr->ia_mtime =
+			current_fs_time(inode->i_sb);
+		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+	}
+
+	return ret;
+}
+
+/*
+ * Change attributes of an object referenced by dentry.
+ */
+int pvfs2_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	int ret = -EINVAL;
+	struct inode *inode = dentry->d_inode;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_setattr: called on %s\n",
+		     dentry->d_name.name);
+
+	/* let the VFS check that this change is permitted */
+	ret = inode_change_ok(inode, iattr);
+	if (ret)
+		goto out;
+
+	/* a size change goes through the truncate path first */
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		ret = pvfs2_setattr_size(inode, iattr);
+		if (ret)
+			goto out;
+	}
+
+	/* update the in-core inode, then send the attributes out */
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	ret = pvfs2_inode_setattr(inode, iattr);
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_setattr: inode_setattr returned %d\n",
+		     ret);
+
+	if (!ret && (iattr->ia_valid & ATTR_MODE))
+		/* change mod on a file that has ACLs */
+		ret = posix_acl_chmod(inode, inode->i_mode);
+
+out:
+	gossip_debug(GOSSIP_INODE_DEBUG, "pvfs2_setattr: returning %d\n", ret);
+	return ret;
+}
+
+/*
+ * Obtain attributes of an object given a dentry
+ *
+ * Returns 0 on success; on failure the inode is marked bad and the
+ * error from pvfs2_inode_getattr() is returned.
+ */
+int pvfs2_getattr(struct vfsmount *mnt,
+		  struct dentry *dentry,
+		  struct kstat *kstat)
+{
+	int ret = -ENOENT;
+	struct inode *inode = dentry->d_inode;
+	struct pvfs2_inode_s *pvfs2_inode = NULL;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_getattr: called on %s\n",
+		     dentry->d_name.name);
+
+	/*
+	 * A getattr expects that all fields/attributes of the inode are
+	 * refreshed, so we don't have much of a choice but to refresh
+	 * all the attributes.
+	 */
+	ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	if (ret == 0) {
+		generic_fillattr(inode, kstat);
+		/* override block size reported to stat */
+		pvfs2_inode = PVFS2_I(inode);
+		kstat->blksize = pvfs2_inode->blksize;
+	} else {
+		/* assume an I/O error and flag inode as bad */
+		gossip_debug(GOSSIP_INODE_DEBUG,
+			     "%s:%s:%d calling make bad inode\n",
+			     __FILE__,
+			     __func__,
+			     __LINE__);
+		pvfs2_make_bad_inode(inode);
+	}
+	return ret;
+}
+
+/* PVFS2 implementation of VFS inode operations for files */
+struct inode_operations pvfs2_file_inode_operations = {
+	.get_acl = pvfs2_get_acl,
+	.set_acl = pvfs2_set_acl,
+	.setattr = pvfs2_setattr,
+	.getattr = pvfs2_getattr,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.listxattr = pvfs2_listxattr,
+	.removexattr = generic_removexattr,
+};
+
+/*
+ * Install the address-space, inode and file operations that match the
+ * file type in inode->i_mode. Returns 0, or -EINVAL for a type other
+ * than regular file, symlink or directory.
+ */
+static int pvfs2_init_iops(struct inode *inode)
+{
+	inode->i_mapping->a_ops = &pvfs2_address_operations;
+	inode->i_mapping->backing_dev_info = &pvfs2_backing_dev_info;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &pvfs2_file_inode_operations;
+		inode->i_fop = &pvfs2_file_operations;
+		inode->i_blkbits = PAGE_CACHE_SHIFT;
+		break;
+	case S_IFLNK:
+		inode->i_op = &pvfs2_symlink_inode_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op = &pvfs2_dir_inode_operations;
+		inode->i_fop = &pvfs2_dir_operations;
+		break;
+	default:
+		gossip_debug(GOSSIP_INODE_DEBUG,
+			     "%s: unsupported mode\n",
+			     __func__);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Given a PVFS2 object identifier (fsid, handle), convert it into a ino_t type
+ * that will be used as a hash-index from where the handle will
+ * be searched for in the VFS hash table of inodes.
+ *
+ * Returns 0 when ref is NULL.
+ */
+static inline ino_t pvfs2_handle_hash(PVFS_object_kref *ref)
+{
+	if (!ref)
+		return 0;
+	return pvfs2_khandle_to_ino(&(ref->khandle));
+}
+
+/*
+ * Called to set up an inode from iget5_locked.
+ */
+static int pvfs2_set_inode(struct inode *inode, void *data)
+{
+	PVFS_object_kref *ref = (PVFS_object_kref *) data;
+	struct pvfs2_inode_s *pvfs2_inode = NULL;
+
+	/* Make sure that we have sane parameters */
+	if (!data || !inode)
+		return 0;
+	pvfs2_inode = PVFS2_I(inode);
+	if (!pvfs2_inode)
+		return 0;
+	/* remember which object (fs_id, khandle) this inode stands for */
+	pvfs2_inode->refn.fs_id = ref->fs_id;
+	pvfs2_inode->refn.khandle = ref->khandle;
+	return 0;
+}
+
+/*
+ * Called to determine if handles match.
+ *
+ * Returns non-zero when both the khandle and the fs_id of the inode
+ * equal those in data.
+ */
+static int pvfs2_test_inode(struct inode *inode, void *data)
+{
+	PVFS_object_kref *ref = (PVFS_object_kref *) data;
+	struct pvfs2_inode_s *pvfs2_inode = NULL;
+
+	pvfs2_inode = PVFS2_I(inode);
+	return (!PVFS_khandle_cmp(&(pvfs2_inode->refn.khandle), &(ref->khandle))
+		&& pvfs2_inode->refn.fs_id == ref->fs_id);
+}
+
+/*
+ * Front-end to lookup the inode-cache maintained by the VFS using the PVFS2
+ * file handle.
+ *
+ * @sb: the file system super block instance.
+ * @ref: The PVFS2 object for which we are trying to locate an inode structure.
+ *
+ * Returns the inode, or an ERR_PTR() on failure; never NULL. Callers in
+ * this patch (pvfs2_lookup) only test IS_ERR(), so an allocation
+ * failure is reported as ERR_PTR(-ENOMEM) rather than NULL.
+ */
+struct inode *pvfs2_iget(struct super_block *sb, PVFS_object_kref *ref)
+{
+	struct inode *inode = NULL;
+	unsigned long hash;
+	int error;
+
+	hash = pvfs2_handle_hash(ref);
+	inode = iget5_locked(sb, hash, pvfs2_test_inode, pvfs2_set_inode, ref);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	/* already set up: nothing more to do */
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	error = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	if (error) {
+		iget_failed(inode);
+		return ERR_PTR(error);
+	}
+
+	inode->i_ino = hash;	/* needed for stat etc */
+	/* do not hand out an inode whose file type we cannot serve */
+	error = pvfs2_init_iops(inode);
+	if (error) {
+		iget_failed(inode);
+		return ERR_PTR(error);
+	}
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "iget handle %pU, fsid %d hash %lu i_ino %lu\n",
+		     &ref->khandle,
+		     ref->fs_id,
+		     hash,
+		     inode->i_ino);
+
+	return inode;
+}
+
+/*
+ * Allocate an inode for a newly created file and insert it into the inode hash.
+ *
+ * Returns the new, still locked inode, or an ERR_PTR() on failure;
+ * never NULL. pvfs2_create() only tests IS_ERR() on the result, so a
+ * NULL return would reach d_instantiate() and unlock_new_inode().
+ */
+struct inode *pvfs2_new_inode(struct super_block *sb, struct inode *dir,
+		int mode, dev_t dev, PVFS_object_kref *ref)
+{
+	unsigned long hash = pvfs2_handle_hash(ref);
+	struct inode *inode;
+	int error;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_get_custom_inode_common: called\n"
+		     "(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
+		     sb,
+		     MAJOR(dev),
+		     MINOR(dev),
+		     mode);
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	pvfs2_set_inode(inode, ref);
+	inode->i_ino = hash;	/* needed for stat etc */
+
+	error = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	if (error)
+		goto out_iput;
+
+	/*
+	 * NOTE(review): the return values of pvfs2_init_iops() here and of
+	 * pvfs2_init_acl() below are ignored, as in the original.
+	 */
+	pvfs2_init_iops(inode);
+
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_size = PAGE_CACHE_SIZE;
+	inode->i_rdev = dev;
+
+	error = insert_inode_locked4(inode, hash, pvfs2_test_inode, ref);
+	if (error < 0)
+		goto out_iput;
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "Initializing ACL's for inode %pU\n",
+		     get_khandle_from_ino(inode));
+	pvfs2_init_acl(inode, dir);
+	return inode;
+
+out_iput:
+	iput(inode);
+	return ERR_PTR(error);
+}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
new file mode 100644
index 0000000..8f69dd2
--- /dev/null
+++ b/fs/orangefs/namei.c
@@ -0,0 +1,473 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Linux VFS namei operations.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+
+/*
+ * Get a newly allocated inode to go with a negative dentry.
+ *
+ * Sends a create request for dentry's name under dir, builds the inode
+ * for the handle that comes back and instantiates the dentry with it.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int pvfs2_create(struct inode *dir,
+			struct dentry *dentry,
+			umode_t mode,
+			bool exclusive)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op *new_op;
+	struct inode *inode;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
+
+	/*
+	 * Same limit as pvfs2_lookup(): strncpy() below does not
+	 * NUL-terminate a name of PVFS2_NAME_LEN or more bytes.
+	 */
+	if (dentry->d_name.len > (PVFS2_NAME_LEN - 1))
+		return -ENAMETOOLONG;
+
+	new_op = op_alloc(PVFS2_VFS_OP_CREATE);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.create.parent_refn = parent->refn;
+
+	fill_default_sys_attrs(new_op->upcall.req.create.attributes,
+			       PVFS_TYPE_METAFILE, mode);
+
+	strncpy(new_op->upcall.req.create.d_name,
+		dentry->d_name.name, PVFS2_NAME_LEN);
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Create Got PVFS2 handle %pU on fsid %d (ret=%d)\n",
+		     &new_op->downcall.resp.create.refn.khandle,
+		     new_op->downcall.resp.create.refn.fs_id, ret);
+
+	if (ret < 0) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "%s: failed with error code %d\n",
+			     __func__, ret);
+		goto out;
+	}
+
+	inode = pvfs2_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
+				&new_op->downcall.resp.create.refn);
+	if (IS_ERR(inode)) {
+		gossip_err("*** Failed to allocate pvfs2 file inode\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Assigned file inode new number of %pU\n",
+		     get_khandle_from_ino(inode));
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Inode (Regular File) %pU -> %s\n",
+		     get_khandle_from_ino(inode),
+		     dentry->d_name.name);
+
+	/* the parent directory changed: refresh and flag its times */
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+	ret = 0;
+out:
+	op_release(new_op);
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: returning %d\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * Attempt to resolve an object name (dentry->d_name), parent handle, and
+ * fsid into a handle for the object.
+ */
+static struct dentry *pvfs2_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op *new_op;
+	struct inode *inode;
+	struct dentry *res;
+	int ret = -EINVAL;
+
+	/*
+	 * In theory we could skip the lookup here (if the intent is to
+	 * create) in order to avoid a potentially failed lookup, but
+	 * leaving it out can skip a valid lookup and try to create a file
+	 * that already exists (e.g. the vfs already handles checking for
+	 * -EEXIST on O_EXCL opens, which is broken if we skip this lookup
+	 * in the create path).
+	 */
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n",
+		     __func__, dentry->d_name.name);
+
+	/* Reject names that would not fit, NUL included, in the upcall. */
+	if (dentry->d_name.len > (PVFS2_NAME_LEN - 1))
+		return ERR_PTR(-ENAMETOOLONG);
+
+	new_op = op_alloc(PVFS2_VFS_OP_LOOKUP);
+	if (!new_op)
+		return ERR_PTR(-ENOMEM);
+
+	new_op->upcall.req.lookup.sym_follow = flags & LOOKUP_FOLLOW;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     &parent->refn.khandle);
+	new_op->upcall.req.lookup.parent_refn = parent->refn;
+
+	/*
+	 * The length check above guarantees the name is shorter than
+	 * PVFS2_NAME_LEN, so strncpy() also writes the terminating NUL.
+	 */
+	strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: doing lookup on %s under %pU,%d (follow=%s)\n",
+		     __func__,
+		     new_op->upcall.req.lookup.d_name,
+		     &new_op->upcall.req.lookup.parent_refn.khandle,
+		     new_op->upcall.req.lookup.parent_refn.fs_id,
+		     ((new_op->upcall.req.lookup.sym_follow ==
+		       PVFS2_LOOKUP_LINK_FOLLOW) ? "yes" : "no"));
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Lookup Got %pU, fsid %d (ret=%d)\n",
+		     &new_op->downcall.resp.lookup.refn.khandle,
+		     new_op->downcall.resp.lookup.refn.fs_id,
+		     ret);
+
+	if (ret < 0) {
+		if (ret == -ENOENT) {
+			/*
+			 * if no inode was found, add a negative dentry to
+			 * dcache anyway; if we don't, we don't hold expected
+			 * lookup semantics and we most noticeably break
+			 * during directory renames.
+			 *
+			 * however, if the operation failed or exited, do not
+			 * add the dentry (e.g. in the case that a touch is
+			 * issued on a file that already exists that was
+			 * interrupted during this lookup -- no need to add
+			 * another negative dentry for an existing file)
+			 */
+
+			gossip_debug(GOSSIP_NAME_DEBUG,
+				     "pvfs2_lookup: Adding *negative* dentry "
+				     "%p for %s\n",
+				     dentry,
+				     dentry->d_name.name);
+
+			d_add(dentry, NULL);
+			res = NULL;
+			goto out;
+		}
+
+		/* must be a non-recoverable error */
+		res = ERR_PTR(ret);
+		goto out;
+	}
+
+	inode = pvfs2_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+	if (IS_ERR(inode)) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			     "error %ld from iget\n", PTR_ERR(inode));
+		res = ERR_CAST(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s:%s:%d "
+		     "Found good inode [%lu] with count [%d]\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     inode->i_ino,
+		     (int)atomic_read(&inode->i_count));
+
+	/* update dentry/inode pair into dcache */
+	res = d_splice_alias(inode, dentry);
+
+	/*
+	 * d_splice_alias() takes over our inode reference and drops it when
+	 * it fails, so the inode must not be touched again on that path.
+	 */
+	if (!IS_ERR(res)) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			     "Lookup success (inode ct = %d)\n",
+			     (int)atomic_read(&inode->i_count));
+	}
+out:
+	op_release(new_op);
+	return res;
+}
+
+/*
+ * Ask the server to remove the entry named by @dentry from directory @dir.
+ * This routine also serves as the .rmdir method of the directory inode
+ * operations.  On success the victim's link count is dropped and the
+ * parent's mtime/ctime are refreshed.
+ *
+ * Returns 0 on success; non-zero otherwise.
+ */
+static int pvfs2_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op *new_op;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: called on %s\n"
+		     " (inode %pU): Parent is %pU | fs_id %d\n",
+		     __func__,
+		     dentry->d_name.name,
+		     get_khandle_from_ino(inode),
+		     &parent->refn.khandle,
+		     parent->refn.fs_id);
+
+	new_op = op_alloc(PVFS2_VFS_OP_REMOVE);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.remove.parent_refn = parent->refn;
+	strncpy(new_op->upcall.req.remove.d_name, dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	ret = service_operation(new_op, "pvfs2_unlink",
+				get_interruptible_flag(inode));
+
+	/* the op is no longer needed whether or not the request succeeded */
+	op_release(new_op);
+
+	if (!ret) {
+		drop_nlink(inode);
+
+		SetMtimeFlag(parent);
+		dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+		mark_inode_dirty_sync(dir);
+	}
+	return ret;
+}
+
+/*
+ * pvfs2_link() is only implemented here to make sure that we return a
+ * reasonable error code (the kernel will return a misleading EPERM
+ * otherwise).  PVFS2 does not support hard links.
+ */
+static int pvfs2_link(struct dentry *old_dentry,
+		      struct inode *dir,
+		      struct dentry *dentry)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * pvfs2_mknod() is only implemented here to make sure that we return a
+ * reasonable error code (the kernel will return a misleading EPERM
+ * otherwise).  PVFS2 does not support special files such as fifos or devices.
+ */
+static int pvfs2_mknod(struct inode *dir,
+		       struct dentry *dentry,
+		       umode_t mode,
+		       dev_t rdev)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Create a symbolic link named by @dentry in directory @dir that points at
+ * @symname, then instantiate the dentry with the new inode.
+ *
+ * Returns 0 on success, -EINVAL for a NULL target, -ENAMETOOLONG when the
+ * target does not fit in the upcall, -ENOMEM when no op can be allocated,
+ * or the error reported by the service operation / inode allocation.
+ */
+static int pvfs2_symlink(struct inode *dir,
+			 struct dentry *dentry,
+			 const char *symname)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op *new_op;
+	struct inode *inode;
+	/* permission bits are octal; a decimal 755 would be mode 01363 */
+	int mode = 0755;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
+
+	if (!symname)
+		return -EINVAL;
+
+	/*
+	 * The target is copied with a PVFS2_NAME_LEN bound below; refuse a
+	 * longer target instead of silently creating a link to a truncated
+	 * path.
+	 */
+	if (strlen(symname) >= PVFS2_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	new_op = op_alloc(PVFS2_VFS_OP_SYMLINK);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.sym.parent_refn = parent->refn;
+
+	fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
+			       PVFS_TYPE_SYMLINK,
+			       mode);
+
+	strncpy(new_op->upcall.req.sym.entry_name,
+		dentry->d_name.name,
+		PVFS2_NAME_LEN);
+	strncpy(new_op->upcall.req.sym.target, symname, PVFS2_NAME_LEN);
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Symlink Got PVFS2 handle %pU on fsid %d (ret=%d)\n",
+		     &new_op->downcall.resp.sym.refn.khandle,
+		     new_op->downcall.resp.sym.refn.fs_id, ret);
+
+	if (ret < 0) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "%s: failed with error code %d\n",
+			     __func__, ret);
+		goto out;
+	}
+
+	inode = pvfs2_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0,
+				&new_op->downcall.resp.sym.refn);
+	if (IS_ERR(inode)) {
+		gossip_err
+		    ("*** Failed to allocate pvfs2 symlink inode\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Assigned symlink inode new number of %pU\n",
+		     get_khandle_from_ino(inode));
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Inode (Symlink) %pU -> %s\n",
+		     get_khandle_from_ino(inode),
+		     dentry->d_name.name);
+
+	/* the parent directory changed: flag it and stamp mtime/ctime */
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+	ret = 0;
+out:
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * Create the directory named by @dentry inside @dir with permissions @mode
+ * and instantiate the dentry with the new inode.
+ *
+ * Returns the service operation's result on success, -ENOMEM when no op can
+ * be allocated, or a negative error from the server / inode allocation.
+ */
+static int pvfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op *new_op;
+	struct inode *inode;
+	int ret;
+
+	new_op = op_alloc(PVFS2_VFS_OP_MKDIR);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.mkdir.parent_refn = parent->refn;
+
+	fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes,
+			       PVFS_TYPE_DIRECTORY, mode);
+
+	strncpy(new_op->upcall.req.mkdir.d_name,
+		dentry->d_name.name, PVFS2_NAME_LEN);
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Mkdir Got PVFS2 handle %pU on fsid %d\n",
+		     &new_op->downcall.resp.mkdir.refn.khandle,
+		     new_op->downcall.resp.mkdir.refn.fs_id);
+
+	if (ret < 0) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "%s: failed with error code %d\n",
+			     __func__, ret);
+		goto out;
+	}
+
+	inode = pvfs2_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0,
+				&new_op->downcall.resp.mkdir.refn);
+	if (IS_ERR(inode)) {
+		gossip_err("*** Failed to allocate pvfs2 dir inode\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Assigned dir inode new number of %pU\n",
+		     get_khandle_from_ino(inode));
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Inode (Directory) %pU -> %s\n",
+		     get_khandle_from_ino(inode),
+		     dentry->d_name.name);
+
+	/*
+	 * NOTE: we have no good way to keep nlink consistent for directories
+	 * across clients; keep constant at 1.
+	 */
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+out:
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * Ask the server to rename old_dir/old_dentry to new_dir/new_dentry.
+ *
+ * Returns the service operation's status, or -ENOMEM when no op can be
+ * allocated.  The ctime of an existing target inode is refreshed regardless
+ * of the outcome, as before.
+ */
+static int pvfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	struct pvfs2_kernel_op *new_op;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "pvfs2_rename: called (%s/%s => %s/%s) ct=%d\n",
+		     old_dentry->d_parent->d_name.name,
+		     old_dentry->d_name.name,
+		     new_dentry->d_parent->d_name.name,
+		     new_dentry->d_name.name,
+		     d_count(new_dentry));
+
+	/* allocation failure is -ENOMEM, as in every other operation here */
+	new_op = op_alloc(PVFS2_VFS_OP_RENAME);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.rename.old_parent_refn = PVFS2_I(old_dir)->refn;
+	new_op->upcall.req.rename.new_parent_refn = PVFS2_I(new_dir)->refn;
+
+	strncpy(new_op->upcall.req.rename.d_old_name,
+		old_dentry->d_name.name,
+		PVFS2_NAME_LEN);
+	strncpy(new_op->upcall.req.rename.d_new_name,
+		new_dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	ret = service_operation(new_op,
+				"pvfs2_rename",
+				get_interruptible_flag(old_dentry->d_inode));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "pvfs2_rename: got downcall status %d\n",
+		     ret);
+
+	if (new_dentry->d_inode)
+		new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+	op_release(new_op);
+	return ret;
+}
+
+/* PVFS2 implementation of VFS inode operations for directories */
+struct inode_operations pvfs2_dir_inode_operations = {
+	.lookup = pvfs2_lookup,
+	.get_acl = pvfs2_get_acl,
+	.set_acl = pvfs2_set_acl,
+	.create = pvfs2_create,
+	.link = pvfs2_link,
+	.unlink = pvfs2_unlink,
+	.symlink = pvfs2_symlink,
+	.mkdir = pvfs2_mkdir,
+	/* directory removal goes through the same server-side remove call */
+	.rmdir = pvfs2_unlink,
+	.mknod = pvfs2_mknod,
+	.rename = pvfs2_rename,
+	.setattr = pvfs2_setattr,
+	.getattr = pvfs2_getattr,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = pvfs2_listxattr,
+};
diff --git a/fs/orangefs/pvfs2-utils.c b/fs/orangefs/pvfs2-utils.c
new file mode 100644
index
0000000..42c5f3f
--- /dev/null
+++ b/fs/orangefs/pvfs2-utils.c
@@ -0,0 +1,914 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-dev-proto.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * Return the file system id carried in the upcall of @op, chosen by the
+ * upcall type.  PVFS_FS_ID_NULL is returned for a NULL op and for upcall
+ * types that are not listed below.
+ */
+int32_t fsid_of_op(struct pvfs2_kernel_op *op)
+{
+	int32_t fsid = PVFS_FS_ID_NULL;
+
+	if (!op)
+		return fsid;
+
+	switch (op->upcall.type) {
+	case PVFS2_VFS_OP_FILE_IO:
+		fsid = op->upcall.req.io.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_LOOKUP:
+		fsid = op->upcall.req.lookup.parent_refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_CREATE:
+		fsid = op->upcall.req.create.parent_refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_GETATTR:
+		fsid = op->upcall.req.getattr.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_REMOVE:
+		fsid = op->upcall.req.remove.parent_refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_MKDIR:
+		fsid = op->upcall.req.mkdir.parent_refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_READDIR:
+		fsid = op->upcall.req.readdir.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_SETATTR:
+		fsid = op->upcall.req.setattr.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_SYMLINK:
+		fsid = op->upcall.req.sym.parent_refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_RENAME:
+		fsid = op->upcall.req.rename.old_parent_refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_STATFS:
+		fsid = op->upcall.req.statfs.fs_id;
+		break;
+	case PVFS2_VFS_OP_TRUNCATE:
+		fsid = op->upcall.req.truncate.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_MMAP_RA_FLUSH:
+		fsid = op->upcall.req.ra_cache_flush.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_FS_UMOUNT:
+		fsid = op->upcall.req.fs_umount.fs_id;
+		break;
+	case PVFS2_VFS_OP_GETXATTR:
+		fsid = op->upcall.req.getxattr.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_SETXATTR:
+		fsid = op->upcall.req.setxattr.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_LISTXATTR:
+		fsid = op->upcall.req.listxattr.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_REMOVEXATTR:
+		fsid = op->upcall.req.removexattr.refn.fs_id;
+		break;
+	case PVFS2_VFS_OP_FSYNC:
+		fsid = op->upcall.req.fsync.refn.fs_id;
+		break;
+	default:
+		break;
+	}
+	return fsid;
+}
+
+/*
+ * Mirror the PVFS immutable / append-only / noatime attribute flags into
+ * the corresponding S_* bits of inode->i_flags, clearing bits that are not
+ * set in @attrs.
+ */
+static void pvfs2_set_inode_flags(struct inode *inode,
+				  struct PVFS_sys_attr_s *attrs)
+{
+	if (attrs->flags & PVFS_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+
+	if (attrs->flags & PVFS_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	else
+		inode->i_flags &= ~S_APPEND;
+
+	if (attrs->flags & PVFS_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+	else
+		inode->i_flags &= ~S_NOATIME;
+}
+
+/*
+ * Copy the attributes in @attrs into @inode: size/blocks, owner, times,
+ * permission bits and file type.
+ *
+ * NOTE: symname is ignored unless the inode is a sym link.
+ *
+ * Returns 0 on success and -1 when attrs->objtype is not a regular file,
+ * directory or symlink.
+ */
+static int copy_attributes_to_inode(struct inode *inode,
+				    struct PVFS_sys_attr_s *attrs,
+				    char *symname)
+{
+	int ret = -1;
+	int perm_mode = 0;
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	loff_t inode_size = 0;
+	loff_t rounded_up_size = 0;
+
+	/*
+	 * arbitrarily set the inode block size; FIXME: we need to
+	 * resolve the difference between the reported inode blocksize
+	 * and the PAGE_CACHE_SIZE, since our block count will always
+	 * be wrong.
+	 *
+	 * For now, we're setting the block count to be the proper
+	 * number assuming the block size is 512 bytes, and the size is
+	 * rounded up to the nearest 4K.  This is apparently required
+	 * to get proper size reports from the 'du' shell utility.
+	 *
+	 * changing the inode->i_blkbits to something other than
+	 * PAGE_CACHE_SHIFT breaks mmap/execution as we depend on that.
+	 */
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "attrs->mask = %x (objtype = %s)\n",
+		     attrs->mask,
+		     attrs->objtype == PVFS_TYPE_METAFILE ? "file" :
+		     attrs->objtype == PVFS_TYPE_DIRECTORY ? "directory" :
+		     attrs->objtype == PVFS_TYPE_SYMLINK ? "symlink" :
+		     "invalid/unknown");
+
+	switch (attrs->objtype) {
+	case PVFS_TYPE_METAFILE:
+		pvfs2_set_inode_flags(inode, attrs);
+		if (attrs->mask & PVFS_ATTR_SYS_SIZE) {
+			inode_size = (loff_t) attrs->size;
+			/*
+			 * NOTE(review): a size that is already a multiple
+			 * of 4096 (including 0) gains a whole extra 4096
+			 * here, which is more than "rounded up to the
+			 * nearest 4K".  Left unchanged because 'du' output
+			 * depends on it; confirm the intent.
+			 */
+			rounded_up_size =
+			    (inode_size + (4096 - (inode_size % 4096)));
+
+			pvfs2_lock_inode(inode);
+			inode->i_bytes = inode_size;
+			inode->i_blocks =
+			    (unsigned long)(rounded_up_size / 512);
+			pvfs2_unlock_inode(inode);
+
+			/*
+			 * NOTE: make sure all the places we're called
+			 * from have the inode->i_sem lock. We're fine
+			 * in 99% of the cases since we're mostly
+			 * called from a lookup.
+			 */
+			inode->i_size = inode_size;
+		}
+		break;
+	case PVFS_TYPE_SYMLINK:
+		if (symname != NULL) {
+			inode->i_size = (loff_t) strlen(symname);
+			break;
+		}
+		/*FALLTHRU*/
+	default:
+		pvfs2_lock_inode(inode);
+		inode->i_bytes = PAGE_CACHE_SIZE;
+		inode->i_blocks = (unsigned long)(PAGE_CACHE_SIZE / 512);
+		pvfs2_unlock_inode(inode);
+
+		inode->i_size = PAGE_CACHE_SIZE;
+		break;
+	}
+
+	inode->i_uid = make_kuid(&init_user_ns, attrs->owner);
+	inode->i_gid = make_kgid(&init_user_ns, attrs->group);
+	/* only whole seconds are copied; nanoseconds are zeroed */
+	inode->i_atime.tv_sec = (time_t) attrs->atime;
+	inode->i_mtime.tv_sec = (time_t) attrs->mtime;
+	inode->i_ctime.tv_sec = (time_t) attrs->ctime;
+	inode->i_atime.tv_nsec = 0;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_ctime.tv_nsec = 0;
+
+	/* translate PVFS permission bits into VFS mode bits */
+	if (attrs->perms & PVFS_O_EXECUTE)
+		perm_mode |= S_IXOTH;
+	if (attrs->perms & PVFS_O_WRITE)
+		perm_mode |= S_IWOTH;
+	if (attrs->perms & PVFS_O_READ)
+		perm_mode |= S_IROTH;
+
+	if (attrs->perms & PVFS_G_EXECUTE)
+		perm_mode |= S_IXGRP;
+	if (attrs->perms & PVFS_G_WRITE)
+		perm_mode |= S_IWGRP;
+	if (attrs->perms & PVFS_G_READ)
+		perm_mode |= S_IRGRP;
+
+	if (attrs->perms & PVFS_U_EXECUTE)
+		perm_mode |= S_IXUSR;
+	if (attrs->perms & PVFS_U_WRITE)
+		perm_mode |= S_IWUSR;
+	if (attrs->perms & PVFS_U_READ)
+		perm_mode |= S_IRUSR;
+
+	if (attrs->perms & PVFS_G_SGID)
+		perm_mode |= S_ISGID;
+	if (attrs->perms & PVFS_U_SUID)
+		perm_mode |= S_ISUID;
+
+	inode->i_mode = perm_mode;
+
+	if (is_root_handle(inode)) {
+		/* special case: mark the root inode as sticky */
+		inode->i_mode |= S_ISVTX;
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "Marking inode %pU as sticky\n",
+			     get_khandle_from_ino(inode));
+	}
+
+	switch (attrs->objtype) {
+	case PVFS_TYPE_METAFILE:
+		inode->i_mode |= S_IFREG;
+		ret = 0;
+		break;
+	case PVFS_TYPE_DIRECTORY:
+		inode->i_mode |= S_IFDIR;
+		/* NOTE: we have no good way to keep nlink consistent
+		 * for directories across clients; keep constant at 1.
+		 * Why 1?  If we go with 2, then find(1) gets confused
+		 * and won't work properly without the -noleaf option
+		 */
+		set_nlink(inode, 1);
+		ret = 0;
+		break;
+	case PVFS_TYPE_SYMLINK:
+		inode->i_mode |= S_IFLNK;
+
+		/* copy link target to inode private data */
+		if (pvfs2_inode && symname) {
+			strncpy(pvfs2_inode->link_target,
+				symname,
+				PVFS_NAME_MAX);
+			/*
+			 * strncpy() leaves the buffer unterminated when the
+			 * source fills all PVFS_NAME_MAX bytes; terminate
+			 * it before it is used as a string below.
+			 */
+			pvfs2_inode->link_target[PVFS_NAME_MAX - 1] = '\0';
+			gossip_debug(GOSSIP_UTILS_DEBUG,
+				     "Copied attr link target %s\n",
+				     pvfs2_inode->link_target);
+		}
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "symlink mode %o\n",
+			     inode->i_mode);
+		ret = 0;
+		break;
+	default:
+		gossip_err("pvfs2: copy_attributes_to_inode: got invalid attribute type %x\n",
+			attrs->objtype);
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2: copy_attributes_to_inode: setting i_mode to %o, i_size to %lu\n",
+		     inode->i_mode,
+		     (unsigned long)i_size_read(inode));
+
+	return ret;
+}
+
+/*
+ * Fill @attrs (and attrs->mask) from the fields of @iattr that ia_valid
+ * marks as valid.
+ *
+ * NOTE: in kernel land, we never use the sys_attr->link_target for
+ * anything, so don't bother copying it into the sys_attr object here.
+ *
+ * Returns 0 on success, or -EINVAL for a NULL argument, a sticky bit on
+ * anything but the root handle, or a setuid bit.
+ */
+static inline int copy_attributes_from_inode(struct inode *inode,
+					     struct PVFS_sys_attr_s *attrs,
+					     struct iattr *iattr)
+{
+	umode_t tmp_mode;
+
+	if (!iattr || !inode || !attrs) {
+		gossip_err("NULL iattr (%p), inode (%p), attrs (%p) "
+			   "in copy_attributes_from_inode!\n",
+			   iattr,
+			   inode,
+			   attrs);
+		return -EINVAL;
+	}
+	/*
+	 * We need to be careful to only copy the attributes out of the
+	 * iattr object that we know are valid.
+	 */
+	attrs->mask = 0;
+	/*
+	 * Owner and group are encoded relative to init_user_ns, the same
+	 * namespace copy_attributes_to_inode() uses when decoding them, so
+	 * an id written here reads back as the same kuid/kgid.
+	 */
+	if (iattr->ia_valid & ATTR_UID) {
+		attrs->owner = from_kuid(&init_user_ns, iattr->ia_uid);
+		attrs->mask |= PVFS_ATTR_SYS_UID;
+		gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner);
+	}
+	if (iattr->ia_valid & ATTR_GID) {
+		attrs->group = from_kgid(&init_user_ns, iattr->ia_gid);
+		attrs->mask |= PVFS_ATTR_SYS_GID;
+		gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group);
+	}
+
+	if (iattr->ia_valid & ATTR_ATIME) {
+		attrs->mask |= PVFS_ATTR_SYS_ATIME;
+		if (iattr->ia_valid & ATTR_ATIME_SET) {
+			attrs->atime =
+			   pvfs2_convert_time_field((void *)&iattr->ia_atime);
+			attrs->mask |= PVFS_ATTR_SYS_ATIME_SET;
+		}
+	}
+	if (iattr->ia_valid & ATTR_MTIME) {
+		attrs->mask |= PVFS_ATTR_SYS_MTIME;
+		if (iattr->ia_valid & ATTR_MTIME_SET) {
+			attrs->mtime =
+			   pvfs2_convert_time_field((void *)&iattr->ia_mtime);
+			attrs->mask |= PVFS_ATTR_SYS_MTIME_SET;
+		}
+	}
+	if (iattr->ia_valid & ATTR_CTIME)
+		attrs->mask |= PVFS_ATTR_SYS_CTIME;
+
+	/*
+	 * PVFS2 cannot set size with a setattr operation.  Probably not likely
+	 * to be requested through the VFS, but just in case, don't worry about
+	 * ATTR_SIZE
+	 */
+
+	if (iattr->ia_valid & ATTR_MODE) {
+		tmp_mode = iattr->ia_mode;
+		if (tmp_mode & (S_ISVTX)) {
+			if (is_root_handle(inode)) {
+				/*
+				 * allow sticky bit to be set on root (since
+				 * it shows up that way by default anyhow),
+				 * but don't show it to the server
+				 */
+				tmp_mode -= S_ISVTX;
+			} else {
+				gossip_debug(GOSSIP_UTILS_DEBUG,
+					     "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
+				return -EINVAL;
+			}
+		}
+
+		if (tmp_mode & (S_ISUID)) {
+			gossip_debug(GOSSIP_UTILS_DEBUG,
+				     "Attempting to set setuid bit (not supported); returning EINVAL.\n");
+			return -EINVAL;
+		}
+
+		attrs->perms = PVFS_util_translate_mode(tmp_mode);
+		attrs->mask |= PVFS_ATTR_SYS_PERM;
+	}
+
+	return 0;
+}
+
+/*
+ * issues a pvfs2 getattr request and fills in the appropriate inode
+ * attributes if successful.
+ * returns 0 on success; -errno otherwise
+ */
+int pvfs2_inode_getattr(struct inode *inode, uint32_t getattr_mask)
+{
+	struct pvfs2_kernel_op *op;
+	struct pvfs2_inode_s *pi = PVFS2_I(inode);
+	struct PVFS_sys_attr_s *srv_attrs;
+	int err = -EINVAL;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "%s: called on inode %pU\n",
+		     __func__,
+		     get_khandle_from_ino(inode));
+
+	op = op_alloc(PVFS2_VFS_OP_GETATTR);
+	if (op == NULL)
+		return -ENOMEM;
+
+	/* request the attributes selected by the mask for this object */
+	op->upcall.req.getattr.mask = getattr_mask;
+	op->upcall.req.getattr.refn = pi->refn;
+
+	err = service_operation(op, __func__,
+				get_interruptible_flag(inode));
+	if (err == 0) {
+		srv_attrs = &op->downcall.resp.getattr.attributes;
+
+		if (copy_attributes_to_inode(inode, srv_attrs,
+				op->downcall.resp.getattr.link_target)) {
+			gossip_err("%s: failed to copy attributes\n",
+				   __func__);
+			err = -ENOENT;
+		} else if (srv_attrs->objtype != PVFS_TYPE_METAFILE) {
+			/* same value generic_fillattr() would report */
+			pi->blksize = (1 << inode->i_blkbits);
+		} else {
+			/*
+			 * Regular files keep the server-supplied block size
+			 * in the pvfs2-private part of the inode; it is only
+			 * meant for stat reporting, so no other inode code
+			 * path is affected.
+			 */
+			pi->blksize = srv_attrs->blksize;
+		}
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Getattr on handle %pU, "
+		     "fsid %d\n (inode ct = %d) returned %d\n",
+		     &pi->refn.khandle,
+		     pi->refn.fs_id,
+		     (int)atomic_read(&inode->i_count),
+		     err);
+
+	op_release(op);
+	return err;
+}
+
+/*
+ * issues a pvfs2 setattr request to make sure the new attribute values
+ * take effect if successful.
+ * returns 0 on success; -errno otherwise
+ */
+int pvfs2_inode_setattr(struct inode *inode, struct iattr *iattr)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	struct pvfs2_kernel_op *new_op;
+	int ret;
+
+	new_op = op_alloc(PVFS2_VFS_OP_SETATTR);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.setattr.refn = pvfs2_inode->refn;
+	ret = copy_attributes_from_inode(inode,
+		       &new_op->upcall.req.setattr.attributes,
+		       iattr);
+	if (ret < 0) {
+		op_release(new_op);
+		return ret;
+	}
+
+	ret = service_operation(new_op, __func__,
+				get_interruptible_flag(inode));
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_inode_setattr: returning %d\n",
+		     ret);
+
+	/* the op is no longer needed whether or not the request succeeded */
+	op_release(new_op);
+
+	/*
+	 * successful setattr should clear the atime, mtime, ctime
+	 * and mode flags.
+	 */
+	if (ret == 0) {
+		ClearAtimeFlag(pvfs2_inode);
+		ClearMtimeFlag(pvfs2_inode);
+		ClearCtimeFlag(pvfs2_inode);
+		ClearModeFlag(pvfs2_inode);
+	}
+
+	return ret;
+}
+
+/*
+ * Write back the lazily tracked attributes of @inode.  The pending
+ * mtime/ctime/atime/mode flags are collected into an iattr and, when at
+ * least one is set, sent with pvfs2_inode_setattr().
+ *
+ * Returns 0 when nothing was pending, otherwise the setattr result.
+ */
+int pvfs2_flush_inode(struct inode *inode)
+{
+	/*
+	 * If it is a dirty inode, this function gets called.
+	 * Gather all the information that needs to be setattr'ed
+	 * Right now, this will only be used for mode, atime, mtime
+	 * and/or ctime.
+	 */
+	struct iattr wbattr;
+	int ret;
+	int mtime_flag;
+	int ctime_flag;
+	int atime_flag;
+	int mode_flag;
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+
+	memset(&wbattr, 0, sizeof(wbattr));
+
+	/*
+	 * check inode flags up front, and clear them if they are set.  This
+	 * will prevent multiple processes from all trying to flush the same
+	 * inode if they call close() simultaneously
+	 */
+	mtime_flag = MtimeFlag(pvfs2_inode);
+	ClearMtimeFlag(pvfs2_inode);
+	ctime_flag = CtimeFlag(pvfs2_inode);
+	ClearCtimeFlag(pvfs2_inode);
+	atime_flag = AtimeFlag(pvfs2_inode);
+	ClearAtimeFlag(pvfs2_inode);
+	mode_flag = ModeFlag(pvfs2_inode);
+	ClearModeFlag(pvfs2_inode);
+
+	/*  -- Lazy atime,mtime and ctime update --
+	 * Note: all times are dictated by server in the new scheme
+	 * and not by the clients
+	 *
+	 * Also mode updates are being handled now..
+	 */
+
+	if (mtime_flag)
+		wbattr.ia_valid |= ATTR_MTIME;
+	if (ctime_flag)
+		wbattr.ia_valid |= ATTR_CTIME;
+	if (atime_flag)
+		wbattr.ia_valid |= ATTR_ATIME;
+
+	if (mode_flag) {
+		wbattr.ia_mode = inode->i_mode;
+		wbattr.ia_valid |= ATTR_MODE;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "*********** pvfs2_flush_inode: %pU "
+		     "(ia_valid %d)\n",
+		     get_khandle_from_ino(inode),
+		     wbattr.ia_valid);
+	if (wbattr.ia_valid == 0) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "pvfs2_flush_inode skipping setattr()\n");
+		return 0;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_flush_inode (%pU) writing mode %o\n",
+		     get_khandle_from_ino(inode),
+		     inode->i_mode);
+
+	ret = pvfs2_inode_setattr(inode, &wbattr);
+
+	return ret;
+}
+
+/*
+ * Send an unmount request for the file system behind @sb.  On success the
+ * superblock's mount_pending field is set to 1.
+ *
+ * Returns the service operation's status, or -ENOMEM when no op can be
+ * allocated.
+ */
+int pvfs2_unmount_sb(struct super_block *sb)
+{
+	int ret = -EINVAL;
+	struct pvfs2_kernel_op *new_op = NULL;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_unmount_sb called on sb %p\n",
+		     sb);
+
+	new_op = op_alloc(PVFS2_VFS_OP_FS_UMOUNT);
+	if (!new_op)
+		return -ENOMEM;
+	new_op->upcall.req.fs_umount.id = PVFS2_SB(sb)->id;
+	new_op->upcall.req.fs_umount.fs_id = PVFS2_SB(sb)->fs_id;
+	strncpy(new_op->upcall.req.fs_umount.pvfs2_config_server,
+		PVFS2_SB(sb)->devname,
+		PVFS_MAX_SERVER_ADDR_LEN);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Attempting PVFS2 Unmount via host %s\n",
+		     new_op->upcall.req.fs_umount.pvfs2_config_server);
+
+	ret = service_operation(new_op, "pvfs2_fs_umount", 0);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_unmount: got return value of %d\n", ret);
+	/*
+	 * The failure path used to overwrite the local 'sb' with an error
+	 * pointer; that store was never read and has been dropped.
+	 */
+	if (!ret)
+		PVFS2_SB(sb)->mount_pending = 1;
+
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * Ask for cancellation of the in-flight operation identified by @tag.
+ *
+ * NOTE: on successful cancellation, be sure to return -EINTR, as
+ * that's the return value the caller expects
+ */
+int pvfs2_cancel_op_in_progress(uint64_t tag)
+{
+	int ret = -EINVAL;
+	struct pvfs2_kernel_op *new_op = NULL;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_cancel_op_in_progress called on tag %llu\n",
+		     llu(tag));
+
+	new_op = op_alloc(PVFS2_VFS_OP_CANCEL);
+	if (!new_op)
+		return -ENOMEM;
+	new_op->upcall.req.cancel.op_tag = tag;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Attempting PVFS2 operation cancellation of tag %llu\n",
+		     llu(new_op->upcall.req.cancel.op_tag));
+
+	ret = service_operation(new_op, "pvfs2_cancel", PVFS2_OP_CANCELLATION);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_cancel_op_in_progress: got return value of %d\n",
+		     ret);
+
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * Reset @op to its initial state under op->lock: invalid upcall/downcall
+ * types, downcall status -1, unknown state, tag 0.  A NULL @op is ignored.
+ */
+void pvfs2_op_initialize(struct pvfs2_kernel_op *op)
+{
+	if (op) {
+		spin_lock(&op->lock);
+		op->io_completed = 0;
+
+		op->upcall.type = PVFS2_VFS_OP_INVALID;
+		op->downcall.type = PVFS2_VFS_OP_INVALID;
+		op->downcall.status = -1;
+
+		op->op_state = OP_VFS_STATE_UNKNOWN;
+		op->tag = 0;
+		spin_unlock(&op->lock);
+	}
+}
+
+/* Mark @inode bad, unless it is the root handle, which is left usable. */
+void pvfs2_make_bad_inode(struct inode *inode)
+{
+	if (is_root_handle(inode)) {
+		/*
+		 * if this occurs, the pvfs2-client-core was killed but we
+		 * can't afford to lose the inode operations and such
+		 * associated with the root handle in any case.
+		 */
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "*** NOT making bad root inode %pU\n",
+			     get_khandle_from_ino(inode));
+	} else {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "*** making bad inode %pU\n",
+			     get_khandle_from_ino(inode));
+		make_bad_inode(inode);
+	}
+}
+
+/*
+ * Save the current task's blocked-signal set in @orig_sigset and block
+ * everything except SIGKILL, plus SIGINT/SIGQUIT when their handler is
+ * SIG_DFL and they were not already blocked.
+ *
+ * this code is based on linux/net/sunrpc/clnt.c:rpc_clnt_sigmask
+ */
+void mask_blocked_signals(sigset_t *orig_sigset)
+{
+	unsigned long sigallow = sigmask(SIGKILL);
+	unsigned long irqflags = 0;
+	struct k_sigaction *action = pvfs2_current_sigaction;
+
+	sigallow |= ((action[SIGINT - 1].sa.sa_handler == SIG_DFL) ?
+		     sigmask(SIGINT) :
+		     0);
+	sigallow |= ((action[SIGQUIT - 1].sa.sa_handler == SIG_DFL) ?
+		     sigmask(SIGQUIT) :
+		     0);
+
+	spin_lock_irqsave(&pvfs2_current_signal_lock, irqflags);
+	*orig_sigset = current->blocked;
+	siginitsetinv(&current->blocked, sigallow & ~orig_sigset->sig[0]);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&pvfs2_current_signal_lock, irqflags);
+}
+
+/*
+ * Restore the blocked-signal set saved by mask_blocked_signals().
+ *
+ * this code is based on linux/net/sunrpc/clnt.c:rpc_clnt_sigunmask
+ */
+void unmask_blocked_signals(sigset_t *orig_sigset)
+{
+	unsigned long irqflags = 0;
+
+	spin_lock_irqsave(&pvfs2_current_signal_lock, irqflags);
+	current->blocked = *orig_sigset;
+	recalc_sigpending();
+	spin_unlock_irqrestore(&pvfs2_current_signal_lock, irqflags);
+}
+
+/*
+ * Return the seconds field of the struct timespec behind @time_ptr as a
+ * 64-bit value; nanoseconds are discarded.
+ */
+uint64_t pvfs2_convert_time_field(void *time_ptr)
+{
+	uint64_t pvfs2_time;
+	struct timespec *tspec = (struct timespec *)time_ptr;
+
+	pvfs2_time = (uint64_t) ((time_t) tspec->tv_sec);
+	return pvfs2_time;
+}
+
+/* macro defined in include/pvfs2-types.h */
+DECLARE_ERRNO_MAPPING_AND_FN();
+
+/*
+ * Convert @error_code to a negative errno.  A positive code is logged and
+ * negated first; codes in pvfs2 format are then translated, with
+ * cancellation mapped to -ETIMEDOUT and other non-errno codes to -EINVAL.
+ */
+int pvfs2_normalize_to_errno(int32_t error_code)
+{
+	if (error_code > 0) {
+		gossip_err("pvfs2: error status receieved.\n");
+		gossip_err("pvfs2: assuming error code is inverted.\n");
+		error_code = -error_code;
+	}
+
+	/* convert any error codes that are in pvfs2 format */
+	if (IS_PVFS_NON_ERRNO_ERROR(-error_code)) {
+		if (PVFS_NON_ERRNO_ERROR_CODE(-error_code) == PVFS_ECANCEL) {
+			/*
+			 * cancellation error codes generally correspond to
+			 * a timeout from the client's perspective
+			 */
+			error_code = -ETIMEDOUT;
+		} else {
+			/* assume a default error code */
+			gossip_err("pvfs2: warning: got error code without errno equivalent: %d.\n",
+				   error_code);
+			error_code = -EINVAL;
+		}
+	} else if (IS_PVFS_ERROR(-error_code)) {
+		error_code = -PVFS_ERROR_TO_ERRNO(-error_code);
+	}
+	return error_code;
+}
+
+#define NUM_MODES 11
+/*
+ * Translate the VFS permission, setgid and setuid bits set in @mode into
+ * the corresponding PVFS permission bits.  Other bits are ignored.
+ */
+int32_t PVFS_util_translate_mode(int mode)
+{
+	int ret = 0;
+	int i = 0;
+	/* parallel lookup tables: modes[i] maps to pvfs2_modes[i] */
+	static const int modes[NUM_MODES] = {
+		S_IXOTH, S_IWOTH, S_IROTH,
+		S_IXGRP, S_IWGRP, S_IRGRP,
+		S_IXUSR, S_IWUSR, S_IRUSR,
+		S_ISGID, S_ISUID
+	};
+	static const int pvfs2_modes[NUM_MODES] = {
+		PVFS_O_EXECUTE, PVFS_O_WRITE, PVFS_O_READ,
+		PVFS_G_EXECUTE, PVFS_G_WRITE, PVFS_G_READ,
+		PVFS_U_EXECUTE, PVFS_U_WRITE, PVFS_U_READ,
+		PVFS_G_SGID, PVFS_U_SUID
+	};
+
+	for (i = 0; i < NUM_MODES; i++)
+		if (mode & modes[i])
+			ret |= pvfs2_modes[i];
+
+	return ret;
+}
+#undef NUM_MODES
+
+/*
+ * Minimal strtok() replacement.  Pass the string to split in @s on the
+ * first call and NULL afterwards; every character of @toks is a delimiter.
+ * The input is modified in place (delimiters become NULs) and adjacent
+ * delimiters yield empty tokens.  Returns NULL once the input is used up.
+ *
+ * The scan position lives in a function-local static, so only one string
+ * can be tokenized at a time.
+ */
+static char *pvfs2_strtok(char *s, const char *toks)
+{
+	/* current position in the string being tokenized */
+	static char *in_string_p;
+	/* starting value of in_string_p during this iteration. */
+	char *this_string_p;
+	/* number of delimiter characters */
+	uint32_t toks_len = strlen(toks);
+	/* index */
+	uint32_t i;
+
+	/* when s has a value, we are using a new input string */
+	if (s)
+		in_string_p = s;
+
+	/* nothing to continue from: called with NULL before any string */
+	if (!in_string_p)
+		return NULL;
+
+	/* set new starting position */
+	this_string_p = in_string_p;
+
+	/*
+	 * loop through the string until a delimiter or end-of-string (null)
+	 * is found.
+	 */
+	for (; *in_string_p; in_string_p++)
+		/* Is character a delimiter? */
+		for (i = 0; i < toks_len; i++)
+			if (*in_string_p == toks[i]) {
+				/* delimiter found => end-of-word */
+				*in_string_p = 0;
+				in_string_p++;
+				return this_string_p;
+			}
+
+	if (*this_string_p == 0)
+		return NULL;
+
+	return this_string_p;
+}
+
+/*
+ * Convert a 64-bit debug mask into a readable, comma-separated string of
+ * keywords.  @debug_string must hold PVFS2_MAX_DEBUG_STRING_LEN bytes; it
+ * is zeroed first and the result is always NUL-terminated.  Keywords that
+ * no longer fit are dropped.  Always returns 0.
+ */
+static int proc_mask_to_debug(struct __keyword_mask_t *mask_map,
+			      int num_mask_map,
+			      uint64_t mask,
+			      char *debug_string)
+{
+	unsigned int index = 0;
+	unsigned int len;
+	int i;
+
+	memset(debug_string, 0, PVFS2_MAX_DEBUG_STRING_LEN);
+
+	for (i = 0; i < num_mask_map; i++) {
+		len = strlen(mask_map[i].keyword);
+
+		/* a keyword that cannot fit even on its own ends the scan */
+		if (len >= PVFS2_MAX_DEBUG_STRING_LEN)
+			return 0;
+
+		switch (mask_map[i].mask_val) {
+		case GOSSIP_NO_DEBUG:
+			if (mask == GOSSIP_NO_DEBUG) {
+				strcpy(debug_string, mask_map[i].keyword);
+				return 0;
+			}
+			break;
+		case GOSSIP_MAX_DEBUG:
+			if (mask == GOSSIP_MAX_DEBUG) {
+				strcpy(debug_string, mask_map[i].keyword);
+				return 0;
+			}
+			break;
+		default:
+			if ((mask & mask_map[i].mask_val) !=
+			    mask_map[i].mask_val)
+				/* mask does NOT contain the mask value */
+				break;
+
+			/*
+			 * Need room for the separating comma (second and
+			 * subsequent keywords), the keyword itself and the
+			 * terminating NUL left behind by the memset().
+			 */
+			if (index + (index != 0 ? 1 : 0) + len >=
+			    PVFS2_MAX_DEBUG_STRING_LEN)
+				return 0;
+
+			if (index != 0) {
+				debug_string[index] = ',';
+				index++;
+			}
+
+			/* add keyword and slide index */
+			memcpy(&debug_string[index],
+			       mask_map[i].keyword,
+			       len);
+			index += len;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Translate a string of keywords separated by ',' or ' ' into a debug
+ * mask.  A keyword prefixed with '-' clears its bits instead of setting
+ * them; the prefix applies to that keyword only.  Unknown keywords are
+ * ignored.  Returns 0 for a NULL string or when the working copy cannot
+ * be allocated.
+ */
+static uint64_t proc_debug_to_mask(struct __keyword_mask_t *mask_map,
+				   int num_mask_map,
+				   const char *event_logging)
+{
+	uint64_t mask = 0;
+	char *s = NULL;
+	char *t = NULL;
+	const char *toks = ", ";
+	int i = 0;
+	int negate = 0;
+	int slen = 0;
+
+	if (!event_logging)
+		return mask;
+
+	/* tokenizing writes NULs into the string, so work on a copy */
+	slen = strlen(event_logging);
+	s = kmalloc(slen + 1, GFP_KERNEL);
+	if (!s) {
+		/*
+		 * The return type is an unsigned mask, so a negative errno
+		 * would be read as "nearly every debug bit set".  Report
+		 * "no keywords" instead.
+		 */
+		return mask;
+	}
+	memcpy(s, event_logging, slen);
+	s[slen] = '\0';
+
+	for (t = pvfs2_strtok(s, toks); t; t = pvfs2_strtok(NULL, toks)) {
+		/* re-evaluated per token so '-' does not leak to later ones */
+		negate = (*t == '-');
+		if (negate)
+			++t;
+
+		for (i = 0; i < num_mask_map; i++) {
+			if (!strcmp(t, mask_map[i].keyword)) {
+
+				if (negate)
+					mask &= ~mask_map[i].mask_val;
+				else
+					mask |= mask_map[i].mask_val;
+
+				break;
+			}
+		}
+	}
+	kfree(s);
+
+	return mask;
+}
+
+/*
+ * Based on human readable keywords, translate them into
+ * a mask value appropriate for the debugging level desired.
+ * The 'computed' mask is returned; 0 if no keywords are
+ * present or recognized.  Unrecognized keywords are ignored when
+ * mixed with recognized keywords.
+ *
+ * Prefix a keyword with "-" to turn it off.  All keywords
+ * processed in specified order.
+ */
+uint64_t PVFS_proc_debug_eventlog_to_mask(const char *event_logging)
+{
+	return proc_debug_to_mask(s_keyword_mask_map,
+				  num_keyword_mask_map,
+				  event_logging);
+}
+
+/* Same as above, but using the kernel-module keyword table. */
+uint64_t PVFS_proc_kmod_eventlog_to_mask(const char *event_logging)
+{
+	return proc_debug_to_mask(s_kmod_keyword_mask_map,
+				  num_kmod_keyword_mask_map,
+				  event_logging);
+}
+
+/* Render @mask as keywords from the kernel-module keyword table. */
+int PVFS_proc_kmod_mask_to_eventlog(uint64_t mask, char *debug_string)
+{
+	return proc_mask_to_debug(s_kmod_keyword_mask_map,
+				  num_kmod_keyword_mask_map,
+				  mask,
+				  debug_string);
+}
+
+/* Render @mask as keywords from the s_keyword_mask_map table. */
+int PVFS_proc_mask_to_eventlog(uint64_t mask, char *debug_string)
+{
+
+	return proc_mask_to_debug(s_keyword_mask_map,
+				  num_keyword_mask_map,
+				  mask,
+				  debug_string);
+}
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
new file mode 100644
index 0000000..298a85e
--- /dev/null
+++ b/fs/orangefs/super.c
@@ -0,0 +1,548 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+#include <linux/parser.h>
+
+/* a cache for pvfs2-inode objects (i.e.
pvfs2 inode private data) */
+static struct kmem_cache *pvfs2_inode_cache;
+
+/* list for storing pvfs2 specific superblocks in use */
+LIST_HEAD(pvfs2_superblocks);
+
+DEFINE_SPINLOCK(pvfs2_superblocks_lock);
+
+/* mount option tokens understood by parse_mount_options() */
+enum {
+	Opt_intr,
+	Opt_acl,
+
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{ Opt_acl,	"acl" },
+	{ Opt_intr,	"intr" },
+	{ Opt_err,	NULL }
+};
+
+/*
+ * Parse a comma-separated mount option string and record the result on sb.
+ *
+ * Both supported flags are cleared first, so only the options present in
+ * this call end up set: "acl" sets MS_POSIXACL in sb->s_flags, "intr" sets
+ * PVFS2_OPT_INTR in the private sb info.  Empty elements are skipped.
+ * strsep() modifies the options string in place.
+ *
+ * Returns 0 on success, -EINVAL on the first unsupported option (logged
+ * unless silent is non-zero).
+ */
+static int parse_mount_options(struct super_block *sb, char *options,
+				int silent)
+{
+	struct pvfs2_sb_info_s *pvfs2_sb = PVFS2_SB(sb);
+	substring_t args[MAX_OPT_ARGS];
+	char *p;
+
+	sb->s_flags &= ~MS_POSIXACL;
+	pvfs2_sb->flags &= ~PVFS2_OPT_INTR;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_acl:
+			sb->s_flags |= MS_POSIXACL;
+			break;
+		case Opt_intr:
+			pvfs2_sb->flags |= PVFS2_OPT_INTR;
+			break;
+		default:
+			goto fail;
+		}
+	}
+
+	return 0;
+fail:
+	if (!silent)
+		gossip_err("Error: mount option [%s] is not supported.\n", p);
+	return -EINVAL;
+}
+
+/*
+ * Slab constructor for pvfs2_inode_cache: initializes the embedded VFS
+ * inode and the xattr rw-semaphore, the two members pvfs2_alloc_inode()
+ * leaves alone.
+ */
+static void pvfs2_inode_cache_ctor(void *req)
+{
+	struct pvfs2_inode_s *pvfs2_inode = req;
+
+	inode_init_once(&pvfs2_inode->vfs_inode);
+	init_rwsem(&pvfs2_inode->xattr_sem);
+
+	pvfs2_inode->vfs_inode.i_version = 1;
+}
+
+/*
+ * super_operations.alloc_inode: take a pvfs2 inode from the cache, reset
+ * its pvfs2-specific fields and hand back the embedded VFS inode.
+ * Returns NULL when the cache allocation fails.
+ */
+static struct inode *pvfs2_alloc_inode(struct super_block *sb)
+{
+	struct pvfs2_inode_s *pvfs2_inode;
+
+	pvfs2_inode = kmem_cache_alloc(pvfs2_inode_cache,
+				       PVFS2_CACHE_ALLOC_FLAGS);
+	if (pvfs2_inode == NULL) {
+		gossip_err("Failed to allocate pvfs2_inode\n");
+		return NULL;
+	}
+
+	/*
+	 * We want to clear everything except for rw_semaphore and the
+	 * vfs_inode.
+	 */
+	/* sized from the member itself rather than a hard-coded 16 */
+	memset(&pvfs2_inode->refn.khandle, 0,
+	       sizeof(pvfs2_inode->refn.khandle));
+	pvfs2_inode->refn.fs_id = PVFS_FS_ID_NULL;
+	pvfs2_inode->last_failed_block_index_read = 0;
+	memset(pvfs2_inode->link_target, 0, sizeof(pvfs2_inode->link_target));
+	pvfs2_inode->pinode_flags = 0;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_alloc_inode: allocated %p\n",
+		     &pvfs2_inode->vfs_inode);
+	return &pvfs2_inode->vfs_inode;
+}
+
+/* super_operations.destroy_inode: give the pvfs2 inode back to the cache. */
+static void pvfs2_destroy_inode(struct inode *inode)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+			"%s: deallocated %p destroying inode %pU\n",
+			__func__, pvfs2_inode, get_khandle_from_ino(inode));
+
+	kmem_cache_free(pvfs2_inode_cache, pvfs2_inode);
+}
+
+/*
+ * NOTE: information filled in here is typically reflected in the
+ * output of the system command 'df'
+ *
+ * Sends a PVFS2_VFS_OP_STATFS upcall for the filesystem dentry lives on
+ * and copies the reply into buf.  The operation is interruptible when the
+ * filesystem was mounted with "intr".
+ *
+ * Returns -ENOMEM if the op cannot be allocated, otherwise whatever
+ * service_operation() returned; buf is only filled in when both that
+ * return value and the downcall status are non-negative.
+ */
+static int pvfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int ret = -ENOMEM;
+	struct pvfs2_kernel_op *new_op = NULL;
+	int flags = 0;
+	struct super_block *sb = NULL;
+
+	sb = dentry->d_sb;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_statfs: called on sb %p (fs_id is %d)\n",
+		     sb,
+		     (int)(PVFS2_SB(sb)->fs_id));
+
+	new_op = op_alloc(PVFS2_VFS_OP_STATFS);
+	if (!new_op)
+		return ret;
+	new_op->upcall.req.statfs.fs_id = PVFS2_SB(sb)->fs_id;
+
+	if (PVFS2_SB(sb)->flags & PVFS2_OPT_INTR)
+		flags = PVFS2_OP_INTERRUPTIBLE;
+
+	ret = service_operation(new_op, "pvfs2_statfs", flags);
+
+	/* do not hand a reply that was never received on to the caller */
+	if (ret < 0 || new_op->downcall.status < 0)
+		goto out_op_release;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_statfs: got %ld blocks available | "
+		     "%ld blocks total | %ld block size\n",
+		     (long)new_op->downcall.resp.statfs.blocks_avail,
+		     (long)new_op->downcall.resp.statfs.blocks_total,
+		     (long)new_op->downcall.resp.statfs.block_size);
+
+	buf->f_type = sb->s_magic;
+	memcpy(&buf->f_fsid, &PVFS2_SB(sb)->fs_id, sizeof(buf->f_fsid));
+	buf->f_bsize = new_op->downcall.resp.statfs.block_size;
+	buf->f_namelen = PVFS2_NAME_LEN;
+
+	buf->f_blocks = (sector_t) new_op->downcall.resp.statfs.blocks_total;
+	buf->f_bfree = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
+	buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
+	buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total;
+	buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail;
+	buf->f_frsize = sb->s_blocksize;
+
+out_op_release:
+	op_release(new_op);
+	gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_statfs: returning %d\n", ret);
+	return ret;
+}
+
+/*
+ * Remount as initiated by VFS layer. We just need to reparse the mount
+ * options, no need to signal pvfs2-client-core about it.
+ *
+ * Option errors are not logged here (parse_mount_options() is called with
+ * silent set); the caller only sees the -EINVAL return value.
+ */
+static int pvfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_remount_fs: called\n");
+	return parse_mount_options(sb, data, 1);
+}
+
+/*
+ * Remount as initiated by pvfs2-client-core on restart. This is used to
+ * repopulate mount information left from previous pvfs2-client-core.
+ *
+ * the idea here is that given a valid superblock, we're
+ * re-initializing the user space client with the initial mount
+ * information specified when the super block was first initialized.
+ * this is very different than the first initialization/creation of a
+ * superblock. we use the special service_priority_operation to make
+ * sure that the mount gets ahead of any other pending operation that
+ * is waiting for servicing. this means that the pvfs2-client won't
+ * fail to start several times for all other pending operations before
+ * the client regains all of the mount information from us.
+ * NOTE: this function assumes that the request_mutex is already acquired!
+ */
+int pvfs2_remount(struct super_block *sb)
+{
+	struct pvfs2_kernel_op *new_op;
+	int ret = -EINVAL;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_remount: called\n");
+
+	new_op = op_alloc(PVFS2_VFS_OP_FS_MOUNT);
+	if (!new_op)
+		return -ENOMEM;
+	strncpy(new_op->upcall.req.fs_mount.pvfs2_config_server,
+		PVFS2_SB(sb)->devname,
+		PVFS_MAX_SERVER_ADDR_LEN);
+	/*
+	 * strncpy() does not terminate the destination when the source
+	 * fills it, and the string is printed with %s just below.
+	 */
+	new_op->upcall.req.fs_mount.pvfs2_config_server[
+		PVFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Attempting PVFS2 Remount via host %s\n",
+		     new_op->upcall.req.fs_mount.pvfs2_config_server);
+
+	/*
+	 * we assume that the calling function has already acquired the
+	 * request_mutex to prevent other operations from bypassing
+	 * this one
+	 */
+	ret = service_operation(new_op, "pvfs2_remount",
+		PVFS2_OP_PRIORITY | PVFS2_OP_NO_SEMAPHORE);
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_remount: mount got return value of %d\n",
+		     ret);
+	if (ret == 0) {
+		/*
+		 * store the id assigned to this sb -- it's just a
+		 * short-lived mapping that the system interface uses
+		 * to map this superblock to a particular mount entry
+		 */
+		PVFS2_SB(sb)->id = new_op->downcall.resp.fs_mount.id;
+		PVFS2_SB(sb)->mount_pending = 0;
+	}
+
+	op_release(new_op);
+	return ret;
+}
+
+/* Placeholder: nothing to set up, always returns 0. */
+int fsid_key_table_initialize(void)
+{
+	return 0;
+}
+
+/* Placeholder: nothing to tear down. */
+void fsid_key_table_finalize(void)
+{
+}
+
+/* Called whenever the VFS dirties the inode in response to atime updates */
+static void pvfs2_dirty_inode(struct inode *inode, int flags)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_dirty_inode: %pU\n",
+		     get_khandle_from_ino(inode));
+	SetAtimeFlag(pvfs2_inode);
+}
+
+/* superblock operations installed on every pvfs2 sb by pvfs2_fill_sb() */
+struct super_operations pvfs2_s_ops = {
+	.alloc_inode = pvfs2_alloc_inode,
+	.destroy_inode = pvfs2_destroy_inode,
+	.dirty_inode = pvfs2_dirty_inode,
+	.drop_inode = generic_delete_inode,
+	.statfs = pvfs2_statfs,
+	.remount_fs = pvfs2_remount_fs,
+	.show_options = generic_show_options,
+};
+
+/*
+ * export_operations.fh_to_dentry: rebuild an object reference from a file
+ * handle produced by pvfs2_encode_fh() and look up its dentry.
+ *
+ * The first 16 bytes of fid->raw hold the khandle and raw[4] the fs_id.
+ * Returns NULL for handles shorter than 5 words or with a type above 2,
+ * otherwise the result of d_obtain_alias() on the looked-up inode.
+ */
+struct dentry *pvfs2_fh_to_dentry(struct super_block *sb,
+				  struct fid *fid,
+				  int fh_len,
+				  int fh_type)
+{
+	PVFS_object_kref refn;
+
+	if (fh_len < 5 || fh_type > 2)
+		return NULL;
+
+	PVFS_khandle_from(&(refn.khandle), fid->raw, 16);
+	refn.fs_id = (u32) fid->raw[4];
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "fh_to_dentry: handle %pU, fs_id %d\n",
+		     &refn.khandle,
+		     refn.fs_id);
+
+	return d_obtain_alias(pvfs2_iget(sb, &refn));
+}
+
+/*
+ * export_operations.encode_fh: write a file handle for inode into fh.
+ *
+ * Layout in 32-bit words: 0-3 khandle, 4 fs_id and, when parent is given,
+ * 5-8 the parent's khandle and 9 the parent's fs_id.
+ *
+ * Returns the handle type: 1 without parent, 2 with parent, 255 when
+ * *max_len is too small.  In every case *max_len is set to the number of
+ * words the handle needs (5 or 10).
+ */
+int pvfs2_encode_fh(struct inode *inode,
+		    __u32 *fh,
+		    int *max_len,
+		    struct inode *parent)
+{
+	int len = parent ? 10 : 5;
+	int type = 1;
+	PVFS_object_kref refn;
+
+	if (*max_len < len) {
+		gossip_lerr("fh buffer is too small for encoding\n");
+		*max_len = len;
+		type = 255;
+		goto out;
+	}
+
+	refn = PVFS2_I(inode)->refn;
+	PVFS_khandle_to(&refn.khandle, fh, 16);
+	fh[4] = refn.fs_id;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Encoding fh: handle %pU, fsid %u\n",
+		     &refn.khandle,
+		     refn.fs_id);
+
+
+	if (parent) {
+		refn = PVFS2_I(parent)->refn;
+		/* byte offset 20 is word 5, right after the inode's fs_id */
+		PVFS_khandle_to(&refn.khandle, (char *) fh + 20, 16);
+		fh[9] = refn.fs_id;
+
+		type = 2;
+		gossip_debug(GOSSIP_SUPER_DEBUG,
+			     "Encoding parent: handle %pU, fsid %u\n",
+			     &refn.khandle,
+			     refn.fs_id);
+	}
+	*max_len = len;
+
+out:
+	return type;
+}
+
+static struct export_operations pvfs2_export_ops = {
+	.encode_fh = pvfs2_encode_fh,
+	.fh_to_dentry = pvfs2_fh_to_dentry,
+};
+
+/*
+ * fill_super callback handed to mount_nodev() by pvfs2_mount().
+ *
+ * data points at the struct pvfs2_mount_sb_info_t prepared by
+ * pvfs2_mount().  Allocates the private sb info, copies the root handle
+ * and ids into it, applies the mount options (if any), wires up the
+ * operation tables and creates the root inode and dentry.
+ *
+ * Returns 0 on success or a negative errno.  The private sb info is not
+ * freed here on failure; pvfs2_kill_sb() frees sb->s_fs_info.
+ */
+int pvfs2_fill_sb(struct super_block *sb, void *data, int silent)
+{
+	int ret = -EINVAL;
+	struct inode *root = NULL;
+	struct dentry *root_dentry = NULL;
+	struct pvfs2_mount_sb_info_t *mount_sb_info =
+		(struct pvfs2_mount_sb_info_t *) data;
+	PVFS_object_kref root_object;
+
+	/* alloc and init our private pvfs2 sb info */
+	sb->s_fs_info =
+		kmalloc(sizeof(struct pvfs2_sb_info_s), PVFS2_GFP_FLAGS);
+	if (!PVFS2_SB(sb))
+		return -ENOMEM;
+	memset(sb->s_fs_info, 0, sizeof(struct pvfs2_sb_info_s));
+	PVFS2_SB(sb)->sb = sb;
+
+	PVFS2_SB(sb)->root_khandle = mount_sb_info->root_khandle;
+	PVFS2_SB(sb)->fs_id = mount_sb_info->fs_id;
+	PVFS2_SB(sb)->id = mount_sb_info->id;
+
+	if (mount_sb_info->data) {
+		ret = parse_mount_options(sb, mount_sb_info->data,
+					  silent);
+		if (ret)
+			return ret;
+	}
+
+	/* Hang the xattr handlers off the superblock */
+	sb->s_xattr = pvfs2_xattr_handlers;
+	sb->s_magic = PVFS2_SUPER_MAGIC;
+	sb->s_op = &pvfs2_s_ops;
+	sb->s_d_op = &pvfs2_dentry_operations;
+
+	sb->s_blocksize = pvfs_bufmap_size_query();
+	sb->s_blocksize_bits = pvfs_bufmap_shift_query();
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+	root_object.khandle = PVFS2_SB(sb)->root_khandle;
+	root_object.fs_id = PVFS2_SB(sb)->fs_id;
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "get inode %pU, fsid %d\n",
+		     &root_object.khandle,
+		     root_object.fs_id);
+
+	root = pvfs2_iget(sb, &root_object);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Allocated root inode [%p] with mode %x\n",
+		     root,
+		     root->i_mode);
+
+	/* allocates and places root dentry in dcache */
+	root_dentry = d_make_root(root);
+	if (!root_dentry) {
+		iput(root);
+		return -ENOMEM;
+	}
+
+	sb->s_export_op = &pvfs2_export_ops;
+	sb->s_root = root_dentry;
+	return 0;
+}
+
+/*
+ * Mount entry point.
+ *
+ * Sends a PVFS2_VFS_OP_FS_MOUNT upcall for devname, then creates the
+ * superblock through mount_nodev()/pvfs2_fill_sb() from the reply and adds
+ * it to the list of known pvfs2 superblocks.
+ *
+ * Returns the root dentry on success.  On failure returns an ERR_PTR()
+ * carrying the error that actually occurred: -EINVAL for a missing devname
+ * or a null fs_id in the reply, -ENOMEM when the op cannot be allocated,
+ * otherwise the error from service_operation() or mount_nodev().
+ */
+struct dentry *pvfs2_mount(struct file_system_type *fst,
+			   int flags,
+			   const char *devname,
+			   void *data)
+{
+	int ret = -EINVAL;
+	struct super_block *sb;
+	struct pvfs2_kernel_op *new_op;
+	struct pvfs2_mount_sb_info_t mount_sb_info;
+	struct dentry *mnt_sb_d = ERR_PTR(-EINVAL);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_mount: called with devname %s\n",
+		     devname);
+
+	if (!devname) {
+		gossip_err("ERROR: device name not specified.\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	new_op = op_alloc(PVFS2_VFS_OP_FS_MOUNT);
+	if (!new_op)
+		return ERR_PTR(-ENOMEM);
+
+	strncpy(new_op->upcall.req.fs_mount.pvfs2_config_server,
+		devname,
+		PVFS_MAX_SERVER_ADDR_LEN);
+	/* strncpy() leaves the buffer unterminated when devname fills it */
+	new_op->upcall.req.fs_mount.pvfs2_config_server[
+		PVFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Attempting PVFS2 Mount via host %s\n",
+		     new_op->upcall.req.fs_mount.pvfs2_config_server);
+
+	ret = service_operation(new_op, "pvfs2_mount", 0);
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_mount: mount got return value of %d\n", ret);
+	if (ret)
+		goto free_op;
+
+	if (new_op->downcall.resp.fs_mount.fs_id == PVFS_FS_ID_NULL) {
+		gossip_err("ERROR: Retrieved null fs_id\n");
+		ret = -EINVAL;
+		goto free_op;
+	}
+
+	/* fill in temporary structure passed to fill_sb method */
+	mount_sb_info.data = data;
+	mount_sb_info.root_khandle =
+		new_op->downcall.resp.fs_mount.root_khandle;
+	mount_sb_info.fs_id = new_op->downcall.resp.fs_mount.fs_id;
+	mount_sb_info.id = new_op->downcall.resp.fs_mount.id;
+
+	/*
+	 * the mount_sb_info structure looks odd, but it's used because
+	 * the private sb info isn't allocated until we call
+	 * pvfs2_fill_sb, yet we have the info we need to fill it with
+	 * here. so we store it temporarily and pass all of the info
+	 * to fill_sb where it's properly copied out
+	 */
+	mnt_sb_d = mount_nodev(fst,
+			       flags,
+			       (void *)&mount_sb_info,
+			       pvfs2_fill_sb);
+	if (IS_ERR(mnt_sb_d)) {
+		/* report the real failure instead of "failed with 0" */
+		ret = PTR_ERR(mnt_sb_d);
+		goto free_op;
+	}
+
+	sb = mnt_sb_d->d_sb;
+
+	/*
+	 * on successful mount, store the devname and data
+	 * used
+	 */
+	strncpy(PVFS2_SB(sb)->devname,
+		devname,
+		PVFS_MAX_SERVER_ADDR_LEN);
+	/* pvfs2_remount() reads devname back as a C string */
+	PVFS2_SB(sb)->devname[PVFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	/* mount_pending must be cleared */
+	PVFS2_SB(sb)->mount_pending = 0;
+
+	/*
+	 * finally, add this sb to our list of known pvfs2
+	 * sb's
+	 */
+	add_pvfs2_sb(sb);
+	op_release(new_op);
+	return mnt_sb_d;
+
+free_op:
+	gossip_err("pvfs2_mount: mount request failed with %d\n", ret);
+	if (ret == -EINVAL) {
+		gossip_err("Ensure that all pvfs2-servers have the same FS configuration files\n");
+		gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n");
+	}
+
+	op_release(new_op);
+
+	/*
+	 * hand the actual error to the VFS; fall back to -EINVAL should
+	 * ret ever be a non-negative "failure" value
+	 */
+	mnt_sb_d = ERR_PTR(ret < 0 ? ret : -EINVAL);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_mount: returning dentry %p\n",
+		     mnt_sb_d);
+	return mnt_sb_d;
+}
+
+/*
+ * kill_sb callback: undo pvfs2_mount() for sb and free the private sb
+ * info allocated by pvfs2_fill_sb().
+ */
+void pvfs2_kill_sb(struct super_block *sb)
+{
+	gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_kill_sb: called\n");
+
+	/*
+	 * issue the unmount to userspace to tell it to remove the
+	 * dynamic mount info it has for this superblock
+	 */
+	pvfs2_unmount_sb(sb);
+
+	/* remove the sb from our list of pvfs2 specific sb's */
+	remove_pvfs2_sb(sb);
+
+	/* provided sb cleanup */
+	kill_anon_super(sb);
+
+	/* free the pvfs2 superblock private data */
+	kfree(PVFS2_SB(sb));
+}
+
+/*
+ * Create the slab cache backing pvfs2_alloc_inode().
+ * Returns 0 on success, -ENOMEM if the cache cannot be created.
+ */
+int pvfs2_inode_cache_initialize(void)
+{
+	pvfs2_inode_cache = kmem_cache_create("pvfs2_inode_cache",
+					      sizeof(struct pvfs2_inode_s),
+					      0,
+					      PVFS2_CACHE_CREATE_FLAGS,
+					      pvfs2_inode_cache_ctor);
+
+	if (!pvfs2_inode_cache) {
+		gossip_err("Cannot create pvfs2_inode_cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/* Destroy the pvfs2 inode cache.  Always returns 0. */
+int pvfs2_inode_cache_finalize(void)
+{
+	kmem_cache_destroy(pvfs2_inode_cache);
+	return 0;
+}
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
new file mode 100644
index 0000000..7fed227
--- /dev/null
+++ b/fs/orangefs/symlink.c
@@ -0,0 +1,30 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * inode_operations.follow_link: point the path walk at the link target
+ * string kept in the pvfs2 inode.  Returns NULL.
+ */
+static void *pvfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *target = PVFS2_I(dentry->d_inode)->link_target;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2: %s called on %s (target is %p)\n",
+		     __func__, (char *)dentry->d_name.name, target);
+
+	nd_set_link(nd, target);
+	return NULL;
+}
+
+struct inode_operations pvfs2_symlink_inode_operations = {
+	.readlink = generic_readlink,
+	.follow_link = pvfs2_follow_link,
+	.setattr = pvfs2_setattr,
+	.getattr = pvfs2_getattr,
+	.listxattr = pvfs2_listxattr,
+	.setxattr = generic_setxattr,
+};

[V2,3/5] Orangefs: hooks and call-outs

Commit Message

Patch