new file mode 100644
@@ -0,0 +1,176 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/fs_struct.h>
+
+/*
+ * Look up the POSIX ACL of the given type for an inode.
+ *
+ * ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT each map to their own PVFS2 extended
+ * attribute key; any other type is rejected with ERR_PTR(-EINVAL).
+ *
+ * Returns the result of posix_acl_from_xattr() when the attribute holds
+ * data, NULL when the lookup reports -ENODATA or -ENOSYS, ERR_PTR(-ENOMEM)
+ * when the scratch buffer cannot be allocated, and ERR_PTR(ret) for every
+ * other lookup result.
+ */
+struct posix_acl *pvfs2_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *result;
+	char *xattr_key = NULL;
+	char *xattr_val = NULL;
+	int len;
+
+	if (type == ACL_TYPE_ACCESS) {
+		xattr_key = PVFS2_XATTR_NAME_ACL_ACCESS;
+	} else if (type == ACL_TYPE_DEFAULT) {
+		xattr_key = PVFS2_XATTR_NAME_ACL_DEFAULT;
+	} else {
+		gossip_err("pvfs2_get_acl: bogus value of type %d\n", type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * A maximum-sized buffer is allocated up front so that a single
+	 * getxattr call suffices; probing the real length first would cost
+	 * an extra network round trip.
+	 */
+	xattr_val = kmalloc(PVFS_MAX_XATTR_VALUELEN, GFP_KERNEL);
+	if (!xattr_val) {
+		gossip_err("pvfs2_get_acl: Could not allocate value ptr\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "inode %pU, key %s, type %d\n",
+		     get_khandle_from_ino(inode),
+		     xattr_key,
+		     type);
+
+	len = pvfs2_inode_getxattr(inode,
+				   "",
+				   xattr_key,
+				   xattr_val,
+				   PVFS_MAX_XATTR_VALUELEN);
+
+	if (len > 0) {
+		/* attribute present: build the in-memory representation */
+		result = posix_acl_from_xattr(&init_user_ns, xattr_val, len);
+	} else if (len == -ENODATA || len == -ENOSYS) {
+		/* no ACL stored (or xattrs unsupported): not an error */
+		result = NULL;
+	} else {
+		gossip_err("inode %pU retrieving acl's failed with error %d\n",
+			   get_khandle_from_ino(inode),
+			   len);
+		result = ERR_PTR(len);
+	}
+
+	kfree(xattr_val);
+	return result;
+}
+
+/*
+ * Store the ACL of the given type on an inode as a PVFS2 extended attribute.
+ *
+ * For ACL_TYPE_ACCESS with a non-NULL acl, posix_acl_equiv_mode() is used to
+ * fold the ACL into the inode mode first; when it returns 0 the ACL pointer
+ * is dropped and no xattr value is written.  A NULL acl results in a
+ * setxattr call with a NULL value and size 0.
+ *
+ * Returns 0 on success (and then updates the cached ACL), -EINVAL for an
+ * unknown type, -ENOMEM if the xattr buffer cannot be allocated, or the
+ * error from posix_acl_equiv_mode(), posix_acl_to_xattr() or
+ * pvfs2_inode_setxattr().
+ */
+int pvfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	const char *key = NULL;
+	void *xattr_buf = NULL;
+	size_t xattr_len = 0;
+	umode_t new_mode;
+	int rc = 0;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		key = PVFS2_XATTR_NAME_ACL_ACCESS;
+		if (!acl)
+			break;
+
+		/* try to express the ACL through the plain mode bits */
+		new_mode = inode->i_mode;
+		rc = posix_acl_equiv_mode(acl, &new_mode);
+		if (rc < 0) {
+			gossip_err("%s: posix_acl_equiv_mode err: %d\n",
+				   __func__,
+				   rc);
+			return rc;
+		}
+
+		if (inode->i_mode != new_mode)
+			SetModeFlag(pvfs2_inode);
+		inode->i_mode = new_mode;
+		mark_inode_dirty_sync(inode);
+
+		/* 0 from posix_acl_equiv_mode(): nothing left to store */
+		if (rc == 0)
+			acl = NULL;
+		break;
+	case ACL_TYPE_DEFAULT:
+		key = PVFS2_XATTR_NAME_ACL_DEFAULT;
+		break;
+	default:
+		gossip_err("%s: invalid type %d!\n", __func__, type);
+		return -EINVAL;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: inode %pU, key %s type %d\n",
+		     __func__, get_khandle_from_ino(inode),
+		     key,
+		     type);
+
+	if (acl) {
+		/* serialise the ACL into its xattr wire form */
+		xattr_len = posix_acl_xattr_size(acl->a_count);
+		xattr_buf = kmalloc(xattr_len, GFP_KERNEL);
+		if (!xattr_buf)
+			return -ENOMEM;
+
+		rc = posix_acl_to_xattr(&init_user_ns, acl, xattr_buf,
+					xattr_len);
+		if (rc < 0)
+			goto out;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: name %s, value %p, size %zd, acl %p\n",
+		     __func__, key, xattr_buf, xattr_len, acl);
+
+	/*
+	 * With a NULL acl the value is NULL and the size is 0; per the
+	 * original author's note that translates to a removexattr, which
+	 * should not complain when the attribute does not exist.
+	 */
+	rc = pvfs2_inode_setxattr(inode, "", key, xattr_buf, xattr_len, 0);
+
+out:
+	kfree(xattr_buf);
+	if (rc == 0)
+		set_cached_acl(inode, type, acl);
+	return rc;
+}
+
+/*
+ * Set up the ACLs of a freshly created inode from its parent directory.
+ *
+ * posix_acl_create() supplies the default and access ACLs plus a possibly
+ * adjusted mode.  The default ACL is stored first; the access ACL is only
+ * stored when that succeeded.  Both ACL references are released here.  If
+ * the computed mode differs from the inode's current mode, the mode is
+ * updated and the inode flushed.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int pvfs2_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	struct posix_acl *dflt;
+	struct posix_acl *access;
+	umode_t mode = inode->i_mode;
+	int rc;
+
+	ClearModeFlag(pvfs2_inode);
+
+	rc = posix_acl_create(dir, &mode, &dflt, &access);
+	if (rc)
+		return rc;
+
+	if (dflt) {
+		rc = pvfs2_set_acl(inode, dflt, ACL_TYPE_DEFAULT);
+		posix_acl_release(dflt);
+	}
+
+	/* skip the access ACL if storing the default ACL already failed */
+	if (access && !rc)
+		rc = pvfs2_set_acl(inode, access, ACL_TYPE_ACCESS);
+	if (access)
+		posix_acl_release(access);
+
+	if (mode == inode->i_mode)
+		return rc;
+
+	/* the mode changed, so force it out with a ->setattr */
+	SetModeFlag(pvfs2_inode);
+	inode->i_mode = mode;
+	pvfs2_flush_inode(inode);
+
+	return rc;
+}
new file mode 100644
@@ -0,0 +1,395 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/* Per-call state describing one decoded readdir response. */
+struct readdir_handle_t {
+ /* readdir buffer index; -1 is used for "no index held" */
+ int buffer_index;
+ /* decoded response: token, entry count and dirent_array */
+ struct pvfs2_readdir_response readdir_response;
+ /* buffer the response was decoded from; released with vfree() */
+ void *dents_buf;
+};
+
+/*
+ * Decode routine needed by kmod to make sense of the shared page for
+ * readdirs.
+ *
+ * Copies the token and entry count from the buffer at @ptr into @readdir,
+ * allocates readdir->dirent_array and fills one element per entry: a string
+ * decoded with dec_string() followed by a 16-byte khandle.  The d_name
+ * pointers are produced by dec_string(); the caller owns dirent_array and
+ * must kfree() it.
+ *
+ * Returns the number of bytes consumed from @ptr, or -ENOMEM if the array
+ * cannot be allocated.
+ *
+ * NOTE(review): nothing here bounds the decode by the size of the buffer;
+ * the caller compares the returned length with the trailer size only after
+ * the fact.
+ */
+static long decode_dirents(char *ptr, struct pvfs2_readdir_response *readdir)
+{
+	int i;
+	struct pvfs2_readdir_response *rd =
+		(struct pvfs2_readdir_response *) ptr;
+	char *buf = ptr;
+	char **pptr = &buf;
+
+	readdir->token = rd->token;
+	readdir->pvfs_dirent_outcount = rd->pvfs_dirent_outcount;
+
+	/*
+	 * The entry count is read straight out of the shared buffer, so use
+	 * kcalloc(): it fails cleanly if count * element size would
+	 * overflow instead of handing back an undersized array.
+	 */
+	readdir->dirent_array = kcalloc(readdir->pvfs_dirent_outcount,
+					sizeof(*readdir->dirent_array),
+					GFP_KERNEL);
+	if (readdir->dirent_array == NULL)
+		return -ENOMEM;
+
+	/* entries start where the dirent_array member sits in the header */
+	*pptr += offsetof(struct pvfs2_readdir_response, dirent_array);
+	for (i = 0; i < readdir->pvfs_dirent_outcount; i++) {
+		dec_string(pptr, &readdir->dirent_array[i].d_name,
+			   &readdir->dirent_array[i].d_length);
+		readdir->dirent_array[i].khandle =
+			*(struct pvfs2_khandle *) *pptr;
+		/* step over the khandle that follows each name */
+		*pptr += 16;
+	}
+	return (unsigned long)*pptr - (unsigned long)ptr;
+}
+
+/*
+ * Initialise a readdir handle from a response buffer.
+ *
+ * Records @buffer_index and @buf in @rhandle and decodes the buffer into
+ * rhandle->readdir_response.  If decoding fails the buffer is vfree()d and
+ * the handle is reset (index -1, dents_buf NULL).
+ *
+ * Returns the number of bytes decoded, -ENOMEM for a NULL buffer, -EINVAL
+ * for a negative buffer index, or the error from decode_dirents().
+ */
+static long readdir_handle_ctor(struct readdir_handle_t *rhandle, void *buf,
+				int buffer_index)
+{
+	long decoded;
+
+	if (!buf) {
+		gossip_err
+		    ("Invalid NULL buffer specified in readdir_handle_ctor\n");
+		return -ENOMEM;
+	}
+	if (buffer_index < 0) {
+		gossip_err
+		    ("Invalid buffer index specified in readdir_handle_ctor\n");
+		return -EINVAL;
+	}
+
+	rhandle->buffer_index = buffer_index;
+	rhandle->dents_buf = buf;
+
+	decoded = decode_dirents(buf, &rhandle->readdir_response);
+	if (decoded >= 0)
+		return decoded;
+
+	/* decode failed: give the buffer back and invalidate the handle */
+	gossip_err("Could not decode readdir from buffer %ld\n", decoded);
+	rhandle->buffer_index = -1;
+	gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", buf);
+	vfree(buf);
+	rhandle->dents_buf = NULL;
+	return decoded;
+}
+
+/*
+ * Tear down a readdir handle: free the decoded dirent array, return the
+ * readdir buffer index to @bufmap if one is held, and vfree() the response
+ * buffer.  Every field is reset afterwards, and a NULL @rhandle is ignored.
+ */
+static void readdir_handle_dtor(struct pvfs2_bufmap *bufmap,
+				struct readdir_handle_t *rhandle)
+{
+	struct pvfs2_readdir_response *resp;
+
+	if (!rhandle)
+		return;
+
+	resp = &rhandle->readdir_response;
+
+	/* kfree() tolerates NULL, so no guard is needed */
+	kfree(resp->dirent_array);
+	resp->dirent_array = NULL;
+
+	if (rhandle->buffer_index >= 0) {
+		readdir_index_put(bufmap, rhandle->buffer_index);
+		rhandle->buffer_index = -1;
+	}
+
+	if (rhandle->dents_buf != NULL) {
+		gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n",
+			     rhandle->dents_buf);
+		vfree(rhandle->dents_buf);
+		rhandle->dents_buf = NULL;
+	}
+}
+
+/*
+ * Read directory entries from an instance of an open directory.
+ *
+ * A readdir upcall is issued with the token saved in file->private_data,
+ * the reply is decoded from the downcall trailer buffer, and then ".", ".."
+ * (when ctx->pos is 0 / 1) and every decoded entry are handed to
+ * dir_emit().  ctx->pos is advanced once per emitted entry and is set to
+ * PVFS_READDIR_END when the server reports the end of the directory.
+ *
+ * \note This routine was converted for the readdir to iterate change
+ *       in "struct file_operations". "converted" mostly amounts to
+ *       changing occurrences of "readdir" and "filldir" in the
+ *       comments to "iterate" and "dir_emit". Also filldir calls
+ *       were changed to dir_emit calls.
+ *
+ * \param file open directory; private_data points at the readdir token
+ * \param ctx  iteration context carrying the position and dir_emit actor
+ *
+ * \retval <0 on error
+ * \retval 0  when ctx->pos is already PVFS_READDIR_END
+ * \retval otherwise the value left in ret by the last service_operation()
+ *         or dir_emit() call
+ *
+ * NOTE(review): every dir_emit() result below is tested with "ret < 0".
+ * In mainline <linux/fs.h> dir_emit() returns bool (true on success), in
+ * which case these tests can never fire, a full user buffer goes
+ * undetected and the rewind logic after the loop is dead code.  Confirm
+ * against the target tree; a correct fix also needs the resume logic
+ * reworked, so it is deliberately not attempted here.
+ */
+static int pvfs2_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct pvfs2_bufmap *bufmap = NULL;
+	int ret = 0;
+	int buffer_index;
+	uint64_t *ptoken = file->private_data;
+	uint64_t pos = 0;
+	ino_t ino = 0;
+	struct dentry *dentry = file->f_path.dentry;
+	struct pvfs2_kernel_op *new_op = NULL;
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(dentry->d_inode);
+	int buffer_full = 0;
+	struct readdir_handle_t rhandle;
+	int i = 0;
+	int len = 0;
+	ino_t current_ino = 0;
+	char *current_entry = NULL;
+	long bytes_decoded;
+
+	gossip_ldebug(GOSSIP_DIR_DEBUG,
+		      "%s: ctx->pos:%lld, token = %llu\n",
+		      __func__,
+		      lld(ctx->pos),
+		      llu(*ptoken));
+
+	pos = (uint64_t) ctx->pos;
+
+	/* are we done? */
+	if (pos == PVFS_READDIR_END) {
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "Skipping to termination path\n");
+		return 0;
+	}
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "pvfs2_readdir called on %s (pos=%llu)\n",
+		     dentry->d_name.name, llu(pos));
+
+	/* start with an empty handle so the destructor is always safe */
+	rhandle.buffer_index = -1;
+	rhandle.dents_buf = NULL;
+	memset(&rhandle.readdir_response, 0, sizeof(rhandle.readdir_response));
+
+	new_op = op_alloc(PVFS2_VFS_OP_READDIR);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->uses_shared_memory = 1;
+	new_op->upcall.req.readdir.refn = pvfs2_inode->refn;
+	new_op->upcall.req.readdir.max_dirent_count = MAX_DIRENT_COUNT_READDIR;
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "%s: upcall.req.readdir.refn.khandle: %pU\n",
+		     __func__,
+		     &new_op->upcall.req.readdir.refn.khandle);
+
+	/*
+	 * NOTE: the position we send to the readdir upcall is out of
+	 * sync with ctx->pos since:
+	 * 1. pvfs2 doesn't include the "." and ".." entries that are
+	 *    added below.
+	 * 2. the introduction of distributed directory logic makes token no
+	 *    longer be related to f_pos and pos. Instead an independent
+	 *    variable is used inside the function and stored in the
+	 *    private_data of the file structure.
+	 */
+	new_op->upcall.req.readdir.token = *ptoken;
+
+get_new_buffer_index:
+	ret = readdir_index_get(&bufmap, &buffer_index);
+	if (ret < 0) {
+		gossip_lerr("pvfs2_readdir: readdir_index_get() failure (%d)\n",
+			    ret);
+		goto out_free_op;
+	}
+	new_op->upcall.req.readdir.buf_index = buffer_index;
+
+	ret = service_operation(new_op,
+				"pvfs2_readdir",
+				get_interruptible_flag(dentry->d_inode));
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "Readdir downcall status is %d.  ret:%d\n",
+		     new_op->downcall.status,
+		     ret);
+
+	if (ret == -EAGAIN && op_state_purged(new_op)) {
+		/*
+		 * readdir shared memory area has been wiped due to
+		 * pvfs2-client-core restarting, so we must get a new
+		 * index into the shared memory.
+		 */
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			"%s: Getting new buffer_index for retry of readdir..\n",
+			 __func__);
+		readdir_index_put(bufmap, buffer_index);
+		goto get_new_buffer_index;
+	}
+
+	if (ret == -EIO && op_state_purged(new_op)) {
+		gossip_err("%s: Client is down. Aborting readdir call.\n",
+			__func__);
+		readdir_index_put(bufmap, buffer_index);
+		goto out_free_op;
+	}
+
+	if (ret < 0 || new_op->downcall.status != 0) {
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "Readdir request failed.  Status:%d\n",
+			     new_op->downcall.status);
+		readdir_index_put(bufmap, buffer_index);
+		if (ret >= 0)
+			ret = new_op->downcall.status;
+		goto out_free_op;
+	}
+
+	bytes_decoded =
+		readdir_handle_ctor(&rhandle,
+				    new_op->downcall.trailer_buf,
+				    buffer_index);
+	if (bytes_decoded < 0) {
+		/*
+		 * Report the decode error itself; ret still holds the
+		 * (successful) service_operation() result at this point.
+		 */
+		gossip_err("pvfs2_readdir: Could not decode trailer buffer into a readdir response %ld\n",
+			bytes_decoded);
+		ret = bytes_decoded;
+		/*
+		 * The constructor reset the handle on failure, so the index
+		 * is returned here and the destructor is skipped.
+		 */
+		readdir_index_put(bufmap, buffer_index);
+		goto out_free_op;
+	}
+
+	if (bytes_decoded != new_op->downcall.trailer_size) {
+		gossip_err("pvfs2_readdir: # bytes decoded (%ld) != trailer size (%ld)\n",
+			bytes_decoded,
+			(long)new_op->downcall.trailer_size);
+		ret = -EINVAL;
+		goto out_destroy_handle;
+	}
+
+	if (pos == 0) {
+		ino = get_ino_from_khandle(dentry->d_inode);
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: calling dir_emit of \".\" with pos = %llu\n",
+			     __func__,
+			     llu(pos));
+		/* NOTE(review): see header note on the dir_emit() result */
+		ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
+		if (ret < 0)
+			goto out_destroy_handle;
+		ctx->pos++;
+		gossip_ldebug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+		pos++;
+	}
+
+	if (pos == 1) {
+		ino = get_parent_ino_from_dentry(dentry);
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: calling dir_emit of \"..\" with pos = %llu\n",
+			     __func__,
+			     llu(pos));
+		ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
+		if (ret < 0)
+			goto out_destroy_handle;
+		ctx->pos++;
+		gossip_ldebug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+		pos++;
+	}
+
+	for (i = 0; i < rhandle.readdir_response.pvfs_dirent_outcount; i++) {
+		len = rhandle.readdir_response.dirent_array[i].d_length;
+		current_entry = rhandle.readdir_response.dirent_array[i].d_name;
+		current_ino = pvfs2_khandle_to_ino(
+			&(rhandle.readdir_response.dirent_array[i].khandle));
+
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "calling dir_emit for %s with len %d, pos %ld\n",
+			     current_entry,
+			     len,
+			     (unsigned long)pos);
+		ret =
+		    dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
+		if (ret < 0) {
+			gossip_debug(GOSSIP_DIR_DEBUG,
+				     "dir_emit() failed. ret:%d\n",
+				     ret);
+			if (i < 2) {
+				gossip_err("dir_emit failed on one of the first two true PVFS directory entries.\n");
+				gossip_err("Duplicate entries may appear.\n");
+			}
+			buffer_full = 1;
+			break;
+		}
+		ctx->pos++;
+		gossip_ldebug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+
+		pos++;
+	}
+
+	/* this means that all of the dir_emit calls succeeded */
+	if (i == rhandle.readdir_response.pvfs_dirent_outcount) {
+		/* update token */
+		*ptoken = rhandle.readdir_response.token;
+	} else {
+		/* this means a dir_emit call failed */
+		if (rhandle.readdir_response.token == PVFS_READDIR_END) {
+			/*
+			 * If PVFS hit end of directory, then there
+			 * is no way to do math on the token that it
+			 * returned. Instead we go by ctx->pos but
+			 * back up to account for the artificial .
+			 * and .. entries.
+			 */
+			ctx->pos -= 3;
+		} else {
+			/*
+			 * this means a dir_emit call failed. !!! need to set
+			 * back to previous ctx->pos, no middle value allowed
+			 */
+			pos -= (i - 1);
+			ctx->pos -= (i - 1);
+		}
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			"at least one dir_emit call failed. Setting ctx->pos to: %lld\n",
+			lld(ctx->pos));
+	}
+
+	/*
+	 * Did we hit the end of the directory?
+	 */
+	if (rhandle.readdir_response.token == PVFS_READDIR_END &&
+	    !buffer_full) {
+		gossip_debug(GOSSIP_DIR_DEBUG, "End of dir detected; setting ctx->pos to PVFS_READDIR_END.\n");
+		ctx->pos = PVFS_READDIR_END;
+	}
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "pos = %llu, token = %llu"
+		     ", ctx->pos should have been %lld\n",
+		     llu(pos),
+		     llu(*ptoken),
+		     lld(ctx->pos));
+
+out_destroy_handle:
+	readdir_handle_dtor(bufmap, &rhandle);
+out_free_op:
+	op_release(new_op);
+	gossip_debug(GOSSIP_DIR_DEBUG, "pvfs2_readdir returning %d\n", ret);
+	return ret;
+}
+
+/*
+ * Open a directory: allocate the per-open readdir token, hang it off
+ * file->private_data and initialise it to PVFS_READDIR_START.
+ *
+ * Returns 0 on success or -ENOMEM if the token cannot be allocated.
+ */
+static int pvfs2_dir_open(struct inode *inode, struct file *file)
+{
+	uint64_t *token = kmalloc(sizeof(uint64_t), GFP_KERNEL);
+
+	file->private_data = token;
+	if (token == NULL)
+		return -ENOMEM;
+
+	*token = PVFS_READDIR_START;
+	return 0;
+}
+
+/*
+ * Release an open directory: flush the inode, then free the readdir token
+ * that pvfs2_dir_open() stored in file->private_data.  Always returns 0.
+ */
+static int pvfs2_dir_release(struct inode *inode, struct file *file)
+{
+	void *token = file->private_data;
+
+	pvfs2_flush_inode(inode);
+	kfree(token);
+	return 0;
+}
+
+/** PVFS2 implementation of VFS directory operations */
+const struct file_operations pvfs2_dir_operations = {
+ .read = generic_read_dir,
+ /* directory listing entry point */
+ .iterate = pvfs2_readdir,
+ /* allocates the per-open readdir token */
+ .open = pvfs2_dir_open,
+ /* flushes the inode and frees the readdir token */
+ .release = pvfs2_dir_release,
+};
new file mode 100644
@@ -0,0 +1,468 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS inode operations.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * Fill one page-cache page with file data.
+ *
+ * Pages whose index lies within the file (judged by i_size) are read with
+ * pvfs2_inode_read(); whatever part of the page was not filled is zeroed.
+ * On a read error the page is zeroed and flagged with SetPageError();
+ * otherwise it is marked up to date.  The page is always unlocked before
+ * returning.
+ *
+ * Returns 0 on success or the negative value from pvfs2_inode_read().
+ */
+static int read_one_page(struct page *page)
+{
+	void *page_data;
+	int ret;
+	pgoff_t max_block;
+	ssize_t bytes_read = 0;
+	struct inode *inode = page->mapping->host;
+	const uint32_t blocksize = PAGE_CACHE_SIZE;	/* inode->i_blksize */
+	const uint32_t blockbits = PAGE_CACHE_SHIFT;	/* inode->i_blkbits */
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		    "pvfs2_readpage called with page %p\n",
+		     page);
+	page_data = pvfs2_kmap(page);
+
+	/*
+	 * Number of page-sized blocks covered by i_size, plus one.  Keep it
+	 * in pgoff_t (the type of page->index) and shift instead of divide:
+	 * an int would truncate for very large files, and the shift avoids
+	 * a 64-bit division.
+	 */
+	max_block = (pgoff_t)((inode->i_size >> blockbits) + 1);
+
+	if (page->index < max_block) {
+		loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
+
+		bytes_read = pvfs2_inode_read(inode,
+					      page_data,
+					      blocksize,
+					      &blockptr_offset,
+					      inode->i_size);
+	}
+	/* only zero remaining unread portions of the page data */
+	if (bytes_read > 0)
+		memset(page_data + bytes_read, 0, blocksize - bytes_read);
+	else
+		memset(page_data, 0, blocksize);
+	/* takes care of potential aliasing */
+	flush_dcache_page(page);
+	if (bytes_read < 0) {
+		ret = bytes_read;
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+		ret = 0;
+	}
+	pvfs2_kunmap(page);
+	/* unlock the page after the ->readpage() routine completes */
+	unlock_page(page);
+	return ret;
+}
+
+/*
+ * ->readpage hook: the file argument is unused, the work is done by
+ * read_one_page().
+ */
+static int pvfs2_readpage(struct file *file, struct page *page)
+{
+	int rc = read_one_page(page);
+
+	return rc;
+}
+
+/*
+ * ->readpages hook: take each page off the tail of @pages, insert it into
+ * the page cache of @mapping and, when the insert succeeds, read it with
+ * read_one_page().
+ *
+ * The reference that came with the list entry is dropped for every page,
+ * whether or not the insert succeeded: once the page is in the page cache
+ * the cache holds its own reference, so keeping ours would leak the page.
+ *
+ * Always returns 0; per-page read errors are only reported via debug
+ * output (read_one_page() flags the page itself).
+ *
+ * NOTE(review): other readpages implementations insert with
+ * add_to_page_cache_lru(); confirm whether these pages should go on the
+ * LRU as well.
+ */
+static int pvfs2_readpages(struct file *file,
+			   struct address_space *mapping,
+			   struct list_head *pages,
+			   unsigned nr_pages)
+{
+	unsigned page_idx;
+	int ret;
+
+	gossip_debug(GOSSIP_INODE_DEBUG, "pvfs2_readpages called\n");
+
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page;
+
+		page = list_entry(pages->prev, struct page, lru);
+		list_del(&page->lru);
+		/* add_to_page_cache() returns 0 when the page was inserted */
+		if (!add_to_page_cache(page,
+				       mapping,
+				       page->index,
+				       GFP_KERNEL)) {
+			ret = read_one_page(page);
+			gossip_debug(GOSSIP_INODE_DEBUG,
+				"page added to cache, read_one_page returned: %d\n",
+				ret);
+		}
+		/* drop the list's reference in both cases */
+		page_cache_release(page);
+	}
+	BUG_ON(!list_empty(pages));
+	return 0;
+}
+
+/*
+ * ->invalidatepage hook: clear the page's up-to-date and mapped-to-disk
+ * flags.  @length is unused; @offset is only logged.
+ */
+static void pvfs2_invalidatepage(struct page *page,
+				 unsigned int offset,
+				 unsigned int length)
+{
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_invalidatepage called on page %p "
+		     "(offset is %u)\n",
+		     page,
+		     offset);
+
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+}
+
+/*
+ * ->releasepage hook: does nothing beyond logging and always returns 0.
+ * The gfp argument is unused.
+ */
+static int pvfs2_releasepage(struct page *page, gfp_t foo)
+{
+	const int rc = 0;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_releasepage called on page %p\n",
+		     page);
+	return rc;
+}
+
+/*
+ * Having a direct_IO entry point in the address_space_operations
+ * struct causes the kernel to allow us to use O_DIRECT on
+ * open. Nothing will ever call this thing, but in the future we
+ * will need to be able to use O_DIRECT on open in order to support
+ * AIO. Modeled after NFS, which does this too.
+ */
+/*
+static ssize_t pvfs2_direct_IO(int rw,
+ struct kiocb *iocb,
+ struct iov_iter *iter,
+ loff_t offset)
+{
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "pvfs2_direct_IO: %s\n",
+ iocb->ki_filp->f_path.dentry->d_name.name);
+
+ return -EINVAL;
+}
+*/
+
+/* Backing device info that pvfs2_init_iops() installs on inode mappings. */
+struct backing_dev_info pvfs2_backing_dev_info = {
+ .name = "pvfs2",
+ /* read-ahead window of zero pages */
+ .ra_pages = 0,
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+/** PVFS2 implementation of address space operations */
+const struct address_space_operations pvfs2_address_operations = {
+ /* single-page read */
+ .readpage = pvfs2_readpage,
+ /* multi-page read; loops over read_one_page() */
+ .readpages = pvfs2_readpages,
+ .invalidatepage = pvfs2_invalidatepage,
+ .releasepage = pvfs2_releasepage,
+/* direct_IO is intentionally left unset; the stub is commented out */
+/* .direct_IO = pvfs2_direct_IO */
+};
+
+/*
+ * Apply a size change to an inode: update the in-core size with
+ * truncate_setsize(), then send a truncate upcall carrying the new size.
+ *
+ * When the size really changed and the caller did not already request a
+ * ctime/mtime update, both timestamps are added to @iattr so that a later
+ * setattr step picks them up.
+ *
+ * Returns 0 on success, -ENOMEM if no op can be allocated, or the result
+ * of service_operation().
+ */
+static int pvfs2_setattr_size(struct inode *inode, struct iattr *iattr)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	struct pvfs2_kernel_op *op;
+	const loff_t old_size = i_size_read(inode);
+	int rc;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
+		     __func__,
+		     get_khandle_from_ino(inode),
+		     &pvfs2_inode->refn.khandle,
+		     pvfs2_inode->refn.fs_id,
+		     iattr->ia_size);
+
+	truncate_setsize(inode, iattr->ia_size);
+
+	op = op_alloc(PVFS2_VFS_OP_TRUNCATE);
+	if (op == NULL)
+		return -ENOMEM;
+
+	op->upcall.req.truncate.refn = pvfs2_inode->refn;
+	op->upcall.req.truncate.size = (int64_t) iattr->ia_size;
+
+	rc = service_operation(op, __func__,
+			       get_interruptible_flag(inode));
+
+	/*
+	 * A truncate carries no downcall payload; the status alone says
+	 * whether it went through.
+	 */
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2: pvfs2_truncate got return value of %d\n",
+		     rc);
+
+	op_release(op);
+
+	if (rc != 0)
+		return rc;
+
+	/*
+	 * Only touch c/mtime if the size changed and the caller did not
+	 * set the times explicitly.  This covers the plain truncate() case,
+	 * where the VFS leaves ATTR_CTIME and ATTR_MTIME unset but the
+	 * times still have to move; for all other operations the VFS sets
+	 * the flags itself when it wants a timestamp update.
+	 */
+	if (old_size == i_size_read(inode))
+		return rc;
+	if (iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))
+		return rc;
+
+	iattr->ia_mtime = current_fs_time(inode->i_sb);
+	iattr->ia_ctime = iattr->ia_mtime;
+	iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+
+	return rc;
+}
+
+/*
+ * Change attributes of the object referenced by dentry.
+ *
+ * The request is validated with inode_change_ok(), a size change (if any)
+ * is applied through pvfs2_setattr_size(), the attributes are copied into
+ * the inode and sent on with pvfs2_inode_setattr(), and for a mode change
+ * the ACLs are updated via posix_acl_chmod().
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int pvfs2_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_setattr: called on %s\n",
+		     dentry->d_name.name);
+
+	rc = inode_change_ok(inode, iattr);
+	if (rc)
+		goto out;
+
+	if (iattr->ia_valid & ATTR_SIZE) {
+		/* nothing to truncate when the size is unchanged */
+		if (iattr->ia_size != i_size_read(inode)) {
+			rc = pvfs2_setattr_size(inode, iattr);
+			if (rc)
+				goto out;
+		}
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	rc = pvfs2_inode_setattr(inode, iattr);
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_setattr: inode_setattr returned %d\n",
+		     rc);
+
+	/* a chmod on a file that has ACLs must update them as well */
+	if (rc == 0 && (iattr->ia_valid & ATTR_MODE))
+		rc = posix_acl_chmod(inode, inode->i_mode);
+
+out:
+	gossip_debug(GOSSIP_INODE_DEBUG, "pvfs2_setattr: returning %d\n", rc);
+	return rc;
+}
+
+/*
+ * Obtain attributes of an object given a dentry.
+ *
+ * All attributes are refreshed with pvfs2_inode_getattr() before @kstat is
+ * filled in; the block size reported to stat is taken from the PVFS2 inode.
+ * If the refresh fails the inode is marked bad.
+ *
+ * Returns 0 on success or the error from pvfs2_inode_getattr().
+ */
+int pvfs2_getattr(struct vfsmount *mnt,
+		  struct dentry *dentry,
+		  struct kstat *kstat)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_getattr: called on %s\n",
+		     dentry->d_name.name);
+
+	/* a getattr expects every attribute of the inode to be refreshed */
+	rc = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	if (rc != 0) {
+		/* assume an I/O error and flag inode as bad */
+		gossip_debug(GOSSIP_INODE_DEBUG,
+			     "%s:%s:%d calling make bad inode\n",
+			     __FILE__,
+			     __func__,
+			     __LINE__);
+		pvfs2_make_bad_inode(inode);
+		return rc;
+	}
+
+	generic_fillattr(inode, kstat);
+	/* override block size reported to stat */
+	kstat->blksize = PVFS2_I(inode)->blksize;
+	return rc;
+}
+
+/* PVFS2 implementation of VFS inode operations for files */
+struct inode_operations pvfs2_file_inode_operations = {
+ /* ACL hooks */
+ .get_acl = pvfs2_get_acl,
+ .set_acl = pvfs2_set_acl,
+ /* attribute hooks */
+ .setattr = pvfs2_setattr,
+ .getattr = pvfs2_getattr,
+ /* xattr hooks: generic helpers except for listxattr */
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = pvfs2_listxattr,
+ .removexattr = generic_removexattr,
+};
+
+/*
+ * Install the address-space, inode and file operation tables that match
+ * the inode's file type.  Regular files, symlinks and directories are
+ * handled; any other type is rejected.
+ *
+ * Returns 0 on success or -EINVAL for an unsupported mode.
+ */
+static int pvfs2_init_iops(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	const umode_t file_type = inode->i_mode & S_IFMT;
+
+	mapping->a_ops = &pvfs2_address_operations;
+	mapping->backing_dev_info = &pvfs2_backing_dev_info;
+
+	switch (file_type) {
+	case S_IFREG:
+		inode->i_op = &pvfs2_file_inode_operations;
+		inode->i_fop = &pvfs2_file_operations;
+		inode->i_blkbits = PAGE_CACHE_SHIFT;
+		return 0;
+	case S_IFLNK:
+		/* symlinks only get inode operations */
+		inode->i_op = &pvfs2_symlink_inode_operations;
+		return 0;
+	case S_IFDIR:
+		inode->i_op = &pvfs2_dir_inode_operations;
+		inode->i_fop = &pvfs2_dir_operations;
+		return 0;
+	default:
+		gossip_debug(GOSSIP_INODE_DEBUG,
+			     "%s: unsupported mode\n",
+			     __func__);
+		return -EINVAL;
+	}
+}
+
+/*
+ * Turn a PVFS2 object reference (fsid, handle) into the ino_t value that
+ * is used as the hash index when the handle is looked up in the VFS inode
+ * hash table.  A NULL reference hashes to 0.
+ */
+static inline ino_t pvfs2_handle_hash(PVFS_object_kref *ref)
+{
+	return ref ? pvfs2_khandle_to_ino(&(ref->khandle)) : 0;
+}
+
+/*
+ * Called to set up an inode from iget5_locked: copies the fs_id and
+ * khandle of the object reference passed in @data into the PVFS2 part of
+ * the inode.  Missing arguments are silently ignored; always returns 0.
+ */
+static int pvfs2_set_inode(struct inode *inode, void *data)
+{
+	PVFS_object_kref *ref = (PVFS_object_kref *) data;
+	struct pvfs2_inode_s *pvfs2_inode = NULL;
+
+	if (data && inode) {
+		pvfs2_inode = PVFS2_I(inode);
+		if (pvfs2_inode) {
+			pvfs2_inode->refn.fs_id = ref->fs_id;
+			pvfs2_inode->refn.khandle = ref->khandle;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Called to determine if handles match: returns nonzero only when both the
+ * khandle and the fs_id of the inode equal those of the reference passed
+ * in @data.
+ */
+static int pvfs2_test_inode(struct inode *inode, void *data)
+{
+	PVFS_object_kref *ref = (PVFS_object_kref *) data;
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+
+	/* PVFS_khandle_cmp() yields 0 for equal handles */
+	if (PVFS_khandle_cmp(&(pvfs2_inode->refn.khandle), &(ref->khandle)))
+		return 0;
+
+	return pvfs2_inode->refn.fs_id == ref->fs_id;
+}
+
+/*
+ * Front-end to lookup the inode-cache maintained by the VFS using the PVFS2
+ * file handle.
+ *
+ * @sb: the file system super block instance.
+ * @ref: The PVFS2 object for which we are trying to locate an inode structure.
+ *
+ * An inode that is already cached is returned as is.  A new inode has its
+ * attributes fetched, its operation tables installed and is then unlocked.
+ *
+ * Returns the inode, whatever iget5_locked() returned when that is NULL,
+ * or an ERR_PTR() if fetching the attributes failed.
+ */
+struct inode *pvfs2_iget(struct super_block *sb, PVFS_object_kref *ref)
+{
+	struct inode *inode;
+	unsigned long hash = pvfs2_handle_hash(ref);
+	int rc;
+
+	inode = iget5_locked(sb, hash, pvfs2_test_inode, pvfs2_set_inode, ref);
+	if (!inode)
+		return inode;
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	rc = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	if (rc) {
+		iget_failed(inode);
+		return ERR_PTR(rc);
+	}
+
+	inode->i_ino = hash;	/* needed for stat etc */
+	pvfs2_init_iops(inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "iget handle %pU, fsid %d hash %ld i_ino %lu\n",
+		     &ref->khandle,
+		     ref->fs_id,
+		     hash,
+		     inode->i_ino);
+
+	return inode;
+}
+
+/*
+ * Allocate an inode for a newly created file and insert it into the inode
+ * hash.
+ *
+ * @sb:   super block the inode belongs to
+ * @dir:  parent directory, used to initialise the ACLs
+ * @mode: mode (including the file type) to give the new inode
+ * @dev:  device number stored in i_rdev
+ * @ref:  PVFS2 object reference of the newly created object
+ *
+ * The inode is returned still locked from insert_inode_locked4(); the
+ * caller is expected to unlock it.
+ *
+ * Returns the new inode or an ERR_PTR() on failure.  Allocation failure is
+ * reported as ERR_PTR(-ENOMEM) rather than NULL so that callers which test
+ * the result with IS_ERR() only, as pvfs2_create() does, cannot end up
+ * dereferencing a NULL inode.
+ */
+struct inode *pvfs2_new_inode(struct super_block *sb, struct inode *dir,
+		int mode, dev_t dev, PVFS_object_kref *ref)
+{
+	unsigned long hash = pvfs2_handle_hash(ref);
+	struct inode *inode;
+	int error;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_get_custom_inode_common: called\n"
+		     "(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
+		     sb,
+		     MAJOR(dev),
+		     MINOR(dev),
+		     mode);
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	pvfs2_set_inode(inode, ref);
+	inode->i_ino = hash;	/* needed for stat etc */
+
+	error = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	if (error)
+		goto out_iput;
+
+	/* ops are chosen from the mode that getattr just filled in */
+	pvfs2_init_iops(inode);
+
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_size = PAGE_CACHE_SIZE;
+	inode->i_rdev = dev;
+
+	error = insert_inode_locked4(inode, hash, pvfs2_test_inode, ref);
+	if (error < 0)
+		goto out_iput;
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "Initializing ACL's for inode %pU\n",
+		     get_khandle_from_ino(inode));
+	/* NOTE(review): the result of pvfs2_init_acl() is ignored here */
+	pvfs2_init_acl(inode, dir);
+	return inode;
+
+out_iput:
+	iput(inode);
+	return ERR_PTR(error);
+}
new file mode 100644
@@ -0,0 +1,473 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS namei operations.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+
+/*
+ * Get a newly allocated inode to go with a negative dentry.
+ *
+ * A PVFS2_VFS_OP_CREATE upcall is serviced first; on success an inode is
+ * built around the returned object reference with pvfs2_new_inode() and
+ * attached to @dentry.  The parent's mtime/ctime are then refreshed.
+ * Returns 0 on success or a negative errno.
+ */
+static int pvfs2_create(struct inode *dir,
+			struct dentry *dentry,
+			umode_t mode,
+			bool exclusive)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op *op;
+	struct inode *child;
+	int rc;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
+
+	op = op_alloc(PVFS2_VFS_OP_CREATE);
+	if (op == NULL)
+		return -ENOMEM;
+
+	/* describe the request: parent, default attributes, entry name */
+	op->upcall.req.create.parent_refn = parent->refn;
+	fill_default_sys_attrs(op->upcall.req.create.attributes,
+			       PVFS_TYPE_METAFILE, mode);
+	strncpy(op->upcall.req.create.d_name,
+		dentry->d_name.name, PVFS2_NAME_LEN);
+
+	rc = service_operation(op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Create Got PVFS2 handle %pU on fsid %d (ret=%d)\n",
+		     &op->downcall.resp.create.refn.khandle,
+		     op->downcall.resp.create.refn.fs_id, rc);
+
+	if (rc < 0) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "%s: failed with error code %d\n",
+			     __func__, rc);
+		goto release;
+	}
+
+	child = pvfs2_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
+				&op->downcall.resp.create.refn);
+	if (IS_ERR(child)) {
+		gossip_err("*** Failed to allocate pvfs2 file inode\n");
+		rc = PTR_ERR(child);
+		goto release;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Assigned file inode new number of %pU\n",
+		     get_khandle_from_ino(child));
+
+	/* bind the inode to the dentry, then drop the new-inode lock */
+	d_instantiate(dentry, child);
+	unlock_new_inode(child);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Inode (Regular File) %pU -> %s\n",
+		     get_khandle_from_ino(child),
+		     dentry->d_name.name);
+
+	/* the directory changed: refresh its times and mark it dirty */
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+	rc = 0;
+release:
+	op_release(op);
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: returning %d\n", __func__, rc);
+	return rc;
+}
+
+/*
+ * Attempt to resolve an object name (dentry->d_name), parent handle, and
+ * fsid into a handle for the object.
+ *
+ * Returns NULL when @dentry itself was hashed (positive, or negative for
+ * -ENOENT), the alias returned by d_splice_alias(), or an ERR_PTR() on
+ * failure: -ENAMETOOLONG for an over-long name, -ENOMEM when no operation
+ * or inode could be allocated, otherwise the error of the upcall or of
+ * pvfs2_iget().
+ */
+static struct dentry *pvfs2_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op *new_op;
+	struct inode *inode;
+	struct dentry *res;
+	int ret = -EINVAL;
+
+	/*
+	 * in theory we could skip a lookup here (if the intent is to
+	 * create) in order to avoid a potentially failed lookup, but
+	 * leaving it in can skip a valid lookup and try to create a file
+	 * that already exists (e.g. the vfs already handles checking for
+	 * -EEXIST on O_EXCL opens, which is broken if we skip this lookup
+	 * in the create path)
+	 */
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n",
+		     __func__, dentry->d_name.name);
+
+	/* the name must fit the upcall buffer including its terminator */
+	if (dentry->d_name.len > (PVFS2_NAME_LEN - 1))
+		return ERR_PTR(-ENAMETOOLONG);
+
+	new_op = op_alloc(PVFS2_VFS_OP_LOOKUP);
+	if (!new_op)
+		return ERR_PTR(-ENOMEM);
+
+	new_op->upcall.req.lookup.sym_follow = flags & LOOKUP_FOLLOW;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     &parent->refn.khandle);
+	new_op->upcall.req.lookup.parent_refn = parent->refn;
+
+	strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: doing lookup on %s under %pU,%d (follow=%s)\n",
+		     __func__,
+		     new_op->upcall.req.lookup.d_name,
+		     &new_op->upcall.req.lookup.parent_refn.khandle,
+		     new_op->upcall.req.lookup.parent_refn.fs_id,
+		     ((new_op->upcall.req.lookup.sym_follow ==
+		       PVFS2_LOOKUP_LINK_FOLLOW) ? "yes" : "no"));
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Lookup Got %pU, fsid %d (ret=%d)\n",
+		     &new_op->downcall.resp.lookup.refn.khandle,
+		     new_op->downcall.resp.lookup.refn.fs_id,
+		     ret);
+
+	if (ret < 0) {
+		if (ret == -ENOENT) {
+			/*
+			 * if no inode was found, add a negative dentry to
+			 * dcache anyway; if we don't, we don't hold expected
+			 * lookup semantics and we most noticeably break
+			 * during directory renames.
+			 *
+			 * however, if the operation failed or exited, do not
+			 * add the dentry (e.g. in the case that a touch is
+			 * issued on a file that already exists that was
+			 * interrupted during this lookup -- no need to add
+			 * another negative dentry for an existing file)
+			 */
+
+			gossip_debug(GOSSIP_NAME_DEBUG,
+				     "pvfs2_lookup: Adding *negative* dentry "
+				     "%p for %s\n",
+				     dentry,
+				     dentry->d_name.name);
+
+			d_add(dentry, NULL);
+			res = NULL;
+			goto out;
+		}
+
+		/* must be a non-recoverable error */
+		res = ERR_PTR(ret);
+		goto out;
+	}
+
+	inode = pvfs2_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+	if (!inode) {
+		/*
+		 * pvfs2_iget() hands back NULL when no inode could be
+		 * obtained.  Report that as an error: passing NULL on would
+		 * dereference it below and cache a negative dentry for an
+		 * object the server just told us exists.
+		 */
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			     "error %ld from iget\n", (long)-ENOMEM);
+		res = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	if (IS_ERR(inode)) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			     "error %ld from iget\n", PTR_ERR(inode));
+		res = ERR_CAST(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s:%s:%d "
+		     "Found good inode [%lu] with count [%d]\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     inode->i_ino,
+		     (int)atomic_read(&inode->i_count));
+
+	/* update dentry/inode pair into dcache */
+	res = d_splice_alias(inode, dentry);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Lookup success (inode ct = %d)\n",
+		     (int)atomic_read(&inode->i_count));
+out:
+	op_release(new_op);
+	return res;
+}
+
+/*
+ * Remove the directory entry named by @dentry from @dir by servicing a
+ * PVFS2_VFS_OP_REMOVE upcall.  On success the link count of the removed
+ * inode is dropped and the parent's mtime/ctime are refreshed.
+ *
+ * return 0 on success; non-zero otherwise
+ */
+static int pvfs2_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct inode *victim = dentry->d_inode;
+	struct pvfs2_kernel_op *op;
+	int rc;
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: called on %s\n"
+		     "  (inode %pU): Parent is %pU | fs_id %d\n",
+		     __func__,
+		     dentry->d_name.name,
+		     get_khandle_from_ino(victim),
+		     &parent->refn.khandle,
+		     parent->refn.fs_id);
+
+	op = op_alloc(PVFS2_VFS_OP_REMOVE);
+	if (op == NULL)
+		return -ENOMEM;
+
+	op->upcall.req.remove.parent_refn = parent->refn;
+	strncpy(op->upcall.req.remove.d_name, dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	rc = service_operation(op, "pvfs2_unlink",
+			       get_interruptible_flag(victim));
+
+	/* when request is serviced properly, free req op struct */
+	op_release(op);
+
+	if (rc)
+		return rc;
+
+	drop_nlink(victim);
+
+	/* the directory changed: refresh its times and mark it dirty */
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+
+	return rc;
+}
+
+/*
+ * pvfs2_link() is only implemented here to make sure that we return a
+ * reasonable error code (the kernel will return a misleading EPERM
+ * otherwise).  PVFS2 does not support hard links.
+ *
+ * All arguments are ignored; the function always returns -EOPNOTSUPP.
+ */
+static int pvfs2_link(struct dentry *old_dentry,
+		      struct inode *dir,
+		      struct dentry *dentry)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * pvfs2_mknod() is only implemented here to make sure that we return a
+ * reasonable error code (the kernel will return a misleading EPERM
+ * otherwise).  PVFS2 does not support special files such as fifos or devices.
+ *
+ * All arguments are ignored; the function always returns -EOPNOTSUPP.
+ */
+static int pvfs2_mknod(struct inode *dir,
+		       struct dentry *dentry,
+		       umode_t mode,
+		       dev_t rdev)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Create a symbolic link named by @dentry in @dir pointing at @symname.
+ *
+ * A PVFS2_VFS_OP_SYMLINK upcall is serviced first; on success an inode is
+ * built around the returned object reference and attached to @dentry, and
+ * the parent's mtime/ctime are refreshed.
+ *
+ * Returns 0 on success, -EINVAL for a NULL target, -ENAMETOOLONG when the
+ * target does not fit the upcall buffer together with its terminator,
+ * -ENOMEM when no operation could be allocated, or the error of the upcall
+ * or of pvfs2_new_inode().
+ */
+static int pvfs2_symlink(struct inode *dir,
+			 struct dentry *dentry,
+			 const char *symname)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op *new_op;
+	struct inode *inode;
+	/* octal: rwxr-xr-x (a decimal 755 would be mode 01363) */
+	int mode = 0755;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
+
+	if (!symname)
+		return -EINVAL;
+
+	/*
+	 * strncpy() below would silently truncate a longer target and
+	 * leave it unterminated, so refuse it up front.
+	 */
+	if (strlen(symname) > (PVFS2_NAME_LEN - 1))
+		return -ENAMETOOLONG;
+
+	new_op = op_alloc(PVFS2_VFS_OP_SYMLINK);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.sym.parent_refn = parent->refn;
+
+	fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
+			       PVFS_TYPE_SYMLINK,
+			       mode);
+
+	strncpy(new_op->upcall.req.sym.entry_name,
+		dentry->d_name.name,
+		PVFS2_NAME_LEN);
+	strncpy(new_op->upcall.req.sym.target, symname, PVFS2_NAME_LEN);
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Symlink Got PVFS2 handle %pU on fsid %d (ret=%d)\n",
+		     &new_op->downcall.resp.sym.refn.khandle,
+		     new_op->downcall.resp.sym.refn.fs_id, ret);
+
+	if (ret < 0) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "%s: failed with error code %d\n",
+			     __func__, ret);
+		goto out;
+	}
+
+	inode = pvfs2_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0,
+				&new_op->downcall.resp.sym.refn);
+	if (IS_ERR(inode)) {
+		gossip_err
+		    ("*** Failed to allocate pvfs2 symlink inode\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Assigned symlink inode new number of %pU\n",
+		     get_khandle_from_ino(inode));
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Inode (Symlink) %pU -> %s\n",
+		     get_khandle_from_ino(inode),
+		     dentry->d_name.name);
+
+	/* the directory changed: refresh its times and mark it dirty */
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+	ret = 0;
+out:
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * Create a directory named by @dentry in @dir.
+ *
+ * A PVFS2_VFS_OP_MKDIR upcall is serviced first; on success an inode is
+ * built around the returned object reference and attached to @dentry, and
+ * the parent's mtime/ctime are refreshed.  Returns the result of the
+ * upcall, -ENOMEM when no operation could be allocated, or the error of
+ * pvfs2_new_inode().
+ */
+static int pvfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op *op;
+	struct inode *child;
+	int rc;
+
+	op = op_alloc(PVFS2_VFS_OP_MKDIR);
+	if (op == NULL)
+		return -ENOMEM;
+
+	/* describe the request: parent, default attributes, entry name */
+	op->upcall.req.mkdir.parent_refn = parent->refn;
+	fill_default_sys_attrs(op->upcall.req.mkdir.attributes,
+			       PVFS_TYPE_DIRECTORY, mode);
+	strncpy(op->upcall.req.mkdir.d_name,
+		dentry->d_name.name, PVFS2_NAME_LEN);
+
+	rc = service_operation(op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Mkdir Got PVFS2 handle %pU on fsid %d\n",
+		     &op->downcall.resp.mkdir.refn.khandle,
+		     op->downcall.resp.mkdir.refn.fs_id);
+
+	if (rc < 0) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "%s: failed with error code %d\n",
+			     __func__, rc);
+		goto release;
+	}
+
+	child = pvfs2_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0,
+				&op->downcall.resp.mkdir.refn);
+	if (IS_ERR(child)) {
+		gossip_err("*** Failed to allocate pvfs2 dir inode\n");
+		rc = PTR_ERR(child);
+		goto release;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Assigned dir inode new number of %pU\n",
+		     get_khandle_from_ino(child));
+
+	/* bind the inode to the dentry, then drop the new-inode lock */
+	d_instantiate(dentry, child);
+	unlock_new_inode(child);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Inode (Directory) %pU -> %s\n",
+		     get_khandle_from_ino(child),
+		     dentry->d_name.name);
+
+	/*
+	 * NOTE: we have no good way to keep nlink consistent for directories
+	 * across clients; keep constant at 1.
+	 */
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+release:
+	op_release(op);
+	return rc;
+}
+
+/*
+ * Rename @old_dentry in @old_dir to @new_dentry in @new_dir by servicing a
+ * PVFS2_VFS_OP_RENAME upcall.
+ *
+ * Returns the result of the upcall, or -ENOMEM when no operation could be
+ * allocated (the same code every other operation in this file uses for
+ * that failure).  When @new_dentry already has an inode, its ctime is set
+ * to the current time regardless of the upcall result.
+ */
+static int pvfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	struct pvfs2_kernel_op *new_op;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "pvfs2_rename: called (%s/%s => %s/%s) ct=%d\n",
+		     old_dentry->d_parent->d_name.name,
+		     old_dentry->d_name.name,
+		     new_dentry->d_parent->d_name.name,
+		     new_dentry->d_name.name,
+		     d_count(new_dentry));
+
+	new_op = op_alloc(PVFS2_VFS_OP_RENAME);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.rename.old_parent_refn = PVFS2_I(old_dir)->refn;
+	new_op->upcall.req.rename.new_parent_refn = PVFS2_I(new_dir)->refn;
+
+	strncpy(new_op->upcall.req.rename.d_old_name,
+		old_dentry->d_name.name,
+		PVFS2_NAME_LEN);
+	strncpy(new_op->upcall.req.rename.d_new_name,
+		new_dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	ret = service_operation(new_op,
+				"pvfs2_rename",
+				get_interruptible_flag(old_dentry->d_inode));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "pvfs2_rename: got downcall status %d\n",
+		     ret);
+
+	if (new_dentry->d_inode)
+		new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * PVFS2 implementation of VFS inode operations for directories.
+ *
+ * link and mknod are wired to stubs that only return -EOPNOTSUPP, and rmdir
+ * shares pvfs2_unlink(), which issues the same PVFS2_VFS_OP_REMOVE upcall
+ * for every kind of entry.
+ */
+struct inode_operations pvfs2_dir_inode_operations = {
+	.lookup = pvfs2_lookup,
+	.get_acl = pvfs2_get_acl,
+	.set_acl = pvfs2_set_acl,
+	.create = pvfs2_create,
+	.link = pvfs2_link,		/* always -EOPNOTSUPP */
+	.unlink = pvfs2_unlink,
+	.symlink = pvfs2_symlink,
+	.mkdir = pvfs2_mkdir,
+	.rmdir = pvfs2_unlink,		/* same handler as unlink */
+	.mknod = pvfs2_mknod,		/* always -EOPNOTSUPP */
+	.rename = pvfs2_rename,
+	.setattr = pvfs2_setattr,
+	.getattr = pvfs2_getattr,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = pvfs2_listxattr,
+};
new file mode 100644
@@ -0,0 +1,914 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-dev-proto.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * Return the filesystem id carried by the upcall request of @op.
+ *
+ * The field that holds the id depends on the upcall type.  PVFS_FS_ID_NULL
+ * is returned for a NULL @op and for upcall types not listed below.
+ */
+int32_t fsid_of_op(struct pvfs2_kernel_op *op)
+{
+	if (!op)
+		return PVFS_FS_ID_NULL;
+
+	switch (op->upcall.type) {
+	case PVFS2_VFS_OP_FILE_IO:
+		return op->upcall.req.io.refn.fs_id;
+	case PVFS2_VFS_OP_LOOKUP:
+		return op->upcall.req.lookup.parent_refn.fs_id;
+	case PVFS2_VFS_OP_CREATE:
+		return op->upcall.req.create.parent_refn.fs_id;
+	case PVFS2_VFS_OP_GETATTR:
+		return op->upcall.req.getattr.refn.fs_id;
+	case PVFS2_VFS_OP_REMOVE:
+		return op->upcall.req.remove.parent_refn.fs_id;
+	case PVFS2_VFS_OP_MKDIR:
+		return op->upcall.req.mkdir.parent_refn.fs_id;
+	case PVFS2_VFS_OP_READDIR:
+		return op->upcall.req.readdir.refn.fs_id;
+	case PVFS2_VFS_OP_SETATTR:
+		return op->upcall.req.setattr.refn.fs_id;
+	case PVFS2_VFS_OP_SYMLINK:
+		return op->upcall.req.sym.parent_refn.fs_id;
+	case PVFS2_VFS_OP_RENAME:
+		return op->upcall.req.rename.old_parent_refn.fs_id;
+	case PVFS2_VFS_OP_STATFS:
+		return op->upcall.req.statfs.fs_id;
+	case PVFS2_VFS_OP_TRUNCATE:
+		return op->upcall.req.truncate.refn.fs_id;
+	case PVFS2_VFS_OP_MMAP_RA_FLUSH:
+		return op->upcall.req.ra_cache_flush.refn.fs_id;
+	case PVFS2_VFS_OP_FS_UMOUNT:
+		return op->upcall.req.fs_umount.fs_id;
+	case PVFS2_VFS_OP_GETXATTR:
+		return op->upcall.req.getxattr.refn.fs_id;
+	case PVFS2_VFS_OP_SETXATTR:
+		return op->upcall.req.setxattr.refn.fs_id;
+	case PVFS2_VFS_OP_LISTXATTR:
+		return op->upcall.req.listxattr.refn.fs_id;
+	case PVFS2_VFS_OP_REMOVEXATTR:
+		return op->upcall.req.removexattr.refn.fs_id;
+	case PVFS2_VFS_OP_FSYNC:
+		return op->upcall.req.fsync.refn.fs_id;
+	default:
+		return PVFS_FS_ID_NULL;
+	}
+}
+
+/* Set @vfs_flag in inode->i_flags when @on is non-zero, clear it otherwise. */
+static void pvfs2_apply_inode_flag(struct inode *inode, int on,
+				   unsigned int vfs_flag)
+{
+	if (on)
+		inode->i_flags |= vfs_flag;
+	else
+		inode->i_flags &= ~vfs_flag;
+}
+
+/*
+ * Mirror the immutable, append-only and noatime bits of attrs->flags into
+ * the S_IMMUTABLE, S_APPEND and S_NOATIME bits of inode->i_flags.
+ */
+static void pvfs2_set_inode_flags(struct inode *inode,
+				  struct PVFS_sys_attr_s *attrs)
+{
+	pvfs2_apply_inode_flag(inode,
+			       (attrs->flags & PVFS_IMMUTABLE_FL) != 0,
+			       S_IMMUTABLE);
+	pvfs2_apply_inode_flag(inode,
+			       (attrs->flags & PVFS_APPEND_FL) != 0,
+			       S_APPEND);
+	pvfs2_apply_inode_flag(inode,
+			       (attrs->flags & PVFS_NOATIME_FL) != 0,
+			       S_NOATIME);
+}
+
+/*
+ * Fill the VFS inode from the PVFS2 attributes in @attrs: size and block
+ * accounting, owner and group, timestamps (whole seconds only), permission
+ * bits and the file type.
+ *
+ * NOTE: symname is ignored unless the inode is a sym link
+ *
+ * Returns 0 when attrs->objtype is a metafile, directory or symlink;
+ * otherwise an error is logged and -1 is returned (the fields set before
+ * the type check stay modified).
+ */
+static int copy_attributes_to_inode(struct inode *inode,
+				    struct PVFS_sys_attr_s *attrs,
+				    char *symname)
+{
+	int ret = -1;
+	int perm_mode = 0;
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	loff_t inode_size = 0;
+	loff_t rounded_up_size = 0;
+
+
+	/*
+	 * arbitrarily set the inode block size; FIXME: we need to
+	 * resolve the difference between the reported inode blocksize
+	 * and the PAGE_CACHE_SIZE, since our block count will always
+	 * be wrong.
+	 *
+	 * For now, we're setting the block count to be the proper
+	 * number assuming the block size is 512 bytes, and the size is
+	 * rounded up to the nearest 4K.  This is apparently required
+	 * to get proper size reports from the 'du' shell utility.
+	 *
+	 * changing the inode->i_blkbits to something other than
+	 * PAGE_CACHE_SHIFT breaks mmap/execution as we depend on that.
+	 */
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "attrs->mask = %x (objtype = %s)\n",
+		     attrs->mask,
+		     attrs->objtype == PVFS_TYPE_METAFILE ? "file" :
+		     attrs->objtype == PVFS_TYPE_DIRECTORY ? "directory" :
+		     attrs->objtype == PVFS_TYPE_SYMLINK ? "symlink" :
+			"invalid/unknown");
+
+	/* first pass over the type: size and block accounting */
+	switch (attrs->objtype) {
+	case PVFS_TYPE_METAFILE:
+		pvfs2_set_inode_flags(inode, attrs);
+		if (attrs->mask & PVFS_ATTR_SYS_SIZE) {
+			inode_size = (loff_t) attrs->size;
+			/*
+			 * NOTE(review): a size that is already a multiple
+			 * of 4096 (including 0) still gets a full extra
+			 * 4096 added here -- confirm this is intended.
+			 */
+			rounded_up_size =
+			    (inode_size + (4096 - (inode_size % 4096)));
+
+			pvfs2_lock_inode(inode);
+			inode->i_bytes = inode_size;
+			inode->i_blocks =
+			    (unsigned long)(rounded_up_size / 512);
+			pvfs2_unlock_inode(inode);
+
+			/*
+			 * NOTE: make sure all the places we're called
+			 * from have the inode->i_sem lock. We're fine
+			 * in 99% of the cases since we're mostly
+			 * called from a lookup.
+			 */
+			inode->i_size = inode_size;
+		}
+		break;
+	case PVFS_TYPE_SYMLINK:
+		/* a symlink's size is the length of its target */
+		if (symname != NULL) {
+			inode->i_size = (loff_t) strlen(symname);
+			break;
+		}
+		/*FALLTHRU*/
+	default:
+		/* every other object reports one page worth of data */
+		pvfs2_lock_inode(inode);
+		inode->i_bytes = PAGE_CACHE_SIZE;
+		inode->i_blocks = (unsigned long)(PAGE_CACHE_SIZE / 512);
+		pvfs2_unlock_inode(inode);
+
+		inode->i_size = PAGE_CACHE_SIZE;
+		break;
+	}
+
+	/* owner and group ids are interpreted in the initial user namespace */
+	inode->i_uid = make_kuid(&init_user_ns, attrs->owner);
+	inode->i_gid = make_kgid(&init_user_ns, attrs->group);
+	inode->i_atime.tv_sec = (time_t) attrs->atime;
+	inode->i_mtime.tv_sec = (time_t) attrs->mtime;
+	inode->i_ctime.tv_sec = (time_t) attrs->ctime;
+	inode->i_atime.tv_nsec = 0;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_ctime.tv_nsec = 0;
+
+	/* translate the PVFS2 permission bits one by one into VFS mode bits */
+	if (attrs->perms & PVFS_O_EXECUTE)
+		perm_mode |= S_IXOTH;
+	if (attrs->perms & PVFS_O_WRITE)
+		perm_mode |= S_IWOTH;
+	if (attrs->perms & PVFS_O_READ)
+		perm_mode |= S_IROTH;
+
+	if (attrs->perms & PVFS_G_EXECUTE)
+		perm_mode |= S_IXGRP;
+	if (attrs->perms & PVFS_G_WRITE)
+		perm_mode |= S_IWGRP;
+	if (attrs->perms & PVFS_G_READ)
+		perm_mode |= S_IRGRP;
+
+	if (attrs->perms & PVFS_U_EXECUTE)
+		perm_mode |= S_IXUSR;
+	if (attrs->perms & PVFS_U_WRITE)
+		perm_mode |= S_IWUSR;
+	if (attrs->perms & PVFS_U_READ)
+		perm_mode |= S_IRUSR;
+
+	if (attrs->perms & PVFS_G_SGID)
+		perm_mode |= S_ISGID;
+	if (attrs->perms & PVFS_U_SUID)
+		perm_mode |= S_ISUID;
+
+	inode->i_mode = perm_mode;
+
+	if (is_root_handle(inode)) {
+		/* special case: mark the root inode as sticky */
+		inode->i_mode |= S_ISVTX;
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "Marking inode %pU as sticky\n",
+			     get_khandle_from_ino(inode));
+	}
+
+	/* second pass over the type: file type bits and the return value */
+	switch (attrs->objtype) {
+	case PVFS_TYPE_METAFILE:
+		inode->i_mode |= S_IFREG;
+		ret = 0;
+		break;
+	case PVFS_TYPE_DIRECTORY:
+		inode->i_mode |= S_IFDIR;
+		/* NOTE: we have no good way to keep nlink consistent
+		 * for directories across clients; keep constant at 1.
+		 * Why 1?  If we go with 2, then find(1) gets confused
+		 * and won't work properly without the -noleaf option
+		 */
+		set_nlink(inode, 1);
+		ret = 0;
+		break;
+	case PVFS_TYPE_SYMLINK:
+		inode->i_mode |= S_IFLNK;
+
+		/* copy link target to inode private data */
+		if (pvfs2_inode && symname) {
+			strncpy(pvfs2_inode->link_target,
+				symname,
+				PVFS_NAME_MAX);
+			gossip_debug(GOSSIP_UTILS_DEBUG,
+				     "Copied attr link target %s\n",
+				     pvfs2_inode->link_target);
+		}
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "symlink mode %o\n",
+			     inode->i_mode);
+		ret = 0;
+		break;
+	default:
+		/* unknown type: ret keeps its initial -1 */
+		gossip_err("pvfs2: copy_attributes_to_inode: got invalid attribute type %x\n",
+			attrs->objtype);
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2: copy_attributes_to_inode: setting i_mode to %o, i_size to %lu\n",
+		     inode->i_mode,
+		     (unsigned long)i_size_read(inode));
+
+	return ret;
+}
+
+/*
+ * Translate the valid fields of @iattr into the PVFS2 attribute structure
+ * @attrs and set attrs->mask accordingly.
+ *
+ * NOTE: in kernel land, we never use the sys_attr->link_target for
+ * anything, so don't bother copying it into the sys_attr object here.
+ *
+ * Returns 0 on success.  Returns -EINVAL when any argument is NULL, when
+ * the sticky bit is requested on anything but the root handle, or when the
+ * setuid bit is requested.
+ */
+static inline int copy_attributes_from_inode(struct inode *inode,
+					     struct PVFS_sys_attr_s *attrs,
+					     struct iattr *iattr)
+{
+	umode_t tmp_mode;
+
+	if (!iattr || !inode || !attrs) {
+		gossip_err("NULL iattr (%p), inode (%p), attrs (%p) "
+			   "in copy_attributes_from_inode!\n",
+			   iattr,
+			   inode,
+			   attrs);
+		return -EINVAL;
+	}
+	/*
+	 * We need to be careful to only copy the attributes out of the
+	 * iattr object that we know are valid.
+	 */
+	attrs->mask = 0;
+	/*
+	 * Ids are converted relative to the initial user namespace, the
+	 * same namespace copy_attributes_to_inode() uses when it turns the
+	 * stored owner/group back into kuid/kgid.  Mapping through the
+	 * caller's namespace here would store ids that read back as
+	 * different users.
+	 */
+	if (iattr->ia_valid & ATTR_UID) {
+		attrs->owner = from_kuid(&init_user_ns, iattr->ia_uid);
+		attrs->mask |= PVFS_ATTR_SYS_UID;
+		gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner);
+	}
+	if (iattr->ia_valid & ATTR_GID) {
+		attrs->group = from_kgid(&init_user_ns, iattr->ia_gid);
+		attrs->mask |= PVFS_ATTR_SYS_GID;
+		gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group);
+	}
+
+	/* a time value is only passed along when the *_SET flag is present */
+	if (iattr->ia_valid & ATTR_ATIME) {
+		attrs->mask |= PVFS_ATTR_SYS_ATIME;
+		if (iattr->ia_valid & ATTR_ATIME_SET) {
+			attrs->atime =
+			    pvfs2_convert_time_field((void *)&iattr->ia_atime);
+			attrs->mask |= PVFS_ATTR_SYS_ATIME_SET;
+		}
+	}
+	if (iattr->ia_valid & ATTR_MTIME) {
+		attrs->mask |= PVFS_ATTR_SYS_MTIME;
+		if (iattr->ia_valid & ATTR_MTIME_SET) {
+			attrs->mtime =
+			    pvfs2_convert_time_field((void *)&iattr->ia_mtime);
+			attrs->mask |= PVFS_ATTR_SYS_MTIME_SET;
+		}
+	}
+	if (iattr->ia_valid & ATTR_CTIME)
+		attrs->mask |= PVFS_ATTR_SYS_CTIME;
+
+	/*
+	 * PVFS2 cannot set size with a setattr operation.  Probably not likely
+	 * to be requested through the VFS, but just in case, don't worry about
+	 * ATTR_SIZE
+	 */
+
+	if (iattr->ia_valid & ATTR_MODE) {
+		tmp_mode = iattr->ia_mode;
+		if (tmp_mode & (S_ISVTX)) {
+			if (is_root_handle(inode)) {
+				/*
+				 * allow sticky bit to be set on root (since
+				 * it shows up that way by default anyhow),
+				 * but don't show it to the server
+				 */
+				tmp_mode -= S_ISVTX;
+			} else {
+				gossip_debug(GOSSIP_UTILS_DEBUG,
+					     "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
+				return -EINVAL;
+			}
+		}
+
+		if (tmp_mode & (S_ISUID)) {
+			gossip_debug(GOSSIP_UTILS_DEBUG,
+				     "Attempting to set setuid bit (not supported); returning EINVAL.\n");
+			return -EINVAL;
+		}
+
+		attrs->perms = PVFS_util_translate_mode(tmp_mode);
+		attrs->mask |= PVFS_ATTR_SYS_PERM;
+	}
+
+	return 0;
+}
+
+/*
+ * issues a pvfs2 getattr request and fills in the appropriate inode
+ * attributes if successful.  returns 0 on success; -errno otherwise
+ *
+ * -ENOMEM is returned when no operation could be allocated and -ENOENT
+ * when the returned attributes could not be copied into the inode.
+ */
+int pvfs2_inode_getattr(struct inode *inode, uint32_t getattr_mask)
+{
+	struct pvfs2_inode_s *pinfo = PVFS2_I(inode);
+	struct PVFS_sys_attr_s *attrs;
+	struct pvfs2_kernel_op *op;
+	int rc = -EINVAL;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "%s: called on inode %pU\n",
+		     __func__,
+		     get_khandle_from_ino(inode));
+
+	op = op_alloc(PVFS2_VFS_OP_GETATTR);
+	if (op == NULL)
+		return -ENOMEM;
+
+	op->upcall.req.getattr.refn = pinfo->refn;
+	op->upcall.req.getattr.mask = getattr_mask;
+
+	rc = service_operation(op, __func__, get_interruptible_flag(inode));
+	if (rc != 0)
+		goto done;
+
+	attrs = &op->downcall.resp.getattr.attributes;
+	if (copy_attributes_to_inode(inode, attrs,
+				     op->downcall.resp.getattr.link_target)) {
+		gossip_err("%s: failed to copy attributes\n", __func__);
+		rc = -ENOENT;
+		goto done;
+	}
+
+	/*
+	 * Store blksize in pvfs2 specific part of inode structure; we are
+	 * only going to use this to report to stat to make sure it doesn't
+	 * perturb any inode related code paths.
+	 */
+	if (attrs->objtype == PVFS_TYPE_METAFILE)
+		pinfo->blksize = attrs->blksize;
+	else
+		/* mimic behavior of generic_fillattr() for other types. */
+		pinfo->blksize = (1 << inode->i_blkbits);
+
+done:
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Getattr on handle %pU, "
+		     "fsid %d\n  (inode ct = %d) returned %d\n",
+		     &pinfo->refn.khandle,
+		     pinfo->refn.fs_id,
+		     (int)atomic_read(&inode->i_count),
+		     rc);
+
+	op_release(op);
+	return rc;
+}
+
+/*
+ * issues a pvfs2 setattr request to make sure the new attribute values
+ * take effect if successful.  returns 0 on success; -errno otherwise
+ *
+ * The request is not sent at all when @iattr cannot be translated by
+ * copy_attributes_from_inode(); that error is returned instead.
+ */
+int pvfs2_inode_setattr(struct inode *inode, struct iattr *iattr)
+{
+	struct pvfs2_inode_s *pinfo = PVFS2_I(inode);
+	struct pvfs2_kernel_op *op;
+	int rc;
+
+	op = op_alloc(PVFS2_VFS_OP_SETATTR);
+	if (op == NULL)
+		return -ENOMEM;
+
+	op->upcall.req.setattr.refn = pinfo->refn;
+	rc = copy_attributes_from_inode(inode,
+					&op->upcall.req.setattr.attributes,
+					iattr);
+	if (rc >= 0) {
+		rc = service_operation(op, __func__,
+				       get_interruptible_flag(inode));
+
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "pvfs2_inode_setattr: returning %d\n",
+			     rc);
+	}
+
+	/* the op is released on both the translated and rejected paths */
+	op_release(op);
+
+	/*
+	 * successful setattr should clear the atime, mtime and
+	 * ctime flags.
+	 */
+	if (rc == 0) {
+		ClearAtimeFlag(pinfo);
+		ClearMtimeFlag(pinfo);
+		ClearCtimeFlag(pinfo);
+		ClearModeFlag(pinfo);
+	}
+
+	return rc;
+}
+
+/*
+ * If it is a dirty inode, this function gets called.
+ * Gather all the information that needs to be setattr'ed
+ * Right now, this will only be used for mode, atime, mtime
+ * and/or ctime.
+ *
+ * Returns 0 when none of the pending flags was set, otherwise the result
+ * of pvfs2_inode_setattr().
+ */
+int pvfs2_flush_inode(struct inode *inode)
+{
+	struct pvfs2_inode_s *pinfo = PVFS2_I(inode);
+	struct iattr wbattr;
+	int want_mtime;
+	int want_ctime;
+	int want_atime;
+	int want_mode;
+
+	memset(&wbattr, 0, sizeof(wbattr));
+
+	/*
+	 * check inode flags up front, and clear them if they are set.  This
+	 * will prevent multiple processes from all trying to flush the same
+	 * inode if they call close() simultaneously
+	 */
+	want_mtime = MtimeFlag(pinfo);
+	ClearMtimeFlag(pinfo);
+	want_ctime = CtimeFlag(pinfo);
+	ClearCtimeFlag(pinfo);
+	want_atime = AtimeFlag(pinfo);
+	ClearAtimeFlag(pinfo);
+	want_mode = ModeFlag(pinfo);
+	ClearModeFlag(pinfo);
+
+	/*  -- Lazy atime,mtime and ctime update --
+	 * Note: all times are dictated by server in the new scheme
+	 * and not by the clients
+	 *
+	 * Also mode updates are being handled now..
+	 */
+	wbattr.ia_valid |= want_mtime ? ATTR_MTIME : 0;
+	wbattr.ia_valid |= want_ctime ? ATTR_CTIME : 0;
+	wbattr.ia_valid |= want_atime ? ATTR_ATIME : 0;
+
+	if (want_mode) {
+		wbattr.ia_mode = inode->i_mode;
+		wbattr.ia_valid |= ATTR_MODE;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "*********** pvfs2_flush_inode: %pU "
+		     "(ia_valid %d)\n",
+		     get_khandle_from_ino(inode),
+		     wbattr.ia_valid);
+
+	/* nothing was pending: no request needs to be sent */
+	if (!wbattr.ia_valid) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "pvfs2_flush_inode skipping setattr()\n");
+		return 0;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_flush_inode (%pU) writing mode %o\n",
+		     get_khandle_from_ino(inode),
+		     inode->i_mode);
+
+	return pvfs2_inode_setattr(inode, &wbattr);
+}
+
+/*
+ * Ask the client core to unmount the filesystem behind @sb by servicing a
+ * PVFS2_VFS_OP_FS_UMOUNT upcall carrying the superblock's id, fs_id and
+ * device name.
+ *
+ * On success mount_pending is set to 1 in the pvfs2 superblock info.
+ * Returns 0 on success, -ENOMEM when no operation could be allocated, or
+ * the error reported by the upcall.
+ */
+int pvfs2_unmount_sb(struct super_block *sb)
+{
+	int ret = -EINVAL;
+	struct pvfs2_kernel_op *new_op = NULL;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_unmount_sb called on sb %p\n",
+		     sb);
+
+	new_op = op_alloc(PVFS2_VFS_OP_FS_UMOUNT);
+	if (!new_op)
+		return -ENOMEM;
+	new_op->upcall.req.fs_umount.id = PVFS2_SB(sb)->id;
+	new_op->upcall.req.fs_umount.fs_id = PVFS2_SB(sb)->fs_id;
+	strncpy(new_op->upcall.req.fs_umount.pvfs2_config_server,
+		PVFS2_SB(sb)->devname,
+		PVFS_MAX_SERVER_ADDR_LEN);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Attempting PVFS2 Unmount via host %s\n",
+		     new_op->upcall.req.fs_umount.pvfs2_config_server);
+
+	ret = service_operation(new_op, "pvfs2_fs_umount", 0);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_unmount: got return value of %d\n", ret);
+	/*
+	 * The failure is reported through the return value alone; storing
+	 * an ERR_PTR() in the local @sb parameter had no effect on the
+	 * caller.
+	 */
+	if (!ret)
+		PVFS2_SB(sb)->mount_pending = 1;
+
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * Ask the client core to cancel the in-flight operation identified by
+ * @tag by servicing a PVFS2_VFS_OP_CANCEL upcall.  Returns the result of
+ * that upcall, or -ENOMEM when no operation could be allocated.
+ *
+ * NOTE: on successful cancellation, be sure to return -EINTR, as
+ * that's the return value the caller expects
+ */
+int pvfs2_cancel_op_in_progress(uint64_t tag)
+{
+	struct pvfs2_kernel_op *cancel_op;
+	int rc;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_cancel_op_in_progress called on tag %llu\n",
+		     llu(tag));
+
+	cancel_op = op_alloc(PVFS2_VFS_OP_CANCEL);
+	if (cancel_op == NULL)
+		return -ENOMEM;
+
+	cancel_op->upcall.req.cancel.op_tag = tag;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Attempting PVFS2 operation cancellation of tag %llu\n",
+		     llu(cancel_op->upcall.req.cancel.op_tag));
+
+	rc = service_operation(cancel_op, "pvfs2_cancel",
+			       PVFS2_OP_CANCELLATION);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "pvfs2_cancel_op_in_progress: got return value of %d\n",
+		     rc);
+
+	op_release(cancel_op);
+	return rc;
+}
+
+/*
+ * Reset @op to its pristine state while holding op->lock: completion flag
+ * cleared, upcall and downcall types invalid, downcall status -1, state
+ * unknown and tag 0.  A NULL @op is ignored.
+ */
+void pvfs2_op_initialize(struct pvfs2_kernel_op *op)
+{
+	if (!op)
+		return;
+
+	spin_lock(&op->lock);
+
+	op->io_completed = 0;
+
+	op->upcall.type = PVFS2_VFS_OP_INVALID;
+	op->downcall.type = PVFS2_VFS_OP_INVALID;
+	op->downcall.status = -1;
+
+	op->op_state = OP_VFS_STATE_UNKNOWN;
+	op->tag = 0;
+
+	spin_unlock(&op->lock);
+}
+
+/*
+ * Mark @inode bad with make_bad_inode(), except when it is the root handle,
+ * which is only logged and otherwise left untouched.
+ */
+void pvfs2_make_bad_inode(struct inode *inode)
+{
+	if (!is_root_handle(inode)) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "*** making bad inode %pU\n",
+			     get_khandle_from_ino(inode));
+		make_bad_inode(inode);
+		return;
+	}
+
+	/*
+	 * if this occurs, the pvfs2-client-core was killed but we
+	 * can't afford to lose the inode operations and such
+	 * associated with the root handle in any case.
+	 */
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "*** NOT making bad root inode %pU\n",
+		     get_khandle_from_ino(inode));
+}
+
+/*
+ * Block every signal of the current task except a small allowed set, saving
+ * the previous mask in @orig_sigset so that unmask_blocked_signals() can
+ * restore it.
+ *
+ * SIGKILL is always allowed; SIGINT and SIGQUIT are allowed only while
+ * their handler is SIG_DFL.  Signals that were already blocked stay
+ * blocked.
+ *
+ * this code is based on linux/net/sunrpc/clnt.c:rpc_clnt_sigmask
+ */
+void mask_blocked_signals(sigset_t *orig_sigset)
+{
+	unsigned long sigallow = sigmask(SIGKILL);
+	unsigned long irqflags = 0;
+	struct k_sigaction *action = pvfs2_current_sigaction;
+
+	sigallow |= ((action[SIGINT - 1].sa.sa_handler == SIG_DFL) ?
+		     sigmask(SIGINT) :
+		     0);
+	sigallow |= ((action[SIGQUIT - 1].sa.sa_handler == SIG_DFL) ?
+		     sigmask(SIGQUIT) :
+		     0);
+
+	spin_lock_irqsave(&pvfs2_current_signal_lock, irqflags);
+	*orig_sigset = current->blocked;
+	/* block everything except the allowed, not previously blocked set */
+	siginitsetinv(&current->blocked, sigallow & ~orig_sigset->sig[0]);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&pvfs2_current_signal_lock, irqflags);
+}
+
+/*
+ * Restore the blocked-signal mask of the current task from @orig_sigset
+ * and recompute the pending state, all under pvfs2_current_signal_lock.
+ *
+ * this code is based on linux/net/sunrpc/clnt.c:rpc_clnt_sigunmask
+ */
+void unmask_blocked_signals(sigset_t *orig_sigset)
+{
+	unsigned long saved_irq = 0;
+
+	spin_lock_irqsave(&pvfs2_current_signal_lock, saved_irq);
+
+	current->blocked = *orig_sigset;
+	recalc_sigpending();
+
+	spin_unlock_irqrestore(&pvfs2_current_signal_lock, saved_irq);
+}
+
+/*
+ * Return the tv_sec member of the struct timespec that @time_ptr points
+ * to, widened to 64 bits.  The nanosecond part is not used.
+ */
+uint64_t pvfs2_convert_time_field(void *time_ptr)
+{
+	const struct timespec *ts = (struct timespec *)time_ptr;
+	time_t seconds = (time_t) ts->tv_sec;
+
+	return (uint64_t) seconds;
+}
+
+/* macro defined in include/pvfs2-types.h */
+DECLARE_ERRNO_MAPPING_AND_FN();
+
+/*
+ * Convert an error code that may be in pvfs2 format into a negative errno.
+ *
+ * A positive code is logged and treated as its negation.  A pvfs2 code
+ * without errno equivalent becomes -ETIMEDOUT for a cancellation and
+ * -EINVAL otherwise; any other pvfs2 code is mapped with
+ * PVFS_ERROR_TO_ERRNO().  Every other value is returned as is.
+ */
+int pvfs2_normalize_to_errno(int32_t error_code)
+{
+	if (error_code > 0) {
+		gossip_err("pvfs2: error status receieved.\n");
+		gossip_err("pvfs2: assuming error code is inverted.\n");
+		error_code = -error_code;
+	}
+
+	/* convert any error codes that are in pvfs2 format */
+	if (IS_PVFS_NON_ERRNO_ERROR(-error_code)) {
+		/*
+		 * cancellation error codes generally correspond to
+		 * a timeout from the client's perspective
+		 */
+		if (PVFS_NON_ERRNO_ERROR_CODE(-error_code) == PVFS_ECANCEL)
+			return -ETIMEDOUT;
+
+		/* assume a default error code */
+		gossip_err("pvfs2: warning: got error code without errno equivalent: %d.\n",
+			   error_code);
+		return -EINVAL;
+	}
+
+	if (IS_PVFS_ERROR(-error_code))
+		return -PVFS_ERROR_TO_ERRNO(-error_code);
+
+	return error_code;
+}
+
+#define NUM_MODES 11
+/*
+ * Translate the permission, setgid and setuid bits of a VFS @mode into the
+ * matching PVFS2 permission bits.  Bits of @mode that are not listed in the
+ * table are ignored.
+ */
+int32_t PVFS_util_translate_mode(int mode)
+{
+	/* one row per supported bit: VFS bit and its PVFS2 counterpart */
+	static const struct {
+		int vfs_bit;
+		int pvfs2_bit;
+	} mode_map[NUM_MODES] = {
+		{ S_IXOTH, PVFS_O_EXECUTE },
+		{ S_IWOTH, PVFS_O_WRITE },
+		{ S_IROTH, PVFS_O_READ },
+		{ S_IXGRP, PVFS_G_EXECUTE },
+		{ S_IWGRP, PVFS_G_WRITE },
+		{ S_IRGRP, PVFS_G_READ },
+		{ S_IXUSR, PVFS_U_EXECUTE },
+		{ S_IWUSR, PVFS_U_WRITE },
+		{ S_IRUSR, PVFS_U_READ },
+		{ S_ISGID, PVFS_G_SGID },
+		{ S_ISUID, PVFS_U_SUID },
+	};
+	int translated = 0;
+	int idx;
+
+	for (idx = 0; idx < NUM_MODES; idx++) {
+		if (mode & mode_map[idx].vfs_bit)
+			translated |= mode_map[idx].pvfs2_bit;
+	}
+
+	return translated;
+}
+#undef NUM_MODES
+
+/*
+ * Minimal strtok() replacement.
+ *
+ * Pass the string to split in @s on the first call and NULL afterwards; the
+ * scan position is kept in a static variable between calls.  Each call
+ * overwrites the next character found in @toks with a terminator and
+ * returns the text in front of it, which is an empty string when two
+ * delimiters are adjacent.  NULL is returned once nothing is left.
+ */
+static char *pvfs2_strtok(char *s, const char *toks)
+{
+	/* scan position carried over from the previous call */
+	static char *cursor;
+	/* number of delimiter characters */
+	const uint32_t ndelims = strlen(toks);
+	/* start of the token handed out by this call */
+	char *token;
+	uint32_t d;
+
+	/* when s has a value, we are using a new input string */
+	if (s)
+		cursor = s;
+
+	token = cursor;
+
+	/* walk forward until a delimiter or the end of the string */
+	while (*cursor) {
+		for (d = 0; d < ndelims; d++) {
+			if (*cursor != toks[d])
+				continue;
+
+			/* delimiter found => end-of-word */
+			*cursor++ = 0;
+			return token;
+		}
+		cursor++;
+	}
+
+	return *token ? token : NULL;
+}
+
+/*
+ * convert 64-bit debug mask into a readable string of keywords
+ *
+ * @mask_map:     table of keyword/mask pairs
+ * @num_mask_map: number of entries in @mask_map
+ * @mask:         debug mask to describe
+ * @debug_string: output buffer; PVFS2_MAX_DEBUG_STRING_LEN bytes of it are
+ *                cleared first
+ *
+ * The keywords whose mask value is fully contained in @mask are written to
+ * @debug_string separated by commas.  A @mask equal to GOSSIP_NO_DEBUG or
+ * GOSSIP_MAX_DEBUG yields only the keyword of the matching table entry.
+ * Conversion stops quietly at the first table entry whose keyword would no
+ * longer fit.  Always returns 0.
+ */
+static int proc_mask_to_debug(struct __keyword_mask_t *mask_map,
+			      int num_mask_map,
+			      uint64_t mask,
+			      char *debug_string)
+{
+	unsigned int index = 0;
+	int i;
+
+	memset(debug_string, 0, PVFS2_MAX_DEBUG_STRING_LEN);
+
+	for (i = 0; i < num_mask_map; i++) {
+		size_t keyword_len = strlen(mask_map[i].keyword);
+		/* every keyword after the first is preceded by a comma */
+		size_t needed = keyword_len + (index != 0 ? 1 : 0);
+
+		/*
+		 * Keep at least one byte of the cleared buffer untouched so
+		 * the result stays terminated; the separator has to be
+		 * counted as well.
+		 */
+		if ((index + needed) >= PVFS2_MAX_DEBUG_STRING_LEN)
+			return 0;
+
+		switch (mask_map[i].mask_val) {
+		case GOSSIP_NO_DEBUG:
+			if (mask == GOSSIP_NO_DEBUG) {
+				strcpy(debug_string, mask_map[i].keyword);
+				return 0;
+			}
+			break;
+		case GOSSIP_MAX_DEBUG:
+			if (mask == GOSSIP_MAX_DEBUG) {
+				strcpy(debug_string, mask_map[i].keyword);
+				return 0;
+			}
+			break;
+		default:
+			if ((mask & mask_map[i].mask_val) !=
+			    mask_map[i].mask_val)
+				/*mask does NOT contain the mask value */
+				break;
+
+			if (index != 0) {
+				/*
+				 * add comma for second and subsequent mask
+				 * keywords
+				 */
+				(debug_string[index]) = ',';
+				index++;
+			}
+
+			/*add keyword and slide index */
+			memcpy(&debug_string[index],
+			       mask_map[i].keyword,
+			       keyword_len);
+			index += keyword_len;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Convert a list of keywords separated by commas and/or spaces into a
+ * debug mask.
+ *
+ * @mask_map:      table of keyword/mask pairs
+ * @num_mask_map:  number of entries in @mask_map
+ * @event_logging: keyword list; may be NULL
+ *
+ * Keywords are processed in order.  A keyword prefixed with "-" clears its
+ * bits, any other keyword sets them; the prefix applies to that one
+ * keyword only.  Unknown keywords are ignored.
+ *
+ * Returns the computed mask.  0 is returned for a NULL list and when the
+ * working copy of the list cannot be allocated: the return type is
+ * unsigned, so a negative errno would be read by callers as a mask with
+ * nearly every bit set.
+ */
+static uint64_t proc_debug_to_mask(struct __keyword_mask_t *mask_map,
+				   int num_mask_map,
+				   const char *event_logging)
+{
+	uint64_t mask = 0;
+	char *s = NULL;
+	char *t = NULL;
+	const char *toks = ", ";
+	int i = 0;
+	int negate = 0;
+	int slen = 0;
+
+	if (!event_logging)
+		return 0;
+
+	/* pvfs2_strtok() writes into its input, so work on a private copy */
+	slen = strlen(event_logging);
+	s = kmalloc(slen + 1, GFP_KERNEL);
+	if (!s)
+		return 0;
+	memcpy(s, event_logging, slen);
+	s[slen] = '\0';
+
+	for (t = pvfs2_strtok(s, toks); t; t = pvfs2_strtok(NULL, toks)) {
+		/* the "-" prefix must not leak into the following keywords */
+		negate = (*t == '-');
+		if (negate)
+			++t;
+
+		for (i = 0; i < num_mask_map; i++) {
+			if (strcmp(t, mask_map[i].keyword))
+				continue;
+
+			if (negate)
+				mask &= ~mask_map[i].mask_val;
+			else
+				mask |= mask_map[i].mask_val;
+
+			break;
+		}
+	}
+
+	kfree(s);
+	return mask;
+}
+
+/*
+ * Based on human readable keywords, translate them into
+ * a mask value appropriate for the debugging level desired.
+ * The 'computed' mask is returned; 0 if no keywords are
+ * present or recognized.  Unrecognized keywords are ignored when
+ * mixed with recognized keywords.
+ *
+ * Prefix a keyword with "-" to turn it off.  All keywords
+ * processed in specified order.
+ *
+ * This variant looks the keywords up in s_keyword_mask_map.
+ */
+uint64_t PVFS_proc_debug_eventlog_to_mask(const char *event_logging)
+{
+	return proc_debug_to_mask(s_keyword_mask_map,
+				  num_keyword_mask_map,
+				  event_logging);
+}
+
+/*
+ * Translate the keyword list @event_logging into a debug mask, looking the
+ * keywords up in s_kmod_keyword_mask_map.  Returns the computed mask.
+ */
+uint64_t PVFS_proc_kmod_eventlog_to_mask(const char *event_logging)
+{
+	return proc_debug_to_mask(s_kmod_keyword_mask_map,
+				  num_kmod_keyword_mask_map,
+				  event_logging);
+}
+
+/*
+ * Write the keywords of s_kmod_keyword_mask_map that describe @mask into
+ * @debug_string.  Returns the result of proc_mask_to_debug().
+ */
+int PVFS_proc_kmod_mask_to_eventlog(uint64_t mask, char *debug_string)
+{
+	return proc_mask_to_debug(s_kmod_keyword_mask_map,
+				  num_kmod_keyword_mask_map,
+				  mask,
+				  debug_string);
+}
+
+/*
+ * Write the keywords of s_keyword_mask_map that describe @mask into
+ * @debug_string.  Returns the result of proc_mask_to_debug().
+ */
+int PVFS_proc_mask_to_eventlog(uint64_t mask, char *debug_string)
+{
+
+	return proc_mask_to_debug(s_keyword_mask_map,
+				  num_keyword_mask_map,
+				  mask,
+				  debug_string);
+}
new file mode 100644
@@ -0,0 +1,548 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+#include <linux/parser.h>
+
+/* a cache for pvfs2-inode objects (i.e. pvfs2 inode private data);
+ * created by pvfs2_inode_cache_initialize() and destroyed by
+ * pvfs2_inode_cache_finalize()
+ */
+static struct kmem_cache *pvfs2_inode_cache;
+
+/* list for storing pvfs2 specific superblocks in use */
+LIST_HEAD(pvfs2_superblocks);
+
+DEFINE_SPINLOCK(pvfs2_superblocks_lock); /* NOTE(review): presumably guards pvfs2_superblocks -- confirm in add_pvfs2_sb()/remove_pvfs2_sb() */
+
+/* token ids handed back by match_token() in parse_mount_options() */
+enum {
+	Opt_intr,
+	Opt_acl,
+
+	Opt_err
+};
+
+/*
+ * mount option text -> token id, listed in enum order; the entry with
+ * the NULL pattern closes the table
+ */
+static const match_table_t tokens = {
+	{ Opt_intr, "intr" },
+	{ Opt_acl, "acl" },
+	{ Opt_err, NULL }
+};
+
+
+/*
+ * Apply a comma separated mount option string to a superblock.
+ *
+ * MS_POSIXACL and PVFS2_OPT_INTR are cleared first, then "acl" and
+ * "intr" switch them back on.  Empty options are skipped.  The string
+ * is split in place with strsep(), so it is modified.
+ *
+ * Returns 0 on success, or -EINVAL at the first option that is not
+ * supported (logged unless silent is non-zero).
+ */
+static int parse_mount_options(struct super_block *sb, char *options,
+			       int silent)
+{
+	struct pvfs2_sb_info_s *sbi = PVFS2_SB(sb);
+	substring_t args[MAX_OPT_ARGS];
+	char *opt;
+
+	sb->s_flags &= ~MS_POSIXACL;
+	sbi->flags &= ~PVFS2_OPT_INTR;
+
+	for (opt = strsep(&options, ",");
+	     opt != NULL;
+	     opt = strsep(&options, ",")) {
+		int token;
+
+		if (*opt == '\0')
+			continue;
+
+		token = match_token(opt, tokens, args);
+		if (token == Opt_acl) {
+			sb->s_flags |= MS_POSIXACL;
+		} else if (token == Opt_intr) {
+			sbi->flags |= PVFS2_OPT_INTR;
+		} else {
+			if (!silent)
+				gossip_err("Error: mount option [%s] is not supported.\n",
+					   opt);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Constructor passed to kmem_cache_create() for pvfs2_inode_cache:
+ * initializes the embedded VFS inode and the xattr rw-semaphore, and
+ * starts i_version at 1.
+ */
+static void pvfs2_inode_cache_ctor(void *req)
+{
+	struct pvfs2_inode_s *pi = req;
+	struct inode *inode = &pi->vfs_inode;
+
+	inode_init_once(inode);
+	init_rwsem(&pi->xattr_sem);
+
+	inode->i_version = 1;
+}
+
+/*
+ * super_operations.alloc_inode: take a pvfs2_inode_s from
+ * pvfs2_inode_cache and hand back its embedded VFS inode.
+ *
+ * Returns NULL if the cache allocation fails.
+ */
+static struct inode *pvfs2_alloc_inode(struct super_block *sb)
+{
+	struct pvfs2_inode_s *pi;
+	struct inode *inode;
+
+	pi = kmem_cache_alloc(pvfs2_inode_cache, PVFS2_CACHE_ALLOC_FLAGS);
+	if (!pi) {
+		gossip_err("Failed to allocate pvfs2_inode\n");
+		return NULL;
+	}
+	inode = &pi->vfs_inode;
+
+	/*
+	 * Reset every private field; the rw_semaphore and the vfs_inode
+	 * are deliberately left alone.
+	 */
+	pi->pinode_flags = 0;
+	pi->last_failed_block_index_read = 0;
+	pi->refn.fs_id = PVFS_FS_ID_NULL;
+	memset(&pi->refn.khandle, 0, 16);
+	memset(pi->link_target, 0, sizeof(pi->link_target));
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_alloc_inode: allocated %p\n",
+		     inode);
+	return inode;
+}
+
+/*
+ * super_operations.destroy_inode: give the pvfs2_inode_s that embeds
+ * this VFS inode back to pvfs2_inode_cache.
+ */
+static void pvfs2_destroy_inode(struct inode *inode)
+{
+	struct pvfs2_inode_s *pi = PVFS2_I(inode);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "%s: deallocated %p destroying inode %pU\n",
+		     __func__,
+		     pi,
+		     get_khandle_from_ino(inode));
+
+	kmem_cache_free(pvfs2_inode_cache, pi);
+}
+
+/*
+ * super_operations.statfs: fetch file system statistics with a
+ * PVFS2_VFS_OP_STATFS upcall and copy them into *buf.
+ *
+ * NOTE: information filled in here is typically reflected in the
+ * output of the system command 'df'
+ *
+ * Returns the value of service_operation(), or -ENOMEM if no op could
+ * be allocated.  *buf is left untouched when the downcall status is
+ * negative.
+ */
+static int pvfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct pvfs2_kernel_op *op;
+	int op_flags = 0;
+	int rc;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_statfs: called on sb %p (fs_id is %d)\n",
+		     sb,
+		     (int)(PVFS2_SB(sb)->fs_id));
+
+	op = op_alloc(PVFS2_VFS_OP_STATFS);
+	if (!op)
+		return -ENOMEM;
+
+	op->upcall.req.statfs.fs_id = PVFS2_SB(sb)->fs_id;
+
+	/* honour the "intr" mount option */
+	if (PVFS2_SB(sb)->flags & PVFS2_OPT_INTR)
+		op_flags = PVFS2_OP_INTERRUPTIBLE;
+
+	rc = service_operation(op, "pvfs2_statfs", op_flags);
+
+	if (op->downcall.status >= 0) {
+		gossip_debug(GOSSIP_SUPER_DEBUG,
+			     "pvfs2_statfs: got %ld blocks available | "
+			     "%ld blocks total | %ld block size\n",
+			     (long)op->downcall.resp.statfs.blocks_avail,
+			     (long)op->downcall.resp.statfs.blocks_total,
+			     (long)op->downcall.resp.statfs.block_size);
+
+		/* identity and geometry */
+		buf->f_type = sb->s_magic;
+		memcpy(&buf->f_fsid, &PVFS2_SB(sb)->fs_id,
+		       sizeof(buf->f_fsid));
+		buf->f_namelen = PVFS2_NAME_LEN;
+		buf->f_bsize = op->downcall.resp.statfs.block_size;
+		buf->f_frsize = sb->s_blocksize;
+
+		/* block and file counts; free and available are the same */
+		buf->f_blocks =
+			(sector_t) op->downcall.resp.statfs.blocks_total;
+		buf->f_bfree =
+			(sector_t) op->downcall.resp.statfs.blocks_avail;
+		buf->f_bavail =
+			(sector_t) op->downcall.resp.statfs.blocks_avail;
+		buf->f_files =
+			(sector_t) op->downcall.resp.statfs.files_total;
+		buf->f_ffree =
+			(sector_t) op->downcall.resp.statfs.files_avail;
+	}
+
+	op_release(op);
+	gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_statfs: returning %d\n", rc);
+	return rc;
+}
+
+/*
+ * super_operations.remount_fs: remount as initiated by the VFS layer.
+ * Only the mount options are reparsed (silently); pvfs2-client-core
+ * does not need to be told about it.
+ *
+ * Returns whatever parse_mount_options() returns.
+ */
+static int pvfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	int ret;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_remount_fs: called\n");
+
+	ret = parse_mount_options(sb, data, 1);
+	return ret;
+}
+
+/*
+ * Remount as initiated by pvfs2-client-core on restart. This is used to
+ * repopulate mount information left from previous pvfs2-client-core.
+ *
+ * The idea here is that given a valid superblock, we're
+ * re-initializing the user space client with the initial mount
+ * information specified when the super block was first initialized.
+ * This is very different from the first initialization/creation of a
+ * superblock. The operation is serviced with PVFS2_OP_PRIORITY to make
+ * sure that the mount gets ahead of any other pending operation that
+ * is waiting for servicing. This means that the pvfs2-client won't
+ * fail to start several times for all other pending operations before
+ * the client regains all of the mount information from us.
+ * NOTE: this function assumes that the request_mutex is already acquired!
+ *
+ * Returns 0 on success (the superblock's id is refreshed and
+ * mount_pending cleared), -ENOMEM if no op could be allocated, or the
+ * non-zero result of service_operation().
+ */
+int pvfs2_remount(struct super_block *sb)
+{
+	struct pvfs2_kernel_op *new_op;
+	int ret = -EINVAL;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_remount: called\n");
+
+	new_op = op_alloc(PVFS2_VFS_OP_FS_MOUNT);
+	if (!new_op)
+		return -ENOMEM;
+
+	/*
+	 * strncpy() does not terminate the destination when the source
+	 * fills it completely, and the name is printed with %s below,
+	 * so copy one byte less and terminate explicitly.
+	 */
+	strncpy(new_op->upcall.req.fs_mount.pvfs2_config_server,
+		PVFS2_SB(sb)->devname,
+		PVFS_MAX_SERVER_ADDR_LEN - 1);
+	new_op->upcall.req.fs_mount.pvfs2_config_server[
+		PVFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Attempting PVFS2 Remount via host %s\n",
+		     new_op->upcall.req.fs_mount.pvfs2_config_server);
+
+	/*
+	 * we assume that the calling function has already acquired the
+	 * request_mutex to prevent other operations from bypassing
+	 * this one
+	 */
+	ret = service_operation(new_op, "pvfs2_remount",
+				PVFS2_OP_PRIORITY | PVFS2_OP_NO_SEMAPHORE);
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_remount: mount got return value of %d\n",
+		     ret);
+	if (ret == 0) {
+		/*
+		 * store the id assigned to this sb -- it's just a
+		 * short-lived mapping that the system interface uses
+		 * to map this superblock to a particular mount entry
+		 */
+		PVFS2_SB(sb)->id = new_op->downcall.resp.fs_mount.id;
+		PVFS2_SB(sb)->mount_pending = 0;
+	}
+
+	op_release(new_op);
+	return ret;
+}
+
+/* Placeholder: there is nothing to set up, so this always returns 0. */
+int fsid_key_table_initialize(void)
+{
+	const int status = 0;
+
+	return status;
+}
+
+/* Placeholder counterpart of fsid_key_table_initialize(); does nothing. */
+void fsid_key_table_finalize(void)
+{
+}
+
+/*
+ * super_operations.dirty_inode: called whenever the VFS dirties the
+ * inode in response to atime updates.  Sets the atime flag on the
+ * pvfs2 inode; the flags argument is not looked at.
+ */
+static void pvfs2_dirty_inode(struct inode *inode, int flags)
+{
+	struct pvfs2_inode_s *pi = PVFS2_I(inode);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_dirty_inode: %pU\n",
+		     get_khandle_from_ino(inode));
+
+	SetAtimeFlag(pi);
+}
+
+/* superblock operations; installed as sb->s_op by pvfs2_fill_sb() */
+struct super_operations pvfs2_s_ops = {
+	/* inode lifetime */
+	.alloc_inode	= pvfs2_alloc_inode,
+	.destroy_inode	= pvfs2_destroy_inode,
+	.drop_inode	= generic_delete_inode,
+	.dirty_inode	= pvfs2_dirty_inode,
+
+	/* file system wide operations */
+	.statfs		= pvfs2_statfs,
+	.remount_fs	= pvfs2_remount_fs,
+	.show_options	= generic_show_options,
+};
+
+/*
+ * export_operations.fh_to_dentry: rebuild an object reference from a
+ * file handle and look up a dentry for it.
+ *
+ * The first 16 bytes of fid->raw carry the khandle and raw[4] carries
+ * the fs_id, matching what pvfs2_encode_fh() writes.
+ *
+ * Returns NULL when fh_len is below 5 or fh_type is above 2, otherwise
+ * the result of d_obtain_alias() on the inode from pvfs2_iget().
+ */
+struct dentry *pvfs2_fh_to_dentry(struct super_block *sb,
+				  struct fid *fid,
+				  int fh_len,
+				  int fh_type)
+{
+	PVFS_object_kref refn;
+	struct inode *inode;
+
+	if (fh_len < 5)
+		return NULL;
+	if (fh_type > 2)
+		return NULL;
+
+	PVFS_khandle_from(&(refn.khandle), fid->raw, 16);
+	refn.fs_id = (u32) fid->raw[4];
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "fh_to_dentry: handle %pU, fs_id %d\n",
+		     &refn.khandle,
+		     refn.fs_id);
+
+	inode = pvfs2_iget(sb, &refn);
+	return d_obtain_alias(inode);
+}
+
+/*
+ * export_operations.encode_fh: write the object reference of inode
+ * (and of parent, when given) into the file handle buffer fh.
+ *
+ * Layout in 32-bit words: fh[0..3] khandle, fh[4] fs_id; with a parent,
+ * fh[5..8] parent khandle and fh[9] parent fs_id.
+ *
+ * *max_len is the buffer size in words on entry and the size needed
+ * (5, or 10 with a parent) on return.
+ *
+ * Returns 1 without a parent, 2 with a parent, or 255 when the buffer
+ * is too small.
+ */
+int pvfs2_encode_fh(struct inode *inode,
+		    __u32 *fh,
+		    int *max_len,
+		    struct inode *parent)
+{
+	PVFS_object_kref refn;
+	int needed = 5;
+	int type = 1;
+
+	if (parent)
+		needed = 10;
+
+	if (*max_len < needed) {
+		gossip_lerr("fh buffer is too small for encoding\n");
+		*max_len = needed;
+		return 255;
+	}
+
+	refn = PVFS2_I(inode)->refn;
+	PVFS_khandle_to(&refn.khandle, fh, 16);
+	fh[4] = refn.fs_id;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Encoding fh: handle %pU, fsid %u\n",
+		     &refn.khandle,
+		     refn.fs_id);
+
+	if (parent) {
+		refn = PVFS2_I(parent)->refn;
+		/* the parent reference starts at byte offset 20 */
+		PVFS_khandle_to(&refn.khandle, (char *) fh + 20, 16);
+		fh[9] = refn.fs_id;
+
+		type = 2;
+		gossip_debug(GOSSIP_SUPER_DEBUG,
+			     "Encoding parent: handle %pU, fsid %u\n",
+			     &refn.khandle,
+			     refn.fs_id);
+	}
+
+	*max_len = needed;
+	return type;
+}
+
+/* file handle operations; installed as sb->s_export_op by pvfs2_fill_sb() */
+static struct export_operations pvfs2_export_ops = {
+	.fh_to_dentry	= pvfs2_fh_to_dentry,
+	.encode_fh	= pvfs2_encode_fh,
+};
+
+/*
+ * fill_super callback handed to mount_nodev() by pvfs2_mount().
+ *
+ * data points at the struct pvfs2_mount_sb_info_t prepared by
+ * pvfs2_mount().  The private superblock info is allocated and filled
+ * from it, the mount options (if any) are parsed, the superblock
+ * fields are set up, and the root inode and root dentry are created.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * from parse_mount_options() / pvfs2_iget().  The private info is not
+ * freed here on failure.
+ */
+int pvfs2_fill_sb(struct super_block *sb, void *data, int silent)
+{
+	struct pvfs2_mount_sb_info_t *mount_sb_info = data;
+	struct pvfs2_sb_info_s *sbi;
+	PVFS_object_kref root_object;
+	struct inode *root;
+	struct dentry *root_dentry;
+	int ret;
+
+	/* alloc and init our private pvfs2 sb info */
+	sbi = kmalloc(sizeof(struct pvfs2_sb_info_s), PVFS2_GFP_FLAGS);
+	sb->s_fs_info = sbi;
+	if (!sbi)
+		return -ENOMEM;
+	memset(sbi, 0, sizeof(struct pvfs2_sb_info_s));
+
+	sbi->sb = sb;
+	sbi->root_khandle = mount_sb_info->root_khandle;
+	sbi->fs_id = mount_sb_info->fs_id;
+	sbi->id = mount_sb_info->id;
+
+	if (mount_sb_info->data) {
+		ret = parse_mount_options(sb, mount_sb_info->data, silent);
+		if (ret)
+			return ret;
+	}
+
+	/* hang the xattr handlers and the operation tables off the sb */
+	sb->s_xattr = pvfs2_xattr_handlers;
+	sb->s_magic = PVFS2_SUPER_MAGIC;
+	sb->s_op = &pvfs2_s_ops;
+	sb->s_d_op = &pvfs2_dentry_operations;
+
+	/* block size comes from the bufmap */
+	sb->s_blocksize = pvfs_bufmap_size_query();
+	sb->s_blocksize_bits = pvfs_bufmap_shift_query();
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+	root_object.khandle = sbi->root_khandle;
+	root_object.fs_id = sbi->fs_id;
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "get inode %pU, fsid %d\n",
+		     &root_object.khandle,
+		     root_object.fs_id);
+
+	root = pvfs2_iget(sb, &root_object);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Allocated root inode [%p] with mode %x\n",
+		     root,
+		     root->i_mode);
+
+	/* allocates and places root dentry in dcache */
+	root_dentry = d_make_root(root);
+	if (!root_dentry) {
+		iput(root);
+		return -ENOMEM;
+	}
+
+	sb->s_export_op = &pvfs2_export_ops;
+	sb->s_root = root_dentry;
+	return 0;
+}
+
+/*
+ * Mount entry point: ask the client (PVFS2_VFS_OP_FS_MOUNT upcall) to
+ * mount the file system named by devname, then build the superblock
+ * with mount_nodev()/pvfs2_fill_sb().
+ *
+ * fst:     file system type being mounted
+ * flags:   mount flags, passed through to mount_nodev()
+ * devname: config server address; must not be NULL
+ * data:    mount option string, handed to pvfs2_fill_sb()
+ *
+ * Returns the root dentry on success.  On failure returns an ERR_PTR
+ * carrying the error: -EINVAL for a missing devname or a null fs_id,
+ * -ENOMEM if no op could be allocated, otherwise the negative error
+ * from service_operation() or mount_nodev().
+ */
+struct dentry *pvfs2_mount(struct file_system_type *fst,
+			   int flags,
+			   const char *devname,
+			   void *data)
+{
+	int ret = -EINVAL;
+	struct super_block *sb;
+	struct pvfs2_kernel_op *new_op;
+	struct pvfs2_mount_sb_info_t mount_sb_info;
+	struct dentry *mnt_sb_d = ERR_PTR(-EINVAL);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_mount: called with devname %s\n",
+		     devname);
+
+	if (!devname) {
+		gossip_err("ERROR: device name not specified.\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	new_op = op_alloc(PVFS2_VFS_OP_FS_MOUNT);
+	if (!new_op)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * devname comes from the mount request and may be longer than
+	 * the buffer; strncpy() would then leave it unterminated, so
+	 * copy one byte less and terminate explicitly.
+	 */
+	strncpy(new_op->upcall.req.fs_mount.pvfs2_config_server,
+		devname,
+		PVFS_MAX_SERVER_ADDR_LEN - 1);
+	new_op->upcall.req.fs_mount.pvfs2_config_server[
+		PVFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Attempting PVFS2 Mount via host %s\n",
+		     new_op->upcall.req.fs_mount.pvfs2_config_server);
+
+	ret = service_operation(new_op, "pvfs2_mount", 0);
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_mount: mount got return value of %d\n", ret);
+	if (ret)
+		goto free_op;
+
+	if (new_op->downcall.resp.fs_mount.fs_id == PVFS_FS_ID_NULL) {
+		gossip_err("ERROR: Retrieved null fs_id\n");
+		ret = -EINVAL;
+		goto free_op;
+	}
+
+	/* fill in temporary structure passed to fill_sb method */
+	mount_sb_info.data = data;
+	mount_sb_info.root_khandle =
+		new_op->downcall.resp.fs_mount.root_khandle;
+	mount_sb_info.fs_id = new_op->downcall.resp.fs_mount.fs_id;
+	mount_sb_info.id = new_op->downcall.resp.fs_mount.id;
+
+	/*
+	 * the mount_sb_info structure looks odd, but it's used because
+	 * the private sb info isn't allocated until we call
+	 * pvfs2_fill_sb, yet we have the info we need to fill it with
+	 * here.  so we store it temporarily and pass all of the info
+	 * to fill_sb where it's properly copied out
+	 */
+	mnt_sb_d = mount_nodev(fst,
+			       flags,
+			       (void *)&mount_sb_info,
+			       pvfs2_fill_sb);
+	if (IS_ERR(mnt_sb_d)) {
+		/* record the real error so the message below is accurate */
+		ret = PTR_ERR(mnt_sb_d);
+		goto free_op;
+	}
+
+	sb = mnt_sb_d->d_sb;
+
+	/*
+	 * on successful mount, store the devname used; kept
+	 * NUL-terminated because pvfs2_remount() copies it again
+	 */
+	strncpy(PVFS2_SB(sb)->devname,
+		devname,
+		PVFS_MAX_SERVER_ADDR_LEN - 1);
+	PVFS2_SB(sb)->devname[PVFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	/* mount_pending must be cleared */
+	PVFS2_SB(sb)->mount_pending = 0;
+
+	/*
+	 * finally, add this sb to our list of known pvfs2
+	 * sb's
+	 */
+	add_pvfs2_sb(sb);
+	op_release(new_op);
+	return mnt_sb_d;
+
+free_op:
+	gossip_err("pvfs2_mount: mount request failed with %d\n", ret);
+	if (ret == -EINVAL) {
+		gossip_err("Ensure that all pvfs2-servers have the same FS configuration files\n");
+		gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n");
+	}
+
+	op_release(new_op);
+
+	/*
+	 * hand the actual failure back instead of a blanket -EINVAL;
+	 * anything that is not a negative errno keeps the default
+	 */
+	if (ret < 0)
+		mnt_sb_d = ERR_PTR(ret);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "pvfs2_mount: returning dentry %p\n",
+		     mnt_sb_d);
+	return mnt_sb_d;
+}
+
+/*
+ * Tear down a pvfs2 superblock, in this order: tell userspace to drop
+ * its dynamic mount info, take the sb off the list of pvfs2
+ * superblocks, run the generic anonymous-super cleanup, and finally
+ * free the private superblock data.
+ */
+void pvfs2_kill_sb(struct super_block *sb)
+{
+	gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_kill_sb: called\n");
+
+	/* userspace first: issue the unmount */
+	pvfs2_unmount_sb(sb);
+
+	/* then our own bookkeeping */
+	remove_pvfs2_sb(sb);
+
+	/* cleanup provided by the VFS */
+	kill_anon_super(sb);
+
+	/* last, the private data allocated in pvfs2_fill_sb() */
+	kfree(PVFS2_SB(sb));
+}
+
+/*
+ * Create the slab cache that backs struct pvfs2_inode_s objects.
+ *
+ * Returns 0 on success or -ENOMEM if the cache cannot be created.
+ */
+int pvfs2_inode_cache_initialize(void)
+{
+	struct kmem_cache *cache;
+
+	cache = kmem_cache_create("pvfs2_inode_cache",
+				  sizeof(struct pvfs2_inode_s),
+				  0,
+				  PVFS2_CACHE_CREATE_FLAGS,
+				  pvfs2_inode_cache_ctor);
+	pvfs2_inode_cache = cache;
+
+	if (cache)
+		return 0;
+
+	gossip_err("Cannot create pvfs2_inode_cache\n");
+	return -ENOMEM;
+}
+
+/* Destroy the pvfs2 inode slab cache; always returns 0. */
+int pvfs2_inode_cache_finalize(void)
+{
+	const int status = 0;
+
+	kmem_cache_destroy(pvfs2_inode_cache);
+	return status;
+}
new file mode 100644
@@ -0,0 +1,30 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * inode_operations.follow_link: hand the link target stored in the
+ * pvfs2 inode to the path walker through nd_set_link().
+ *
+ * Always returns NULL.
+ */
+static void *pvfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct pvfs2_inode_s *pi = PVFS2_I(dentry->d_inode);
+	char *link = pi->link_target;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2: %s called on %s (target is %p)\n",
+		     __func__,
+		     (char *)dentry->d_name.name,
+		     link);
+
+	nd_set_link(nd, link);
+	return NULL;
+}
+
+/* inode operations table for pvfs2 symbolic links */
+struct inode_operations pvfs2_symlink_inode_operations = {
+	/* link resolution */
+	.follow_link	= pvfs2_follow_link,
+	.readlink	= generic_readlink,
+
+	/* attributes */
+	.getattr	= pvfs2_getattr,
+	.setattr	= pvfs2_setattr,
+
+	/* extended attributes */
+	.setxattr	= generic_setxattr,
+	.listxattr	= pvfs2_listxattr,
+};