[V2,2/5] Orangefs: protocol between kernel and userspace

Message ID	1421787111-27162-3-git-send-email-root@logtruck.clemson.edu (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-fsdevel-owner@kernel.org> From: Mike Marshall <hubcap@omnibond.com> To: viro@zeniv.linux.org.uk Cc: Mike Marshall <hubcap@omnibond.com>, linux-fsdevel@vger.kernel.org Subject: [PATCH V2 2/5] Orangefs: protocol between kernel and userspace Date: Tue, 20 Jan 2015 15:51:48 -0500 Message-Id: <1421787111-27162-3-git-send-email-root@logtruck.clemson.edu> In-Reply-To: <1421787111-27162-1-git-send-email-root@logtruck.clemson.edu> References: <1421787111-27162-1-git-send-email-root@logtruck.clemson.edu> Sender: linux-fsdevel-owner@vger.kernel.org Precedence: bulk

diff --git a/fs/orangefs/pvfs2-bufmap.c b/fs/orangefs/pvfs2-bufmap.c new file mode 100644 index 0000000..9a756ee --- /dev/null +++ b/fs/orangefs/pvfs2-bufmap.c @@ -0,0 +1,970 @@ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ +#include "protocol.h" +#include "pvfs2-kernel.h" +#include "pvfs2-bufmap.h" + +DECLARE_WAIT_QUEUE_HEAD(pvfs2_bufmap_init_waitq); + +struct pvfs2_bufmap { + atomic_t refcnt; + + int desc_size; + int desc_shift; + int desc_count; + int total_size; + int page_count; + + struct page **page_array; + struct pvfs_bufmap_desc *desc_array; + + /* array to track usage of buffer descriptors */ + int *buffer_index_array; + spinlock_t buffer_index_lock; + + /* array to track usage of buffer descriptors for readdir */ + int readdir_index_array[PVFS2_READDIR_DEFAULT_DESC_COUNT]; + spinlock_t readdir_index_lock; +} *__pvfs2_bufmap; + +static DEFINE_SPINLOCK(pvfs2_bufmap_lock); + +static void +pvfs2_bufmap_unmap(struct pvfs2_bufmap *bufmap) +{ + int i; + + for (i = 0; i < bufmap->page_count; i++) + page_cache_release(bufmap->page_array[i]); +} + +static void +pvfs2_bufmap_free(struct pvfs2_bufmap *bufmap) +{ + kfree(bufmap->page_array); + kfree(bufmap->desc_array); + kfree(bufmap->buffer_index_array); + kfree(bufmap); +} + +struct pvfs2_bufmap *pvfs2_bufmap_ref(void) +{ + struct pvfs2_bufmap *bufmap = NULL; + + spin_lock(&pvfs2_bufmap_lock); + if (__pvfs2_bufmap) { + bufmap = __pvfs2_bufmap; + atomic_inc(&bufmap->refcnt); + } + spin_unlock(&pvfs2_bufmap_lock); + return bufmap; +} + +void pvfs2_bufmap_unref(struct pvfs2_bufmap *bufmap) +{ + if (atomic_dec_and_lock(&bufmap->refcnt, &pvfs2_bufmap_lock)) { + __pvfs2_bufmap = NULL; + spin_unlock(&pvfs2_bufmap_lock); + + pvfs2_bufmap_unmap(bufmap); + pvfs2_bufmap_free(bufmap); + } +} + +inline int pvfs_bufmap_size_query(void) +{ + struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref(); + int size = bufmap ? bufmap->desc_size : 0; + + pvfs2_bufmap_unref(bufmap); + return size; +} + +inline int pvfs_bufmap_shift_query(void) +{ + struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref(); + int shift = bufmap ? bufmap->desc_shift : 0; + + pvfs2_bufmap_unref(bufmap); + return shift; +} + +static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq); +static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq); + +/* + * get_bufmap_init + * + * If bufmap_init is 1, then the shared memory system, including the + * buffer_index_array, is available. Otherwise, it is not. + * + * returns the value of bufmap_init + */ +int get_bufmap_init(void) +{ + return __pvfs2_bufmap ? 1 : 0; +} + + +static struct pvfs2_bufmap * +pvfs2_bufmap_alloc(struct PVFS_dev_map_desc *user_desc) +{ + struct pvfs2_bufmap *bufmap; + + bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL); + if (!bufmap) + goto out; + + atomic_set(&bufmap->refcnt, 1); + bufmap->total_size = user_desc->total_size; + bufmap->desc_count = user_desc->count; + bufmap->desc_size = user_desc->size; + bufmap->desc_shift = ilog2(bufmap->desc_size); + + spin_lock_init(&bufmap->buffer_index_lock); + bufmap->buffer_index_array = + kcalloc(bufmap->desc_count, sizeof(int), GFP_KERNEL); + if (!bufmap->buffer_index_array) { + gossip_err("pvfs2: could not allocate %d buffer indices\n", + bufmap->desc_count); + goto out_free_bufmap; + } + spin_lock_init(&bufmap->readdir_index_lock); + + bufmap->desc_array = + kcalloc(bufmap->desc_count, sizeof(struct pvfs_bufmap_desc), + GFP_KERNEL); + if (!bufmap->desc_array) { + gossip_err("pvfs2: could not allocate %d descriptors\n", + bufmap->desc_count); + goto out_free_index_array; + } + + bufmap->page_count = bufmap->total_size / PAGE_SIZE; + + /* allocate storage to track our page mappings */ + bufmap->page_array = + kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL); + if (!bufmap->page_array) + goto out_free_desc_array; + + return bufmap; + +out_free_desc_array: + kfree(bufmap->desc_array); +out_free_index_array: + kfree(bufmap->buffer_index_array); +out_free_bufmap: + kfree(bufmap); +out: + return NULL; +} + +static int +pvfs2_bufmap_map(struct pvfs2_bufmap *bufmap, + struct PVFS_dev_map_desc *user_desc) +{ + int pages_per_desc = bufmap->desc_size / PAGE_SIZE; + int offset = 0, ret, i; + + /* map the pages */ + down_write(&current->mm->mmap_sem); + ret = get_user_pages(current, + current->mm, + (unsigned long)user_desc->ptr, + bufmap->page_count, + 1, + 0, + bufmap->page_array, + NULL); + up_write(&current->mm->mmap_sem); + + if (ret < 0) + return ret; + + if (ret != bufmap->page_count) { + gossip_err("pvfs2 error: asked for %d pages, only got %d.\n", + bufmap->page_count, ret); + + for (i = 0; i < ret; i++) { + SetPageError(bufmap->page_array[i]); + page_cache_release(bufmap->page_array[i]); + } + return -ENOMEM; + } + + /* + * ideally we want to get kernel space pointers for each page, but + * we can't kmap that many pages at once if highmem is being used. + * so instead, we just kmap/kunmap the page address each time the + * kaddr is needed. + */ + for (i = 0; i < bufmap->page_count; i++) + flush_dcache_page(bufmap->page_array[i]); + + /* build a list of available descriptors */ + for (offset = 0, i = 0; i < bufmap->desc_count; i++) { + bufmap->desc_array[i].page_array = &bufmap->page_array[offset]; + bufmap->desc_array[i].array_count = pages_per_desc; + bufmap->desc_array[i].uaddr = + (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE)); + offset += pages_per_desc; + } + + return 0; +} + +/* + * pvfs_bufmap_initialize() + * + * initializes the mapped buffer interface + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_initialize(struct PVFS_dev_map_desc *user_desc) +{ + struct pvfs2_bufmap *bufmap; + int ret = -EINVAL; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_initialize: called (ptr (" + "%p) sz (%d) cnt(%d).\n", + user_desc->ptr, + user_desc->size, + user_desc->count); + + /* + * sanity check alignment and size of buffer that caller wants to + * work with + */ + if (PAGE_ALIGN((unsigned long)user_desc->ptr) != + (unsigned long)user_desc->ptr) { + gossip_err("pvfs2 error: memory alignment (front). %p\n", + user_desc->ptr); + goto out; + } + + if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size)) + != (unsigned long)(user_desc->ptr + user_desc->total_size)) { + gossip_err("pvfs2 error: memory alignment (back).(%p + %d)\n", + user_desc->ptr, + user_desc->total_size); + goto out; + } + + if (user_desc->total_size != (user_desc->size * user_desc->count)) { + gossip_err("pvfs2 error: user provided an oddly sized buffer: (%d, %d, %d)\n", + user_desc->total_size, + user_desc->size, + user_desc->count); + goto out; + } + + if ((user_desc->size % PAGE_SIZE) != 0) { + gossip_err("pvfs2 error: bufmap size not page size divisible (%d).\n", + user_desc->size); + goto out; + } + + ret = -ENOMEM; + bufmap = pvfs2_bufmap_alloc(user_desc); + if (!bufmap) + goto out; + + ret = pvfs2_bufmap_map(bufmap, user_desc); + if (ret) + goto out_free_bufmap; + + + spin_lock(&pvfs2_bufmap_lock); + if (__pvfs2_bufmap) { + spin_unlock(&pvfs2_bufmap_lock); + gossip_err("pvfs2: error: bufmap already initialized.\n"); + ret = -EALREADY; + goto out_unmap_bufmap; + } + __pvfs2_bufmap = bufmap; + spin_unlock(&pvfs2_bufmap_lock); + + /* + * If there are operations in pvfs2_bufmap_init_waitq, wake them up. + * This scenario occurs when the client-core is restarted and I/O + * requests in the in-progress or waiting tables are restarted. I/O + * requests cannot be restarted until the shared memory system is + * completely re-initialized, so we put the I/O requests in this + * waitq until initialization has completed. NOTE: the I/O requests + * are also on a timer, so they don't wait forever just in case the + * client-core doesn't come back up. + */ + wake_up_interruptible(&pvfs2_bufmap_init_waitq); + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_initialize: exiting normally\n"); + return 0; + +out_unmap_bufmap: + pvfs2_bufmap_unmap(bufmap); +out_free_bufmap: + pvfs2_bufmap_free(bufmap); +out: + return ret; +} + +/* + * pvfs_bufmap_finalize() + * + * shuts down the mapped buffer interface and releases any resources + * associated with it + * + * no return value + */ +void pvfs_bufmap_finalize(void) +{ + gossip_debug(GOSSIP_BUFMAP_DEBUG, "pvfs2_bufmap_finalize: called\n"); + BUG_ON(!__pvfs2_bufmap); // XXX + pvfs2_bufmap_unref(__pvfs2_bufmap); + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs2_bufmap_finalize: exiting normally\n"); +} + +struct slot_args { + int slot_count; + int *slot_array; + spinlock_t *slot_lock; + wait_queue_head_t *slot_wq; +}; + +static int wait_for_a_slot(struct slot_args *slargs, int *buffer_index) +{ + int ret = -1; + int i = 0; + DECLARE_WAITQUEUE(my_wait, current); + + + add_wait_queue_exclusive(slargs->slot_wq, &my_wait); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + /* + * check for available desc, slot_lock is the appropriate + * index_lock + */ + spin_lock(slargs->slot_lock); + for (i = 0; i < slargs->slot_count; i++) + if (slargs->slot_array[i] == 0) { + slargs->slot_array[i] = 1; + *buffer_index = i; + ret = 0; + break; + } + spin_unlock(slargs->slot_lock); + + /* if we acquired a buffer, then break out of while */ + if (ret == 0) + break; + + if (!signal_pending(current)) { + int timeout = + MSECS_TO_JIFFIES(1000 * slot_timeout_secs); + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "[BUFMAP]: waiting %d " + "seconds for a slot\n", + slot_timeout_secs); + if (!schedule_timeout(timeout)) { + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "*** wait_for_a_slot timed out\n"); + ret = -ETIMEDOUT; + break; + } + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "[BUFMAP]: woken up by a slot becoming available.\n"); + continue; + } + + gossip_debug(GOSSIP_BUFMAP_DEBUG, "pvfs2: %s interrupted.\n", + __func__); + ret = -EINTR; + break; + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(slargs->slot_wq, &my_wait); + return ret; +} + +static void put_back_slot(struct slot_args *slargs, int buffer_index) +{ + /* slot_lock is the appropriate index_lock */ + spin_lock(slargs->slot_lock); + if (buffer_index < 0 || buffer_index >= slargs->slot_count) { + spin_unlock(slargs->slot_lock); + return; + } + + /* put the desc back on the queue */ + slargs->slot_array[buffer_index] = 0; + spin_unlock(slargs->slot_lock); + + /* wake up anyone who may be sleeping on the queue */ + wake_up_interruptible(slargs->slot_wq); +} + +/* + * pvfs_bufmap_get() + * + * gets a free mapped buffer descriptor, will sleep until one becomes + * available if necessary + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_get(struct pvfs2_bufmap **mapp, int *buffer_index) +{ + struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref(); + struct slot_args slargs; + int ret; + + if (!bufmap) { + gossip_err("pvfs2: please confirm that pvfs2-client daemon is running.\n"); + return -EIO; + } + + slargs.slot_count = bufmap->desc_count; + slargs.slot_array = bufmap->buffer_index_array; + slargs.slot_lock = &bufmap->buffer_index_lock; + slargs.slot_wq = &bufmap_waitq; + ret = wait_for_a_slot(&slargs, buffer_index); + if (ret) + pvfs2_bufmap_unref(bufmap); + *mapp = bufmap; + return ret; +} + +/* + * pvfs_bufmap_put() + * + * returns a mapped buffer descriptor to the collection + * + * no return value + */ +void pvfs_bufmap_put(struct pvfs2_bufmap *bufmap, int buffer_index) +{ + struct slot_args slargs; + + slargs.slot_count = bufmap->desc_count; + slargs.slot_array = bufmap->buffer_index_array; + slargs.slot_lock = &bufmap->buffer_index_lock; + slargs.slot_wq = &bufmap_waitq; + put_back_slot(&slargs, buffer_index); + pvfs2_bufmap_unref(bufmap); +} + +/* + * readdir_index_get() + * + * gets a free descriptor, will sleep until one becomes + * available if necessary. + * Although the readdir buffers are not mapped into kernel space + * we could do that at a later point of time. Regardless, these + * indices are used by the client-core. + * + * returns 0 on success, -errno on failure + */ +int readdir_index_get(struct pvfs2_bufmap **mapp, int *buffer_index) +{ + struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref(); + struct slot_args slargs; + int ret; + + if (!bufmap) { + gossip_err("pvfs2: please confirm that pvfs2-client daemon is running.\n"); + return -EIO; + } + + slargs.slot_count = PVFS2_READDIR_DEFAULT_DESC_COUNT; + slargs.slot_array = bufmap->readdir_index_array; + slargs.slot_lock = &bufmap->readdir_index_lock; + slargs.slot_wq = &readdir_waitq; + ret = wait_for_a_slot(&slargs, buffer_index); + if (ret) + pvfs2_bufmap_unref(bufmap); + *mapp = bufmap; + return ret; +} + +void readdir_index_put(struct pvfs2_bufmap *bufmap, int buffer_index) +{ + struct slot_args slargs; + + slargs.slot_count = PVFS2_READDIR_DEFAULT_DESC_COUNT; + slargs.slot_array = bufmap->readdir_index_array; + slargs.slot_lock = &bufmap->readdir_index_lock; + slargs.slot_wq = &readdir_waitq; + put_back_slot(&slargs, buffer_index); + pvfs2_bufmap_unref(bufmap); +} + +/* + * pvfs_bufmap_copy_iovec_from_user() + * + * copies data from several user space address's in an iovec + * to a mapped buffer + * + * Note that the mapped buffer is a series of pages and therefore + * the copies have to be split by PAGE_SIZE bytes at a time. + * Note that this routine checks that summation of iov_len + * across all the elements of iov is equal to size. + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_copy_iovec_from_user(struct pvfs2_bufmap *bufmap, + int buffer_index, + const struct iovec *iov, + unsigned long nr_segs, + size_t size) +{ + size_t ret = 0; + size_t amt_copied = 0; + size_t cur_copy_size = 0; + unsigned int to_page_offset = 0; + unsigned int to_page_index = 0; + void *to_kaddr = NULL; + void __user *from_addr = NULL; + struct iovec *copied_iovec = NULL; + struct pvfs_bufmap_desc *to; + unsigned int seg; + char *tmp_printer = NULL; + int tmp_int = 0; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_copy_iovec_from_user: index %d, " + "size %zd\n", + buffer_index, + size); + + to = &bufmap->desc_array[buffer_index]; + + /* + * copy the passed in iovec so that we can change some of its fields + */ + copied_iovec = kmalloc(nr_segs * sizeof(*copied_iovec), + PVFS2_BUFMAP_GFP_FLAGS); + if (copied_iovec == NULL) { + gossip_err("pvfs2_bufmap_copy_iovec_from_user: failed allocating memory\n"); + return -ENOMEM; + } + memcpy(copied_iovec, iov, nr_segs * sizeof(*copied_iovec)); + /* + * Go through each segment in the iovec and make sure that + * the summation of iov_len matches the given size. + */ + for (seg = 0, amt_copied = 0; seg < nr_segs; seg++) + amt_copied += copied_iovec[seg].iov_len; + if (amt_copied != size) { + gossip_err( + "pvfs2_bufmap_copy_iovec_from_user: computed total (" + "%zd) is not equal to (%zd)\n", + amt_copied, + size); + kfree(copied_iovec); + return -EINVAL; + } + + to_page_index = 0; + to_page_offset = 0; + amt_copied = 0; + seg = 0; + /* + * Go through each segment in the iovec and copy its + * buffer into the mapped buffer one page at a time though + */ + while (amt_copied < size) { + struct iovec *iv = &copied_iovec[seg]; + int inc_to_page_index; + + if (iv->iov_len < (PAGE_SIZE - to_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + from_addr = iv->iov_base; + inc_to_page_index = 0; + } else if (iv->iov_len == (PAGE_SIZE - to_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + from_addr = iv->iov_base; + inc_to_page_index = 1; + } else { + cur_copy_size = + PVFS_util_min(PAGE_SIZE - to_page_offset, + size - amt_copied); + from_addr = iv->iov_base; + iv->iov_base += cur_copy_size; + iv->iov_len -= cur_copy_size; + inc_to_page_index = 1; + } + to_kaddr = pvfs2_kmap(to->page_array[to_page_index]); + ret = + copy_from_user(to_kaddr + to_page_offset, + from_addr, + cur_copy_size); + if (!PageReserved(to->page_array[to_page_index])) + SetPageDirty(to->page_array[to_page_index]); + + if (!tmp_printer) { + tmp_printer = (char *)(to_kaddr + to_page_offset); + tmp_int += tmp_printer[0]; + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "First character (integer value) in pvfs_bufmap_copy_from_user: %d\n", + tmp_int); + } + + pvfs2_kunmap(to->page_array[to_page_index]); + if (ret) { + gossip_err("Failed to copy data from user space\n"); + kfree(copied_iovec); + return -EFAULT; + } + + amt_copied += cur_copy_size; + if (inc_to_page_index) { + to_page_offset = 0; + to_page_index++; + } else { + to_page_offset += cur_copy_size; + } + } + kfree(copied_iovec); + return 0; +} + +/* + * pvfs_bufmap_copy_iovec_from_kernel() + * + * copies data from several kernel space address's in an iovec + * to a mapped buffer + * + * Note that the mapped buffer is a series of pages and therefore + * the copies have to be split by PAGE_SIZE bytes at a time. + * Note that this routine checks that summation of iov_len + * across all the elements of iov is equal to size. + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_copy_iovec_from_kernel(struct pvfs2_bufmap *bufmap, + int buffer_index, const struct iovec *iov, + unsigned long nr_segs, size_t size) +{ + size_t amt_copied = 0; + size_t cur_copy_size = 0; + int to_page_index = 0; + void *to_kaddr = NULL; + void *from_kaddr = NULL; + struct iovec *copied_iovec = NULL; + struct pvfs_bufmap_desc *to; + unsigned int seg; + unsigned to_page_offset = 0; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_copy_iovec_from_kernel: index %d, " + "size %zd\n", + buffer_index, + size); + + to = &bufmap->desc_array[buffer_index]; + /* + * copy the passed in iovec so that we can change some of its fields + */ + copied_iovec = kmalloc(nr_segs * sizeof(*copied_iovec), + PVFS2_BUFMAP_GFP_FLAGS); + if (copied_iovec == NULL) { + gossip_err("pvfs2_bufmap_copy_iovec_from_kernel: failed allocating memory\n"); + return -ENOMEM; + } + memcpy(copied_iovec, iov, nr_segs * sizeof(*copied_iovec)); + /* + * Go through each segment in the iovec and make sure that + * the summation of iov_len matches the given size. + */ + for (seg = 0, amt_copied = 0; seg < nr_segs; seg++) + amt_copied += copied_iovec[seg].iov_len; + if (amt_copied != size) { + gossip_err("pvfs2_bufmap_copy_iovec_from_kernel: computed total(%zd) is not equal to (%zd)\n", + amt_copied, + size); + kfree(copied_iovec); + return -EINVAL; + } + + to_page_index = 0; + amt_copied = 0; + seg = 0; + to_page_offset = 0; + /* + * Go through each segment in the iovec and copy its + * buffer into the mapped buffer one page at a time though + */ + while (amt_copied < size) { + struct iovec *iv = &copied_iovec[seg]; + int inc_to_page_index; + + if (iv->iov_len < (PAGE_SIZE - to_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + from_kaddr = iv->iov_base; + inc_to_page_index = 0; + } else if (iv->iov_len == (PAGE_SIZE - to_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + from_kaddr = iv->iov_base; + inc_to_page_index = 1; + } else { + cur_copy_size = + PVFS_util_min(PAGE_SIZE - to_page_offset, + size - amt_copied); + from_kaddr = iv->iov_base; + iv->iov_base += cur_copy_size; + iv->iov_len -= cur_copy_size; + inc_to_page_index = 1; + } + to_kaddr = pvfs2_kmap(to->page_array[to_page_index]); + memcpy(to_kaddr + to_page_offset, from_kaddr, cur_copy_size); + if (!PageReserved(to->page_array[to_page_index])) + SetPageDirty(to->page_array[to_page_index]); + pvfs2_kunmap(to->page_array[to_page_index]); + amt_copied += cur_copy_size; + if (inc_to_page_index) { + to_page_offset = 0; + to_page_index++; + } else { + to_page_offset += cur_copy_size; + } + } + kfree(copied_iovec); + return 0; +} + +/* + * pvfs_bufmap_copy_to_user_iovec() + * + * copies data to several user space address's in an iovec + * from a mapped buffer + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_copy_to_user_iovec(struct pvfs2_bufmap *bufmap, + int buffer_index, const struct iovec *iov, + unsigned long nr_segs, size_t size) +{ + size_t ret = 0; + size_t amt_copied = 0; + size_t cur_copy_size = 0; + int from_page_index = 0; + void *from_kaddr = NULL; + void __user *to_addr = NULL; + struct iovec *copied_iovec = NULL; + struct pvfs_bufmap_desc *from; + unsigned int seg; + unsigned from_page_offset = 0; + char *tmp_printer = NULL; + int tmp_int = 0; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_copy_to_user_iovec: index %d, size %zd\n", + buffer_index, + size); + + from = &bufmap->desc_array[buffer_index]; + /* + * copy the passed in iovec so that we can change some of its fields + */ + copied_iovec = kmalloc(nr_segs * sizeof(*copied_iovec), + PVFS2_BUFMAP_GFP_FLAGS); + if (copied_iovec == NULL) { + gossip_err("pvfs2_bufmap_copy_to_user_iovec: failed allocating memory\n"); + return -ENOMEM; + } + memcpy(copied_iovec, iov, nr_segs * sizeof(*copied_iovec)); + /* + * Go through each segment in the iovec and make sure that + * the summation of iov_len is greater than the given size. + */ + for (seg = 0, amt_copied = 0; seg < nr_segs; seg++) + amt_copied += copied_iovec[seg].iov_len; + if (amt_copied < size) { + gossip_err("pvfs2_bufmap_copy_to_user_iovec: computed total (%zd) is less than (%zd)\n", + amt_copied, + size); + kfree(copied_iovec); + return -EINVAL; + } + + from_page_index = 0; + amt_copied = 0; + seg = 0; + from_page_offset = 0; + /* + * Go through each segment in the iovec and copy from the mapper buffer, + * but make sure that we do so one page at a time. + */ + while (amt_copied < size) { + struct iovec *iv = &copied_iovec[seg]; + int inc_from_page_index; + + if (iv->iov_len < (PAGE_SIZE - from_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + to_addr = iv->iov_base; + inc_from_page_index = 0; + } else if (iv->iov_len == (PAGE_SIZE - from_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + to_addr = iv->iov_base; + inc_from_page_index = 1; + } else { + cur_copy_size = + PVFS_util_min(PAGE_SIZE - from_page_offset, + size - amt_copied); + to_addr = iv->iov_base; + iv->iov_base += cur_copy_size; + iv->iov_len -= cur_copy_size; + inc_from_page_index = 1; + } + from_kaddr = pvfs2_kmap(from->page_array[from_page_index]); + if (!tmp_printer) { + tmp_printer = (char *)(from_kaddr + from_page_offset); + tmp_int += tmp_printer[0]; + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "First character (integer value) in pvfs_bufmap_copy_to_user_iovec: %d\n", + tmp_int); + } + ret = + copy_to_user(to_addr, + from_kaddr + from_page_offset, + cur_copy_size); + pvfs2_kunmap(from->page_array[from_page_index]); + if (ret) { + gossip_err("Failed to copy data to user space\n"); + kfree(copied_iovec); + return -EFAULT; + } + + amt_copied += cur_copy_size; + if (inc_from_page_index) { + from_page_offset = 0; + from_page_index++; + } else { + from_page_offset += cur_copy_size; + } + } + kfree(copied_iovec); + return 0; +} + +/* + * pvfs_bufmap_copy_to_kernel_iovec() + * + * copies data to several kernel space address's in an iovec + * from a mapped buffer + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_copy_to_kernel_iovec(struct pvfs2_bufmap *bufmap, + int buffer_index, const struct iovec *iov, + unsigned long nr_segs, size_t size) +{ + size_t amt_copied = 0; + size_t cur_copy_size = 0; + int from_page_index = 0; + void *from_kaddr = NULL; + void *to_kaddr = NULL; + struct iovec *copied_iovec = NULL; + struct pvfs_bufmap_desc *from; + unsigned int seg; + unsigned int from_page_offset = 0; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_copy_to_kernel_iovec: index %d, size %zd\n", + buffer_index, + size); + + from = &bufmap->desc_array[buffer_index]; + /* + * copy the passed in iovec so that we can change some of its fields + */ + copied_iovec = kmalloc(nr_segs * sizeof(*copied_iovec), + PVFS2_BUFMAP_GFP_FLAGS); + if (copied_iovec == NULL) { + gossip_err("pvfs2_bufmap_copy_to_kernel_iovec: failed allocating memory\n"); + return -ENOMEM; + } + memcpy(copied_iovec, iov, nr_segs * sizeof(*copied_iovec)); + /* + * Go through each segment in the iovec and make sure that + * the summation of iov_len is greater than the given size. + */ + for (seg = 0, amt_copied = 0; seg < nr_segs; seg++) + amt_copied += copied_iovec[seg].iov_len; + + if (amt_copied < size) { + gossip_err("pvfs2_bufmap_copy_to_kernel_iovec: computed total (%zd) is less than (%zd)\n", + amt_copied, + size); + kfree(copied_iovec); + return -EINVAL; + } + + from_page_index = 0; + amt_copied = 0; + seg = 0; + from_page_offset = 0; + /* + * Go through each segment in the iovec and copy from the mapper buffer, + * but make sure that we do so one page at a time. + */ + while (amt_copied < size) { + struct iovec *iv = &copied_iovec[seg]; + int inc_from_page_index; + + if (iv->iov_len < (PAGE_SIZE - from_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + to_kaddr = iv->iov_base; + inc_from_page_index = 0; + } else if (iv->iov_len == (PAGE_SIZE - from_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + to_kaddr = iv->iov_base; + inc_from_page_index = 1; + } else { + cur_copy_size = + PVFS_util_min(PAGE_SIZE - from_page_offset, + size - amt_copied); + to_kaddr = iv->iov_base; + iv->iov_base += cur_copy_size; + iv->iov_len -= cur_copy_size; + inc_from_page_index = 1; + } + from_kaddr = pvfs2_kmap(from->page_array[from_page_index]); + memcpy(to_kaddr, from_kaddr + from_page_offset, cur_copy_size); + pvfs2_kunmap(from->page_array[from_page_index]); + amt_copied += cur_copy_size; + if (inc_from_page_index) { + from_page_offset = 0; + from_page_index++; + } else { + from_page_offset += cur_copy_size; + } + } + kfree(copied_iovec); + return 0; +} diff --git a/fs/orangefs/pvfs2-cache.c b/fs/orangefs/pvfs2-cache.c new file mode 100644 index 0000000..e449ba5 --- /dev/null +++ b/fs/orangefs/pvfs2-cache.c @@ -0,0 +1,258 @@ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +#include "protocol.h" +#include "pvfs2-kernel.h" + +/* tags assigned to kernel upcall operations */ +static uint64_t next_tag_value; +static DEFINE_SPINLOCK(next_tag_value_lock); + +/* the pvfs2 memory caches */ + +/* a cache for pvfs2 upcall/downcall operations */ +static struct kmem_cache *op_cache; + +/* a cache for device (/dev/pvfs2-req) communication */ +static struct kmem_cache *dev_req_cache; + +/* a cache for pvfs2_kiocb objects (i.e pvfs2 iocb structures ) */ +static struct kmem_cache *pvfs2_kiocb_cache; + +int op_cache_initialize(void) +{ + op_cache = kmem_cache_create("pvfs2_op_cache", + sizeof(struct pvfs2_kernel_op), + 0, + PVFS2_CACHE_CREATE_FLAGS, + NULL); + + if (!op_cache) { + gossip_err("Cannot create pvfs2_op_cache\n"); + return -ENOMEM; + } + + /* initialize our atomic tag counter */ + spin_lock(&next_tag_value_lock); + next_tag_value = 100; + spin_unlock(&next_tag_value_lock); + return 0; +} + +int op_cache_finalize(void) +{ + kmem_cache_destroy(op_cache); + return 0; +} + +char *get_opname_string(struct pvfs2_kernel_op *new_op) +{ + if (new_op) { + int32_t type = new_op->upcall.type; + if (type == PVFS2_VFS_OP_FILE_IO) + return "OP_FILE_IO"; + else if (type == PVFS2_VFS_OP_LOOKUP) + return "OP_LOOKUP"; + else if (type == PVFS2_VFS_OP_CREATE) + return "OP_CREATE"; + else if (type == PVFS2_VFS_OP_GETATTR) + return "OP_GETATTR"; + else if (type == PVFS2_VFS_OP_REMOVE) + return "OP_REMOVE"; + else if (type == PVFS2_VFS_OP_MKDIR) + return "OP_MKDIR"; + else if (type == PVFS2_VFS_OP_READDIR) + return "OP_READDIR"; + else if (type == PVFS2_VFS_OP_READDIRPLUS) + return "OP_READDIRPLUS"; + else if (type == PVFS2_VFS_OP_SETATTR) + return "OP_SETATTR"; + else if (type == PVFS2_VFS_OP_SYMLINK) + return "OP_SYMLINK"; + else if (type == PVFS2_VFS_OP_RENAME) + return "OP_RENAME"; + else if (type == PVFS2_VFS_OP_STATFS) + return "OP_STATFS"; + else if (type == PVFS2_VFS_OP_TRUNCATE) + return "OP_TRUNCATE"; + else if (type == PVFS2_VFS_OP_MMAP_RA_FLUSH) + return "OP_MMAP_RA_FLUSH"; + else if (type == PVFS2_VFS_OP_FS_MOUNT) + return "OP_FS_MOUNT"; + else if (type == PVFS2_VFS_OP_FS_UMOUNT) + return "OP_FS_UMOUNT"; + else if (type == PVFS2_VFS_OP_GETXATTR) + return "OP_GETXATTR"; + else if (type == PVFS2_VFS_OP_SETXATTR) + return "OP_SETXATTR"; + else if (type == PVFS2_VFS_OP_LISTXATTR) + return "OP_LISTXATTR"; + else if (type == PVFS2_VFS_OP_REMOVEXATTR) + return "OP_REMOVEXATTR"; + else if (type == PVFS2_VFS_OP_PARAM) + return "OP_PARAM"; + else if (type == PVFS2_VFS_OP_PERF_COUNT) + return "OP_PERF_COUNT"; + else if (type == PVFS2_VFS_OP_CANCEL) + return "OP_CANCEL"; + else if (type == PVFS2_VFS_OP_FSYNC) + return "OP_FSYNC"; + else if (type == PVFS2_VFS_OP_FSKEY) + return "OP_FSKEY"; + else if (type == PVFS2_VFS_OP_FILE_IOX) + return "OP_FILE_IOX"; + } + return "OP_UNKNOWN?"; +} + +static struct pvfs2_kernel_op *op_alloc_common(int32_t op_linger, int32_t type) +{ + struct pvfs2_kernel_op *new_op = NULL; + + new_op = kmem_cache_alloc(op_cache, PVFS2_CACHE_ALLOC_FLAGS); + if (new_op) { + memset(new_op, 0, sizeof(struct pvfs2_kernel_op)); + + INIT_LIST_HEAD(&new_op->list); + spin_lock_init(&new_op->lock); + init_waitqueue_head(&new_op->waitq); + + init_waitqueue_head(&new_op->io_completion_waitq); + atomic_set(&new_op->aio_ref_count, 0); + + pvfs2_op_initialize(new_op); + + /* initialize the op specific tag and upcall credentials */ + spin_lock(&next_tag_value_lock); + new_op->tag = next_tag_value++; + if (next_tag_value == 0) + next_tag_value = 100; + spin_unlock(&next_tag_value_lock); + new_op->upcall.type = type; + new_op->attempts = 0; + gossip_debug(GOSSIP_CACHE_DEBUG, + "Alloced OP (%p: %llu %s)\n", + new_op, + llu(new_op->tag), + get_opname_string(new_op)); + + new_op->upcall.uid = from_kuid(current_user_ns(), current_fsuid()); + + new_op->upcall.gid = from_kgid(current_user_ns(), current_fsgid()); + + new_op->op_linger = new_op->op_linger_tmp = op_linger; + } else { + gossip_err("op_alloc: kmem_cache_alloc failed!\n"); + } + return new_op; +} + +struct pvfs2_kernel_op *op_alloc(int32_t type) +{ + return op_alloc_common(1, type); +} + +struct pvfs2_kernel_op *op_alloc_trailer(int32_t type) +{ + return op_alloc_common(2, type); +} + +void op_release(struct pvfs2_kernel_op *pvfs2_op) +{ + if (pvfs2_op) { + gossip_debug(GOSSIP_CACHE_DEBUG, + "Releasing OP (%p: %llu)\n", + pvfs2_op, + llu(pvfs2_op->tag)); + pvfs2_op_initialize(pvfs2_op); + kmem_cache_free(op_cache, pvfs2_op); + } else { + gossip_err("NULL pointer in op_release\n"); + } +} + +int dev_req_cache_initialize(void) +{ + dev_req_cache = kmem_cache_create("pvfs2_devreqcache", + MAX_ALIGNED_DEV_REQ_DOWNSIZE, + 0, + PVFS2_CACHE_CREATE_FLAGS, + NULL); + + if (!dev_req_cache) { + gossip_err("Cannot create pvfs2_dev_req_cache\n"); + return -ENOMEM; + } + return 0; +} + +int dev_req_cache_finalize(void) +{ + kmem_cache_destroy(dev_req_cache); + return 0; +} + +void *dev_req_alloc(void) +{ + void *buffer; + + buffer = kmem_cache_alloc(dev_req_cache, PVFS2_CACHE_ALLOC_FLAGS); + if (buffer == NULL) + gossip_err("Failed to allocate from dev_req_cache\n"); + else + memset(buffer, 0, sizeof(MAX_ALIGNED_DEV_REQ_DOWNSIZE)); + return buffer; +} + +void dev_req_release(void *buffer) +{ + if (buffer) + kmem_cache_free(dev_req_cache, buffer); + else + gossip_err("NULL pointer passed to dev_req_release\n"); + return; +} + +int kiocb_cache_initialize(void) +{ + pvfs2_kiocb_cache = kmem_cache_create("pvfs2_kiocbcache", + sizeof(struct pvfs2_kiocb_s), + 0, + PVFS2_CACHE_CREATE_FLAGS, + NULL); + + if (!pvfs2_kiocb_cache) { + gossip_err("Cannot create pvfs2_kiocb_cache!\n"); + return -ENOMEM; + } + return 0; +} + +int kiocb_cache_finalize(void) +{ + kmem_cache_destroy(pvfs2_kiocb_cache); + return 0; +} + +struct pvfs2_kiocb_s *kiocb_alloc(void) +{ + struct pvfs2_kiocb_s *x = NULL; + + x = kmem_cache_alloc(pvfs2_kiocb_cache, PVFS2_CACHE_ALLOC_FLAGS); + if (x == NULL) + gossip_err("kiocb_alloc: kmem_cache_alloc failed!\n"); + else + memset(x, 0, sizeof(struct pvfs2_kiocb_s)); + return x; +} + +void kiocb_release(struct pvfs2_kiocb_s *x) +{ + if (x) + kmem_cache_free(pvfs2_kiocb_cache, x); + else + gossip_err("kiocb_release: kmem_cache_free NULL pointer!\n"); +} diff --git a/fs/orangefs/pvfs2-mod.c b/fs/orangefs/pvfs2-mod.c new file mode 100644 index 0000000..b59bdf2 --- /dev/null +++ b/fs/orangefs/pvfs2-mod.c @@ -0,0 +1,346 @@ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * Changes by Acxiom Corporation to add proc file handler for pvfs2 client + * parameters, Copyright Acxiom Corporation, 2005. + * + * See COPYING in top-level directory. + */ + +#include "protocol.h" +#include "pvfs2-kernel.h" +#include "pvfs2-proc.h" + +/* PVFS2_VERSION is a ./configure define */ +#ifndef PVFS2_VERSION +#define PVFS2_VERSION "Unknown" +#endif + +#define DEBUG_HELP_STRING_SIZE 4096 + +/* + * global variables declared here + */ + +/* the size of the hash tables for ops in progress */ +int hash_table_size = 509; + +/* the insmod command only understands "unsigned long" and NOT + * "unsigned long long" as an input parameter. So, to accomodate + * both 32- and 64- bit machines, we will read the debug mask parameter + * as an unsigned long (4-bytes on a 32-bit machine and 8-bytes + * on a 64-bit machine) and then cast the "unsigned long" to an + * "unsigned long long" once we have the value in the kernel. In this + * way, the gossip_debug_mask can remain as a "uint64_t" and the kernel + * and client may continue to use the same gossip functions. + * NOTE: the kernel debug mask currently does not have more than 32 + * valid keywords, so only reading a 32-bit integer from the insmod + * command line is not a problem. However, the + * /proc/sys/pvfs2/kernel-debug functionality can accomodate up to + * 64 keywords, in the event that the kernel debug mask supports more + * than 32 keywords. + */ +uint32_t module_parm_debug_mask = 0; +uint64_t gossip_debug_mask = 0; +unsigned int kernel_mask_set_mod_init = false; +int op_timeout_secs = PVFS2_DEFAULT_OP_TIMEOUT_SECS; +int slot_timeout_secs = PVFS2_DEFAULT_SLOT_TIMEOUT_SECS; +uint32_t DEBUG_LINE = 50; +char debug_help_string[DEBUG_HELP_STRING_SIZE] = { 0 }; + +int fake_mmap_shared = 0; + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("PVFS2 Development Team"); +MODULE_DESCRIPTION("The Linux Kernel VFS interface to PVFS2"); +MODULE_PARM_DESC(debug, "debugging level (see pvfs2-debug.h for values)"); +MODULE_PARM_DESC(op_timeout_secs, "Operation timeout in seconds"); +MODULE_PARM_DESC(slot_timeout_secs, "Slot timeout in seconds"); +MODULE_PARM_DESC(hash_table_size, + "size of hash table for operations in progress"); +MODULE_PARM_DESC(fake_mmap_shared, + "perform mmap with MAP_SHARED flag as if called with MAP_PRIVATE"); + +static struct file_system_type pvfs2_fs_type = { + .name = "pvfs2", + .mount = pvfs2_mount, + .kill_sb = pvfs2_kill_sb, + .owner = THIS_MODULE, +}; + +module_param(hash_table_size, int, 0); +module_param(module_parm_debug_mask, uint, 0); +module_param(op_timeout_secs, int, 0); +module_param(slot_timeout_secs, int, 0); +module_param(fake_mmap_shared, int, 0); + +/* synchronizes the request device file */ +struct mutex devreq_mutex; + +/* + blocks non-priority requests from being queued for servicing. this + could be used for protecting the request list data structure, but + for now it's only being used to stall the op addition to the request + list +*/ +struct mutex request_mutex; + +/* hash table for storing operations waiting for matching downcall */ +struct list_head *htable_ops_in_progress = NULL; +DEFINE_SPINLOCK(htable_ops_in_progress_lock); + +/* list for queueing upcall operations */ +LIST_HEAD(pvfs2_request_list); + +/* used to protect the above pvfs2_request_list */ +DEFINE_SPINLOCK(pvfs2_request_list_lock); + +/* used for incoming request notification */ +DECLARE_WAIT_QUEUE_HEAD(pvfs2_request_list_waitq); + +static int __init pvfs2_init(void) +{ + int ret = -1; + uint32_t index = 0; + char client_title[] = "Client Debug Keywords:\n"; + char kernel_title[] = "Kernel Debug Keywords:\n"; + uint32_t i = 0; + + /* convert input debug mask to a 64-bit unsigned integer */ + gossip_debug_mask = (uint64_t) module_parm_debug_mask; + + /* + * set the kernel's gossip debug string; invalid mask values will + * be ignored. + */ + PVFS_proc_kmod_mask_to_eventlog(gossip_debug_mask, kernel_debug_string); + + /* remove any invalid values from the mask */ + gossip_debug_mask = + PVFS_proc_kmod_eventlog_to_mask(kernel_debug_string); + + /* + * if the mask has a non-zero value, then indicate that the mask + * was set when the kernel module was loaded. The pvfs2 dev ioctl + * command will look at this boolean to determine if the kernel's + * debug mask should be overwritten when the client-core is started. + */ + if (gossip_debug_mask != 0) + kernel_mask_set_mod_init = true; + + /* print information message to the system log */ + pr_info("pvfs2: pvfs2_init called with debug mask: \"%s\" (0x%08llx)\n", + kernel_debug_string, + gossip_debug_mask); + + /* + * load debug_help_string...this string is used during the + * /proc/sys/pvfs2/debug-help operation + */ + if (strlen(client_title) < DEBUG_LINE) { + memcpy(&debug_help_string[index], + client_title, + sizeof(client_title)); + index += strlen(client_title); + } + + for (i = 0; i < num_keyword_mask_map; i++) + if ((strlen(s_keyword_mask_map[i].keyword) + 2) < DEBUG_LINE) { + debug_help_string[index] = '\t'; + index++; + memcpy(&debug_help_string[index], + s_keyword_mask_map[i].keyword, + strlen(s_keyword_mask_map[i].keyword)); + index += strlen(s_keyword_mask_map[i].keyword); + debug_help_string[index] = '\n'; + index++; + } + + if ((strlen(kernel_title) + 1) < DEBUG_LINE) { + debug_help_string[index] = '\n'; + index++; + + memcpy(&debug_help_string[index], + kernel_title, + sizeof(kernel_title)); + index += strlen(kernel_title); + } + + for (i = 0; i < num_kmod_keyword_mask_map; i++) + if ((strlen(s_kmod_keyword_mask_map[i].keyword) + 2) < + DEBUG_LINE) { + debug_help_string[index] = '\t'; + index++; + memcpy(&debug_help_string[index], + s_kmod_keyword_mask_map[i].keyword, + strlen(s_kmod_keyword_mask_map[i].keyword)); + index += strlen(s_kmod_keyword_mask_map[i].keyword); + debug_help_string[index] = '\n'; + index++; + } + + ret = bdi_init(&pvfs2_backing_dev_info); + + if (ret) + return ret; + + if (op_timeout_secs < 0) + op_timeout_secs = 0; + + if (slot_timeout_secs < 0) + slot_timeout_secs = 0; + + /* initialize global book keeping data structures */ + ret = op_cache_initialize(); + if (ret < 0) + goto err; + + ret = dev_req_cache_initialize(); + if (ret < 0) + goto cleanup_op; + + ret = pvfs2_inode_cache_initialize(); + if (ret < 0) + goto cleanup_req; + + ret = kiocb_cache_initialize(); + if (ret < 0) + goto cleanup_inode; + + /* Initialize the pvfsdev subsystem. */ + ret = pvfs2_dev_init(); + if (ret < 0) { + gossip_err("pvfs2: could not initialize device subsystem %d!\n", + ret); + goto cleanup_kiocb; + } + + mutex_init(&devreq_mutex); + mutex_init(&request_mutex); + + htable_ops_in_progress = + kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL); + if (!htable_ops_in_progress) { + gossip_err("Failed to initialize op hashtable"); + ret = -ENOMEM; + goto cleanup_device; + } + + /* initialize a doubly linked at each hash table index */ + for (i = 0; i < hash_table_size; i++) + INIT_LIST_HEAD(&htable_ops_in_progress[i]); + + ret = fsid_key_table_initialize(); + if (ret < 0) + goto cleanup_progress_table; + + pvfs2_proc_initialize(); + ret = register_filesystem(&pvfs2_fs_type); + if (ret == 0) { + pr_info("pvfs2: module version %s loaded\n", PVFS2_VERSION); + return 0; + } + + pvfs2_proc_finalize(); + fsid_key_table_finalize(); + +cleanup_progress_table: + kfree(htable_ops_in_progress); + +cleanup_device: + pvfs2_dev_cleanup(); + +cleanup_kiocb: + kiocb_cache_finalize(); + +cleanup_inode: + pvfs2_inode_cache_finalize(); + +cleanup_req: + dev_req_cache_finalize(); + +cleanup_op: + op_cache_finalize(); + +err: + bdi_destroy(&pvfs2_backing_dev_info); + return ret; +} + +static void __exit pvfs2_exit(void) +{ + int i = 0; + struct pvfs2_kernel_op *cur_op = NULL; + + gossip_debug(GOSSIP_INIT_DEBUG, "pvfs2: pvfs2_exit called\n"); + + unregister_filesystem(&pvfs2_fs_type); + pvfs2_proc_finalize(); + fsid_key_table_finalize(); + pvfs2_dev_cleanup(); + /* clear out all pending upcall op requests */ + spin_lock(&pvfs2_request_list_lock); + while (!list_empty(&pvfs2_request_list)) { + cur_op = list_entry(pvfs2_request_list.next, + struct pvfs2_kernel_op, + list); + list_del(&cur_op->list); + gossip_debug(GOSSIP_INIT_DEBUG, + "Freeing unhandled upcall request type %d\n", + cur_op->upcall.type); + op_release(cur_op); + } + spin_unlock(&pvfs2_request_list_lock); + + for (i = 0; i < hash_table_size; i++) + while (!list_empty(&htable_ops_in_progress[i])) { + cur_op = list_entry(htable_ops_in_progress[i].next, + struct pvfs2_kernel_op, + list); + op_release(cur_op); + } + + kiocb_cache_finalize(); + pvfs2_inode_cache_finalize(); + dev_req_cache_finalize(); + op_cache_finalize(); + + kfree(htable_ops_in_progress); + + bdi_destroy(&pvfs2_backing_dev_info); + + pr_info("pvfs2: module version %s unloaded\n", PVFS2_VERSION); +} + +/* + * What we do in this function is to walk the list of operations + * that are in progress in the hash table and mark them as purged as well. + */ +void purge_inprogress_ops(void) +{ + int i; + + for (i = 0; i < hash_table_size; i++) { + struct pvfs2_kernel_op *op; + struct pvfs2_kernel_op *next; + + list_for_each_entry_safe(op, + next, + &htable_ops_in_progress[i], + list) { + spin_lock(&op->lock); + gossip_debug(GOSSIP_INIT_DEBUG, + "pvfs2-client-core: purging in-progress op tag " + "%llu %s\n", + llu(op->tag), + get_opname_string(op)); + set_op_state_purged(op); + spin_unlock(&op->lock); + wake_up_interruptible(&op->waitq); + } + } + return; +} + +module_init(pvfs2_init); +module_exit(pvfs2_exit); diff --git a/fs/orangefs/pvfs2-proc.c b/fs/orangefs/pvfs2-proc.c new file mode 100644 index 0000000..789f1606 --- /dev/null +++ b/fs/orangefs/pvfs2-proc.c @@ -0,0 +1,698 @@ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * Changes by Acxiom Corporation to add proc file handler for pvfs2 client + * parameters, Copyright Acxiom Corporation, 2005. + * + * See COPYING in top-level directory. + */ + +#include "protocol.h" +#include "pvfs2-kernel.h" + +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include "pvfs2-proc.h" + +/* PVFS2_VERSION is set in ./configure */ +#ifndef PVFS2_VERSION +#define PVFS2_VERSION "Unknown" +#endif + +/* + * CONFIG_SYSCTL is set in .config - most of this source file is inside + * this ifdef... + */ +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> + +#define KERNEL_DEBUG "kernel-debug" +#define CLIENT_DEBUG "client-debug" +#define DEBUG_HELP "debug-help" + +/* + * these strings will be initialized by invoking the PVFS_DEV_DEBUG ioctl + * command when the client-core is started. otherwise, these variables are + * only set via the proc sys calls. + */ +char client_debug_string[PVFS2_MAX_DEBUG_STRING_LEN] = "none"; +char kernel_debug_string[PVFS2_MAX_DEBUG_STRING_LEN] = "none"; +extern char debug_help_string[]; + +/* extra parameters provided to pvfs2 param proc handlers */ +struct pvfs2_param_extra { + int op; /* parameter type */ + int min; /* minimum value */ + int max; /* maximum value */ +}; + +/* + * pvfs2_proc_debug_mask_handler() + * proc file handler that will take a debug string and convert it + * into the proper debug value and then send a request to update the + * debug mask if client or update the local debug mask if kernel. +*/ +static int pvfs2_proc_debug_mask_handler(struct ctl_table *ctl, + int write, + void *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret = 0; + struct pvfs2_kernel_op *new_op = NULL; + + gossip_debug(GOSSIP_PROC_DEBUG, + "Executing pvfs2_proc_debug_mask_handler...\n"); + + /* use generic proc string handling function to retrieve/set string. */ + ret = proc_dostring(ctl, write, buffer, lenp, ppos); + if (ret != 0) + return ret; + + gossip_debug(GOSSIP_PROC_DEBUG, + "%s: debug string: %s\n", + "pvfs2_proc_debug_mask_handler", + (char *)ctl->data); + + /* + * For a user write, ctl->data will now contain the new debug + * string as given by the user. For a user read, the user's "buffer" + * will now contain the string stored in ctl->data. + */ + + /* + * For a write, we must convert the debug string into the proper + * debug mask. The conversion will ignore any invalid keywords sent + * in by the user, so we re-convert the debug mask back into the + * correct debug string. + */ + if (write && !strcmp(ctl->procname, KERNEL_DEBUG)) { + gossip_debug_mask = + PVFS_proc_kmod_eventlog_to_mask((const char *)ctl->data); + ret = PVFS_proc_kmod_mask_to_eventlog(gossip_debug_mask, + (char *)ctl->data); + + gossip_debug(GOSSIP_PROC_DEBUG, + "pvfs2_proc_debug_mask_handler: kernel debug mask:" + " %lu\n", + (unsigned long)gossip_debug_mask); + gossip_debug(GOSSIP_PROC_DEBUG, + "New kernel debug string is %s.\n", + kernel_debug_string); + pr_info("PVFS: kernel debug mask has been modified to \"%s\" (0x%08llx).\n", + kernel_debug_string, + llu(gossip_debug_mask)); + } else if (write && !strcmp(ctl->procname, CLIENT_DEBUG)) { + new_op = op_alloc(PVFS2_VFS_OP_PARAM); + if (!new_op) + return -ENOMEM; + strcpy(new_op->upcall.req.param.s_value, ctl->data); + new_op->upcall.req.param.type = PVFS2_PARAM_REQUEST_SET; + new_op->upcall.req.param.op = + PVFS2_PARAM_REQUEST_OP_CLIENT_DEBUG; + + ret = + service_operation(new_op, + "pvfs2_param", + PVFS2_OP_INTERRUPTIBLE); + + if (ret == 0) { + gossip_debug(GOSSIP_PROC_DEBUG, + "Downcall:\treturn status:%d" + "\treturn value:%x\n", + (int)new_op->downcall.status, + (int)new_op->downcall.resp.param.value); + + ret = + PVFS_proc_mask_to_eventlog(new_op->downcall.resp. + param.value, + client_debug_string); + gossip_debug(GOSSIP_PROC_DEBUG, + "New client debug string is %s\n", + client_debug_string); + } + op_release(new_op); + pr_info("PVFS: client debug mask has been modified to \"%s\" (0x%08llx).\n", + client_debug_string, + llu(new_op->downcall.resp.param.value)); + } else if (write && !strcmp(ctl->procname, DEBUG_HELP)) { + /*do nothing...the user can only READ the debug help */ + return 0; + } + + return 0; +} + +/* + * pvfs2_param_proc_handler() + * + * Generic proc file handler for getting and setting various tunable + * pvfs2-client parameters. + */ +static int pvfs2_param_proc_handler(struct ctl_table *ctl, + int write, + void *buffer, + size_t *lenp, + loff_t *ppos) +{ + struct pvfs2_kernel_op *new_op = NULL; + struct pvfs2_param_extra *extra = ctl->extra1; + int val = 0; + int ret = 0; + struct ctl_table tmp_ctl = *ctl; + + /* + * override fields in control structure for call to generic proc + * handler + */ + tmp_ctl.data = &val; + tmp_ctl.extra1 = &extra->min; + tmp_ctl.extra2 = &extra->max; + + /* build an op structure to send request to pvfs2-client */ + new_op = op_alloc(PVFS2_VFS_OP_PARAM); + if (!new_op) + return -ENOMEM; + + if (write) { + /* use generic proc handling function to retrive value to set */ + ret = proc_dointvec_minmax(&tmp_ctl, write, buffer, lenp, ppos); + if (ret != 0) { + op_release(new_op); + return ret; + } + gossip_debug(GOSSIP_PROC_DEBUG, "pvfs2: proc write %d\n", val); + new_op->upcall.req.param.value = val; + new_op->upcall.req.param.type = PVFS2_PARAM_REQUEST_SET; + } else { + /* get parameter from client, we will output afterwards */ + new_op->upcall.req.param.type = PVFS2_PARAM_REQUEST_GET; + } + + new_op->upcall.req.param.op = extra->op; + + /* perform operation (get or set) */ + ret = service_operation(new_op, "pvfs2_param", PVFS2_OP_INTERRUPTIBLE); + + if (ret == 0 && !write) { + /* use generic proc handling function to output value */ + val = (int)new_op->downcall.resp.param.value; + gossip_debug(GOSSIP_PROC_DEBUG, "pvfs2: proc read %d\n", val); + ret = proc_dointvec_minmax(&tmp_ctl, write, buffer, lenp, ppos); + } + + op_release(new_op); + return ret; +} + +static int pvfs2_pc_proc_handler(struct ctl_table *ctl, + int write, + void *buffer, + size_t *lenp, + loff_t *ppos) +{ + struct pvfs2_kernel_op *new_op = NULL; + int ret; + int pos = 0; + int to_copy = 0; + int *pc_type = ctl->extra1; + loff_t *offset = ppos; + + if (write) { + /* don't allow writes to this file */ + *lenp = 0; + return -EPERM; + } + + /* build an op structure to send request to pvfs2-client */ + new_op = op_alloc(PVFS2_VFS_OP_PERF_COUNT); + + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.perf_count.type = *pc_type; + + /* retrieve performance counters */ + ret = service_operation(new_op, + "pvfs2_perf_count", + PVFS2_OP_INTERRUPTIBLE); + + if (ret == 0) { + /* figure out how many bytes we will copy out */ + pos = strlen(new_op->downcall.resp.perf_count.buffer); + to_copy = pos - *offset; + + if (to_copy < 0) + to_copy = 0; + + if (to_copy > *lenp) + to_copy = *lenp; + + if (to_copy) { + /* copy correct portion of the string buffer */ + if (copy_to_user(buffer, + (new_op->downcall.resp.perf_count.buffer + (*offset)), + to_copy)) { + ret = -EFAULT; + } else { + /* update offsets etc. if successful */ + *lenp = to_copy; + *offset += to_copy; + ret = to_copy; + } + } else { + *lenp = 0; + ret = 0; + } + } + + op_release(new_op); + + return ret; +} + +static struct ctl_table_header *fs_table_header; + +static struct pvfs2_param_extra acache_timeout_extra = { + .op = PVFS2_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra acache_hard_extra = { + .op = PVFS2_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra acache_soft_extra = { + .op = PVFS2_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra acache_rec_extra = { + .op = PVFS2_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE, + .min = 0, + .max = 100, +}; + +static struct pvfs2_param_extra static_acache_timeout_extra = { + .op = PVFS2_PARAM_REQUEST_OP_STATIC_ACACHE_TIMEOUT_MSECS, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra static_acache_hard_extra = { + .op = PVFS2_PARAM_REQUEST_OP_STATIC_ACACHE_HARD_LIMIT, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra static_acache_soft_extra = { + .op = PVFS2_PARAM_REQUEST_OP_STATIC_ACACHE_SOFT_LIMIT, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra static_acache_rec_extra = { + .op = PVFS2_PARAM_REQUEST_OP_STATIC_ACACHE_RECLAIM_PERCENTAGE, + .min = 0, + .max = 100, +}; + +static struct pvfs2_param_extra ncache_timeout_extra = { + .op = PVFS2_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra ncache_hard_extra = { + .op = PVFS2_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra ncache_soft_extra = { + .op = PVFS2_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra ncache_rec_extra = { + .op = PVFS2_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE, + .min = 0, + .max = 100, +}; + +static struct pvfs2_param_extra perf_time_interval_extra = { + .op = PVFS2_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS, + .min = 0, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra perf_history_size_extra = { + .op = PVFS2_PARAM_REQUEST_OP_PERF_HISTORY_SIZE, + .min = 1, + .max = INT_MAX, +}; + +static struct pvfs2_param_extra perf_reset_extra = { + .op = PVFS2_PARAM_REQUEST_OP_PERF_RESET, + .min = 0, + .max = 1, +}; + +static int min_op_timeout_secs[] = { 0 }; +static int max_op_timeout_secs[] = { INT_MAX }; +static int min_slot_timeout_secs[] = { 0 }; +static int max_slot_timeout_secs[] = { INT_MAX }; + +#define UNNUMBERED_OR_VAL(x) x + +#define CTL_NAME(c_name) + +#define CTL_STRATEGY(strat) + +static struct ctl_table pvfs2_acache_table[] = { + /* controls acache timeout */ + { + CTL_NAME(1) + .procname = "timeout-msecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &acache_timeout_extra}, + /* controls acache hard limit */ + { + CTL_NAME(2) + .procname = "hard-limit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &acache_hard_extra}, + /* controls acache soft limit */ + { + CTL_NAME(3) + .procname = "soft-limit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &acache_soft_extra}, + /* controls acache reclaim percentage */ + { + CTL_NAME(4) + .procname = "reclaim-percentage", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &acache_rec_extra, + }, + {CTL_NAME(CTL_NONE)} +}; + +static struct ctl_table pvfs2_static_acache_table[] = { + /* controls static acache timeout */ + { + CTL_NAME(1) + .procname = "timeout-msecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &static_acache_timeout_extra}, + /* controls static acache hard limit */ + { + CTL_NAME(2) + .procname = "hard-limit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &static_acache_hard_extra}, + /* controls static acache soft limit */ + { + CTL_NAME(3) + .procname = "soft-limit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &static_acache_soft_extra}, + /* controls static acache reclaim percentage */ + { + CTL_NAME(4) + .procname = "reclaim-percentage", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &static_acache_rec_extra, + }, + {CTL_NAME(CTL_NONE)} +}; + +static struct ctl_table pvfs2_ncache_table[] = { + /* controls ncache timeout */ + { + CTL_NAME(1) + .procname = "timeout-msecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &ncache_timeout_extra}, + /* controls ncache hard limit */ + { + CTL_NAME(2) + .procname = "hard-limit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &ncache_hard_extra}, + /* controls ncache soft limit */ + { + CTL_NAME(3) + .procname = "soft-limit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &ncache_soft_extra}, + /* controls ncache reclaim percentage */ + { + CTL_NAME(4) + .procname = "reclaim-percentage", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &ncache_rec_extra}, + {CTL_NAME(CTL_NONE)} +}; + +static int acache_perf_count = PVFS2_PERF_COUNT_REQUEST_ACACHE; +static int static_acache_perf_count = PVFS2_PERF_COUNT_REQUEST_STATIC_ACACHE; +static int ncache_perf_count = PVFS2_PERF_COUNT_REQUEST_NCACHE; +static struct ctl_table pvfs2_pc_table[] = { + { + CTL_NAME(1) + .procname = "acache", + .maxlen = 4096, + .mode = 0444, + .proc_handler = pvfs2_pc_proc_handler, + .extra1 = &acache_perf_count, + }, + { + CTL_NAME(1) + .procname = "static-acache", + .maxlen = 4096, + .mode = 0444, + .proc_handler = pvfs2_pc_proc_handler, + .extra1 = &static_acache_perf_count, + }, + { + CTL_NAME(2) + .procname = "ncache", + .maxlen = 4096, + .mode = 0444, + .proc_handler = pvfs2_pc_proc_handler, + .extra1 = &ncache_perf_count}, + {CTL_NAME(CTL_NONE)} +}; + +struct pvfs2_stats g_pvfs2_stats; + +static struct ctl_table pvfs2_stats_table[] = { + /* shows number of hits in cache */ + { + CTL_NAME(1) + .procname = "hits", + .data = &g_pvfs2_stats.cache_hits, + .maxlen = sizeof(unsigned long), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + CTL_NAME(2) + .procname = "misses", + .data = &g_pvfs2_stats.cache_misses, + .maxlen = sizeof(unsigned long), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .procname = "reads", + .data = &g_pvfs2_stats.reads, + .maxlen = sizeof(unsigned long), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + CTL_NAME(4) + .procname = "writes", + .data = &g_pvfs2_stats.writes, + .maxlen = sizeof(unsigned long), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + {CTL_NAME(CTL_NONE)} +}; + +static struct ctl_table pvfs2_table[] = { + /* outputs the available debugging keywords */ + { + CTL_NAME(14) + .procname = DEBUG_HELP, + .data = &debug_help_string, + .maxlen = PVFS2_MAX_DEBUG_STRING_LEN, + .mode = 0444, + .proc_handler = &pvfs2_proc_debug_mask_handler}, + /* controls client-core debugging level */ + { + CTL_NAME(1) + .procname = CLIENT_DEBUG, + .data = &client_debug_string, + .maxlen = PVFS2_MAX_DEBUG_STRING_LEN, + .mode = 0644, + .proc_handler = &pvfs2_proc_debug_mask_handler}, + /* controls kernel debugging level using string input */ + { + CTL_NAME(2) + .procname = KERNEL_DEBUG, + .data = &kernel_debug_string, + .maxlen = PVFS2_MAX_DEBUG_STRING_LEN, + .mode = 0644, + .proc_handler = &pvfs2_proc_debug_mask_handler}, + /* operation timeout */ + { + CTL_NAME(3) + .procname = "op-timeout-secs", + .data = &op_timeout_secs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + CTL_STRATEGY(&sysctl_intvec) + .extra1 = &min_op_timeout_secs, + .extra2 = &max_op_timeout_secs}, + /* slot timeout */ + { + CTL_NAME(4) + .procname = "slot-timeout-secs", + .data = &slot_timeout_secs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + CTL_STRATEGY(&sysctl_intvec) + .extra1 = &min_slot_timeout_secs, + .extra2 = &max_slot_timeout_secs}, + /* time interval for client side performance counters */ + { + CTL_NAME(5) + .procname = "perf-time-interval-secs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &perf_time_interval_extra}, + /* time interval for client side performance counters */ + { + CTL_NAME(6) + .procname = "perf-history-size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &perf_history_size_extra}, + /* reset performance counters */ + { + CTL_NAME(7) + .procname = "perf-counter-reset", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pvfs2_param_proc_handler, + .extra1 = &perf_reset_extra, + }, + /* subdir for acache control */ + { + CTL_NAME(8) + .procname = "acache", + .maxlen = 0, + .mode = 0555, + .child = pvfs2_acache_table}, + /* subdir for static acache control */ + { + CTL_NAME(9) + .procname = "static-acache", + .maxlen = 0, + .mode = 0555, + .child = pvfs2_static_acache_table}, + { + CTL_NAME(10) + .procname = "perf-counters", + .maxlen = 0, + .mode = 0555, + .child = pvfs2_pc_table}, + /* subdir for ncache control */ + { + CTL_NAME(11) + .procname = "ncache", + .maxlen = 0, + .mode = 0555, + .child = pvfs2_ncache_table}, + /* + * statistics maintained by the kernel module (output only + * below this) + */ + { + CTL_NAME(12) + .procname = "stats", + .maxlen = 0, + .mode = 0555, + .child = pvfs2_stats_table}, + {CTL_NAME(CTL_NONE)} +}; + +static struct ctl_table fs_table[] = { + { + CTL_NAME(13) + .procname = "pvfs2", + .mode = 0555, + .child = pvfs2_table}, + {CTL_NAME(CTL_NONE)} +}; +#endif + +void pvfs2_proc_initialize(void) +{ +/* CONFIG_SYSCTL is set in .config */ +#ifdef CONFIG_SYSCTL + if (!fs_table_header) + fs_table_header = register_sysctl_table(fs_table); +#endif + + return; +} + +void pvfs2_proc_finalize(void) +{ +/* CONFIG_SYSCTL is set in .config */ +#ifdef CONFIG_SYSCTL + if (fs_table_header) { + unregister_sysctl_table(fs_table_header); + fs_table_header = NULL; + } +#endif + return; +} diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c new file mode 100644 index 0000000..569e8f0 --- /dev/null +++ b/fs/orangefs/waitqueue.c @@ -0,0 +1,522 @@ +/* + * (C) 2001 Clemson University and The University of Chicago + * (C) 2011 Omnibond Systems + * + * Changes by Acxiom Corporation to implement generic service_operation() + * function, Copyright Acxiom Corporation, 2005. + * + * See COPYING in top-level directory. + */ + +/* + * In-kernel waitqueue operations. + */ + +#include "protocol.h" +#include "pvfs2-kernel.h" +#include "pvfs2-bufmap.h" + +/* + * What we do in this function is to walk the list of operations that are + * present in the request queue and mark them as purged. + * NOTE: This is called from the device close after client-core has + * guaranteed that no new operations could appear on the list since the + * client-core is anyway going to exit. + */ +void purge_waiting_ops(void) +{ + struct pvfs2_kernel_op *op; + spin_lock(&pvfs2_request_list_lock); + list_for_each_entry(op, &pvfs2_request_list, list) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "pvfs2-client-core: purging op tag %llu %s\n", + llu(op->tag), + get_opname_string(op)); + spin_lock(&op->lock); + set_op_state_purged(op); + spin_unlock(&op->lock); + wake_up_interruptible(&op->waitq); + } + spin_unlock(&pvfs2_request_list_lock); + return; +} + +/* + * submits a PVFS2 operation and waits for it to complete + * + * Note op->downcall.status will contain the status of the operation (in + * errno format), whether provided by pvfs2-client or a result of failure to + * service the operation. If the caller wishes to distinguish, then + * op->state can be checked to see if it was serviced or not. + * + * Returns contents of op->downcall.status for convenience + */ +int service_operation(struct pvfs2_kernel_op *op, + const char *op_name, + int flags) +{ + /* flags to modify behavior */ + sigset_t orig_sigset; + int ret = 0; + + /* irqflags and wait_entry are only used IF the client-core aborts */ + unsigned long irqflags; + + DECLARE_WAITQUEUE(wait_entry, current); + + op->upcall.tgid = current->tgid; + op->upcall.pid = current->pid; + +retry_servicing: + op->downcall.status = 0; + gossip_debug(GOSSIP_WAIT_DEBUG, + "pvfs2: service_operation: %s %p\n", + op_name, + op); + gossip_debug(GOSSIP_WAIT_DEBUG, + "pvfs2: operation posted by process: %s, pid: %i\n", + current->comm, + current->pid); + + /* mask out signals if this operation is not to be interrupted */ + if (!(flags & PVFS2_OP_INTERRUPTIBLE)) + mask_blocked_signals(&orig_sigset); + + if (!(flags & PVFS2_OP_NO_SEMAPHORE)) { + ret = mutex_lock_interruptible(&request_mutex); + /* + * check to see if we were interrupted while waiting for + * semaphore + */ + if (ret < 0) { + if (!(flags & PVFS2_OP_INTERRUPTIBLE)) + unmask_blocked_signals(&orig_sigset); + op->downcall.status = ret; + gossip_debug(GOSSIP_WAIT_DEBUG, + "pvfs2: service_operation interrupted.\n"); + return ret; + } + } + + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:About to call is_daemon_in_service().\n", + __func__); + + if (is_daemon_in_service() < 0) { + /* + * By incrementing the per-operation attempt counter, we + * directly go into the timeout logic while waiting for + * the matching downcall to be read + */ + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:client core is NOT in service(%d).\n", + __func__, + is_daemon_in_service()); + op->attempts++; + } + + /* queue up the operation */ + if (flags & PVFS2_OP_PRIORITY) { + add_priority_op_to_request_list(op); + } else { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:About to call add_op_to_request_list().\n", + __func__); + add_op_to_request_list(op); + } + + if (!(flags & PVFS2_OP_NO_SEMAPHORE)) + mutex_unlock(&request_mutex); + + /* + * If we are asked to service an asynchronous operation from + * VFS perspective, we are done. + */ + if (flags & PVFS2_OP_ASYNC) + return 0; + + if (flags & PVFS2_OP_CANCELLATION) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:" + "About to call wait_for_cancellation_downcall.\n", + __func__); + ret = wait_for_cancellation_downcall(op); + } else { + ret = wait_for_matching_downcall(op); + } + + if (ret < 0) { + /* failed to get matching downcall */ + if (ret == -ETIMEDOUT) { + gossip_err("pvfs2: %s -- wait timed out; aborting attempt.\n", + op_name); + } + op->downcall.status = ret; + } else { + /* got matching downcall; make sure status is in errno format */ + op->downcall.status = + pvfs2_normalize_to_errno(op->downcall.status); + ret = op->downcall.status; + } + + if (!(flags & PVFS2_OP_INTERRUPTIBLE)) + unmask_blocked_signals(&orig_sigset); + + BUG_ON(ret != op->downcall.status); + /* retry if operation has not been serviced and if requested */ + if (!op_state_serviced(op) && op->downcall.status == -EAGAIN) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "pvfs2: tag %llu (%s)" + " -- operation to be retried (%d attempt)\n", + llu(op->tag), + op_name, + op->attempts + 1); + + if (!op->uses_shared_memory) + /* + * this operation doesn't use the shared memory + * system + */ + goto retry_servicing; + + /* op uses shared memory */ + if (get_bufmap_init() == 0) { + /* + * This operation uses the shared memory system AND + * the system is not yet ready. This situation occurs + * when the client-core is restarted AND there were + * operations waiting to be processed or were already + * in process. + */ + gossip_debug(GOSSIP_WAIT_DEBUG, + "uses_shared_memory is true.\n"); + gossip_debug(GOSSIP_WAIT_DEBUG, + "Client core in-service status(%d).\n", + is_daemon_in_service()); + gossip_debug(GOSSIP_WAIT_DEBUG, "bufmap_init:%d.\n", + get_bufmap_init()); + gossip_debug(GOSSIP_WAIT_DEBUG, + "operation's status is 0x%0x.\n", + op->op_state); + + /* + * let process sleep for a few seconds so shared + * memory system can be initialized. + */ + spin_lock_irqsave(&op->lock, irqflags); + add_wait_queue(&pvfs2_bufmap_init_waitq, &wait_entry); + spin_unlock_irqrestore(&op->lock, irqflags); + + set_current_state(TASK_INTERRUPTIBLE); + + /* + * Wait for pvfs_bufmap_initialize() to wake me up + * within the allotted time. + */ + ret = schedule_timeout(MSECS_TO_JIFFIES + (1000 * PVFS2_BUFMAP_WAIT_TIMEOUT_SECS)); + + gossip_debug(GOSSIP_WAIT_DEBUG, + "Value returned from schedule_timeout:" + "%d.\n", + ret); + gossip_debug(GOSSIP_WAIT_DEBUG, + "Is shared memory available? (%d).\n", + get_bufmap_init()); + + spin_lock_irqsave(&op->lock, irqflags); + remove_wait_queue(&pvfs2_bufmap_init_waitq, + &wait_entry); + spin_unlock_irqrestore(&op->lock, irqflags); + + if (get_bufmap_init() == 0) { + gossip_err("%s:The shared memory system has not started in %d seconds after the client core restarted. Aborting user's request(%s).\n", + __func__, + PVFS2_BUFMAP_WAIT_TIMEOUT_SECS, + get_opname_string(op)); + return -EIO; + } + + /* + * Return to the calling function and re-populate a + * shared memory buffer. + */ + return -EAGAIN; + } + } + + gossip_debug(GOSSIP_WAIT_DEBUG, + "pvfs2: service_operation %s returning: %d for %p.\n", + op_name, + ret, + op); + return ret; +} + +void pvfs2_clean_up_interrupted_operation(struct pvfs2_kernel_op *op) +{ + /* + * handle interrupted cases depending on what state we were in when + * the interruption is detected. there is a coarse grained lock + * across the operation. + * + * NOTE: be sure not to reverse lock ordering by locking an op lock + * while holding the request_list lock. Here, we first lock the op + * and then lock the appropriate list. + */ + if (!op) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s: op is null, ignoring\n", + __func__); + return; + } + + /* + * one more sanity check, make sure it's in one of the possible states + * or don't try to cancel it + */ + if (!(op_state_waiting(op) || + op_state_in_progress(op) || + op_state_serviced(op) || + op_state_purged(op))) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s: op %p not in a valid state (%0x), " + "ignoring\n", + __func__, + op, + op->op_state); + return; + } + + spin_lock(&op->lock); + + if (op_state_waiting(op)) { + /* + * upcall hasn't been read; remove op from upcall request + * list. + */ + spin_unlock(&op->lock); + remove_op_from_request_list(op); + gossip_debug(GOSSIP_WAIT_DEBUG, + "Interrupted: Removed op %p from request_list\n", + op); + } else if (op_state_in_progress(op)) { + /* op must be removed from the in progress htable */ + spin_unlock(&op->lock); + spin_lock(&htable_ops_in_progress_lock); + list_del(&op->list); + spin_unlock(&htable_ops_in_progress_lock); + gossip_debug(GOSSIP_WAIT_DEBUG, + "Interrupted: Removed op %p" + " from htable_ops_in_progress\n", + op); + } else if (!op_state_serviced(op)) { + spin_unlock(&op->lock); + gossip_err("interrupted operation is in a weird state 0x%x\n", + op->op_state); + } +} + +/* + * sleeps on waitqueue waiting for matching downcall. + * if client-core finishes servicing, then we are good to go. + * else if client-core exits, we get woken up here, and retry with a timeout + * + * Post when this call returns to the caller, the specified op will no + * longer be on any list or htable. + * + * Returns 0 on success and -errno on failure + * Errors are: + * EAGAIN in case we want the caller to requeue and try again.. + * EINTR/EIO/ETIMEDOUT indicating we are done trying to service this + * operation since client-core seems to be exiting too often + * or if we were interrupted. + */ +int wait_for_matching_downcall(struct pvfs2_kernel_op *op) +{ + int ret = -EINVAL; + DECLARE_WAITQUEUE(wait_entry, current); + + spin_lock(&op->lock); + add_wait_queue(&op->waitq, &wait_entry); + spin_unlock(&op->lock); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock(&op->lock); + if (op_state_serviced(op)) { + spin_unlock(&op->lock); + ret = 0; + break; + } + spin_unlock(&op->lock); + + if (!signal_pending(current)) { + /* + * if this was our first attempt and client-core + * has not purged our operation, we are happy to + * simply wait + */ + spin_lock(&op->lock); + if (op->attempts == 0 && !op_state_purged(op)) { + spin_unlock(&op->lock); + schedule(); + } else { + spin_unlock(&op->lock); + /* + * subsequent attempts, we retry exactly once + * with timeouts + */ + if (!schedule_timeout(MSECS_TO_JIFFIES + (1000 * op_timeout_secs))) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "*** %s:" + " operation timed out (tag" + " %llu, %p, att %d)\n", + __func__, + llu(op->tag), + op, + op->attempts); + ret = -ETIMEDOUT; + pvfs2_clean_up_interrupted_operation + (op); + break; + } + } + spin_lock(&op->lock); + op->attempts++; + /* + * if the operation was purged in the meantime, it + * is better to requeue it afresh but ensure that + * we have not been purged repeatedly. This could + * happen if client-core crashes when an op + * is being serviced, so we requeue the op, client + * core crashes again so we requeue the op, client + * core starts, and so on... + */ + if (op_state_purged(op)) { + ret = (op->attempts < PVFS2_PURGE_RETRY_COUNT) ? + -EAGAIN : + -EIO; + spin_unlock(&op->lock); + gossip_debug(GOSSIP_WAIT_DEBUG, + "*** %s:" + " operation purged (tag " + "%llu, %p, att %d)\n", + __func__, + llu(op->tag), + op, + op->attempts); + pvfs2_clean_up_interrupted_operation(op); + break; + } + spin_unlock(&op->lock); + continue; + } + + gossip_debug(GOSSIP_WAIT_DEBUG, + "*** %s:" + " operation interrupted by a signal (tag " + "%llu, op %p)\n", + __func__, + llu(op->tag), + op); + pvfs2_clean_up_interrupted_operation(op); + ret = -EINTR; + break; + } + + set_current_state(TASK_RUNNING); + + spin_lock(&op->lock); + remove_wait_queue(&op->waitq, &wait_entry); + spin_unlock(&op->lock); + + return ret; +} + +/* + * similar to wait_for_matching_downcall(), but used in the special case + * of I/O cancellations. + * + * Note we need a special wait function because if this is called we already + * know that a signal is pending in current and need to service the + * cancellation upcall anyway. the only way to exit this is to either + * timeout or have the cancellation be serviced properly. + */ +int wait_for_cancellation_downcall(struct pvfs2_kernel_op *op) +{ + int ret = -EINVAL; + DECLARE_WAITQUEUE(wait_entry, current); + + spin_lock(&op->lock); + add_wait_queue(&op->waitq, &wait_entry); + spin_unlock(&op->lock); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock(&op->lock); + if (op_state_serviced(op)) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:op-state is SERVICED.\n", + __func__); + spin_unlock(&op->lock); + ret = 0; + break; + } + spin_unlock(&op->lock); + + if (signal_pending(current)) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:operation interrupted by a signal (tag" + " %llu, op %p)\n", + __func__, + llu(op->tag), + op); + pvfs2_clean_up_interrupted_operation(op); + ret = -EINTR; + break; + } + + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:About to call schedule_timeout.\n", + __func__); + ret = + schedule_timeout(MSECS_TO_JIFFIES(1000 * op_timeout_secs)); + + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:Value returned from schedule_timeout(%d).\n", + __func__, + ret); + if (!ret) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:*** operation timed out: %p\n", + __func__, + op); + pvfs2_clean_up_interrupted_operation(op); + ret = -ETIMEDOUT; + break; + } + + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:Breaking out of loop, regardless of value returned by schedule_timeout.\n", + __func__); + ret = -ETIMEDOUT; + break; + } + + set_current_state(TASK_RUNNING); + + spin_lock(&op->lock); + remove_wait_queue(&op->waitq, &wait_entry); + spin_unlock(&op->lock); + + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:returning ret(%d)\n", + __func__, + ret); + + return ret; +}

[V2,2/5] Orangefs: protocol between kernel and userspace

Commit Message

Patch