Message ID | CAOg9mSS11uSvfBkNYPeNBw8xMebvHAM3vzh82BH=W273-7oNyg@mail.gmail.com (mailing list archive) |
---|---|
State | New, archived |
On Thu, Feb 18, 2016 at 01:58:52PM -0500, Mike Marshall wrote:

> wait_for_matching_downcall: operation purged (tag 10889, ffff880012898000, att 0
> service_operation: wait_for_matching_downcall returned -11 for ffff880012898000
> Interrupted: Removed op ffff880012898000 from htable_ops_in_progress		state is "in progress"
> tag 10889 (orangefs_create) -- operation to be retried (1 attempt)
> service_operation: orangefs_create op:ffff880012898000:			moved to "waiting"
> service_operation:client core is NOT in service, ffff880012898000
>
>
>
> service_operation: wait_for_matching_downcall returned 0 for ffff880012898000
> service_operation orangefs_create returning: 0 for ffff880012898000

... and we've got to "serviced" somehow.  IDGI...  Are you sure that it's
not a daemon replying with zero fsid?  Could you slap

	gossip_debug(GOSSIP_WAIT_DEBUG,
		     "%s: %s op:%p: process:%s state -> %d\n",
		     __func__, op_name, op, current->comm, op->op_state);

after assignments to ->op_state in set_op_state_purged() and
set_op_state_serviced() as well as after the calls of set_op_state_waiting()
(in service_operation() and orangefs_devreq_read()) and
set_op_state_inprogress() (in orangefs_devreq_read()).

Another thing: in orangefs_devreq_write_iter(), just before the
set_op_state_serviced() add

	WARN_ON(op->upcall.type == ORANGEFS_OP_VFS_CREATE &&
		!op->downcall.create.refn.fs_id);

to make sure that this crap isn't coming from the daemon.

While we are at it -

	#define op_is_cancel(op) ((op)->downcall.type == ORANGEFS_VFS_OP_CANCEL)

is checking the wrong thing; should be

	#define op_is_cancel(op) ((op)->upcall.type == ORANGEFS_VFS_OP_CANCEL)

Shouldn't be worse than a leak, though, so I doubt that it could be causing
this problem...
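[For reference, a minimal sketch of where the suggested check would sit: in orangefs_devreq_write_iter(), right before the op is marked serviced. The field spelling below follows the in-tree headers and differs slightly from the shorthand above:]

	/* sketch: catch a create downcall from the daemon that carries a
	 * zero fs_id, just before set_op_state_serviced(op) runs */
	WARN_ON(op->upcall.type == ORANGEFS_VFS_OP_CREATE &&
		!op->downcall.resp.create.refn.fs_id);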
On Thu, 18 Feb 2016, Mike Marshall wrote:

> Still busted, exactly the same, I think.  The doomed op gets a good
> return code from is_daemon_in_service in service_operation but
> gets EAGAIN from wait_for_matching_downcall... an edge case kind of
> problem.
>
> Here's the raw (well, slightly edited for readability) logs showing
> the doomed op and subsequent failed op that uses the bogus handle
> and fsid from the doomed op.
>
>
>
> Alloced OP (ffff880012898000: 10889 OP_CREATE)
> service_operation: orangefs_create op:ffff880012898000:
>
>
>
> wait_for_matching_downcall: operation purged (tag 10889, ffff880012898000, att 0
> service_operation: wait_for_matching_downcall returned -11 for ffff880012898000
> Interrupted: Removed op ffff880012898000 from htable_ops_in_progress
> tag 10889 (orangefs_create) -- operation to be retried (1 attempt)
> service_operation: orangefs_create op:ffff880012898000:
> service_operation:client core is NOT in service, ffff880012898000
>
>
>
> service_operation: wait_for_matching_downcall returned 0 for ffff880012898000
> service_operation orangefs_create returning: 0 for ffff880012898000
> orangefs_create: PPTOOLS1.PPA:
> handle:00000000-0000-0000-0000-000000000000: fsid:0:
> new_op:ffff880012898000: ret:0:
>
>
>
> Alloced OP (ffff880012888000: 10958 OP_GETATTR)
> service_operation: orangefs_inode_getattr op:ffff880012888000:
> service_operation: wait_for_matching_downcall returned 0 for ffff880012888000
> service_operation orangefs_inode_getattr returning: -22 for ffff880012888000
> Releasing OP (ffff880012888000: 10958
> orangefs_create: Failed to allocate inode for file :PPTOOLS1.PPA:
> Releasing OP (ffff880012898000: 10889
>
>
>
>
> What I'm testing with differs from what is at kernel.org#for-next by
>  - diffs from Al's most recent email
>  - 1 souped up gossip message
>  - changed 0 to OP_VFS_STATE_UNKNOWN one place in service_operation
>  - reinit_completion(&op->waitq) in orangefs_clean_up_interrupted_operation
>
>

Mike,

what error do you get from userspace (i.e. from dbench)?

open("./clients/client0/~dmtmp/EXCEL/5D7C0000", O_RDWR|O_CREAT, 0600) = -1 ENODEV (No such device)

An interesting note is that I can't reproduce at all
with only one dbench process. It seems there's not
enough load.

I don't see how the kernel could return ENODEV at all.
This may be coming from our client-core.

-- Martin
I haven't been trussing it... it reports EINVAL to stderr... I find
the ops to look at in the debug output by looking for the -22...

 (373) open ./clients/client8/~dmtmp/PARADOX/STUDENTS.DB failed for
handle 9981 (Invalid argument)

I just got the whacky code <g> from Al's last message to compile, I'll
have results from that soon...

-Mike

On Thu, Feb 18, 2016 at 2:49 PM, Martin Brandenburg <martin@omnibond.com> wrote:
> On Thu, 18 Feb 2016, Mike Marshall wrote:
>
>> Still busted, exactly the same, I think.  The doomed op gets a good
>> return code from is_daemon_in_service in service_operation but
>> gets EAGAIN from wait_for_matching_downcall... an edge case kind of
>> problem.
>>
>> [...]
>
> Mike,
>
> what error do you get from userspace (i.e. from dbench)?
>
> open("./clients/client0/~dmtmp/EXCEL/5D7C0000", O_RDWR|O_CREAT, 0600) = -1 ENODEV (No such device)
>
> An interesting note is that I can't reproduce at all
> with only one dbench process. It seems there's not
> enough load.
>
> I don't see how the kernel could return ENODEV at all.
> This may be coming from our client-core.
>
> -- Martin
I haven't edited up a list of how the debug output looked,
but most importantly: the WARN_ON is hit... it appears that
the client-core is sending over fsid:0:

-Mike

On Thu, Feb 18, 2016 at 3:08 PM, Mike Marshall <hubcap@omnibond.com> wrote:
> I haven't been trussing it... it reports EINVAL to stderr... I find
> the ops to look at in the debug output by looking for the -22...
>
> (373) open ./clients/client8/~dmtmp/PARADOX/STUDENTS.DB failed for
> handle 9981 (Invalid argument)
>
> I just got the whacky code <g> from Al's last message to compile, I'll
> have results from that soon...
>
> -Mike
>
> On Thu, Feb 18, 2016 at 2:49 PM, Martin Brandenburg <martin@omnibond.com> wrote:
>> On Thu, 18 Feb 2016, Mike Marshall wrote:
>>
>> [...]
Yeah, it looks like the fault is entirely with the client-core...

orangefs-kernel.h:	OP_VFS_STATE_UNKNOWN = 0,
orangefs-kernel.h:	OP_VFS_STATE_WAITING = 1,
orangefs-kernel.h:	OP_VFS_STATE_INPROGR = 2,
orangefs-kernel.h:	OP_VFS_STATE_SERVICED = 4,
orangefs-kernel.h:	OP_VFS_STATE_PURGED = 8,
orangefs-kernel.h:	OP_VFS_STATE_GIVEN_UP = 16,

Alloced OP (ffff880011078000: 20210 OP_CREATE)
service_operation: orangefs_create op:ffff880011078000:
service_op: orangefs_create op:ffff880011078000: process:dbench state -> 1
orangefs_devreq_read: op:ffff880011078000: process:pvfs2-client-co state -> 2
set_op_state_purged: op:ffff880011078000: process:pvfs2-client-co state -> 10
wait_for_matching_downcall: operation purged (tag 20210, ffff880011078000, att 0
service_operation: wait_for_matching_downcall returned -11 for ffff880011078000
Interrupted: Removed op ffff880011078000 from htable_ops_in_progress
tag 20210 (orangefs_create) -- operation to be retried (1 attempt)
service_operation: orangefs_create op:ffff880011078000: process:dbench: pid:1171
service_op: orangefs_create op:ffff880011078000: process:dbench state -> 1
service_operation:client core is NOT in service, ffff880011078000
orangefs_devreq_read: op:ffff880011078000: process:pvfs2-client-co state -> 2
WARNING: CPU: 0 PID: 1216 at fs/orangefs/devorangefs-req.c:423
set_op_state_serviced: op:ffff880011078000: process:pvfs2-client-co state -> 4
service_operation: wait_for_matching_downcall returned 0 for ffff880011078000
service_operation orangefs_create returning: 0 for ffff880011078000
orangefs_create: BENCHS.LWP:
handle:00000000-0000-0000-0000-000000000000: fsid:0:
new_op:ffff880011078000: ret:0:

-Mike

On Thu, Feb 18, 2016 at 3:22 PM, Mike Marshall <hubcap@omnibond.com> wrote:
> I haven't edited up a list of how the debug output looked,
> but most importantly: the WARN_ON is hit... it appears that
> the client-core is sending over fsid:0:
>
> -Mike
>
> On Thu, Feb 18, 2016 at 3:08 PM, Mike Marshall <hubcap@omnibond.com> wrote:
>> I haven't been trussing it... it reports EINVAL to stderr... I find
>> the ops to look at in the debug output by looking for the -22...
>>
>> [...]
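[The op_state values in the trace above are bit flags from the orangefs-kernel.h excerpt, so "state -> 10" reads as INPROGR | PURGED and "state -> 4" as SERVICED. A tiny standalone decoder, purely illustrative:]

	#include <stdio.h>

	/* values copied from the orangefs-kernel.h excerpt above */
	enum {
		OP_VFS_STATE_UNKNOWN  = 0,
		OP_VFS_STATE_WAITING  = 1,
		OP_VFS_STATE_INPROGR  = 2,
		OP_VFS_STATE_SERVICED = 4,
		OP_VFS_STATE_PURGED   = 8,
		OP_VFS_STATE_GIVEN_UP = 16,
	};

	static void decode_op_state(int state)
	{
		printf("%2d =%s%s%s%s%s\n", state,
		       state & OP_VFS_STATE_WAITING  ? " WAITING"  : "",
		       state & OP_VFS_STATE_INPROGR  ? " INPROGR"  : "",
		       state & OP_VFS_STATE_SERVICED ? " SERVICED" : "",
		       state & OP_VFS_STATE_PURGED   ? " PURGED"   : "",
		       state & OP_VFS_STATE_GIVEN_UP ? " GIVEN_UP" : "");
	}

	int main(void)
	{
		decode_op_state(10);	/* the purged op: INPROGR | PURGED */
		decode_op_state(4);	/* what the daemon's reply drove it to */
		return 0;
	}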
On Thu, Feb 18, 2016 at 03:22:33PM -0500, Mike Marshall wrote:

> I haven't edited up a list of how the debug output looked,
> but most importantly: the WARN_ON is hit... it appears that
> the client-core is sending over fsid:0:

OK, that's a bit of relief...  The next question, of course, is whether
it's a genuine reply or buggered attempt to copy it from userland and/or
something stomping on that memory.  It should've come from
package_downcall_members(), right?  And there you have this:

        if (*error_code == -PVFS_EEXIST)
        {
            PVFS_hint hints;
            PVFS_credential *credential;

            fill_hints(&hints, vfs_request);
            credential = lookup_credential(
                vfs_request->in_upcall.uid,
                vfs_request->in_upcall.gid);

            /* compat */
            refn1.handle = pvfs2_khandle_to_ino(
                &(vfs_request->in_upcall.req.create.parent_refn.khandle));
            refn1.fs_id = vfs_request->in_upcall.req.create.parent_refn.fs_id;
            refn1.__pad1 = vfs_request->in_upcall.req.create.parent_refn.__pad1;

//hubcap    vfs_request->out_downcall.resp.create.refn =
            refn2 = perform_lookup_on_create_error(

And AFAICS nothing in there sets resp.create.refn.  Is it actually set
earlier?
On Thu, Feb 18, 2016 at 03:38:26PM -0500, Mike Marshall wrote:

> WARNING: CPU: 0 PID: 1216 at fs/orangefs/devorangefs-req.c:423
> set_op_state_serviced: op:ffff880011078000: process:pvfs2-client-co state -> 4
> service_operation: wait_for_matching_downcall returned 0 for ffff880011078000
> service_operation orangefs_create returning: 0 for ffff880011078000
> orangefs_create: BENCHS.LWP:
> handle:00000000-0000-0000-0000-000000000000: fsid:0:
> new_op:ffff880011078000: ret:0:

Smells like retry hitting EEXIST and package_downcall_members() treatment of
that case doesn't set create.refn at all - used to, but that code is commented
out.
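[For illustration only: the shape of what the EEXIST branch would need to do, using the names from the snippet Al quoted; the khandle conversion helper and the elided arguments are hypothetical, not the actual client-core fix:]

	/* sketch: the lookup result has to make it back into the downcall,
	 * otherwise the kernel side of the retried create sees
	 * handle 0 / fsid 0 */
	refn2 = perform_lookup_on_create_error(/* ... as in the code above ... */);

	/* hypothetical helper: fold the 64-bit userspace handle back into
	 * the 128-bit khandle the downcall carries */
	pvfs2_ino_to_khandle(refn2.handle,
			     &vfs_request->out_downcall.resp.create.refn.khandle);
	vfs_request->out_downcall.resp.create.refn.fs_id = refn2.fs_id;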
As part of the attempt to go upstream, this "hubcap" guy you see
in the comments worked on a thing that changes 64bit userspace handles
back and forth into 128bit kernel handles... we did this because
one day, when we have orangefs3, we will be using 128bit uuid-derived
handles, and we believe it is our responsibility to not break the
upstream kernel module.

Anywho, I bet you are right Al, he messed up this part of it...
I'll look and see if that is really so, and get it fixed.

-Mike "hubcap"

On Thu, Feb 18, 2016 at 3:52 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Feb 18, 2016 at 03:38:26PM -0500, Mike Marshall wrote:
>> WARNING: CPU: 0 PID: 1216 at fs/orangefs/devorangefs-req.c:423
>> set_op_state_serviced: op:ffff880011078000: process:pvfs2-client-co state -> 4
>> service_operation: wait_for_matching_downcall returned 0 for ffff880011078000
>> service_operation orangefs_create returning: 0 for ffff880011078000
>> orangefs_create: BENCHS.LWP:
>> handle:00000000-0000-0000-0000-000000000000: fsid:0:
>> new_op:ffff880011078000: ret:0:
>
> Smells like retry hitting EEXIST and package_downcall_members() treatment of
> that case doesn't set create.refn at all - used to, but that code is commented
> out.
On Thu, Feb 18, 2016 at 04:50:11PM -0500, Mike Marshall wrote:

> As part of the attempt to go upstream, this "hubcap" guy you see
> in the comments worked on a thing that changes 64bit userspace handles
> back and forth into 128bit kernel handles... we did this because
> one day, when we have orangefs3, we will be using 128bit uuid-derived
> handles, and we believe it is our responsibility to not break the
> upstream kernel module.
>
> Anywho, I bet you are right Al, he messed up this part of it...
> I'll look and see if that is really so, and get it fixed.
>
> -Mike "hubcap"

OK...  I'll fold the trivial braino fix (op_is_cancel() checking the wrong
thing) into "orangefs: delay freeing slot until cancel completes" where it
had been introduced, but the rest of it is probably too far and will have
to be a couple of commits on top of that queue.  Had it been just my tree,
I probably would still reorder and fold, but I know that my habits in that
respect are rather extreme.

FWIW, the scenario spotted by Martin wouldn't cause any real problems, but
only because by the time we ended copying to/from daemon service_operation()
couldn't have reached resubmit - it only happens if there had been a purge
and that can't happen while somebody is inside a control device method.

So the original code had been correct, but it was more brittle than
I'd like *and* making sure that nobody else sees an op by the time
orangefs_clean_interrupted_operation() returns is a good thing.

New logics gives that, and avoids the need to play with refcounts on ops.

I've pushed that into #orangefs-untested; if that works, please switch your
for-next to it.
Yay!  The problem is fixed.

Boo!  Now a new problem is uncovered, I don't have a handle on it yet.
Now it is possible to create a broken file on the orangefs server
across a restart of the client-core.

dbench:
  (808) open ./clients/client0/~dmtmp/PWRPNT/PPTC112.TMP failed for handle 10042 (No such file or directory)

ls -l /pvfsmnt/clients/client0/~dmtmp/PWRPNT
ls: cannot access /pvfsmnt/clients/client0/~dmtmp/PWRPNT/PPTC112.TMP: No such file or directory
total 1364
-rw-------. 1 root root  85026 Feb 19 14:53 NEWPCB.PPT
-rw-------. 1 root root 260096 Feb 19 14:52 PCBENCHM.PPT
??????????? ? ?    ?         ?            ? PPTC112.TMP
-rw-------. 1 root root 260096 Feb 19 14:51 PPTOOLS1.PPA
-rw-------. 1 root root 260096 Feb 19 14:51 TIPS.PPT
-rw-------. 1 root root 260096 Feb 19 14:51 TRIDOTS.POT
-rw-------. 1 root root 260096 Feb 19 14:51 ZD16.BMP

The filename comes back from the server in the readdir buffer.

I can reproduce this, so I'll have to work the problem some more
to find more information. First place I'll look is the khandle code <g>...

Anywho... The fixed version of the client-core for the other problem
is in this SVN repository:

  http://www.orangefs.org/svn/orangefs/branches/trunk.kernel.update/

As far as orangefs for-next is concerned... I don't see how to update it
without destroying the top few commit messages in the commit history.
I plan to update the kernel.org orangefs for-next tree to look exactly
like the "current" branch of my github tree, unless someone says not to:

  github.com/hubcapsc/linux/tree/current    Latest commit c1223ca

-Mike

On Thu, Feb 18, 2016 at 7:25 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Feb 18, 2016 at 04:50:11PM -0500, Mike Marshall wrote:
> [...]
>
> I've pushed that into #orangefs-untested; if that works, please switch your
> for-next to it.
On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:

> I plan to update the kernel.org orangefs for-next tree to look exactly
> like the "current" branch of my github tree, unless someone says
> not to:
>
> github.com/hubcapsc/linux/tree/current    Latest commit c1223ca

$ git checkout current
$ git fetch git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git orangefs-untested
$ git diff FETCH_HEAD		# should report no differences
$ git reset --hard FETCH_HEAD
$ git push --force

then push the same branch into your kernel.org (as for-next, again with -force).
On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:

> Boo! Now a new problem is uncovered, I don't have a handle on it yet.
> Now it is possible to create a broken file on the orangefs server
> across a restart of the client-core.

I suspect that it's your "getattr after create and leave dentry negative if
that getattr fails".  Might make sense to d_drop() the sucker in case of
such late failure - or mark it so that subsequent d_revalidate() would
*not* skip getattr, despite NULL ->d_inode.

Incidentally, why does your ->d_revalidate() bother with d_drop()?  Just
have it return 0 and let the caller DTRT...
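[The pattern being suggested, as a minimal sketch rather than the actual orangefs code; the revalidation helper and its return convention are assumed:]

	static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
	{
		if (flags & LOOKUP_RCU)
			return -ECHILD;

		/* ask the server; helper name assumed for the sketch */
		if (orangefs_revalidate_lookup(dentry) < 0)
			return 0;	/* let fs/namei.c invalidate the dentry */

		return 1;
	}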
On Fri, 19 Feb 2016, Al Viro wrote:

> On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:
>
> > Boo! Now a new problem is uncovered, I don't have a handle on it yet.
> > Now it is possible to create a broken file on the orangefs server
> > across a restart of the client-core.
>
> I suspect that it's your "getattr after create and leave dentry negative if
> that getattr fails".  Might make sense to d_drop() the sucker in case of
> such late failure - or mark it so that subsequent d_revalidate() would
> *not* skip getattr, despite NULL ->d_inode.
>
> Incidentally, why does your ->d_revalidate() bother with d_drop()?  Just
> have it return 0 and let the caller DTRT...

Because I recently worked on it and didn't know that was desirable.  *oops*
I see what fs/namei.c does now.

I suppose you see the request I just sent out for review of that code.

-- Martin
On Fri, 19 Feb 2016, Al Viro wrote:

> On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:
>
> > Boo! Now a new problem is uncovered, I don't have a handle on it yet.
> > Now it is possible to create a broken file on the orangefs server
> > across a restart of the client-core.
>
> I suspect that it's your "getattr after create and leave dentry negative if
> that getattr fails".  Might make sense to d_drop() the sucker in case of
> such late failure - or mark it so that subsequent d_revalidate() would
> *not* skip getattr, despite NULL ->d_inode.
>
> Incidentally, why does your ->d_revalidate() bother with d_drop()?  Just
> have it return 0 and let the caller DTRT...

However I'm not so sure the kernel is at fault here.  We see with a
userspace tool which just opens a socket to the server that orangefs-readdir
lists the file and orangefs-stat says ENOENT.

Looks like the server's database is corrupt.

-- Martin
Hi Al...

There's something I'll be thinking about all weekend (while my friend
Stanley the grader helps me distribute 40 tons of gravel)...

Your orangefs-untested branch has 5625087 commits.  My "current" branch
has 5625087 commits.  In each all of the commit signatures match, except
for the most recent 15 commits.  The last 15 commits in my "current"
branch were made from your orangefs-untested branch with "git format-patch"
and applied to my "current" branch with "git am -s".  "git log -p" shows that
my most recent 15 commits differ from your most recent 15 commits by
the addition of my "sign off" line.

I will absolutely update my kernel.org for-next branch with the procedure you
outlined, because you said so.

I wish I understood it better, though... I can only guess at this point that
the procedure you outlined will do some desirable thing to git metadata...?

-Mike

On Fri, Feb 19, 2016 at 5:22 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:
>
>> I plan to update the kernel.org orangefs for-next tree to look exactly
>> like the "current" branch of my github tree, unless someone says
>> not to:
>>
>> github.com/hubcapsc/linux/tree/current    Latest commit c1223ca
>
> $ git checkout current
> $ git fetch git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git orangefs-untested
> $ git diff FETCH_HEAD		# should report no differences
> $ git reset --hard FETCH_HEAD
> $ git push --force
>
> then push the same branch into your kernel.org (as for-next, again with -force).
On Sat, Feb 20, 2016 at 07:14:26AM -0500, Mike Marshall wrote:

> Your orangefs-untested branch has 5625087 commits.  My "current" branch
> has 5625087 commits.  In each all of the commit signatures match, except
> for the most recent 15 commits.  The last 15 commits in my "current"
> branch were made from your orangefs-untested branch with "git format-patch"
> and applied to my "current" branch with "git am -s".  "git log -p" shows that
> my most recent 15 commits differ from your most recent 15 commits by
> the addition of my "sign off" line.

*blinks*
*checks*

OK, ignore what I asked, then.  Looks like I'd screwed up checking last time.

> I will absolutely update my kernel.org for-next branch with the procedure you
> outlined, because you said so.
>
> I wish I understood it better, though... I can only guess at this point that
> the procedure you outlined will do some desirable thing to git metadata...?

None whatsoever, ignore it.
> Looks like I'd screwed up checking last time.

Probably not that <g>... my branch did diverge over the course of the
few days that we were thrashing around in the kernel trying to fix what
I had broken two years ago in userspace.  I can relate to why you were
motivated to remove the thrashing around from the git history, but your
git-foo is much stronger than mine.  I wanted to try and get my branch
back into line using a methodology that I understand to keep from ending
up like this fellow:

  http://myweb.clemson.edu/~hubcap/harris.jpg

I'm glad it worked out... my kernel.org for-next branch is updated now.

so, I'll keep working the problem, using your d_drop idea first off...
I'll be back with more information, and hopefully even have it fixed,
soon...

-Mike

On Sat, Feb 20, 2016 at 8:36 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Sat, Feb 20, 2016 at 07:14:26AM -0500, Mike Marshall wrote:
>
>> Your orangefs-untested branch has 5625087 commits.  My "current" branch
>> has 5625087 commits.  In each all of the commit signatures match, except
>> for the most recent 15 commits.  The last 15 commits in my "current"
>> branch were made from your orangefs-untested branch with "git format-patch"
>> and applied to my "current" branch with "git am -s".  "git log -p" shows that
>> my most recent 15 commits differ from your most recent 15 commits by
>> the addition of my "sign off" line.
>
> *blinks*
> *checks*
>
> OK, ignore what I asked, then.  Looks like I'd screwed up checking last time.
>
>> I will absolutely update my kernel.org for-next branch with the procedure you
>> outlined, because you said so.
>>
>> I wish I understood it better, though... I can only guess at this point that
>> the procedure you outlined will do some desirable thing to git metadata...?
>
> None whatsoever, ignore it.
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index b27ed1c..f7914f5 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -58,9 +58,9 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
 				 next,
 				 &htable_ops_in_progress[index],
 				 list) {
-		if (op->tag == tag && !op_state_purged(op)) {
+		if (op->tag == tag && !op_state_purged(op) &&
+		    !op_state_given_up(op)) {
 			list_del_init(&op->list);
-			get_op(op); /* increase ref count. */
 			spin_unlock(&htable_ops_in_progress_lock);
 			return op;
 		}
@@ -133,7 +133,7 @@ restart:
 		__s32 fsid;
 		/* This lock is held past the end of the loop when we break. */
 		spin_lock(&op->lock);
-		if (unlikely(op_state_purged(op))) {
+		if (unlikely(op_state_purged(op) || op_state_given_up(op))) {
 			spin_unlock(&op->lock);
 			continue;
 		}
@@ -199,13 +199,12 @@ restart:
 	 */
 	if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
 		gossip_err("orangefs: ERROR: Current op already queued.\n");
-		list_del(&cur_op->list);
+		list_del_init(&cur_op->list);
 		spin_unlock(&cur_op->lock);
 		spin_unlock(&orangefs_request_list_lock);
 		return -EAGAIN;
 	}
 
 	list_del_init(&cur_op->list);
-	get_op(op);
 	spin_unlock(&orangefs_request_list_lock);
 	spin_unlock(&cur_op->lock);
@@ -230,7 +229,7 @@ restart:
 	if (unlikely(op_state_given_up(cur_op))) {
 		spin_unlock(&cur_op->lock);
 		spin_unlock(&htable_ops_in_progress_lock);
-		op_release(cur_op);
+		complete(&cur_op->waitq);
 		goto restart;
 	}
 
@@ -242,7 +241,6 @@ restart:
 	orangefs_devreq_add_op(cur_op);
 	spin_unlock(&cur_op->lock);
 	spin_unlock(&htable_ops_in_progress_lock);
-	op_release(cur_op);
 
 	/* The client only asks to read one size buffer. */
 	return MAX_DEV_REQ_UPSIZE;
@@ -258,10 +256,12 @@ error:
 	if (likely(!op_state_given_up(cur_op))) {
 		set_op_state_waiting(cur_op);
 		list_add(&cur_op->list, &orangefs_request_list);
+		spin_unlock(&cur_op->lock);
+	} else {
+		spin_unlock(&cur_op->lock);
+		complete(&cur_op->waitq);
 	}
-	spin_unlock(&cur_op->lock);
 	spin_unlock(&orangefs_request_list_lock);
-	op_release(cur_op);
 	return -EFAULT;
 }
 
@@ -333,8 +333,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 	n = copy_from_iter(&op->downcall, downcall_size, iter);
 	if (n != downcall_size) {
 		gossip_err("%s: failed to copy downcall.\n", __func__);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 	if (op->downcall.status)
@@ -354,8 +353,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 			   downcall_size,
 			   op->downcall.trailer_size,
 			   total);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 	/* Only READDIR operations should have trailers. */
@@ -364,8 +362,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 		gossip_err("%s: %x operation with trailer.",
 			   __func__,
 			   op->downcall.type);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 	/* READDIR operations should always have trailers. */
@@ -374,8 +371,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 		gossip_err("%s: %x operation with no trailer.",
 			   __func__,
 			   op->downcall.type);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 	if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
@@ -386,8 +382,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 	if (op->downcall.trailer_buf == NULL) {
 		gossip_err("%s: failed trailer vmalloc.\n",
 			   __func__);
-		ret = -ENOMEM;
-		goto Broken;
+		goto Enomem;
 	}
 	memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
 	n = copy_from_iter(op->downcall.trailer_buf,
@@ -396,8 +391,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 	if (n != op->downcall.trailer_size) {
 		gossip_err("%s: failed to copy trailer.\n", __func__);
 		vfree(op->downcall.trailer_buf);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 wakeup:
@@ -406,38 +400,27 @@ wakeup:
 	 * that this op is done
 	 */
 	spin_lock(&op->lock);
-	if (unlikely(op_state_given_up(op))) {
+	if (unlikely(op_is_cancel(op))) {
 		spin_unlock(&op->lock);
-		goto out;
-	}
-	set_op_state_serviced(op);
-	spin_unlock(&op->lock);
-
-	/*
-	 * If this operation is an I/O operation we need to wait
-	 * for all data to be copied before we can return to avoid
-	 * buffer corruption and races that can pull the buffers
-	 * out from under us.
-	 *
-	 * Essentially we're synchronizing with other parts of the
-	 * vfs implicitly by not allowing the user space
-	 * application reading/writing this device to return until
-	 * the buffers are done being used.
-	 */
-out:
-	if (unlikely(op_is_cancel(op)))
 		put_cancel(op);
-	op_release(op);
-	return ret;
-
-Broken:
-	spin_lock(&op->lock);
-	if (!op_state_given_up(op)) {
-		op->downcall.status = ret;
+	} else if (unlikely(op_state_given_up(op))) {
+		spin_unlock(&op->lock);
+		complete(&op->waitq);
+	} else {
 		set_op_state_serviced(op);
+		spin_unlock(&op->lock);
 	}
-	spin_unlock(&op->lock);
-	goto out;
+	return ret;
+
+Efault:
+	op->downcall.status = -(ORANGEFS_ERROR_BIT | 9);
+	ret = -EFAULT;
+	goto wakeup;
+
+Enomem:
+	op->downcall.status = -(ORANGEFS_ERROR_BIT | 8);
+	ret = -ENOMEM;
+	goto wakeup;
 }
 
 /* Returns whether any FS are still pending remounted */
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index 817092a..900a2e3 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -120,8 +120,6 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type)
 		spin_lock_init(&new_op->lock);
 		init_completion(&new_op->waitq);
 
-		atomic_set(&new_op->ref_count, 1);
-
 		new_op->upcall.type = ORANGEFS_VFS_OP_INVALID;
 		new_op->downcall.type = ORANGEFS_VFS_OP_INVALID;
 		new_op->downcall.status = -1;
@@ -149,7 +147,7 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type)
 	return new_op;
 }
 
-void __op_release(struct orangefs_kernel_op_s *orangefs_op)
+void op_release(struct orangefs_kernel_op_s *orangefs_op)
 {
 	if (orangefs_op) {
 		gossip_debug(GOSSIP_CACHE_DEBUG,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 1f8310c..e387d3c 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -205,8 +205,6 @@ struct orangefs_kernel_op_s {
 	struct completion waitq;
 	spinlock_t lock;
 
-	atomic_t ref_count;
-
 	/* VFS aio fields */
 
 	int attempts;
@@ -230,23 +228,7 @@ static inline void set_op_state_serviced(struct orangefs_kernel_op_s *op)
 #define op_state_given_up(op)  ((op)->op_state & OP_VFS_STATE_GIVEN_UP)
 #define op_is_cancel(op)       ((op)->downcall.type == ORANGEFS_VFS_OP_CANCEL)
 
-static inline void get_op(struct orangefs_kernel_op_s *op)
-{
-	atomic_inc(&op->ref_count);
-	gossip_debug(GOSSIP_DEV_DEBUG,
-		"(get) Alloced OP (%p:%llu)\n", op, llu(op->tag));
-}
-
-void __op_release(struct orangefs_kernel_op_s *op);
-
-static inline void op_release(struct orangefs_kernel_op_s *op)
-{
-	if (atomic_dec_and_test(&op->ref_count)) {
-		gossip_debug(GOSSIP_DEV_DEBUG,
-			"(put) Releasing OP (%p:%llu)\n", op, llu((op)->tag));
-		__op_release(op);
-	}
-}
+void op_release(struct orangefs_kernel_op_s *op);
 
 extern void orangefs_bufmap_put(int);
 static inline void put_cancel(struct orangefs_kernel_op_s *op)
@@ -259,7 +241,7 @@ static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
 {
 	spin_lock(&op->lock);
 	if (unlikely(op_is_cancel(op))) {
-		list_del(&op->list);
+		list_del_init(&op->list);
 		spin_unlock(&op->lock);
 		put_cancel(op);
 	} else {
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index 2528d58..0ea2741 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -65,7 +65,7 @@ int service_operation(struct orangefs_kernel_op_s *op,
 	op->upcall.pid = current->pid;
 
retry_servicing:
-	op->downcall.status = 0;
+	op->downcall.status = OP_VFS_STATE_UNKNOWN;
 	gossip_debug(GOSSIP_WAIT_DEBUG,
 		     "%s: %s op:%p: process:%s: pid:%d:\n",
 		     __func__,
@@ -103,8 +103,9 @@ retry_servicing:
 		wake_up_interruptible(&orangefs_request_list_waitq);
 		if (!__is_daemon_in_service()) {
 			gossip_debug(GOSSIP_WAIT_DEBUG,
-				     "%s:client core is NOT in service.\n",
-				     __func__);
+				     "%s:client core is NOT in service, %p.\n",
+				     __func__,
+				     op);
 			timeout = op_timeout_secs * HZ;
 		}
 		spin_unlock(&orangefs_request_list_lock);
@@ -208,15 +209,20 @@ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s
 	 * Called with op->lock held.
 	 */
 	op->op_state |= OP_VFS_STATE_GIVEN_UP;
-
-	if (op_state_waiting(op)) {
+	/* from that point on it can't be moved by anybody else */
+	if (list_empty(&op->list)) {
+		/* caught copying to/from daemon */
+		BUG_ON(op_state_serviced(op));
+		spin_unlock(&op->lock);
+		wait_for_completion(&op->waitq);
+	} else if (op_state_waiting(op)) {
 		/*
 		 * upcall hasn't been read; remove op from upcall request
 		 * list.
 		 */
 		spin_unlock(&op->lock);
 		spin_lock(&orangefs_request_list_lock);
-		list_del(&op->list);
+		list_del_init(&op->list);
 		spin_unlock(&orangefs_request_list_lock);
 		gossip_debug(GOSSIP_WAIT_DEBUG,