
[RFC,v5,15/16] fuse: {io-uring} Prevent mount point hang on fuse-server termination

Message ID 20241107-fuse-uring-for-6-10-rfc4-v5-15-e8660a991499@ddn.com (mailing list archive)
State New
Series fuse: fuse-over-io-uring

Commit Message

Bernd Schubert Nov. 7, 2024, 5:03 p.m. UTC
When the fuse-server terminates while the fuse-client or kernel
still has queued URING_CMDs, these commands retain references
to the struct file used by the fuse connection. This prevents
fuse_dev_release() from being invoked, resulting in a hung mount
point.

This patch addresses the issue by making queued URING_CMDs
cancelable, allowing fuse_dev_release() to proceed as expected
and preventing the mount point from hanging.

Signed-off-by: Bernd Schubert <bschubert@ddn.com>
---
 fs/fuse/dev_uring.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 6 deletions(-)

Comments

Joanne Koong Nov. 18, 2024, 7:32 p.m. UTC | #1
On Thu, Nov 7, 2024 at 9:04 AM Bernd Schubert <bschubert@ddn.com> wrote:
>
> When the fuse-server terminates while the fuse-client or kernel
> still has queued URING_CMDs, these commands retain references
> to the struct file used by the fuse connection. This prevents
> fuse_dev_release() from being invoked, resulting in a hung mount
> point.

Could you explain the flow of what happens after a fuse server
terminates? How does that trigger the IO_URING_F_CANCEL uring cmd?

>
> This patch addresses the issue by making queued URING_CMDs
> cancelable, allowing fuse_dev_release() to proceed as expected
> and preventing the mount point from hanging.
>
> Signed-off-by: Bernd Schubert <bschubert@ddn.com>
> ---
>  fs/fuse/dev_uring.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 70 insertions(+), 6 deletions(-)
>
> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> index 6af515458695ccb2e32cc8c62c45471e6710c15f..b465da41c42c47eaf69f09bab1423061bc8fcc68 100644
> --- a/fs/fuse/dev_uring.c
> +++ b/fs/fuse/dev_uring.c
> @@ -23,6 +23,7 @@ MODULE_PARM_DESC(enable_uring,
>
>  struct fuse_uring_cmd_pdu {
>         struct fuse_ring_ent *ring_ent;
> +       struct fuse_ring_queue *queue;
>  };
>
>  /*
> @@ -382,6 +383,61 @@ void fuse_uring_stop_queues(struct fuse_ring *ring)
>         }
>  }
>
> +/*
> + * Handle IO_URING_F_CANCEL, typically should come on daemon termination
> + */
> +static void fuse_uring_cancel(struct io_uring_cmd *cmd,
> +                             unsigned int issue_flags, struct fuse_conn *fc)
> +{
> +       struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
> +       struct fuse_ring_queue *queue = pdu->queue;
> +       struct fuse_ring_ent *ent;
> +       bool found = false;
> +       bool need_cmd_done = false;
> +
> +       spin_lock(&queue->lock);
> +
> +       /* XXX: This is cumbersome for large queues. */
> +       list_for_each_entry(ent, &queue->ent_avail_queue, list) {
> +               if (pdu->ring_ent == ent) {
> +                       found = true;
> +                       break;
> +               }
> +       }
> +
> +       if (!found) {
> +               pr_info("qid=%d Did not find ent=%p", queue->qid, ent);
> +               spin_unlock(&queue->lock);
> +               return;
> +       }
> +
> +       if (ent->state == FRRS_WAIT) {
> +               ent->state = FRRS_USERSPACE;
> +               list_move(&ent->list, &queue->ent_in_userspace);
> +               need_cmd_done = true;
> +       }
> +       spin_unlock(&queue->lock);
> +
> +       if (need_cmd_done)
> +               io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
> +
> +       /*
> +        * releasing the last entry should trigger fuse_dev_release() if
> +        * the daemon was terminated
> +        */
> +}
> +
> +static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags,
> +                                     struct fuse_ring_ent *ring_ent)
> +{
> +       struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
> +
> +       pdu->ring_ent = ring_ent;
> +       pdu->queue = ring_ent->queue;
> +
> +       io_uring_cmd_mark_cancelable(cmd, issue_flags);
> +}
> +
>  /*
>   * Checks for errors and stores it into the request
>   */
> @@ -606,7 +662,8 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ring_ent)
>   * Put a ring request onto hold, it is no longer used for now.
>   */
>  static void fuse_uring_ent_avail(struct fuse_ring_ent *ring_ent,
> -                                struct fuse_ring_queue *queue)
> +                                struct fuse_ring_queue *queue,
> +                                unsigned int issue_flags)
>         __must_hold(&queue->lock)
>  {
>         struct fuse_ring *ring = queue->ring;
> @@ -626,6 +683,7 @@ static void fuse_uring_ent_avail(struct fuse_ring_ent *ring_ent,
>         list_move(&ring_ent->list, &queue->ent_avail_queue);
>
>         ring_ent->state = FRRS_WAIT;
> +       fuse_uring_prepare_cancel(ring_ent->cmd, issue_flags, ring_ent);
>  }
>
>  /* Used to find the request on SQE commit */
> @@ -729,7 +787,8 @@ static void fuse_uring_commit(struct fuse_ring_ent *ring_ent,
>   * Get the next fuse req and send it
>   */
>  static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ring_ent,
> -                                   struct fuse_ring_queue *queue)
> +                                   struct fuse_ring_queue *queue,
> +                                   unsigned int issue_flags)
>  {
>         int has_next, err;
>         int prev_state = ring_ent->state;
> @@ -738,7 +797,7 @@ static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ring_ent,
>                 spin_lock(&queue->lock);
>                 has_next = fuse_uring_ent_assign_req(ring_ent);
>                 if (!has_next) {
> -                       fuse_uring_ent_avail(ring_ent, queue);
> +                       fuse_uring_ent_avail(ring_ent, queue, issue_flags);
>                         spin_unlock(&queue->lock);
>                         break; /* no request left */
>                 }
> @@ -813,7 +872,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
>          * and fetching is done in one step vs legacy fuse, which has separated
>          * read (fetch request) and write (commit result).
>          */
> -       fuse_uring_next_fuse_req(ring_ent, queue);
> +       fuse_uring_next_fuse_req(ring_ent, queue, issue_flags);
>         return 0;
>  }
>
> @@ -853,7 +912,7 @@ static void _fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
>         struct fuse_ring *ring = queue->ring;
>
>         spin_lock(&queue->lock);
> -       fuse_uring_ent_avail(ring_ent, queue);
> +       fuse_uring_ent_avail(ring_ent, queue, issue_flags);
>         ring_ent->cmd = cmd;
>         spin_unlock(&queue->lock);
>
> @@ -1021,6 +1080,11 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
>         if (fc->aborted)
>                 goto out;
>
> +       if ((unlikely(issue_flags & IO_URING_F_CANCEL))) {
> +               fuse_uring_cancel(cmd, issue_flags, fc);
> +               return 0;
> +       }
> +
>         switch (cmd_op) {
>         case FUSE_URING_REQ_FETCH:
>                 err = fuse_uring_fetch(cmd, issue_flags, fc);
> @@ -1080,7 +1144,7 @@ fuse_uring_send_req_in_task(struct io_uring_cmd *cmd,
>
>         return;
>  err:
> -       fuse_uring_next_fuse_req(ring_ent, queue);
> +       fuse_uring_next_fuse_req(ring_ent, queue, issue_flags);
>  }
>
>  /* queue a fuse request and send it if a ring entry is available */
>
> --
> 2.43.0
>
Bernd Schubert Nov. 18, 2024, 7:55 p.m. UTC | #2
On 11/18/24 20:32, Joanne Koong wrote:
> On Thu, Nov 7, 2024 at 9:04 AM Bernd Schubert <bschubert@ddn.com> wrote:
>>
>> When the fuse-server terminates while the fuse-client or kernel
>> still has queued URING_CMDs, these commands retain references
>> to the struct file used by the fuse connection. This prevents
>> fuse_dev_release() from being invoked, resulting in a hung mount
>> point.
> 
> Could you explain the flow of what happens after a fuse server
> terminates? How does that trigger the IO_URING_F_CANCEL uring cmd?

This is all about daemon termination, when the mount point is still
alive. Basically, without this patch even a plain (non-forced) umount
hangs.
Without queued IORING_OP_URING_CMDs there is a call into 
fuse_dev_release() on daemon termination, with queued
IORING_OP_URING_CMDs this doesn't happen as each of these commands
holds a file reference.

IO_URING_F_CANCEL is triggered from io-uring, I guess when
the io-uring file descriptor is released 
(note: 'io-uring fd' != '/dev/fuse fd').


I guess I need to improve the commit message a bit.



Cheers,
Bernd
Joanne Koong Nov. 18, 2024, 11:10 p.m. UTC | #3
On Mon, Nov 18, 2024 at 11:55 AM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
> On 11/18/24 20:32, Joanne Koong wrote:
> > On Thu, Nov 7, 2024 at 9:04 AM Bernd Schubert <bschubert@ddn.com> wrote:
> >>
> >> When the fuse-server terminates while the fuse-client or kernel
> >> still has queued URING_CMDs, these commands retain references
> >> to the struct file used by the fuse connection. This prevents
> >> fuse_dev_release() from being invoked, resulting in a hung mount
> >> point.
> >
> > Could you explain the flow of what happens after a fuse server
> > terminates? How does that trigger the IO_URING_F_CANCEL uring cmd?
>
> This is all about daemon termination, when the mount point is still
> alive. Basically, without this patch even a plain (non-forced) umount
> hangs.
> Without queued IORING_OP_URING_CMDs there is a call into
> fuse_dev_release() on daemon termination, with queued
> IORING_OP_URING_CMDs this doesn't happen as each of these commands
> holds a file reference.
>
> IO_URING_F_CANCEL is triggered from io-uring, I guess when
> the io-uring file descriptor is released
> (note: 'io-uring fd' != '/dev/fuse fd').

Gotcha. I took a look at the io_uring code and it looks like the call
chain looks something like this:

io_uring_release()
  io_ring_ctx_wait_and_kill()
    io_ring_exit_work()
      io_uring_try_cancel_requests()
        io_uring_try_cancel_uring_cmd()
            file->f_op->uring_cmd() w/ IO_URING_F_CANCEL issues_flag

>
>
> I guess I need to improve the commit message a bit.
>
>
>
> Cheers,
> Bernd
>
>
>
>
Joanne Koong Nov. 18, 2024, 11:30 p.m. UTC | #4
On Thu, Nov 7, 2024 at 9:04 AM Bernd Schubert <bschubert@ddn.com> wrote:
>
> When the fuse-server terminates while the fuse-client or kernel
> still has queued URING_CMDs, these commands retain references
> to the struct file used by the fuse connection. This prevents
> fuse_dev_release() from being invoked, resulting in a hung mount
> point.
>
> This patch addresses the issue by making queued URING_CMDs
> cancelable, allowing fuse_dev_release() to proceed as expected
> and preventing the mount point from hanging.
>
> Signed-off-by: Bernd Schubert <bschubert@ddn.com>
> ---
>  fs/fuse/dev_uring.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 70 insertions(+), 6 deletions(-)
>
> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> index 6af515458695ccb2e32cc8c62c45471e6710c15f..b465da41c42c47eaf69f09bab1423061bc8fcc68 100644
> --- a/fs/fuse/dev_uring.c
> +++ b/fs/fuse/dev_uring.c
> @@ -23,6 +23,7 @@ MODULE_PARM_DESC(enable_uring,
>
>  struct fuse_uring_cmd_pdu {
>         struct fuse_ring_ent *ring_ent;
> +       struct fuse_ring_queue *queue;
>  };
>
>  /*
> @@ -382,6 +383,61 @@ void fuse_uring_stop_queues(struct fuse_ring *ring)
>         }
>  }
>
> +/*
> + * Handle IO_URING_F_CANCEL, typically should come on daemon termination
> + */
> +static void fuse_uring_cancel(struct io_uring_cmd *cmd,
> +                             unsigned int issue_flags, struct fuse_conn *fc)
> +{
> +       struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
> +       struct fuse_ring_queue *queue = pdu->queue;
> +       struct fuse_ring_ent *ent;
> +       bool found = false;
> +       bool need_cmd_done = false;
> +
> +       spin_lock(&queue->lock);
> +
> +       /* XXX: This is cumbersome for large queues. */
> +       list_for_each_entry(ent, &queue->ent_avail_queue, list) {
> +               if (pdu->ring_ent == ent) {
> +                       found = true;
> +                       break;
> +               }
> +       }

Do we have to check that the entry is on the ent_avail_queue, or can
we assume that if the ent->state is FRRS_WAIT, the only queue it'll be
on is the ent_avail_queue? I see only one case where this isn't true,
for teardown in fuse_uring_stop_list_entries() - if we had a
workaround for this, eg having some teardown state signifying that
io_uring_cmd_done() needs to be called on the cmd and clearing
FRRS_WAIT, then we could get rid of iteration through ent_avail_queue
for every cancelled cmd.
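
To make the idea concrete, roughly (untested, and as discussed in the
replies below it depends on whether the entry can go away underneath
us):

	spin_lock(&queue->lock);
	ent = pdu->ring_ent;
	if (ent->state == FRRS_WAIT) {
		/* same transition the patch does after the list search */
		ent->state = FRRS_USERSPACE;
		list_move(&ent->list, &queue->ent_in_userspace);
		need_cmd_done = true;
	}
	spin_unlock(&queue->lock);

	if (need_cmd_done)
		io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);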

> +
> +       if (!found) {
> +               pr_info("qid=%d Did not find ent=%p", queue->qid, ent);
> +               spin_unlock(&queue->lock);
> +               return;
> +       }
> +
> +       if (ent->state == FRRS_WAIT) {
> +               ent->state = FRRS_USERSPACE;
> +               list_move(&ent->list, &queue->ent_in_userspace);
> +               need_cmd_done = true;
> +       }
> +       spin_unlock(&queue->lock);
> +
> +       if (need_cmd_done)
> +               io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
> +
> +       /*
> +        * releasing the last entry should trigger fuse_dev_release() if
> +        * the daemon was terminated
> +        */
> +}
> +
> +static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags,
> +                                     struct fuse_ring_ent *ring_ent)
> +{
> +       struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
> +
> +       pdu->ring_ent = ring_ent;
> +       pdu->queue = ring_ent->queue;
> +
> +       io_uring_cmd_mark_cancelable(cmd, issue_flags);
> +}
> +
>  /*
>   * Checks for errors and stores it into the request
>   */
> @@ -606,7 +662,8 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ring_ent)
>   * Put a ring request onto hold, it is no longer used for now.
>   */
>  static void fuse_uring_ent_avail(struct fuse_ring_ent *ring_ent,
> -                                struct fuse_ring_queue *queue)
> +                                struct fuse_ring_queue *queue,
> +                                unsigned int issue_flags)
>         __must_hold(&queue->lock)
>  {
>         struct fuse_ring *ring = queue->ring;
> @@ -626,6 +683,7 @@ static void fuse_uring_ent_avail(struct fuse_ring_ent *ring_ent,
>         list_move(&ring_ent->list, &queue->ent_avail_queue);
>
>         ring_ent->state = FRRS_WAIT;
> +       fuse_uring_prepare_cancel(ring_ent->cmd, issue_flags, ring_ent);
>  }
>
>  /* Used to find the request on SQE commit */
> @@ -729,7 +787,8 @@ static void fuse_uring_commit(struct fuse_ring_ent *ring_ent,
>   * Get the next fuse req and send it
>   */
>  static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ring_ent,
> -                                   struct fuse_ring_queue *queue)
> +                                   struct fuse_ring_queue *queue,
> +                                   unsigned int issue_flags)
>  {
>         int has_next, err;
>         int prev_state = ring_ent->state;
> @@ -738,7 +797,7 @@ static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ring_ent,
>                 spin_lock(&queue->lock);
>                 has_next = fuse_uring_ent_assign_req(ring_ent);
>                 if (!has_next) {
> -                       fuse_uring_ent_avail(ring_ent, queue);
> +                       fuse_uring_ent_avail(ring_ent, queue, issue_flags);
>                         spin_unlock(&queue->lock);
>                         break; /* no request left */
>                 }
> @@ -813,7 +872,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
>          * and fetching is done in one step vs legacy fuse, which has separated
>          * read (fetch request) and write (commit result).
>          */
> -       fuse_uring_next_fuse_req(ring_ent, queue);
> +       fuse_uring_next_fuse_req(ring_ent, queue, issue_flags);
>         return 0;
>  }
>
> @@ -853,7 +912,7 @@ static void _fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
>         struct fuse_ring *ring = queue->ring;
>
>         spin_lock(&queue->lock);
> -       fuse_uring_ent_avail(ring_ent, queue);
> +       fuse_uring_ent_avail(ring_ent, queue, issue_flags);
>         ring_ent->cmd = cmd;
>         spin_unlock(&queue->lock);
>
> @@ -1021,6 +1080,11 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
>         if (fc->aborted)
>                 goto out;
>
> +       if ((unlikely(issue_flags & IO_URING_F_CANCEL))) {
> +               fuse_uring_cancel(cmd, issue_flags, fc);
> +               return 0;
> +       }
> +
>         switch (cmd_op) {
>         case FUSE_URING_REQ_FETCH:
>                 err = fuse_uring_fetch(cmd, issue_flags, fc);
> @@ -1080,7 +1144,7 @@ fuse_uring_send_req_in_task(struct io_uring_cmd *cmd,
>
>         return;
>  err:
> -       fuse_uring_next_fuse_req(ring_ent, queue);
> +       fuse_uring_next_fuse_req(ring_ent, queue, issue_flags);
>  }
>
>  /* queue a fuse request and send it if a ring entry is available */
>
> --
> 2.43.0
>
Bernd Schubert Nov. 18, 2024, 11:47 p.m. UTC | #5
On 11/19/24 00:30, Joanne Koong wrote:
> On Thu, Nov 7, 2024 at 9:04 AM Bernd Schubert <bschubert@ddn.com> wrote:
>>
>> When the fuse-server terminates while the fuse-client or kernel
>> still has queued URING_CMDs, these commands retain references
>> to the struct file used by the fuse connection. This prevents
>> fuse_dev_release() from being invoked, resulting in a hung mount
>> point.
>>
>> This patch addresses the issue by making queued URING_CMDs
>> cancelable, allowing fuse_dev_release() to proceed as expected
>> and preventing the mount point from hanging.
>>
>> Signed-off-by: Bernd Schubert <bschubert@ddn.com>
>> ---
>>  fs/fuse/dev_uring.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 70 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
>> index 6af515458695ccb2e32cc8c62c45471e6710c15f..b465da41c42c47eaf69f09bab1423061bc8fcc68 100644
>> --- a/fs/fuse/dev_uring.c
>> +++ b/fs/fuse/dev_uring.c
>> @@ -23,6 +23,7 @@ MODULE_PARM_DESC(enable_uring,
>>
>>  struct fuse_uring_cmd_pdu {
>>         struct fuse_ring_ent *ring_ent;
>> +       struct fuse_ring_queue *queue;
>>  };
>>
>>  /*
>> @@ -382,6 +383,61 @@ void fuse_uring_stop_queues(struct fuse_ring *ring)
>>         }
>>  }
>>
>> +/*
>> + * Handle IO_URING_F_CANCEL, typically should come on daemon termination
>> + */
>> +static void fuse_uring_cancel(struct io_uring_cmd *cmd,
>> +                             unsigned int issue_flags, struct fuse_conn *fc)
>> +{
>> +       struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
>> +       struct fuse_ring_queue *queue = pdu->queue;
>> +       struct fuse_ring_ent *ent;
>> +       bool found = false;
>> +       bool need_cmd_done = false;
>> +
>> +       spin_lock(&queue->lock);
>> +
>> +       /* XXX: This is cumbersome for large queues. */
>> +       list_for_each_entry(ent, &queue->ent_avail_queue, list) {
>> +               if (pdu->ring_ent == ent) {
>> +                       found = true;
>> +                       break;
>> +               }
>> +       }
> 
> Do we have to check that the entry is on the ent_avail_queue, or can
> we assume that if the ent->state is FRRS_WAIT, the only queue it'll be
> on is the ent_avail_queue? I see only one case where this isn't true,
> for teardown in fuse_uring_stop_list_entries() - if we had a
> workaround for this, eg having some teardown state signifying that
> io_uring_cmd_done() needs to be called on the cmd and clearing
> FRRS_WAIT, then we could get rid of iteration through ent_avail_queue
> for every cancelled cmd.


I'm scared that we would run into races - I don't want to access memory
pointed to by pdu->ring_ent before I'm sure it is on the list.
Remember the long discussion Miklos and I had about the tiny 'tag'
variable and finding requests using existing hash lists [0] ? 
Personally I would prefer an array of 

struct queue_entries {
	struct fuse_ring_ent *ring_ent;
	bool valid;
}


in struct fuse_ring_queue {
    ...
    struct queue_entries *entries[]
}

And that array would only get freed on queue destruction. Besides
avoiding hash lists, it would also allow using 'valid' to know if
we can access the ring_entry and then check the state.
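
Roughly, and just as a sketch (the slot index in the pdu and the exact
check are made up, none of this is in the series yet), cancel could then
do a direct lookup instead of walking a list:

	/* hypothetical: pdu carries the slot index instead of the pointer */
	struct queue_entries *slot = queue->entries[pdu->idx];
	struct fuse_ring_ent *ent;

	spin_lock(&queue->lock);
	if (slot->valid && slot->ring_ent->state == FRRS_WAIT) {
		ent = slot->ring_ent;
		ent->state = FRRS_USERSPACE;
		list_move(&ent->list, &queue->ent_in_userspace);
		need_cmd_done = true;
	}
	spin_unlock(&queue->lock);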

Thanks,
Bernd


[0] https://lore.kernel.org/linux-fsdevel/CAJfpegu_UQ1BNp0UDHeOZFWwUoXbJ_LP4W=o+UX6MB3DsJbH8g@mail.gmail.com/T/#t
Joanne Koong Nov. 19, 2024, 2:02 a.m. UTC | #6
On Mon, Nov 18, 2024 at 3:47 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
> On 11/19/24 00:30, Joanne Koong wrote:
> > On Thu, Nov 7, 2024 at 9:04 AM Bernd Schubert <bschubert@ddn.com> wrote:
> >>
> >> When the fuse-server terminates while the fuse-client or kernel
> >> still has queued URING_CMDs, these commands retain references
> >> to the struct file used by the fuse connection. This prevents
> >> fuse_dev_release() from being invoked, resulting in a hung mount
> >> point.
> >>
> >> This patch addresses the issue by making queued URING_CMDs
> >> cancelable, allowing fuse_dev_release() to proceed as expected
> >> and preventing the mount point from hanging.
> >>
> >> Signed-off-by: Bernd Schubert <bschubert@ddn.com>
> >> ---
> >>  fs/fuse/dev_uring.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++-----
> >>  1 file changed, 70 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> >> index 6af515458695ccb2e32cc8c62c45471e6710c15f..b465da41c42c47eaf69f09bab1423061bc8fcc68 100644
> >> --- a/fs/fuse/dev_uring.c
> >> +++ b/fs/fuse/dev_uring.c
> >> @@ -23,6 +23,7 @@ MODULE_PARM_DESC(enable_uring,
> >>
> >>  struct fuse_uring_cmd_pdu {
> >>         struct fuse_ring_ent *ring_ent;
> >> +       struct fuse_ring_queue *queue;
> >>  };
> >>
> >>  /*
> >> @@ -382,6 +383,61 @@ void fuse_uring_stop_queues(struct fuse_ring *ring)
> >>         }
> >>  }
> >>
> >> +/*
> >> + * Handle IO_URING_F_CANCEL, typically should come on daemon termination
> >> + */
> >> +static void fuse_uring_cancel(struct io_uring_cmd *cmd,
> >> +                             unsigned int issue_flags, struct fuse_conn *fc)
> >> +{
> >> +       struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
> >> +       struct fuse_ring_queue *queue = pdu->queue;
> >> +       struct fuse_ring_ent *ent;
> >> +       bool found = false;
> >> +       bool need_cmd_done = false;
> >> +
> >> +       spin_lock(&queue->lock);
> >> +
> >> +       /* XXX: This is cumbersome for large queues. */
> >> +       list_for_each_entry(ent, &queue->ent_avail_queue, list) {
> >> +               if (pdu->ring_ent == ent) {
> >> +                       found = true;
> >> +                       break;
> >> +               }
> >> +       }
> >
> > Do we have to check that the entry is on the ent_avail_queue, or can
> > we assume that if the ent->state is FRRS_WAIT, the only queue it'll be
> > on is the ent_avail_queue? I see only one case where this isn't true,
> > for teardown in fuse_uring_stop_list_entries() - if we had a
> > workaround for this, eg having some teardown state signifying that
> > io_uring_cmd_done() needs to be called on the cmd and clearing
> > FRRS_WAIT, then we could get rid of iteration through ent_avail_queue
> > for every cancelled cmd.
>
>
> I'm scared that we would run into races - I don't want to access memory
> pointed to by pdu->ring_ent before I'm sure it is on the list.

Oh, I was seeing that we mark the cmd as cancellable (eg in
fuse_uring_prepare_cancel()) only after the ring_ent is moved to the
ent_avail_queue (in fuse_uring_ent_avail()) and that this is done in
the scope of the queue->lock, so we would only call into
fuse_uring_cancel() when the ring_ent is on the list. Could there
still be a race condition here?

Alternatively, inspired by your "bool valid;" idea below, maybe
another solution would be having a bit in "struct fuse_ring_ent"
tracking if io_uring_cmd_done() needs to be called on it?

This is fairly unimportant though - this part could always be
optimized in a future patchset if you think it needs to be optimized,
but was just curious if these would work.


Thanks,
Joanne

> Remember the long discussion Miklos and I had about the tiny 'tag'
> variable and finding requests using existing hash lists [0] ?
> Personally I would prefer an array of
>
> struct queue_entries {
>         struct fuse_ring_ent *ring_ent;
>         bool valid;
> }
>
>
> in struct fuse_ring_queue {
>     ...
>     struct queue_entries *entries[]
> }
>
> And that array would only get freed on queue destruction. Besides
> avoiding hash lists, it would also allow using 'valid' to know if
> we can access the ring_entry and then check the state.
>
> Thanks,
> Bernd
>
>
> [0] https://lore.kernel.org/linux-fsdevel/CAJfpegu_UQ1BNp0UDHeOZFWwUoXbJ_LP4W=o+UX6MB3DsJbH8g@mail.gmail.com/T/#t
Bernd Schubert Nov. 19, 2024, 9:32 a.m. UTC | #7
On 11/19/24 03:02, Joanne Koong wrote:
> On Mon, Nov 18, 2024 at 3:47 PM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>> On 11/19/24 00:30, Joanne Koong wrote:
>>> On Thu, Nov 7, 2024 at 9:04 AM Bernd Schubert <bschubert@ddn.com> wrote:
>>>>
>>>> When the fuse-server terminates while the fuse-client or kernel
>>>> still has queued URING_CMDs, these commands retain references
>>>> to the struct file used by the fuse connection. This prevents
>>>> fuse_dev_release() from being invoked, resulting in a hung mount
>>>> point.
>>>>
>>>> This patch addresses the issue by making queued URING_CMDs
>>>> cancelable, allowing fuse_dev_release() to proceed as expected
>>>> and preventing the mount point from hanging.
>>>>
>>>> Signed-off-by: Bernd Schubert <bschubert@ddn.com>
>>>> ---
>>>>  fs/fuse/dev_uring.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++-----
>>>>  1 file changed, 70 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
>>>> index 6af515458695ccb2e32cc8c62c45471e6710c15f..b465da41c42c47eaf69f09bab1423061bc8fcc68 100644
>>>> --- a/fs/fuse/dev_uring.c
>>>> +++ b/fs/fuse/dev_uring.c
>>>> @@ -23,6 +23,7 @@ MODULE_PARM_DESC(enable_uring,
>>>>
>>>>  struct fuse_uring_cmd_pdu {
>>>>         struct fuse_ring_ent *ring_ent;
>>>> +       struct fuse_ring_queue *queue;
>>>>  };
>>>>
>>>>  /*
>>>> @@ -382,6 +383,61 @@ void fuse_uring_stop_queues(struct fuse_ring *ring)
>>>>         }
>>>>  }
>>>>
>>>> +/*
>>>> + * Handle IO_URING_F_CANCEL, typically should come on daemon termination
>>>> + */
>>>> +static void fuse_uring_cancel(struct io_uring_cmd *cmd,
>>>> +                             unsigned int issue_flags, struct fuse_conn *fc)
>>>> +{
>>>> +       struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
>>>> +       struct fuse_ring_queue *queue = pdu->queue;
>>>> +       struct fuse_ring_ent *ent;
>>>> +       bool found = false;
>>>> +       bool need_cmd_done = false;
>>>> +
>>>> +       spin_lock(&queue->lock);
>>>> +
>>>> +       /* XXX: This is cumbersome for large queues. */
>>>> +       list_for_each_entry(ent, &queue->ent_avail_queue, list) {
>>>> +               if (pdu->ring_ent == ent) {
>>>> +                       found = true;
>>>> +                       break;
>>>> +               }
>>>> +       }
>>>
>>> Do we have to check that the entry is on the ent_avail_queue, or can
>>> we assume that if the ent->state is FRRS_WAIT, the only queue it'll be
>>> on is the ent_avail_queue? I see only one case where this isn't true,
>>> for teardown in fuse_uring_stop_list_entries() - if we had a
>>> workaround for this, eg having some teardown state signifying that
>>> io_uring_cmd_done() needs to be called on the cmd and clearing
>>> FRRS_WAIT, then we could get rid of iteration through ent_avail_queue
>>> for every cancelled cmd.
>>
>>
>> I'm scared that we would run into races - I don't want to access memory
>> pointed to by pdu->ring_ent before I'm sure it is on the list.
> 
> Oh, I was seeing that we mark the cmd as cancellable (eg in
> fuse_uring_prepare_cancel()) only after the ring_ent is moved to the
> ent_avail_queue (in fuse_uring_ent_avail()) and that this is done in
> the scope of the queue->lock, so we would only call into
> fuse_uring_cancel() when the ring_ent is on the list. Could there
> still be a race condition here?
> 
> Alternatively, inspired by your "bool valid;" idea below, maybe
> another solution would be having a bit in "struct fuse_ring_ent"
> tracking if io_uring_cmd_done() needs to be called on it?

What I mean is that daemon termination might race with normal umount.
Umount does everything cleanly and iterates through lists, but might
free 'struct fuse_ring_ent', see fuse_uring_entry_teardown().
On the other hand, daemon termination with IO_URING_F_CANCEL has
the pointer to ring_ent - but the memory it points to might already
have been freed by umount. That also means another bit in
"struct fuse_ring_ent" won't help.

> 
> This is fairly unimportant though - this part could always be
> optimized in a future patchset if you think it needs to be optimized,
> but was just curious if these would work.
> 

I'm going to change the logic a bit and will introduce another list,
'freeable_ring_ent'. Entries will be moved to that new list and
only freed in fuse_uring_destruct(). After that, IO_URING_F_CANCEL
can check the state of the ring_ent directly.


Thanks for the discussion!


Bernd

Patch

diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 6af515458695ccb2e32cc8c62c45471e6710c15f..b465da41c42c47eaf69f09bab1423061bc8fcc68 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -23,6 +23,7 @@  MODULE_PARM_DESC(enable_uring,
 
 struct fuse_uring_cmd_pdu {
 	struct fuse_ring_ent *ring_ent;
+	struct fuse_ring_queue *queue;
 };
 
 /*
@@ -382,6 +383,61 @@  void fuse_uring_stop_queues(struct fuse_ring *ring)
 	}
 }
 
+/*
+ * Handle IO_URING_F_CANCEL, typically should come on daemon termination
+ */
+static void fuse_uring_cancel(struct io_uring_cmd *cmd,
+			      unsigned int issue_flags, struct fuse_conn *fc)
+{
+	struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
+	struct fuse_ring_queue *queue = pdu->queue;
+	struct fuse_ring_ent *ent;
+	bool found = false;
+	bool need_cmd_done = false;
+
+	spin_lock(&queue->lock);
+
+	/* XXX: This is cumbersome for large queues. */
+	list_for_each_entry(ent, &queue->ent_avail_queue, list) {
+		if (pdu->ring_ent == ent) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		pr_info("qid=%d Did not find ent=%p", queue->qid, ent);
+		spin_unlock(&queue->lock);
+		return;
+	}
+
+	if (ent->state == FRRS_WAIT) {
+		ent->state = FRRS_USERSPACE;
+		list_move(&ent->list, &queue->ent_in_userspace);
+		need_cmd_done = true;
+	}
+	spin_unlock(&queue->lock);
+
+	if (need_cmd_done)
+		io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
+
+	/*
+	 * releasing the last entry should trigger fuse_dev_release() if
+	 * the daemon was terminated
+	 */
+}
+
+static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags,
+				      struct fuse_ring_ent *ring_ent)
+{
+	struct fuse_uring_cmd_pdu *pdu = (struct fuse_uring_cmd_pdu *)cmd->pdu;
+
+	pdu->ring_ent = ring_ent;
+	pdu->queue = ring_ent->queue;
+
+	io_uring_cmd_mark_cancelable(cmd, issue_flags);
+}
+
 /*
  * Checks for errors and stores it into the request
  */
@@ -606,7 +662,8 @@  static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ring_ent)
  * Put a ring request onto hold, it is no longer used for now.
  */
 static void fuse_uring_ent_avail(struct fuse_ring_ent *ring_ent,
-				 struct fuse_ring_queue *queue)
+				 struct fuse_ring_queue *queue,
+				 unsigned int issue_flags)
 	__must_hold(&queue->lock)
 {
 	struct fuse_ring *ring = queue->ring;
@@ -626,6 +683,7 @@  static void fuse_uring_ent_avail(struct fuse_ring_ent *ring_ent,
 	list_move(&ring_ent->list, &queue->ent_avail_queue);
 
 	ring_ent->state = FRRS_WAIT;
+	fuse_uring_prepare_cancel(ring_ent->cmd, issue_flags, ring_ent);
 }
 
 /* Used to find the request on SQE commit */
@@ -729,7 +787,8 @@  static void fuse_uring_commit(struct fuse_ring_ent *ring_ent,
  * Get the next fuse req and send it
  */
 static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ring_ent,
-				    struct fuse_ring_queue *queue)
+				    struct fuse_ring_queue *queue,
+				    unsigned int issue_flags)
 {
 	int has_next, err;
 	int prev_state = ring_ent->state;
@@ -738,7 +797,7 @@  static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ring_ent,
 		spin_lock(&queue->lock);
 		has_next = fuse_uring_ent_assign_req(ring_ent);
 		if (!has_next) {
-			fuse_uring_ent_avail(ring_ent, queue);
+			fuse_uring_ent_avail(ring_ent, queue, issue_flags);
 			spin_unlock(&queue->lock);
 			break; /* no request left */
 		}
@@ -813,7 +872,7 @@  static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
 	 * and fetching is done in one step vs legacy fuse, which has separated
 	 * read (fetch request) and write (commit result).
 	 */
-	fuse_uring_next_fuse_req(ring_ent, queue);
+	fuse_uring_next_fuse_req(ring_ent, queue, issue_flags);
 	return 0;
 }
 
@@ -853,7 +912,7 @@  static void _fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
 	struct fuse_ring *ring = queue->ring;
 
 	spin_lock(&queue->lock);
-	fuse_uring_ent_avail(ring_ent, queue);
+	fuse_uring_ent_avail(ring_ent, queue, issue_flags);
 	ring_ent->cmd = cmd;
 	spin_unlock(&queue->lock);
 
@@ -1021,6 +1080,11 @@  int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	if (fc->aborted)
 		goto out;
 
+	if ((unlikely(issue_flags & IO_URING_F_CANCEL))) {
+		fuse_uring_cancel(cmd, issue_flags, fc);
+		return 0;
+	}
+
 	switch (cmd_op) {
 	case FUSE_URING_REQ_FETCH:
 		err = fuse_uring_fetch(cmd, issue_flags, fc);
@@ -1080,7 +1144,7 @@  fuse_uring_send_req_in_task(struct io_uring_cmd *cmd,
 
 	return;
 err:
-	fuse_uring_next_fuse_req(ring_ent, queue);
+	fuse_uring_next_fuse_req(ring_ent, queue, issue_flags);
 }
 
 /* queue a fuse request and send it if a ring entry is available */