diff mbox series

[v8,2/3] fuse: add optional kernel-enforced timeout for requests

Message ID 20241011191320.91592-3-joannelkoong@gmail.com (mailing list archive)
State New
Headers show
Series fuse: add kernel-enforced request timeout option | expand

Commit Message

Joanne Koong Oct. 11, 2024, 7:13 p.m. UTC
There are situations where fuse servers can become unresponsive or
stuck, for example if the server is deadlocked. Currently, there's no
good way to detect if a server is stuck and needs to be killed manually.

This commit adds an option for enforcing a timeout (in minutes) for
requests where if the timeout elapses without the server responding to
the request, the connection will be automatically aborted.

Please note that these timeouts are not 100% precise. The request may
take an extra FUSE_TIMEOUT_TIMER_FREQ seconds beyond the requested max
timeout due to how it's internally implemented.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
 fs/fuse/dev.c    | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h | 21 +++++++++++++
 fs/fuse/inode.c  | 21 +++++++++++++
 3 files changed, 122 insertions(+)

Comments

Bernd Schubert Oct. 29, 2024, 7:17 p.m. UTC | #1
On 10/11/24 21:13, Joanne Koong wrote:
> There are situations where fuse servers can become unresponsive or
> stuck, for example if the server is deadlocked. Currently, there's no
> good way to detect if a server is stuck and needs to be killed manually.
> 
> This commit adds an option for enforcing a timeout (in minutes) for
> requests where if the timeout elapses without the server responding to
> the request, the connection will be automatically aborted.
> 
> Please note that these timeouts are not 100% precise. The request may
> take an extra FUSE_TIMEOUT_TIMER_FREQ seconds beyond the requested max
> timeout due to how it's internally implemented.
> 
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
>  fs/fuse/dev.c    | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/fuse/fuse_i.h | 21 +++++++++++++
>  fs/fuse/inode.c  | 21 +++++++++++++
>  3 files changed, 122 insertions(+)
> 
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 1f64ae6d7a69..054bfa2a26ed 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -45,6 +45,82 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
>  	return READ_ONCE(file->private_data);
>  }
>  
> +static bool request_expired(struct fuse_conn *fc, struct fuse_req *req)
> +{
> +	return jiffies > req->create_time + fc->timeout.req_timeout;
> +}
> +
> +/*
> + * Check if any requests aren't being completed by the specified request
> + * timeout. To do so, we:
> + * - check the fiq pending list
> + * - check the bg queue
> + * - check the fpq io and processing lists
> + *
> + * To make this fast, we only check against the head request on each list since
> + * these are generally queued in order of creation time (eg newer requests get
> + * queued to the tail). We might miss a few edge cases (eg requests transitioning
> + * between lists, re-sent requests at the head of the pending list having a
> + * later creation time than other requests on that list, etc.) but that is fine
> + * since if the request never gets fulfilled, it will eventually be caught.
> + */
> +void fuse_check_timeout(struct timer_list *timer)
> +{
> +	struct fuse_conn *fc = container_of(timer, struct fuse_conn, timeout.timer);
> +	struct fuse_iqueue *fiq = &fc->iq;
> +	struct fuse_req *req;
> +	struct fuse_dev *fud;
> +	struct fuse_pqueue *fpq;
> +	bool expired = false;
> +	int i;
> +
> +	spin_lock(&fiq->lock);
> +	req = list_first_entry_or_null(&fiq->pending, struct fuse_req, list);
> +	if (req)
> +		expired = request_expired(fc, req);
> +	spin_unlock(&fiq->lock);
> +	if (expired)
> +		goto abort_conn;
> +
> +	spin_lock(&fc->bg_lock);
> +	req = list_first_entry_or_null(&fc->bg_queue, struct fuse_req, list);
> +	if (req)
> +		expired = request_expired(fc, req);
> +	spin_unlock(&fc->bg_lock);
> +	if (expired)
> +		goto abort_conn;
> +
> +	spin_lock(&fc->lock);
> +	if (!fc->connected) {
> +		spin_unlock(&fc->lock);
> +		return;
> +	}
> +	list_for_each_entry(fud, &fc->devices, entry) {
> +		fpq = &fud->pq;
> +		spin_lock(&fpq->lock);
> +		req = list_first_entry_or_null(&fpq->io, struct fuse_req, list);
> +		if (req && request_expired(fc, req))
> +			goto fpq_abort;
> +
> +		for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) {
> +			req = list_first_entry_or_null(&fpq->processing[i], struct fuse_req, list);
> +			if (req && request_expired(fc, req))
> +				goto fpq_abort;
> +		}
> +		spin_unlock(&fpq->lock);
> +	}
> +	spin_unlock(&fc->lock);

I really don't have a strong opinion on that - I wonder if it wouldn't
be better for this part to have an extra timeout list per fud or pq as
previously. That would slightly increase memory usage and overhead per
request as a second list is needed, but would reduce these 1/min CPU
spikes as only one list per fud would need to be checked. But then, it
would be easy to change that later, if timeout checks turn out to be a
problem.


> +
> +	mod_timer(&fc->timeout.timer, jiffies + FUSE_TIMEOUT_TIMER_FREQ);
> +	return;
> +
> +fpq_abort:
> +	spin_unlock(&fpq->lock);
> +	spin_unlock(&fc->lock);
> +abort_conn:
> +	fuse_abort_conn(fc);
> +}
> +
>  static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
>  {
>  	INIT_LIST_HEAD(&req->list);
> @@ -53,6 +129,7 @@ static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
>  	refcount_set(&req->count, 1);
>  	__set_bit(FR_PENDING, &req->flags);
>  	req->fm = fm;
> +	req->create_time = jiffies;
>  }
>  
>  static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
> @@ -2296,6 +2373,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
>  		spin_unlock(&fc->lock);
>  
>  		end_requests(&to_end);
> +
> +		if (fc->timeout.req_timeout)
> +			timer_delete(&fc->timeout.timer);
>  	} else {
>  		spin_unlock(&fc->lock);
>  	}
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 7ff00bae4a84..ef4558c2c44e 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -435,6 +435,9 @@ struct fuse_req {
>  
>  	/** fuse_mount this request belongs to */
>  	struct fuse_mount *fm;
> +
> +	/** When (in jiffies) the request was created */
> +	unsigned long create_time;
>  };
>  
>  struct fuse_iqueue;
> @@ -525,6 +528,16 @@ struct fuse_pqueue {
>  	struct list_head io;
>  };
>  
> +/* Frequency (in seconds) of request timeout checks, if opted into */
> +#define FUSE_TIMEOUT_TIMER_FREQ 60 * HZ
> +
> +struct fuse_timeout {
> +	struct timer_list timer;
> +
> +	/* Request timeout (in jiffies). 0 = no timeout */
> +	unsigned long req_timeout;
> +};
> +
>  /**
>   * Fuse device instance
>   */
> @@ -571,6 +584,8 @@ struct fuse_fs_context {
>  	enum fuse_dax_mode dax_mode;
>  	unsigned int max_read;
>  	unsigned int blksize;
> +	/*  Request timeout (in minutes). 0 = no timeout (infinite wait) */
> +	unsigned int req_timeout;
>  	const char *subtype;
>  
>  	/* DAX device, may be NULL */
> @@ -914,6 +929,9 @@ struct fuse_conn {
>  	/** IDR for backing files ids */
>  	struct idr backing_files_map;
>  #endif
> +
> +	/** Only used if the connection enforces request timeouts */
> +	struct fuse_timeout timeout;
>  };
>  
>  /*
> @@ -1175,6 +1193,9 @@ void fuse_request_end(struct fuse_req *req);
>  void fuse_abort_conn(struct fuse_conn *fc);
>  void fuse_wait_aborted(struct fuse_conn *fc);
>  
> +/* Check if any requests timed out */
> +void fuse_check_timeout(struct timer_list *timer);
> +
>  /**
>   * Invalidate inode attributes
>   */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index f1779ff3f8d1..a78aac76b942 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -735,6 +735,7 @@ enum {
>  	OPT_ALLOW_OTHER,
>  	OPT_MAX_READ,
>  	OPT_BLKSIZE,
> +	OPT_REQUEST_TIMEOUT,
>  	OPT_ERR
>  };
>  
> @@ -749,6 +750,7 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = {
>  	fsparam_u32	("max_read",		OPT_MAX_READ),
>  	fsparam_u32	("blksize",		OPT_BLKSIZE),
>  	fsparam_string	("subtype",		OPT_SUBTYPE),
> +	fsparam_u16	("request_timeout",	OPT_REQUEST_TIMEOUT),
>  	{}
>  };
>  
> @@ -844,6 +846,10 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
>  		ctx->blksize = result.uint_32;
>  		break;
>  
> +	case OPT_REQUEST_TIMEOUT:
> +		ctx->req_timeout = result.uint_16;
> +		break;
> +
>  	default:
>  		return -EINVAL;
>  	}
> @@ -973,6 +979,8 @@ void fuse_conn_put(struct fuse_conn *fc)
>  
>  		if (IS_ENABLED(CONFIG_FUSE_DAX))
>  			fuse_dax_conn_free(fc);
> +		if (fc->timeout.req_timeout)
> +			timer_shutdown_sync(&fc->timeout.timer);
>  		if (fiq->ops->release)
>  			fiq->ops->release(fiq);
>  		put_pid_ns(fc->pid_ns);
> @@ -1691,6 +1699,18 @@ int fuse_init_fs_context_submount(struct fs_context *fsc)
>  }
>  EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount);
>  
> +static void fuse_init_fc_timeout(struct fuse_conn *fc, struct fuse_fs_context *ctx)
> +{
> +	if (ctx->req_timeout) {
> +		if (check_mul_overflow(ctx->req_timeout * 60, HZ, &fc->timeout.req_timeout))
> +			fc->timeout.req_timeout = U32_MAX;


ULONG_MAX?

> +		timer_setup(&fc->timeout.timer, fuse_check_timeout, 0);
> +		mod_timer(&fc->timeout.timer, jiffies + FUSE_TIMEOUT_TIMER_FREQ);
> +	} else {
> +		fc->timeout.req_timeout = 0;
> +	}
> +}
> +
>  int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
>  {
>  	struct fuse_dev *fud = NULL;
> @@ -1753,6 +1773,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
>  	fc->destroy = ctx->destroy;
>  	fc->no_control = ctx->no_control;
>  	fc->no_force_umount = ctx->no_force_umount;
> +	fuse_init_fc_timeout(fc, ctx);
>  
>  	err = -ENOMEM;
>  	root = fuse_get_root_inode(sb, ctx->rootmode);


Reviewed-by: Bernd Schubert <bschubert@ddn.com>
Joanne Koong Oct. 30, 2024, 5:21 p.m. UTC | #2
On Tue, Oct 29, 2024 at 12:17 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 10/11/24 21:13, Joanne Koong wrote:
> > There are situations where fuse servers can become unresponsive or
> > stuck, for example if the server is deadlocked. Currently, there's no
> > good way to detect if a server is stuck and needs to be killed manually.
> >
> > This commit adds an option for enforcing a timeout (in minutes) for
> > requests where if the timeout elapses without the server responding to
> > the request, the connection will be automatically aborted.
> >
> > Please note that these timeouts are not 100% precise. The request may
> > take an extra FUSE_TIMEOUT_TIMER_FREQ seconds beyond the requested max
> > timeout due to how it's internally implemented.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > ---
> >  fs/fuse/dev.c    | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/fuse/fuse_i.h | 21 +++++++++++++
> >  fs/fuse/inode.c  | 21 +++++++++++++
> >  3 files changed, 122 insertions(+)
> >
> > diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> > index 1f64ae6d7a69..054bfa2a26ed 100644
> > --- a/fs/fuse/dev.c
> > +++ b/fs/fuse/dev.c
> > @@ -45,6 +45,82 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
> >       return READ_ONCE(file->private_data);
> >  }
> >
> > +static bool request_expired(struct fuse_conn *fc, struct fuse_req *req)
> > +{
> > +     return jiffies > req->create_time + fc->timeout.req_timeout;
> > +}
> > +
> > +/*
> > + * Check if any requests aren't being completed by the specified request
> > + * timeout. To do so, we:
> > + * - check the fiq pending list
> > + * - check the bg queue
> > + * - check the fpq io and processing lists
> > + *
> > + * To make this fast, we only check against the head request on each list since
> > + * these are generally queued in order of creation time (eg newer requests get
> > + * queued to the tail). We might miss a few edge cases (eg requests transitioning
> > + * between lists, re-sent requests at the head of the pending list having a
> > + * later creation time than other requests on that list, etc.) but that is fine
> > + * since if the request never gets fulfilled, it will eventually be caught.
> > + */
> > +void fuse_check_timeout(struct timer_list *timer)
> > +{
> > +     struct fuse_conn *fc = container_of(timer, struct fuse_conn, timeout.timer);
> > +     struct fuse_iqueue *fiq = &fc->iq;
> > +     struct fuse_req *req;
> > +     struct fuse_dev *fud;
> > +     struct fuse_pqueue *fpq;
> > +     bool expired = false;
> > +     int i;
> > +
> > +     spin_lock(&fiq->lock);
> > +     req = list_first_entry_or_null(&fiq->pending, struct fuse_req, list);
> > +     if (req)
> > +             expired = request_expired(fc, req);
> > +     spin_unlock(&fiq->lock);
> > +     if (expired)
> > +             goto abort_conn;
> > +
> > +     spin_lock(&fc->bg_lock);
> > +     req = list_first_entry_or_null(&fc->bg_queue, struct fuse_req, list);
> > +     if (req)
> > +             expired = request_expired(fc, req);
> > +     spin_unlock(&fc->bg_lock);
> > +     if (expired)
> > +             goto abort_conn;
> > +
> > +     spin_lock(&fc->lock);
> > +     if (!fc->connected) {
> > +             spin_unlock(&fc->lock);
> > +             return;
> > +     }
> > +     list_for_each_entry(fud, &fc->devices, entry) {
> > +             fpq = &fud->pq;
> > +             spin_lock(&fpq->lock);
> > +             req = list_first_entry_or_null(&fpq->io, struct fuse_req, list);
> > +             if (req && request_expired(fc, req))
> > +                     goto fpq_abort;
> > +
> > +             for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) {
> > +                     req = list_first_entry_or_null(&fpq->processing[i], struct fuse_req, list);
> > +                     if (req && request_expired(fc, req))
> > +                             goto fpq_abort;
> > +             }
> > +             spin_unlock(&fpq->lock);
> > +     }
> > +     spin_unlock(&fc->lock);
>
> I really don't have a strong opinion on that - I wonder if it wouldn't
> be better for this part to have an extra timeout list per fud or pq as
> previously. That would slightly increases memory usage and overhead per
> request as a second list is needed, but would reduce these 1/min cpu
> spikes as only one list per fud would need to be checked. But then, it
> would be easy to change that later, if timeout checks turn out to be a
> problem.
>

Thanks for the review.

On v7 [1] which used an extra timeout list, the feedback was

"One thing I worry about is adding more roadblocks on the way to making
request queuing more scalable.

Currently there's fc->num_waiting that's touched on all requests and
bg_queue/bg_lock that are touched on background requests.  We should
be trying to fix these bottlenecks instead of adding more.

Can't we use the existing lists to scan requests?

It's more complex, obviously, but at least it doesn't introduce yet
another per-fc list to worry about."


[1] https://lore.kernel.org/linux-fsdevel/CAJfpegs9A7iBbZpPMF-WuR48Ho_=z_ZWfjrLQG2ob0k6NbcaUg@mail.gmail.com/

>
> > +
> > +     mod_timer(&fc->timeout.timer, jiffies + FUSE_TIMEOUT_TIMER_FREQ);
> > +     return;
> > +
> > +fpq_abort:
> > +     spin_unlock(&fpq->lock);
> > +     spin_unlock(&fc->lock);
> > +abort_conn:
> > +     fuse_abort_conn(fc);
> > +}
> > +
> >  static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
> >  {
> >       INIT_LIST_HEAD(&req->list);
> > @@ -53,6 +129,7 @@ static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
> >       refcount_set(&req->count, 1);
> >       __set_bit(FR_PENDING, &req->flags);
> >       req->fm = fm;
> > +     req->create_time = jiffies;
> >  }
> >
> >  static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
> > @@ -2296,6 +2373,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
> >               spin_unlock(&fc->lock);
> >
> >               end_requests(&to_end);
> > +
> > +             if (fc->timeout.req_timeout)
> > +                     timer_delete(&fc->timeout.timer);
> >       } else {
> >               spin_unlock(&fc->lock);
> >       }
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 7ff00bae4a84..ef4558c2c44e 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -435,6 +435,9 @@ struct fuse_req {
> >
> >       /** fuse_mount this request belongs to */
> >       struct fuse_mount *fm;
> > +
> > +     /** When (in jiffies) the request was created */
> > +     unsigned long create_time;
> >  };
> >
> >  struct fuse_iqueue;
> > @@ -525,6 +528,16 @@ struct fuse_pqueue {
> >       struct list_head io;
> >  };
> >
> > +/* Frequency (in seconds) of request timeout checks, if opted into */
> > +#define FUSE_TIMEOUT_TIMER_FREQ 60 * HZ
> > +
> > +struct fuse_timeout {
> > +     struct timer_list timer;
> > +
> > +     /* Request timeout (in jiffies). 0 = no timeout */
> > +     unsigned long req_timeout;
> > +};
> > +
> >  /**
> >   * Fuse device instance
> >   */
> > @@ -571,6 +584,8 @@ struct fuse_fs_context {
> >       enum fuse_dax_mode dax_mode;
> >       unsigned int max_read;
> >       unsigned int blksize;
> > +     /*  Request timeout (in minutes). 0 = no timeout (infinite wait) */
> > +     unsigned int req_timeout;
> >       const char *subtype;
> >
> >       /* DAX device, may be NULL */
> > @@ -914,6 +929,9 @@ struct fuse_conn {
> >       /** IDR for backing files ids */
> >       struct idr backing_files_map;
> >  #endif
> > +
> > +     /** Only used if the connection enforces request timeouts */
> > +     struct fuse_timeout timeout;
> >  };
> >
> >  /*
> > @@ -1175,6 +1193,9 @@ void fuse_request_end(struct fuse_req *req);
> >  void fuse_abort_conn(struct fuse_conn *fc);
> >  void fuse_wait_aborted(struct fuse_conn *fc);
> >
> > +/* Check if any requests timed out */
> > +void fuse_check_timeout(struct timer_list *timer);
> > +
> >  /**
> >   * Invalidate inode attributes
> >   */
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index f1779ff3f8d1..a78aac76b942 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -735,6 +735,7 @@ enum {
> >       OPT_ALLOW_OTHER,
> >       OPT_MAX_READ,
> >       OPT_BLKSIZE,
> > +     OPT_REQUEST_TIMEOUT,
> >       OPT_ERR
> >  };
> >
> > @@ -749,6 +750,7 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = {
> >       fsparam_u32     ("max_read",            OPT_MAX_READ),
> >       fsparam_u32     ("blksize",             OPT_BLKSIZE),
> >       fsparam_string  ("subtype",             OPT_SUBTYPE),
> > +     fsparam_u16     ("request_timeout",     OPT_REQUEST_TIMEOUT),
> >       {}
> >  };
> >
> > @@ -844,6 +846,10 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
> >               ctx->blksize = result.uint_32;
> >               break;
> >
> > +     case OPT_REQUEST_TIMEOUT:
> > +             ctx->req_timeout = result.uint_16;
> > +             break;
> > +
> >       default:
> >               return -EINVAL;
> >       }
> > @@ -973,6 +979,8 @@ void fuse_conn_put(struct fuse_conn *fc)
> >
> >               if (IS_ENABLED(CONFIG_FUSE_DAX))
> >                       fuse_dax_conn_free(fc);
> > +             if (fc->timeout.req_timeout)
> > +                     timer_shutdown_sync(&fc->timeout.timer);
> >               if (fiq->ops->release)
> >                       fiq->ops->release(fiq);
> >               put_pid_ns(fc->pid_ns);
> > @@ -1691,6 +1699,18 @@ int fuse_init_fs_context_submount(struct fs_context *fsc)
> >  }
> >  EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount);
> >
> > +static void fuse_init_fc_timeout(struct fuse_conn *fc, struct fuse_fs_context *ctx)
> > +{
> > +     if (ctx->req_timeout) {
> > +             if (check_mul_overflow(ctx->req_timeout * 60, HZ, &fc->timeout.req_timeout))
> > +                     fc->timeout.req_timeout = U32_MAX;
>
>
> ULONG_MAX?

Nice, I'll change this to ULONG_MAX. We only run into this overflow on
32-bit systems (and only if the kernel has configured HZ to be greater
than 1092) so U32_MAX is the same as ULONG_MAX, but ULONG_MAX looks
nicer since "fc->timeout.req_timeout" is an unsigned long.


Thanks,
Joanne

>
> > +             timer_setup(&fc->timeout.timer, fuse_check_timeout, 0);
> > +             mod_timer(&fc->timeout.timer, jiffies + FUSE_TIMEOUT_TIMER_FREQ);
> > +     } else {
> > +             fc->timeout.req_timeout = 0;
> > +     }
> > +}
> > +
> >  int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
> >  {
> >       struct fuse_dev *fud = NULL;
> > @@ -1753,6 +1773,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
> >       fc->destroy = ctx->destroy;
> >       fc->no_control = ctx->no_control;
> >       fc->no_force_umount = ctx->no_force_umount;
> > +     fuse_init_fc_timeout(fc, ctx);
> >
> >       err = -ENOMEM;
> >       root = fuse_get_root_inode(sb, ctx->rootmode);
>
>
> Reviewed-by: Bernd Schubert <bschubert@ddn.com>
Bernd Schubert Oct. 30, 2024, 5:50 p.m. UTC | #3
On 10/30/24 18:21, Joanne Koong wrote:
> On Tue, Oct 29, 2024 at 12:17 PM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 10/11/24 21:13, Joanne Koong wrote:
>>> There are situations where fuse servers can become unresponsive or
>>> stuck, for example if the server is deadlocked. Currently, there's no
>>> good way to detect if a server is stuck and needs to be killed manually.
>>>
>>> This commit adds an option for enforcing a timeout (in minutes) for
>>> requests where if the timeout elapses without the server responding to
>>> the request, the connection will be automatically aborted.
>>>
>>> Please note that these timeouts are not 100% precise. The request may
>>> take an extra FUSE_TIMEOUT_TIMER_FREQ seconds beyond the requested max
>>> timeout due to how it's internally implemented.
>>>
>>> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
>>> ---
>>>  fs/fuse/dev.c    | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>  fs/fuse/fuse_i.h | 21 +++++++++++++
>>>  fs/fuse/inode.c  | 21 +++++++++++++
>>>  3 files changed, 122 insertions(+)
>>>
>>> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
>>> index 1f64ae6d7a69..054bfa2a26ed 100644
>>> --- a/fs/fuse/dev.c
>>> +++ b/fs/fuse/dev.c
>>> @@ -45,6 +45,82 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
>>>       return READ_ONCE(file->private_data);
>>>  }
>>>
>>> +static bool request_expired(struct fuse_conn *fc, struct fuse_req *req)
>>> +{
>>> +     return jiffies > req->create_time + fc->timeout.req_timeout;
>>> +}
>>> +
>>> +/*
>>> + * Check if any requests aren't being completed by the specified request
>>> + * timeout. To do so, we:
>>> + * - check the fiq pending list
>>> + * - check the bg queue
>>> + * - check the fpq io and processing lists
>>> + *
>>> + * To make this fast, we only check against the head request on each list since
>>> + * these are generally queued in order of creation time (eg newer requests get
>>> + * queued to the tail). We might miss a few edge cases (eg requests transitioning
>>> + * between lists, re-sent requests at the head of the pending list having a
>>> + * later creation time than other requests on that list, etc.) but that is fine
>>> + * since if the request never gets fulfilled, it will eventually be caught.
>>> + */
>>> +void fuse_check_timeout(struct timer_list *timer)
>>> +{
>>> +     struct fuse_conn *fc = container_of(timer, struct fuse_conn, timeout.timer);
>>> +     struct fuse_iqueue *fiq = &fc->iq;
>>> +     struct fuse_req *req;
>>> +     struct fuse_dev *fud;
>>> +     struct fuse_pqueue *fpq;
>>> +     bool expired = false;
>>> +     int i;
>>> +
>>> +     spin_lock(&fiq->lock);
>>> +     req = list_first_entry_or_null(&fiq->pending, struct fuse_req, list);
>>> +     if (req)
>>> +             expired = request_expired(fc, req);
>>> +     spin_unlock(&fiq->lock);
>>> +     if (expired)
>>> +             goto abort_conn;
>>> +
>>> +     spin_lock(&fc->bg_lock);
>>> +     req = list_first_entry_or_null(&fc->bg_queue, struct fuse_req, list);
>>> +     if (req)
>>> +             expired = request_expired(fc, req);
>>> +     spin_unlock(&fc->bg_lock);
>>> +     if (expired)
>>> +             goto abort_conn;
>>> +
>>> +     spin_lock(&fc->lock);
>>> +     if (!fc->connected) {
>>> +             spin_unlock(&fc->lock);
>>> +             return;
>>> +     }
>>> +     list_for_each_entry(fud, &fc->devices, entry) {
>>> +             fpq = &fud->pq;
>>> +             spin_lock(&fpq->lock);
>>> +             req = list_first_entry_or_null(&fpq->io, struct fuse_req, list);
>>> +             if (req && request_expired(fc, req))
>>> +                     goto fpq_abort;
>>> +
>>> +             for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) {
>>> +                     req = list_first_entry_or_null(&fpq->processing[i], struct fuse_req, list);
>>> +                     if (req && request_expired(fc, req))
>>> +                             goto fpq_abort;
>>> +             }
>>> +             spin_unlock(&fpq->lock);
>>> +     }
>>> +     spin_unlock(&fc->lock);
>>
>> I really don't have a strong opinion on that - I wonder if it wouldn't
>> be better for this part to have an extra timeout list per fud or pq as
>> previously. That would slightly increases memory usage and overhead per
>> request as a second list is needed, but would reduce these 1/min cpu
>> spikes as only one list per fud would need to be checked. But then, it
>> would be easy to change that later, if timeout checks turn out to be a
>> problem.
>>
> 
> Thanks for the review.
> 
> On v7 [1] which used an extra timeout list, the feedback was
> 
> "One thing I worry about is adding more roadblocks on the way to making
> request queuing more scalable.
> 
> Currently there's fc->num_waiting that's touched on all requests and
> bg_queue/bg_lock that are touched on background requests.  We should
> be trying to fix these bottlenecks instead of adding more.
> 
> Can't we use the existing lists to scan requests?
> 
> It's more complex, obviously, but at least it doesn't introduce yet
> another per-fc list to worry about."
> 
> 
> [1] https://lore.kernel.org/linux-fsdevel/CAJfpegs9A7iBbZpPMF-WuR48Ho_=z_ZWfjrLQG2ob0k6NbcaUg@mail.gmail.com/


Yeah I know, but I'm just a bit scared about the CPU spikes this gives
on recent systems (like the 96-core AMDs we have in the lab) and when
device clone is activated (and Miklos had also asked to use the same
hash lists to find requests with fuse-uring).
The previous discussion was mainly about avoiding a new global lock,
i.e. I wonder if we can't use a mix - avoid the new list and its lock,
but still avoid scanning through [FUSE_PQ_HASH_SIZE] lists per
cloned-device/core.

I'm also not opposed to keeping the current version and optimizing it later.

Thanks,
Bernd
diff mbox series

Patch

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1f64ae6d7a69..054bfa2a26ed 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -45,6 +45,82 @@  static struct fuse_dev *fuse_get_dev(struct file *file)
 	return READ_ONCE(file->private_data);
 }
 
+static bool request_expired(struct fuse_conn *fc, struct fuse_req *req)
+{
+	return jiffies > req->create_time + fc->timeout.req_timeout;
+}
+
+/*
+ * Check if any requests aren't being completed by the specified request
+ * timeout. To do so, we:
+ * - check the fiq pending list
+ * - check the bg queue
+ * - check the fpq io and processing lists
+ *
+ * To make this fast, we only check against the head request on each list since
+ * these are generally queued in order of creation time (eg newer requests get
+ * queued to the tail). We might miss a few edge cases (eg requests transitioning
+ * between lists, re-sent requests at the head of the pending list having a
+ * later creation time than other requests on that list, etc.) but that is fine
+ * since if the request never gets fulfilled, it will eventually be caught.
+ */
+void fuse_check_timeout(struct timer_list *timer)
+{
+	struct fuse_conn *fc = container_of(timer, struct fuse_conn, timeout.timer);
+	struct fuse_iqueue *fiq = &fc->iq;
+	struct fuse_req *req;
+	struct fuse_dev *fud;
+	struct fuse_pqueue *fpq;
+	bool expired = false;
+	int i;
+
+	spin_lock(&fiq->lock);
+	req = list_first_entry_or_null(&fiq->pending, struct fuse_req, list);
+	if (req)
+		expired = request_expired(fc, req);
+	spin_unlock(&fiq->lock);
+	if (expired)
+		goto abort_conn;
+
+	spin_lock(&fc->bg_lock);
+	req = list_first_entry_or_null(&fc->bg_queue, struct fuse_req, list);
+	if (req)
+		expired = request_expired(fc, req);
+	spin_unlock(&fc->bg_lock);
+	if (expired)
+		goto abort_conn;
+
+	spin_lock(&fc->lock);
+	if (!fc->connected) {
+		spin_unlock(&fc->lock);
+		return;
+	}
+	list_for_each_entry(fud, &fc->devices, entry) {
+		fpq = &fud->pq;
+		spin_lock(&fpq->lock);
+		req = list_first_entry_or_null(&fpq->io, struct fuse_req, list);
+		if (req && request_expired(fc, req))
+			goto fpq_abort;
+
+		for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) {
+			req = list_first_entry_or_null(&fpq->processing[i], struct fuse_req, list);
+			if (req && request_expired(fc, req))
+				goto fpq_abort;
+		}
+		spin_unlock(&fpq->lock);
+	}
+	spin_unlock(&fc->lock);
+
+	mod_timer(&fc->timeout.timer, jiffies + FUSE_TIMEOUT_TIMER_FREQ);
+	return;
+
+fpq_abort:
+	spin_unlock(&fpq->lock);
+	spin_unlock(&fc->lock);
+abort_conn:
+	fuse_abort_conn(fc);
+}
+
 static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
 {
 	INIT_LIST_HEAD(&req->list);
@@ -53,6 +129,7 @@  static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
 	refcount_set(&req->count, 1);
 	__set_bit(FR_PENDING, &req->flags);
 	req->fm = fm;
+	req->create_time = jiffies;
 }
 
 static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
@@ -2296,6 +2373,9 @@  void fuse_abort_conn(struct fuse_conn *fc)
 		spin_unlock(&fc->lock);
 
 		end_requests(&to_end);
+
+		if (fc->timeout.req_timeout)
+			timer_delete(&fc->timeout.timer);
 	} else {
 		spin_unlock(&fc->lock);
 	}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7ff00bae4a84..ef4558c2c44e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -435,6 +435,9 @@  struct fuse_req {
 
 	/** fuse_mount this request belongs to */
 	struct fuse_mount *fm;
+
+	/** When (in jiffies) the request was created */
+	unsigned long create_time;
 };
 
 struct fuse_iqueue;
@@ -525,6 +528,16 @@  struct fuse_pqueue {
 	struct list_head io;
 };
 
+/* Frequency (in seconds) of request timeout checks, if opted into */
+#define FUSE_TIMEOUT_TIMER_FREQ 60 * HZ
+
+struct fuse_timeout {
+	struct timer_list timer;
+
+	/* Request timeout (in jiffies). 0 = no timeout */
+	unsigned long req_timeout;
+};
+
 /**
  * Fuse device instance
  */
@@ -571,6 +584,8 @@  struct fuse_fs_context {
 	enum fuse_dax_mode dax_mode;
 	unsigned int max_read;
 	unsigned int blksize;
+	/*  Request timeout (in minutes). 0 = no timeout (infinite wait) */
+	unsigned int req_timeout;
 	const char *subtype;
 
 	/* DAX device, may be NULL */
@@ -914,6 +929,9 @@  struct fuse_conn {
 	/** IDR for backing files ids */
 	struct idr backing_files_map;
 #endif
+
+	/** Only used if the connection enforces request timeouts */
+	struct fuse_timeout timeout;
 };
 
 /*
@@ -1175,6 +1193,9 @@  void fuse_request_end(struct fuse_req *req);
 void fuse_abort_conn(struct fuse_conn *fc);
 void fuse_wait_aborted(struct fuse_conn *fc);
 
+/* Check if any requests timed out */
+void fuse_check_timeout(struct timer_list *timer);
+
 /**
  * Invalidate inode attributes
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f1779ff3f8d1..a78aac76b942 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -735,6 +735,7 @@  enum {
 	OPT_ALLOW_OTHER,
 	OPT_MAX_READ,
 	OPT_BLKSIZE,
+	OPT_REQUEST_TIMEOUT,
 	OPT_ERR
 };
 
@@ -749,6 +750,7 @@  static const struct fs_parameter_spec fuse_fs_parameters[] = {
 	fsparam_u32	("max_read",		OPT_MAX_READ),
 	fsparam_u32	("blksize",		OPT_BLKSIZE),
 	fsparam_string	("subtype",		OPT_SUBTYPE),
+	fsparam_u16	("request_timeout",	OPT_REQUEST_TIMEOUT),
 	{}
 };
 
@@ -844,6 +846,10 @@  static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
 		ctx->blksize = result.uint_32;
 		break;
 
+	case OPT_REQUEST_TIMEOUT:
+		ctx->req_timeout = result.uint_16;
+		break;
+
 	default:
 		return -EINVAL;
 	}
@@ -973,6 +979,8 @@  void fuse_conn_put(struct fuse_conn *fc)
 
 		if (IS_ENABLED(CONFIG_FUSE_DAX))
 			fuse_dax_conn_free(fc);
+		if (fc->timeout.req_timeout)
+			timer_shutdown_sync(&fc->timeout.timer);
 		if (fiq->ops->release)
 			fiq->ops->release(fiq);
 		put_pid_ns(fc->pid_ns);
@@ -1691,6 +1699,18 @@  int fuse_init_fs_context_submount(struct fs_context *fsc)
 }
 EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount);
 
+static void fuse_init_fc_timeout(struct fuse_conn *fc, struct fuse_fs_context *ctx)
+{
+	if (ctx->req_timeout) {
+		if (check_mul_overflow(ctx->req_timeout * 60, HZ, &fc->timeout.req_timeout))
+			fc->timeout.req_timeout = U32_MAX;
+		timer_setup(&fc->timeout.timer, fuse_check_timeout, 0);
+		mod_timer(&fc->timeout.timer, jiffies + FUSE_TIMEOUT_TIMER_FREQ);
+	} else {
+		fc->timeout.req_timeout = 0;
+	}
+}
+
 int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 {
 	struct fuse_dev *fud = NULL;
@@ -1753,6 +1773,7 @@  int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 	fc->destroy = ctx->destroy;
 	fc->no_control = ctx->no_control;
 	fc->no_force_umount = ctx->no_force_umount;
+	fuse_init_fc_timeout(fc, ctx);
 
 	err = -ENOMEM;
 	root = fuse_get_root_inode(sb, ctx->rootmode);