Message ID | 20230814211116.3224759-2-aahringo@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | fs: nfs: async lock request changes | expand |
On Mon, 2023-08-14 at 17:11 -0400, Alexander Aring wrote: > This patch fixes a race in async lock request handling between adding > the relevant struct nlm_block to nlm_blocked list after the request was > sent by vfs_lock_file() and nlmsvc_grant_deferred() does a lookup of the > nlm_block in the nlm_blocked list. It could be that the async request is > completed before the nlm_block was added to the list. This would end > in a -ENOENT and a kernel log message of "lockd: grant for unknown > block". > > To solve this issue we add the nlm_block before the vfs_lock_file() call > to be sure it has been added when a possible nlmsvc_grant_deferred() is > called. If the vfs_lock_file() results in an case when it wouldn't be > added to nlm_blocked list, the nlm_block struct will be removed from > this list again. > > The introducing of the new B_PENDING_CALLBACK nlm_block flag will handle > async lock requests on a pending lock requests as a retry on the caller > level to hit the -EAGAIN case. > > Signed-off-by: Alexander Aring <aahringo@redhat.com> > --- > fs/lockd/svclock.c | 100 ++++++++++++++++++++++++++---------- > include/linux/lockd/lockd.h | 2 + > 2 files changed, 74 insertions(+), 28 deletions(-) > > diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c > index c43ccdf28ed9..7d63524bdb81 100644 > --- a/fs/lockd/svclock.c > +++ b/fs/lockd/svclock.c > @@ -133,6 +133,7 @@ nlmsvc_remove_block(struct nlm_block *block) > { > if (!list_empty(&block->b_list)) { > spin_lock(&nlm_blocked_lock); > + block->b_flags &= ~B_PENDING_CALLBACK; > list_del_init(&block->b_list); > spin_unlock(&nlm_blocked_lock); > nlmsvc_release_block(block); > @@ -232,6 +233,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, > kref_init(&block->b_count); > INIT_LIST_HEAD(&block->b_list); > INIT_LIST_HEAD(&block->b_flist); > + mutex_init(&block->b_cb_mutex); > > if (!nlmsvc_setgrantargs(call, lock)) > goto failed_free; > @@ -289,6 +291,8 @@ static void nlmsvc_free_block(struct kref *kref) > > dprintk("lockd: freeing block %p...\n", block); > > + WARN_ON_ONCE(block->b_flags & B_PENDING_CALLBACK); > + > /* Remove block from file's list of blocks */ > list_del_init(&block->b_flist); > mutex_unlock(&file->f_mutex); > @@ -532,6 +536,13 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > goto out; > } > > + mutex_lock(&block->b_cb_mutex); > + if (block->b_flags & B_PENDING_CALLBACK) > + goto pending_request; > + > + /* Append to list of blocked */ > + nlmsvc_insert_block(block, NLM_NEVER); > + > if (!wait) > lock->fl.fl_flags &= ~FL_SLEEP; > mode = lock_to_openmode(&lock->fl); > @@ -541,9 +552,13 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > dprintk("lockd: vfs_lock_file returned %d\n", error); > switch (error) { > case 0: > + nlmsvc_remove_block(block); > ret = nlm_granted; > - goto out; > + goto out_cb_mutex; > case -EAGAIN: > + if (!wait) > + nlmsvc_remove_block(block); > +pending_request: > /* > * If this is a blocking request for an > * already pending lock request then we need > @@ -552,26 +567,29 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > if (wait) > break; > ret = async_block ? nlm_lck_blocked : nlm_lck_denied; > - goto out; > + goto out_cb_mutex; > case FILE_LOCK_DEFERRED: > + block->b_flags |= B_PENDING_CALLBACK; > + > if (wait) > break; > /* Filesystem lock operation is in progress > Add it to the queue waiting for callback */ > ret = nlmsvc_defer_lock_rqst(rqstp, block); > - goto out; > + goto out_cb_mutex; > case -EDEADLK: > + nlmsvc_remove_block(block); > ret = nlm_deadlock; > - goto out; > + goto out_cb_mutex; > default: /* includes ENOLCK */ > + nlmsvc_remove_block(block); > ret = nlm_lck_denied_nolocks; > - goto out; > + goto out_cb_mutex; > } > > ret = nlm_lck_blocked; > - > - /* Append to list of blocked */ > - nlmsvc_insert_block(block, NLM_NEVER); > +out_cb_mutex: > + mutex_unlock(&block->b_cb_mutex); > out: > mutex_unlock(&file->f_mutex); > nlmsvc_release_block(block); > @@ -728,34 +746,60 @@ nlmsvc_update_deferred_block(struct nlm_block *block, int result) > block->b_flags |= B_TIMED_OUT; > } > > +static int __nlmsvc_grant_deferred(struct nlm_block *block, > + struct file_lock *fl, > + int result) > +{ > + int rc = 0; > + > + dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", > + block, block->b_flags); > + if (block->b_flags & B_QUEUED) { > + if (block->b_flags & B_TIMED_OUT) { > + rc = -ENOLCK; > + goto out; > + } > + nlmsvc_update_deferred_block(block, result); > + } else if (result == 0) > + block->b_granted = 1; > + > + nlmsvc_insert_block_locked(block, 0); > + svc_wake_up(block->b_daemon); > +out: > + return rc; > +} > + > static int nlmsvc_grant_deferred(struct file_lock *fl, int result) > { > - struct nlm_block *block; > - int rc = -ENOENT; > + struct nlm_block *iter, *block = NULL; > + int rc; > > spin_lock(&nlm_blocked_lock); > - list_for_each_entry(block, &nlm_blocked, b_list) { > - if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { > - dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", > - block, block->b_flags); > - if (block->b_flags & B_QUEUED) { > - if (block->b_flags & B_TIMED_OUT) { > - rc = -ENOLCK; > - break; > - } > - nlmsvc_update_deferred_block(block, result); > - } else if (result == 0) > - block->b_granted = 1; > - > - nlmsvc_insert_block_locked(block, 0); > - svc_wake_up(block->b_daemon); > - rc = 0; > + list_for_each_entry(iter, &nlm_blocked, b_list) { > + if (nlm_compare_locks(&iter->b_call->a_args.lock.fl, fl)) { > + kref_get(&iter->b_count); > + block = iter; > break; > } > } > spin_unlock(&nlm_blocked_lock); > - if (rc == -ENOENT) > - printk(KERN_WARNING "lockd: grant for unknown block\n"); > + > + if (!block) { > + pr_warn("lockd: grant for unknown pending block\n"); > + return -ENOENT; > + } > + > + /* don't interfere with nlmsvc_lock() */ > + mutex_lock(&block->b_cb_mutex); > + block->b_flags &= ~B_PENDING_CALLBACK; > + > + spin_lock(&nlm_blocked_lock); > + WARN_ON_ONCE(list_empty(&block->b_list)); > + rc = __nlmsvc_grant_deferred(block, fl, result); > + spin_unlock(&nlm_blocked_lock); > + mutex_unlock(&block->b_cb_mutex); > + > + nlmsvc_release_block(block); > return rc; > } > > diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h > index f42594a9efe0..91f55458f5fc 100644 > --- a/include/linux/lockd/lockd.h > +++ b/include/linux/lockd/lockd.h > @@ -185,10 +185,12 @@ struct nlm_block { > struct nlm_file * b_file; /* file in question */ > struct cache_req * b_cache_req; /* deferred request handling */ > struct cache_deferred_req * b_deferred_req > + struct mutex b_cb_mutex; /* callback mutex */ There is no mention at all of this new mutex in the changelog or comments. It's not at all clear to me what this is intended to protect. In general, with lockd being a single-threaded service, we want to avoid sleeping locks. This will need some clear justification. At a glance, it looks like you're trying to use this to hold B_PENDING_CALLBACK steady while a lock request is being handled. That suggests that you're using this mutex to serialize access to a section of code and not one or more specific data structures. We usually like to avoid that sort of thing, since locks that protect arbitrary sections of code become difficult to work with over time. I'm going to go out on a limb here though and suggest that there is probably a way to solve this problem that doesn't involve adding new locks. > unsigned int b_flags; /* block flags */ > #define B_QUEUED 1 /* lock queued */ > #define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ > #define B_TIMED_OUT 4 /* filesystem too slow to respond */ > +#define B_PENDING_CALLBACK 8 /* pending callback for lock request */ > }; > > /* Do we need this new flag at all? It seems redundant. If we have a block on the list, then it is sort of by definition "pending callback". If it's not on the list anymore, then it's not. No?
On Tue, 2023-08-15 at 13:49 -0400, Jeff Layton wrote: > On Mon, 2023-08-14 at 17:11 -0400, Alexander Aring wrote: > > This patch fixes a race in async lock request handling between adding > > the relevant struct nlm_block to nlm_blocked list after the request was > > sent by vfs_lock_file() and nlmsvc_grant_deferred() does a lookup of the > > nlm_block in the nlm_blocked list. It could be that the async request is > > completed before the nlm_block was added to the list. This would end > > in a -ENOENT and a kernel log message of "lockd: grant for unknown > > block". > > > > To solve this issue we add the nlm_block before the vfs_lock_file() call > > to be sure it has been added when a possible nlmsvc_grant_deferred() is > > called. If the vfs_lock_file() results in an case when it wouldn't be > > added to nlm_blocked list, the nlm_block struct will be removed from > > this list again. > > > > The introducing of the new B_PENDING_CALLBACK nlm_block flag will handle > > async lock requests on a pending lock requests as a retry on the caller > > level to hit the -EAGAIN case. > > > > Signed-off-by: Alexander Aring <aahringo@redhat.com> > > --- > > fs/lockd/svclock.c | 100 ++++++++++++++++++++++++++---------- > > include/linux/lockd/lockd.h | 2 + > > 2 files changed, 74 insertions(+), 28 deletions(-) > > > > [...] > > diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h > > index f42594a9efe0..91f55458f5fc 100644 > > --- a/include/linux/lockd/lockd.h > > +++ b/include/linux/lockd/lockd.h > > @@ -185,10 +185,12 @@ struct nlm_block { > > struct nlm_file * b_file; /* file in question */ > > struct cache_req * b_cache_req; /* deferred request handling */ > > struct cache_deferred_req * b_deferred_req > > + struct mutex b_cb_mutex; /* callback mutex */ > > There is no mention at all of this new mutex in the changelog or > comments. It's not at all clear to me what this is intended to protect. > In general, with lockd being a single-threaded service, we want to avoid > sleeping locks. This will need some clear justification. > > At a glance, it looks like you're trying to use this to hold > B_PENDING_CALLBACK steady while a lock request is being handled. That > suggests that you're using this mutex to serialize access to a section > of code and not one or more specific data structures. We usually like to > avoid that sort of thing, since locks that protect arbitrary sections of > code become difficult to work with over time. > > I'm going to go out on a limb here though and suggest that there is > probably a way to solve this problem that doesn't involve adding new > locks. > > > unsigned int b_flags; /* block flags */ > > #define B_QUEUED 1 /* lock queued */ > > #define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ > > #define B_TIMED_OUT 4 /* filesystem too slow to respond */ > > +#define B_PENDING_CALLBACK 8 /* pending callback for lock request */ > > }; > > > > /* > > Do we need this new flag at all? It seems redundant. If we have a block > on the list, then it is sort of by definition "pending callback". If > it's not on the list anymore, then it's not. No? > Do we need anything more than a patch along these lines? Note that this is untested, so RFC: ---------------------8<----------------------- [RFC PATCH] lockd: alternate fix for race between deferred lock and grant Signed-off-by: Jeff Layton <jlayton@kernel.org> --- fs/lockd/svclock.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c43ccdf28ed9..e9a84363c26e 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -446,6 +446,8 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) block->b_flags |= B_QUEUED; + /* FIXME: remove and reinsert w/o dropping spinlock */ + nlmsvc_remove_block(block); nlmsvc_insert_block(block, NLM_TIMEOUT); block->b_cache_req = &rqstp->rq_chandle; @@ -535,6 +537,9 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; mode = lock_to_openmode(&lock->fl); + + /* Append to list of blocked */ + nlmsvc_insert_block(block, NLM_NEVER); error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; @@ -542,6 +547,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, switch (error) { case 0: ret = nlm_granted; + nlmsvc_remove_block(block); goto out; case -EAGAIN: /* @@ -552,6 +558,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, if (wait) break; ret = async_block ? nlm_lck_blocked : nlm_lck_denied; + nlmsvc_remove_block(block); goto out; case FILE_LOCK_DEFERRED: if (wait) @@ -570,8 +577,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlm_lck_blocked; - /* Append to list of blocked */ - nlmsvc_insert_block(block, NLM_NEVER); out: mutex_unlock(&file->f_mutex); nlmsvc_release_block(block);
Hi, On Tue, Aug 15, 2023 at 1:49 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Mon, 2023-08-14 at 17:11 -0400, Alexander Aring wrote: > > This patch fixes a race in async lock request handling between adding > > the relevant struct nlm_block to nlm_blocked list after the request was > > sent by vfs_lock_file() and nlmsvc_grant_deferred() does a lookup of the > > nlm_block in the nlm_blocked list. It could be that the async request is > > completed before the nlm_block was added to the list. This would end > > in a -ENOENT and a kernel log message of "lockd: grant for unknown > > block". > > > > To solve this issue we add the nlm_block before the vfs_lock_file() call > > to be sure it has been added when a possible nlmsvc_grant_deferred() is > > called. If the vfs_lock_file() results in an case when it wouldn't be > > added to nlm_blocked list, the nlm_block struct will be removed from > > this list again. > > > > The introducing of the new B_PENDING_CALLBACK nlm_block flag will handle > > async lock requests on a pending lock requests as a retry on the caller > > level to hit the -EAGAIN case. > > > > Signed-off-by: Alexander Aring <aahringo@redhat.com> > > --- > > fs/lockd/svclock.c | 100 ++++++++++++++++++++++++++---------- > > include/linux/lockd/lockd.h | 2 + > > 2 files changed, 74 insertions(+), 28 deletions(-) > > > > diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c > > index c43ccdf28ed9..7d63524bdb81 100644 > > --- a/fs/lockd/svclock.c > > +++ b/fs/lockd/svclock.c > > @@ -133,6 +133,7 @@ nlmsvc_remove_block(struct nlm_block *block) > > { > > if (!list_empty(&block->b_list)) { > > spin_lock(&nlm_blocked_lock); > > + block->b_flags &= ~B_PENDING_CALLBACK; > > list_del_init(&block->b_list); > > spin_unlock(&nlm_blocked_lock); > > nlmsvc_release_block(block); > > @@ -232,6 +233,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, > > kref_init(&block->b_count); > > INIT_LIST_HEAD(&block->b_list); > > INIT_LIST_HEAD(&block->b_flist); > > + mutex_init(&block->b_cb_mutex); > > > > if (!nlmsvc_setgrantargs(call, lock)) > > goto failed_free; > > @@ -289,6 +291,8 @@ static void nlmsvc_free_block(struct kref *kref) > > > > dprintk("lockd: freeing block %p...\n", block); > > > > + WARN_ON_ONCE(block->b_flags & B_PENDING_CALLBACK); > > + > > /* Remove block from file's list of blocks */ > > list_del_init(&block->b_flist); > > mutex_unlock(&file->f_mutex); > > @@ -532,6 +536,13 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > > goto out; > > } > > > > + mutex_lock(&block->b_cb_mutex); > > + if (block->b_flags & B_PENDING_CALLBACK) > > + goto pending_request; > > + > > + /* Append to list of blocked */ > > + nlmsvc_insert_block(block, NLM_NEVER); > > + > > if (!wait) > > lock->fl.fl_flags &= ~FL_SLEEP; > > mode = lock_to_openmode(&lock->fl); > > @@ -541,9 +552,13 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > > dprintk("lockd: vfs_lock_file returned %d\n", error); > > switch (error) { > > case 0: > > + nlmsvc_remove_block(block); > > ret = nlm_granted; > > - goto out; > > + goto out_cb_mutex; > > case -EAGAIN: > > + if (!wait) > > + nlmsvc_remove_block(block); > > +pending_request: > > /* > > * If this is a blocking request for an > > * already pending lock request then we need > > @@ -552,26 +567,29 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > > if (wait) > > break; > > ret = async_block ? nlm_lck_blocked : nlm_lck_denied; > > - goto out; > > + goto out_cb_mutex; > > case FILE_LOCK_DEFERRED: > > + block->b_flags |= B_PENDING_CALLBACK; > > + > > if (wait) > > break; > > /* Filesystem lock operation is in progress > > Add it to the queue waiting for callback */ > > ret = nlmsvc_defer_lock_rqst(rqstp, block); > > - goto out; > > + goto out_cb_mutex; > > case -EDEADLK: > > + nlmsvc_remove_block(block); > > ret = nlm_deadlock; > > - goto out; > > + goto out_cb_mutex; > > default: /* includes ENOLCK */ > > + nlmsvc_remove_block(block); > > ret = nlm_lck_denied_nolocks; > > - goto out; > > + goto out_cb_mutex; > > } > > > > ret = nlm_lck_blocked; > > - > > - /* Append to list of blocked */ > > - nlmsvc_insert_block(block, NLM_NEVER); > > +out_cb_mutex: > > + mutex_unlock(&block->b_cb_mutex); > > out: > > mutex_unlock(&file->f_mutex); > > nlmsvc_release_block(block); > > @@ -728,34 +746,60 @@ nlmsvc_update_deferred_block(struct nlm_block *block, int result) > > block->b_flags |= B_TIMED_OUT; > > } > > > > +static int __nlmsvc_grant_deferred(struct nlm_block *block, > > + struct file_lock *fl, > > + int result) > > +{ > > + int rc = 0; > > + > > + dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", > > + block, block->b_flags); > > + if (block->b_flags & B_QUEUED) { > > + if (block->b_flags & B_TIMED_OUT) { > > + rc = -ENOLCK; > > + goto out; > > + } > > + nlmsvc_update_deferred_block(block, result); > > + } else if (result == 0) > > + block->b_granted = 1; > > + > > + nlmsvc_insert_block_locked(block, 0); > > + svc_wake_up(block->b_daemon); > > +out: > > + return rc; > > +} > > + > > static int nlmsvc_grant_deferred(struct file_lock *fl, int result) > > { > > - struct nlm_block *block; > > - int rc = -ENOENT; > > + struct nlm_block *iter, *block = NULL; > > + int rc; > > > > spin_lock(&nlm_blocked_lock); > > - list_for_each_entry(block, &nlm_blocked, b_list) { > > - if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { > > - dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", > > - block, block->b_flags); > > - if (block->b_flags & B_QUEUED) { > > - if (block->b_flags & B_TIMED_OUT) { > > - rc = -ENOLCK; > > - break; > > - } > > - nlmsvc_update_deferred_block(block, result); > > - } else if (result == 0) > > - block->b_granted = 1; > > - > > - nlmsvc_insert_block_locked(block, 0); > > - svc_wake_up(block->b_daemon); > > - rc = 0; > > + list_for_each_entry(iter, &nlm_blocked, b_list) { > > + if (nlm_compare_locks(&iter->b_call->a_args.lock.fl, fl)) { > > + kref_get(&iter->b_count); > > + block = iter; > > break; > > } > > } > > spin_unlock(&nlm_blocked_lock); > > - if (rc == -ENOENT) > > - printk(KERN_WARNING "lockd: grant for unknown block\n"); > > + > > + if (!block) { > > + pr_warn("lockd: grant for unknown pending block\n"); > > + return -ENOENT; > > + } > > + > > + /* don't interfere with nlmsvc_lock() */ > > + mutex_lock(&block->b_cb_mutex); > > + block->b_flags &= ~B_PENDING_CALLBACK; > > + > > + spin_lock(&nlm_blocked_lock); > > + WARN_ON_ONCE(list_empty(&block->b_list)); > > + rc = __nlmsvc_grant_deferred(block, fl, result); > > + spin_unlock(&nlm_blocked_lock); > > + mutex_unlock(&block->b_cb_mutex); > > + > > + nlmsvc_release_block(block); > > return rc; > > } > > > > diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h > > index f42594a9efe0..91f55458f5fc 100644 > > --- a/include/linux/lockd/lockd.h > > +++ b/include/linux/lockd/lockd.h > > @@ -185,10 +185,12 @@ struct nlm_block { > > struct nlm_file * b_file; /* file in question */ > > struct cache_req * b_cache_req; /* deferred request handling */ > > struct cache_deferred_req * b_deferred_req > > + struct mutex b_cb_mutex; /* callback mutex */ > > There is no mention at all of this new mutex in the changelog or > comments. It's not at all clear to me what this is intended to protect. > In general, with lockd being a single-threaded service, we want to avoid > sleeping locks. This will need some clear justification. > The idea was to protect the PENDING flag and to wait until nlmsvc_lock() is "mostly" done and then allowing lm_grant() callback to proceed. Before the f_mutex was doing that, but since I made !wait request (non-blocking requests) being handled synchronously there was a deadlock because the callback can be run in an unknown order. It was fixed by having a new mutex on a per nlm_block basis. However I will remove those changes and try to find another way. > At a glance, it looks like you're trying to use this to hold > B_PENDING_CALLBACK steady while a lock request is being handled. That > suggests that you're using this mutex to serialize access to a section > of code and not one or more specific data structures. We usually like to > avoid that sort of thing, since locks that protect arbitrary sections of > code become difficult to work with over time. > > I'm going to go out on a limb here though and suggest that there is > probably a way to solve this problem that doesn't involve adding new > locks. > ok. > > unsigned int b_flags; /* block flags */ > > #define B_QUEUED 1 /* lock queued */ > > #define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ > > #define B_TIMED_OUT 4 /* filesystem too slow to respond */ > > +#define B_PENDING_CALLBACK 8 /* pending callback for lock request */ > > }; > > > > /* > > Do we need this new flag at all? It seems redundant. If we have a block > on the list, then it is sort of by definition "pending callback". If > it's not on the list anymore, then it's not. No? > I will try to find another way. A pending callback is more a check on if it's a wait request (blocking request) and is it on the list or not. This flag was also there to avoid multiple wait requests while a wait request is pending. I ran into this case and the DLM implementation was never able to handle it. I can return -EAGAIN but DLM lock() needs to keep track of those things, I prefer to handle this situation in the upper layer. I am not sure why lockd do multiple wait requests for the same nlm_block when it's pending and waiting for lm_grant() callback. I will split this handling out of this patch as it is something what doesn't belong to the race fix between request and lm_grant() callback. - Alex
Hi, On Tue, Aug 15, 2023 at 2:21 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Tue, 2023-08-15 at 13:49 -0400, Jeff Layton wrote: > > On Mon, 2023-08-14 at 17:11 -0400, Alexander Aring wrote: > > > This patch fixes a race in async lock request handling between adding > > > the relevant struct nlm_block to nlm_blocked list after the request was > > > sent by vfs_lock_file() and nlmsvc_grant_deferred() does a lookup of the > > > nlm_block in the nlm_blocked list. It could be that the async request is > > > completed before the nlm_block was added to the list. This would end > > > in a -ENOENT and a kernel log message of "lockd: grant for unknown > > > block". > > > > > > To solve this issue we add the nlm_block before the vfs_lock_file() call > > > to be sure it has been added when a possible nlmsvc_grant_deferred() is > > > called. If the vfs_lock_file() results in an case when it wouldn't be > > > added to nlm_blocked list, the nlm_block struct will be removed from > > > this list again. > > > > > > The introducing of the new B_PENDING_CALLBACK nlm_block flag will handle > > > async lock requests on a pending lock requests as a retry on the caller > > > level to hit the -EAGAIN case. > > > > > > Signed-off-by: Alexander Aring <aahringo@redhat.com> > > > --- > > > fs/lockd/svclock.c | 100 ++++++++++++++++++++++++++---------- > > > include/linux/lockd/lockd.h | 2 + > > > 2 files changed, 74 insertions(+), 28 deletions(-) > > > > > > > > [...] > > > > diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h > > > index f42594a9efe0..91f55458f5fc 100644 > > > --- a/include/linux/lockd/lockd.h > > > +++ b/include/linux/lockd/lockd.h > > > @@ -185,10 +185,12 @@ struct nlm_block { > > > struct nlm_file * b_file; /* file in question */ > > > struct cache_req * b_cache_req; /* deferred request handling */ > > > struct cache_deferred_req * b_deferred_req > > > + struct mutex b_cb_mutex; /* callback mutex */ > > > > There is no mention at all of this new mutex in the changelog or > > comments. It's not at all clear to me what this is intended to protect. > > In general, with lockd being a single-threaded service, we want to avoid > > sleeping locks. This will need some clear justification. > > > > At a glance, it looks like you're trying to use this to hold > > B_PENDING_CALLBACK steady while a lock request is being handled. That > > suggests that you're using this mutex to serialize access to a section > > of code and not one or more specific data structures. We usually like to > > avoid that sort of thing, since locks that protect arbitrary sections of > > code become difficult to work with over time. > > > > I'm going to go out on a limb here though and suggest that there is > > probably a way to solve this problem that doesn't involve adding new > > locks. > > > > > unsigned int b_flags; /* block flags */ > > > #define B_QUEUED 1 /* lock queued */ > > > #define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ > > > #define B_TIMED_OUT 4 /* filesystem too slow to respond */ > > > +#define B_PENDING_CALLBACK 8 /* pending callback for lock request */ > > > }; > > > > > > /* > > > > Do we need this new flag at all? It seems redundant. If we have a block > > on the list, then it is sort of by definition "pending callback". If > > it's not on the list anymore, then it's not. No? > > > > Do we need anything more than a patch along these lines? Note that this > is untested, so RFC: > > ---------------------8<----------------------- > > [RFC PATCH] lockd: alternate fix for race between deferred lock and grant > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > --- > fs/lockd/svclock.c | 9 +++++++-- > 1 file changed, 7 insertions(+), 2 deletions(-) > > diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c > index c43ccdf28ed9..e9a84363c26e 100644 > --- a/fs/lockd/svclock.c > +++ b/fs/lockd/svclock.c > @@ -446,6 +446,8 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) > > block->b_flags |= B_QUEUED; > > + /* FIXME: remove and reinsert w/o dropping spinlock */ > + nlmsvc_remove_block(block); > nlmsvc_insert_block(block, NLM_TIMEOUT); > a insert should just be okay, because there is an atomic switch if it's already part of nlm_blocked and it will just update the timeout of nlm_block in the list and it's order (because nlm_blocked is kind of sorted according their timeouts in nlm_blocked). > block->b_cache_req = &rqstp->rq_chandle; > @@ -535,6 +537,9 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > if (!wait) > lock->fl.fl_flags &= ~FL_SLEEP; > mode = lock_to_openmode(&lock->fl); > + > + /* Append to list of blocked */ > + nlmsvc_insert_block(block, NLM_NEVER); > error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); > lock->fl.fl_flags &= ~FL_SLEEP; > > @@ -542,6 +547,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > switch (error) { > case 0: > ret = nlm_granted; > + nlmsvc_remove_block(block); > goto out; > case -EAGAIN: > /* > @@ -552,6 +558,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > if (wait) > break; > ret = async_block ? nlm_lck_blocked : nlm_lck_denied; > + nlmsvc_remove_block(block); > goto out; > case FILE_LOCK_DEFERRED: > if (wait) > @@ -570,8 +577,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > > ret = nlm_lck_blocked; > > - /* Append to list of blocked */ > - nlmsvc_insert_block(block, NLM_NEVER); ok. I will try to start with that. - Alex
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c43ccdf28ed9..7d63524bdb81 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -133,6 +133,7 @@ nlmsvc_remove_block(struct nlm_block *block) { if (!list_empty(&block->b_list)) { spin_lock(&nlm_blocked_lock); + block->b_flags &= ~B_PENDING_CALLBACK; list_del_init(&block->b_list); spin_unlock(&nlm_blocked_lock); nlmsvc_release_block(block); @@ -232,6 +233,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, kref_init(&block->b_count); INIT_LIST_HEAD(&block->b_list); INIT_LIST_HEAD(&block->b_flist); + mutex_init(&block->b_cb_mutex); if (!nlmsvc_setgrantargs(call, lock)) goto failed_free; @@ -289,6 +291,8 @@ static void nlmsvc_free_block(struct kref *kref) dprintk("lockd: freeing block %p...\n", block); + WARN_ON_ONCE(block->b_flags & B_PENDING_CALLBACK); + /* Remove block from file's list of blocks */ list_del_init(&block->b_flist); mutex_unlock(&file->f_mutex); @@ -532,6 +536,13 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + mutex_lock(&block->b_cb_mutex); + if (block->b_flags & B_PENDING_CALLBACK) + goto pending_request; + + /* Append to list of blocked */ + nlmsvc_insert_block(block, NLM_NEVER); + if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; mode = lock_to_openmode(&lock->fl); @@ -541,9 +552,13 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, dprintk("lockd: vfs_lock_file returned %d\n", error); switch (error) { case 0: + nlmsvc_remove_block(block); ret = nlm_granted; - goto out; + goto out_cb_mutex; case -EAGAIN: + if (!wait) + nlmsvc_remove_block(block); +pending_request: /* * If this is a blocking request for an * already pending lock request then we need @@ -552,26 +567,29 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, if (wait) break; ret = async_block ? nlm_lck_blocked : nlm_lck_denied; - goto out; + goto out_cb_mutex; case FILE_LOCK_DEFERRED: + block->b_flags |= B_PENDING_CALLBACK; + if (wait) break; /* Filesystem lock operation is in progress Add it to the queue waiting for callback */ ret = nlmsvc_defer_lock_rqst(rqstp, block); - goto out; + goto out_cb_mutex; case -EDEADLK: + nlmsvc_remove_block(block); ret = nlm_deadlock; - goto out; + goto out_cb_mutex; default: /* includes ENOLCK */ + nlmsvc_remove_block(block); ret = nlm_lck_denied_nolocks; - goto out; + goto out_cb_mutex; } ret = nlm_lck_blocked; - - /* Append to list of blocked */ - nlmsvc_insert_block(block, NLM_NEVER); +out_cb_mutex: + mutex_unlock(&block->b_cb_mutex); out: mutex_unlock(&file->f_mutex); nlmsvc_release_block(block); @@ -728,34 +746,60 @@ nlmsvc_update_deferred_block(struct nlm_block *block, int result) block->b_flags |= B_TIMED_OUT; } +static int __nlmsvc_grant_deferred(struct nlm_block *block, + struct file_lock *fl, + int result) +{ + int rc = 0; + + dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", + block, block->b_flags); + if (block->b_flags & B_QUEUED) { + if (block->b_flags & B_TIMED_OUT) { + rc = -ENOLCK; + goto out; + } + nlmsvc_update_deferred_block(block, result); + } else if (result == 0) + block->b_granted = 1; + + nlmsvc_insert_block_locked(block, 0); + svc_wake_up(block->b_daemon); +out: + return rc; +} + static int nlmsvc_grant_deferred(struct file_lock *fl, int result) { - struct nlm_block *block; - int rc = -ENOENT; + struct nlm_block *iter, *block = NULL; + int rc; spin_lock(&nlm_blocked_lock); - list_for_each_entry(block, &nlm_blocked, b_list) { - if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { - dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", - block, block->b_flags); - if (block->b_flags & B_QUEUED) { - if (block->b_flags & B_TIMED_OUT) { - rc = -ENOLCK; - break; - } - nlmsvc_update_deferred_block(block, result); - } else if (result == 0) - block->b_granted = 1; - - nlmsvc_insert_block_locked(block, 0); - svc_wake_up(block->b_daemon); - rc = 0; + list_for_each_entry(iter, &nlm_blocked, b_list) { + if (nlm_compare_locks(&iter->b_call->a_args.lock.fl, fl)) { + kref_get(&iter->b_count); + block = iter; break; } } spin_unlock(&nlm_blocked_lock); - if (rc == -ENOENT) - printk(KERN_WARNING "lockd: grant for unknown block\n"); + + if (!block) { + pr_warn("lockd: grant for unknown pending block\n"); + return -ENOENT; + } + + /* don't interfere with nlmsvc_lock() */ + mutex_lock(&block->b_cb_mutex); + block->b_flags &= ~B_PENDING_CALLBACK; + + spin_lock(&nlm_blocked_lock); + WARN_ON_ONCE(list_empty(&block->b_list)); + rc = __nlmsvc_grant_deferred(block, fl, result); + spin_unlock(&nlm_blocked_lock); + mutex_unlock(&block->b_cb_mutex); + + nlmsvc_release_block(block); return rc; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index f42594a9efe0..91f55458f5fc 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -185,10 +185,12 @@ struct nlm_block { struct nlm_file * b_file; /* file in question */ struct cache_req * b_cache_req; /* deferred request handling */ struct cache_deferred_req * b_deferred_req; + struct mutex b_cb_mutex; /* callback mutex */ unsigned int b_flags; /* block flags */ #define B_QUEUED 1 /* lock queued */ #define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ #define B_TIMED_OUT 4 /* filesystem too slow to respond */ +#define B_PENDING_CALLBACK 8 /* pending callback for lock request */ }; /*
This patch fixes a race in async lock request handling between adding the relevant struct nlm_block to nlm_blocked list after the request was sent by vfs_lock_file() and nlmsvc_grant_deferred() does a lookup of the nlm_block in the nlm_blocked list. It could be that the async request is completed before the nlm_block was added to the list. This would end in a -ENOENT and a kernel log message of "lockd: grant for unknown block". To solve this issue we add the nlm_block before the vfs_lock_file() call to be sure it has been added when a possible nlmsvc_grant_deferred() is called. If the vfs_lock_file() results in an case when it wouldn't be added to nlm_blocked list, the nlm_block struct will be removed from this list again. The introducing of the new B_PENDING_CALLBACK nlm_block flag will handle async lock requests on a pending lock requests as a retry on the caller level to hit the -EAGAIN case. Signed-off-by: Alexander Aring <aahringo@redhat.com> --- fs/lockd/svclock.c | 100 ++++++++++++++++++++++++++---------- include/linux/lockd/lockd.h | 2 + 2 files changed, 74 insertions(+), 28 deletions(-)