Message ID | 20230917230551.30483-2-trondmy@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/2] NFSv4: Fix a nfs4_state_manager() race | expand |
On Mon, 18 Sep 2023, trondmy@kernel.org wrote: > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it > prevents the setup of new threads to handle reboot recovery, while the > older recovery thread is stuck returning delegations. I hadn't realised that the state manager thread did these two distinct tasks and might need to be doing both at once - requiring two threads. Thanks for highlighting that. It seems to me that even with the new code, we can still get a deadlock when swap is enabled, as we only ever run one thread in that case. Is that correct, or did I miss something? Maybe we need two threads - a state manager and a delegation recall handler. And when swap is enabled, both need to be running permanently ?? Thanks, NeilBrown > > Fixes: 4dc73c679114 ("NFSv4: keep state manager thread active if swap is enabled") > Cc: stable@vger.kernel.org > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > --- > fs/nfs/nfs4proc.c | 4 +++- > fs/nfs/nfs4state.c | 38 ++++++++++++++++++++++++++------------ > 2 files changed, 29 insertions(+), 13 deletions(-) > > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c > index 5deeaea8026e..a19e809cad16 100644 > --- a/fs/nfs/nfs4proc.c > +++ b/fs/nfs/nfs4proc.c > @@ -10652,7 +10652,9 @@ static void nfs4_disable_swap(struct inode *inode) > */ > struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; > > - nfs4_schedule_state_manager(clp); > + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > + wake_up_var(&clp->cl_state); > } > > static const struct inode_operations nfs4_dir_inode_operations = { > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c > index 0bc160fbabec..5751a6886da4 100644 > --- a/fs/nfs/nfs4state.c > +++ b/fs/nfs/nfs4state.c > @@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct 
nfs_client *clp) > { > struct task_struct *task; > char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; > + struct rpc_clnt *clnt = clp->cl_rpcclient; > + bool swapon = false; > > - if (clp->cl_rpcclient->cl_shutdown) > + if (clnt->cl_shutdown) > return; > > set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { > - wake_up_var(&clp->cl_state); > - return; > + > + if (atomic_read(&clnt->cl_swapper)) { > + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, > + &clp->cl_state); > + if (!swapon) { > + wake_up_var(&clp->cl_state); > + return; > + } > } > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > + > + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) > + return; > + > __module_get(THIS_MODULE); > refcount_inc(&clp->cl_count); > > @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) > __func__, PTR_ERR(task)); > if (!nfs_client_init_is_complete(clp)) > nfs_mark_client_ready(clp, PTR_ERR(task)); > + if (swapon) > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > nfs4_clear_state_manager_bit(clp); > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > nfs_put_client(clp); > module_put(THIS_MODULE); > } > @@ -2748,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr) > > allow_signal(SIGKILL); > again: > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > nfs4_state_manager(clp); > - if (atomic_read(&cl->cl_swapper)) { > + > + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { > wait_var_event_interruptible(&clp->cl_state, > test_bit(NFS4CLNT_RUN_MANAGER, > &clp->cl_state)); > - if (atomic_read(&cl->cl_swapper) && > - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) > + if (!atomic_read(&cl->cl_swapper)) > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > + if (refcount_read(&clp->cl_count) > 1 && !signalled()) > goto again; > /* Either 
no longer a swapper, or were signalled */ > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > + nfs4_clear_state_manager_bit(clp); > } > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > > if (refcount_read(&clp->cl_count) > 1 && !signalled() && > test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && > - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) > goto again; > > nfs_put_client(clp); > -- > 2.41.0 > >
On Mon, 2023-09-18 at 11:25 +1000, NeilBrown wrote: > On Mon, 18 Sep 2023, trondmy@kernel.org wrote: > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by > > commit > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because > > it > > prevents the setup of new threads to handle reboot recovery, while > > the > > older recovery thread is stuck returning delegations. > > I hadn't realised that the state manager thread did these two > distinct > tasks and might need to be doing both at once - requiring two > threads. > Thanks for highlighting that. > > It seems to me that even with the new code, we can still get a > deadlock > when swap is enabled, as we only ever run one thread in that case. > Is that correct, or did I miss something? That is correct. I did try to point this out when you were submitting the swap patches, but my understanding was that you were assuming that delegations would not be enabled when swap is enabled. > > Maybe we need two threads - a state manager and a delegation recall > handler. And when swap is enabled, both need to be running > permanently > ?? Possibly. 
> > > > > Fixes: 4dc73c679114 ("NFSv4: keep state manager thread active if > > swap is enabled") > > Cc: stable@vger.kernel.org > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > --- > > fs/nfs/nfs4proc.c | 4 +++- > > fs/nfs/nfs4state.c | 38 ++++++++++++++++++++++++++------------ > > 2 files changed, 29 insertions(+), 13 deletions(-) > > > > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c > > index 5deeaea8026e..a19e809cad16 100644 > > --- a/fs/nfs/nfs4proc.c > > +++ b/fs/nfs/nfs4proc.c > > @@ -10652,7 +10652,9 @@ static void nfs4_disable_swap(struct inode > > *inode) > > */ > > struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; > > > > - nfs4_schedule_state_manager(clp); > > + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > > + wake_up_var(&clp->cl_state); > > } > > > > static const struct inode_operations nfs4_dir_inode_operations = { > > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c > > index 0bc160fbabec..5751a6886da4 100644 > > --- a/fs/nfs/nfs4state.c > > +++ b/fs/nfs/nfs4state.c > > @@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct > > nfs_client *clp) > > { > > struct task_struct *task; > > char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; > > + struct rpc_clnt *clnt = clp->cl_rpcclient; > > + bool swapon = false; > > > > - if (clp->cl_rpcclient->cl_shutdown) > > + if (clnt->cl_shutdown) > > return; > > > > set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > > - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state) != 0) { > > - wake_up_var(&clp->cl_state); > > - return; > > + > > + if (atomic_read(&clnt->cl_swapper)) { > > + swapon = > > !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, > > + &clp->cl_state); > > + if (!swapon) { > > + wake_up_var(&clp->cl_state); > > + return; > > + } > > } > > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > > + > > + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp- > > >cl_state) != 0) > 
> + return; > > + > > __module_get(THIS_MODULE); > > refcount_inc(&clp->cl_count); > > > > @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct > > nfs_client *clp) > > __func__, PTR_ERR(task)); > > if (!nfs_client_init_is_complete(clp)) > > nfs_mark_client_ready(clp, PTR_ERR(task)); > > + if (swapon) > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state); > > nfs4_clear_state_manager_bit(clp); > > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state); > > nfs_put_client(clp); > > module_put(THIS_MODULE); > > } > > @@ -2748,22 +2759,25 @@ static int nfs4_run_state_manager(void > > *ptr) > > > > allow_signal(SIGKILL); > > again: > > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > > nfs4_state_manager(clp); > > - if (atomic_read(&cl->cl_swapper)) { > > + > > + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && > > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp- > > >cl_state)) { > > wait_var_event_interruptible(&clp->cl_state, > > > > test_bit(NFS4CLNT_RUN_MANAGER, > > &clp- > > >cl_state)); > > - if (atomic_read(&cl->cl_swapper) && > > - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) > > + if (!atomic_read(&cl->cl_swapper)) > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state); > > + if (refcount_read(&clp->cl_count) > 1 && > > !signalled()) > > goto again; > > /* Either no longer a swapper, or were signalled */ > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state); > > + nfs4_clear_state_manager_bit(clp); > > } > > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > > > > if (refcount_read(&clp->cl_count) > 1 && !signalled() && > > test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && > > - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state)) > > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp- > > >cl_state)) > > goto again; > > > > nfs_put_client(clp); > > -- > > 2.41.0 > > > > >
Hi Trond, On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote: > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it > prevents the setup of new threads to handle reboot recovery, while the > older recovery thread is stuck returning delegations. I'm seeing a possible deadlock with xfstests generic/472 on NFS v4.x after applying this patch. The test itself checks for various swapfile edge cases, so it seems likely something is going on there. Let me know if you need more info Anna > > Fixes: 4dc73c679114 ("NFSv4: keep state manager thread active if swap is enabled") > Cc: stable@vger.kernel.org > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > --- > fs/nfs/nfs4proc.c | 4 +++- > fs/nfs/nfs4state.c | 38 ++++++++++++++++++++++++++------------ > 2 files changed, 29 insertions(+), 13 deletions(-) > > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c > index 5deeaea8026e..a19e809cad16 100644 > --- a/fs/nfs/nfs4proc.c > +++ b/fs/nfs/nfs4proc.c > @@ -10652,7 +10652,9 @@ static void nfs4_disable_swap(struct inode *inode) > */ > struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; > > - nfs4_schedule_state_manager(clp); > + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > + wake_up_var(&clp->cl_state); > } > > static const struct inode_operations nfs4_dir_inode_operations = { > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c > index 0bc160fbabec..5751a6886da4 100644 > --- a/fs/nfs/nfs4state.c > +++ b/fs/nfs/nfs4state.c > @@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) > { > struct task_struct *task; > char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; > + struct rpc_clnt *clnt = clp->cl_rpcclient; > + bool swapon = false; > > - if (clp->cl_rpcclient->cl_shutdown) > + if (clnt->cl_shutdown) > return; 
> > set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { > - wake_up_var(&clp->cl_state); > - return; > + > + if (atomic_read(&clnt->cl_swapper)) { > + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, > + &clp->cl_state); > + if (!swapon) { > + wake_up_var(&clp->cl_state); > + return; > + } > } > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > + > + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) > + return; > + > __module_get(THIS_MODULE); > refcount_inc(&clp->cl_count); > > @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) > __func__, PTR_ERR(task)); > if (!nfs_client_init_is_complete(clp)) > nfs_mark_client_ready(clp, PTR_ERR(task)); > + if (swapon) > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > nfs4_clear_state_manager_bit(clp); > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > nfs_put_client(clp); > module_put(THIS_MODULE); > } > @@ -2748,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr) > > allow_signal(SIGKILL); > again: > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > nfs4_state_manager(clp); > - if (atomic_read(&cl->cl_swapper)) { > + > + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { > wait_var_event_interruptible(&clp->cl_state, > test_bit(NFS4CLNT_RUN_MANAGER, > &clp->cl_state)); > - if (atomic_read(&cl->cl_swapper) && > - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) > + if (!atomic_read(&cl->cl_swapper)) > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > + if (refcount_read(&clp->cl_count) > 1 && !signalled()) > goto again; > /* Either no longer a swapper, or were signalled */ > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > + nfs4_clear_state_manager_bit(clp); > } > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > > if (refcount_read(&clp->cl_count) > 1 && 
!signalled() && > test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && > - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) > goto again; > > nfs_put_client(clp); > -- > 2.41.0 >
On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote: > Hi Trond, > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote: > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by > > commit > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because > > it > > prevents the setup of new threads to handle reboot recovery, while > > the > > older recovery thread is stuck returning delegations. > > I'm seeing a possible deadlock with xfstests generic/472 on NFS v4.x > after applying this patch. The test itself checks for various > swapfile > edge cases, so it seems likely something is going on there. > > Let me know if you need more info > Anna > Did you turn off delegations on your server? If you don't, then swap will deadlock itself under various scenarios. > > > > Fixes: 4dc73c679114 ("NFSv4: keep state manager thread active if > > swap is enabled") > > Cc: stable@vger.kernel.org > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > --- > > fs/nfs/nfs4proc.c | 4 +++- > > fs/nfs/nfs4state.c | 38 ++++++++++++++++++++++++++------------ > > 2 files changed, 29 insertions(+), 13 deletions(-) > > > > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c > > index 5deeaea8026e..a19e809cad16 100644 > > --- a/fs/nfs/nfs4proc.c > > +++ b/fs/nfs/nfs4proc.c > > @@ -10652,7 +10652,9 @@ static void nfs4_disable_swap(struct inode > > *inode) > > */ > > struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; > > > > - nfs4_schedule_state_manager(clp); > > + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > > + wake_up_var(&clp->cl_state); > > } > > > > static const struct inode_operations nfs4_dir_inode_operations = { > > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c > > index 0bc160fbabec..5751a6886da4 100644 > > --- a/fs/nfs/nfs4state.c > > +++ b/fs/nfs/nfs4state.c > > @@ -1209,16 +1209,26 @@ void 
nfs4_schedule_state_manager(struct > > nfs_client *clp) > > { > > struct task_struct *task; > > char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; > > + struct rpc_clnt *clnt = clp->cl_rpcclient; > > + bool swapon = false; > > > > - if (clp->cl_rpcclient->cl_shutdown) > > + if (clnt->cl_shutdown) > > return; > > > > set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > > - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state) != 0) { > > - wake_up_var(&clp->cl_state); > > - return; > > + > > + if (atomic_read(&clnt->cl_swapper)) { > > + swapon = > > !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, > > + &clp->cl_state); > > + if (!swapon) { > > + wake_up_var(&clp->cl_state); > > + return; > > + } > > } > > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > > + > > + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp- > > >cl_state) != 0) > > + return; > > + > > __module_get(THIS_MODULE); > > refcount_inc(&clp->cl_count); > > > > @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct > > nfs_client *clp) > > __func__, PTR_ERR(task)); > > if (!nfs_client_init_is_complete(clp)) > > nfs_mark_client_ready(clp, PTR_ERR(task)); > > + if (swapon) > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state); > > nfs4_clear_state_manager_bit(clp); > > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state); > > nfs_put_client(clp); > > module_put(THIS_MODULE); > > } > > @@ -2748,22 +2759,25 @@ static int nfs4_run_state_manager(void > > *ptr) > > > > allow_signal(SIGKILL); > > again: > > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > > nfs4_state_manager(clp); > > - if (atomic_read(&cl->cl_swapper)) { > > + > > + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && > > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp- > > >cl_state)) { > > wait_var_event_interruptible(&clp->cl_state, > > > > test_bit(NFS4CLNT_RUN_MANAGER, > > &clp- > > >cl_state)); > > - if (atomic_read(&cl->cl_swapper) && > > - test_bit(NFS4CLNT_RUN_MANAGER, 
&clp->cl_state)) > > + if (!atomic_read(&cl->cl_swapper)) > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state); > > + if (refcount_read(&clp->cl_count) > 1 && > > !signalled()) > > goto again; > > /* Either no longer a swapper, or were signalled */ > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state); > > + nfs4_clear_state_manager_bit(clp); > > } > > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > > > > if (refcount_read(&clp->cl_count) > 1 && !signalled() && > > test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && > > - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > >cl_state)) > > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp- > > >cl_state)) > > goto again; > > > > nfs_put_client(clp); > > -- > > 2.41.0 > >
On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust <trondmy@hammerspace.com> wrote: > > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote: > > Hi Trond, > > > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote: > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by > > > commit > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because > > > it > > > prevents the setup of new threads to handle reboot recovery, while > > > the > > > older recovery thread is stuck returning delegations. > > > > I'm seeing a possible deadlock with xfstests generic/472 on NFS v4.x > > after applying this patch. The test itself checks for various > > swapfile > > edge cases, so it seems likely something is going on there. > > > > Let me know if you need more info > > Anna > > > > Did you turn off delegations on your server? If you don't, then swap > will deadlock itself under various scenarios. Is there documentation somewhere that says that delegations must be turned off on the server if NFS over swap is enabled? If the client can't handle delegations + swap, then shouldn't this be solved by (1) checking if we are in NFS over swap and then proactively setting 'dont want delegation' on open and/or (2) proactively return the delegation if received so that we don't get into the deadlock? I think this is similar to Anna's. With this patch, I'm running into a problem running against an ONTAP server using xfstests (no problems without the patch). 
During the run two stuck threads are: [root@unknown000c291be8aa aglo]# cat /proc/3724/stack [<0>] nfs4_run_state_manager+0x1c0/0x1f8 [nfsv4] [<0>] kthread+0x100/0x110 [<0>] ret_from_fork+0x10/0x20 [root@unknown000c291be8aa aglo]# cat /proc/3725/stack [<0>] nfs_wait_bit_killable+0x1c/0x88 [nfs] [<0>] nfs4_wait_clnt_recover+0xb4/0xf0 [nfsv4] [<0>] nfs4_client_recover_expired_lease+0x34/0x88 [nfsv4] [<0>] _nfs4_do_open.isra.0+0x94/0x408 [nfsv4] [<0>] nfs4_do_open+0x9c/0x238 [nfsv4] [<0>] nfs4_atomic_open+0x100/0x118 [nfsv4] [<0>] nfs4_file_open+0x11c/0x240 [nfsv4] [<0>] do_dentry_open+0x140/0x528 [<0>] vfs_open+0x30/0x38 [<0>] do_open+0x14c/0x360 [<0>] path_openat+0x104/0x250 [<0>] do_filp_open+0x84/0x138 [<0>] file_open_name+0x134/0x190 [<0>] __do_sys_swapoff+0x58/0x6e8 [<0>] __arm64_sys_swapoff+0x18/0x28 [<0>] invoke_syscall.constprop.0+0x7c/0xd0 [<0>] do_el0_svc+0xb4/0xd0 [<0>] el0_svc+0x50/0x228 [<0>] el0t_64_sync_handler+0x134/0x150 [<0>] el0t_64_sync+0x17c/0x180 > > > > > > > Fixes: 4dc73c679114 ("NFSv4: keep state manager thread active if > > > swap is enabled") > > > Cc: stable@vger.kernel.org > > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > > --- > > > fs/nfs/nfs4proc.c | 4 +++- > > > fs/nfs/nfs4state.c | 38 ++++++++++++++++++++++++++------------ > > > 2 files changed, 29 insertions(+), 13 deletions(-) > > > > > > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c > > > index 5deeaea8026e..a19e809cad16 100644 > > > --- a/fs/nfs/nfs4proc.c > > > +++ b/fs/nfs/nfs4proc.c > > > @@ -10652,7 +10652,9 @@ static void nfs4_disable_swap(struct inode > > > *inode) > > > */ > > > struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; > > > > > > - nfs4_schedule_state_manager(clp); > > > + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > > > + wake_up_var(&clp->cl_state); > > > } > > > > > > static const struct inode_operations nfs4_dir_inode_operations = { > > > diff --git 
a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c > > > index 0bc160fbabec..5751a6886da4 100644 > > > --- a/fs/nfs/nfs4state.c > > > +++ b/fs/nfs/nfs4state.c > > > @@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct > > > nfs_client *clp) > > > { > > > struct task_struct *task; > > > char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; > > > + struct rpc_clnt *clnt = clp->cl_rpcclient; > > > + bool swapon = false; > > > > > > - if (clp->cl_rpcclient->cl_shutdown) > > > + if (clnt->cl_shutdown) > > > return; > > > > > > set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); > > > - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > > >cl_state) != 0) { > > > - wake_up_var(&clp->cl_state); > > > - return; > > > + > > > + if (atomic_read(&clnt->cl_swapper)) { > > > + swapon = > > > !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, > > > + &clp->cl_state); > > > + if (!swapon) { > > > + wake_up_var(&clp->cl_state); > > > + return; > > > + } > > > } > > > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > > > + > > > + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp- > > > >cl_state) != 0) > > > + return; > > > + > > > __module_get(THIS_MODULE); > > > refcount_inc(&clp->cl_count); > > > > > > @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct > > > nfs_client *clp) > > > __func__, PTR_ERR(task)); > > > if (!nfs_client_init_is_complete(clp)) > > > nfs_mark_client_ready(clp, PTR_ERR(task)); > > > + if (swapon) > > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > > >cl_state); > > > nfs4_clear_state_manager_bit(clp); > > > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > > >cl_state); > > > nfs_put_client(clp); > > > module_put(THIS_MODULE); > > > } > > > @@ -2748,22 +2759,25 @@ static int nfs4_run_state_manager(void > > > *ptr) > > > > > > allow_signal(SIGKILL); > > > again: > > > - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); > > > nfs4_state_manager(clp); > > > - if (atomic_read(&cl->cl_swapper)) { > > > + > > > + if 
(test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && > > > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp- > > > >cl_state)) { > > > wait_var_event_interruptible(&clp->cl_state, > > > > > > test_bit(NFS4CLNT_RUN_MANAGER, > > > &clp- > > > >cl_state)); > > > - if (atomic_read(&cl->cl_swapper) && > > > - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) > > > + if (!atomic_read(&cl->cl_swapper)) > > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > > >cl_state); > > > + if (refcount_read(&clp->cl_count) > 1 && > > > !signalled()) > > > goto again; > > > /* Either no longer a swapper, or were signalled */ > > > + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > > >cl_state); > > > + nfs4_clear_state_manager_bit(clp); > > > } > > > - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); > > > > > > if (refcount_read(&clp->cl_count) > 1 && !signalled() && > > > test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && > > > - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp- > > > >cl_state)) > > > + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp- > > > >cl_state)) > > > goto again; > > > > > > nfs_put_client(clp); > > > -- > > > 2.41.0 > > > > > -- > Trond Myklebust > Linux NFS client maintainer, Hammerspace > trond.myklebust@hammerspace.com > >
On Fri, 2023-09-22 at 13:22 -0400, Olga Kornievskaia wrote: > On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust > <trondmy@hammerspace.com> wrote: > > > > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote: > > > Hi Trond, > > > > > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote: > > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by > > > > commit > > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") > > > > because > > > > it > > > > prevents the setup of new threads to handle reboot recovery, > > > > while > > > > the > > > > older recovery thread is stuck returning delegations. > > > > > > I'm seeing a possible deadlock with xfstests generic/472 on NFS > > > v4.x > > > after applying this patch. The test itself checks for various > > > swapfile > > > edge cases, so it seems likely something is going on there. > > > > > > Let me know if you need more info > > > Anna > > > > > > > Did you turn off delegations on your server? If you don't, then > > swap > > will deadlock itself under various scenarios. > > Is there documentation somewhere that says that delegations must be > turned off on the server if NFS over swap is enabled? I think the question is more generally "Is there documentation for NFS swap?" > > If the client can't handle delegations + swap, then shouldn't this be > solved by (1) checking if we are in NFS over swap and then > proactively > setting 'dont want delegation' on open and/or (2) proactively return > the delegation if received so that we don't get into the deadlock? We could do that for NFSv4.1 and NFSv4.2, but the protocol will not allow us to do that for NFSv4.0. > > I think this is similar to Anna's. With this patch, I'm running into > a > problem running against an ONTAP server using xfstests (no problems > without the patch). 
During the run two stuck threads are: > [root@unknown000c291be8aa aglo]# cat /proc/3724/stack > [<0>] nfs4_run_state_manager+0x1c0/0x1f8 [nfsv4] > [<0>] kthread+0x100/0x110 > [<0>] ret_from_fork+0x10/0x20 > [root@unknown000c291be8aa aglo]# cat /proc/3725/stack > [<0>] nfs_wait_bit_killable+0x1c/0x88 [nfs] > [<0>] nfs4_wait_clnt_recover+0xb4/0xf0 [nfsv4] > [<0>] nfs4_client_recover_expired_lease+0x34/0x88 [nfsv4] > [<0>] _nfs4_do_open.isra.0+0x94/0x408 [nfsv4] > [<0>] nfs4_do_open+0x9c/0x238 [nfsv4] > [<0>] nfs4_atomic_open+0x100/0x118 [nfsv4] > [<0>] nfs4_file_open+0x11c/0x240 [nfsv4] > [<0>] do_dentry_open+0x140/0x528 > [<0>] vfs_open+0x30/0x38 > [<0>] do_open+0x14c/0x360 > [<0>] path_openat+0x104/0x250 > [<0>] do_filp_open+0x84/0x138 > [<0>] file_open_name+0x134/0x190 > [<0>] __do_sys_swapoff+0x58/0x6e8 > [<0>] __arm64_sys_swapoff+0x18/0x28 > [<0>] invoke_syscall.constprop.0+0x7c/0xd0 > [<0>] do_el0_svc+0xb4/0xd0 > [<0>] el0_svc+0x50/0x228 > [<0>] el0t_64_sync_handler+0x134/0x150 > [<0>] el0t_64_sync+0x17c/0x180 Oh crap... Yes, that is a bug. Can you please apply the attached patch on top of the original, and see if that fixes the problem?
On Fri, Sep 22, 2023 at 3:05 PM Trond Myklebust <trondmy@hammerspace.com> wrote: > > On Fri, 2023-09-22 at 13:22 -0400, Olga Kornievskaia wrote: > > On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust > > <trondmy@hammerspace.com> wrote: > > > > > > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote: > > > > Hi Trond, > > > > > > > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote: > > > > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > > > > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by > > > > > commit > > > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") > > > > > because > > > > > it > > > > > prevents the setup of new threads to handle reboot recovery, > > > > > while > > > > > the > > > > > older recovery thread is stuck returning delegations. > > > > > > > > I'm seeing a possible deadlock with xfstests generic/472 on NFS > > > > v4.x > > > > after applying this patch. The test itself checks for various > > > > swapfile > > > > edge cases, so it seems likely something is going on there. > > > > > > > > Let me know if you need more info > > > > Anna > > > > > > > > > > Did you turn off delegations on your server? If you don't, then > > > swap > > > will deadlock itself under various scenarios. > > > > Is there documentation somewhere that says that delegations must be > > turned off on the server if NFS over swap is enabled? > > I think the question is more generally "Is there documentation for NFS > swap?" > > > > > If the client can't handle delegations + swap, then shouldn't this be > > solved by (1) checking if we are in NFS over swap and then > > proactively > > setting 'dont want delegation' on open and/or (2) proactively return > > the delegation if received so that we don't get into the deadlock? > > We could do that for NFSv4.1 and NFSv4.2, but the protocol will not > allow us to do that for NFSv4.0. > > > > > I think this is similar to Anna's. 
With this patch, I'm running into > > a > > problem running against an ONTAP server using xfstests (no problems > > without the patch). During the run two stuck threads are: > > [root@unknown000c291be8aa aglo]# cat /proc/3724/stack > > [<0>] nfs4_run_state_manager+0x1c0/0x1f8 [nfsv4] > > [<0>] kthread+0x100/0x110 > > [<0>] ret_from_fork+0x10/0x20 > > [root@unknown000c291be8aa aglo]# cat /proc/3725/stack > > [<0>] nfs_wait_bit_killable+0x1c/0x88 [nfs] > > [<0>] nfs4_wait_clnt_recover+0xb4/0xf0 [nfsv4] > > [<0>] nfs4_client_recover_expired_lease+0x34/0x88 [nfsv4] > > [<0>] _nfs4_do_open.isra.0+0x94/0x408 [nfsv4] > > [<0>] nfs4_do_open+0x9c/0x238 [nfsv4] > > [<0>] nfs4_atomic_open+0x100/0x118 [nfsv4] > > [<0>] nfs4_file_open+0x11c/0x240 [nfsv4] > > [<0>] do_dentry_open+0x140/0x528 > > [<0>] vfs_open+0x30/0x38 > > [<0>] do_open+0x14c/0x360 > > [<0>] path_openat+0x104/0x250 > > [<0>] do_filp_open+0x84/0x138 > > [<0>] file_open_name+0x134/0x190 > > [<0>] __do_sys_swapoff+0x58/0x6e8 > > [<0>] __arm64_sys_swapoff+0x18/0x28 > > [<0>] invoke_syscall.constprop.0+0x7c/0xd0 > > [<0>] do_el0_svc+0xb4/0xd0 > > [<0>] el0_svc+0x50/0x228 > > [<0>] el0t_64_sync_handler+0x134/0x150 > > [<0>] el0t_64_sync+0x17c/0x180 > > Oh crap... Yes, that is a bug. Can you please apply the attached patch > on top of the original, and see if that fixes the problem? I can't consistently reproduce the problem. Out of several xfstests runs a couple got stuck in that state. So when I apply that patch and run, I can't tell if i'm no longer hitting or if I'm just not hitting the right condition. Given I don't exactly know what's caused it I'm trying to find something I can hit consistently. Any ideas? I mean this stack trace seems to imply a recovery open but I'm not doing any server reboots or connection drops. > > -- > Trond Myklebust > Linux NFS client maintainer, Hammerspace > trond.myklebust@hammerspace.com > >
On Fri, 2023-09-22 at 17:00 -0400, Olga Kornievskaia wrote: > On Fri, Sep 22, 2023 at 3:05 PM Trond Myklebust > > > > Oh crap... Yes, that is a bug. Can you please apply the attached > > patch > > on top of the original, and see if that fixes the problem? > > I can't consistently reproduce the problem. Out of several xfstests > runs a couple got stuck in that state. So when I apply that patch and > run, I can't tell if i'm no longer hitting or if I'm just not hitting > the right condition. > > Given I don't exactly know what's caused it I'm trying to find > something I can hit consistently. Any ideas? I mean this stack trace > seems to imply a recovery open but I'm not doing any server reboots > or > connection drops. > > > If I'm right about the root cause, then just turning off delegations on the server, setting up a NFS swap partition and then running some ordinary file open/close workload against the exact same server would probably suffice to trigger your stack trace 100% reliably. I'll see if I can find time to test it over the weekend.
On Fri, 2023-09-22 at 17:06 -0400, Trond Myklebust wrote: > On Fri, 2023-09-22 at 17:00 -0400, Olga Kornievskaia wrote: > > On Fri, Sep 22, 2023 at 3:05 PM Trond Myklebust > > > > > > Oh crap... Yes, that is a bug. Can you please apply the attached > > > patch > > > on top of the original, and see if that fixes the problem? > > > > I can't consistently reproduce the problem. Out of several xfstests > > runs a couple got stuck in that state. So when I apply that patch > > and > > run, I can't tell if i'm no longer hitting or if I'm just not > > hitting > > the right condition. > > > > Given I don't exactly know what's caused it I'm trying to find > > something I can hit consistently. Any ideas? I mean this stack > > trace > > seems to imply a recovery open but I'm not doing any server reboots > > or > > connection drops. > > > > > > > If I'm right about the root cause, then just turning off delegations > on > the server, setting up a NFS swap partition and then running some > ordinary file open/close workload against the exact same server would > probably suffice to trigger your stack trace 100% reliably. > > I'll see if I can find time to test it over the weekend. > Yep... Creating a 4G empty file on /mnt/nfs/swap/swapfile, running mkswap and then swapon followed by a simple bash line of echo "foo" >/mnt/nfs/foobar will immediately lead to a hang. 
When I look at the stack for the bash process, I see the following dump, which matches yours: [root@vmw-test-1 ~]# cat /proc/1120/stack [<0>] nfs_wait_bit_killable+0x11/0x60 [nfs] [<0>] nfs4_wait_clnt_recover+0x54/0x90 [nfsv4] [<0>] nfs4_client_recover_expired_lease+0x29/0x60 [nfsv4] [<0>] nfs4_do_open+0x170/0xa90 [nfsv4] [<0>] nfs4_atomic_open+0x94/0x100 [nfsv4] [<0>] nfs_atomic_open+0x2d9/0x670 [nfs] [<0>] path_openat+0x3c3/0xd40 [<0>] do_filp_open+0xb4/0x160 [<0>] do_sys_openat2+0x81/0xe0 [<0>] __x64_sys_openat+0x81/0xa0 [<0>] do_syscall_64+0x68/0xa0 [<0>] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 With the fix I sent you: [root@vmw-test-1 ~]# mount -t nfs -overs=4.2 vmw-test-2:/export /mnt/nfs [root@vmw-test-1 ~]# mkswap /mnt/nfs/swap/swapfile mkswap: /mnt/nfs/swap/swapfile: warning: wiping old swap signature. Setting up swapspace version 1, size = 4 GiB (4294963200 bytes) no label, UUID=1360b0a3-833a-4ba7-b467-8a59d3723013 [root@vmw-test-1 ~]# swapon /mnt/nfs/swap/swapfile [root@vmw-test-1 ~]# ps -efww | grep manage root 1214 2 0 13:04 ? 00:00:00 [192.168.76.251-manager] root 1216 1147 0 13:04 pts/0 00:00:00 grep --color=auto manage [root@vmw-test-1 ~]# echo "foo" >/mnt/nfs/foobar [root@vmw-test-1 ~]# cat /mnt/nfs/foobar foo So that returns behaviour to normal in my testing, and I no longer see the hangs. Let me send out a PATCHv2...
On Sat, 23 Sep 2023, Trond Myklebust wrote: > On Fri, 2023-09-22 at 13:22 -0400, Olga Kornievskaia wrote: > > On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust > > <trondmy@hammerspace.com> wrote: > > > > > > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote: > > > > Hi Trond, > > > > > > > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote: > > > > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > > > > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by > > > > > commit > > > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") > > > > > because > > > > > it > > > > > prevents the setup of new threads to handle reboot recovery, > > > > > while > > > > > the > > > > > older recovery thread is stuck returning delegations. > > > > > > > > I'm seeing a possible deadlock with xfstests generic/472 on NFS > > > > v4.x > > > > after applying this patch. The test itself checks for various > > > > swapfile > > > > edge cases, so it seems likely something is going on there. > > > > > > > > Let me know if you need more info > > > > Anna > > > > > > > > > > Did you turn off delegations on your server? If you don't, then > > > swap > > > will deadlock itself under various scenarios. > > > > Is there documentation somewhere that says that delegations must be > > turned off on the server if NFS over swap is enabled? > > I think the question is more generally "Is there documentation for NFS > swap?" The main difference between using NFS for swap and for regular file IO is in the handling of writes, and particularly in the style of memory allocation that is safe while handling a write request (or anything which might block some write request, etc). For buffered IO, memory allocations must be GFP_NOIO or PF_MEMALLOC_NOIO. For swap-out, memory allocations must be GFP_MEMALLOC or PG_MEMALLOC. That is the primary difference - all other differences are minor. 
This difference might justify documentation suggesting that /proc/sys/vm/min_free_kbytes could usefully be increased, but I don't see that more is needed. The NOIO/MEMALLOC distinction is properly plumbed through nfs, sunrpc, and networking and all "just works". The problem area is that kthread_create() doesn't take a gfpflags_t argument, so it uses GFP_KERNEL allocations to create the new thread. This means that when a write cannot proceed without state management, and state management requests that a thread be started, there is a risk of memory allocation deadlock. I believe the risk is there even for buffered IO, but I'm not 100% certain and in practice I don't think a deadlock has ever been reported. With swap-out it is fairly easy to trigger a deadlock if there is heavy swap-out traffic when state management is needed. The common pattern in the kernel when a thread might be needed to support writeout is to keep the thread running permanently (rather than to add a gfpflags_t to kthread_create), so that is what I added to the nfsv4 state manager. However the state manager thread has a second use - returning delegations. This sometimes needs to run concurrently with state management, so one thread is not enough. What is the context for delegation return? Does it ever block writes? If it doesn't, would it make sense to use a work queue for returning delegations - maybe system_wq? I think it might be best to have the nfsv4 state manager thread always be running whether swap is enabled or not. As I say I think there is at least a theoretical risk of a deadlock even without swap, and having a small test matrix is usually a good thing. Thanks, NeilBrown
On Tue, 2023-09-26 at 08:28 +1000, NeilBrown wrote: > On Sat, 23 Sep 2023, Trond Myklebust wrote: > > On Fri, 2023-09-22 at 13:22 -0400, Olga Kornievskaia wrote: > > > On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust > > > <trondmy@hammerspace.com> wrote: > > > > > > > > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote: > > > > > Hi Trond, > > > > > > > > > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote: > > > > > > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > > > > > > > Commit 4dc73c679114 reintroduces the deadlock that was > > > > > > fixed by > > > > > > commit > > > > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") > > > > > > because > > > > > > it > > > > > > prevents the setup of new threads to handle reboot > > > > > > recovery, > > > > > > while > > > > > > the > > > > > > older recovery thread is stuck returning delegations. > > > > > > > > > > I'm seeing a possible deadlock with xfstests generic/472 on > > > > > NFS > > > > > v4.x > > > > > after applying this patch. The test itself checks for various > > > > > swapfile > > > > > edge cases, so it seems likely something is going on there. > > > > > > > > > > Let me know if you need more info > > > > > Anna > > > > > > > > > > > > > Did you turn off delegations on your server? If you don't, then > > > > swap > > > > will deadlock itself under various scenarios. > > > > > > Is there documentation somewhere that says that delegations must > > > be > > > turned off on the server if NFS over swap is enabled? > > > > I think the question is more generally "Is there documentation for > > NFS > > swap?" > > The main difference between using NFS for swap and for regular file > IO > is in the handling of writes, and particularly in the style of memory > allocation that is safe while handling a write request (or anything > which might block some write request, etc). > > For buffered IO, memory allocations must be GFP_NOIO or > PF_MEMALLOC_NOIO. 
> For swap-out, memory allocations must be GFP_MEMALLOC or PG_MEMALLOC. > > That is the primary difference - all other differences are minor. > This > difference might justify documentation suggesting that > /proc/sys/vm/min_free_kbytes could usefully be increased, but I don't > see that more is needed. > > The NOIO/MEMALLOC distinction is properly plumbed through nfs, > sunrpc, > and networking and all "just works". The problem area is that > kthread_create() doesn't take a gfpflags_t argument, so it uses > GFP_KERNEL allocations to create the new thread. > > This means that when a write cannot proceed without state management, > and state management requests that a threads be started, there is a > risk > of memory allocation deadlock. > I believe the risk is there even for buffered IO, but I'm not 100% > certain and in practice I don't think a deadlock has ever been > reported. > With swap-out it is fairly easy to trigger a deadlock if there is > heavy > swap-out traffic when state management is needed. > > The common pattern in the kernel when a thread might be needed to > support writeout is to keep the thread running permanently (rather > than > to add a gfpflags_t to kthread_create), so that is what I added to > the > nfsv4 state manager. > > However the state manager thread has a second use - returning > delegations. This sometimes needs to run concurrently with state > management, so one thread is not enough. > > What is that context for delegation return? Does it ever block > writes? > If it doesn't, would it make sense to use a work queue for returning > delegations - maybe system_wq? These are potentially long-lived processes because there may be lock recovery involved, and because of the conflict with state recovery, so it does not make sense to put them on a workqueue. > > I think it might be best to have the nfsv4 state manager thread > always > be running whether swap is enabled or not. 
As I say I think there is > at > least a theoretical risk of a deadlock even without swap, and having > a > small test matrix is usually a good thing. > That's a "prove me wrong" argument.
On Tue, 26 Sep 2023, Trond Myklebust wrote: > On Tue, 2023-09-26 at 08:28 +1000, NeilBrown wrote: > > On Sat, 23 Sep 2023, Trond Myklebust wrote: > > > On Fri, 2023-09-22 at 13:22 -0400, Olga Kornievskaia wrote: > > > > On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust > > > > <trondmy@hammerspace.com> wrote: > > > > > > > > > > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote: > > > > > > Hi Trond, > > > > > > > > > > > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote: > > > > > > > > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > > > > > > > > > Commit 4dc73c679114 reintroduces the deadlock that was > > > > > > > fixed by > > > > > > > commit > > > > > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") > > > > > > > because > > > > > > > it > > > > > > > prevents the setup of new threads to handle reboot > > > > > > > recovery, > > > > > > > while > > > > > > > the > > > > > > > older recovery thread is stuck returning delegations. > > > > > > > > > > > > I'm seeing a possible deadlock with xfstests generic/472 on > > > > > > NFS > > > > > > v4.x > > > > > > after applying this patch. The test itself checks for various > > > > > > swapfile > > > > > > edge cases, so it seems likely something is going on there. > > > > > > > > > > > > Let me know if you need more info > > > > > > Anna > > > > > > > > > > > > > > > > Did you turn off delegations on your server? If you don't, then > > > > > swap > > > > > will deadlock itself under various scenarios. > > > > > > > > Is there documentation somewhere that says that delegations must > > > > be > > > > turned off on the server if NFS over swap is enabled? > > > > > > I think the question is more generally "Is there documentation for > > > NFS > > > swap?" 
> > > > The main difference between using NFS for swap and for regular file > > IO > > is in the handling of writes, and particularly in the style of memory > > allocation that is safe while handling a write request (or anything > > which might block some write request, etc). > > > > For buffered IO, memory allocations must be GFP_NOIO or > > PF_MEMALLOC_NOIO. > > For swap-out, memory allocations must be GFP_MEMALLOC or PG_MEMALLOC. > > > > That is the primary difference - all other differences are minor. > > This > > difference might justify documentation suggesting that > > /proc/sys/vm/min_free_kbytes could usefully be increased, but I don't > > see that more is needed. > > > > The NOIO/MEMALLOC distinction is properly plumbed through nfs, > > sunrpc, > > and networking and all "just works". The problem area is that > > kthread_create() doesn't take a gfpflags_t argument, so it uses > > GFP_KERNEL allocations to create the new thread. > > > > This means that when a write cannot proceed without state management, > > and state management requests that a threads be started, there is a > > risk > > of memory allocation deadlock. > > I believe the risk is there even for buffered IO, but I'm not 100% > > certain and in practice I don't think a deadlock has ever been > > reported. > > With swap-out it is fairly easy to trigger a deadlock if there is > > heavy > > swap-out traffic when state management is needed. > > > > The common pattern in the kernel when a thread might be needed to > > support writeout is to keep the thread running permanently (rather > > than > > to add a gfpflags_t to kthread_create), so that is what I added to > > the > > nfsv4 state manager. > > > > However the state manager thread has a second use - returning > > delegations. This sometimes needs to run concurrently with state > > management, so one thread is not enough. > > > > What is that context for delegation return? Does it ever block > > writes? 
> > If it doesn't, would it make sense to use a work queue for returning > > delegations - maybe system_wq? > > These are potentially long-lived processes because there may be lock > recovery involved, and because of the conflict with state recovery, so > it does not make sense to put them on a workqueue. > Makes sense - thanks. Are writes blocked while the delegation return proceeds? If not, would it be reasonable to start a separate kthread on-demand when a return is requested? Thanks, NeilBrown
On Tue, 2023-09-26 at 09:04 +1000, NeilBrown wrote: > > Are writes blocked while the delegation returns proceeds? If not, > would > it be reasonable to start a separate kthread on-demand when a return > is > requested? > That's what we've done historically. We initially made it be the same thread as the standard recovery thread because we do want to serialise recovery and delegation return. However the recovery thread has the ability to block all other RPC to the server in question, so that requirement that we serialise does not depend on the two threads being the same. In practice, therefore, we usually ended up with multiple separate threads when reboot recovery was required during a delegation return, or if a single sweep of the delegations took too long.
On Fri, Sep 22, 2023 at 5:07 PM Trond Myklebust <trondmy@hammerspace.com> wrote: > > On Fri, 2023-09-22 at 17:00 -0400, Olga Kornievskaia wrote: > > On Fri, Sep 22, 2023 at 3:05 PM Trond Myklebust > > > > > > Oh crap... Yes, that is a bug. Can you please apply the attached > > > patch > > > on top of the original, and see if that fixes the problem? > > > > I can't consistently reproduce the problem. Out of several xfstests > > runs a couple got stuck in that state. So when I apply that patch and > > run, I can't tell if i'm no longer hitting or if I'm just not hitting > > the right condition. > > > > Given I don't exactly know what's caused it I'm trying to find > > something I can hit consistently. Any ideas? I mean this stack trace > > seems to imply a recovery open but I'm not doing any server reboots > > or > > connection drops. > > > > > > > If I'm right about the root cause, then just turning off delegations on > the server, setting up a NFS swap partition and then running some > ordinary file open/close workload against the exact same server would > probably suffice to trigger your stack trace 100% reliably. Getting back to this now. Thanks for v2 and I'll test. But I'm still unclear, is there a requirement that delegations have to be off for when the client has NFS over swap enabled? I always run with delegations on in ONTAP and run xfstests. I don't usually run with NFS over swap enabled but sometimes I do. So what should be the expectations? If I choose this kernel configuration, then deadlock is possible? > > I'll see if I can find time to test it over the weekend. > > -- > Trond Myklebust > Linux NFS client maintainer, Hammerspace > trond.myklebust@hammerspace.com > >
On Sun, Sep 24, 2023 at 1:08 PM Trond Myklebust <trondmy@hammerspace.com> wrote: > > On Fri, 2023-09-22 at 17:06 -0400, Trond Myklebust wrote: > > On Fri, 2023-09-22 at 17:00 -0400, Olga Kornievskaia wrote: > > > On Fri, Sep 22, 2023 at 3:05 PM Trond Myklebust > > > > > > > > Oh crap... Yes, that is a bug. Can you please apply the attached > > > > patch > > > > on top of the original, and see if that fixes the problem? > > > > > > I can't consistently reproduce the problem. Out of several xfstests > > > runs a couple got stuck in that state. So when I apply that patch > > > and > > > run, I can't tell if i'm no longer hitting or if I'm just not > > > hitting > > > the right condition. > > > > > > Given I don't exactly know what's caused it I'm trying to find > > > something I can hit consistently. Any ideas? I mean this stack > > > trace > > > seems to imply a recovery open but I'm not doing any server reboots > > > or > > > connection drops. > > > > > > > > > > > If I'm right about the root cause, then just turning off delegations > > on > > the server, setting up a NFS swap partition and then running some > > ordinary file open/close workload against the exact same server would > > probably suffice to trigger your stack trace 100% reliably. > > > > I'll see if I can find time to test it over the weekend. > > > > > Yep... Creating a 4G empty file on /mnt/nfs/swap/swapfile, running > mkswap and then swapon followed by a simple bash line of > echo "foo" >/mnt/nfs/foobar > > will immediately lead to a hang. 
When I look at the stack for the bash > process, I see the following dump, which matches yours: > > [root@vmw-test-1 ~]# cat /proc/1120/stack > [<0>] nfs_wait_bit_killable+0x11/0x60 [nfs] > [<0>] nfs4_wait_clnt_recover+0x54/0x90 [nfsv4] > [<0>] nfs4_client_recover_expired_lease+0x29/0x60 [nfsv4] > [<0>] nfs4_do_open+0x170/0xa90 [nfsv4] > [<0>] nfs4_atomic_open+0x94/0x100 [nfsv4] > [<0>] nfs_atomic_open+0x2d9/0x670 [nfs] > [<0>] path_openat+0x3c3/0xd40 > [<0>] do_filp_open+0xb4/0x160 > [<0>] do_sys_openat2+0x81/0xe0 > [<0>] __x64_sys_openat+0x81/0xa0 > [<0>] do_syscall_64+0x68/0xa0 > [<0>] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 > > With the fix I sent you: > > [root@vmw-test-1 ~]# mount -t nfs -overs=4.2 vmw-test-2:/export /mnt/nfs > [root@vmw-test-1 ~]# mkswap /mnt/nfs/swap/swapfile > mkswap: /mnt/nfs/swap/swapfile: warning: wiping old swap signature. > Setting up swapspace version 1, size = 4 GiB (4294963200 bytes) > no label, UUID=1360b0a3-833a-4ba7-b467-8a59d3723013 > [root@vmw-test-1 ~]# swapon /mnt/nfs/swap/swapfile > [root@vmw-test-1 ~]# ps -efww | grep manage > root 1214 2 0 13:04 ? 00:00:00 [192.168.76.251-manager] > root 1216 1147 0 13:04 pts/0 00:00:00 grep --color=auto manage > [root@vmw-test-1 ~]# echo "foo" >/mnt/nfs/foobar > [root@vmw-test-1 ~]# cat /mnt/nfs/foobar > foo > > So that returns behaviour to normal in my testing, and I no longer see > the hangs. > > Let me send out a PATCHv2... I'm liking patch v2 much better! I was testing with a Linux server with default configuration, and it's nice that the xfstests swapfile tests aren't hanging anymore :) Anna > -- > Trond Myklebust > Linux NFS client maintainer, Hammerspace > trond.myklebust@hammerspace.com > >
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5deeaea8026e..a19e809cad16 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10652,7 +10652,9 @@ static void nfs4_disable_swap(struct inode *inode) */ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + wake_up_var(&clp->cl_state); } static const struct inode_operations nfs4_dir_inode_operations = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0bc160fbabec..5751a6886da4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *clnt = clp->cl_rpcclient; + bool swapon = false; - if (clp->cl_rpcclient->cl_shutdown) + if (clnt->cl_shutdown) return; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { - wake_up_var(&clp->cl_state); - return; + + if (atomic_read(&clnt->cl_swapper)) { + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, + &clp->cl_state); + if (!swapon) { + wake_up_var(&clp->cl_state); + return; + } } - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) __func__, PTR_ERR(task)); if (!nfs_client_init_is_complete(clp)) nfs_mark_client_ready(clp, PTR_ERR(task)); + if (swapon) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs4_clear_state_manager_bit(clp); - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -2748,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); again: - 
set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); - if (atomic_read(&cl->cl_swapper)) { + + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { wait_var_event_interruptible(&clp->cl_state, test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)); - if (atomic_read(&cl->cl_swapper) && - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + if (!atomic_read(&cl->cl_swapper)) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + if (refcount_read(&clp->cl_count) > 1 && !signalled()) goto again; /* Either no longer a swapper, or were signalled */ + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + nfs4_clear_state_manager_bit(clp); } - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); if (refcount_read(&clp->cl_count) > 1 && !signalled() && test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; nfs_put_client(clp);