@@ -113,7 +113,8 @@ struct nfs4_state_owner {
enum {
NFS_OWNER_RECLAIM_REBOOT,
- NFS_OWNER_RECLAIM_NOGRACE
+ NFS_OWNER_RECLAIM_NOGRACE,
+ NFS_OWNER_STALE,
};
#define NFS_LOCK_NEW 0
@@ -400,6 +400,8 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
p = &parent->rb_left;
else if (cred > sp->so_cred)
p = &parent->rb_right;
+ else if (test_bit(NFS_OWNER_STALE, &sp->so_flags))
+ p = &parent->rb_left;
else {
if (!list_empty(&sp->so_lru))
list_del_init(&sp->so_lru);
@@ -427,6 +429,8 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
p = &parent->rb_left;
else if (new->so_cred > sp->so_cred)
p = &parent->rb_right;
+ else if (test_bit(NFS_OWNER_STALE, &sp->so_flags))
+ p = &parent->rb_left;
else {
if (!list_empty(&sp->so_lru))
list_del_init(&sp->so_lru);
@@ -499,19 +503,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
static void
nfs4_drop_state_owner(struct nfs4_state_owner *sp)
{
- struct rb_node *rb_node = &sp->so_server_node;
-
- if (!RB_EMPTY_NODE(rb_node)) {
- struct nfs_server *server = sp->so_server;
- struct nfs_client *clp = server->nfs_client;
-
- spin_lock(&clp->cl_lock);
- if (!RB_EMPTY_NODE(rb_node)) {
- rb_erase(rb_node, &server->state_owners);
- RB_CLEAR_NODE(rb_node);
- }
- spin_unlock(&clp->cl_lock);
- }
+ set_bit(NFS_OWNER_STALE, &sp->so_flags);
}
static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
Hi Trond/Anna, I'd like your thoughts on this patch. I have a customer whose NFS client occasionally gets into a state where it spins indefinitely sending WRITE (or OPEN) and getting an NFS4ERR_BAD_STATEID (or NFS4ERR_BAD_SEQID) error. The state manager doesn't try any recovery as the open owner is no longer in the rbtree - presumably due to an earlier NFS4ERR_BAD_SEQID. I've seen at least one NFS: v4 server XXX returned a bad sequence-id error! in the logs. I don't know if this is a server problem or a client problem - client didn't have Commit: c7848f69ec4a ("NFSv4: OPEN must handle the NFS4ERR_IO return code correctly") which cause cause BAD_SEQID errors. However wherever the bug is we don't want the NFS client to spin like that. This patch removes the spinning when tested against a hacked NFS server which can be convinced to return BAD_SEQID and BAD_STATED for a given open owner. My main concern is: is there some other reason to remove the open owner from the rbtree other than to make sure it isn't used for new opens? Thanks, NeilBrown When an NFS4ERR_BAD_SEQID is received the open-owner is removed from the ->state_owners rbtree so that it will no longer be used. If any stateids attached to this open-owner are still in use, and if a request using one get an NFS4ERR_BAD_STATEID reply, this can for bad. The state is marked as needing recovery and the nfs4_state_manager() is scheduled to clean up. nfs4_state_manager() finds states to be recovered by walking the state_owners rbtree. As the open-owner is not in the rbtree, the bad state is not found so nfs4_state_manager() completes having done nothing. The request is then retried, with a predicatable result (indefinite retries). This patch changes NFS4ERR_BAD_SEQID handling to leave the open-owner in the rbtree but mark it a 'stale'. With this the indefinite retries no longer happen. Errors get to user-space instead if recovery doesn't work. Signed-off-by: NeilBrown <neilb@suse.com> --- fs/nfs/nfs4_fs.h | 3 ++- fs/nfs/nfs4state.c | 18 +++++------------- 2 files changed, 7 insertions(+), 14 deletions(-)