diff mbox series

NFS: add barriers when testing for NFS_FSDATA_BLOCKED

Message ID 171677905033.27191.7405469009187788343@noble.neil.brown.name (mailing list archive)
State New, archived
Headers show
Series NFS: add barriers when testing for NFS_FSDATA_BLOCKED | expand

Commit Message

NeilBrown May 27, 2024, 3:04 a.m. UTC
dentry->d_fsdata is set to NFS_FSDATA_BLOCKED while unlinking or
renaming-over a file to ensure that no open succeeds while the NFS
operation progressed on the server.

Setting dentry->d_fsdata to NFS_FSDATA_BLOCKED is done under ->d_lock
after checking the refcount is not elevated.  Any attempt to open the
file (through that name) will go through lookp_open() which will take
->d_lock while incrementing the refcount, we can be sure that once the
new value is set, __nfs_lookup_revalidate() *will* see the new value and
will block.

We don't have any locking guarantee that when we set ->d_fsdata to NULL,
the wait_var_event() in __nfs_lookup_revalidate() will notice.
wait/wake primitives do NOT provide barriers to guarantee order.  We
must use smp_load_acquire() in wait_var_event() to ensure we look at an
up-to-date value, and must use smp_store_release() before wake_up_var().

This patch adds those barrier functions and factors out
block_revalidate() and unblock_revalidate() far clarity.

There is also a hypothetical bug in that if memory allocation fails
(which never happens in practice) we might leave ->d_fsdata locked.
This patch adds the missing call to unblock_revalidate().

Reported-and-tested-by: Richard Kojedzinszky <richard+debian+bugreport@kojedz.in>
Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1071501
Fixes: 3c59366c207e ("NFS: don't unhash dentry during unlink/rename")
Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/nfs/dir.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

Comments

Trond Myklebust May 27, 2024, 4:14 p.m. UTC | #1
On Mon, 2024-05-27 at 13:04 +1000, NeilBrown wrote:
> 
> dentry->d_fsdata is set to NFS_FSDATA_BLOCKED while unlinking or
> renaming-over a file to ensure that no open succeeds while the NFS
> operation progressed on the server.
> 
> Setting dentry->d_fsdata to NFS_FSDATA_BLOCKED is done under ->d_lock
> after checking the refcount is not elevated.  Any attempt to open the
> file (through that name) will go through lookp_open() which will take
> ->d_lock while incrementing the refcount, we can be sure that once
> the
> new value is set, __nfs_lookup_revalidate() *will* see the new value
> and
> will block.
> 
> We don't have any locking guarantee that when we set ->d_fsdata to
> NULL,
> the wait_var_event() in __nfs_lookup_revalidate() will notice.
> wait/wake primitives do NOT provide barriers to guarantee order.  We
> must use smp_load_acquire() in wait_var_event() to ensure we look at
> an
> up-to-date value, and must use smp_store_release() before
> wake_up_var().
> 
> This patch adds those barrier functions and factors out
> block_revalidate() and unblock_revalidate() far clarity.
> 
> There is also a hypothetical bug in that if memory allocation fails
> (which never happens in practice) we might leave ->d_fsdata locked.
> This patch adds the missing call to unblock_revalidate().
> 
> Reported-and-tested-by: Richard Kojedzinszky
> <richard+debian+bugreport@kojedz.in>
> Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1071501
> Fixes: 3c59366c207e ("NFS: don't unhash dentry during unlink/rename")
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfs/dir.c | 44 +++++++++++++++++++++++++++++---------------
>  1 file changed, 29 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index ac505671efbd..c91dc36d41cc 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -1802,9 +1802,10 @@ __nfs_lookup_revalidate(struct dentry *dentry,
> unsigned int flags,
>  		if (parent != READ_ONCE(dentry->d_parent))
>  			return -ECHILD;
>  	} else {
> -		/* Wait for unlink to complete */
> +		/* Wait for unlink to complete - see
> unblock_revalidate() */
>  		wait_var_event(&dentry->d_fsdata,
> -			       dentry->d_fsdata !=
> NFS_FSDATA_BLOCKED);
> +			       smp_load_acquire(&dentry->d_fsdata)
> +			       != NFS_FSDATA_BLOCKED);

Doesn't this end up being a reversed ACQUIRE+RELEASE as described in
the "LOCK ACQUISITION FUNCTIONS" section of Documentation/memory-
barriers.txt?

IOW: Shouldn't the above rather be using READ_ONCE()?

>  		parent = dget_parent(dentry);
>  		ret = reval(d_inode(parent), dentry, flags);
>  		dput(parent);
> @@ -1817,6 +1818,26 @@ static int nfs_lookup_revalidate(struct dentry
> *dentry, unsigned int flags)
>  	return __nfs_lookup_revalidate(dentry, flags,
> nfs_do_lookup_revalidate);
>  }
>  
> +static void block_revalidate(struct dentry *dentry)
> +{
> +	/* old devname - just in case */
> +	kfree(dentry->d_fsdata);
> +
> +	/* Any new reference that could lead to an open
> +	 * will take ->d_lock in lookup_open() -> d_lookup().
> +	 */
> +	lockdep_assert_held(&dentry->d_lock);
> +
> +	dentry->d_fsdata = NULL;

Why are you doing a barrier free change to dentry->d_fsdata here when
you have the memory barrier protected change in unblock_revalidate()?

> +}
> +
> +static void unblock_revalidate(struct dentry *dentry)
> +{
> +	/* store_release ensures wait_var_event() sees the update */
> +	smp_store_release(&dentry->d_fsdata, NULL);

Shouldn't this be a WRITE_ONCE(), for the same reason as above?

> +	wake_up_var(&dentry->d_fsdata);
> +}
> +
>  /*
>   * A weaker form of d_revalidate for revalidating just the
> d_inode(dentry)
>   * when we don't really care about the dentry name. This is called
> when a
> @@ -2501,15 +2522,12 @@ int nfs_unlink(struct inode *dir, struct
> dentry *dentry)
>  		spin_unlock(&dentry->d_lock);
>  		goto out;
>  	}
> -	/* old devname */
> -	kfree(dentry->d_fsdata);
> -	dentry->d_fsdata = NFS_FSDATA_BLOCKED;
> +	block_revalidate(dentry);
>  
>  	spin_unlock(&dentry->d_lock);
>  	error = nfs_safe_remove(dentry);
>  	nfs_dentry_remove_handle_error(dir, dentry, error);
> -	dentry->d_fsdata = NULL;
> -	wake_up_var(&dentry->d_fsdata);
> +	unblock_revalidate(dentry);
>  out:
>  	trace_nfs_unlink_exit(dir, dentry, error);
>  	return error;
> @@ -2616,8 +2634,7 @@ nfs_unblock_rename(struct rpc_task *task,
> struct nfs_renamedata *data)
>  {
>  	struct dentry *new_dentry = data->new_dentry;
>  
> -	new_dentry->d_fsdata = NULL;
> -	wake_up_var(&new_dentry->d_fsdata);
> +	unblock_revalidate(new_dentry);
>  }
>  
>  /*
> @@ -2679,11 +2696,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct
> inode *old_dir,
>  		if (WARN_ON(new_dentry->d_flags &
> DCACHE_NFSFS_RENAMED) ||
>  		    WARN_ON(new_dentry->d_fsdata ==
> NFS_FSDATA_BLOCKED))
>  			goto out;
> -		if (new_dentry->d_fsdata) {
> -			/* old devname */
> -			kfree(new_dentry->d_fsdata);
> -			new_dentry->d_fsdata = NULL;
> -		}
>  
>  		spin_lock(&new_dentry->d_lock);
>  		if (d_count(new_dentry) > 2) {
> @@ -2705,7 +2717,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct
> inode *old_dir,
>  			new_dentry = dentry;
>  			new_inode = NULL;
>  		} else {
> -			new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
> +			block_revalidate(new_dentry);
>  			must_unblock = true;
>  			spin_unlock(&new_dentry->d_lock);
>  		}
> @@ -2717,6 +2729,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct
> inode *old_dir,
>  	task = nfs_async_rename(old_dir, new_dir, old_dentry,
> new_dentry,
>  				must_unblock ? nfs_unblock_rename :
> NULL);
>  	if (IS_ERR(task)) {
> +		if (must_unblock)
> +			unblock_revalidate(new_dentry);
>  		error = PTR_ERR(task);
>  		goto out;
>  	}
NeilBrown May 28, 2024, 1:19 a.m. UTC | #2
On Tue, 28 May 2024, Trond Myklebust wrote:
> On Mon, 2024-05-27 at 13:04 +1000, NeilBrown wrote:
> > 
> > dentry->d_fsdata is set to NFS_FSDATA_BLOCKED while unlinking or
> > renaming-over a file to ensure that no open succeeds while the NFS
> > operation progressed on the server.
> > 
> > Setting dentry->d_fsdata to NFS_FSDATA_BLOCKED is done under ->d_lock
> > after checking the refcount is not elevated.  Any attempt to open the
> > file (through that name) will go through lookp_open() which will take
> > ->d_lock while incrementing the refcount, we can be sure that once
> > the
> > new value is set, __nfs_lookup_revalidate() *will* see the new value
> > and
> > will block.
> > 
> > We don't have any locking guarantee that when we set ->d_fsdata to
> > NULL,
> > the wait_var_event() in __nfs_lookup_revalidate() will notice.
> > wait/wake primitives do NOT provide barriers to guarantee order.  We
> > must use smp_load_acquire() in wait_var_event() to ensure we look at
> > an
> > up-to-date value, and must use smp_store_release() before
> > wake_up_var().
> > 
> > This patch adds those barrier functions and factors out
> > block_revalidate() and unblock_revalidate() far clarity.
> > 
> > There is also a hypothetical bug in that if memory allocation fails
> > (which never happens in practice) we might leave ->d_fsdata locked.
> > This patch adds the missing call to unblock_revalidate().
> > 
> > Reported-and-tested-by: Richard Kojedzinszky
> > <richard+debian+bugreport@kojedz.in>
> > Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1071501
> > Fixes: 3c59366c207e ("NFS: don't unhash dentry during unlink/rename")
> > Signed-off-by: NeilBrown <neilb@suse.de>
> > ---
> >  fs/nfs/dir.c | 44 +++++++++++++++++++++++++++++---------------
> >  1 file changed, 29 insertions(+), 15 deletions(-)
> > 
> > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > index ac505671efbd..c91dc36d41cc 100644
> > --- a/fs/nfs/dir.c
> > +++ b/fs/nfs/dir.c
> > @@ -1802,9 +1802,10 @@ __nfs_lookup_revalidate(struct dentry *dentry,
> > unsigned int flags,
> >  		if (parent != READ_ONCE(dentry->d_parent))
> >  			return -ECHILD;
> >  	} else {
> > -		/* Wait for unlink to complete */
> > +		/* Wait for unlink to complete - see
> > unblock_revalidate() */
> >  		wait_var_event(&dentry->d_fsdata,
> > -			       dentry->d_fsdata !=
> > NFS_FSDATA_BLOCKED);
> > +			       smp_load_acquire(&dentry->d_fsdata)
> > +			       != NFS_FSDATA_BLOCKED);
> 
> Doesn't this end up being a reversed ACQUIRE+RELEASE as described in
> the "LOCK ACQUISITION FUNCTIONS" section of Documentation/memory-
> barriers.txt?

I don't think so.  That section is talking about STORE operations which
can move backwards through ACQUIRE and forwards through RELEASE.

Above we have a LOAD operation which mustn't move backwards through
ACQUIRE.  Below there is a STORE operation which mustn't move forwards
through a RELEASE.  Both of those are guaranteed.

> 
> IOW: Shouldn't the above rather be using READ_ONCE()?
> 
> >  		parent = dget_parent(dentry);
> >  		ret = reval(d_inode(parent), dentry, flags);
> >  		dput(parent);
> > @@ -1817,6 +1818,26 @@ static int nfs_lookup_revalidate(struct dentry
> > *dentry, unsigned int flags)
> >  	return __nfs_lookup_revalidate(dentry, flags,
> > nfs_do_lookup_revalidate);
> >  }
> >  
> > +static void block_revalidate(struct dentry *dentry)
> > +{
> > +	/* old devname - just in case */
> > +	kfree(dentry->d_fsdata);
> > +
> > +	/* Any new reference that could lead to an open
> > +	 * will take ->d_lock in lookup_open() -> d_lookup().
> > +	 */
> > +	lockdep_assert_held(&dentry->d_lock);
> > +
> > +	dentry->d_fsdata = NULL;
> 
> Why are you doing a barrier free change to dentry->d_fsdata here when
> you have the memory barrier protected change in unblock_revalidate()?

Ouch. This should be

	dentry->d_fsdata = NFS_FSDATA_BLOCKED;

I messed that up when rearranging the code after testing.

This doesn't need a barrier because a spinlock is held.  We check the
refcount under the spinlock and only proceed if there are no other
references.  So if __nfs_lookup_revalidate gets called concurrently, it
must have got a new reference, and that requires the same spinlock.
So if it is called after this assignment, the spinlock will provide all
needed barriers.

> 
> > +}
> > +
> > +static void unblock_revalidate(struct dentry *dentry)
> > +{
> > +	/* store_release ensures wait_var_event() sees the update */
> > +	smp_store_release(&dentry->d_fsdata, NULL);
> 
> Shouldn't this be a WRITE_ONCE(), for the same reason as above?

No, for the same reason as above.  WRITE_ONCE() doesn't provide any
memory barriers, it only avoid compiler optimisations.  Here we really
need the barrier on some CPUs.

Thanks for the review.

NeilBrown

> 
> > +	wake_up_var(&dentry->d_fsdata);
> > +}
> > +
> >  /*
> >   * A weaker form of d_revalidate for revalidating just the
> > d_inode(dentry)
> >   * when we don't really care about the dentry name. This is called
> > when a
> > @@ -2501,15 +2522,12 @@ int nfs_unlink(struct inode *dir, struct
> > dentry *dentry)
> >  		spin_unlock(&dentry->d_lock);
> >  		goto out;
> >  	}
> > -	/* old devname */
> > -	kfree(dentry->d_fsdata);
> > -	dentry->d_fsdata = NFS_FSDATA_BLOCKED;
> > +	block_revalidate(dentry);
> >  
> >  	spin_unlock(&dentry->d_lock);
> >  	error = nfs_safe_remove(dentry);
> >  	nfs_dentry_remove_handle_error(dir, dentry, error);
> > -	dentry->d_fsdata = NULL;
> > -	wake_up_var(&dentry->d_fsdata);
> > +	unblock_revalidate(dentry);
> >  out:
> >  	trace_nfs_unlink_exit(dir, dentry, error);
> >  	return error;
> > @@ -2616,8 +2634,7 @@ nfs_unblock_rename(struct rpc_task *task,
> > struct nfs_renamedata *data)
> >  {
> >  	struct dentry *new_dentry = data->new_dentry;
> >  
> > -	new_dentry->d_fsdata = NULL;
> > -	wake_up_var(&new_dentry->d_fsdata);
> > +	unblock_revalidate(new_dentry);
> >  }
> >  
> >  /*
> > @@ -2679,11 +2696,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct
> > inode *old_dir,
> >  		if (WARN_ON(new_dentry->d_flags &
> > DCACHE_NFSFS_RENAMED) ||
> >  		    WARN_ON(new_dentry->d_fsdata ==
> > NFS_FSDATA_BLOCKED))
> >  			goto out;
> > -		if (new_dentry->d_fsdata) {
> > -			/* old devname */
> > -			kfree(new_dentry->d_fsdata);
> > -			new_dentry->d_fsdata = NULL;
> > -		}
> >  
> >  		spin_lock(&new_dentry->d_lock);
> >  		if (d_count(new_dentry) > 2) {
> > @@ -2705,7 +2717,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct
> > inode *old_dir,
> >  			new_dentry = dentry;
> >  			new_inode = NULL;
> >  		} else {
> > -			new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
> > +			block_revalidate(new_dentry);
> >  			must_unblock = true;
> >  			spin_unlock(&new_dentry->d_lock);
> >  		}
> > @@ -2717,6 +2729,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct
> > inode *old_dir,
> >  	task = nfs_async_rename(old_dir, new_dir, old_dentry,
> > new_dentry,
> >  				must_unblock ? nfs_unblock_rename :
> > NULL);
> >  	if (IS_ERR(task)) {
> > +		if (must_unblock)
> > +			unblock_revalidate(new_dentry);
> >  		error = PTR_ERR(task);
> >  		goto out;
> >  	}
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
> 
> 
>
Trond Myklebust May 29, 2024, 1:27 a.m. UTC | #3
On Tue, 2024-05-28 at 11:19 +1000, NeilBrown wrote:
> On Tue, 28 May 2024, Trond Myklebust wrote:
> > On Mon, 2024-05-27 at 13:04 +1000, NeilBrown wrote:
> > > 
> > > dentry->d_fsdata is set to NFS_FSDATA_BLOCKED while unlinking or
> > > renaming-over a file to ensure that no open succeeds while the
> > > NFS
> > > operation progressed on the server.
> > > 
> > > Setting dentry->d_fsdata to NFS_FSDATA_BLOCKED is done under -
> > > >d_lock
> > > after checking the refcount is not elevated.  Any attempt to open
> > > the
> > > file (through that name) will go through lookp_open() which will
> > > take
> > > ->d_lock while incrementing the refcount, we can be sure that
> > > once
> > > the
> > > new value is set, __nfs_lookup_revalidate() *will* see the new
> > > value
> > > and
> > > will block.
> > > 
> > > We don't have any locking guarantee that when we set ->d_fsdata
> > > to
> > > NULL,
> > > the wait_var_event() in __nfs_lookup_revalidate() will notice.
> > > wait/wake primitives do NOT provide barriers to guarantee order. 
> > > We
> > > must use smp_load_acquire() in wait_var_event() to ensure we look
> > > at
> > > an
> > > up-to-date value, and must use smp_store_release() before
> > > wake_up_var().
> > > 
> > > This patch adds those barrier functions and factors out
> > > block_revalidate() and unblock_revalidate() far clarity.
> > > 
> > > There is also a hypothetical bug in that if memory allocation
> > > fails
> > > (which never happens in practice) we might leave ->d_fsdata
> > > locked.
> > > This patch adds the missing call to unblock_revalidate().
> > > 
> > > Reported-and-tested-by: Richard Kojedzinszky
> > > <richard+debian+bugreport@kojedz.in>
> > > Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1071501
> > > Fixes: 3c59366c207e ("NFS: don't unhash dentry during
> > > unlink/rename")
> > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > ---
> > >  fs/nfs/dir.c | 44 +++++++++++++++++++++++++++++---------------
> > >  1 file changed, 29 insertions(+), 15 deletions(-)
> > > 
> > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > index ac505671efbd..c91dc36d41cc 100644
> > > --- a/fs/nfs/dir.c
> > > +++ b/fs/nfs/dir.c
> > > @@ -1802,9 +1802,10 @@ __nfs_lookup_revalidate(struct dentry
> > > *dentry,
> > > unsigned int flags,
> > >  		if (parent != READ_ONCE(dentry->d_parent))
> > >  			return -ECHILD;
> > >  	} else {
> > > -		/* Wait for unlink to complete */
> > > +		/* Wait for unlink to complete - see
> > > unblock_revalidate() */
> > >  		wait_var_event(&dentry->d_fsdata,
> > > -			       dentry->d_fsdata !=
> > > NFS_FSDATA_BLOCKED);
> > > +			       smp_load_acquire(&dentry-
> > > >d_fsdata)
> > > +			       != NFS_FSDATA_BLOCKED);
> > 
> > Doesn't this end up being a reversed ACQUIRE+RELEASE as described
> > in
> > the "LOCK ACQUISITION FUNCTIONS" section of Documentation/memory-
> > barriers.txt?
> 
> I don't think so.  That section is talking about STORE operations
> which
> can move backwards through ACQUIRE and forwards through RELEASE.
> 
> Above we have a LOAD operation which mustn't move backwards through
> ACQUIRE.  Below there is a STORE operation which mustn't move
> forwards
> through a RELEASE.  Both of those are guaranteed.

It isn't necessary for the LOAD to move backwards through the ACQUIRE.
As I understand it, the point is that the ACQUIRE can move through the
RELEASE as per the following paragraph in that document:

            Similarly, the reverse case of a RELEASE followed by an ACQUIRE does
            not imply a full memory barrier.  Therefore, the CPU's execution of the
            critical sections corresponding to the RELEASE and the ACQUIRE can cross,
            so that:
            
                    *A = a;
                    RELEASE M
                    ACQUIRE N
                    *B = b;
            
            could occur as:
            
                    ACQUIRE N, STORE *B, STORE *A, RELEASE M

This would presumably be why the function clear_and_wake_up_bit() needs
a full memory barrier on most architectures, instead of being just an
smp_wmb(). Is my understanding of this wrong?

> 
> > 
> > IOW: Shouldn't the above rather be using READ_ONCE()?
> > 
> > >  		parent = dget_parent(dentry);
> > >  		ret = reval(d_inode(parent), dentry, flags);
> > >  		dput(parent);
> > > @@ -1817,6 +1818,26 @@ static int nfs_lookup_revalidate(struct
> > > dentry
> > > *dentry, unsigned int flags)
> > >  	return __nfs_lookup_revalidate(dentry, flags,
> > > nfs_do_lookup_revalidate);
> > >  }
> > >  
> > > +static void block_revalidate(struct dentry *dentry)
> > > +{
> > > +	/* old devname - just in case */
> > > +	kfree(dentry->d_fsdata);
> > > +
> > > +	/* Any new reference that could lead to an open
> > > +	 * will take ->d_lock in lookup_open() -> d_lookup().
> > > +	 */
> > > +	lockdep_assert_held(&dentry->d_lock);
> > > +
> > > +	dentry->d_fsdata = NULL;
> > 
> > Why are you doing a barrier free change to dentry->d_fsdata here
> > when
> > you have the memory barrier protected change in
> > unblock_revalidate()?
> 
> Ouch. This should be
> 
> 	dentry->d_fsdata = NFS_FSDATA_BLOCKED;
> 
> I messed that up when rearranging the code after testing.
> 
> This doesn't need a barrier because a spinlock is held.  We check the
> refcount under the spinlock and only proceed if there are no other
> references.  So if __nfs_lookup_revalidate gets called concurrently,
> it
> must have got a new reference, and that requires the same spinlock.
> So if it is called after this assignment, the spinlock will provide
> all
> needed barriers.
> 
> > 
> > > +}
> > > +
> > > +static void unblock_revalidate(struct dentry *dentry)
> > > +{
> > > +	/* store_release ensures wait_var_event() sees the
> > > update */
> > > +	smp_store_release(&dentry->d_fsdata, NULL);
> > 
> > Shouldn't this be a WRITE_ONCE(), for the same reason as above?
> 
> No, for the same reason as above.  WRITE_ONCE() doesn't provide any
> memory barriers, it only avoid compiler optimisations.  Here we
> really
> need the barrier on some CPUs.
> 
> Thanks for the review.
> 
> NeilBrown
> 
> > 
> > > +	wake_up_var(&dentry->d_fsdata);
> > > +}
> > > +
> > >  /*
> > >   * A weaker form of d_revalidate for revalidating just the
> > > d_inode(dentry)
> > >   * when we don't really care about the dentry name. This is
> > > called
> > > when a
> > > @@ -2501,15 +2522,12 @@ int nfs_unlink(struct inode *dir, struct
> > > dentry *dentry)
> > >  		spin_unlock(&dentry->d_lock);
> > >  		goto out;
> > >  	}
> > > -	/* old devname */
> > > -	kfree(dentry->d_fsdata);
> > > -	dentry->d_fsdata = NFS_FSDATA_BLOCKED;
> > > +	block_revalidate(dentry);
> > >  
> > >  	spin_unlock(&dentry->d_lock);
> > >  	error = nfs_safe_remove(dentry);
> > >  	nfs_dentry_remove_handle_error(dir, dentry, error);
> > > -	dentry->d_fsdata = NULL;
> > > -	wake_up_var(&dentry->d_fsdata);
> > > +	unblock_revalidate(dentry);
> > >  out:
> > >  	trace_nfs_unlink_exit(dir, dentry, error);
> > >  	return error;
> > > @@ -2616,8 +2634,7 @@ nfs_unblock_rename(struct rpc_task *task,
> > > struct nfs_renamedata *data)
> > >  {
> > >  	struct dentry *new_dentry = data->new_dentry;
> > >  
> > > -	new_dentry->d_fsdata = NULL;
> > > -	wake_up_var(&new_dentry->d_fsdata);
> > > +	unblock_revalidate(new_dentry);
> > >  }
> > >  
> > >  /*
> > > @@ -2679,11 +2696,6 @@ int nfs_rename(struct mnt_idmap *idmap,
> > > struct
> > > inode *old_dir,
> > >  		if (WARN_ON(new_dentry->d_flags &
> > > DCACHE_NFSFS_RENAMED) ||
> > >  		    WARN_ON(new_dentry->d_fsdata ==
> > > NFS_FSDATA_BLOCKED))
> > >  			goto out;
> > > -		if (new_dentry->d_fsdata) {
> > > -			/* old devname */
> > > -			kfree(new_dentry->d_fsdata);
> > > -			new_dentry->d_fsdata = NULL;
> > > -		}
> > >  
> > >  		spin_lock(&new_dentry->d_lock);
> > >  		if (d_count(new_dentry) > 2) {
> > > @@ -2705,7 +2717,7 @@ int nfs_rename(struct mnt_idmap *idmap,
> > > struct
> > > inode *old_dir,
> > >  			new_dentry = dentry;
> > >  			new_inode = NULL;
> > >  		} else {
> > > -			new_dentry->d_fsdata =
> > > NFS_FSDATA_BLOCKED;
> > > +			block_revalidate(new_dentry);
> > >  			must_unblock = true;
> > >  			spin_unlock(&new_dentry->d_lock);
> > >  		}
> > > @@ -2717,6 +2729,8 @@ int nfs_rename(struct mnt_idmap *idmap,
> > > struct
> > > inode *old_dir,
> > >  	task = nfs_async_rename(old_dir, new_dir, old_dentry,
> > > new_dentry,
> > >  				must_unblock ?
> > > nfs_unblock_rename :
> > > NULL);
> > >  	if (IS_ERR(task)) {
> > > +		if (must_unblock)
> > > +			unblock_revalidate(new_dentry);
> > >  		error = PTR_ERR(task);
> > >  		goto out;
> > >  	}
> >
NeilBrown May 29, 2024, 3:47 a.m. UTC | #4
On Wed, 29 May 2024, Trond Myklebust wrote:
> On Tue, 2024-05-28 at 11:19 +1000, NeilBrown wrote:
> > On Tue, 28 May 2024, Trond Myklebust wrote:
> > > On Mon, 2024-05-27 at 13:04 +1000, NeilBrown wrote:
> > > > 
> > > > dentry->d_fsdata is set to NFS_FSDATA_BLOCKED while unlinking or
> > > > renaming-over a file to ensure that no open succeeds while the
> > > > NFS
> > > > operation progressed on the server.
> > > > 
> > > > Setting dentry->d_fsdata to NFS_FSDATA_BLOCKED is done under -
> > > > >d_lock
> > > > after checking the refcount is not elevated.  Any attempt to open
> > > > the
> > > > file (through that name) will go through lookp_open() which will
> > > > take
> > > > ->d_lock while incrementing the refcount, we can be sure that
> > > > once
> > > > the
> > > > new value is set, __nfs_lookup_revalidate() *will* see the new
> > > > value
> > > > and
> > > > will block.
> > > > 
> > > > We don't have any locking guarantee that when we set ->d_fsdata
> > > > to
> > > > NULL,
> > > > the wait_var_event() in __nfs_lookup_revalidate() will notice.
> > > > wait/wake primitives do NOT provide barriers to guarantee order. 
> > > > We
> > > > must use smp_load_acquire() in wait_var_event() to ensure we look
> > > > at
> > > > an
> > > > up-to-date value, and must use smp_store_release() before
> > > > wake_up_var().
> > > > 
> > > > This patch adds those barrier functions and factors out
> > > > block_revalidate() and unblock_revalidate() far clarity.
> > > > 
> > > > There is also a hypothetical bug in that if memory allocation
> > > > fails
> > > > (which never happens in practice) we might leave ->d_fsdata
> > > > locked.
> > > > This patch adds the missing call to unblock_revalidate().
> > > > 
> > > > Reported-and-tested-by: Richard Kojedzinszky
> > > > <richard+debian+bugreport@kojedz.in>
> > > > Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1071501
> > > > Fixes: 3c59366c207e ("NFS: don't unhash dentry during
> > > > unlink/rename")
> > > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > > ---
> > > >  fs/nfs/dir.c | 44 +++++++++++++++++++++++++++++---------------
> > > >  1 file changed, 29 insertions(+), 15 deletions(-)
> > > > 
> > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > > index ac505671efbd..c91dc36d41cc 100644
> > > > --- a/fs/nfs/dir.c
> > > > +++ b/fs/nfs/dir.c
> > > > @@ -1802,9 +1802,10 @@ __nfs_lookup_revalidate(struct dentry
> > > > *dentry,
> > > > unsigned int flags,
> > > >  		if (parent != READ_ONCE(dentry->d_parent))
> > > >  			return -ECHILD;
> > > >  	} else {
> > > > -		/* Wait for unlink to complete */
> > > > +		/* Wait for unlink to complete - see
> > > > unblock_revalidate() */
> > > >  		wait_var_event(&dentry->d_fsdata,
> > > > -			       dentry->d_fsdata !=
> > > > NFS_FSDATA_BLOCKED);
> > > > +			       smp_load_acquire(&dentry-
> > > > >d_fsdata)
> > > > +			       != NFS_FSDATA_BLOCKED);
> > > 
> > > Doesn't this end up being a reversed ACQUIRE+RELEASE as described
> > > in
> > > the "LOCK ACQUISITION FUNCTIONS" section of Documentation/memory-
> > > barriers.txt?
> > 
> > I don't think so.  That section is talking about STORE operations
> > which
> > can move backwards through ACQUIRE and forwards through RELEASE.
> > 
> > Above we have a LOAD operation which mustn't move backwards through
> > ACQUIRE.  Below there is a STORE operation which mustn't move
> > forwards
> > through a RELEASE.  Both of those are guaranteed.
> 
> It isn't necessary for the LOAD to move backwards through the ACQUIRE.
> As I understand it, the point is that the ACQUIRE can move through the
> RELEASE as per the following paragraph in that document:
> 
>             Similarly, the reverse case of a RELEASE followed by an ACQUIRE does
>             not imply a full memory barrier.  Therefore, the CPU's execution of the
>             critical sections corresponding to the RELEASE and the ACQUIRE can cross,
>             so that:
>             
>                     *A = a;
>                     RELEASE M
>                     ACQUIRE N
>                     *B = b;
>             
>             could occur as:
>             
>                     ACQUIRE N, STORE *B, STORE *A, RELEASE M

On the wakeup side we have:

  STORE ->d_fsdata
  RELEASE
  spin_lock
  LOAD: check is waitq is empty

and on the wait side we have

  STORE: add to waitq
  spin_unlock
  ACQUIRE
  LOAD ->d_fsdata

I believe that spin_lock is an ACQUIRE operation and spin_unlock is a
RELEASE operation.  So both of these have "ACQUIRE ; RELEASE" which
creates a full barrier.

> 
> This would presumably be why the function clear_and_wake_up_bit() needs
> a full memory barrier on most architectures, instead of being just an
> smp_wmb(). Is my understanding of this wrong?

clear_and_wake_up_bit() calls __wake_up_bit() which calls
waitqueue_active() without taking a lock.  So there is no ACQUIRE after
the RELEASE in clear_bit_unlock() and before testing for list-empty.  So
we need to explicitly add a barrier.

At least that is my understanding just now.  I hadn't worked through all
the details before, but I'm glad you prompted me to - thanks.

NeilBrown
diff mbox series

Patch

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ac505671efbd..c91dc36d41cc 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1802,9 +1802,10 @@  __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
 		if (parent != READ_ONCE(dentry->d_parent))
 			return -ECHILD;
 	} else {
-		/* Wait for unlink to complete */
+		/* Wait for unlink to complete - see unblock_revalidate() */
 		wait_var_event(&dentry->d_fsdata,
-			       dentry->d_fsdata != NFS_FSDATA_BLOCKED);
+			       smp_load_acquire(&dentry->d_fsdata)
+			       != NFS_FSDATA_BLOCKED);
 		parent = dget_parent(dentry);
 		ret = reval(d_inode(parent), dentry, flags);
 		dput(parent);
@@ -1817,6 +1818,26 @@  static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 	return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
 }
 
+static void block_revalidate(struct dentry *dentry)
+{
+	/* old devname - just in case */
+	kfree(dentry->d_fsdata);
+
+	/* Any new reference that could lead to an open
+	 * will take ->d_lock in lookup_open() -> d_lookup().
+	 */
+	lockdep_assert_held(&dentry->d_lock);
+
+	dentry->d_fsdata = NULL;
+}
+
+static void unblock_revalidate(struct dentry *dentry)
+{
+	/* store_release ensures wait_var_event() sees the update */
+	smp_store_release(&dentry->d_fsdata, NULL);
+	wake_up_var(&dentry->d_fsdata);
+}
+
 /*
  * A weaker form of d_revalidate for revalidating just the d_inode(dentry)
  * when we don't really care about the dentry name. This is called when a
@@ -2501,15 +2522,12 @@  int nfs_unlink(struct inode *dir, struct dentry *dentry)
 		spin_unlock(&dentry->d_lock);
 		goto out;
 	}
-	/* old devname */
-	kfree(dentry->d_fsdata);
-	dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+	block_revalidate(dentry);
 
 	spin_unlock(&dentry->d_lock);
 	error = nfs_safe_remove(dentry);
 	nfs_dentry_remove_handle_error(dir, dentry, error);
-	dentry->d_fsdata = NULL;
-	wake_up_var(&dentry->d_fsdata);
+	unblock_revalidate(dentry);
 out:
 	trace_nfs_unlink_exit(dir, dentry, error);
 	return error;
@@ -2616,8 +2634,7 @@  nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
 {
 	struct dentry *new_dentry = data->new_dentry;
 
-	new_dentry->d_fsdata = NULL;
-	wake_up_var(&new_dentry->d_fsdata);
+	unblock_revalidate(new_dentry);
 }
 
 /*
@@ -2679,11 +2696,6 @@  int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
 		    WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
 			goto out;
-		if (new_dentry->d_fsdata) {
-			/* old devname */
-			kfree(new_dentry->d_fsdata);
-			new_dentry->d_fsdata = NULL;
-		}
 
 		spin_lock(&new_dentry->d_lock);
 		if (d_count(new_dentry) > 2) {
@@ -2705,7 +2717,7 @@  int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 			new_dentry = dentry;
 			new_inode = NULL;
 		} else {
-			new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+			block_revalidate(new_dentry);
 			must_unblock = true;
 			spin_unlock(&new_dentry->d_lock);
 		}
@@ -2717,6 +2729,8 @@  int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
 				must_unblock ? nfs_unblock_rename : NULL);
 	if (IS_ERR(task)) {
+		if (must_unblock)
+			unblock_revalidate(new_dentry);
 		error = PTR_ERR(task);
 		goto out;
 	}